How to use json_normalize to create subcolumns in a CSV file

I need some help using json_normalize to create multiple subcolumns from the main columns. The code below generates an output file, but it puts everything in one column; I need separate columns with headings like moniker.config, moniker.type, moniker.key, moniker.keyParts for each row:
from pandas.io.json import json_normalize
import os
import pandas as pd

def json_normalize_recursive(base_column, data, df=pd.DataFrame()):
    if df.empty:
        df = json_normalize(data, record_prefix=base_column+'.')
    nested = df.select_dtypes(include='object')
    for col in nested.columns:
        try:
            nested_df = json_normalize(nested[col].tolist())
            nested_df.columns = [base_column+'.'+str(col)+'.'+str(c) for c in nested_df.columns]
            df = pd.concat([df.drop(col, axis=1), nested_df], axis=1)
        except ValueError:
            pass
    return df
data = {
    "errors": [],
    "data": [
        {
            "moniker": {
                "config": "fx.ipv.london.eod",
                "type": "fx.spot",
                "key": "EUR/CZK",
                "keyParts": [
                    "EUR",
                    "CZK"
                ],
                "configType": "fx.ipv.london.eod/fx.spot",
                "live": True
            },
            "queryMoniker": {
                "config": "fx.ipv.london.eod",
                "type": "EUR/CZK",
                "key": "EUR/CZK",
                "tag": {
                    "owner": "official",
                    "type": "fx.spot",
                    "key": "EUR/CZK",
                    "tag": {
                        "owner": "official",
                        "date": 13434324400999,
                        "cutoff": "London",
                        "name": "ipv",
                        "live": True
                    },
                    "keyParts": [
                        "EUR",
                        "CZK"
                    ],
                    "configType": "fx.ipv.london.eod/fx.spot",
                    "live": False
                },
                "instance": {
                    "data": "<FxSpot Currency1=\"EUR\"Currency2=\"CZK\" bid=\"24.14\" ask=\"24.147\"/>",
                    "unmarshalled": True,
                    "marshalled": True,
                    "format": "fx/xml/1",
                    "valid": True,
                    "sequence": 1643434234234,
                    "instanceMoniker": {
                        "source": "viper.tagcopy",
                        "config": "fx.london.official.copy",
                        "keyParts": [
                            "EUR",
                            "CZK"
                        ]
                    }
                }
            }
        }
    ]
}
df = json_normalize_recursive('', data)
print(df)
cwd = os.getcwd()
filepath = os.path.join(cwd, 'Desktop', 'output.csv')
df.to_csv(filepath, index=False)
Desired output:

Try using df = pd.json_normalize(data, record_path='data'), which flattens each record under "data" (including the nested moniker and queryMoniker dicts) into dotted subcolumns.
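For reference, a runnable version of that suggestion (a sketch, assuming pandas >= 1.0, where json_normalize is exposed as pd.json_normalize) applied to the data dict above:

import pandas as pd

# normalize each record under data["data"]; nested dicts such as moniker
# and queryMoniker are flattened into dotted columns automatically
df = pd.json_normalize(data, record_path='data')
print(df.columns.tolist())
# e.g. ['moniker.config', 'moniker.type', 'moniker.key', 'moniker.keyParts', ...]
df.to_csv('output.csv', index=False)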

You can try:
data = {
    "errors": [],
    "data": [
        {
            "moniker": {
                "config": "fx.ipv.london.eod",
                "type": "fx.spot",
                "key": "EUR/CZK",
                "keyParts": ["EUR", "CZK"],
                "configType": "fx.ipv.london.eod/fx.spot",
                "live": True,
            },
            "queryMoniker": {
                "config": "fx.ipv.london.eod",
                "type": "EUR/CZK",
                "key": "EUR/CZK",
                "tag": {
                    "owner": "official",
                    "type": "fx.spot",
                    "key": "EUR/CZK",
                    "tag": {
                        "owner": "official",
                        "date": 13434324400999,
                        "cutoff": "London",
                        "name": "ipv",
                        "live": True,
                    },
                    "keyParts": ["EUR", "CZK"],
                    "configType": "fx.ipv.london.eod/fx.spot",
                    "live": False,
                },
                "instance": {
                    "data": '<FxSpot Currency1="EUR"Currency2="CZK" bid="24.14" ask="24.147"/>',
                    "unmarshalled": True,
                    "marshalled": True,
                    "format": "fx/xml/1",
                    "valid": True,
                    "sequence": 1643434234234,
                    "instanceMoniker": {
                        "source": "viper.tagcopy",
                        "config": "fx.london.official.copy",
                        "keyParts": ["EUR", "CZK"],
                    },
                },
            },
        }
    ],
}
df = pd.DataFrame(data['data'])
df = pd.concat([df, df.pop('moniker').apply(pd.Series).add_prefix('moniker.')], axis=1)
df = pd.concat([df, df.pop('queryMoniker').apply(pd.Series).add_prefix('queryMoniker.')], axis=1)
df = pd.concat([df, df.pop('queryMoniker.tag').apply(pd.Series).add_prefix('queryMoniker.tag.')], axis=1)
df = pd.concat([df, df.pop('queryMoniker.instance').apply(pd.Series).add_prefix('queryMoniker.instance.')], axis=1)
df = df.explode('moniker.keyParts')
print(df)
Prints:
moniker.config moniker.type moniker.key moniker.keyParts moniker.configType moniker.live queryMoniker.config queryMoniker.type queryMoniker.key queryMoniker.tag.owner queryMoniker.tag.type queryMoniker.tag.key queryMoniker.tag.tag queryMoniker.tag.keyParts queryMoniker.tag.configType queryMoniker.tag.live queryMoniker.instance.data queryMoniker.instance.unmarshalled queryMoniker.instance.marshalled queryMoniker.instance.format queryMoniker.instance.valid queryMoniker.instance.sequence queryMoniker.instance.instanceMoniker
0 fx.ipv.london.eod fx.spot EUR/CZK EUR fx.ipv.london.eod/fx.spot True fx.ipv.london.eod EUR/CZK EUR/CZK official fx.spot EUR/CZK {'owner': 'official', 'date': 13434324400999, 'cutoff': 'London', 'name': 'ipv', 'live': True} [EUR, CZK] fx.ipv.london.eod/fx.spot False <FxSpot Currency1="EUR"Currency2="CZK" bid="24.14" ask="24.147"/> True True fx/xml/1 True 1643434234234 {'source': 'viper.tagcopy', 'config': 'fx.london.official.copy', 'keyParts': ['EUR', 'CZK']}
0 fx.ipv.london.eod fx.spot EUR/CZK CZK fx.ipv.london.eod/fx.spot True fx.ipv.london.eod EUR/CZK EUR/CZK official fx.spot EUR/CZK {'owner': 'official', 'date': 13434324400999, 'cutoff': 'London', 'name': 'ipv', 'live': True} [EUR, CZK] fx.ipv.london.eod/fx.spot False <FxSpot Currency1="EUR"Currency2="CZK" bid="24.14" ask="24.147"/> True True fx/xml/1 True 1643434234234 {'source': 'viper.tagcopy', 'config': 'fx.london.official.copy', 'keyParts': ['EUR', 'CZK']}
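The frame above still carries two dict-valued columns, queryMoniker.tag.tag and queryMoniker.instance.instanceMoniker. If those should become subcolumns as well, the same pop/apply pattern continues one level deeper; a sketch, best run before the explode step while the row index is still unique:

# continue the same pattern one level deeper (before df.explode, so the
# index is still unique when concatenating)
df = pd.concat([df, df.pop('queryMoniker.tag.tag').apply(pd.Series).add_prefix('queryMoniker.tag.tag.')], axis=1)
df = pd.concat([df, df.pop('queryMoniker.instance.instanceMoniker').apply(pd.Series).add_prefix('queryMoniker.instance.instanceMoniker.')], axis=1)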

Related

pandas column to list for a json file

From a DataFrame, I want to have a JSON output file with one key holding a list:
Expected output:
[
    {
        "model": "xx",
        "id": 1,
        "name": "xyz",
        "categories": [1, 2]
    },
    {
        ...
    }
]
What I have:
[
    {
        "model": "xx",
        "id": 1,
        "name": "xyz",
        "categories": "1,2"
    },
    {
        ...
    }
]
The actual code is:
df = pd.read_excel('data_threated.xlsx')
result = df.reset_index(drop=True).to_json("output_json.json", orient='records')
parsed = json.dumps(result)
jsonfile = open("output_json.json", 'r')
data = json.load(jsonfile)
How can I achieve this easily?
EDIT:
print(df['categories'].unique().tolist())
['1,2,3', 1, nan, '1,2,3,6', 9, 8, 11, 4, 5, 2, '1,2,3,4,5,6,7,8,9']
You can use:
df = pd.read_excel('data_threated.xlsx').reset_index(drop=True)
df['categories'] = df['categories'].apply(lambda x: [int(i) for i in x.split(',')] if isinstance(x, str) else '')
df.to_json('output.json', orient='records', indent=4)
Content of output.json
[
    {
        "model": "xx",
        "id": 1,
        "name": "xyz",
        "categories": [
            1,
            2
        ]
    }
]
Note you can also use:
df['categories'] = pd.eval(df['categories'])
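Given the mixed types shown in the EDIT (comma-separated strings, plain ints and NaN), a slightly more defensive converter than the lambda above might look like this (a sketch; the empty-list fallback for NaN is an assumption about the desired output):

import math

def to_category_list(x):
    # '1,2,3' -> [1, 2, 3]
    if isinstance(x, str):
        return [int(i) for i in x.split(',')]
    # NaN -> [] (assumed; use None instead if missing values should stay null)
    if isinstance(x, float) and math.isnan(x):
        return []
    # plain ints such as 9 or 4 -> [9]
    return [int(x)]

df['categories'] = df['categories'].apply(to_category_list)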

JSON to CSV - Go 4 level deep

I would like to extract only a small fraction of my JSON response into a .csv file. However, I need to go 4 levels deep and I am currently only able to go 3 levels deep. My goal is to have a .csv with 3 columns (campaign_id, campaign_name, cost_per_click) and a line for each of my campaigns.
Original JSON
{
    "318429215527453": {
        "conversion_events": {
            "data": [
                {
                    "id": "djdfhdf",
                    "name": "Total",
                    "cost": 328.14,
                    "metrics_breakdown": {
                        "data": [
                            {
                                "campaign_id": 2364,
                                "campaign_name": "uk",
                                "cost_per_click": 1345
                            },
                            {
                                "campaign_id": 7483,
                                "campaign_name": "fr",
                                "cost_per_click": 756
                            },
                            {
                                "campaign_id": 8374,
                                "campaign_name": "spain",
                                "cost_per_click": 545
                            },
                            {
                                "campaign_id": 2431,
                                "campaign_name": "ge",
                                "cost_per_click": 321
                            }
                        ],
                        "paging": {
                            "cursors": {
                                "after": "MjUZD"
                            },
                            "next": "https://graph.facebook.com/v9.0/xxxx"
                        }
                    }
                }
            ],
            "summary": {
                "count": 1,
                "metric_date_range": {
                    "date_range": {
                        "begin_date": "2021-01-09T00:00:00+0100",
                        "end_date": "2021-02-08T00:00:00+0100",
                        "time_zone": "Europe/Paris"
                    },
                    "prior_period_date_range": {
                        "begin_date": "2020-12-10T00:00:00+0100",
                        "end_date": "2021-01-09T00:00:00+0100"
                    }
                }
            }
        },
        "id": "xxx"
    }
}
reformated.py
import json

with open('campaigns.json') as json_file:
    data = json.load(json_file)

reformated_json = data['318429215527453']['conversion_events']['data']

with open('data.json', 'w') as outfile:
    json.dump(reformated_json, outfile)
I tried to add ['metrics_breakdown'] or another ['data'] at the end of reformated_json but I am getting TypeError: list indices must be integers or slices, not str.
Current content of data.json:
[
    {
        "id": "djdfhdf",
        "name": "Total",
        "cost": 328.14,
        "metrics_breakdown": {
            "data": [
                {
                    "campaign_id": 2364,
                    "campaign_name": "uk",
                    "cost_per_click": 1345
                },
                {
                    "campaign_id": 7483,
                    "campaign_name": "fr",
                    "cost_per_click": 756
                },
                {
                    "campaign_id": 8374,
                    "campaign_name": "spain",
                    "cost_per_click": 545
                },
                {
                    "campaign_id": 2431,
                    "campaign_name": "ge",
                    "cost_per_click": 321
                }
            ],
            "paging": {
                "cursors": {
                    "after": "MjUZD"
                },
                "next": "https://graph.facebook.com/v9.0/xxxx"
            }
        }
    }
]
import csv
import json
from typing import Dict, List, Union  # typing for easy development

# read json function
def read_json(json_path: str) -> Union[Dict, List]:
    with open(json_path, 'r') as file_io:
        return json.load(file_io)

# write csv function
def write_csv(data: List[Dict], csv_path: str) -> None:
    with open(csv_path, 'w') as file:
        fieldnames = set().union(*data)
        writer = csv.DictWriter(file, fieldnames=fieldnames,
                                lineterminator='\n')
        writer.writeheader()
        writer.writerows(data)

# parse campaigns using a comprehension
def parse_campaigns(data: Dict) -> List[Dict]:
    return [row
            for value in data.values()  # first level (conversion events)
            for root_data in value['conversion_events']['data']  # conversion events/data
            for row in root_data['metrics_breakdown']['data']]  # data/metrics_breakdown/data

json_data = read_json('./campaigns.json')
campaign_data = parse_campaigns(json_data)
write_csv(campaign_data, 'campaigns.csv')
campaigns.csv (I copied the data to multiple root dictionary objects):
cost_per_click,campaign_id,campaign_name
1345,2364,uk
756,7483,fr
545,8374,spain
321,2431,ge
1345,2364,uk
756,7483,fr
545,8374,spain
321,2431,ge
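One side note on the snippet above: set().union(*data) collects the union of keys in arbitrary order, which is why cost_per_click ends up first in campaigns.csv. If a fixed column order matters, a small tweak (a sketch, reusing the csv import above) is to let the caller pass the header in:

from typing import Dict, List, Optional

def write_csv(data: List[Dict], csv_path: str,
              fieldnames: Optional[List[str]] = None) -> None:
    with open(csv_path, 'w') as file:
        # fall back to the (unordered) key union when no header is given
        fieldnames = fieldnames or list(set().union(*data))
        writer = csv.DictWriter(file, fieldnames=fieldnames, lineterminator='\n')
        writer.writeheader()
        writer.writerows(data)

write_csv(campaign_data, 'campaigns.csv',
          fieldnames=['campaign_id', 'campaign_name', 'cost_per_click'])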
The first data subkey contains a single-element list. Dereference with [0] to get the element, then fetch the next layers of keys. Then a DictWriter can be used to write the CSV lines:
import json
import csv

with open('campaigns.json') as json_file:
    data = json.load(json_file)

items = data['318429215527453']['conversion_events']['data'][0]['metrics_breakdown']['data']

with open('data.csv', 'w', newline='') as outfile:
    w = csv.DictWriter(outfile, fieldnames=items[0].keys())
    w.writeheader()
    w.writerows(items)
Output:
campaign_id,campaign_name,cost_per_click
2364,uk,1345
7483,fr,756
8374,spain,545
2431,ge,321
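Since the other questions on this page lean on pandas, the same extraction is also a one-liner with json_normalize (a sketch, assuming pandas >= 1.0):

import json
import pandas as pd

with open('campaigns.json') as json_file:
    data = json.load(json_file)

# walk conversion_events/data, then metrics_breakdown/data, in one call
items = data['318429215527453']['conversion_events']['data']
df = pd.json_normalize(items, record_path=['metrics_breakdown', 'data'])
df[['campaign_id', 'campaign_name', 'cost_per_click']].to_csv('data.csv', index=False)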

Add a # to beginning of each key in Json Python2.7

I'm trying to add a "#" to the beginning of each key of a JSON object (obtained from RabbitMQ API calls).
Here is my attempt:
#!/bin/python
# Libraries import
import requests
import json
import sys
import os

# Define URLs
overview = "/api/overview"
nodes = "/api/nodes"
queues = "/api/queues"

# Get credentials from file
with open('/credentials') as json_file:
    data = json.load(json_file)
    user = data['user']
    passwd = data['pass']

# Test which URL we want to call
if ''.join(sys.argv[1]) == "overview":
    commande = overview
if ''.join(sys.argv[1]) == "queues":
    commande = queues
if ''.join(sys.argv[1]) == "nodes":
    commande = nodes

def append(mydict):
    return dict(map(lambda (key, value): ("#"+str(key), value), mydict.items()))

def transform(multileveldict):
    new = append(multileveldict)
    for key, value in new.items():
        if isinstance(value, dict):
            new[key] = transform(value)
    return new

def upper_keys(x):
    if isinstance(x, list):
        return [upper_keys(v) for v in x]
    elif isinstance(x, dict):
        return dict((k.upper(), upper_keys(v)) for k, v in x.iteritems())
    else:
        return x

# Main
response = requests.get("http://localhost:15672" + commande, auth=(user, passwd))
if(response.ok):
    json_data = json.loads(response.content)
    json = json.dumps(upper_keys(json_data), indent=4)
    print(json)
Here is the JSON that I get in "response.content":
[
    {
        "NODE": "rabbit#server567",
        "EXCLUSIVE": false,
        "NAME": "test-01",
        "SYNCHRONISED_SLAVE_NODES": [],
        "SLAVE_NODES": [],
        "AUTO_DELETE": false,
        "VHOST": "/",
        "ARGUMENTS": {},
        "TYPE": "classic",
        "DURABLE": false
    },
    {
        "NODE": "rabbit#server567",
        "EXCLUSIVE": false,
        "NAME": "test-02",
        "SYNCHRONISED_SLAVE_NODES": [],
        "SLAVE_NODES": [],
        "AUTO_DELETE": false,
        "VHOST": "/",
        "ARGUMENTS": {},
        "TYPE": "classic",
        "DURABLE": false
    },
    {
        "NODE": "rabbit#server567",
        "EXCLUSIVE": false,
        "NAME": "test-03",
        "SYNCHRONISED_SLAVE_NODES": [],
        "SLAVE_NODES": [],
        "AUTO_DELETE": false,
        "VHOST": "/",
        "ARGUMENTS": {},
        "TYPE": "classic",
        "DURABLE": false
    },
    {
        "MESSAGES_UNACKNOWLEDGED_RAM": 0,
        "RECOVERABLE_SLAVES": null,
        "CONSUMERS": 0,
        "REDUCTIONS": 9700519,
        "AUTO_DELETE": false,
        "MESSAGE_BYTES_PAGED_OUT": 0,
        "MESSAGE_BYTES_UNACKNOWLEDGED": 0,
        "REDUCTIONS_DETAILS": {
            "RATE": 0.0
        },
        "MESSAGE_BYTES": 0,
        "MESSAGES_UNACKNOWLEDGED": 0,
        "CONSUMER_UTILISATION": null,
        "EXCLUSIVE": false,
        "VHOST": "/",
        "GARBAGE_COLLECTION": {
            "MAX_HEAP_SIZE": 0,
            "MIN_HEAP_SIZE": 233,
            "FULLSWEEP_AFTER": 65535,
            "MINOR_GCS": 15635,
            "MIN_BIN_VHEAP_SIZE": 46422
        },
        "MESSAGES_DETAILS": {
            "RATE": 0.0
        },
        "SLAVE_NODES": [
            "rabbit#server567"
        ],
        "MESSAGE_BYTES_PERSISTENT": 0,
        "POLICY": "ha-all",
        "MESSAGES_PAGED_OUT": 0,
        "NODE": "rabbit#server566",
        "HEAD_MESSAGE_TIMESTAMP": null,
        "DURABLE": false,
        "MESSAGES_READY_RAM": 0,
        "STATE": "running",
        "ARGUMENTS": {},
        "EFFECTIVE_POLICY_DEFINITION": {
            "HA-MODE": "all"
        },
        "MESSAGES_READY": 0,
        "MESSAGES_RAM": 0,
        "MESSAGE_BYTES_READY": 0,
        "SINGLE_ACTIVE_CONSUMER_TAG": null,
        "NAME": "test-04",
        "MESSAGES_PERSISTENT": 0,
        "BACKING_QUEUE_STATUS": {
            "MIRROR_SENDERS": 0,
            "Q1": 0,
            "Q3": 0,
            "Q2": 0,
            "Q4": 0,
            "AVG_ACK_EGRESS_RATE": 0.0,
            "MIRROR_SEEN": 0,
            "LEN": 0,
            "TARGET_RAM_COUNT": "infinity",
            "MODE": "default",
            "NEXT_SEQ_ID": 0,
            "DELTA": [
                "delta",
                "undefined",
                0,
                0,
                "undefined"
            ],
            "AVG_ACK_INGRESS_RATE": 0.0,
            "AVG_EGRESS_RATE": 0.0,
            "AVG_INGRESS_RATE": 0.0
        },
        "MESSAGES": 0,
        "IDLE_SINCE": "2020-10-16 13:50:50",
        "OPERATOR_POLICY": null,
        "SYNCHRONISED_SLAVE_NODES": [
            "rabbit#server567"
        ],
        "MEMORY": 10556,
        "EXCLUSIVE_CONSUMER_TAG": null,
        "MESSAGES_READY_DETAILS": {
            "RATE": 0.0
        },
        "TYPE": "classic",
        "MESSAGES_UNACKNOWLEDGED_DETAILS": {
            "RATE": 0.0
        },
        "MESSAGE_BYTES_RAM": 0
    }
]
Here, I made every key uppercase and can display it as JSON, but I can't find anything to add this "#" to the beginning of each key.
PS: I'm new to Python development.
Thank you very much
Since you mentioned that you have successfully converted every key in a dictionary into an upper case key, why don't you reuse the method and change the part where you do the upper casing into prepending "#":
# the one you provided
def upper_keys(x):
    if isinstance(x, list):
        return [upper_keys(v) for v in x]
    elif isinstance(x, dict):
        return dict((k.upper(), upper_keys(v)) for k, v in x.iteritems())
    else:
        return x

# the modified method
def prepend_hash_keys(x):
    if isinstance(x, list):
        return [prepend_hash_keys(v) for v in x]
    elif isinstance(x, dict):
        # this part changed from k.upper() to "#" + k
        return dict(("#" + k, prepend_hash_keys(v)) for k, v in x.iteritems())
    else:
        return x
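(Note that iteritems() only exists in Python 2, matching the question; on Python 3 both functions would need x.items() instead.)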
Your transform function actually works fine (for Python 2), you just forgot to actually call it! Instead, you call only upper_keys, but not transform:
json = json.dumps(upper_keys(json_data), indent=4) # where's transform?
If you use both one after the other (order does not matter) it should work:
json = {"nested": {"dict": {"with": {"lowercase": "keys"}}}}
print(transform(upper_keys(json)))
# {'#NESTED': {'#DICT': {'#WITH': {'#LOWERCASE': 'keys'}}}}
However, both transform and upper_keys can be simplified a lot using dictionary comprehensions (also available in Python 2), and you can combine both in one function:
def transform_upper(d):
if isinstance(d, dict):
return {"#" + k.upper(): transform_upper(v) for k, v in d.items()}
else:
return d
print(transform_upper(json))
# {'#NESTED': {'#DICT': {'#WITH': {'#LOWERCASE': 'keys'}}}}
From the look of it, you already tried something like that in the append() function.
If you modify that a bit, like below, it may do what you are looking for:
mydict = {
    'name': 1,
    'surname': 2
}

def append(mydict):
    new_dict = {}
    for key, val in mydict.items():
        new_dict['#'+key] = val
    return new_dict

print(append(mydict))

convert pandas dataframe to json with specific format

I am trying to convert the dataframe below (shown as CSV) to the desired JSON.
column_id,column_name,mandatory,column_data_type,column_data_length,_id,data_format,file_type,active_ind
1,PAT_ID,FALSE,VARCHAR,2500,5f2193c39448c44f0c1b65e0,TEXT,FACT,TRUE
2,PAT_NAME,FALSE,VARCHAR,2500,5f2193c39448c44f0c1b65e0,TEXT,FACT,TRUE
3,PAT_AGE,FALSE,VARCHAR,2500,5f2193c39448c44f0c1b65e0,TEXT,FACT,TRUE
Like the JSON mentioned below:
{
    "_id": "5f2193c39448c44f0c1b65e0",
    "data_format": "TEXT",
    "file_type": "FACT",
    "columns": [
        {
            "column_id": 1,
            "column_name": "PAT_ID",
            "mandatory": "false",
            "column_data_type": "VARCHAR",
            "column_data_length": 2500
        },
        {
            "column_id": 2,
            "column_name": "PAT_NAME",
            "mandatory": "false",
            "column_data_type": "VARCHAR",
            "column_data_length": 2500
        }
    ],
    "active_ind": "true"
}
I tried to group by column name and column id with several methods. This groups the columns but not all the values:
cac = df.groupby('column_id').apply(lambda x: x.to_json(orient='records'))
cac = df.to_json(orient='records')
I am not able to separate the _id and columns. Please help me with this.
Here is what I would do:
import json
import pandas as pd

# Load data
df = pd.read_csv('data.csv')

# Create a dict per row for the "columns" key
col_set = ['column_id',
           'column_name',
           'mandatory',
           'column_data_type',
           'column_data_length']
df['columns'] = df[col_set].apply(lambda x: x.to_dict(), axis=1)

reorder = ['column_id',
           'column_name',
           'mandatory',
           'column_data_type',
           'column_data_length',
           'columns',
           'active_ind',
           '_id',
           'data_format',
           'file_type']
df = df[reorder]

# Group similar rows and collect their sub-dicts into a list
col_set_2 = ['_id', 'data_format', 'file_type', 'columns', 'active_ind']
col_set_3 = ['_id', 'data_format', 'file_type', 'active_ind']
df2 = df[col_set_2].groupby(col_set_3)['columns'].apply(lambda x: list(x)).reset_index()
df2 = df2[col_set_2]

# Dataframe to json
parsed = json.loads(df2.to_json(orient='records', indent=4))
result = json.dumps(parsed[0], indent=4)
print(result)
{
    "_id": "5f2193c39448c44f0c1b65e0",
    "data_format": "TEXT",
    "file_type": "FACT",
    "columns": [
        {
            "column_id": 1,
            "column_name": "PAT_ID",
            "mandatory": false,
            "column_data_type": "VARCHAR",
            "column_data_length": 2500
        },
        {
            "column_id": 2,
            "column_name": "PAT_NAME",
            "mandatory": false,
            "column_data_type": "VARCHAR",
            "column_data_length": 2500
        },
        {
            "column_id": 3,
            "column_name": "PAT_AGE",
            "mandatory": false,
            "column_data_type": "VARCHAR",
            "column_data_length": 2500
        }
    ],
    "active_ind": true
}
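For comparison, the same result can be reached without the reorder step by grouping first and building each record directly; a compact sketch assuming the same data.csv layout (the key order in the output differs slightly):

import json
import pandas as pd

df = pd.read_csv('data.csv')
col_set = ['column_id', 'column_name', 'mandatory',
           'column_data_type', 'column_data_length']
group_keys = ['_id', 'data_format', 'file_type', 'active_ind']

records = [
    {**dict(zip(group_keys, keys)), 'columns': grp[col_set].to_dict('records')}
    for keys, grp in df.groupby(group_keys)
]
# default=str guards against numpy scalar types on older pandas versions
print(json.dumps(records[0], indent=4, default=str))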

How to make dataframe table from json file

I have a JSON file that I want to convert to a pandas dataframe, taking some variables from 'tags' and some variables from 'fields':
{
    "tags": {
        "ID": "99909",
        "type": "fff",
        "ID2": "565789"
    },
    "timestamp": 1500079519064,
    "tenant": "dxy",
    "tstable": "data",
    "user": "writer",
    "fields": {
        "a": "0.003",
        "b": "0.011"
    }
}
Required output:
df_out=pd.DataFrame({'ID':[99909],'type':["fff"],'ID2':[565789],"timestamp": [1500079519064],"tenant": ["dxy"],"tstable": ["data"],"user": ["writer"],"a": ["0.003"],"b": ["0.011"]})
print(df_out)
ID type ID2 timestamp tenant tstable user a b
0 99909 fff 565789 1500079519064 dxy data writer 0.003 0.011
Use json_normalize:
j = {
    "tags": {
        "ID": "99909",
        "type": "fff",
        "ID2": "565789"
    },
    "timestamp": 1500079519064,
    "tenant": "dxy",
    "tstable": "data",
    "user": "writer",
    "fields": {
        "a": "0.003",
        "b": "0.011",
    }
}
from pandas.io.json import json_normalize
df = json_normalize(j)
print (df)
timestamp tenant tstable user tags.ID tags.type tags.ID2 fields.a \
0 1500079519064 dxy data writer 99909 fff 565789 0.003
fields.b
0 0.011
Last, if necessary, change the column names with rename:
f = lambda x: x.split('.')[-1]
df = json_normalize(j).rename(columns=f)
print (df)
timestamp tenant tstable user ID type ID2 a b
0 1500079519064 dxy data writer 99909 fff 565789 0.003 0.011
If you have nested columns then you first need to normalize the data:
import pandas as pd
from pandas.io.json import json_normalize
data = [
    {
        "tags": {
            "ID": "99909",
            "type": "fff",
            "ID2": "565789"
        },
        "timestamp": 1500079519064,
        "tenant": "dxy",
        "tstable": "data",
        "user": "writer",
        "fields": {
            "a": "0.003",
            "b": "0.011",
        }
    }
]
df = pd.DataFrame.from_dict(json_normalize(data), orient='columns')
print(df)
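One closing note: the from pandas.io.json import json_normalize used throughout this page was deprecated in pandas 1.0; on current pandas the same calls work as pd.json_normalize, e.g. combined with the rename trick above (a sketch using the data list just shown):

import pandas as pd

# pandas >= 1.0: json_normalize is a top-level function
df = pd.json_normalize(data).rename(columns=lambda c: c.split('.')[-1])
print(df)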