JSON to CSV - Go 4 level deep - json

I would like to extract only a small fraction of my JSON response in a .csv file. However, I need to go to 4 levels deep and I am currently only able to go to 3 level deep. My goal is to have a .csv with 3 columns campaign_id, campaign_name, cost_per_click and 3 lines for each of my campaigns.
Original JSON
{
"318429215527453": {
"conversion_events": {
"data": [
{
"id": "djdfhdf",
"name": "Total",
"cost": 328.14,
"metrics_breakdown": {
"data": [
{
"campaign_id": 2364,
"campaign_name": "uk",
"cost_per_click": 1345
},
{
"campaign_id": 7483,
"campaign_name": "fr",
"cost_per_click": 756
},
{
"campaign_id": 8374,
"campaign_name": "spain",
"cost_per_click": 545
},
{
"campaign_id": 2431,
"campaign_name": "ge",
"cost_per_click": 321
}
],
"paging": {
"cursors": {
"after": "MjUZD"
},
"next": "https://graph.facebook.com/v9.0/xxxx"
}
}
}
],
"summary": {
"count": 1,
"metric_date_range": {
"date_range": {
"begin_date": "2021-01-09T00:00:00+0100",
"end_date": "2021-02-08T00:00:00+0100",
"time_zone": "Europe/Paris"
},
"prior_period_date_range": {
"begin_date": "2020-12-10T00:00:00+0100",
"end_date": "2021-01-09T00:00:00+0100"
}
}
}
},
"id": "xxx"
}
}
reformated.py
import json
with open('campaigns.json') as json_file:
data = json.load(json_file)
reformated_json = data['318429215527453']['conversion_events']['data']
with open('data.json', 'w') as outfile:
json.dump(reformated_json, outfile)
I tried to add ['metrics_breakdown'] or another ['data'] at the end of reformated_json but I am getting TypeError: list indices must be integers or slices, not str.
{
"id": "djdfhdf",
"name": "Total",
"cost": 328.14,
"metrics_breakdown": {
"data": [
{
"campaign_id": 2364,
"campaign_name": "uk",
"cost_per_click": 1345,
},
{
"campaign_id": 7483,
"campaign_name": "fr",
"cost_per_click": 756,
},
{
"campaign_id": 8374,
"campaign_name": "spain",
"cost_per_click": 545,
},
{
"campaign_id": 2431,
"campaign_name": "ge",
"cost_per_click": 321,
},
],
"paging": {
"cursors": {
"after": "MjUZD"
},
"next": "https://graph.facebook.com/v9.0/xxxx"
}
}
}
]

import csv
import json
from typing import Dict, List, Union # typing for easy development
# read json function
def read_json(json_path: str) -> Union[Dict, List]:
with open(json_path, 'r') as file_io:
return json.load(file_io)
# write csv function
def write_csv(data: List[Dict], csv_path: str) -> None:
with open(csv_path, 'w') as file:
fieldnames = set().union(*data)
writer = csv.DictWriter(file, fieldnames=fieldnames,
lineterminator='\n')
writer.writeheader()
writer.writerows(data)
# parse campaigns using a comprehension
def parse_campaigns(data: Dict) -> List[Dict]:
return [row
for value in data.values() # first level (conversion events)
for root_data in value['conversion_events']['data'] # conversion events/data
for row in root_data['metrics_breakdown']['data']] # data/metrics_breakdown/data
json_data = read_json('./campaigns.json')
campaign_data = parse_campaigns(json_data)
write_csv(campaign_data, 'campaigns.csv')
campaigns.csv (I copied the data to multiple root dictionary objects):
cost_per_click,campaign_id,campaign_name
1345,2364,uk
756,7483,fr
545,8374,spain
321,2431,ge
1345,2364,uk
756,7483,fr
545,8374,spain
321,2431,ge

The first data subkey contains a single-element list. Dereference with [0] to get the element, then fetch the next layers of keys. Then a DictWriter can be used to write the CSV lines:
import json
import csv
with open('campaigns.json') as json_file:
data = json.load(json_file)
items = data['318429215527453']['conversion_events']['data'][0]['metrics_breakdown']['data']
with open('data.csv', 'w', newline='') as outfile:
w = csv.DictWriter(outfile,fieldnames=items[0].keys())
w.writeheader()
w.writerows(items)
Output:
campaign_id,campaign_name,cost_per_click
2364,uk,1345
7483,fr,756
8374,spain,545
2431,ge,321

Related

How to identify and explode a nested json file as columns of a dataframe?

I am reframing my question again so that it would be more clear.
My data looks like this .
{
"Research": {
"#xmlns": "http://www.xml.org/2013/2/XML",
"#language": "eng",
"#createDateTime": "2022-03-25T10:12:39Z",
"#researchID": "abcd",
"Product": {
"#productID": "abcd",
"StatusInfo": {
"#currentStatusIndicator": "Yes",
"#statusDateTime": "2022-03-25T12:18:41Z",
"#statusType": "Published"
},
"Source": {
"Organization": {
"#primaryIndicator": "Yes",
"#type": "SellSideFirm",
"OrganizationID": [
{
"#idType": "L1",
"#text": "D827C98E315F"
},
{
"#idType": "TR",
"#text": "3202"
},
{
"#idType": "TR",
"#text": "SZA"
}
],
"OrganizationName": {
"#nameType": "Legal",
"#text": "Citi"
},
"PersonGroup": {
"PersonGroupMember": {
"#primaryIndicator": "Yes",
"#sequence": "1",
"Person": {
"#personID": "tr56",
"FamilyName": "Wang",
"GivenName": "Bond",
"DisplayName": "Bond Wang",
"Biography": "Bond Wang is a",
"BiographyFormatted": "Bond Wang",
"PhotoResourceIdRef": "AS44556"
}
}
}
}
},
"Content": {
"Title": "Premier",
"Abstract": "None",
"Synopsis": "Premier’s solid 1H22 result .",
"Resource": [
{
"#language": "eng",
"#primaryIndicator": "Yes",
"#resourceID": "9553",
"Length": {
"#lengthUnit": "Pages",
"#text": "17"
},
"MIMEType": "text/html",
"URL": "https://www.DFKJG.com/rendition/eppublic"
},
{
"#language": "eng",
"#primaryIndicator": "No",
"#resourceID": "4809",
"Length": {
"#lengthUnit": "Pages",
"#text": "17"
},
"MIMEType": "ABS/pdf",
"Name": "asdf.pdf",
"Comments": "fr5.pdf"
},
{
"#language": "eng",
"#primaryIndicator": "No",
"#resourceID": "6d13a965723e",
"Length": {
"#lengthUnit": "Pages",
"#text": "17"
},
"MIMEType": "text/html",
"URL": "https://www.dfgdfg.com/"
},
{
"#primaryIndicator": "No",
"#resourceID": "709c7bdb1c99",
"MIMEType": "tyy/image",
"URL": "https://ir.ght.com"
},
{
"#primaryIndicator": "No",
"#resourceID": "gfjhgj",
"MIMEType": "gtty/image",
"URL": "https://ir.gtty.com"
}
]
},
"Context": {
"#external": "Yes",
"IssuerDetails": {
"Issuer": {
"#issuerType": "Corporate",
"#primaryIndicator": "Yes",
"SecurityDetails": {
"Security": {
"#estimateAction": "Revision",
"#primaryIndicator": "Yes",
"#targetPriceAction": "Increase",
"SecurityID": [
{
"#idType": "RIC",
"#idValue": "PMV.AX",
"#publisherDefinedValue": "RIC"
},
{
"#idType": "Bloomberg",
"#idValue": "PMV#AU"
},
{
"#idType": "SEDOL",
"#idValue": "6699781"
}
],
"SecurityName": "Premier Investments Ltd",
"AssetClass": {
"#assetClass": "Equity"
},
"AssetType": {
"#assetType": "Stock"
},
"SecurityType": {
"#securityType": "Common"
},
"Rating": {
"#rating": "NeutralSentiment",
"#ratingType": "Rating",
"#aspect": "Investment",
"#ratingDateTime": "2020-07-31T08:24:37Z",
"RatingEntity": {
"#ratingEntity": "PublisherDefined",
"PublisherDefinedValue": "Citi"
}
}
}
},
"IssuerID": {
"#idType": "PublisherDefined",
"#idValue": "PMV.AX",
"#publisherDefinedValue": "TICKER"
},
"IssuerName": {
"#nameType": "Legal",
"NameValue": "Premier Investments Ltd"
}
}
},
"ProductDetails": {
"#periodicalIndicator": "No",
"#publicationDateTime": "2022-03-25T12:18:41Z",
"ProductCategory": {
"#productCategory": "Report"
},
"ProductFocus": {
"#focus": "Issuer",
"#primaryIndicator": "Yes"
},
"EntitlementGroup": {
"Entitlement": [
{
"#includeExcludeIndicator": "Include",
"#primaryIndicator": "No",
"AudienceTypeEntitlement": {
"#audienceType": "PublisherDefined",
"#entitlementContext": "TR",
"#text": "20012"
}
},
{
"#includeExcludeIndicator": "Include",
"#primaryIndicator": "No",
"AudienceTypeEntitlement": {
"#audienceType": "PublisherDefined",
"#entitlementContext": "TR",
"#text": "2001"
}
}
]
}
},
"ProductClassifications": {
"Discipline": {
"#disciplineType": "Investment",
"#researchApproach": "Fundamental"
},
"Subject": {
"#publisherDefinedValue": "TREPS",
"#subjectValue": "PublisherDefined"
},
"Country": {
"#code": "AU",
"#primaryIndicator": "Yes"
},
"Region": {
"#primaryIndicator": "Yes",
"#emergingIndicator": "No",
"#regionType": "Australasia"
},
"AssetClass": {
"#assetClass": "Equity"
},
"AssetType": {
"#assetType": "Stock"
},
"SectorIndustry": [
{
"#classificationType": "GICS",
"#code": "25201040",
"#focusLevel": "Yes",
"#level": "4",
"#primaryIndicator": "Yes",
"Name": "Household Appliances"
},
{
"#classificationType": "GICS",
"#code": "25504020",
"#focusLevel": "Yes",
"#level": "4",
"#primaryIndicator": "Yes",
"Name": "Computer & Electronics Retail"
},
{
"#classificationType": "GICS",
"#code": "25504040",
"#focusLevel": "Yes",
"#level": "4",
"#primaryIndicator": "Yes",
"Name": "Specialty Stores"
},
{
"#classificationType": "GICS",
"#code": "25504030",
"#focusLevel": "Yes",
"#level": "4",
"#primaryIndicator": "Yes",
"Name": "Home Improvement Retail"
},
{
"#classificationType": "GICS",
"#code": "25201050",
"#focusLevel": "Yes",
"#level": "4",
"#primaryIndicator": "Yes",
"Name": "Housewares & Specialties"
}
]
}
}
}
}
}
I want to explode all of its elements into data frame .
The no of columns that has list like structure can change also.
Basically we will not be knowing if next input will have few column or more columns to be exploded .
This is what i have tried so far but it looks like it does not give me correct answer .
Also the column values i have hardcoded but it should identify and then explode.
import xmltodict as xmltodict
from pprint import pprint
import pandas as pd
import json
from tabulate import tabulate
dict =(xmltodict.parse("""xml data"""))
json_str = json.dumps(dict)
resp = json.loads(json_str)
print(resp)
df = pd.json_normalize(resp)
cols=['Research.Product.Source.Organization.OrganizationID','Research.Product.Content.Resource','Research.Product.Context.IssuerDetails.Issuer.SecurityDetails.Security.SecurityID','Research.Product.Context.ProductDetails.EntitlementGroup.Entitlement','Research.Product.Context.ProductClassifications.SectorIndustry']
def expplode_columns(df, cols):
df_e = df.copy()
for c in cols:
df_e = df_e.explode(c, ignore_index=True)
return df_e
df2 = expplode_columns(df, cols)
print(tabulate(df2, headers="keys", tablefmt="psql"))
# df2.to_csv('dataframe.csv', header=True, index=False)
As suggested in the comments, you can define a helper function in pure Python to recursively flatten the nested values of your data.
So, with the json file you provided, here is one way to do it:
def flatten(data, new_data):
"""Recursive helper function.
Args:
data: nested dictionary.
new_data: empty dictionary.
Returns:
Flattened dictionary.
"""
for key, value in data.items():
if isinstance(value, dict):
flatten(value, new_data)
if isinstance(value, str) or isinstance(value, int) or isinstance(value, list):
new_data[key] = value
return new_data
And then:
import json
import pandas as pd
with open("file.json") as f:
content = json.load(f)
df = pd.DataFrame.from_dict(flatten(content, {}), orient="index").T
From here, you can deal with columns which contains lists of dictionaries with identical keys, but different values, by exploding them and repeating the other values, like this:
cols_with_lists = [col for col in df.columns if isinstance(df.loc[0, col], list)]
for col in cols_with_lists:
temp_df = pd.concat(
[pd.DataFrame(item, index=[i]) for i, item in enumerate(df.loc[0, col])],
axis=0,
)
df = pd.concat([df.drop(columns=[col]), temp_df], axis=1).fillna(method="ffill")
So that, finally, the json file is entirely flattened:
print(df)
# Output
#xmlns #language ... #primaryIndicator Name
0 http://www.xml.org/2013/2/XML eng ... Yes Household Appliances
1 http://www.xml.org/2013/2/XML eng ... Yes Computer & Electronics Retail
2 http://www.xml.org/2013/2/XML eng ... Yes Specialty Stores
3 http://www.xml.org/2013/2/XML eng ... Yes Home Improvement Retail
4 http://www.xml.org/2013/2/XML eng ... Yes Housewares & Specialties
[5 rows x 73 columns]
Little hacky but you can extract columns that has a list type in it. Then use reduce to recursively explode and normalize all columns until there are no more list/object.
I haven't tested well but something like this.
from functools import reduce
def full_explode_normalize(df):
# Extract list columns
explode_cols = [x for x in df.columns if isinstance(df.iloc[0][x], list)]
if len(explode_cols) < 1:
return df
# Explode and normalize the list
df = reduce(_explode, explode_cols, df)
return df
def _explode(df, col):
df = df.explode(col)
if isinstance(df.iloc[0][col], list):
df = _explode(df, col)
elif isinstance(df.iloc[0][col], object):
df_child = pd.json_normalize(df[col])
# To prevent column name collision, add the parent column name as prefix.
df_child.columns = [f'{col}.{x}' for x in df_child.columns]
df = pd.concat([df.loc[:, ~df.columns.isin([col])].reset_index(drop=True), df_child], axis=1)
return df

How to get required json output from complex nested json format.?

My original file is in CSV format which I have converted to python JSON array to JSON Sring.
jsonfile
<class 'list'>
<class 'dict'>
[
{
"key": "timestamp",
"source": "eia007",
"turnover": "65million",
"url": "abc.com",
"record": "",
"loc.reg": "nord000",
"loc.count": "abs39i5",
"loc.town": "cold54",
"co.gdp": "nscrt77",
"co.pop.min": "min50",
"co.pop.max": "max75",
"co.rev": "",
"chain.system": "5t5t5",
"chain.type": "765ef",
"chain.strat": "",
}
]
I would like to get the output as below:
{
"timestamp001": {
"key": "timestamp001",
"phNo": "ner007",
"turnover": "65million",
"url": "abc.com",
"record": "",
"loc": {
"reg": "nord000",
"count": "abs39i5",
"town": "cold54"
},
"co": {
"form": "nscrt77",
"pop": {
"min": "min50",
"max": "max75"
},
"rev: ""
},
"chain":{
"system": "5t5t5",
"type": "765ef",
"strat": ""
}
...
}
...
}
]
I have tried different options; tried to enumerate, but cannot get the required output. Please help me with this. Thanks in advance.
You can use something like this to create the nested dict:
import json
def unflatten(somedict):
unflattened = {}
for key, value in somedict.items():
splitkey = key.split(".")
print(f"doing {key} {value} {splitkey}")
# subdict is the dict that goes deeper in the nested structure
subdict = unflattened
for subkey in splitkey[:-1]:
# if this is the first time we see this key, add it
if subkey not in subdict:
subdict[subkey] = {}
# shift the subdict a level deeper
subdict = subdict[subkey]
# add the value
subdict[splitkey[-1]] = value
return unflattened
data = {
"key": "timestamp",
"source": "eia007",
"turnover": "65million",
"url": "abc.com",
"record": "",
"loc.reg": "nord000",
"loc.count": "abs39i5",
"loc.town": "cold54",
"co.gdp": "nscrt77",
"co.pop.min": "min50",
"co.pop.max": "max75",
"co.rev": "",
"chain.system": "5t5t5",
"chain.type": "765ef",
"chain.strat": "",
}
unflattened = unflatten(data)
print(json.dumps(unflattened, indent=4))
Which produces:
{
"key": "timestamp",
"source": "eia007",
"turnover": "65million",
"url": "abc.com",
"record": "",
"loc": {
"reg": "nord000",
"count": "abs39i5",
"town": "cold54"
},
"co": {
"gdp": "nscrt77",
"pop": {
"min": "min50",
"max": "max75"
},
"rev": ""
},
"chain": {
"system": "5t5t5",
"type": "765ef",
"strat": ""
}
}
Cheers!

Parsing JSON to CSV in Groovy

I'm trying to parse json file to csv. Could you help?
example json:
{
"expand": "schema,names",
"startAt": 0,
"maxResults": 50,
"total": 21,
"issues": [
{
"expand": "operations,versionedRepresentations",
"id": "217580",
"self": "issue/217580",
"key": "ART-4070",
"fields": {"summary": "#[ART] Pre.3 Verification \\"S\\""}
},
{
"expand": "operations,versionedRepresentations",
"id": "217579",
"self": "issue/217579",
"key": "ART-4069",
"fields": {"summary": "Verification \\"C\\""}
},
{
"expand": "operations,versionedRepresentations",
"id": "217577",
"self": "issue/217577",
"key": "ART-4068",
"fields": {"summary": "#[ART] Enum type"}
}
]
}
result csv should be like:
key;summary
ART-4070;#[ART] Pre.3 Verification \"S\"
ART-4069;Verification \"C\"
ART-4068;#[ART] Enum type
I've tried such a code:
import groovy.json.*
def jsonSlurper = new JsonSlurper()
def json = '''
{
"expand": "schema,names",
"startAt": 0,
"maxResults": 50,
"total": 21,
"issues": [
{
"expand": "operations,versionedRepresentations",
"id": "217580",
"self": "issue/217580",
"key": "ART-4070",
"fields": {"summary": "#[ART] Pre.3 Verification \\"S\\""}
},
{
"expand": "operations,versionedRepresentations",
"id": "217579",
"self": "issue/217579",
"key": "ART-4069",
"fields": {"summary": "Verification \\"C\\""}
},
{
"expand": "operations,versionedRepresentations",
"id": "217577",
"self": "issue/217577",
"key": "ART-4068",
"fields": {"summary": "#[ART] Enum type"}
}
]
}
'''
def obj = jsonSlurper.parse(json)
def columns = obj.issues*.keySet().flatten().unique()
// remove nulls
def encode = { e -> e == null ? '' : e }
// Print all the column names
println columns.collect { c -> encode( c ) }.join( ';' )
// create all the rows
println obj.issues.collect { row ->
// A row at a time
columns.collect { colName -> encode( row[ colName ] ) }.join( ';' )
}.join( '\n' )
but result is wrong:
expand;id;self;key;fields
operations,versionedRepresentations;217580;issue/217580;ART-4070;[summary:#[ART] Pre.3 Verification "S"]
operations,versionedRepresentations;217579;issue/217579;ART-4069;[summary:Verification "C"]
operations,versionedRepresentations;217577;issue/217577;ART-4068;[summary:#[ART] Enum type]
how can i extract only what i want from json file? I need only two columns:key,summary and values for them.
You want to extract only specific information from your list of issues
and you need different strategies to extract those. So I'd use
a "configuration" to describe the extraction (see the map config
below). Then the code is quite close to your original one (extracted
some common code etc)
import groovy.json.*
def config = [ // header -> extractor
"key": { it.key },
"summary": { it.fields.summary }
]
def encode(e) { // help with nulls; quote the separator
(e ?: "").replaceAll(";", "\\;")
}
def csvLine(items) { // write items as "CSV"
println(items.collect{ encode it }.join(";"))
}
// main
def obj = new JsonSlurper().parse("data.json" as File)
csvLine(config.keySet())
obj.issues.each{ issue ->
csvLine(config.values().collect{ f -> f issue })
}

Python: Combine multiple lists into one JSON array

I want to merge several lists into one JSON array.
These are my two lists:
address = ['address1','address2']
temp = ['temp1','temp2']
I combine both lists by the following call and create a JSON .
new_list = list(map(list, zip(address, temp)))
jsonify({
'data': new_list
})
This is my result for the call:
{
"data": [
[
"address1",
"temp1"
],
[
"address2",
"temp2"
]
]
}
However, I would like to receive the following issue. How do I do that and how can I insert the identifier address and hello.
{
"data": [
{
"address": "address1",
"temp": "temp1"
},
{
"address": "address2",
"temp": "temp2"
}
]
}
You can use a list-comprehension:
import json
address = ['address1','address2']
temp = ['temp1','temp2']
d = {'data': [{'address': a, 'temp': t} for a, t in zip(address, temp)]}
print( json.dumps(d, indent=4) )
Prints:
{
"data": [
{
"address": "address1",
"temp": "temp1"
},
{
"address": "address2",
"temp": "temp2"
}
]
}
You can just change your existing code like this. That lambda function will do the trick of converting it into a dict.
address = ['address1','address2']
temp = ['temp1','temp2']
new_list = list(map(lambda x : {'address': x[0], 'temp': x[1]}, zip(address, temp)))
jsonify({
'data': new_list
})

Json Array to Map using JsonSlurper

I want to create a map of data with the pid value as the key and the name as the array for a json array. Here is the json structure:
{
"measurements": [
{
"pid": 6691,
"name": "lung",
"measurement": "qualityFactor",
},
{
"pid": 1106,
"name": "kidney",
"measurement": "qualityFactor",
},
{
"id": 119,
"name": "pancreas",
"measurement": "qualityFactor",
},
]
}
Here is my attempt with Groovy but I'm stuck:
def jsonSlurper= new JsonSlurper()
Object objs=jsonSlurper.parseText(jsonData)
List pp =objs.data
Map<String,String> m=new HashMap()
pp.each{ it ->
it.collect{Map mm ->
println "Map m is ${mm}"
}
}
I want the map to look like
["6691" : "lung" , "1106" :"kidney" ....] etc.
Hiw can I accomplish this?
As a full answer with corrected JSON:
def json = '''{
"measurements": [
{
"pid": 6691,
"name": "lung",
"measurement": "qualityFactor",
},
{
"pid": 1106,
"name": "kidney",
"measurement": "qualityFactor",
},
{
"pid": 119,
"name": "pancreas",
"measurement": "qualityFactor",
}
]
}
'''
import groovy.json.*
def parsed = new JsonSlurper().parseText(json)
def map = map.measurements.collectEntries { [it.pid.toString(), it.name] }