How to identify and explode a nested JSON file as columns of a dataframe?

I am reframing my question so that it is clearer.
My data looks like this:
{
  "Research": {
    "#xmlns": "http://www.xml.org/2013/2/XML",
    "#language": "eng",
    "#createDateTime": "2022-03-25T10:12:39Z",
    "#researchID": "abcd",
    "Product": {
      "#productID": "abcd",
      "StatusInfo": {
        "#currentStatusIndicator": "Yes",
        "#statusDateTime": "2022-03-25T12:18:41Z",
        "#statusType": "Published"
      },
      "Source": {
        "Organization": {
          "#primaryIndicator": "Yes",
          "#type": "SellSideFirm",
          "OrganizationID": [
            {
              "#idType": "L1",
              "#text": "D827C98E315F"
            },
            {
              "#idType": "TR",
              "#text": "3202"
            },
            {
              "#idType": "TR",
              "#text": "SZA"
            }
          ],
          "OrganizationName": {
            "#nameType": "Legal",
            "#text": "Citi"
          },
          "PersonGroup": {
            "PersonGroupMember": {
              "#primaryIndicator": "Yes",
              "#sequence": "1",
              "Person": {
                "#personID": "tr56",
                "FamilyName": "Wang",
                "GivenName": "Bond",
                "DisplayName": "Bond Wang",
                "Biography": "Bond Wang is a",
                "BiographyFormatted": "Bond Wang",
                "PhotoResourceIdRef": "AS44556"
              }
            }
          }
        }
      },
      "Content": {
        "Title": "Premier",
        "Abstract": "None",
        "Synopsis": "Premier’s solid 1H22 result .",
        "Resource": [
          {
            "#language": "eng",
            "#primaryIndicator": "Yes",
            "#resourceID": "9553",
            "Length": {
              "#lengthUnit": "Pages",
              "#text": "17"
            },
            "MIMEType": "text/html",
            "URL": "https://www.DFKJG.com/rendition/eppublic"
          },
          {
            "#language": "eng",
            "#primaryIndicator": "No",
            "#resourceID": "4809",
            "Length": {
              "#lengthUnit": "Pages",
              "#text": "17"
            },
            "MIMEType": "ABS/pdf",
            "Name": "asdf.pdf",
            "Comments": "fr5.pdf"
          },
          {
            "#language": "eng",
            "#primaryIndicator": "No",
            "#resourceID": "6d13a965723e",
            "Length": {
              "#lengthUnit": "Pages",
              "#text": "17"
            },
            "MIMEType": "text/html",
            "URL": "https://www.dfgdfg.com/"
          },
          {
            "#primaryIndicator": "No",
            "#resourceID": "709c7bdb1c99",
            "MIMEType": "tyy/image",
            "URL": "https://ir.ght.com"
          },
          {
            "#primaryIndicator": "No",
            "#resourceID": "gfjhgj",
            "MIMEType": "gtty/image",
            "URL": "https://ir.gtty.com"
          }
        ]
      },
      "Context": {
        "#external": "Yes",
        "IssuerDetails": {
          "Issuer": {
            "#issuerType": "Corporate",
            "#primaryIndicator": "Yes",
            "SecurityDetails": {
              "Security": {
                "#estimateAction": "Revision",
                "#primaryIndicator": "Yes",
                "#targetPriceAction": "Increase",
                "SecurityID": [
                  {
                    "#idType": "RIC",
                    "#idValue": "PMV.AX",
                    "#publisherDefinedValue": "RIC"
                  },
                  {
                    "#idType": "Bloomberg",
                    "#idValue": "PMV#AU"
                  },
                  {
                    "#idType": "SEDOL",
                    "#idValue": "6699781"
                  }
                ],
                "SecurityName": "Premier Investments Ltd",
                "AssetClass": {
                  "#assetClass": "Equity"
                },
                "AssetType": {
                  "#assetType": "Stock"
                },
                "SecurityType": {
                  "#securityType": "Common"
                },
                "Rating": {
                  "#rating": "NeutralSentiment",
                  "#ratingType": "Rating",
                  "#aspect": "Investment",
                  "#ratingDateTime": "2020-07-31T08:24:37Z",
                  "RatingEntity": {
                    "#ratingEntity": "PublisherDefined",
                    "PublisherDefinedValue": "Citi"
                  }
                }
              }
            },
            "IssuerID": {
              "#idType": "PublisherDefined",
              "#idValue": "PMV.AX",
              "#publisherDefinedValue": "TICKER"
            },
            "IssuerName": {
              "#nameType": "Legal",
              "NameValue": "Premier Investments Ltd"
            }
          }
        },
        "ProductDetails": {
          "#periodicalIndicator": "No",
          "#publicationDateTime": "2022-03-25T12:18:41Z",
          "ProductCategory": {
            "#productCategory": "Report"
          },
          "ProductFocus": {
            "#focus": "Issuer",
            "#primaryIndicator": "Yes"
          },
          "EntitlementGroup": {
            "Entitlement": [
              {
                "#includeExcludeIndicator": "Include",
                "#primaryIndicator": "No",
                "AudienceTypeEntitlement": {
                  "#audienceType": "PublisherDefined",
                  "#entitlementContext": "TR",
                  "#text": "20012"
                }
              },
              {
                "#includeExcludeIndicator": "Include",
                "#primaryIndicator": "No",
                "AudienceTypeEntitlement": {
                  "#audienceType": "PublisherDefined",
                  "#entitlementContext": "TR",
                  "#text": "2001"
                }
              }
            ]
          }
        },
        "ProductClassifications": {
          "Discipline": {
            "#disciplineType": "Investment",
            "#researchApproach": "Fundamental"
          },
          "Subject": {
            "#publisherDefinedValue": "TREPS",
            "#subjectValue": "PublisherDefined"
          },
          "Country": {
            "#code": "AU",
            "#primaryIndicator": "Yes"
          },
          "Region": {
            "#primaryIndicator": "Yes",
            "#emergingIndicator": "No",
            "#regionType": "Australasia"
          },
          "AssetClass": {
            "#assetClass": "Equity"
          },
          "AssetType": {
            "#assetType": "Stock"
          },
          "SectorIndustry": [
            {
              "#classificationType": "GICS",
              "#code": "25201040",
              "#focusLevel": "Yes",
              "#level": "4",
              "#primaryIndicator": "Yes",
              "Name": "Household Appliances"
            },
            {
              "#classificationType": "GICS",
              "#code": "25504020",
              "#focusLevel": "Yes",
              "#level": "4",
              "#primaryIndicator": "Yes",
              "Name": "Computer & Electronics Retail"
            },
            {
              "#classificationType": "GICS",
              "#code": "25504040",
              "#focusLevel": "Yes",
              "#level": "4",
              "#primaryIndicator": "Yes",
              "Name": "Specialty Stores"
            },
            {
              "#classificationType": "GICS",
              "#code": "25504030",
              "#focusLevel": "Yes",
              "#level": "4",
              "#primaryIndicator": "Yes",
              "Name": "Home Improvement Retail"
            },
            {
              "#classificationType": "GICS",
              "#code": "25201050",
              "#focusLevel": "Yes",
              "#level": "4",
              "#primaryIndicator": "Yes",
              "Name": "Housewares & Specialties"
            }
          ]
        }
      }
    }
  }
}
I want to explode all of its elements into a dataframe.
The number of columns with list-like structure can also change: we will not know in advance whether the next input will have fewer or more columns to be exploded.
This is what I have tried so far, but it does not give me the correct answer.
Also, I have hardcoded the column names, but the code should identify them and then explode them.
import json

import pandas as pd
import xmltodict
from tabulate import tabulate

# avoid calling the variable "dict": it shadows the builtin
parsed = xmltodict.parse("""xml data""")
json_str = json.dumps(parsed)
resp = json.loads(json_str)
print(resp)

df = pd.json_normalize(resp)
cols = [
    'Research.Product.Source.Organization.OrganizationID',
    'Research.Product.Content.Resource',
    'Research.Product.Context.IssuerDetails.Issuer.SecurityDetails.Security.SecurityID',
    'Research.Product.Context.ProductDetails.EntitlementGroup.Entitlement',
    'Research.Product.Context.ProductClassifications.SectorIndustry',
]

def explode_columns(df, cols):
    df_e = df.copy()
    for c in cols:
        df_e = df_e.explode(c, ignore_index=True)
    return df_e

df2 = explode_columns(df, cols)
print(tabulate(df2, headers="keys", tablefmt="psql"))
# df2.to_csv('dataframe.csv', header=True, index=False)

As suggested in the comments, you can define a helper function in pure Python to recursively flatten the nested values of your data.
So, with the json file you provided, here is one way to do it:
def flatten(data, new_data):
    """Recursive helper function.

    Args:
        data: nested dictionary.
        new_data: empty dictionary.

    Returns:
        Flattened dictionary.
    """
    for key, value in data.items():
        if isinstance(value, dict):
            flatten(value, new_data)
        if isinstance(value, (str, int, list)):
            new_data[key] = value
    return new_data
And then:
import json
import pandas as pd

with open("file.json") as f:
    content = json.load(f)

df = pd.DataFrame.from_dict(flatten(content, {}), orient="index").T
From here, you can deal with columns that contain lists of dictionaries with identical keys but different values by exploding them and repeating the other values, like this:
cols_with_lists = [col for col in df.columns if isinstance(df.loc[0, col], list)]
for col in cols_with_lists:
    temp_df = pd.concat(
        [pd.DataFrame(item, index=[i]) for i, item in enumerate(df.loc[0, col])],
        axis=0,
    )
    # ffill repeats the scalar values down the newly created rows
    df = pd.concat([df.drop(columns=[col]), temp_df], axis=1).ffill()
So that, finally, the json file is entirely flattened:
print(df)
# Output
#xmlns #language ... #primaryIndicator Name
0 http://www.xml.org/2013/2/XML eng ... Yes Household Appliances
1 http://www.xml.org/2013/2/XML eng ... Yes Computer & Electronics Retail
2 http://www.xml.org/2013/2/XML eng ... Yes Specialty Stores
3 http://www.xml.org/2013/2/XML eng ... Yes Home Improvement Retail
4 http://www.xml.org/2013/2/XML eng ... Yes Housewares & Specialties
[5 rows x 73 columns]
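Note that flatten() keeps only the last value seen when the same key occurs at several nesting levels, and this JSON repeats keys such as #primaryIndicator and #text many times. If you need to keep those apart, a minimal variant that prefixes each key with its parent path (a sketch, not part of the answer above) could look like this:

def flatten_prefixed(data, new_data, prefix=""):
    # Same idea as flatten(), but keys carry their parent path, so repeated
    # names like "#primaryIndicator" do not overwrite each other.
    for key, value in data.items():
        full_key = f"{prefix}.{key}" if prefix else key
        if isinstance(value, dict):
            flatten_prefixed(value, new_data, full_key)
        else:
            new_data[full_key] = value
    return new_data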

A little hacky, but you can extract the columns that have a list type in them, then use reduce to recursively explode and normalize all columns until there are no more lists or objects.
I haven't tested this well, but it's something like this:
from functools import reduce

import pandas as pd

def full_explode_normalize(df):
    # Extract list columns
    explode_cols = [x for x in df.columns if isinstance(df.iloc[0][x], list)]
    if len(explode_cols) < 1:
        return df
    # Explode and normalize the lists
    df = reduce(_explode, explode_cols, df)
    return df

def _explode(df, col):
    df = df.explode(col)
    if isinstance(df.iloc[0][col], list):
        df = _explode(df, col)
    elif isinstance(df.iloc[0][col], dict):
        # The original check used `object`, which every Python value passes;
        # `dict` is what is actually meant here.
        df_child = pd.json_normalize(df[col])
        # To prevent column name collisions, add the parent column name as a prefix.
        df_child.columns = [f'{col}.{x}' for x in df_child.columns]
        df = pd.concat([df.loc[:, ~df.columns.isin([col])].reset_index(drop=True), df_child], axis=1)
    return df
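For example, applied to the frame from the question (a sketch; resp is the dict parsed with xmltodict earlier, and full_explode_normalize is defined above):

import pandas as pd

# resp holds the parsed JSON dict from the question
df = pd.json_normalize(resp)
df_flat = full_explode_normalize(df)
print(df_flat.columns.tolist())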

Related

How to search a JSON hash using a Regex?

I'm using an API to return all macros to me. I am trying to return all the "macros" where "actions" contains a "value" matching my regex pattern, which I will link below.
I've tried the code below and other methods, but it returns nil for present values. Any tips appreciated.
macros["value"].select { |m| m['key'] == 'value' }.first['/^DE([0-9a-zA-Z]\s?){20}$/gm']
API result snippet:
jsObj
=> {"macros"=>
[{"url"=>"https://s/1900002708354.json",
"id"=>1900002708354,
"title"=>"Append Signature",
"active"=>true,
"updated_at"=>"2021-10-22T14:11:15Z",
"created_at"=>"2021-10-22T14:11:15Z",
"position"=>10001,
"description"=>"This macro appends a signature to the message ",
"actions"=>[{"field"=>"comment_value_html", "value"=>"<p>Mit besten Grüßen,</p><p>{{current_user.name}} [{{ticket.account}}] <br></p><p><br></p><p>{{dc.signature_email}}<br></p><p><br></p>"}],
"restriction"=>nil},
{"url"=>"949.json",
"id"=>59071949,
"title"=>"information",
"description"=>nil,
"actions"=>[{"field"=>"priority", "value"=>"low"}, {"field"=>"comment_value", "value"=>"DE89370400440532013000" "DE89 3704
0044 0532 0130 00"
"}],
"restriction"=>nil},
Desired Result:
{
"macros": [
{
"url": "x.json",
"id": 1900002708354,
"actions": [
{
"field": "comment_value_html",
"value": "DE89 3704 0044 0532 0130 00"
}
],
"restriction": null
},
{
"url": "x.json",
"id": 59071949,
"actions": [
{
"field": "priority",
"value": "low"
},
{
"field": "comment_value",
"value": "DE89 3704 0044 0532 0130 00
"
}
],
"restriction": null
},
Given that macros is the JSON object containing the macros data, you can use
macros.select { |m| m["actions"].any? { |w| /\ADE(?:[0-9a-zA-Z]\s?){20}\z/.match?(w["value"]) } }
Here is a Ruby demo:
require 'json'

j = <<-DATA
{
  "macros": [
    {
      "url": "x.json",
      "id": 1900002708354,
      "actions": [
        {
          "field": "comment_value_html",
          "value": "DE11111111111222222220"
        }
      ],
      "restriction": null
    },
    {
      "url": "x.json",
      "id": 59071949,
      "actions": [
        {
          "field": "priority",
          "value": "low"
        },
        {
          "field": "comment_value",
          "value": "DE12345678901234567890"
        }
      ],
      "restriction": null
    }
  ]
}
DATA

jsObj = JSON.parse(j)
macros = jsObj['macros']
puts jsObj['macros'].select { |m| m["actions"].any? { |w| /\ADE(?:[0-9a-zA-Z]\s?){20}\z/.match?(w["value"]) } }
Output:
{"url"=>"x.json", "id"=>1900002708354, "actions"=>[{"field"=>"comment_value_html", "value"=>"DE11111111111222222220"}], "restriction"=>nil}
{"url"=>"x.json", "id"=>59071949, "actions"=>[{"field"=>"priority", "value"=>"low"}, {"field"=>"comment_value", "value"=>"DE12345678901234567890"}], "restriction"=>nil}
The main part, .select { |m| m["actions"].any? { |w| /\ADE(?:[0-9a-zA-Z]\s?){20}\z/.match?(w["value"]) } }, keeps every macro whose actions array contains at least one hash whose value matches the regex: \A and \z anchor the pattern to the whole string, and (?:[0-9a-zA-Z]\s?){20} matches twenty alphanumeric characters, each optionally followed by a whitespace character.

How to get the required JSON output from a complex nested JSON format?

My original file is in CSV format, which I have converted to a Python JSON array and then to a JSON string.
jsonfile
<class 'list'>
<class 'dict'>
[
{
"key": "timestamp",
"source": "eia007",
"turnover": "65million",
"url": "abc.com",
"record": "",
"loc.reg": "nord000",
"loc.count": "abs39i5",
"loc.town": "cold54",
"co.gdp": "nscrt77",
"co.pop.min": "min50",
"co.pop.max": "max75",
"co.rev": "",
"chain.system": "5t5t5",
"chain.type": "765ef",
"chain.strat": "",
}
]
I would like to get the output as below:
{
"timestamp001": {
"key": "timestamp001",
"phNo": "ner007",
"turnover": "65million",
"url": "abc.com",
"record": "",
"loc": {
"reg": "nord000",
"count": "abs39i5",
"town": "cold54"
},
"co": {
"form": "nscrt77",
"pop": {
"min": "min50",
"max": "max75"
},
"rev: ""
},
"chain":{
"system": "5t5t5",
"type": "765ef",
"strat": ""
}
...
}
...
}
]
I have tried different options, including enumerate, but cannot get the required output. Please help me with this. Thanks in advance.
You can use something like this to create the nested dict:
import json

def unflatten(somedict):
    unflattened = {}
    for key, value in somedict.items():
        splitkey = key.split(".")
        print(f"doing {key} {value} {splitkey}")
        # subdict is the dict that goes deeper in the nested structure
        subdict = unflattened
        for subkey in splitkey[:-1]:
            # if this is the first time we see this key, add it
            if subkey not in subdict:
                subdict[subkey] = {}
            # shift the subdict a level deeper
            subdict = subdict[subkey]
        # add the value
        subdict[splitkey[-1]] = value
    return unflattened

data = {
    "key": "timestamp",
    "source": "eia007",
    "turnover": "65million",
    "url": "abc.com",
    "record": "",
    "loc.reg": "nord000",
    "loc.count": "abs39i5",
    "loc.town": "cold54",
    "co.gdp": "nscrt77",
    "co.pop.min": "min50",
    "co.pop.max": "max75",
    "co.rev": "",
    "chain.system": "5t5t5",
    "chain.type": "765ef",
    "chain.strat": "",
}

unflattened = unflatten(data)
print(json.dumps(unflattened, indent=4))
Which produces:
{
    "key": "timestamp",
    "source": "eia007",
    "turnover": "65million",
    "url": "abc.com",
    "record": "",
    "loc": {
        "reg": "nord000",
        "count": "abs39i5",
        "town": "cold54"
    },
    "co": {
        "gdp": "nscrt77",
        "pop": {
            "min": "min50",
            "max": "max75"
        },
        "rev": ""
    },
    "chain": {
        "system": "5t5t5",
        "type": "765ef",
        "strat": ""
    }
}
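Since your file is a list of such flat records and your desired output is keyed by each record's key value, one way to finish the job (a sketch, assuming the parsed list is in json_data and the key values are unique) is:

# json_data stands in for the parsed list of flat records
json_data = [data]
nested = {record["key"]: unflatten(record) for record in json_data}
print(json.dumps(nested, indent=4))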
Cheers!

Snowflake - Querying Nested JSON

I need some help querying this JSON file I've ingested into a temp table in Snowflake. So, I've created a JSON_DATA variant column and plan to query and do a COPY INTO another table, but my query isn't working yet... I feel I'm close (possibly?)
JSON layout:
{
"nextPage": "01",
"page": "0",
"status": "ok",
"transactions": [
{
"id": "65985",
"recordTp": "vendorbill",
"values": {
"account": [
{
"text": "14500 Deferred Expenses",
"value": "249"
}
],
"account.number": "1450",
"account.type": [
{
"text": "Deferred Expense",
"value": "DeferExpense"
}
],
"amount": "51733",
"classnohierarchy": [
{
"text": "901 Corporate",
"value": "139"
}
],
"currency": [
{
"text": "Canadian Dollar",
"value": "3"
}
],
"customer.altname": "V Sties expenses (Tor)",
"customer.custate": "12/31/2019",
"customer.custentient": "ada Inc.",
"customer.custendate": "1/1/2019",
"customer.entyid": "PR781",
"departmentnohierarchy": [
{
"text": "8rity",
"value": "37"
}
],
"fxamount": "689",
"location": [
{
"text": "Othad Projects",
"value": "48"
}
],
"postingperiod": [
{
"text": "Jan 2020",
"value": "1"
}
],
"subsidiary.custrecord_region": [
{
"text": "CANADA",
"value": "3"
}
],
"subsidiarynohierarchy": [
{
"text": "ada Inc.",
"value": "25"
}
]
}
},
I've been able to query the values that are not (deeply) nested, but I need help getting, for example, the values from 'classnohierarchy'. To get both the 'text' and 'value' I tried:
transactions.value:"values".classnohierarchy.text::string as class_txt,
transactions.value:"values".classnohierarchy.value::string as class_val,
but it's returning NULL values.
Below is my entire query:
SELECT
JSON_DATA:status::string as connection_status,
transactions.value:id::string as id,
transactions.value:recordType::string as record_type,
transactions.value:"values"::variant as trans_val,
transactions.value:"values".account as acc,
transactions.value:"values".account.text as text,
transactions.value:"values".account.value as val,
transactions.value:"values"."account.number"::string as acc_num,
transactions.value:"values"."account.type".text::string as acc_type_txt,
transactions.value:"values"."account.type".value::string as acc_type_val,
transactions.value:"values".amount::string as amount,
transactions.value:"values".classnohierarchy.text::string as class_txt,
transactions.value:"values".classnohierarchy.value::string as class_val,
transactions.value:"values".currency.text::string as currency_text,
transactions.value:"values".currency.value::string as currency_val,
transactions.value:"values"."customer.altname"::string as customer_project_name,
transactions.value:"values"."customer.custate"::string as customer_end_date,
transactions.value:"values"."customer.custentient"::string as customer_end_client,
transactions.value:"values"."customer.custendate"::string as customer_start_date,
transactions.value:"values"."customer.entyid"::string as customer_project_id,
transactions.value:"values".departmentnohierarchy.text::string as department_name,
transactions.value:"values".departmentnohierarchy.value::string as department_value,
transactions.value:"values".fxamount::string as fx_amount,
transactions.value:"values".location.text::string as product_name,
transactions.value:"values".postingperiod.text::string as postingperiod,
transactions.value:"values".postingperiod.value::string as postingperiod,
transactions.value:"values"."subsidiary.custrecord_region".text::string as region_name,
transactions.value:"values"."subsidiary.custrecord_region".value::string as region_value,
transactions.value:"values".subsidiarynohierarchy.text::string as entity_name,
transactions.value:"values".subsidiarynohierarchy.value::string as entity_value,
FROM MY_TABLE,
LATERAL FLATTEN (JSON_DATA:transactions) as transactions
and here's a picture of what's showing in Snowflake:
SNOWFLAKE_SCREENSHOT
departmentnohierarchy is an array, so you need to reference it by index, as below.
select *,transactions.VALUE:"values".departmentnohierarchy[0].value::text as department_name
FROM jsont1,
LATERAL FLATTEN (JSON_DATA:transactions) as transactions
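If you need every element of such an array rather than just the first, you can chain a second LATERAL FLATTEN (a sketch, reusing MY_TABLE and JSON_DATA from the question; outer => true also keeps transactions where the array is absent):

SELECT
    t.value:id::string AS id,
    d.value:text::string AS class_txt,
    d.value:value::string AS class_val
FROM MY_TABLE,
LATERAL FLATTEN (JSON_DATA:transactions) AS t,
LATERAL FLATTEN (input => t.value:"values".classnohierarchy, outer => true) AS d;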

Parse Complex JSON -- Map

I need to parse the complex JSON (below) in Scala to get the values of "expression" and "value" under the "measures" key, i.e. I need List(COUNT, COUNT_DISTINCT, ...) and List(1, tbl1.USER_ID, ...).
I tried multiple options, but it is not working. Any help is appreciated.
{
"uuid": "uuidddd",
"last_modified": 1559080222953,
"version": "2.6.1.0",
"name": "FULL_DAY_2_mand_date",
"is_draft": false,
"model_name": "FULL_DAY_1_may05",
"description": "",
"null_string": null,
"dimensions": [
{
"name": "PLATFORM",
"table": "tbl1",
"column": "PLATFORM",
"derived": null
},
{
"name": "OS_VERSION",
"table": "tbl1",
"column": "OS_VERSION",
"derived": null
}
],
"measures": [
{
"name": "_COUNT_",
"function": {
"expression": "COUNT",
"parameter": {
"type": "constant",
"value": "1"
},
"returntype": "bigint"
}
},
{
"name": "UU",
"function": {
"expression": "COUNT_DISTINCT",
"parameter": {
"type": "column",
"value": "tbl1.USER_ID"
},
"returntype": "hllc(12)"
}
},
{
"name": "CONT_SIZE",
"function": {
"expression": "SUM",
"parameter": {
"type": "column",
"value": "tbl1.SIZE"
},
"returntype": "bigint"
}
},
{
"name": "CONT_COUNT",
"function": {
"expression": "SUM",
"parameter": {
"type": "column",
"value": "tbl1.COUNT"
},
"returntype": "bigint"
}
}
],
"dictionaries": [],
"rowkey": {
"rowkey_columns": [
{
"column": "tbl1.OS_VERSION",
"encoding": "dict",
"encoding_version": 1,
"isShardBy": false
},
{
"column": "tbl1.PLATFORM",
"encoding": "dict",
"encoding_version": 1,
"isShardBy": false
},
{
"column": "tbl1.DEVICE_FAMILY",
"encoding": "dict",
"encoding_version": 1,
"isShardBy": false
}
]
},
"hbase_mapping": {
"column_family": [
{
"name": "F1",
"columns": [
{
"qualifier": "M",
"measure_refs": [
"_COUNT_",
"CONT_SIZE",
"CONT_COUNT"
]
}
]
},
{
"name": "F2",
"columns": [
{
"qualifier": "M",
"measure_refs": [
"UU"
]
}
]
}
]
},
"aggregation_groups": [
{
"includes": [
"tbl1.PLATFORM",
"tbl1.OS_VERSION"
],
"select_rule": {
"hierarchy_dims": [],
"mandatory_dims": [
"tbl1.DATE_HR"
],
"joint_dims": []
}
}
],
"signature": "ttrrs==",
"notify_list": [],
"status_need_notify": [
"ERROR",
"DISCARDED",
"SUCCEED"
],
"partition_date_start": 0,
"partition_date_end": 3153600000000,
"auto_merge_time_ranges": [
604800000,
2419200000
],
"volatile_range": 0,
"retention_range": 0,
"engine_type": 4,
"storage_type": 2,
"override_kylin_properties": {
"job.queuename": "root.production.P0",
"is-mandatory-only-valid": "true"
},
"cuboid_black_list": [],
"parent_forward": 3,
"mandatory_dimension_set_list": [],
"snapshot_table_desc_list": []
}
This is a snippet of the code I tried, and it is giving a null list
import org.json4s._
import org.json4s.jackson.JsonMethods._

implicit val formats = org.json4s.DefaultFormats

case class Function(
  expression: String,
  parameter: Parameter,
  returntype: String
)

case class Parameter(
  `type`: String,
  value: String
)

case class Measures(
  name: String,
  function: Function
)

case class AllMeasuresData(uuid: String, measure: List[Measures])

val data = parse(tmp).extract[AllMeasuresData]
val names = data.measure.map(_.name)
println(names)
There are a couple of typos in your ADT. Here is what you need:
case class Function(
  expression: String,
  parameter: Parameter,
  returntype: String
)

case class Parameter(
  `type`: String,
  value: String
)

case class Measures(
  name: String,
  function: Function
)

case class AllMeasuresData(uuid: String, measures: List[Measures])
There is also an extra comma in the JSON; here is the corrected version:
{
"uuid":"uuidddd",
"last_modified":1559080222953,
"version":"2.6.1.0",
"name":"FULL_DAY_2_mand_date",
"is_draft":false,
"model_name":"FULL_DAY_1_may05",
"description":"",
"null_string":null,
"dimensions":[
{
"name":"PLATFORM",
"table":"tbl1",
"column":"PLATFORM",
"derived":null
},
{
"name":"OS_VERSION",
"table":"tbl1",
"column":"OS_VERSION",
"derived":null
} // There was an extra trailing comma here
],
"measures":[
{
"name":"_COUNT_",
"function":{
"expression":"COUNT",
"parameter":{
"type":"constant",
"value":"1"
},
"returntype":"bigint"
}
},
{
"name":"UU",
"function":{
"expression":"COUNT_DISTINCT",
"parameter":{
"type":"column",
"value":"tbl1.USER_ID"
},
"returntype":"hllc(12)"
}
},
{
"name":"CONT_SIZE",
"function":{
"expression":"SUM",
"parameter":{
"type":"column",
"value":"tbl1.SIZE"
},
"returntype":"bigint"
}
},
{
"name":"CONT_COUNT",
"function":{
"expression":"SUM",
"parameter":{
"type":"column",
"value":"tbl1.COUNT"
},
"returntype":"bigint"
}
}
],
"dictionaries":[
],
"rowkey":{
"rowkey_columns":[
{
"column":"tbl1.OS_VERSION",
"encoding":"dict",
"encoding_version":1,
"isShardBy":false
},
{
"column":"tbl1.PLATFORM",
"encoding":"dict",
"encoding_version":1,
"isShardBy":false
},
{
"column":"tbl1.DEVICE_FAMILY",
"encoding":"dict",
"encoding_version":1,
"isShardBy":false
}
]
},
"hbase_mapping":{
"column_family":[
{
"name":"F1",
"columns":[
{
"qualifier":"M",
"measure_refs":[
"_COUNT_",
"CONT_SIZE",
"CONT_COUNT"
]
}
]
},
{
"name":"F2",
"columns":[
{
"qualifier":"M",
"measure_refs":[
"UU"
]
}
]
}
]
},
"aggregation_groups":[
{
"includes":[
"tbl1.PLATFORM",
"tbl1.OS_VERSION"
],
"select_rule":{
"hierarchy_dims":[
],
"mandatory_dims":[
"tbl1.DATE_HR"
],
"joint_dims":[
]
}
}
],
"signature":"ttrrs==",
"notify_list":[
],
"status_need_notify":[
"ERROR",
"DISCARDED",
"SUCCEED"
],
"partition_date_start":0,
"partition_date_end":3153600000000,
"auto_merge_time_ranges":[
604800000,
2419200000
],
"volatile_range":0,
"retention_range":0,
"engine_type":4,
"storage_type":2,
"override_kylin_properties":{
"job.queuename":"root.production.P0",
"is-mandatory-only-valid":"true"
},
"cuboid_black_list":[
],
"parent_forward":3,
"mandatory_dimension_set_list":[
],
"snapshot_table_desc_list":[
]
}
Now you can run:
val data = parse(tmp).extract[AllMeasuresData]
val names = data.measures.map(_.name)
println(names)
// Displays
// List(_COUNT_, UU, CONT_SIZE, CONT_COUNT)
Your Parameter class does not match the JSON because you have used type1 rather than type as the field name. Use backticks to use "type" as a field name even though it is a reserved word:
case class Parameter (
`type`: String,
value: String
)
You also need to change the Function class as it has returntype1 rather than returntype:
case class Function (
expression: String,
parameter: Parameter,
returntype: String
)
The names of the fields in Scala must exactly match the names of the fields in the JSON. Extra fields in the JSON are ignored, but all the fields in the Scala must have matching fields in the JSON. If there are optional fields in the JSON then the Scala field type should be Option[...].
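For instance (a hypothetical illustration, not taken from the question): if a field such as description can be null or missing in some documents, declare it as an Option so that extraction does not fail:

case class ProductInfo(
  uuid: String,
  description: Option[String], // null or absent in some documents
  measures: List[Measures]
)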

How to convert a JSON file whose objects do not all have the same fields into CSV

I have a JSON file and I want to convert it to CSV format.
The problem I face is that the JSON objects in the file do not all convert to the same number of columns; for example, one object gives 49 columns and the next gives 50.
Below is an example of two records: the first has no creator.slug but the second does, and that is where the data goes wrong. The conversion creates all 50 columns, but for objects that lack creator.slug the value of the next field is shifted into its place.
{
"id": 301852363,
"name": "Song of the Sea",
"blurb": "One evening, two shows: SIRENS and The Girl From Bare Cove. Building a community. Giving voice to survivors of sexual violence.",
"goal": 5000,
"pledged": 671,
"state": "live",
"slug": "song-of-the-sea",
"disable_communication": false,
"country": "US",
"currency": "USD",
"currency_symbol": "$",
"currency_trailing_code": true,
"deadline": 1399293386,
"state_changed_at": 1397133386,
"created_at": 1396672480,
"launched_at": 1397133386,
"backers_count": 20,
"photo": {
"full": "https://s3.amazonaws.com/ksr/projects/939387/photo-full.jpg?1397874930",
"ed": "https://s3.amazonaws.com/ksr/projects/939387/photo-ed.jpg?1397874930",
"med": "https://s3.amazonaws.com/ksr/projects/939387/photo-med.jpg?1397874930",
"little": "https://s3.amazonaws.com/ksr/projects/939387/photo-little.jpg?1397874930",
"small": "https://s3.amazonaws.com/ksr/projects/939387/photo-small.jpg?1397874930",
"thumb": "https://s3.amazonaws.com/ksr/projects/939387/photo-thumb.jpg?1397874930",
"1024x768": "https://s3.amazonaws.com/ksr/projects/939387/photo-1024x768.jpg?1397874930",
"1536x1152": "https://s3.amazonaws.com/ksr/projects/939387/photo-1536x1152.jpg?1397874930"
},
"creator": {
"id": 1714048992,
"name": "Maridee Slater",
"slug": "maridee",
"avatar": {
"thumb": "https://s3.amazonaws.com/ksr/avatars/996153/DSC_0310.thumb.jpg?1337713264",
"small": "https://s3.amazonaws.com/ksr/avatars/996153/DSC_0310.small.jpg?1337713264",
"medium": "https://s3.amazonaws.com/ksr/avatars/996153/DSC_0310.medium.jpg?1337713264"
},
"urls": {
"web": {
"user": "https://www.kickstarter.com/profile/maridee"
},
"api": {
"user": "https://api.kickstarter.com/v1/users/1714048992?signature=1398256877.e6d63adcca055cd041a5920368b197d40459f748"
}
}
},
"location": {
"id": 2459115,
"name": "New York",
"slug": "new-york-ny",
"short_name": "New York, NY",
"displayable_name": "New York, NY",
"country": "US",
"state": "NY",
"urls": {
"web": {
"discover": "https://www.kickstarter.com/discover/places/new-york-ny",
"location": "https://www.kickstarter.com/locations/new-york-ny"
},
"api": {
"nearby_projects": "https://api.kickstarter.com/v1/discover?signature=1398256786.89b2c4539aeab4ad25982694dd7e659e8c12028f&woe_id=2459115"
}
}
},
"category": {
"id": 17,
"name": "Theater",
"slug": "theater",
"position": 14,
"urls": {
"web": {
"discover": "http://www.kickstarter.com/discover/categories/theater"
}
}
},
"urls": {
"web": {
"project": "https://www.kickstarter.com/projects/maridee/song-of-the-sea"
}
}
},
{
"id": 967108708,
"name": "Good Bread Alley",
"blurb": "A play by April Yvette Thompson. A Gullah Healer Woman and an Afro-Cuban Priest forge a new world of magic & dreams in Jim Crow Miami.",
"goal": 100000,
"pledged": 33242,
"state": "live",
"slug": "good-bread-alley",
"disable_communication": false,
"country": "US",
"currency": "USD",
"currency_symbol": "$",
"currency_trailing_code": true,
"deadline": 1399271911,
"state_changed_at": 1396334313,
"created_at": 1393278556,
"launched_at": 1396334311,
"backers_count": 261,
"photo": {
"full": "https://s3.amazonaws.com/ksr/projects/883489/photo-full.jpg?1397869394",
"ed": "https://s3.amazonaws.com/ksr/projects/883489/photo-ed.jpg?1397869394",
"med": "https://s3.amazonaws.com/ksr/projects/883489/photo-med.jpg?1397869394",
"little": "https://s3.amazonaws.com/ksr/projects/883489/photo-little.jpg?1397869394",
"small": "https://s3.amazonaws.com/ksr/projects/883489/photo-small.jpg?1397869394",
"thumb": "https://s3.amazonaws.com/ksr/projects/883489/photo-thumb.jpg?1397869394",
"1024x768": "https://s3.amazonaws.com/ksr/projects/883489/photo-1024x768.jpg?1397869394",
"1536x1152": "https://s3.amazonaws.com/ksr/projects/883489/photo-1536x1152.jpg?1397869394"
},
"creator": {
"id": 749318998,
"name": "April Yvette Thompson",
"avatar": {
"thumb": "https://s3.amazonaws.com/ksr/avatars/9751919/kick_thumb.thumb.jpg?1396128151",
"small": "https://s3.amazonaws.com/ksr/avatars/9751919/kick_thumb.small.jpg?1396128151",
"medium": "https://s3.amazonaws.com/ksr/avatars/9751919/kick_thumb.medium.jpg?1396128151"
},
"urls": {
"web": {
"user": "https://www.kickstarter.com/profile/749318998"
},
"api": {
"user": "https://api.kickstarter.com/v1/users/749318998?signature=1398256877.af4db50c53f93339b05c7813f4534e833eaca270"
}
}
},
"location": {
"id": 2459115,
"name": "New York",
"slug": "new-york-ny",
"short_name": "New York, NY",
"displayable_name": "New York, NY",
"country": "US",
"state": "NY",
"urls": {
"web": {
"discover": "https://www.kickstarter.com/discover/places/new-york-ny",
"location": "https://www.kickstarter.com/locations/new-york-ny"
},
"api": {
"nearby_projects": "https://api.kickstarter.com/v1/discover?signature=1398256786.89b2c4539aeab4ad25982694dd7e659e8c12028f&woe_id=2459115"
}
}
},
"category": {
"id": 17,
"name": "Theater",
"slug": "theater",
"position": 14,
"urls": {
"web": {
"discover": "http://www.kickstarter.com/discover/categories/theater"
}
}
},
"urls": {
"web": {
"project": "https://www.kickstarter.com/projects/749318998/good-bread-alley"
}
}
}
Here is the code I run:
# open the json file
require(RJSONIO)
require(rjson)
library("rjson")
filename2 <- "C:/Users/Desktop/in.json"
json_data <- fromJSON(file = filename2)
# unlist the json because it has a problem
unlisted <- unlist(unlist(json_data, recursive = FALSE), recursive = FALSE)
I used the following to fill the NAs, but as I understand now it only applies to nulls that already exist (see http://stackoverflow.com/questions/16947643/getting-imported-json-data-into-a-data-frame-in-r/16948174#16948174):
unlisted <- lapply(unlisted, function(x) {
    x[sapply(x, is.null)] <- NA
    unlist(x)
})
json <- do.call("rbind", unlisted)
Here is a full list of the columns of the output CSV; after that, I list the smaller set of columns I would like to keep from every JSON object:
id
name
blurb
goal
pledged
state
slug
disable_communication
country
currency
currency_symbol
currency_trailing_code
deadline
state_changed_at
created_at
launched_at
backers_count
photo.full
photo.ed
photo.med
photo.little
photo.small
photo.thumb
photo.1024x768
photo.1536x1152
creator.id
creator.name
creator.slug
creator.avatar.thumb
creator.avatar.small
creator.avatar.medium
creator.urls.web.user
creator.urls.api.user
location.id
location.name
location.slug
location.short_name
location.displayable_name
location.country
location.state
location.urls.web.discover
location.urls.web.location
location.urls.api.nearby_projects
category.id
category.name
category.slug
category.position
category.urls.web.discover
category.urls.web.project
category.urls.web.rewards
Here is the list of columns I would like to have in the output CSV:
id
name
blurb
goal
pledged
state
slug
disable_communication
country
currency
currency_symbol
currency_trailing_code
deadline
state_changed_at
created_at
launched_at
backers_count
creator.id
creator.name
creator.slug
location.id
location.name
location.slug
location.short_name
location.displayable_name
location.country
location.state
category.id
category.name
category.slug
category.position
Looks like there's a very similar question (with an answer, though not pure R) here: convert json to csv format.
However, since you do seem to want most, if not all, of the JSON in a "wide CSV" format, you can use fromJSON from jsonlite, rbindlist from data.table (which gives you the fill=TRUE parameter to handle uneven lists nicely), and unlist:
library(jsonlite)
library(data.table)

# tell fromJSON we want a list back
json_data <- fromJSON("in.json", simplifyDataFrame = FALSE)

# iterate over the list we have so we can "flatten" it, then
# convert it back to a data.frame-like object
dat <- rbindlist(lapply(json_data, function(x) {
    as.list(unlist(x))
}), fill = TRUE)
You may need to tweak column names, but I think this gets you what you're looking for.
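For instance, to then keep only the shorter list of columns you gave (a sketch; the dotted names assume the unlist() naming shown above, and columns missing from a run are simply skipped):

# keep only the desired columns, where present
wanted <- c("id", "name", "blurb", "goal", "pledged", "state", "slug",
            "disable_communication", "country", "currency", "currency_symbol",
            "currency_trailing_code", "deadline", "state_changed_at",
            "created_at", "launched_at", "backers_count",
            "creator.id", "creator.name", "creator.slug",
            "location.id", "location.name", "location.slug",
            "location.short_name", "location.displayable_name",
            "location.country", "location.state",
            "category.id", "category.name", "category.slug", "category.position")
dat_sub <- dat[, intersect(wanted, names(dat)), with = FALSE]
write.csv(dat_sub, "out.csv", row.names = FALSE)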