How to merge two different JSON (using python) - json

I'm totally new to Python. I want to merge two JSON files that describe the same objects (identified by the same ids) but hold different keys for each of them.
Here is a basic example of the result I would love to get :
JSON1 :
{
"json1" : {
"1" : {
"id": 1,
"name": "first_artist",
"imageUrl": "https://1.jpg",
"genre": "Rap "
},
"2" : {
"id": 2,
"name": "second_artist",
"imageUrl": "https://2.jpg",
"genre": "Hip-Hop"
}
}
}
JSON2:
{
"json2" : {
"1" : {
"date": 17/07/19,
"venue": "venue1"
},
"2" : {
"date": 19/07/19,
"venue": "venue2"
}
}
}
Expected JSON:
{
"expected_json" : {
"1" : {
"id": 1,
"name": "first_artist",
"imageUrl": "https://1.jpg",
"genre": "Rap ",
"date": 17/07/19,
"venue": "venue1"
},
"2" : {
"id": 2,
"name": "second_artist",
"imageUrl": "https://2.jpg",
"genre": "Hip-Hop",
"date": 19/07/19,
"venue": "venue2"
}
}
}
Can someone give tips and direction to make this possible ? Thanks

You can simplify your input to:
A.json:
{
"1" : {
"id": 1,
"name": "first_artist",
"imageUrl": "https://1.jpg",
"genre": "Rap "
},
"2" : {
"id": 2,
"name": "second_artist",
"imageUrl": "https://2.jpg",
"genre": "Hip-Hop"
}
}
B.json:
{
"1" : {
"date": "17/07/19",
"venue": "venue1"
},
"2" : {
"date": "19/07/19",
"venue": "venue2"
}
}
Note that each date has to be quoted (19/07/19 becomes "19/07/19"), because a bare 19/07/19 is not a valid JSON value.
Now you can use the json module:
import json


def merge_by_key(first, second):
    """Merge two dict-of-dicts on their top-level keys.

    For every key found in either mapping, the inner dicts are combined into
    one new dict. When both inner dicts define the same field, the value from
    ``second`` wins. Neither input is modified.

    Args:
        first: mapping of key -> dict (e.g. the parsed content of A.json).
        second: mapping of key -> dict (e.g. the parsed content of B.json).

    Returns:
        A new dict mapping every key of ``first`` and ``second`` to the
        merged inner dict.
    """
    merged = {}
    # The union of both key views also covers keys present in only one input.
    for key in first.keys() | second.keys():
        entry = dict(first.get(key, {}))
        entry.update(second.get(key, {}))
        merged[key] = entry
    return merged


if __name__ == '__main__':
    # load json from files
    with open('A.json') as A_file:
        A = json.load(A_file)  # returns a dict()
    with open('B.json') as B_file:
        B = json.load(B_file)

    # for each key merge values from dicts A and B
    result = merge_by_key(A, B)

    # write the result to expected.json; sort_keys keeps the output stable
    # even though the keys were gathered through an unordered set union
    with open('expected.json', 'w') as expected_file:
        json.dump(result, expected_file, sort_keys=True, indent='\t')
This writes:
expected.json:
{
"1": {
"date": "17/07/19",
"genre": "Rap ",
"id": 1,
"imageUrl": "https://1.jpg",
"name": "first_artist",
"venue": "venue1"
},
"2": {
"date": "19/07/19",
"genre": "Hip-Hop",
"id": 2,
"imageUrl": "https://2.jpg",
"name": "second_artist",
"venue": "venue2"
}
}

Related

Flattening Json In Snowflake using regexp in Json Path?

I ran into a problem flattening JSON into a relational table.
For example, given a JSON file like the one below,
how can I flatten the table content found at both sheets:'sheet:1':'section 1':table
and sheets:'sheet:2':'section 1':table?
The number of sheets and sections changes from one JSON file to the next.
Is there any way to use a regular expression in the JSON path?
The JSON paths in every file follow the same pattern, but the keys are not the same...
{
"extraction date": {
"month": "OCTOBER",
"monthValue": 10,
"year": 2020
},
"fileName": "test_1.xls",
"number of sheets": 2,
"sheets": {
"sheet:1": {
"content": {
"conversion state": "Success",
"section 1": {
"meta": {
"Remark": "This is the remark",
"Row: 4": "this is the title"
},
"table": [
{
"col1": null,
"col2": "2020-07-14",
"Row": 9
},
{
"col1": null,
"col2": "2020-07-14",
"Row": 10
}
]
}
},
"name": "Sheet1",
"sections": 1
},
"sheet:2": {
"content": {
"conversion state": "Success",
"section 1": {
"meta": {
"Remark": " null",
"Row: 4": "title a"
},
"table": [
{
"col1": null,
"col2": "2020-07-14",
"Row": 8
},
{
"col1": null,
"col2": "2020-07-14",
"Row": 9
}
]
}
},
"name": "mySheetName",
"sections": 1
}
}
}

Nested json - store values in csv

I am trying to convert a nested json file into csv. It's data from a darts API and the structure is always the same. Nevertheless I got some problems flattening and storing the values in a csv because of the nested structure.
json:
{
"summaries": [{
"sport_event": {
"id": "sr:sport_event:12967512",
"start_time": "2017-11-11T13:15:00+00:00",
"start_time_confirmed": true,
"sport_event_context": {
"sport": {
"id": "sr:sport:22",
"name": "Darts"
},
"category": {
"id": "sr:category:104",
"name": "International"
},
"competition": {
"id": "sr:competition:597",
"name": "Grand Slam of Darts"
},
"season": {
"id": "sr:season:47332",
"name": "Grand Slam of Darts 2017",
"start_date": "2017-11-11",
"end_date": "2017-11-20",
"year": "2017",
"competition_id": "sr:competition:597"
},
"stage": {
"order": 1,
"type": "league",
"phase": "stage_1",
"start_date": "2017-11-11",
"end_date": "2017-11-15",
"year": "2017"
},
"round": {
"number": 1
},
"groups": [{
"id": "sr:league:29766",
"name": "Grand Slam of Darts 2017, Group G",
"group_name": "G"
}]
},
"coverage": {
"live": true
},
"competitors": [{
"id": "sr:competitor:35936",
"name": "Smith, Michael",
"abbreviation": "SMI",
"qualifier": "home"
}, {
"id": "sr:competitor:83895",
"name": "Wilson, James",
"abbreviation": "WIL",
"qualifier": "away"
}]
},
"sport_event_status": {
"status": "closed",
"match_status": "ended",
"home_score": 5,
"away_score": 3,
"winner_id": "sr:competitor:35936"
}
}, {
"sport_event": {
"id": "sr:sport_event:12967508",
"start_time": "2017-11-11T13:40:00+00:00",
"start_time_confirmed": true,
"sport_event_context": {
"sport": {
"id": "sr:sport:22",
"name": "Darts"
},
"category": {
"id": "sr:category:104",
"name": "International"
},
"competition": {
"id": "sr:competition:597",
"name": "Grand Slam of Darts"
},
"season": {
"id": "sr:season:47332",
"name": "Grand Slam of Darts 2017",
"start_date": "2017-11-11",
"end_date": "2017-11-20",
"year": "2017",
"competition_id": "sr:competition:597"
},
"stage": {
"order": 1,
"type": "league",
"phase": "stage_1",
"start_date": "2017-11-11",
"end_date": "2017-11-15",
"year": "2017"
},
"round": {
"number": 1
},
"groups": [{
"id": "sr:league:29764",
"name": "Grand Slam of Darts 2017, Group F",
"group_name": "F"
}]
},
"coverage": {
"live": true
},
"competitors": [{
"id": "sr:competitor:70916",
"name": "Bunting, Stephen",
"abbreviation": "BUN",
"qualifier": "home"
}, {
"id": "sr:competitor:191262",
"name": "de Zwaan, Jeffrey",
"abbreviation": "DEZ",
"qualifier": "away"
}]
},
"sport_event_status": {
"status": "closed",
"match_status": "ended",
"home_score": 5,
"away_score": 4,
"winner_id": "sr:competitor:70916"
}
}]
}
So for each sport_event I would like to store the variables:
"start_time"
from "season" the variable "name"
from "competitors" both "id" and "name"
from "sport_event_status" the "winner_id"
I have already tried to flatten the json file with this code:
import json
# Parse the JSON document; the context manager closes the file handle as soon
# as parsing is done instead of leaving it open for the rest of the script.
with open(r'path of file.json') as f:
    data = json.load(f)
def flatten(data):
    """Recursively print every ``key->value`` pair of a parsed JSON object.

    Each pair is printed as ``str(key) + '->' + str(value)``. Nested dicts
    are descended into directly; for a list value, every dict element is
    descended into as well.

    Args:
        data: a dict (any object providing ``.items()``).

    Returns:
        None. The function only prints.
    """
    for key, value in data.items():
        print(str(key) + '->' + str(value))
        if isinstance(value, dict):
            flatten(value)
        elif isinstance(value, list):
            for val in value:
                # Only dict elements have key/value pairs to descend into.
                # Scalars (numbers, booleans, None, strings) and nested lists
                # are skipped rather than passed on, where calling .items()
                # on them would raise AttributeError.
                if isinstance(val, dict):
                    flatten(val)
# Walk the parsed document and print each key/value pair that flatten visits.
flatten(data)
# Print the parsed document itself.
print(data)
This actually prints out the following:
id->sr:season:47332
name->Grand Slam of Darts 2017
start_date->2017-11-11
end_date->2017-11-20
year->2017
competition_id->sr:competition:597
Now my question is how to store the values I mentioned above in a csv file.
Thanks in advance for your support.
Using jq, you basically just have to transcribe your specification, adding a bit of context and taking care of an embedded array:
# Emit one CSV row per entry of .summaries:
#   start_time, season name, (id, name) of every competitor, winner_id
.summaries[]                                          # Your specification:
| [ .sport_event.start_time,                          # start_time
    .sport_event.sport_event_context.season.name ]    # from "season" the variable "name"
  + [ .sport_event.competitors[] | .id, .name ]       # from "competitors" both "id" and "name"
  # sport_event_status sits next to sport_event (not inside it), so it is
  # read from the summary object itself.
  + [ .sport_event_status.winner_id ]                 # from "sport_event_status" the "winner_id"
| @csv
Invocation
E.g.
jq -rf program.jq my.json

JsonPath - Extract object meeting multiple criteria?

In the Json string given below, I want to find all elements in which category = m AND the "middle" array contains elements which match this condition - the element's "middle" array has objects whose itemType = Executable.
I would like to use jsonpath to get the desired objects. I prefer to not use jmespath because it can be too complex for my purpose. But, I am new to jsonpath and I am not able to figure out the json query from online tutorials which are too trivial or basic. I wonder if its better to use a programming language instead to get the data I need. Please advise.
So far, I was only able to extract the elements in which category = m, by using this jsonpath query: $.[?(@.category=="m")]. How do I do the remaining part?
Json :
Overview - Every object has a "content" object. Each content object generally has a start, middle and end array besides other fields. Middle arrays can have multiple content objects inside them and so on. Some of the content objects have only a middle array. I am interested in locating items in such content objects as mentioned above.
Note that this is not the actual json which I have to process. It is an imitation which has been sanitized for SO.
{
"id": "123",
"contents": {
"title": "B1",
"start": [],
"middle": [
{
"level": "1",
"contents": {
"title": "C1",
"category": "c",
"start": [],
"middle": [
{
"level": "2",
"contents": {
"title": "M1",
"category": "m",
"start": [],
"middle": [
{
"level": "3",
"contents": {
"title": "MAT1",
"middle": [
{
"itemType": "Data"
}
]
}
},
{
"level": "3",
"contents": {
"title": "MAT2",
"middle": [
{
"itemType": "Executable",
"id": "exec1"
}
]
}
},
{
"level": "3",
"contents": {
"title": "MAT3",
"middle": [
{
"itemType": "Data"
}
]
}
}
],
"end": []
}
},
{
"level": "2",
"contents": {
"title": "M2",
"category": "m",
"start": [],
"middle": [
{
"level": "3",
"contents": {
"title": "MAT1",
"middle": [
{
"itemType": "Data"
}
]
}
},
{
"level": "3",
"contents": {
"title": "MAT2",
"middle": [
{
"itemType": "Executable",
"id": "exec2"
}
]
}
}
],
"end": []
}
}
],
"end": []
}
},
{
"level": "1",
"contents": {
"title": "C2",
"category": "c",
"start": [],
"middle": [
{
"level": "2",
"contents": {
"title": "M1",
"category": "m",
"start": [],
"middle": [
{
"level": "3",
"contents": {
"title": "MAT1",
"middle": [
{
"itemType": "Data"
}
]
}
},
{
"level": "3",
"contents": {
"title": "MAT2",
"middle": [
{
"itemType": "Executable",
"id": "exec3"
}
]
}
},
{
"level": "3",
"contents": {
"title": "MAT3",
"middle": [
{
"itemType": "Data"
}
]
}
}
],
"end": []
}
},
{
"level": "2",
"contents": {
"title": "M2",
"category": "m",
"start": [],
"middle": [
{
"level": "3",
"contents": {
"title": "MAT1",
"middle": [
{
"itemType": "Data"
}
]
}
},
{
"level": "3",
"contents": {
"title": "MAT2",
"middle": [
{
"itemType": "Executable",
"id": "exec4"
}
]
}
},
{
"level": "3",
"contents": {
"title": "MAT3",
"middle": [
{
"itemType": "Data"
}
]
}
}
],
"end": []
}
}
],
"end": []
}
}
],
"end": []
}
}
Context
json with nested objects1
jsonpath expression language
choosing between jsonpath and jmespath (or other JSON expression engine)
Problem
DeveMasterJoe2 wants to extract some values from nested JSON
Discussion
There are lots of implementations of jsonpath out there, and they do not all support the same features
The structure and normalization of the source JSON is going to influence how easily this can be done with pure jsonpath
In choosing a JSON expression engine, one has to weigh multiple factors
how consistent are the implementations across languages?
how many choices are there within a given language?
how clear is the specification?
how many examples, unit-tests or tutorials are available?
who is supporting it?
Example solution using Python and jsonpath-ng
Here is an example solution using python 3.7 and jsonpath-ng
This example uses a mix of jsonpath and python instead of just pure jsonpath, because of the heavily-nested JSON
I will leave it for someone else to provide an answer that relies on pure jsonpath
Note that the source JSON arguably could stand to be cleaned up a bit
(for example, why is there no id field attached to itemType==Data elements?)
(for example, why is category not found on all contents elements?)
(for example, if you expressly specify level why complicate things with heavily nested objects when you can determine depth by level ?)
This example:
## import libraries
import codecs
import json
import jsonpath_ng
from jsonpath_ng.ext import parse
##;;

## init vars
href = "path/to/my/jsonfile/nested_dict.json"
# the context manager closes the file as soon as the document is parsed
with open(href, encoding='utf8') as json_file:
    json_dataroot = json.load(json_file)
final_result = []
##;;

## init jsonpath queries
# every element of every "middle" array found under a "contents" object
outer_query = parse('$..contents.middle[*]')
# elements of a list whose itemType is 'Executable' ('@' is the current node)
executable_query = parse("$[?(@.itemType=='Executable')]")
##;;

## iterate through outer-query and gather subelements
for outer in outer_query.find(json_dataroot):
    contents = outer.value.get('contents', {})
    ## restrict to desired category == 'm'
    if contents.get('category', '') != 'm':
        continue
    ## extract out desired subelements: every entry of each child's "middle"
    ## array (not only the first one, so an empty array no longer raises
    ## IndexError and later entries are not missed)
    json_datafrag001 = [
        leaf
        for child in contents.get('middle', [])
        for leaf in child.get('contents', {}).get('middle', [])
    ]
    final_result.extend(found.value for found in executable_query.find(json_datafrag001))
##;;

## show final result
vout = json.dumps(final_result, sort_keys=True, indent=4, separators=(',', ': '))
print(vout)
##;;
... produces this result ...
[
{
"id": "exec1",
"itemType": "Executable"
},
{
"id": "exec2",
"itemType": "Executable"
},
{
"id": "exec3",
"itemType": "Executable"
},
{
"id": "exec4",
"itemType": "Executable"
}
]
1 (aka dictionary, associative-array, hash)

split json log of type array that starts with [] into multiple arrays based on the number of elements in python

I am trying to create multiple files by reading my JSON log file and separate file has to be created per each element in my JSON array. Any help would be appreciated.
Approach: I can read the JSON log file with json.load() and count the elements in the JSON array. I then want to copy the first 5 elements into file1 as a JSON array, the next 5 into a second file as a JSON array, and so on until the end of the array in my JSON log file.
update :
I got it working as far as splitting the list/array into individual lists of 10 elements each, until the end of the file. Now I have to write those lists to separate json.gz files and upload them to S3.
import json
json_data = json.dumps([
{
"a": "1",
"b": "2"
},
{
"d": "3"
},
{
"d": "4"
},
{
"e": "5"
},
{
"e": "6"
},
{
"e": "7"
},
{
"e": "8"
},
{
"e": "9"
},
{
"e": "5"
},
{
"e": "10"
},
{
"e": "11"
},
{
"e": "12"
},
{
"s": "y"
},
{
"s": "y"
},
{
"s": "y"
},
{
"s": "y"
},
{
"s": "y"
},
{
"s": "y"
},
{
"s": "y"
},
{
"s": "y"
},
{
"s": "y"
},
{
"s": "y"
},
{
"s": "y"
},
{
"s": "y"
},
{
"s": "y"
},
{
"s": "y"
},
{
"s": "y"
},
{
"abc": [
{
"te": "tre"
}
]
}
])
def split_into_chunks(items, size=10):
    """Split a list into consecutive sub-lists of at most ``size`` elements.

    Every element ends up in exactly one chunk, in its original order. The
    last chunk is shorter when ``len(items)`` is not a multiple of ``size``,
    and it is still produced when the length is an exact multiple of
    ``size`` or smaller than ``size``.

    Args:
        items: the list to split.
        size: maximum number of elements per chunk; must be at least 1.

    Returns:
        A list of lists; empty when ``items`` is empty.

    Raises:
        ValueError: if ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be at least 1")
    # Stepping the start index by `size` visits each chunk boundary once;
    # slicing past the end of a list is safe and yields the short tail.
    return [items[start:start + size] for start in range(0, len(items), size)]


if __name__ == '__main__':
    item_dict = json.loads(json_data)
    lens = len(item_dict)
    print(lens)
    for number, chunk in enumerate(split_into_chunks(item_dict, 10)):
        # each chunk is ready to be serialised to its own json(.gz) file
        print(number, len(chunk))
        print(chunk)
my json log file:
[
{
"id": 1000,
"type": "BigBox",
"name": "Mall of America",
"address": "340 W Market",
"address2": "",
"city": "Bloomington",
"state": "MN",
"zip": "55425",
"location": {
"lat": 44.85466,
"lon": -93.24565
},
"hours": "Mon: 10-9:30; Tue: 10-9:30; Wed: 10-9:30; Thurs: 10-9:30; Fri: 10-9:30; Sat: 10-9:30; Sun: 11-7",
"services": [
"Geek Squad Services",
"Best Buy Mobile",
"Best Buy For Business"
]
},
{
"id": 1002,
"type": "BigBox",
"name": "Tempe Marketplace",
"address": "1900 E Rio Salado Pkwy",
"address2": "",
"city": "Tempe",
"state": "AZ",
"zip": "85281",
"location": {
"lat": 33.430729,
"lon": -111.89966
},
"hours": "Mon: 10-9; Tue: 10-9; Wed: 10-9; Thurs: 10-9; Fri: 10-10; Sat: 10-10; Sun: 10-8",
"services": [
"Windows Store",
"Geek Squad Services",
"Best Buy Mobile",
"Best Buy For Business"
]}
]

How to exclude specific fields from JSON using groovy

I would like to exclude the items which don't have the productModel property in the JSON below. How can we achieve this in Groovy?
I tried using hasProperty, but it did not work as I expected. If possible, could I get a sample snippet?
I tried the code below, but it didn't work as I expected.
response.getAt('myData').getAt('data').getAt('product').hasProperty('productModel').each { println "result ${it}" }
Any help would be really appreciated.
{
"myData": [{
"data": {
"product": {
"productId": "apple",
"productName": "iPhone",
"productModel": "6s"
},
"statusCode": "active",
"date": "2018-08-07T00:00:00.000Z"
},
"links": [{
"productUrl": "test"
},
{
"productImage": "test"
}
],
"info": {}
},
{
"data": {
"product": {
"productId": "apple",
"productName": "iPhone",
"productModel": "7"
},
"statusCode": "active",
"date": "2018-08-07T00:00:00.000Z"
},
"links": [{
"productUrl": "test"
},
{
"productImage": "test"
}
],
"info": {}
},
{
"data": {
"product": {
"productId": "apple",
"productName": "Macbook"
},
"statusCode": "active",
"date": "2018-08-07T00:00:00.000Z"
},
"links": [{
"productUrl": "test"
},
{
"productImage": "test"
}
],
"info": {}
}
],
"metadata": {
"count": 3,
"offset": 0
}
}
If you want to exclude specific fields from JSON object then you have to recreate it using filtered data. The crucial part takes these two lines (assuming that json variable in the below example stores your JSON as text):
// Parse the JSON text into nested maps and lists.
def root = new JsonSlurper().parseText(json)
// Keep only the entries whose data.product map has a productModel key.
// Safe navigation (?.) makes an entry without "data" or "product" evaluate to
// null (falsy) and get filtered out, instead of throwing NullPointerException.
def myData = root.myData.findAll { it.data?.product?.containsKey('productModel') }
What happens here is we access root.myData list and we filter it using findAll(predicate) method and predicate in this case says that only objects that have key productModel in path data.product are accepted. This findAll() method does not mutate existing list and that is why we store the result in variable myData - after running this method we will end up with a list of size 2.
In next step you have to recreate the object you want to represent as a JSON:
// Metadata block: count reflects the filtered list, offset stays 0.
def metadata = [count: myData.size(), offset: 0]
// Reassemble the top-level object from the filtered entries and the metadata.
def newJsonObject = [myData: myData, metadata: metadata]
// Serialise to JSON text and pretty-print it.
def compactJson = JsonOutput.toJson(newJsonObject)
println JsonOutput.prettyPrint(compactJson)
In this part we create newJsonObject and in the end we convert it to a JSON representation.
Here is the full example:
import groovy.json.JsonOutput
import groovy.json.JsonSlurper
def json = '''{
"myData": [{
"data": {
"product": {
"productId": "apple",
"productName": "iPhone",
"productModel": "6s"
},
"statusCode": "active",
"date": "2018-08-07T00:00:00.000Z"
},
"links": [{
"productUrl": "test"
},
{
"productImage": "test"
}
],
"info": {}
},
{
"data": {
"product": {
"productId": "apple",
"productName": "iPhone",
"productModel": "7"
},
"statusCode": "active",
"date": "2018-08-07T00:00:00.000Z"
},
"links": [{
"productUrl": "test"
},
{
"productImage": "test"
}
],
"info": {}
},
{
"data": {
"product": {
"productId": "apple",
"productName": "Macbook"
},
"statusCode": "active",
"date": "2018-08-07T00:00:00.000Z"
},
"links": [{
"productUrl": "test"
},
{
"productImage": "test"
}
],
"info": {}
}
],
"metadata": {
"count": 3,
"offset": 0
}
}'''
// Parse the JSON text into nested maps and lists.
def root = new JsonSlurper().parseText(json)
// Keep only the entries whose data.product map has a productModel key.
// Safe navigation (?.) makes an entry without "data" or "product" evaluate to
// null (falsy) and get filtered out, instead of throwing NullPointerException.
def myData = root.myData.findAll { it.data?.product?.containsKey('productModel') }
// Rebuild the top-level object; count is recomputed from the filtered list.
def newJsonObject = [
        myData  : myData,
        metadata: [
                count : myData.size(),
                offset: 0
        ]
]
println JsonOutput.prettyPrint(JsonOutput.toJson(newJsonObject))
And here is the output it produces:
{
"myData": [
{
"data": {
"product": {
"productId": "apple",
"productName": "iPhone",
"productModel": "6s"
},
"statusCode": "active",
"date": "2018-08-07T00:00:00.000Z"
},
"links": [
{
"productUrl": "test"
},
{
"productImage": "test"
}
],
"info": {
}
},
{
"data": {
"product": {
"productId": "apple",
"productName": "iPhone",
"productModel": "7"
},
"statusCode": "active",
"date": "2018-08-07T00:00:00.000Z"
},
"links": [
{
"productUrl": "test"
},
{
"productImage": "test"
}
],
"info": {
}
}
],
"metadata": {
"count": 2,
"offset": 0
}
}