recursive search json elements - json

I'm having trouble with finding json elements in a nested json.
It seems that my code only finds the element on the root level.
My code is not able to find the elements recursively it seems.
import json
import pandas as pd
jsonString = '{"airplane": {"wings": {}, "wheels": {}, "cockpit": {}}}'
jsonObj = json.loads(jsonString)
data = ['airplane','wings','wheels','cockpit']
dfProp = pd.DataFrame(data, columns=['object'])
# find elements in JSON
for index, row in dfProp.iterrows():
if row['object'] in jsonObj:
print(row['object'] + ' ' + 'FOUND')
else:
print(row['object'] + ' ' + 'NOT FOUND')
I want to find all elements regardless of how many nesting levels there are in json files.
Can someone point me into the right direction?

If I understand you correctly, you want to check if all values from list data is found as a key in jsonObj:
import json
jsonString = '{"airplane": {"wings": {}, "wheels": {}, "cockpit": {}}}'
jsonObj = json.loads(jsonString)
data = ["airplane", "wings", "wheels", "cockpit"]
def find(o):
if isinstance(o, dict):
for k, v in o.items():
yield k
yield from find(v)
elif isinstance(o, list):
for v in o:
yield from find(v)
s = set(data).difference(find(jsonObj))
if not s:
print("All values from data found in jsonObj")
else:
print("Not all values from data found in jsonObj", s)
Prints:
All values from data found in jsonObj

Related

How to convert simple JSON to DynamoDB JSON?

I have a simple JSON and want to convert it to DynamoDB JSON. Is there any easy way to do that?
If you mean JsonString to Dynamodb Map, you can use boto3.
Here is the example.
import boto3
import json
json_string = '{"key1": 1, "key2": "value"}'
json_obj = json.loads(json_string)
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('test-table')
table.put_item(Item={'pk': 'pk-value', 'map': json_obj})
If you just want to update the while Map attribute, you can use just JSON format the same as put_item.
json_string = '{"key1": 2, "key2": "value2"}'
json_obj = json.loads(json_string2)
rsp = table.update_item(
Key={'pk': 'pk-value'},
AttributeUpdates={'map': {'Value': json_obj2, 'Action': 'PUT'}}
)
However, If you want to update only specific nested attribute, you need to use UpdateExpression. For example, the below is code to update only key1 attribute to 'value3'.
nested_json_string = '{"nested": "key3"}'
nested_json_obj = json.loads(nested_json_string)
rsp = table.update_item(
Key={'pk': 'pk-value'},
UpdateExpression='SET #map.#key1 = :val3',
ExpressionAttributeNames={'#map': 'map', '#key1': 'key1'},
ExpressionAttributeValues={':val3': nested_json_obj}
)
I know this is an old question, but I came across it and the accepted answer didn't help me (it seems to suggest that you can feed boto3 with plain JSON, but it didn't work for me) and the library mentioned in the comments didn't help me either.
What did work for me was using the serializer/deserializer from boto3.dynamodb.types, basically as suggested by this answer on a very similar topic.
from boto3.dynamodb.types import TypeSerializer, TypeDeserializer
import json
serializer = TypeSerializer()
deserializer = TypeDeserializer()
# for building a DynamoDB JSON from a Python object
def serialize(item):
serialized_item = serializer.serialize(vars(item) if hasattr(item, '__dict__') else item)
return item if 'M' not in serialized_item else serialized_item['M']
# for building a plain JSON from a DynamoDB JSON
def deserialize(dynamodb_json_string):
return deserializer.deserialize({'M': dynamodb_json_string})
class MyItem:
def __init__(self, some_string_value=None, some_numeric_value=None):
self.my_key = some_string_value
self.my_other_key = some_numeric_value
def __str__(self):
return json.dumps(self, default=lambda x: x.__dict__)
if __name__ == '__main__':
my_classy_item = MyItem("my_string_value", 5)
my_string_item = json.loads('{"my_key": "my_string_value", "my_other_key" : 5}')
print(serialize(my_classy_item))
print(serialize(my_string_item))

Take Input Dynamically from user in Python Dictionary

I've created a Python Dictionary Structure as below:
import pprint
log_data = {
'Date':'',
'Prayers':{
'Fajr':'',
'Dhuhr/Jumu\'ah':'',
'Asr':'',
'Maghrib':'',
'Isha\'a':''
},
'Task List':[{
'Task':'',
'Timeline':'',
'Status':''
}],
'Meals':{
'Breakfast':{
'Menu':'',
'Place':'',
'Time':''
},
'Lunch':{
'Menu':'',
'Place':'',
'Time':''
},
'Evening Snacks':{
'Menu':'',
'Place':'',
'Time':''
},
'Dinner':{
'Menu':'',
'Place':'',
'Time':''
}
},
'Exercises':[{
'Exercise':'',
'Duration':''
}]
}
pprint.pprint(log_data)
As you see this is just an dictionary structure without data. I want to iterate over all the keys and take input data as value from user using input().
Then I would like to save this dictionary as json file.
Could you please help on how I can iterate over all keys and take input from user.
Thanks.
Searched but couldn't found exact type of help that I need.
For this kind of thing, one needs to use recursion.
This is not fancy, but will get the job done:
from copy import deepcopy
import json
import pprint
log_data = {
'Date':'',
'Prayers':{
'Fajr':'',
'Dhuhr/Jumu\'ah':'',
'Asr':'',
'Maghrib':'',
'Isha\'a':''
},
'Task List':[{
'Task':'',
'Timeline':'',
'Status':''
}],
# ...
}
def input_fields(substruct, path=""):
print(f"Inputing values '{path}':")
for fieldname, value in substruct.items():
if isinstance(value, (str, int)):
substruct[fieldname] = input(f"{path}.{fieldname}: ")
elif isinstance(value, dict):
input_fields(value, f"{path}.{fieldname}")
elif isinstance(value, list):
original = value[0]
value.pop()
counter = 0
if not isinstance(original, dict):
raise ValueError("Not supported: A list should contain a dictionary-substructure")
while True:
item = deepcopy(original)
input_fields(item, f"{path}.{fieldname}.[{counter}]")
value.append(item)
continue_ = input(f"Enter one more {path}.{fieldname} item? (y/n) ").lower().strip()[0] == "y"
if not continue_:
break
counter+=1
return substruct
def main():
values = input_fields(deepcopy(log_data))
json.dump(values, open("myfile.json", "wt"), indent=4)
if __name__ == "__main__":
main()

Inverse of Pandas json_normalize

I just discovered the json_normalize function which works great in taking a JSON object and giving me a pandas Dataframe. Now I want the reverse operation which takes that same Dataframe and gives me a json (or json-like dictionary which I can easily turn to json) with the same structure as the original json.
Here's an example: https://hackersandslackers.com/json-into-pandas-dataframes/.
They take a JSON object (or JSON-like python dictionary) and turn it into a dataframe, but I now want to take that dataframe and turn it back into a JSON-like dictionary (to later dump to json file).
I implemented it with a couple functions
def set_for_keys(my_dict, key_arr, val):
"""
Set val at path in my_dict defined by the string (or serializable object) array key_arr
"""
current = my_dict
for i in range(len(key_arr)):
key = key_arr[i]
if key not in current:
if i==len(key_arr)-1:
current[key] = val
else:
current[key] = {}
else:
if type(current[key]) is not dict:
print("Given dictionary is not compatible with key structure requested")
raise ValueError("Dictionary key already occupied")
current = current[key]
return my_dict
def to_formatted_json(df, sep="."):
result = []
for _, row in df.iterrows():
parsed_row = {}
for idx, val in row.iteritems():
keys = idx.split(sep)
parsed_row = set_for_keys(parsed_row, keys, val)
result.append(parsed_row)
return result
#Where df was parsed from json-dict using json_normalize
to_formatted_json(df, sep=".")
A simpler approach:
Uses only 1 function...
def df_to_formatted_json(df, sep="."):
"""
The opposite of json_normalize
"""
result = []
for idx, row in df.iterrows():
parsed_row = {}
for col_label,v in row.items():
keys = col_label.split(sep)
current = parsed_row
for i, k in enumerate(keys):
if i==len(keys)-1:
current[k] = v
else:
if k not in current.keys():
current[k] = {}
current = current[k]
# save
result.append(parsed_row)
return result
df.to_json(path)
or
df.to_dict()
I just implemented this using 2 functions.
Get a full list of fields from the DataFrame that are part of a nested field. Only the parent i.e. if location.city.code fits the criteria, we only care about location.city. Sort it by the deepest level of nesting, i.e. location.city is nested further than location.
Starting with the deepest nested parent field, find all child fields by searching in the column name. Create a field in the DataFrame for the parent field, which is a combination of all child fields (renamed so that they lose the nesting structure, e.g. location.city.code becomes code) converted to JSON and then loaded to a dictionary value. Finally, drop all of the child fields.
def _get_nested_fields(df: pd.DataFrame) -> List[str]:
"""Return a list of nested fields, sorted by the deepest level of nesting first."""
nested_fields = [*{field.rsplit(".", 1)[0] for field in df.columns if "." in field}]
nested_fields.sort(key=lambda record: len(record.split(".")), reverse=True)
return nested_fields
def df_denormalize(df: pd.DataFrame) -> pd.DataFrame:
"""
Convert a normalised DataFrame into a nested structure.
Fields separated by '.' are considered part of a nested structure.
"""
nested_fields = _get_nested_fields(df)
for field in nested_fields:
list_of_children = [column for column in df.columns if field in column]
rename = {
field_name: field_name.rsplit(".", 1)[1] for field_name in list_of_children
}
renamed_fields = df[list_of_children].rename(columns=rename)
df[field] = json.loads(renamed_fields.to_json(orient="records"))
df.drop(list_of_children, axis=1, inplace=True)
return df
let me throw in my two cents
after backward converting you might need to drop empty columns from your generated jsons
therefore, i checked if val != np.nan. but u cant directly do it, instead you need to check val == val or not, because np.nan != itself.
my version:
def to_formatted_json(df, sep="."):
result = []
for _, row in df.iterrows():
parsed_row = {}
for idx, val in row.iteritems():
if val == val:
keys = idx.split(sep)
parsed_row = set_for_keys(parsed_row, keys, val)
result.append(parsed_row)
return result
This is a solution which looks working to me. It is designed to work on a dataframe with one line, but it can be easily looped over large dataframes.
class JsonRecreate():
def __init__(self, df):
self.df = df
def pandas_to_json(self):
df = self.df
# determine the number of nesting levels
number_levels = np.max([len(i.split('.')) for i in df.columns])
# put all the nesting levels in an a list
levels = []
for level_idx in np.arange(number_levels):
levels.append(np.array([i.split('.')[level_idx] if len(i.split('.')) > level_idx else ''
for i in df.columns.tolist()]))
self.levels = levels
return self.create_dict(upper_bound = self.levels[0].shape[0])
def create_dict(self, level_idx = 0, lower_bound = 0, upper_bound = 100):
''' Function to create the dictionary starting from a pandas dataframe generated by json_normalize '''
levels = self.levels
dict_ = {}
# current nesting level
level = levels[level_idx]
# loop over all the relevant elements of the level (relevant w.r.t. its parent)
for key in [i for i in np.unique(level[lower_bound: upper_bound]) if i != '']:
# find where a particular key occurs in the level
correspondence = np.where(level[lower_bound: upper_bound] == key)[0] + lower_bound
# check if the value(s) corresponding to the key appears once (multiple times)
if correspondence.shape[0] == 1:
# if the occurence is unique, append the value to the dictionary
dict_[key] = self.df.values[0][correspondence[0]]
else:
# otherwhise, redefine the relevant bounds and call the function recursively
lower_bound_, upper_bound_ = correspondence.min(), correspondence.max() + 1
dict_[key] = self.create_dict(level_idx + 1, lower_bound_, upper_bound_)
return dict_
I tested it with a simple dataframe such as:
df = pd.DataFrame({'a.b': [1], 'a.c.d': [2], 'a.c.e': [3], 'a.z.h1': [-1], 'a.z.h2': [-2], 'f': [4], 'g.h': [5], 'g.i.l': [6], 'g.i.m': [7], 'g.z.h1': [-3], 'g.z.h2': [-4]})
The order in the json is not exactly preserved in the resulting json, but it can be easily handled if needed.

AWS Athena export array of structs to JSON

I've got an Athena table where some fields have a fairly complex nested format. The backing records in S3 are JSON. Along these lines (but we have several more levels of nesting):
CREATE EXTERNAL TABLE IF NOT EXISTS test (
timestamp double,
stats array<struct<time:double, mean:double, var:double>>,
dets array<struct<coords: array<double>, header:struct<frame:int,
seq:int, name:string>>>,
pos struct<x:double, y:double, theta:double>
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
WITH SERDEPROPERTIES ('ignore.malformed.json'='true')
LOCATION 's3://test-bucket/test-folder/'
Now we need to be able to query the data and import the results into Python for analysis. Because of security restrictions I can't connect directly to Athena; I need to be able to give someone the query and then they will give me the CSV results.
If we just do a straight select * we get back the struct/array columns in a format that isn't quite JSON.
Here's a sample input file entry:
{"timestamp":1520640777.666096,"stats":[{"time":15,"mean":45.23,"var":0.31},{"time":19,"mean":17.315,"var":2.612}],"dets":[{"coords":[2.4,1.7,0.3], "header":{"frame":1,"seq":1,"name":"hello"}}],"pos": {"x":5,"y":1.4,"theta":0.04}}
And example output:
select * from test
"timestamp","stats","dets","pos"
"1.520640777666096E9","[{time=15.0, mean=45.23, var=0.31}, {time=19.0, mean=17.315, var=2.612}]","[{coords=[2.4, 1.7, 0.3], header={frame=1, seq=1, name=hello}}]","{x=5.0, y=1.4, theta=0.04}"
I was hoping to get those nested fields exported in a more convenient format - getting them in JSON would be great.
Unfortunately it seems that cast to JSON only works for maps, not structs, because it just flattens everything into arrays:
SELECT timestamp, cast(stats as JSON) as stats, cast(dets as JSON) as dets, cast(pos as JSON) as pos FROM "sampledb"."test"
"timestamp","stats","dets","pos"
"1.520640777666096E9","[[15.0,45.23,0.31],[19.0,17.315,2.612]]","[[[2.4,1.7,0.3],[1,1,""hello""]]]","[5.0,1.4,0.04]"
Is there a good way to convert to JSON (or another easy-to-import format) or should I just go ahead and do a custom parsing function?
I have skimmed through all the documentation and unfortunately there seems to be no way to do this as of now. The only possible workaround is
converting a struct to a json when querying athena
SELECT
my_field,
my_field.a,
my_field.b,
my_field.c.d,
my_field.c.e
FROM
my_table
Or I would convert the data to json using post processing. Below script shows how
#!/usr/bin/env python
import io
import re
pattern1 = re.compile(r'(?<={)([a-z]+)=', re.I)
pattern2 = re.compile(r':([a-z][^,{}. [\]]+)', re.I)
pattern3 = re.compile(r'\\"', re.I)
with io.open("test.csv") as f:
headers = list(map(lambda f: f.strip(), f.readline().split(",")))
for line in f.readlines():
orig_line = line
data = []
for i, l in enumerate(line.split('","')):
data.append(headers[i] + ":" + re.sub('^"|"$', "", l))
line = "{" + ','.join(data) + "}"
line = pattern1.sub(r'"\1":', line)
line = pattern2.sub(r':"\1"', line)
print(line)
The output on your input data is
{"timestamp":1.520640777666096E9,"stats":[{"time":15.0, "mean":45.23, "var":0.31}, {"time":19.0, "mean":17.315, "var":2.612}],"dets":[{"coords":[2.4, 1.7, 0.3], "header":{"frame":1, "seq":1, "name":"hello"}}],"pos":{"x":5.0, "y":1.4, "theta":0.04}
}
Which is a valid JSON
The python code from #tarun almost got me there, but I had to modify it in several ways due to my data. In particular, I have:
json structures saved in Athena as strings
Strings that contain multiple words, and therefore need to be in between double quotes. Some of them contain "[]" and "{}" symbols.
Here is the code that worked for me, hopefully will be useful for others:
#!/usr/bin/env python
import io
import re, sys
pattern1 = re.compile(r'(?<={)([a-z]+)=', re.I)
pattern2 = re.compile(r':([a-z][^,{}. [\]]+)', re.I)
pattern3 = re.compile(r'\\"', re.I)
with io.open(sys.argv[1]) as f:
headers = list(map(lambda f: f.strip(), f.readline().split(",")))
print(headers)
for line in f.readlines():
orig_line = line
#save the double quote cases, which mean there is a string with quotes inside
line = re.sub('""', "#", orig_line)
data = []
for i, l in enumerate(line.split('","')):
item = re.sub('^"|"$', "", l.rstrip())
if (item[0] == "{" and item[-1] == "}") or (item[0] == "[" and item[-1] == "]"):
data.append(headers[i] + ":" + item)
else: #we have a string
data.append(headers[i] + ": \"" + item + "\"")
line = "{" + ','.join(data) + "}"
line = pattern1.sub(r'"\1":', line)
line = pattern2.sub(r':"\1"', line)
#restate the double quotes to single ones, once inside the json
line = re.sub("#", '"', line)
print(line)
This method is not by modifying the Query.
Its by Post Processing For Javascript/Nodejs we can use the npm package athena-struct-parser.
Detailed Answer with Example
https://stackoverflow.com/a/67899845/6662952
Reference - https://www.npmjs.com/package/athena-struct-parser
I used a simple approach to get around the struct -> json Athena limitation. I created a second table where the json columns were saved as raw strings. Using presto json and array functions I was able to query the data and return the valid json string to my program:
--Array transform functions too
select
json_extract_scalar(dd, '$.timestamp') as timestamp,
transform(cast(json_extract(json_parse(dd), '$.stats') as ARRAY<JSON>), x -> json_extract_scalar(x, '$.time')) as arr_stats_time,
transform(cast(json_extract(json_parse(dd), '$.stats') as ARRAY<JSON>), x -> json_extract_scalar(x, '$.mean')) as arr_stats_mean,
transform(cast(json_extract(json_parse(dd), '$.stats') as ARRAY<JSON>), x -> json_extract_scalar(x, '$.var')) as arr_stats_var
from
(select '{"timestamp":1520640777.666096,"stats":[{"time":15,"mean":45.23,"var":0.31},{"time":19,"mean":17.315,"var":2.612}],"dets":[{"coords":[2.4,1.7,0.3], "header":{"frame":1,"seq":1,"name":"hello"}}],"pos": {"x":5,"y":1.4,"theta":0.04}}' as dd);
I know the query will take longer to execute but there are ways to optimize.
I worked around this by creating a second table using the same S3 location, but changed the field's data type to string. The resulting CSV then had the string that Athena pulled from the object in the JSON file and I was able to parse the result.
I also had to adjust the #tarun code, because I had more complex data and nested structures. Here is the solution I've got, I hope it helps:
import re
import json
import numpy as np
pattern1 = re.compile(r'(?<=[{,\[])\s*([^{}\[\],"=]+)=')
pattern2 = re.compile(r':([^{}\[\],"]+|()(?![{\[]))')
pattern3 = re.compile(r'"null"')
def convert_metadata_to_json(value):
if type(value) is str:
value = pattern1.sub('"\\1":', value)
value = pattern2.sub(': "\\1"', value)
value = pattern3.sub('null', value)
elif np.isnan(value):
return None
return json.loads(value)
df = pd.read_csv('test.csv')
df['metadata_json'] = df.metadata.apply(convert_metadata_to_json)

Python: Append JSON objects to nested list

I'm trying to iterate through a list of IP addresses, and extracting the JSON data from my url, and trying to put that JSON data into a nested list.
It seems as if my code is overwriting my list over and over, and will only show one JSON object, instead of the many I have specified.
Here's my code:
for x in range(0, 10):
try:
url = 'http://' + ip_addr[x][0] + ':8080/system/ids/'
response = urlopen(url)
json_obj = json.load(response)
except:
continue
camera_details = [[i['name'], i['serial']] for i in json_obj['cameras']]
for x in camera_details:
#This only prints one object, and not 10.
print x
How can I append my JSON objects into a list, and then extract the 'name' and 'serial' values into a nested list?
try this
camera_details = []
for x in range(0, 10):
try:
url = 'http://' + ip_addr[x][0] + ':8080/system/ids/'
response = urlopen(url)
json_obj = json.load(response)
except:
continue
camera_details.extend([[i['name'], i['serial']] for i in json_obj['cameras']])
for x in camera_details:
print x
in your code you where only getting the last requests data
Best would be using append and avoiding list comprehension
camera_details = []
for x in range(0, 10):
try:
url = 'http://' + ip_addr[x][0] + ':8080/system/ids/'
response = urlopen(url)
json_obj = json.load(response)
except:
continue
for i in json_obj['cameras']:
camera_details.append([i['name'], i['serial']])
for x in camera_details:
print x
Try breaking up your code into smaller, easier to digest parts. This will help you to diagnose what's going on.
camera_details = []
for obj in json_obj['cameras']:
if 'name' in obj and 'serial' in obj:
camera_details.append([obj['name'], obj['serial']])