I'm having trouble with finding json elements in a nested json.
It seems that my code only finds the element on the root level.
My code is not able to find the elements recursively it seems.
import json
import pandas as pd
jsonString = '{"airplane": {"wings": {}, "wheels": {}, "cockpit": {}}}'
jsonObj = json.loads(jsonString)
data = ['airplane','wings','wheels','cockpit']
dfProp = pd.DataFrame(data, columns=['object'])
# find elements in JSON
for index, row in dfProp.iterrows():
if row['object'] in jsonObj:
print(row['object'] + ' ' + 'FOUND')
else:
print(row['object'] + ' ' + 'NOT FOUND')
I want to find all elements regardless of how many nesting levels there are in json files.
Can someone point me into the right direction?
If I understand you correctly, you want to check if all values from list data is found as a key in jsonObj:
import json
jsonString = '{"airplane": {"wings": {}, "wheels": {}, "cockpit": {}}}'
jsonObj = json.loads(jsonString)
data = ["airplane", "wings", "wheels", "cockpit"]
def find(o):
if isinstance(o, dict):
for k, v in o.items():
yield k
yield from find(v)
elif isinstance(o, list):
for v in o:
yield from find(v)
s = set(data).difference(find(jsonObj))
if not s:
print("All values from data found in jsonObj")
else:
print("Not all values from data found in jsonObj", s)
Prints:
All values from data found in jsonObj
I have some code which collects the description, price, and old price(if on sale) from online retailers over multiple pages. I'm looking to export this into a DataFrame and have had a go but run into the following error:
ValueError: Shape of passed values is (1, 3210), indices imply (3, 3210).
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
# Start Timer
then = time.time()
# Headers
headers = {"User-Agent": "Mozilla/5.0"}
# Set HTTPCode = 200 and Counter = 1
Code = 200
i = 1
scraped_data = []
while Code == 200:
# Put url together
url = "https://www.asos.com/women/jumpers-cardigans/cat/?cid=2637&page="
url = url + str(i)
# Request URL
r = requests.get(url, allow_redirects=False, headers=headers) # No redirects to allow infinite page count
data = r.text
Code = r.status_code
# Soup
soup = BeautifulSoup(data, 'lxml')
# For loop each product then scroll through title price, old price and description
divs = soup.find_all('article', attrs={'class': '_2qG85dG'}) # want to cycle through each of these
for div in divs:
# Get Description
Description = div.find('div', attrs={'class': '_3J74XsK'})
Description = Description.text.strip()
scraped_data.append(Description)
# Fetch TitlePrice
NewPrice = div.find('span', attrs={'data-auto-id':'productTilePrice'})
NewPrice = NewPrice.text.strip("£")
scraped_data.append(NewPrice)
# Fetch OldPrice
try:
OldPrice = div.find('span', attrs={'data-auto-id': 'productTileSaleAmount'})
OldPrice = OldPrice.text.strip("£")
scraped_data.append(OldPrice)
except AttributeError:
OldPrice = ""
scraped_data.append(OldPrice)
print('page', i, 'scraped')
# Print Array
#array = {"Description": str(Description), "CurrentPrice": str(NewPrice), "Old Price": str(OldPrice)}
#print(array)
i = i + 1
else:
i = i - 2
now = time.time()
pd.DataFrame(scraped_data, columns=["A", "B", "C"])
print('Parse complete with', i, 'pages' + ' in', now-then, 'seconds')
Right now your data is appended to list based on an algorithm that I can describe like this:
Load the web page
Append to list value A
Append to list value B
Append to list value C
What this creates for each run through the dataset is:
[A1, B1, C1, A2, B2, C2]
There exists only one column with data, which is what pandas is telling you. To construct the dataframe properly, either you need to swap it into a format where you have, on each row entry, a tuple of three values (heh) like:
[
(A1, B1, C1),
(A2, B2, C2)
]
Or, in my preferred way because it's far more robust to coding errors and inconsistent lengths to your data: creating each row as a dictionary of columns. Thus,
rowdict_list = []
for row in data_source:
a = extract_a()
b = extract_b()
c = extract_c()
rowdict_list.append({'column_a': a, 'column_b': b, 'column_c': c})
And the data frame is constructed easily without having to explicitly specify columns in the constructor with df = pd.DataFrame(rowdict_list).
You can create a DataFrame using the array dictionary.
You would want to set the values of the array dict to empty lists that way you can append the values from the webpage into the correct list. Also move the array variable outside of the while loop.
array = {"Description": [], "CurrentPrice": [], "Old Price": []}
scraped_data = []
while Code == 200:
...
On the line where you were previously defining the array variable you would then want to append the desciption, price and old price values like so.
array['Description'].append(str(Description))
array['CurrentPrice'].append(str(NewPrice))
array['Old Price'].append(str(OldPrice))
Then you can to create a DataFrame using the array variable
pd.DataFrame(array)
So the final solution would look something like
array = {"Description": [], "CurrentPrice": [], "Old Price": []}
scraped_data = []
while Code == 200:
...
# For loop
for div in divs:
# Get Description
Description = div.find('h3', attrs={'class': 'product__title'})
Description = Description.text.strip()
# Fetch TitlePrice
try:
NewPrice = div.find('div', attrs={'class': 'price product__price--current'})
NewPrice = NewPrice.text.strip()
except AttributeError:
NewPrice = div.find('p', attrs={'class': 'price price--reduced'})
NewPrice = NewPrice.text.strip()
# Fetch OldPrice
try:
OldPrice = div.find('p', attrs={'class': 'price price--previous'})
OldPrice = OldPrice.text.strip()
except AttributeError:
OldPrice = ""
array['Description'].append(str(Description))
array['CurrentPrice'].append(str(NewPrice))
array['Old Price'].append(str(OldPrice))
# Print Array
print(array)
df = pd.DataFrame(array)
i = i + 1
else:
i = i - 2
now = time.time()
print('Parse complete with', i, 'pages' + ' in', now - then, 'seconds')
Finally make sure you've imported pandas at the top of the module
import pandas as pd
I've got an Athena table where some fields have a fairly complex nested format. The backing records in S3 are JSON. Along these lines (but we have several more levels of nesting):
CREATE EXTERNAL TABLE IF NOT EXISTS test (
timestamp double,
stats array<struct<time:double, mean:double, var:double>>,
dets array<struct<coords: array<double>, header:struct<frame:int,
seq:int, name:string>>>,
pos struct<x:double, y:double, theta:double>
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
WITH SERDEPROPERTIES ('ignore.malformed.json'='true')
LOCATION 's3://test-bucket/test-folder/'
Now we need to be able to query the data and import the results into Python for analysis. Because of security restrictions I can't connect directly to Athena; I need to be able to give someone the query and then they will give me the CSV results.
If we just do a straight select * we get back the struct/array columns in a format that isn't quite JSON.
Here's a sample input file entry:
{"timestamp":1520640777.666096,"stats":[{"time":15,"mean":45.23,"var":0.31},{"time":19,"mean":17.315,"var":2.612}],"dets":[{"coords":[2.4,1.7,0.3], "header":{"frame":1,"seq":1,"name":"hello"}}],"pos": {"x":5,"y":1.4,"theta":0.04}}
And example output:
select * from test
"timestamp","stats","dets","pos"
"1.520640777666096E9","[{time=15.0, mean=45.23, var=0.31}, {time=19.0, mean=17.315, var=2.612}]","[{coords=[2.4, 1.7, 0.3], header={frame=1, seq=1, name=hello}}]","{x=5.0, y=1.4, theta=0.04}"
I was hoping to get those nested fields exported in a more convenient format - getting them in JSON would be great.
Unfortunately it seems that cast to JSON only works for maps, not structs, because it just flattens everything into arrays:
SELECT timestamp, cast(stats as JSON) as stats, cast(dets as JSON) as dets, cast(pos as JSON) as pos FROM "sampledb"."test"
"timestamp","stats","dets","pos"
"1.520640777666096E9","[[15.0,45.23,0.31],[19.0,17.315,2.612]]","[[[2.4,1.7,0.3],[1,1,""hello""]]]","[5.0,1.4,0.04]"
Is there a good way to convert to JSON (or another easy-to-import format) or should I just go ahead and do a custom parsing function?
I have skimmed through all the documentation and unfortunately there seems to be no way to do this as of now. The only possible workaround is
converting a struct to a json when querying athena
SELECT
my_field,
my_field.a,
my_field.b,
my_field.c.d,
my_field.c.e
FROM
my_table
Or I would convert the data to json using post processing. Below script shows how
#!/usr/bin/env python
import io
import re
pattern1 = re.compile(r'(?<={)([a-z]+)=', re.I)
pattern2 = re.compile(r':([a-z][^,{}. [\]]+)', re.I)
pattern3 = re.compile(r'\\"', re.I)
with io.open("test.csv") as f:
headers = list(map(lambda f: f.strip(), f.readline().split(",")))
for line in f.readlines():
orig_line = line
data = []
for i, l in enumerate(line.split('","')):
data.append(headers[i] + ":" + re.sub('^"|"$', "", l))
line = "{" + ','.join(data) + "}"
line = pattern1.sub(r'"\1":', line)
line = pattern2.sub(r':"\1"', line)
print(line)
The output on your input data is
{"timestamp":1.520640777666096E9,"stats":[{"time":15.0, "mean":45.23, "var":0.31}, {"time":19.0, "mean":17.315, "var":2.612}],"dets":[{"coords":[2.4, 1.7, 0.3], "header":{"frame":1, "seq":1, "name":"hello"}}],"pos":{"x":5.0, "y":1.4, "theta":0.04}
}
Which is a valid JSON
The python code from #tarun almost got me there, but I had to modify it in several ways due to my data. In particular, I have:
json structures saved in Athena as strings
Strings that contain multiple words, and therefore need to be in between double quotes. Some of them contain "[]" and "{}" symbols.
Here is the code that worked for me, hopefully will be useful for others:
#!/usr/bin/env python
import io
import re, sys
pattern1 = re.compile(r'(?<={)([a-z]+)=', re.I)
pattern2 = re.compile(r':([a-z][^,{}. [\]]+)', re.I)
pattern3 = re.compile(r'\\"', re.I)
with io.open(sys.argv[1]) as f:
headers = list(map(lambda f: f.strip(), f.readline().split(",")))
print(headers)
for line in f.readlines():
orig_line = line
#save the double quote cases, which mean there is a string with quotes inside
line = re.sub('""', "#", orig_line)
data = []
for i, l in enumerate(line.split('","')):
item = re.sub('^"|"$', "", l.rstrip())
if (item[0] == "{" and item[-1] == "}") or (item[0] == "[" and item[-1] == "]"):
data.append(headers[i] + ":" + item)
else: #we have a string
data.append(headers[i] + ": \"" + item + "\"")
line = "{" + ','.join(data) + "}"
line = pattern1.sub(r'"\1":', line)
line = pattern2.sub(r':"\1"', line)
#restate the double quotes to single ones, once inside the json
line = re.sub("#", '"', line)
print(line)
This method is not by modifying the Query.
Its by Post Processing For Javascript/Nodejs we can use the npm package athena-struct-parser.
Detailed Answer with Example
https://stackoverflow.com/a/67899845/6662952
Reference - https://www.npmjs.com/package/athena-struct-parser
I used a simple approach to get around the struct -> json Athena limitation. I created a second table where the json columns were saved as raw strings. Using presto json and array functions I was able to query the data and return the valid json string to my program:
--Array transform functions too
select
json_extract_scalar(dd, '$.timestamp') as timestamp,
transform(cast(json_extract(json_parse(dd), '$.stats') as ARRAY<JSON>), x -> json_extract_scalar(x, '$.time')) as arr_stats_time,
transform(cast(json_extract(json_parse(dd), '$.stats') as ARRAY<JSON>), x -> json_extract_scalar(x, '$.mean')) as arr_stats_mean,
transform(cast(json_extract(json_parse(dd), '$.stats') as ARRAY<JSON>), x -> json_extract_scalar(x, '$.var')) as arr_stats_var
from
(select '{"timestamp":1520640777.666096,"stats":[{"time":15,"mean":45.23,"var":0.31},{"time":19,"mean":17.315,"var":2.612}],"dets":[{"coords":[2.4,1.7,0.3], "header":{"frame":1,"seq":1,"name":"hello"}}],"pos": {"x":5,"y":1.4,"theta":0.04}}' as dd);
I know the query will take longer to execute but there are ways to optimize.
I worked around this by creating a second table using the same S3 location, but changed the field's data type to string. The resulting CSV then had the string that Athena pulled from the object in the JSON file and I was able to parse the result.
I also had to adjust the #tarun code, because I had more complex data and nested structures. Here is the solution I've got, I hope it helps:
import re
import json
import numpy as np
pattern1 = re.compile(r'(?<=[{,\[])\s*([^{}\[\],"=]+)=')
pattern2 = re.compile(r':([^{}\[\],"]+|()(?![{\[]))')
pattern3 = re.compile(r'"null"')
def convert_metadata_to_json(value):
if type(value) is str:
value = pattern1.sub('"\\1":', value)
value = pattern2.sub(': "\\1"', value)
value = pattern3.sub('null', value)
elif np.isnan(value):
return None
return json.loads(value)
df = pd.read_csv('test.csv')
df['metadata_json'] = df.metadata.apply(convert_metadata_to_json)
I am very new to Json files. I have a json file with multiple json objects such as following:
{"ID":"12345","Timestamp":"20140101", "Usefulness":"Yes",
"Code":[{"event1":"A","result":"1"},…]}
{"ID":"1A35B","Timestamp":"20140102", "Usefulness":"No",
"Code":[{"event1":"B","result":"1"},…]}
{"ID":"AA356","Timestamp":"20140103", "Usefulness":"No",
"Code":[{"event1":"B","result":"0"},…]}
…
I want to parse these json objects like a stream. The end game for me however is to create pairwise combinations of event1 and result. like so:
[AB, AB, BB],[11,10,10]
What I know:
The exact structure of the dict
What I do not know: How to extract these dict by dict to perform this operation.
I cannot modify the existing file, so don't tell me to add '[ ], and ','
Additional Help:
I might run into files that I cannot store directly in memory, so a stream solution is more apreciated.
The easiest thing there is to feed the file stream into a custom generator, that would "pre-parse" the json objects. That can be done with some state variables counting somewhat naively the number of open { and [ - each time it reaches zero, it yields a string with a full JSON object.
I could not figure out your desired final intent from the example you provided. I suppose you have other dicts inside "code", and what you want in the end is a pair of the combined "event1, result" inside each "code" value for the outermost dicts. If it is not that, suit yourself to change the code.
(An ordered dict is good enough for storing the results you need - and you can retrieve the separate lists for keys and values if you need)
from collections import OrderedDict
import json
import string
import sys
def char_streamer(stream):
for line in stream:
for char in line:
yield char
def json_source(stream):
result = []
curly_count = 0
bracket_count = 0
nonwhitespace_count = 0
inside_string = False
previous_is_escape = False
for char in char_streamer(stream):
if not result and char in string.whitespace:
continue
result.append(char)
if char == '"':
if inside_string:
inside_string = True
elif not previous_is_escape:
inside_string = False
if inside_string:
if char == "\\": # single '\' character
previous_is_escape = not previous_is_escape
else:
previous_is_escape = False
continue
if char == "{":
curly_count += 1
if char == "[":
bracket_count += 1
if char == "}":
curly_count -= 1
if char == "]":
bracket_count -= 1
if curly_count == 0 and bracket_count== 0 and result:
yield(json.loads("".join(result)))
result = []
def main(filename):
result = OrderedDict()
with open(filename) as file:
for data_part in json_source(file):
# agregate your data here
print (result.keys(), result.values())
main(sys.argv[1])
As mentioned in the title, i'm trying to make a simple py script that can be run from terminal to do the following:
Find all JSON files in current working directory and nested folders (this part works well)
Load said files
Recursively search them for a specific value or a substring
If the value is matching, replace it with a new established value by the user
Once finished, save all modified json files to a "converted" folder in the current directory.
That said, the issue is when i try the recursive search method posted below, since i'm pretty much new to python i would appreciate any help with this issue, what i suppose it is... either the json files i'm using or the search method i'm employing.
Simplifying the issue, the value i search for never matches with anything inside the object, be that a key or purely some string value. Tried multiple methods to perform a recursive search but can't get a match.
For example: taking in account the sample json, i want to replace the value "selectable_parts" or "static_parts" or even deeper in the structure "1h_mod310_door_00" but seems like my method of searching can't reach this value in "object[object][children][0][children][5][name]" (hope this helps).
Sample JSON: (https://drive.google.com/open?id=0B2-Bn2b0ujjVdW5YVGg3REg3OWs)
"""KEYWORD REPLACING MODULE."""
import os
import json
# functions
def get_files():
"""lists files"""
exclude = set(['.vscode', 'sample'])
json_files = []
for root, dirs, files in os.walk(os.getcwd(), topdown=True):
dirs[:] = [d for d in dirs if d not in exclude]
for name in files:
if name.endswith('.json'):
json_files.append(os.path.join(root, name))
return json_files
def load_files(json_files):
"""works files"""
for js_file in json_files:
with open(js_file) as json_file:
loaded_json = json.load(json_file)
replace_key_value(loaded_json, os.path.basename(js_file))
def write_file(data_file, new_file_name):
"""writes the file"""
if not os.path.exists('converted'):
os.makedirs('converted')
with open('converted/' + new_file_name, 'w') as json_file:
json.dump(data_file, json_file)
def replace_key_value(js_file, js_file_name):
"""replace and initiate save"""
recursive_replace(js_file, SKEY, '')
# write_file(js_file, js_file_name)
def recursive_replace(data, match, repl):
"""search for needed value and replace its value"""
for key, value in data.items():
if value == match:
print data[key]
print "AHHHHHHHH"
elif isinstance(value, dict):
recursive_replace(value, match, repl)
# main
print "\n" + '- on ' + os.getcwd()
NEW_DIR = raw_input('Work dir (leave empty if current): ')
if not NEW_DIR:
print NEW_DIR
NEW_DIR = os.getcwd()
else:
print NEW_DIR
os.chdir(NEW_DIR)
# get_files()
JS_FILES = get_files()
print '- files on ' + os.getcwd()
# print "\n".join(JS_FILES)
SKEY = raw_input('Value to search: ')
RKEY = raw_input('Replacement value: ')
load_files(JS_FILES)
The issue was the way i navigated the json obj because the method didn't considerate if it was a dict or a list (i believe...).
So to answer my own question here's the recursive search i'm using to check the values:
def get_recursively(search_dict, field):
"""
Takes a dict with nested lists and dicts,
and searches all dicts for a key of the field
provided.
"""
fields_found = []
for key, value in search_dict.iteritems():
if key == field:
print value
fields_found.append(value)
elif isinstance(value, dict):
results = get_recursively(value, field)
for result in results:
if SEARCH_KEY in result:
fields_found.append(result)
elif isinstance(value, list):
for item in value:
if isinstance(item, dict):
more_results = get_recursively(item, field)
for another_result in more_results:
if SEARCH_KEY in another_result:
fields_found.append(another_result)
return fields_found
# write_file(js_file, js_file_name)
Hope this helps someone.