parse specific key in json to dataframe column

How do you parse a specific key into multiple rows and columns from a response?
I am getting a response:
response = requests.post('https://api.keywords.com/v1/get_keyword_data', data=my_data, headers=my_headers)
which returns:
{
    "data": {
        "heart disease": {
            "vol": 18100,
            "cpc": {
                "currency": "$",
                "value": "2.01"
            },
            "keyword": "keyword planner",
            "competition": 0.21,
            "trend": [
                {
                    "month": "January",
                    "year": 2022,
                    "value": 18100
                },
                {
                    "month": "February",
                    "year": 2022,
                    ...
However, when I normalize it before saving it to a .csv, everything is placed into a single row.
How do you parse a specific field, like data.heart disease.trend in my example, into a pandas dataframe with separate columns for month, year, and value? Every method I have tried, from normalizing it to treating it as a dict, has failed.
my code:
my_data = {
'country': 'us',
'currency': 'USD',
'dataSource': 'cli',
f'kw[{keyword}]': ["keywords tool", "keyword planner"]
}
my_headers = {
'Accept': 'application/json',
'Authorization': ''
}
response = requests.post('https://api.keywords.com/v1/get_keyword_data', data=my_data, headers=my_headers)
#write to json
with open('output.json', 'wb') as outf:
outf.write(response.content)
if response.status_code == 200:
print('success\n\n', response.content.decode('utf-8'))
else:
print("An error occurred\n\n", response.content.decode('utf-8'))

Using json_normalize:
import json

import pandas as pd

data = json.loads(response.text)
df = pd.json_normalize(
    data=data["data"]["heart disease"],
    meta=["vol", "keyword", "competition", "cpc"],
    record_path="trend"
)
# expand the nested cpc dict into its own columns
df = pd.concat([df.drop(["cpc"], axis=1), df["cpc"].apply(pd.Series)], axis=1)
print(df)
Output:
       month  year  value    vol          keyword  competition currency value
0    January  2022  18100  18100  keyword planner         0.21        $  2.01
1   February  2022  20000  18100  keyword planner         0.21        $  2.01
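Note that the output ends up with two columns both named value (one from trend, one from cpc). A small variation on the same code, a sketch assuming the df built above, prefixes the expanded cpc columns to avoid the clash:
# prefix the expanded cpc columns so "value" is not duplicated
cpc_cols = df["cpc"].apply(pd.Series).add_prefix("cpc.")
df = pd.concat([df.drop(["cpc"], axis=1), cpc_cols], axis=1)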

Related

Why does generating json data from a list (array of arrays) result in a quotation mark problem?

Consider the dataframe below:
timestamp coordinates
0 [402, 404] [[2.5719,49.0044], [2.5669,49.0043]]
1 [345, 945] [[2.5719,49.0044], [2.5669,49.0043]]
I'd like to generate a json file like below:
[
    {
        "vendor": 1,
        "path": [
            [2.5719, 49.0044],
            [2.5669, 49.0043]
        ],
        "timestamps": [402, 404]
    },
    {
        "vendor": 1,
        "path": [
            [2.5719, 49.0044],
            [2.5669, 49.0043]
        ],
        "timestamps": [345, 945]
    }
]
To do so, my idea is:
- for each row of my df, generate a new column geometry containing the row's json data
- then append all geometries into a json
However, my function below doesn't work:
df["geometry"] = df.apply(
    lambda row: {
        "vendor": 1,
        "path": row["coordinates"],
        "timestamps": row["timestamp"]
    },
    axis=1
)
Indeed, the result is (for example) the following; note the quote marks (') around the arrays in path:
{
    'vendor': 1,
    'path': ['[2.5719,49.0044]', '[2.5669,49.0043]'],
    'timestamps': [402, 404]
}
Any idea?
Thanks
Presumably the values in the coordinates column are strings. You can use ast.literal_eval to convert them to lists:
from ast import literal_eval

df["geometry"] = df.apply(
    lambda row: {
        "vendor": 1,
        "path": literal_eval(row["coordinates"]),
        "timestamps": row["timestamp"],
    },
    axis=1,
)
print(df)
print(df)
Prints:
timestamp coordinates geometry
0 [402, 404] [[2.5719,49.0044], [2.5669,49.0043]] {'vendor': 1, 'path': [[2.5719, 49.0044], [2.5669, 49.0043]], 'timestamps': [402, 404]}
1 [345, 945] [[2.5719,49.0044], [2.5669,49.0043]] {'vendor': 1, 'path': [[2.5719, 49.0044], [2.5669, 49.0043]], 'timestamps': [345, 945]}
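To finish the original goal of appending all geometries into a json file, one possible last step (a sketch, assuming the df above and that the timestamp column already holds real lists rather than strings):
import json

# dump the geometry records to a file in the desired shape
with open("geometries.json", "w") as f:
    json.dump(df["geometry"].tolist(), f, indent=4)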

Finding Values in Python json.loads Dictionary

I'm working with a REST API that returns data in the following format:
{
    "id": "2902cbad6da44459ad05abd1305eed14",
    "displayName": "",
    "sourceHost": "dev01.test.lan",
    "sourceIP": "192.168.145.1",
    "messagesPerSecond": 0,
    "messages": 2733,
    "size": 292062,
    "archiveSize": 0,
    "dates": [
        {
            "date": 1624921200000,
            "messages": 279,
            "size": 29753,
            "archiveSize": 0
        },
        {
            "date": 1625007600000,
            "messages": 401,
            "size": 42902,
            "archiveSize": 0
        }
    ]
}
I'm using json.loads to successfully pull the data from the API, and I now need to search for a particular "date" value and read the corresponding "messages", "size" and "archiveSize" values.
I'm trying to use the "if-in" method to find the value I'm interested in, for example:
response = requests.request("GET", apiQuery, headers=headers, data=payload)
json_response = json.loads(response.text)

test = 2733
if test in json_response.values():
    print(f"Yes, value: '{test}' exists in dictionary")
else:
    print(f"No, value: '{test}' does not exist in dictionary")
This works fine for any value in the top level of the returned JSON, but it never finds values inside the "dates" sub-branches.
I have two questions: firstly, how do I find the target "date" value? Secondly, once I find that sub-branch, what is the best way to extract the three values I need?
Thanks.
That is because dict.values() only looks at the top-level values; the dicts inside the "dates" list are one level down. Filter the "dates" list directly:
from json import load


def list_dates_whose_message_count_equals(dates=None, message_count=0):
    return list(filter(
        lambda date: date.get("messages") == message_count, dates
    ))


def main():
    json_ = {}
    with open("values.json", "r") as fp:
        json_ = load(fp)
    print(list_dates_whose_message_count_equals(json_["dates"], message_count=279))
    print(list_dates_whose_message_count_equals(json_["dates"], message_count=401))


if __name__ == "__main__":
    main()
Returns this
[{'date': 1624921200000, 'messages': 279, 'size': 29753, 'archiveSize': 0}]
[{'date': 1625007600000, 'messages': 401, 'size': 42902, 'archiveSize': 0}]
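The same filtering pattern answers the original question of matching on "date" rather than "messages"; a minimal sketch, assuming the target timestamp exists in the list:
def find_by_date(dates, target_date):
    # return the first entry whose "date" matches, or None
    return next((d for d in dates if d.get("date") == target_date), None)

entry = find_by_date(json_["dates"], 1624921200000)
if entry is not None:
    print(entry["messages"], entry["size"], entry["archiveSize"])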

json_normalize does not read all data

I have a json file that I want to flatten and retrieve all the information into a pandas dataframe. The json file looks like this:
jsonstr = {
    "calculation": {
        "id": "3k3k3k3kwk3kwk",
        "Id": 23,
        "submissionDate": 1622428064679,
        "serverVersion": "3.3.5.6.r",
        "tag": [
            {
                "code": "qq4059331155113278",
                "manual": {
                    "location": {
                        "x": 26.5717,
                        "y": 59.4313,
                        "z": 0.0,
                        "floor": 0
                    },
                    "timestamp": 1599486138000
                },
                "device": null,
                "measurements": [
                    {
                        "Address": "D_333",
                        "subcell": "",
                        "frequency": 14.0,
                        "dfId": 0
                    },
                    {
                        "trxAddress": "D_334",
                        "subcell": "",
                        "frequency": 11.0,
                        "dfId": 0
                    }
                ]
            }
        ]
    }
}
Now, as usual, I do the following. I thought this would return all the fields, including id, Id, submissionDate, and so on:
import os, json
import glob

import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)

file = './Testjson.json'
#file = './jsondumps/ff80818178f93bd90179ab51781e1c95.json'

with open(file) as json_string:
    jsonstr = json.load(json_string)

labels = pd.json_normalize(jsonstr, record_path=['calculation', 'tag'])
But in fact, it returns:
code device \
0 qq4059331155113278 None
measurements manual.location.x \
0 [{'Address': 'D_333', 'subcell': '', 'frequenc... 26.5717
manual.location.y manual.location.z manual.location.floor \
0 59.4313 0.0 0
manual.timestamp
0 1599486138000
and trying the following
labels = pd.json_normalize(jsonstr, record_path=['calculation','tag'], meta=['id', 'Id'])
returns an error:
KeyError: 'id'
which makes sense. But what am I doing wrong to begin with? Why can I not get all the fields under calculation, since they are in the path?
Grateful for any insights!
Your syntax is slightly off on the meta argument; with the fix below, id and Id appear at the end of the dataframe.
If you are looking to flatten the entire json, look into flatten_json. It's a pretty good library to use with nested json.
pd.json_normalize(jsonstr, record_path=['calculation','tag'], meta=[['calculation','id'],['calculation','Id']])
code device measurements manual.location.x manual.location.y manual.location.z manual.location.floor manual.timestamp calculation.id calculation.Id
0 qq4059331155113278 null [{'Address': 'D_333', 'subcell': '', 'frequenc... 26.5717 59.4313 0.0 0 1599486138000 3k3k3k3kwk3kwk 23
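To also pull in the remaining top-level fields the question mentions (submissionDate and serverVersion), the same meta pattern extends naturally; a sketch:
labels = pd.json_normalize(
    jsonstr,
    record_path=['calculation', 'tag'],
    meta=[
        ['calculation', 'id'],
        ['calculation', 'Id'],
        ['calculation', 'submissionDate'],
        ['calculation', 'serverVersion'],
    ],
)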

Join nested JSON dataframe and another dataframe

I am trying to join dataframe1 (generated from the JSON below) with dataframe2 on the order_id field, then assign the "status" from dataframe2 to the "status" of dataframe1. Does anyone know how to do this? Many thanks for your help.
dataframe1
[
    {
        "client_id": 1,
        "name": "Test01",
        "olist": [
            {
                "order_id": 10000,
                "order_dt_tm": "2012-12-01",
                "status": ""  <== use "status" from dataframe2 to populate this field
            },
            {
                "order_id": 10000,
                "order_dt_tm": "2012-12-01",
                "status": ""
            }
        ]
    },
    {
        "client_id": 2,
        "name": "Test02",
        "olist": [
            {
                "order_id": 10002,
                "order_dt_tm": "2012-12-01",
                "status": ""
            },
            {
                "order_id": 10003,
                "order_dt_tm": "2012-12-01",
                "status": ""
            }
        ]
    }
]
dataframe2
order_id status
10002 "Delivered"
10001 "Ordered"
Here is your raw dataset as a json string:
d = """[{
"client_id": 1,
"name": "Test01",
"olist": [{
"order_id": 10000,
"order_dt_tm": "2012-12-01",
"status": ""
},
{
"order_id": 10000,
"order_dt_tm": "2012-12-01",
"status": ""
}
]
},
{
"client_id": 2,
"name": "Test02",
"olist": [{
"order_id": 10002,
"order_dt_tm": "2012-12-01",
"status": ""
},
{
"order_id": 10003,
"order_dt_tm": "2012-12-01",
"status": ""
}
]
}
]"""
Firstly, I would load it as json:
import json

data = json.loads(d)
Then I would turn it into a pandas dataframe; notice that I keep only order_id and order_dt_tm, since the status field will be populated by the join step:
import pandas as pd

df1 = pd.json_normalize(data, 'olist')[['order_id', 'order_dt_tm']]
Then, from the second dataframe sample, I would do a left join using the merge function:
# named data2 so it does not clobber the parsed json above
data2 = {'order_id': [10002, 10001], 'status': ['Delivered', 'Ordered']}
df2 = pd.DataFrame(data2)
result = df1.merge(df2, on='order_id', how='left')
Good luck
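With the sample data above, only order_id 10002 finds a match, so the other rows end up with NaN in status after the left join. A small follow-up sketch (an assumption about the intended behaviour) restores the empty-string convention of the original JSON:
# unmatched order_ids get NaN after the left join; restore empty strings
result["status"] = result["status"].fillna("")
print(result)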
UPDATE
# JSON to dataframe
df1 = pd.json_normalize(data)

# sub JSON to dataframe; drop status, it will come from the join
df1['sub_df'] = df1['olist'].apply(lambda x: pd.json_normalize(x).drop('status', axis=1))

# build second dataframe
data2 = {'order_id': [10002, 10001], 'status': ['Delivered', 'Ordered']}
df2 = pd.DataFrame(data2)

# populate status in the sub dataframes
df1['sub_df'] = df1['sub_df'].apply(lambda x: x.merge(df2, on='order_id', how='left').fillna(''))

# sub dataframes back to JSON
def back_to_json_str(df):
    # turns a df back into a json string
    return str(df.to_json(orient="records", indent=4))

df1['olist'] = df1['sub_df'].apply(lambda x: back_to_json_str(x))

# global df back to a JSON string
parsed = str(df1.drop('sub_df', axis=1).to_json(orient="records", indent=4))
parsed = parsed.replace(r'\n', '\n')
parsed = parsed.replace(r'\"', '"')

# print result
print(parsed)
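The two .replace calls undo the escaping that to_json applies to the nested JSON strings. An alternative sketch that avoids the string surgery is to parse the nested strings back into objects and dump once:
import json

# parse the nested olist strings back into objects, then dump the whole
# structure in one go (assumes the same df1 as above)
records = df1.drop('sub_df', axis=1).to_dict(orient='records')
for rec in records:
    rec['olist'] = json.loads(rec['olist'])
print(json.dumps(records, indent=4))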
UPDATE 2
Here is a way to add an index column to a dataframe:
df1['index'] = [e for e in range(df1.shape[0])]
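Equivalent built-ins, for reference:
df1['index'] = range(df1.shape[0])  # assign a range directly
# or promote the existing index to a column named 'index':
df1 = df1.reset_index()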
This is my code assigning title values from a dataframe back to the JSON object. The assignment loop takes quite a bit of time when the JSON object has 100000 records. Does anyone know how to improve the performance of this code? Many thanks.
import json
import random

import pandas as pd
import pydash as _

data = [{"pid": 1, "name": "Test1", "title": ""}, {"pid": 2, "name": "Test2", "title": ""}]  # 5000 records

# dataframe1
df = pd.json_normalize(data)

# dataframe2
pid = [x for x in range(1, 5000)]
title_set = ["Boss", "CEO", "CFO", "PMO", "Team Lead"]
titles = [title_set[random.randrange(0, 5)] for x in range(1, 5000)]
df2 = pd.DataFrame({'pid': pid, 'title': titles})

# left join dataframe1 and dataframe2
df3 = df.merge(df2, on='pid', how='left')

# assign title values from the dataframe back to the json object
for row in df3.iterrows():
    idx = _.find_index(data, lambda x: x['pid'] == row[1]['pid'])
    data[idx]['title'] = row[1]['title_y']

print(data)
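pydash's find_index rescans the whole list for every row, which makes the loop O(n^2). A sketch of a single-pass alternative using a plain dict lookup (same data and df2 as above):
# build a pid -> title mapping once, then update each record in one pass
title_by_pid = dict(zip(df2['pid'], df2['title']))
for record in data:
    record['title'] = title_by_pid.get(record['pid'], record['title'])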

How to Get JSON values Python

Code to get the data in JSON format:
#...
cursor.execute("SELECT * FROM user")
response = {
    "version": "5.2",
    "user_type": "online",
    "user": list(cursor),
}
response = json.dumps(response, sort_keys=False, indent=4, separators=(',', ': '))
print(response)
# ...
This produces output as
{
    "version": "5.2",
    "user_type": "online",
    "user": [
        {
            "name": "John",
            "id": 50
        },
        {
            "name": "Mark",
            "id": 57
        }
    ]
}
print(response["user"]) then raises TypeError: string indices must be integers.
How do I access the values in the JSON?
json.dumps returns a string, so you need a small conversion back; something like this (not sure this is the exact method):
Solution:
from json import JSONDecoder, JSONEncoder

# either round-trip through the encoder/decoder...
response = JSONEncoder().encode(response)
response = JSONDecoder().decode(response)
# ...or simply parse the JSON string back into a dict
response = json.loads(response)
print(response['user'][0]['id'])
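A simpler route (an assumption about the intent) is to keep the original dict for lookups and use json.dumps only for display:
import json

response = {
    "version": "5.2",
    "user_type": "online",
    "user": [{"name": "John", "id": 50}, {"name": "Mark", "id": 57}],
}
print(json.dumps(response, indent=4))  # pretty-print only
print(response["user"][0]["id"])       # direct access: 50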