pandas json normalize key error with a particular json attribute - json

I have a json as:
mytestdata = {
"success": True,
"message": "",
"data": {
"totalCount": 95,
"goal": [
{
"user_id": 123455,
"user_email": "john.smith#test.com",
"user_first_name": "John",
"user_last_name": "Smith",
"people_goals": [
{
"goal_id": 545555,
"goal_name": "test goal name",
"goal_owner": "123455",
"goal_narrative": "",
"goal_type": {
"id": 1,
"name": "Team"
},
"goal_create_at": "1595874095",
"goal_modified_at": "1595874095",
"goal_created_by": "123455",
"goal_updated_by": "123455",
"goal_start_date": "1593561600",
"goal_target_date": "1601424000",
"goal_progress": "34",
"goal_progress_color": "#ff9933",
"goal_status": "1",
"goal_permission": "internal,team",
"goal_category": [],
"goal_owner_full_name": "John Smith",
"goal_team_id": "766754",
"goal_team_name": "",
"goal_workstreams": []
}
]
}
]
}
}
I am trying to display all details in "people_goals" along with "user_last_name", "user_first_name","user_email", "user_id" with json_normalize.
So far I am able to display "people_goals", "user_first_name","user_email" with the code
df2 = pd.json_normalize(data=mytestdata['data'], record_path=['goal', 'people_goals'],
meta=[['goal','user_first_name'], ['goal','user_last_name'], ['goal','user_email']], errors='ignore')
However I am having issue when trying to include ['goal', 'user_id'] in the meta=[]
The error is:
TypeError Traceback (most recent call last)
<ipython-input-192-b7a124a075a0> in <module>
7 df2 = pd.json_normalize(data=mytestdata['data'], record_path=['goal', 'people_goals'],
8 meta=[['goal','user_first_name'], ['goal','user_last_name'], ['goal','user_email'], ['goal','user_id']],
----> 9 errors='ignore')
10
11 # df2 = pd.json_normalize(data=mytestdata['data'], record_path=['goal', 'people_goals'])
The only difference I see for 'user_id' is that it is not a string
Am I missing something here?

Your code works on my platform. I've migrated away from using record_path and meta parameters for two reasons. a) they are difficult to work out b) there are compatibility issues between versions of pandas
Therefore I now use approach of use json_normalize() multiple times to progressively expand JSON. Or use pd.Series. Have included both as examples.
df = pd.json_normalize(data=mytestdata['data']).explode("goal")
df = pd.concat([df, df["goal"].apply(pd.Series)], axis=1).drop(columns="goal").explode("people_goals")
df = pd.concat([df, df["people_goals"].apply(pd.Series)], axis=1).drop(columns="people_goals")
df = pd.concat([df, df["goal_type"].apply(pd.Series)], axis=1).drop(columns="goal_type")
df.T
df2 = pd.json_normalize(pd.json_normalize(
pd.json_normalize(data=mytestdata['data']).explode("goal").to_dict(orient="records")
).explode("goal.people_goals").to_dict(orient="records"))
df2.T
print(df.T.to_string())
output
0
totalCount 95
user_id 123455
user_email john.smith#test.com
user_first_name John
user_last_name Smith
goal_id 545555
goal_name test goal name
goal_owner 123455
goal_narrative
goal_create_at 1595874095
goal_modified_at 1595874095
goal_created_by 123455
goal_updated_by 123455
goal_start_date 1593561600
goal_target_date 1601424000
goal_progress 34
goal_progress_color #ff9933
goal_status 1
goal_permission internal,team
goal_category []
goal_owner_full_name John Smith
goal_team_id 766754
goal_team_name
goal_workstreams []
id 1
name Team

Related

Pulling specific Parent/Child JSON data with Python

I'm having a difficult time figuring out how to pull specific information from a json file.
So far I have this:
# Import json library
import json
# Open json database file
with open('jsondatabase.json', 'r') as f:
data = json.load(f)
# assign variables from json data and convert to usable information
identifier = data['ID']
identifier = str(identifier)
name = data['name']
name = str(name)
# Collect data from user to compare with data in json file
print("Please enter your numerical identifier and name: ")
user_id = input("Numerical identifier: ")
user_name = input("Name: ")
if user_id == identifier and user_name == name:
print("Your inputs matched. Congrats.")
else:
print("Your inputs did not match our data. Please try again.")
And that works great for a simple JSON file like this:
{
"ID": "123",
"name": "Bobby"
}
But ideally I need to create a more complex JSON file and can't find deeper information on how to pull specific information from something like this:
{
"Parent": [
{
"Parent_1": [
{
"Name": "Bobby",
"ID": "123"
}
],
"Parent_2": [
{
"Name": "Linda",
"ID": "321"
}
]
}
]
}
Here is an example that you might be able to pick apart.
You could either:
Make a custom de-jsonify object_hook as shown below and do something with it. There is a good tutorial here.
Just gobble up the whole dictionary that you get without a custom de-jsonify and drill down into it and make a list or set of the results. (not shown)
Example:
import json
from collections import namedtuple
data = '''
{
"Parents":
[
{
"Name": "Bobby",
"ID": "123"
},
{
"Name": "Linda",
"ID": "321"
}
]
}
'''
Parent = namedtuple('Parent', ['name', 'id'])
def dejsonify(json_str: dict):
if json_str.get("Name"):
parent = Parent(json_str.get('Name'), int(json_str.get('ID')))
return parent
return json_str
res = json.loads(data, object_hook=dejsonify)
print(res)
# then we can do whatever... if you need lookups by name/id,
# we could put the result into a dictionary
all_parents = {(p.name, p.id) : p for p in res['Parents']}
lookup_from_input = ('Bobby', 123)
print(f'found match: {all_parents.get(lookup_from_input)}')
Result:
{'Parents': [Parent(name='Bobby', id=123), Parent(name='Linda', id=321)]}
found match: Parent(name='Bobby', id=123)

dataframe to JSON conversion

I have a dataframe that i need to convert to JSON. The data currently looks like
text ids
0 add a car None
1 None 695f1
2 None a86b5c
3 add another car to my log None
4 None 1ba0
5 Concerts None
6 None a4f7
7 None fea
8 None 410
I need the JSON dict to look something like this and ignore the none
{
"text": "add a car",
"ids": [
"695f1",
"a86b5c"
]
}
The steps I have are:
Figured it out.
first set NaN Values where None
df1 = df1.fillna(value=np.nan)
Fill with NaN with previous known value
df1['text'] = df['text'].fillna(method='ffill')
Drop NaN
df1 = df1.dropna()
convert to json
df1.to_json('temp.json', orient='records', lines=True)
The problem I have is the format appears incorrect. I am seeing
{"text":"add a car","ids":" 695f1"}
{"text":"add a car","ids":"a86b5c"}
{"text":"add another car to my log","ids":"1ba0"}
{"text":"Concerts","ids":"a4f7"}
I want :
{
"text": "add a car",
"ids":
[
"695f1",
"a86b5c"
]
}
{
"text": "add another car to my log",
"ids":
[
"1ba0",
],
}
{
"text": "Concerts",
"ids":
[
"a4f7",
"fea",
"410",
]
}
I think you are close, need aggregate list first:
df['text'] = df['text'].ffill()
df = df.dropna()
df1 = df.groupby('text', sort=False).agg(list).reset_index()
print (df1)
text ids
0 add a car [695f1, a86b5c]
1 add another car to my log [1ba0]
2 Concerts [a4f7, fea, 410]
df1.to_json('temp.json', orient='records')

How to convert dataframe output to json format and then Normalize the data?

I am running a sql and output i am reading as pandas df. Now i need to convert the data in to json and need to normalize the data. I tried to_json but this give partial solution.
Dataframe output:
| SalesPerson | ContactID |
|12345 |Tom|
|12345 |Robin|
|12345 |Julie|
Expected JSON:
{"SalesPerson": "12345", "ContactID":"Tom","Robin","Julie"}
Please see below code which i tried.
q = Select COL1, SalesPerson , ContactIDfrom table;
df = pd.read_sql(q, sqlconn)
df1=df.iloc[:, 1:2]
df2 = df1.to_json(orient='records')
also to_json result bracket which i also dont need.
Try this:
df.groupby('SalesPerson').apply(lambda x: pd.Series({
'ContactID': x['ContactID'].values
})).reset_index().to_json(orient='records')
Output (pretty printed):
[
{
"SalesPerson": 1,
"ContactID": ["Tom", "Robin", "Julie"]
},
{
"SalesPerson": 2,
"ContactID": ["Jack", "Mike", "Mary"]
}
]

How to Change a value in a Dataframe based on a lookup from a json file

I want to practice building models and I figured that I'd do it with something that I am familiar with: League of Legends. I'm having trouble replacing an integer in a dataframe with a value in a json.
The datasets I'm using come off of the kaggle. You can grab it and run it for yourself.
https://www.kaggle.com/datasnaek/league-of-legends
I have json file of the form: (it's actually must bigger, but I shortened it)
{
"type": "champion",
"version": "7.17.2",
"data": {
"1": {
"title": "the Dark Child",
"id": 1,
"key": "Annie",
"name": "Annie"
},
"2": {
"title": "the Berserker",
"id": 2,
"key": "Olaf",
"name": "Olaf"
}
}
}
and dataframe of the form
print df
gameDuration t1_champ1id
0 1949 1
1 1851 2
2 1493 1
3 1758 1
4 2094 2
I want to replace the ID in t1_champ1id with the lookup value in the json.
If both of these were dataframe, then I could use the merge option.
This is what I've tried. I don't know if this is the best way to read in the json file.
import pandas
df = pandas.read_csv("lol_file.csv",header=0)
champ = pandas.read_json("champion_info.json", typ='series')
for i in champ.data[0]:
for j in df:
if df.loc[j,('t1_champ1id')] == i:
df.loc[j,('t1_champ1id')] = champ[0][i]['name']
I get the below error:
the label [gameDuration] is not in the [index]'
I'm not sure that this is the most efficient way to do this, but I'm not sure how to do it at all either.
What do y'all think?
Thanks!
for j in df: iterates over the column names in df, which is unnecessary, since you're only looking to match against the column 't1_champ1id'. A better use of pandas functionality is to condense the id:name pairs from your JSON file into a dictionary, and then map it to df['t1_champ1id'].
player_names = {v['id']:v['name'] for v in json_file['data'].itervalues()}
df.loc[:, 't1_champ1id'] = df['t1_champ1id'].map(player_names)
# gameDuration t1_champ1id
# 0 1949 Annie
# 1 1851 Olaf
# 2 1493 Annie
# 3 1758 Annie
# 4 2094 Olaf
Created a dataframe from the 'data' in the json file (also transposed the resulting dataframe and then set the index to what you want to map, the id) then mapped that to the original df.
import json
with open('champion_info.json') as data_file:
champ_json = json.load(data_file)
champs = pd.DataFrame(champ_json['data']).T
champs.set_index('id',inplace=True)
df['champ_name'] = df.t1_champ1id.map(champs['name'])

How to convert pandas Series to desired JSON format?

I am having the following data on which I need to do apply aggregation function followed by groupby.
My data is as follows: data.csv
id,category,sub_category,count
0,x,sub1,10
1,x,sub2,20
2,x,sub2,10
3,y,sub3,30
4,y,sub3,5
5,y,sub4,15
6,z,sub5,20
Here I'm trying to get the count by sub-category wise. After that I need to store the result in JSON format. The following piece of code helps me in achieving that. test.py
import pandas as pd
df = pd.read_csv('data.csv')
sub_category_total = df['count'].groupby([df['category'], df['sub_category']]).sum()
print sub_category_total.reset_index().to_json(orient = "records")
The above code gives me the following format.
[{"category":"x","sub_category":"sub1","count":10},{"category":"x","sub_category":"sub2","count":30},{"category":"y","sub_category":"sub3","count":35},{"category":"y","sub_category":"sub4","count":15},{"category":"z","sub_category":"sub5","count":20}]
But, my desired format is as follows:
{
"x":[{
"sub_category":"sub1",
"count":10
},
{
"sub_category":"sub2",
"count":30}],
"y":[{
"sub_category":"sub3",
"count":35
},
{
"sub_category":"sub4",
"count":15}],
"z":[{
"sub_category":"sub5",
"count":20}]
}
By following the discussions # How to convert pandas DataFrame result to user defined json format, I replaced the last 2 lines of test.py with,
g = df.groupby('category')[["sub_category","count"]].apply(lambda x: x.to_dict(orient='records'))
print g.to_json()
It gives me the following output.
{"x":[{"count":10,"sub_category":"sub1"},{"count":20,"sub_category":"sub2"},{"count":10,"sub_category":"sub2"}],"y":[{"count":30,"sub_category":"sub3"},{"count":5,"sub_category":"sub3"},{"count":15,"sub_category":"sub4"}],"z":[{"count":20,"sub_category":"sub5"}]}
Though the above result is somewhat similar to my desired format, I couldn't perform any aggregation function over here as it throws error saying 'numpy.int64' object has no attribute 'to_dict'. Hence, I end up getting all of the rows in the data file.
Can somebody help me in achieving the above JSON format?
I think you can first aggregate with sum, parameter as_index=False was added to groupby, so output is Dataframe df1 and then use other solution:
df1 = (df.groupby(['category','sub_category'], as_index=False)['count'].sum())
print (df1)
category sub_category count
0 x sub1 10
1 x sub2 30
2 y sub3 35
3 y sub4 15
4 z sub5 20
g = df1.groupby('category')[["sub_category","count"]]
.apply(lambda x: x.to_dict(orient='records'))
print (g.to_json())
{
"x": [{
"sub_category": "sub1",
"count": 10
}, {
"sub_category": "sub2",
"count": 30
}],
"y": [{
"sub_category": "sub3",
"count": 35
}, {
"sub_category": "sub4",
"count": 15
}],
"z": [{
"sub_category": "sub5",
"count": 20
}]
}