Pandas: read one parameter from nested JSON

I have the following JSON file and I would like to read only the "dataRecordId" parameters and store them in a DataFrame:
{'responseInformation': '20 metadata records in response.',
'metaDataResponse': [{'timestampFrom': '2020-10-07T10:19:07.7810000Z',
'timestampTo': '2020-10-07T23:59:59.9999990Z',
'component': {'type': '', 'id': '', 'name': '', 'comment': ''},
'resource': {'type': 'EQU', 'id': '6100380', 'name': '', 'comment': ''},
'processStep': {'type': '', 'id': '', 'name': '', 'comment': ''},
'context': '',
'dataRecords': [{'dataRecordId': '171533103',
'groupName': 'Process',
'sensorName': 'AutomaticProcessActive',
'profile': 'sd',
'type': 'Switch2Way',
'unit': 'state',
'returnType': 'timeSeries'}]},
{'timestampFrom': '2020-10-08T00:00:00.6540000Z',
'timestampTo': '2020-10-08T23:59:59.9999990Z',
'component': {'type': '', 'id': '', 'name': '', 'comment': ''},
'resource': {'type': 'EQU', 'id': '6100380', 'name': '', 'comment': ''},
'processStep': {'type': '', 'id': '', 'name': '', 'comment': ''},
'context': '',
'dataRecords': [{'dataRecordId': '171534669',
'groupName': 'Process',
'sensorName': 'AutomaticProcessActive',
'profile': 'sd',
'type': 'Switch2Way',
'unit': 'state',
'returnType': 'timeSeries'}]},
This is what I have done so far, but I have no idea how to go deeper into the structure in order to reach the 'dataRecordId' values:
import json
with open('file_200826_201026.json') as json_file:
    data = json.load(json_file)

for p in data['metaDataResponse']:
    print(p['dataRecords'])
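For reference, a minimal sketch of one way to collect every dataRecordId into a DataFrame; it assumes the file parses with json.load as above and that each dataRecords list may hold more than one record:
import json
import pandas as pd

with open('file_200826_201026.json') as json_file:
    data = json.load(json_file)

# one row per dataRecordId, across all metaDataResponse entries
record_ids = [
    record['dataRecordId']
    for entry in data['metaDataResponse']
    for record in entry['dataRecords']
]
df = pd.DataFrame({'dataRecordId': record_ids})
print(df)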

Related

Json decode error while importing from a csv file

I am writing a Python program that loads a JSON string from a .csv file and decodes it. The .csv file includes the header row and one entry below for reference.
key,labels,raw_tweet
2017_Q3_270,"[0, 0]","{'in_reply_to_screen_name': None, 'user': {'profile_banner_url': 'https://pbs.twimg.com/profile_banners/148491006/1494299074', 'follow_request_sent': None, 'name': 'Vanessa', 'verified': False, 'profile_sidebar_fill_color': 'FFFFFF', 'profile_background_color': '352726', 'is_translator': False, 'profile_background_image_url_https': 'https://pbs.twimg.com/profile_background_images/578700342637895680/j-o_FCwY.png', 'id': 148491006, 'geo_enabled': True, 'profile_background_image_url': 'http://pbs.twimg.com/profile_background_images/578700342637895680/j-o_FCwY.png', 'default_profile': False, 'contributors_enabled': False, 'default_profile_image': False, 'location': 'everywhere', 'profile_background_tile': True, 'notifications': None, 'listed_count': 9, 'profile_link_color': '7FDBB6', 'protected': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/891824958225215488/h__HMMlC_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/891824958225215488/h__HMMlC_normal.jpg', 'following': None, 'time_zone': 'Eastern Time (US & Canada)', 'friends_count': 588, 'url': 'https://Instagram.com/vmanks/', 'profile_text_color': '333333', 'followers_count': 541, 'utc_offset': -14400, 'id_str': '148491006', 'description': 'from the bronx, studying at cornell, slowly but surely finding solace', 'created_at': 'Wed May 26 21:01:46 +0000 2010', 'screen_name': 'vmankss', 'favourites_count': 19781, 'profile_use_background_image': True, 'profile_sidebar_border_color': 'FFFFFF', 'statuses_count': 50506, 'lang': 'en'}, 'retweet_count': 0, 'is_quote_status': False, 'in_reply_to_user_id': None, 'id': 901132409508421632, 'coordinates': None, 'entities': {'symbols': [], 'urls': [], 'user_mentions': [], 'hashtags': []}, 'text': ""I basically just go to financial aid to take candy from the candy bowl, y'all are unhelpful"", 'in_reply_to_status_id_str': None, 'in_reply_to_status_id': None, 'geo': None, 'favorited': False, 'place': {'country_code': 'US', 'bounding_box': {'type': 'Polygon', 'coordinates': [[[-76.547738, 42.41815], [-76.547738, 42.480827], [-76.469987, 42.480827], [-76.469987, 42.41815]]]}, 'attributes': {}, 'country': 'United States', 'url': 'https://api.twitter.com/1.1/geo/id/ae76bffcaf2bf545.json', 'full_name': 'Ithaca, NY', 'name': 'Ithaca', 'id': 'ae76bffcaf2bf545', 'place_type': 'city'}, 'favorite_count': 0, 'retweeted': False, 'timestamp_ms': '1503681683314', 'truncated': False, 'id_str': '901132409508421632', 'created_at': 'Fri Aug 25 17:21:23 +0000 2017', 'in_reply_to_user_id_str': None, 'contributors': None, 'source': 'Twitter for iPhone', 'lang': 'en', 'filter_level': 'low'}"
2015_Q1_494,"[0, 0]","{'in_reply_to_user_id_str': None, 'id_str': '577090329658175488', 'timestamp_ms': '1426424031067', 'in_reply_to_status_id_str': None, 'lang': 'en', 'favorited': False, 'retweeted': False, 'in_reply_to_status_id': None, 'id': 577090329658175488, 'filter_level': 'low', 'created_at': 'Sun Mar 15 12:53:51 +0000 2015', 'in_reply_to_user_id': None, 'place': {'country': 'United States', 'url': 'https://api.twitter.com/1.1/geo/id/a307591cd0413588.json', 'id': 'a307591cd0413588', 'country_code': 'US', 'place_type': 'city', 'attributes': {}, 'full_name': 'Buffalo, NY', 'bounding_box': {'type': 'Polygon', 'coordinates': [[[-78.912276, 42.826008], [-78.912276, 42.966451], [-78.79485, 42.966451], [-78.79485, 42.826008]]]}, 'name': 'Buffalo'}, 'truncated': False, 'entities': {'user_mentions': [], 'hashtags': [], 'symbols': [], 'trends': [], 'urls': []}, 'text': '""He licked coke off an encyclopedia"" only in south buffalo', 'retweet_count': 0, 'source': 'Twitter for iPhone', 'in_reply_to_screen_name': None, 'user': {'id_str': '480575646', 'friends_count': 367, 'profile_image_url': 'http://pbs.twimg.com/profile_images/571759767896629250/C-94okMM_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/480575646/1402863912', 'listed_count': 2, 'screen_name': 'MichaelaFeeney', 'lang': 'en', 'notifications': None, 'profile_text_color': '333333', 'verified': False, 'favourites_count': 3995, 'name': 'Michæla...', 'protected': False, 'statuses_count': 2666, 'id': 480575646, 'profile_sidebar_border_color': 'C0DEED', 'profile_use_background_image': True, 'profile_sidebar_fill_color': 'DDEEF6', 'is_translator': False, 'time_zone': None, 'profile_link_color': '0084B4', 'created_at': 'Wed Feb 01 17:11:27 +0000 2012', 'geo_enabled': True, 'url': None, 'contributors_enabled': False, 'following': None, 'default_profile_image': False, 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'description': 'They call me Lông Isländ. Brockport2018✌', 'utc_offset': None, 'location': '', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/571759767896629250/C-94okMM_normal.jpeg', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'default_profile': True, 'followers_count': 221, 'follow_request_sent': None, 'profile_background_color': 'C0DEED'}, 'coordinates': {'type': 'Point', 'coordinates': [-78.805803, 42.869134]}, 'possibly_sensitive': False, 'geo': {'type': 'Point', 'coordinates': [42.869134, -78.805803]}, 'favorite_count': 0, 'contributors': None}"
2017_Q4_280,"[0, 0]","{'in_reply_to_screen_name': None, 'user': {'profile_banner_url': 'https://pbs.twimg.com/profile_banners/2812396208/1425183203', 'follow_request_sent': None, 'name': 'HunnyBon', 'verified': False, 'profile_sidebar_fill_color': '000000', 'profile_background_color': '000000', 'notifications': None, 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'id': 2812396208, 'geo_enabled': True, 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'default_profile': False, 'contributors_enabled': False, 'default_profile_image': False, 'location': 'New York, NY', 'profile_background_tile': False, 'translator_type': 'none', 'listed_count': 5, 'profile_link_color': '666666', 'protected': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/572570217272713216/rzw1Bbqs_normal.png', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/572570217272713216/rzw1Bbqs_normal.png', 'following': None, 'time_zone': None, 'friends_count': 68, 'url': 'http://www.hunnybon.com', 'profile_text_color': '000000', 'followers_count': 66, 'utc_offset': None, 'id_str': '2812396208', 'description': ""A Healthier Candy Store..organic, vegan, and nonGMO. Indulge your sweet tooth without the guilt. Chocolates, gummies, caramels...what's your indulgence?"", 'created_at': 'Tue Sep 16 03:56:36 +0000 2014', 'screen_name': 'HunnyBonSweets', 'favourites_count': 53, 'profile_use_background_image': False, 'profile_sidebar_border_color': '000000', 'lang': 'en', 'statuses_count': 252, 'is_translator': False}, 'retweet_count': 0, 'is_quote_status': False, 'in_reply_to_user_id': None, 'id': 925755798147313664, 'coordinates': {'type': 'Point', 'coordinates': [-74.0064, 40.7142]}, 'entities': {'symbols': [], 'urls': [{'expanded_url': '', 'display_url': 'instagram.com/p/Ba9WuoQlYuk/', 'url': '', 'indices': [98, 121]}], 'user_mentions': [], 'hashtags': []}, 'text': '🍫Hello November, and hello to our new Chocolate Matcha Truffles! 🍫RAW dark chocolate, CREAMY NUT… ', 'in_reply_to_status_id_str': None, 'in_reply_to_status_id': None, 'geo': {'type': 'Point', 'coordinates': [40.7142, -74.0064]}, 'favorited': False, 'reply_count': 0, 'place': {'country_code': 'US', 'bounding_box': {'type': 'Polygon', 'coordinates': [[[-74.026675, 40.683935], [-74.026675, 40.877483], [-73.910408, 40.877483], [-73.910408, 40.683935]]]}, 'attributes': {}, 'country': 'United States', 'url': '', 'full_name': 'Manhattan, NY', 'name': 'Manhattan', 'id': '01a9a39529b27f36', 'place_type': 'city'}, 'favorite_count': 0, 'retweeted': False, 'timestamp_ms': '1509552356646', 'possibly_sensitive': False, 'truncated': False, 'id_str': '925755798147313664', 'created_at': 'Wed Nov 01 16:05:56 +0000 2017', 'quote_count': 0, 'in_reply_to_user_id_str': None, 'contributors': None, 'source': '', 'lang': 'en', 'filter_level': 'low'}"
I am trying to load raw_tweet, which is a JSON object stored as a string, and decode it into a JSON object. I keep getting errors regardless of how I decode the string.
import csv
import json
with open('testfile.csv','r', encoding='utf-8', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        jobj = row['raw_tweet'].replace("\'", "\"")
        jobj = jobj.replace("None", "\"\"")
        json.loads(jobj)
This is how I load the csv file. When I run the program, I get the following error. I also tried using a pandas DataFrame to load it and decode it into a JSON object, but that failed as well. Please suggest where I went wrong.
Traceback (most recent call last):
File "/Sandbox/csvfile.py", line 9, in <module>
json.loads(jobj)
File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/json/__init__.py", line 357, in loads
return _default_decoder.decode(s)
File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/json/decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/json/decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 184 (char 183)
In your csv file, the raw_tweet column contains instances of False without any quotes. Also, replacing every single quote with a double quote breaks badly, because the data already contains strings like y'all that use a single quote internally; only the quotes around keys and values should be replaced, not the quotes that occur inside a string, so there are a lot of conditions to handle.
I would therefore suggest a different way of evaluating the csv and dumping the raw_tweet column as JSON:
import json
import pandas as pd

data = pd.read_csv("test.csv").to_dict('records')
for d in data:
    raw_tweet_dict = eval(d['raw_tweet'])
    with open("json_dump.json", "w") as fp:
        json.dump(raw_tweet_dict, fp)
You can use the raw_tweet_dict as a dictionary if this needs further transformation.
Alternatively, you can stick with your approach, but you have to add a lot of replacement conditions, which I have added below; it should work on your csv sample.
with open("test.csv", "r") as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
jobj = row['raw_tweet'].replace('"', "'")
jobj = jobj.replace("None", "''")
jobj = jobj.replace("False", "'False'").replace("True", "'True'")
jobj = jobj.replace("':", '\":').replace(": '", ': \"').replace("',", '\",').replace(", '", ', \"').replace("{'", '{\"').replace("'}", '\"}')
json.loads(jobj)
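As a further note that is not part of the original answer: since the raw_tweet values are Python dict literals rather than JSON, ast.literal_eval sidesteps both the quote-replacement pitfalls and the risks of eval. A minimal sketch:
import ast
import csv
import json

with open("test.csv", "r", encoding="utf-8", newline="") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # safely parse the Python-style literal (handles None/True/False and
        # apostrophes inside strings such as y'all)
        raw_tweet = ast.literal_eval(row['raw_tweet'])
        # re-serialize as real JSON if a JSON string is needed downstream
        print(json.dumps(raw_tweet)[:80])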

How to pass JSON data from one folder to another folder in React

I am working on a React project where I am trying to pass JSON data from one folder to another, but it's not working. It shows an error like this:
./src/Pages/Dashboard/Dashboard.js Module not found: Can't resolve
'./API/jsondata'
This is my code.
This is jsondata.js
{
user: [
{
'id': '1',
'name': 'test1',
'age': '11',
'gender': 'male',
'email': 'test1@gmail.com'
},
{
'id': '2',
'name': 'test2',
'age': '12',
'gender': 'male',
'email': 'test2@gmail.com'
}, {
'id': '3',
'name': 'test3',
'age': '13',
'gender': 'male',
'email': 'test3@gmail.com'
}, {
'id': '4',
'name': 'test4',
'age': '14',
'gender': 'male',
'email': 'test4@gmail.com'
}, {
'id': '5',
'name': 'test5',
'age': '15',
'gender': 'male',
'email': 'test5@gmail.com'
},
{
'id': '6',
'name': 'test6',
'age': '16',
'gender': 'male',
'email': 'test6@gmail.com'
},
]
}
This is Dashboard.js
import React from 'react';
import Jsondata from './API/jsondata'
import './Dashboard.css';
const Dashboard = () => {
console.log(Jsondata, 'data')
return (
<div className='container'>
<div className='row'>
<div className='col-12'>
</div>
</div>
</div>
)
}
export default Dashboard
Any ideas as to what may be causing the problem?
There can be many reasons:
The first is that you didn't export your data from the jsondata file:
export const Jsondata = [
{
'id': '1',
'name': 'test1',
'age': '11',
'gender': 'male',
'email': 'test1@gmail.com'
},
{
'id': '2',
'name': 'test2',
'age': '12',
'gender': 'male',
'email': 'test2@gmail.com'
},
];
Second, maybe your file path is wrong; just double-check it. Also, here you import the name Jsondata, but there is no such const in the jsondata file:
import {Jsondata} from './API/jsondata'
You can define a variable inside the jsondata.js file, like this:
const users = [
...
];
export { users };
And in Dashboard.js, you can import it as:
import { users } from './API/jsondata';
Rename the jsondata file to data.json (the file should have a .json extension) and then it works. Your import can omit the .json extension, like this:
import mydata from "./data";
Note: You don't have to export anything in the data.json file.

Pandas - json normalize inside dataframe

I want to break down a column in a dataframe into multiple columns.
I have a dataframe with the following configuration:
GroupId,SubGroups,Type,Name
-4781505553015217258,"{'GroupId': -732592932641342965, 'SubGroups': [], 'Type': 'DefaultSite', 'Name': 'Default Site'}",OrganisationGroup,CompanyXYZ
-4781505553015217258,"{'GroupId': 8123255835936628631, 'SubGroups': [], 'Type': 'SiteGroup', 'Name': 'MERCEDES BENZ'}",OrganisationGroup,CompanyXYZ
-4781505553015217258,"{'GroupId': -1785570219922840611, 'SubGroups': [], 'Type': 'SiteGroup', 'Name': 'VOLVO'}",OrganisationGroup,CompanyXYZ
-4781505553015217258,"{'GroupId': -3670461095557699088, 'SubGroups': [], 'Type': 'SiteGroup', 'Name': 'SCANIA'}",OrganisationGroup,CompanyXYZ
-4781505553015217258,"{'GroupId': 8683757391859854416, 'SubGroups': [], 'Type': 'SiteGroup', 'Name': 'DRIVERS'}",OrganisationGroup,CompanyXYZ
-4781505553015217258,"{'GroupId': -8066654520755643389, 'SubGroups': [], 'Type': 'SiteGroup', 'Name': 'X - DECOMMISSION'}",OrganisationGroup,CompanyXYZ
-4781505553015217258,"{'GroupId': 4177323092254043025, 'SubGroups': [], 'Type': 'SiteGroup', 'Name': 'X-INSTALLATION'}",OrganisationGroup,CompanyXYZ
-4781505553015217258,"{'GroupId': -6088426161802844604, 'SubGroups': [], 'Type': 'SiteGroup', 'Name': 'FORD'}",OrganisationGroup,CompanyXYZ
-4781505553015217258,"{'GroupId': 8512440039365422841, 'SubGroups': [], 'Type': 'SiteGroup', 'Name': 'HEAVY VEHICLES'}",OrganisationGroup,CompanyXYZ
I want to create a new dataframe where the SubGroups column is broken into its components. Note that the column names coming from the SubGroups column are prefixed with SubGroup_:
GroupId, SubGroup_GroupId, SubGroup_SubGroups, SubGroup_Type, SubGroup_Name, Type, Name
-4781505553015217258, -732592932641342965, [], 'DefaultSite', 'Default Site', OrganisationGroup, CompanyXYZ
-4781505553015217258, 8123255835936628631, [], 'SiteGroup', 'MERCEDES BENZ', OrganisationGroup, CompanyXYZ
I have tried the following code:
for row in AllSubGroupsDF.itertuples():
    newDF = newDF.append(pd.io.json.json_normalize(row.SubGroups))
But it returns
GroupId,SubGroups,Type,Name
-732592932641342965,[],DefaultSite,Default Site
8123255835936628631,[],SiteGroup,MERCEDES BENZ
-1785570219922840611,[],SiteGroup,VOLVO
-3670461095557699088,[],SiteGroup,SCANIA
8683757391859854416,[],SiteGroup,DRIVERS
-8066654520755643389,[],SiteGroup,X - DECOMMISSION
4177323092254043025,[],SiteGroup,X-INSTALLATION
-6088426161802844604,[],SiteGroup,FORD
8512440039365422841,[],SiteGroup,HEAVY VEHICLES
I would like to have it all end up in one dataframe but I'm not sure how. Please help?
You can try using the ast module:
import pandas as pd
import ast
data = [[-4781505553015217258,"{'GroupId': -732592932641342965, 'SubGroups': [], 'Type': 'DefaultSite', 'Name': 'Default Site'}","OrganisationGroup","CompanyXYZ"],
[-4781505553015217258,"{'GroupId': 8123255835936628631, 'SubGroups': [], 'Type': 'SiteGroup', 'Name': 'MERCEDES BENZ'}","OrganisationGroup","CompanyXYZ"],
[-4781505553015217258,"{'GroupId': -1785570219922840611, 'SubGroups': [], 'Type': 'SiteGroup', 'Name': 'VOLVO'}","OrganisationGroup","CompanyXYZ"],
[-4781505553015217258,"{'GroupId': -3670461095557699088, 'SubGroups': [], 'Type': 'SiteGroup', 'Name': 'SCANIA'}","OrganisationGroup","CompanyXYZ"],
[-4781505553015217258,"{'GroupId': 8683757391859854416, 'SubGroups': [], 'Type': 'SiteGroup', 'Name': 'DRIVERS'}","OrganisationGroup","CompanyXYZ"],
[-4781505553015217258,"{'GroupId': -8066654520755643389, 'SubGroups': [], 'Type': 'SiteGroup', 'Name': 'X - DECOMMISSION'}","OrganisationGroup","CompanyXYZ"],
[-4781505553015217258,"{'GroupId': 4177323092254043025, 'SubGroups': [], 'Type': 'SiteGroup', 'Name': 'X-INSTALLATION'}","OrganisationGroup","CompanyXYZ"],
[-4781505553015217258,"{'GroupId': -6088426161802844604, 'SubGroups': [], 'Type': 'SiteGroup', 'Name': 'FORD'}","OrganisationGroup","CompanyXYZ"],
[-4781505553015217258,"{'GroupId': 8512440039365422841, 'SubGroups': [], 'Type': 'SiteGroup', 'Name': 'HEAVY VEHICLES'}","OrganisationGroup","CompanyXYZ"]]
df = pd.DataFrame(data,columns=["GroupId","SubGroups","Type","Name"])
df["SubGroup_GroupId"] = df["SubGroups"].map(lambda x: ast.literal_eval(x)["GroupId"])
df["SubGroup_SubGroups"] = df["SubGroups"].map(lambda x: ast.literal_eval(x)["SubGroups"])
df["SubGroup_Type"] = df["SubGroups"].map(lambda x: ast.literal_eval(x)["Type"])
df["SubGroup_Name"] = df["SubGroups"].map(lambda x: ast.literal_eval(x)["Name"])
df
Hope this helps!!
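For completeness, a minimal sketch that is not part of the answer above: with pandas 1.0+ you could also flatten the parsed dicts with pandas.json_normalize, prefix the new columns, and join them back onto the original frame, producing the single prefixed dataframe the question asks for:
import ast
import pandas as pd

# df is the frame built in the answer above, with SubGroups holding dict strings
parsed = df["SubGroups"].map(ast.literal_eval)
sub_df = pd.json_normalize(parsed.tolist()).add_prefix("SubGroup_")
result = df.drop(columns=["SubGroups"]).join(sub_df)
print(result.columns.tolist())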

Python giving vague error when trying to parse JSON object

I'm trying to use the peopledata API at peopledatalabs.com to retrieve data. I am using the sample python code located at https://docs.peopledatalabs.com/docs/quickstart
which is:
import requests
API_KEY = # YOUR API KEY
###
pdl_url = "https://api.peopledatalabs.com/v4/person?api_key={}&".format(API_KEY)
param_string = "name=sean thorne&company=peopledatalabs.com"
json_response = requests.get(pdl_url + param_string).json()
# OR
pdl_url = "https://api.peopledatalabs.com/v4/person"
params = {
    "api_key": API_KEY,
    "name": ["sean thorne"],
    "company": ["peopledatalabs.com"]
}
json_response = requests.get(pdl_url, params=params).json()
json_response returns:
{'status': 200,
'likelihood': 5,
'data': {'id': 'yj5RUCSORrirXf2sf3gR',
'skills': [{'name': 'social media'},
{'name': 'strategic partnerships'},
{'name': 'public speaking'},
{'name': 'sales'},
{'name': 'photoshop'},
{'name': 'networking'},
{'name': 'mobile marketing'},
{'name': 'start ups'},
{'name': 'business development'},
{'name': 'fundraising'},
{'name': 'seo'},
{'name': 'strategy'},
{'name': 'idea generation'},
{'name': 'enterprise technology sales'},
{'name': 'entrepreneurship'},
{'name': 'social networking'},
{'name': 'creative strategy'},
{'name': 'time management'},
{'name': 'product management'},
{'name': 'social media marketing'},
{'name': 'css'},
{'name': 'https'},
{'name': 'saas'},
{'name': 'management'},
{'name': 'project management'},
{'name': 'public relations'},
{'name': 'marketing communications'},
{'name': 'sales/marketing and strategic partnerships'},
{'name': 'marketing strategy'},
{'name': 'mobile devices'},
{'name': 'installation'},
{'name': 'company culture'},
{'name': 'strategic vision'},
{'name': 'html5'},
{'name': 'hiring'}],
'industries': [{'name': 'computer software', 'is_primary': True}],
'interests': [{'name': 'location based services'},
{'name': 'mobile'},
{'name': 'social media'},
{'name': 'colleges'},
{'name': 'university students'},
{'name': 'consumer internet'},
{'name': 'college campuses'}],
'profiles': [{'network': 'linkedin',
'ids': ['145991517'],
'clean': 'linkedin.com/in/seanthorne',
'aliases': [],
'username': 'seanthorne',
'is_primary': True,
'url': 'http://www.linkedin.com/in/seanthorne'},
{'network': 'linkedin',
'ids': [],
'clean': 'linkedin.com/in/sean-thorne-9b9a8540',
'aliases': ['linkedin.com/pub/sean-thorne/40/a85/9b9'],
'username': 'sean-thorne-9b9a8540',
'is_primary': False,
'url': 'http://www.linkedin.com/in/sean-thorne-9b9a8540'},
{'network': 'twitter',
'ids': [],
'clean': 'twitter.com/seanthorne5',
'aliases': [],
'username': 'seanthorne5',
'url': 'http://www.twitter.com/seanthorne5'},
{'network': 'angellist',
'ids': [],
'clean': 'angel.co/475041',
'aliases': [],
'username': '475041',
'url': 'http://www.angel.co/475041'}],
'emails': [{'address': 'sthorne@uoregon.edu',
'type': None,
'sha256': 'e206e6cd7fa5f9499fd6d2d943dcf7d9c1469bad351061483f5ce7181663b8d4',
'domain': 'uoregon.edu',
'local': 'sthorne'},
{'address': 'sean@peopledatalabs.com',
'type': 'current_professional',
'sha256': '138ea1a7076bb01889af2309de02e8b826c27f022b21ea8cf11aca9285d5a04e',
'domain': 'peopledatalabs.com',
'local': 'sean'}],
'phone_numbers': [{'E164': '+14155688415',
'number': '+14155688415',
'type': None,
'country_code': '1',
'national_number': '4155688415',
'area_code': '415'}],
'birth_date_fuzzy': '1990',
'birth_date': None,
'gender': 'male',
'primary': {'job': {'company': {'name': 'people data labs',
'founded': '2015',
'industry': 'information technology and services',
'location': {'locality': 'san francisco',
'region': 'california',
'country': 'united states'},
'profiles': ['linkedin.com/company/peopledatalabs',
'linkedin.com/company/1640694639'],
'website': 'peopledatalabs.com',
'size': '11-50'},
'locations': [],
'end_date': None,
'start_date': '2015-03',
'title': {'levels': ['owner'],
'name': 'co-founder',
'functions': ['co founder']},
'last_updated': '2019-05-01'},
'location': {'name': 'san francisco, california, united states',
'locality': 'san francisco',
'region': 'california',
'country': 'united states',
'last_updated': '2019-01-01',
'continent': 'north america'},
'name': {'first_name': 'sean',
'middle_name': None,
'last_name': 'thorne',
'clean': 'sean thorne'},
'industry': 'computer software',
'personal_emails': [],
'linkedin': 'linkedin.com/in/seanthorne',
'work_emails': ['sean@peopledatalabs.com'],
'other_emails': ['sthorne@uoregon.edu']},
'names': [{'first_name': 'sean',
'last_name': 'thorne',
'suffix': None,
'middle_name': None,
'middle_initial': None,
'name': 'sean thorne',
'clean': 'sean thorne',
'is_primary': True}],
'locations': [{'name': 'san francisco, california, united states',
'locality': 'san francisco',
'region': 'california',
'subregion': 'city and county of san francisco',
'country': 'united states',
'continent': 'north america',
'type': 'locality',
'geo': '37.77,-122.41',
'postal_code': None,
'zip_plus_4': None,
'street_address': None,
'address_line_2': None,
'most_recent': True,
'is_primary': True,
'last_updated': '2019-01-01'}],
'experience': [{'company': {'name': 'hallspot',
'size': '1-10',
'founded': '2013',
'industry': 'computer software',
'location': {'locality': 'portland',
'region': 'oregon',
'country': 'united states'},
'profiles': ['linkedin.com/company/hallspot',
'twitter.com/hallspot',
'crunchbase.com/organization/hallspot',
'linkedin.com/company/3019184'],
'website': 'hallspot.com'},
'locations': [],
'end_date': '2015-02',
'start_date': '2012-08',
'title': {'levels': ['owner'],
'name': 'co-founder',
'functions': ['co founder']},
'type': None,
'is_primary': False,
'most_recent': False,
'last_updated': None},
{'company': {'name': 'people data labs',
'size': '11-50',
'founded': '2015',
'industry': 'information technology and services',
'location': {'locality': 'san francisco',
'region': 'california',
'country': 'united states'},
'profiles': ['linkedin.com/company/peopledatalabs',
'linkedin.com/company/1640694639'],
'website': 'peopledatalabs.com'},
'locations': [],
'end_date': None,
'start_date': '2015-03',
'title': {'levels': ['owner'],
'name': 'co-founder',
'functions': ['co founder']},
'type': None,
'is_primary': True,
'most_recent': True,
'last_updated': '2019-05-01'}],
'education': [{'school': {'name': 'university of oregon',
'type': 'post-secondary institution',
'location': 'eugene, oregon, united states',
'profiles': ['linkedin.com/edu/university-of-oregon-19207',
'facebook.com/universityoforegon',
'twitter.com/uoregon'],
'website': 'uoregon.edu'},
'end_date': '2014',
'start_date': '2010',
'gpa': None,
'degrees': [],
'majors': ['entrepreneurship'],
'minors': [],
'locations': []}]},
'dataset_version': '7.3'}
While trying to get the phone_numbers field, I have tried:
print(json_response["phone_numbers"])
and got the following error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-132-2acb0f9f59c5> in <module>()
----> 1 json_response["phone_numbers"]
KeyError: 'phone_numbers'
I am hoping to get the number '+14155688415' as my result
print(json_response["data"]["phone_numbers"])
When dealing with lots of data like that, JSONLint is a good resource to stay organized.
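Since phone_numbers sits under the data key and is a list of dicts, the single number from the response shown above can be pulled out like this:
# take the first (and here only) entry in the list and read its 'number' field
number = json_response["data"]["phone_numbers"][0]["number"]
print(number)  # '+14155688415'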

Strings getting converted to null when writing JSON representation of RDD

I am trying to write an RDD which is structured like
(int, ListofList, ListofListofList)
Something like this:
(49807360, [[111206019,'ABC','XYZ:RDC' , 'RDC' , 123] , [111206019,'ABC','XYZ:RDC' , 'RDC' , 123]] , [[[111206019,'ABC','XYZ:RDC' , 'RDC' , 123] , 111206019,'ABC','XYZ:RDC' , 'RDC' , 123]] , [[111206019,'ABC','XYZ:RDC' , 'RDC' , 123],[111206019,'ABC','XYZ:RDC' , 'RDC' , 123]])
When I print this in RDD form, I see the data correctly. When I use the built-in library to write it in JSON format, I get null values in place of the strings.
{"user":49807360,"history":[[111206019,null,null,null,123], [111206019,null,null,null,123]],"collection":...}
The line of code I am using to serialize the RDD to JSON is:
rdd.toDF().toJSON().saveAsTextFile(ouput_file_path)
I have also tried
rdd.toDF().write.json(ouput_file_path,"overwrite","gzip")
The above code was run on Spark version 2.0.0.
This happens because you use a DataFrame as an intermediate step. Spark SQL doesn't support heterogeneous arrays, so values which don't match the inferred type (array<bigint>) are replaced by NULL.
If you really want to go this way and support heterogeneous structures, you should use tuples, which are mapped to Spark SQL structs, or avoid depending on schema inference and provide the desired schema explicitly:
schema = ... # type: StructType
spark.createDataFrame(rdd, schema)
with a schema whose JSON representation is similar to:
{'fields': [{'metadata': {}, 'name': '_1', 'nullable': True, 'type': 'long'},
{'metadata': {},
'name': '_2',
'nullable': True,
'type': {'containsNull': True,
'elementType': {'fields': [{'metadata': {},
'name': '_1',
'nullable': True,
'type': 'long'},
{'metadata': {}, 'name': '_2', 'nullable': True, 'type': 'string'},
{'metadata': {}, 'name': '_3', 'nullable': True, 'type': 'string'},
{'metadata': {}, 'name': '_4', 'nullable': True, 'type': 'string'},
{'metadata': {}, 'name': '_5', 'nullable': True, 'type': 'long'}],
'type': 'struct'},
'type': 'array'}},
{'metadata': {},
'name': '_3',
'nullable': True,
'type': {'fields': [{'metadata': {},
'name': '_1',
'nullable': True,
'type': {'fields': [{'metadata': {},
'name': '_1',
'nullable': True,
'type': 'long'},
{'metadata': {}, 'name': '_2', 'nullable': True, 'type': 'string'},
{'metadata': {}, 'name': '_3', 'nullable': True, 'type': 'string'},
{'metadata': {}, 'name': '_4', 'nullable': True, 'type': 'string'},
{'metadata': {}, 'name': '_5', 'nullable': True, 'type': 'long'}],
'type': 'struct'}},
{'metadata': {}, 'name': '_2', 'nullable': True, 'type': 'long'},
{'metadata': {}, 'name': '_3', 'nullable': True, 'type': 'string'},
{'metadata': {}, 'name': '_4', 'nullable': True, 'type': 'string'},
{'metadata': {}, 'name': '_5', 'nullable': True, 'type': 'string'},
{'metadata': {}, 'name': '_6', 'nullable': True, 'type': 'long'}],
'type': 'struct'}},
{'metadata': {},
'name': '_4',
'nullable': True,
'type': {'containsNull': True,
'elementType': {'fields': [{'metadata': {},
'name': '_1',
'nullable': True,
'type': 'long'},
{'metadata': {}, 'name': '_2', 'nullable': True, 'type': 'string'},
{'metadata': {}, 'name': '_3', 'nullable': True, 'type': 'string'},
{'metadata': {}, 'name': '_4', 'nullable': True, 'type': 'string'},
{'metadata': {}, 'name': '_5', 'nullable': True, 'type': 'long'}],
'type': 'struct'},
'type': 'array'}}],
'type': 'struct'}
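For illustration only, a sketch of spelling that schema out explicitly in PySpark with StructType/StructField and passing it to createDataFrame; it assumes an existing SparkSession named spark, the rdd from the question, and uses the positional _1, _2, ... names and an output_file_path placeholder:
from pyspark.sql.types import (ArrayType, LongType, StringType,
                               StructField, StructType)

# struct for one inner record: (long, string, string, string, long)
record = StructType([
    StructField("_1", LongType(), True),
    StructField("_2", StringType(), True),
    StructField("_3", StringType(), True),
    StructField("_4", StringType(), True),
    StructField("_5", LongType(), True),
])

schema = StructType([
    StructField("_1", LongType(), True),                # the int
    StructField("_2", ArrayType(record, True), True),   # list of records
    StructField("_3", StructType([                      # nested struct
        StructField("_1", record, True),
        StructField("_2", LongType(), True),
        StructField("_3", StringType(), True),
        StructField("_4", StringType(), True),
        StructField("_5", StringType(), True),
        StructField("_6", LongType(), True),
    ]), True),
    StructField("_4", ArrayType(record, True), True),   # another list of records
])

# output_file_path is a placeholder for your destination path
df = spark.createDataFrame(rdd, schema)
df.write.json(output_file_path, "overwrite", "gzip")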