Flatten JSON data to individual columns - json

I am working with Twitter streaming data, and the output I receive looks like this:
"data": {
"author_id": "1318123716522479616",
"created_at": "2020-11-05T04:18:21.000Z",
"entities": {
"hashtags": [
{
"end": 107,
"start": 86,
"tag": "MilliHesaplarYanyana"
}
],
"mentions": [
{
"end": 15,
"start": 3,
"username": "MilliTaakip"
}
]
},
"id": "1324204381177323520",
"lang": "tr",
"text": "RT #MilliTaakip: Milli hesaplar\u0131m\u0131z\u0131n g\u00fc\u00e7lenmesi i\u00e7in\nCumhurba\u015fkan\u0131m\u0131z\u0131n talimat\u0131yla,\n#MilliHesaplarYanyana \u00e7al\u0131\u015fmas\u0131n\u0131 destekliyoruz;\n\n\ud83c\uddf9\ud83c\uddf7\u2026"
}
}
I want to extract specific information like the hashtags from this data and store them in my database.
I tried several approaches, such as pd.json_normalize and flatten_json, but none of them work. I get the following as my output
here's my code:
def connect_to_endpoint(url, headers):
    """Stream tweets from ``url`` and append keyword matches to PostgreSQL.

    Every non-empty line of the streamed response is parsed as one JSON
    document.  Tweets whose text contains one of the keywords are flattened
    with ``pd.json_normalize``, given an extra ``hashtags`` column holding
    the list of tag strings, and appended to the ``new-tweets`` table.

    Relies on names defined at module level: ``requests``, ``json``, ``pd``,
    ``sqlalchemy``, ``create_engine`` and the ``payload`` query parameters.

    Args:
        url: streaming endpoint to request.
        headers: HTTP headers sent with the request.

    Raises:
        Exception: if the endpoint answers with a non-200 status code.
    """
    keywords = ('biden', 'election', 'trump', 'stocks')

    response = requests.request("GET", url, headers=headers, stream=True, params=payload)
    print(response.status_code)
    # Check the status before consuming the stream; previously the check only
    # ran after every line of an error body had already been processed.
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {}".format(
                response.status_code, response.text
            )
        )

    # One engine for the whole stream instead of a new engine (plus an unused,
    # never-closed psycopg2 connection) per line.  The URL separates the
    # credentials from the host with '@'.
    engine = create_engine('postgresql+psycopg2://postgres:root@localhost:5432/tweetData')

    for response_line in response.iter_lines():
        if not response_line:
            continue  # empty line in the stream, nothing to parse

        try:
            data = json.loads(response_line.decode('utf-8'))
        except ValueError:
            # Covers json.JSONDecodeError and UnicodeDecodeError.
            print("ERROR!")
            continue

        # "data", "text", "entities" and "hashtags" are all optional in the
        # payload, so use .get() rather than indexing to avoid KeyError.
        tweet = data.get("data", {})
        text = tweet.get("text", "")
        if not any(keyword in text.lower() for keyword in keywords):
            print("none")
            continue

        df = pd.json_normalize(data)
        hashtags = tweet.get("entities", {}).get("hashtags", [])
        # Single-row frame: wrap the tag list so it lands in one cell.
        df["hashtags"] = [[hashtag["tag"] for hashtag in hashtags]]

        # Columns that still hold lists/dicts cannot be stored as scalars;
        # map exactly those columns to a JSON column type.
        json_columns = {
            column: sqlalchemy.types.JSON
            for column in df.columns
            if df[column].map(lambda value: isinstance(value, (list, dict))).any()
        }
        try:
            df.to_sql("new-tweets", engine, if_exists='append', dtype=json_columns)
        except IOError:
            print("ERROR!")
        else:
            print("loaded")
Please advise on how should I proceed and what errors I have in my approach
EDIT:
Every time I try to retrieve the tweet data and the tweet has no entities or no hashtags, it raises KeyError: 'entities'.

In PostgreSQL you could use
SELECT value ->> 'tag'
FROM jsonb_array_elements(your_json #> '{data,entities,hashtags}') AS x(value);
to extract the tags.

Related

Finding Values in Python json.loads Dictionary

I'm working with a REST API that returns data in the following format:
{
"id": "2902cbad6da44459ad05abd1305eed14",
"displayName": "",
"sourceHost": "dev01.test.lan",
"sourceIP": "192.168.145.1",
"messagesPerSecond": 0,
"messages": 2733,
"size": 292062,
"archiveSize": 0,
"dates": [
{
"date": 1624921200000,
"messages": 279,
"size": 29753,
"archiveSize": 0
},
{
"date": 1625007600000,
"messages": 401,
"size": 42902,
"archiveSize": 0
}
]
}
I'm using json.loads to successfully pull the data from the API, and I now need to search for a particular "date:" value and read the corresponding "messages", "size" and "archiveSize" values.
I'm trying to use the "if-in" method to find the value I'm interested in, for example:
# Fetch the resource and parse the JSON body into a dict.
response = requests.request("GET", apiQuery, headers=headers, data=payload)
json_response = json.loads(response.text)
test = 2733
# dict.values() yields only the top-level values, so the dicts nested inside
# the "dates" list are never compared against `test`.
if test in json_response.values():
    print(f"Yes, value: '{test}' exist in dictionary")
else:
    print(f"No, value: '{test}' does not exist in dictionary")
This works fine for any value in the top section of the JSON return, but it never finds any values in the "dates" sub-branches.
I have two questions, firstly, how do I find the target "date" value? Secondly, once I find that "sub-branch" what would be the best way to extract the three values I need?
Thanks.
from json import load
def list_dates_whose_message_count_equals(dates=None, message_count=0):
    """Return the entries of ``dates`` whose "messages" value equals ``message_count``.

    Args:
        dates: iterable of dicts, such as the "dates" list of the API
            response.  ``None`` (the default) is treated as an empty list.
        message_count: value each entry's "messages" field is compared to.

    Returns:
        A list with the matching dicts, in their original order.  Entries
        without a "messages" key never match.
    """
    if dates is None:
        # The default used to reach filter() and raise TypeError.
        return []
    return [entry for entry in dates if entry.get("messages") == message_count]
def main():
    """Load values.json and print the "dates" entries with 279 and 401 messages."""
    with open("values.json", "r") as handle:
        document = load(handle)
    entries = document["dates"]
    for wanted in (279, 401):
        print(list_dates_whose_message_count_equals(entries, message_count=wanted))


if __name__ == "__main__":
    main()
Returns this
[{'date': 1624921200000, 'messages': 279, 'size': 29753, 'archiveSize': 0}]
[{'date': 1625007600000, 'messages': 401, 'size': 42902, 'archiveSize': 0}]

TypeError: list indices must be integers or slices, not str JSON Scrapy

I was scraping a JSON response but getting the following error
values = resp['acf']
TypeError: list indices must be integers or slices, not str
I am not sure what I did wrong.
Your response is highly appreciated.
# -*- coding: utf-8 -*-
import scrapy
import json
class MainSpider(scrapy.Spider):
    """Spider that requests the chamber directory endpoint and yields organisation names."""
    name = 'main'
    # NOTE(review): Scrapy documents start_urls as a list of URLs; this is a
    # single string - confirm.
    start_urls = 'https://chamber.vinylagency.com/wp-json/wp/v2/directory?industry-type=547&per_page=100'

    def parse(self, response):
        """Decode the JSON body and yield one {"Name": ...} item per entry."""
        resp = json.loads(response.body)
        # Fails with TypeError when the decoded body is a list rather than a
        # dict, because a list cannot be indexed with the string 'acf'.
        values = resp['acf']
        for value in values:
            name = value['OrgName']
            yield {
                "Name": name,
            }
The exception is raised because the response is a list of objects and you are trying to access it as a dict directly.
Here is a sample of the response:
[
{
"id": 33286,
"date": "2020-05-09T02:38:47",
"date_gmt": "2020-05-09T02:38:47",
"guid":
...
},
{
"id": 32954,
"date": "2020-05-09T02:38:22",
"date_gmt": "2020-05-09T02:38:22",
"guid":
...
}
]
You probably want to parse like this:
def parse(self, response):
    """Yield one ``{"Name": ...}`` item per organisation in the JSON response.

    The endpoint returns a JSON array of objects, each carrying its custom
    fields under "acf", so the decoded list itself is what gets iterated.
    """
    resp = json.loads(response.body)
    # Iterate the decoded list; the loop previously referenced the undefined
    # name ``values`` and raised NameError.
    for value in resp:
        name = value['acf']['OrgName']
        yield {
            "Name": name,
        }

Join nested JSON dataframe and another dataframe

I am trying to join dataframe1, generated from the JSON below, with dataframe2 on the field order_id, and then assign the "status" from dataframe2 to the "status" of dataframe1. Does anyone know how to do this? Many thanks for your help.
dataframe1
[{
"client_id": 1,
"name": "Test01",
"olist": [{
"order_id": 10000,
"order_dt_tm": "2012-12-01",
"status": "" <== use "status" from dataframe2 to populate this field
},
{
"order_id": 10000,
"order_dt_tm": "2012-12-01",
"status": ""
}
]
},
{
"client_id": 2,
"name": "Test02",
"olist": [{
"order_id": 10002,
"order_dt_tm": "2012-12-01",
"status": ""
},
{
"order_id": 10003,
"order_dt_tm": "2012-12-01",
"status": ""
}
]
}
]
dataframe2
order_id status
10002 "Delivered"
10001 "Ordered"
Here is your raw dataset as a json string:
d = """[{
"client_id": 1,
"name": "Test01",
"olist": [{
"order_id": 10000,
"order_dt_tm": "2012-12-01",
"status": ""
},
{
"order_id": 10000,
"order_dt_tm": "2012-12-01",
"status": ""
}
]
},
{
"client_id": 2,
"name": "Test02",
"olist": [{
"order_id": 10002,
"order_dt_tm": "2012-12-01",
"status": ""
},
{
"order_id": 10003,
"order_dt_tm": "2012-12-01",
"status": ""
}
]
}
]"""
Firstly, I would load it as json:
import json

import pandas as pd  # used by the pd.json_normalize / pd.DataFrame steps that follow

# Parse the raw JSON string into a list of client dicts.
data = json.loads(d)
Then, I would turn it into a Pandas dataframe, notice that I remove status field as it will be populated by the join step :
# Flatten every "olist" entry into one row and keep only the join key and the
# order date; "status" is left out because the merge step supplies it.
df1 = pd.json_normalize(data, 'olist')[['order_id', 'order_dt_tm']]
Then, from the second dataframe sample, I would do a left join using merge function:
# Use a separate name so the parsed JSON held in ``data`` is not overwritten.
data2 = {'order_id': [10002, 10001], 'status': ['Delivered', 'Ordered']}
df2 = pd.DataFrame(data2)
# Left join keeps every order of df1; orders without a match get NaN as status.
result = df1.merge(df2, on='order_id', how='left')
Good luck
UPDATE
import json

import pandas as pd

# JSON to Dataframe
df1 = pd.json_normalize(data)
# Sub JSON to dataframe
df1['sub_df'] = df1['olist'].apply(lambda x: pd.json_normalize(x).drop('status', axis=1))
# Build second dataframe
data2 = {'order_id': [10002, 10001], 'status': ['Delivered', 'Ordered']}
df2 = pd.DataFrame(data2)
# Populates status in sub dataframes; unmatched orders get an empty status
df1['sub_df'] = df1['sub_df'].apply(lambda x: x.merge(df2, on='order_id', how='left').fillna(''))


# Sub dataframes back to JSON
def back_to_json_str(df):
    """Return ``df`` serialised as an indented JSON array of records."""
    return str(df.to_json(orient="records", indent=4))


# Store each order list as parsed Python objects instead of a JSON string.
# A string would be escaped again by the outer to_json call, and undoing that
# with str.replace leaves the array wrapped in quotes, which is invalid JSON.
df1['olist'] = df1['sub_df'].apply(lambda x: json.loads(back_to_json_str(x)))
# Global DF back to JSON string
parsed = df1.drop('sub_df', axis=1).to_json(orient="records", indent=4)
# Print result
print(parsed)
UPDATE 2
Here is a way to add an index column to a dataframe:
# range() already yields the row positions, so no comprehension is needed.
df1['index'] = range(len(df1))
This is my code for assigning title values from a dataframe back to the JSON object. The assignment takes quite some time when the JSON object holds 100,000 records. Does anyone know how to improve the performance of this code? Many thanks.
import json
import random
import pandas as pd
import pydash as _
data = [{"pid":1,"name":"Test1","title":""},{"pid":2,"name":"Test2","title":""}] # 5000 records
# dataframe1
df = pd.json_normalize(data)
# dataframe2
pid = list(range(1, 5000))
title_set = ["Boss", "CEO", "CFO", "PMO", "Team Lead"]
titles = [random.choice(title_set) for x in pid]
df2 = pd.DataFrame({'pid': pid, 'title': titles})
# left join dataframe1 and dataframe2; both frames have a "title" column, so
# the joined titles end up in "title_y"
df3 = df.merge(df2, on='pid', how='left')
# Build the pid -> title lookup once.  Searching ``data`` for every row of df3
# made the assignment quadratic in the number of records.
title_by_pid = dict(zip(df3['pid'], df3['title_y']))
# assign title values from dataframe back to the json object
for record in data:
    record['title'] = title_by_pid[record['pid']]
print(data)

How to Get JSON values Python

Learning Days
Code to get the data in JSON format:
#...
# Run the query; the cursor is then iterated to collect every row.
cursor.execute("SELECT * FROM user")
response = {
    "version": "5.2",
    "user_type": "online",
    "user": list(cursor),
}
# json.dumps returns a str, so from here on `response` is JSON text, not a dict.
response = json.dumps(response, sort_keys=False, indent=4, separators=(',', ': '))
print(response)
# ...
This produces output as
{
"version": "5.2",
"user_type": "online",
"user":
[
{
"name": "John",
"id": 50
},
{
"name": "Mark",
"id": 57
}
]
}
print(response["user"]) - TypeError: string indices must be integers
How do I access the values in the JSON?
json.dumps returns a string, so a small conversion like the following is needed; I am not sure whether this is the exact way to do it.
Solution:
# json.dumps produced a str; parse it back into Python objects before indexing.
# (Encoding and decoding it once more first only reproduces the same string.)
response = json.loads(response)
# "user" is a list of row dicts: take the first row, then its "id".
print(response['user'][0]['id'])

JSON in python from mysql with additional key value pairs

This is the code used to fetch data from DB
import pymysql
import json
conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='', db='test', charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
try:
    # The cursor is closed when the with-block exits, even if the query fails.
    with conn.cursor() as cursor:
        cursor.execute("SELECT * FROM user")
        # Iterating the cursor yields one dict per row (DictCursor).
        rows = list(cursor)
finally:
    # Always release the connection, whether or not the query succeeded.
    conn.close()
print(json.dumps(rows, sort_keys=False, indent=4, separators=(',', ': ')))
Output in json is -
[
{
"name": "John",
"id": 50
},
{
"name": "Mark",
"id": 57
}
]
But I want the output in this format -
{
"version": "5.2",
"user_type": "online",
"user":
[
{
"name": "John",
"id": 50
},
{
"name": "Mark",
"id": 57
}
]
}
where the version and user_type can be manually entered or appended to the result.
Simply wrap the result set in a dict of your liking then.
# ...
cursor.execute("SELECT * FROM user")
# Wrap the rows in a dict carrying the extra fields; list(cursor) collects the
# rows exactly as iterating over the cursor yourself would.
response = dict(version="5.2", user_type="online", user=list(cursor))
print(json.dumps(response, indent=4, separators=(',', ': ')))
# ...
You can create a dict with the version, the user type, and the user (where for the key 'user' you enter rows as the value). Then convert that to json using json.dump or json.dumps:
# Fixed header fields first, then the fetched rows under "user".
data = {"version": "5.2", "user_type": "online"}
data["user"] = rows
print(json.dumps(data, indent=4, separators=(',', ': ')))