How to convert data to a pandas dataframe [duplicate] - json

A REST API call returns its response in bytes. How do I convert the byte data to a dataframe object?
import requests
import pandas as pd
from io import StringIO

url = 'url_path with_ending_&format=csv'
response = requests.get(url, auth=(user, password), allow_redirects=True)
result = str((response.content, 'utf-8'))  # bug: stringifies a (bytes, 'utf-8') tuple instead of decoding
data = StringIO(result)
df = pd.DataFrame(data)  # bug: pd.DataFrame() does not parse CSV text

I solved the same issue with pandas.read_csv():
result = str(response.content, 'utf-8')  # decode the bytes to str
data = StringIO(result)
df = pd.read_csv(data)  # read_csv parses the CSV text into a dataframe
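An equivalent shortcut (a sketch, assuming the endpoint really returns UTF-8 CSV) is to skip the manual decode and hand pandas the raw bytes:
import io
import pandas as pd
import requests

response = requests.get(url, auth=(user, password), allow_redirects=True)
# read_csv accepts a binary buffer and decodes it itself (utf-8 by default)
df = pd.read_csv(io.BytesIO(response.content))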

Related

How to read a large JSON file on S3 into a dataframe using SageMaker

I tried using this code:
from sagemaker import get_execution_role
import pandas as pd

bucket = 'xxx'
data_key = 'TV.json'
data_location = 's3://{}/{}'.format(bucket, data_key)

textfilereader = pd.read_json(data_location, lines=True, chunksize=1000)
dflist = []
for df in textfilereader:
    dflist.append(df)
df = pd.concat(dflist, sort=False)
Error: sequence item 0: expected str instance, bytes found
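No answer is shown here, but one common workaround (a sketch, assuming boto3 access to the bucket and that TV.json is newline-delimited JSON) is to fetch and decode the object yourself, so read_json never sees raw bytes:
import io
import boto3
import pandas as pd

bucket = 'xxx'  # placeholder bucket name from the question
data_key = 'TV.json'

s3 = boto3.client('s3')
# read the whole object and decode it to text up front
body = s3.get_object(Bucket=bucket, Key=data_key)['Body'].read().decode('utf-8')

dflist = []
for chunk in pd.read_json(io.StringIO(body), lines=True, chunksize=1000):
    dflist.append(chunk)
df = pd.concat(dflist, sort=False)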

Python: how to convert JSON containing multiple arrays to a pandas dataframe

Hey, I'm having trouble converting JSON to a dataframe using pandas. Here is my attempt:
import json
import pandas as pd

with open('write.json') as f:
    data = json.load(f)
df = pd.DataFrame.from_dict(data, orient='index').reset_index()
print(df)
and here is the JSON file:
{"_id":"60b53d92ccb1483964da45f9","Avg_sm":[26.66953125,26.66953125,26.666666666666668,26.666666666666668,26.666666666666668,26.666666666666668,26.666666666666668,26.666666666666668,26.6647859922179,26.6647859922179,26.45263157894737,26.45263157894737],"Avg_st":[22.6517578125,22.6517578125,22.65204678362573,22.65204678362573,22.65204678362573,22.65204678362573,22.65204678362573,22.65204678362573,22.65272373540856,22.65272373540856,22.694567062818336,22.694567062818336],"SensorCoordinates":[10.33363276545083,36.8434191667489],"SensorIdentifier":["CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC","CCCCCCCCCCCCCCCC"],"count":24,"date":["25-06-2021","25-06-2021","25-06-2021","25-06-2021","25-06-2021","25-06-2021","25-06-2021","25-06-2021","25-06-2021","25-06-2021","25-06-2021","25-06-2021","25-06-2021","25-06-2021","25-06-2021","25-06-2021","25-06-2021","25-06-2021","25-06-2021","25-06-2021","26-06-2021","26-06-2021","26-06-2021","26-06-2021"],"min_sm":[21.1,21.1,21.1,21.1,21.1,21.1,21.1,21.1,21.1,21.1,21.1,21.1],"sensorId":["60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285","60b54789a21c170aecb25285"],"status":[true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true]}
IIUC, you can try:
df = pd.json_normalize(data).apply(pd.Series.explode, ignore_index=True)
OR
df = pd.DataFrame.from_dict(data, orient='index').T.apply(pd.Series.explode, ignore_index=True)
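Note that columns whose lists are shorter than the longest one (and scalars such as count) come back NaN-padded after the explode. A toy illustration with made-up data:
import pandas as pd

data = {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": 9}
df = pd.json_normalize(data).apply(pd.Series.explode, ignore_index=True)
print(df)
#    a  b    c
# 0  1  x    9
# 1  2  y  NaN
# 2  3  z  NaN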

Pandas dataframe: extracting a value from JSON returned as content from a request

I need to extract a value from the JSON that a request returns as its content.
import pandas as pd
import json
import requests
import ast
from pandas.io.json import json_normalize
df['response'] = df.URL.apply(lambda u: requests.get(u).content)
df.head()
b'{"error":0,"short":"http:\\/\\/192.168.42.72\\/ECyKY"}'
b'{"error":0,"short":"http:\\/\\/192.168.42.72\\/IsMgE"}'
When we use Python without Pandas, we can just use:
resp = requests.get(u)
y=resp.json()
print(y)
print(y['short'])
to store the short value as "http://192.168.42.72/ECyKY"
I've spent hours trying to get it to work with Pandas without luck. Any hint?
Instead of using requests.get(u).content directly, use requests.get(u).json(), then use Series.str.get to extract the value corresponding to the key short from the dictionary, and assign it to a new column short:
df['response'] = df['URL'].apply(lambda u: requests.get(u).json())
df['short'] = df['response'].str.get('short')
# print(df)
response short
0 {'error': 0, 'short': 'http://192.168.42.72/EC... http://192.168.42.72/ECyKY
1 {'error': 0, 'short': 'http://192.168.42.72/Is... http://192.168.42.72/IsMgE
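If you want every key as its own column instead of just short, json_normalize on the list of dicts works too (a sketch):
import pandas as pd

# expand each response dict into 'error' and 'short' columns
df = df.join(pd.json_normalize(df['response'].tolist()))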

I am having trouble converting my nested JSON into a dataframe. I am getting the JSON from an API and want it in a dataframe

This code is from the Sportradar API. The API outputs the data as JSON or XML; below is my attempt at taking the JSON and making it into a dataframe.
import numpy as np
import pandas as pd
import http.client
import json
from pandas.io.json import json_normalize
#API Call including my key
conn = http.client.HTTPSConnection("api.sportradar.us")
conn.request("GET", "/nfl/official/trial/v5/en/players/0acdcd3b-5442-4311-a139-ae7c506faf88/profile.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
#conn.request("GET", "/nfl/official/trial/v5/en/games/b7aeb58f-7987-4202-bc41-3ad9a5b83fa4/pbp.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
#conn.request("GET", "/nfl/official/trial/v5/en/teams/0d855753-ea21-4953-89f9-0e20aff9eb73/full_roster.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
#conn.request("GET", "/nfl/official/trial/v5/en/games/030d37cf-b896-4f10-b16e-2a5120fef6cf/pbp.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
res = conn.getresponse()
data = res.read()
data_dec = data.decode("utf-8")
json_data = json.loads(data_dec)
flat_data = json_normalize(json_data)
print(json_data)
df = pd.DataFrame.from_records(flat_data)
df2 = pd.DataFrame.from_dict(json_data, orient='index')
df2.reset_index(level=0, inplace=True)
#The closest thing to a dataframe I can get
df.head()
Why not make use of a Python wrapper that is publicly available and maintained? See link.
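If you would rather stay with plain pandas, json_normalize can flatten one nested list at a time via record_path and meta. A sketch on a made-up structure (not Sportradar's actual schema):
import pandas as pd

# hypothetical nested payload, loosely shaped like a player profile
payload = {
    "name": "Some Player",
    "position": "QB",
    "seasons": [
        {"year": 2018, "statistics": {"passing_yards": 4000}},
        {"year": 2019, "statistics": {"passing_yards": 3500}},
    ],
}

# one row per element of 'seasons', carrying the top-level fields along
df = pd.json_normalize(payload, record_path="seasons", meta=["name", "position"])
print(df)
#    year  statistics.passing_yards         name position
# 0  2018                      4000  Some Player       QB
# 1  2019                      3500  Some Player       QB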

pandas: parallel urllib requests in a for loop

I have code that parses JSON data from an API using urllib, as follows:
import pandas as pd
import json
import urllib.request
import os
import time
import csv
import datetime

# Send URL Request & Get JSON Data
with urllib.request.urlopen("https://bittrex.com/api/v1.1/public/getmarketsummaries") as url:
    data = json.loads(url.read().decode())

# Select Data from result section
df = pd.DataFrame(data=data['result'])
# tickers = df['MarketName']
tickers = ["BTC-1ST", "BTC-2GIVE", "BTC-ABY", "BTC-ARDR", "BTC-WAVE"]
print(tickers)

for ticker in tickers:
    with urllib.request.urlopen("https://bittrex.com/Api/v2.0/pub/market/GetTicks?marketName=" + ticker + "&tickInterval=thirtyMin") as URL:
        data = json.loads(URL.read().decode())
    df2 = pd.DataFrame(data=data['result'])
    Market01 = "Market"
    df2[Market01] = ticker
    df2.to_csv('all.csv', encoding="utf-8-sig", index=False, mode='a', header=False)
    print("done " + ticker)
Actually it's not a request for five currencies only: there are 295 requests for 295 currencies, which take about 5 minutes to get all the required data into the CSV file (a very long time).
I wonder if there is a method to send all the requests in parallel to decrease the time, while keeping the same option of saving the data to a CSV file as a dataframe.
I searched many times and found the multiprocessing module, but couldn't find a sample similar to my case.
Can anyone help me, please?
What about something like this?
import pandas as pd
import json
import urllib.request
import os
from urllib import parse
import csv
import datetime
from multiprocessing import Pool
import time

# Send URL Request & Get JSON Data
with urllib.request.urlopen("https://bittrex.com/api/v1.1/public/getmarketsummaries") as url:
    data = json.loads(url.read().decode())

# Select Data from result section
df = pd.DataFrame(data=data['result'])
# tickers = df['MarketName']
tickers = ["BTC-1ST", "BTC-2GIVE", "BTC-ABY", "BTC-ARDR", "BTC-WAVE"]
print(tickers)

def http_get(url):
    result = {"url": url, "data": urllib.request.urlopen(url, timeout=5).read()}
    return result

urls = ["https://bittrex.com/Api/v2.0/pub/market/GetTicks?marketName=" + ticker + "&tickInterval=thirtyMin" for ticker in tickers]

pool = Pool(processes=5)
results = pool.map(http_get, urls)

for result in results:
    j = json.loads(result['data'].decode())
    df2 = pd.DataFrame(data=j['result'])  # take the 'result' section, as above
    Market01 = "Market"
    # recover the ticker from the request URL's marketName parameter
    marketName = parse.parse_qs(parse.urlparse(result['url']).query)['marketName'][0]
    df2[Market01] = marketName
    df2.to_csv('all.csv', encoding="utf-8-sig", index=False, mode='a', header=False)
    print("done " + marketName)