pandas parallel urllib request for Loop - json

i have a code to parse JSON data through API using urllib as the following:
import pandas as pd
import json
import urllib.request
import os
import time
import csv
import datetime
# Send URL Request & Get JSON Data
with urllib.request.urlopen("https://bittrex.com/api/v1.1/public/getmarketsummaries") as url:
    data = json.loads(url.read().decode())

# Select Data from result section
df = pd.DataFrame(data=data['result'])

# tickers = df['MarketName']
tickers = ["BTC-1ST", "BTC-2GIVE", "BTC-ABY", "BTC-ARDR", "BTC-WAVE"]
print(tickers)

# Fetch the 30-minute candles for every market one request at a time and
# append each batch of rows to the same CSV file.
for ticker in tickers:
    ticks_url = ("https://bittrex.com/Api/v2.0/pub/market/GetTicks?marketName="
                 + ticker + "&tickInterval=thirtyMin")
    with urllib.request.urlopen(ticks_url) as response:
        candles = json.loads(response.read().decode())
    frame = pd.DataFrame(data=candles['result'])
    frame["Market"] = ticker
    frame.to_csv('all.csv', encoding="utf-8-sig", index=False, mode='a', header=False)
    print("done " + ticker)
Actually, it's not a request for only five currencies — there are 295 requests for 295 currencies, which take about 5 minutes to write all the required data to the CSV file (a very long time).
I wonder if there is a method to send all the requests in parallel to decrease the time, while keeping the same option of saving the data to a CSV file as a DataFrame.
I searched many times and found the multiprocessing module, but couldn't find a sample similar to my case.
Can anyone help me, please?

What about something like this?
import pandas as pd
import json
import urllib.request
import os
from urllib import parse
import csv
import datetime
from multiprocessing import Process, Pool
import time
# Fetch the market summaries once up front; the ticker list comes from here.
with urllib.request.urlopen("https://bittrex.com/api/v1.1/public/getmarketsummaries") as url:
    data = json.loads(url.read().decode())

# Select Data from result section
df = pd.DataFrame(data['result'])

# tickers = df['MarketName']
tickers = ["BTC-1ST", "BTC-2GIVE", "BTC-ABY", "BTC-ARDR", "BTC-WAVE"]
print(tickers)
def http_get(url):
    """GET *url* (5-second timeout) and return {'url': url, 'data': raw response bytes}."""
    body = urllib.request.urlopen(url, timeout=5).read()
    return {"url": url, "data": body}
# One candles URL per market; the market name rides along in the query string
# so it can be recovered from each result later.
urls = ["https://bittrex.com/Api/v2.0/pub/market/GetTicks?marketName="
        + ticker + "&tickInterval=thirtyMin" for ticker in tickers]

if __name__ == '__main__':
    # The __main__ guard is required by multiprocessing on spawn-based
    # platforms (Windows/macOS); without it every worker re-executes the
    # whole script and tries to create its own Pool.
    # The with-block also closes the pool once map() has returned.
    with Pool(processes=5) as pool:
        # Fan the requests out over 5 worker processes.
        results = pool.map(http_get, urls)

    for result in results:
        j = json.loads(result['data'].decode())
        # Bug fix: take only the 'result' payload, exactly as the sequential
        # version did — pd.DataFrame(j) would also fold the success/message
        # envelope fields into the CSV rows.
        df2 = pd.DataFrame(data=j['result'])
        Market01 = "Market"
        # Recover the market name from the request URL's query string.
        marketName = parse.parse_qs(parse.urlparse(result['url']).query)['marketName'][0]
        df2[Market01] = marketName
        df2.to_csv('all.csv', encoding="utf-8-sig", index=False, mode='a', header=False)
        print("done " + marketName)

Related

How can I save some json files generated in a for loop as csv?

Sorry, I am new to coding in Python. I need to save a JSON file generated in a for loop as a CSV for each iteration of the loop.
I wrote code that works fine to generate the first CSV file, but the file is then overwritten on every later iteration, and I have not found a solution yet. Can anyone help me? Many thanks.
from twarc.client2 import Twarc2
import itertools
import pandas as pd
import csv
import json
import numpy as np
# Your bearer token here
# NOTE(review): the token below is a redacted placeholder — substitute a real
# Twitter API bearer token before running.
t = Twarc2(bearer_token="AAAAAAAAAAAAAAAAAAAAA....WTW")
# Get a bunch of user handles you want to check:
# loadtxt with dtype="str" yields an array of strings — presumably one
# username per line in usernames.txt; verify the file format.
list_of_names = np.loadtxt("usernames.txt",dtype="str")
# Get the `data` part of every request only, as one list
def get_data(results):
    """Concatenate the 'data' list of every paged result into one flat list."""
    flattened = []
    for result in results:
        flattened.extend(result['data'])
    return flattened
user_objects = get_data(t.user_lookup(users=list_of_names, usernames=True))

# Bug fix: a fixed filename made every iteration overwrite the previous CSV.
# enumerate supplies a unique index per user, so each loop iteration writes
# its own file (output_file_0.csv, output_file_1.csv, ...).
for idx, user in enumerate(user_objects):
    following = get_data(t.following(user['id']))
    # Do something with the lists
    print(f"User: {user['username']} Follows {len(following)} -2")
    json_string = json.dumps(following)
    df = pd.read_json(json_string)
    df.to_csv(f'output_file_{idx}.csv')
You need to add a sequence number or some other unique identifier to the filename. The clearest example would be to keep track of a counter, or use a GUID. Below I've used a counter that is initialized before your loop and is incremented in each iteration. Since the counter starts at 0, this will produce a list of files like output_file_0.csv, output_file_1.csv, output_file_2.csv and so on.
# Manual-counter variant: counter is bumped by hand each iteration and spliced
# into the filename, so no file is overwritten.
counter = 0
for user in user_objects:
    following = get_data(t.following(user['id']))
    # Do something with the lists
    print(f"User: {user['username']} Follows {len(following)} -2")
    df = pd.read_json(json.dumps(following))
    df.to_csv(f'output_file_{counter}.csv')
    counter += 1
We convert the integer to a string and paste it in between the name of your file and its extension.
from twarc.client2 import Twarc2
import itertools
import pandas as pd
import csv
import json
import numpy as np
# Your bearer token here
t = Twarc2(bearer_token="AAAAAAAAAAAAAAAAAAAAA....WTW")

# Get a bunch of user handles you want to check:
list_of_names = np.loadtxt("usernames.txt", dtype="str")

# Get the `data` part of every request only, as one list
def get_data(results):
    """Flatten the 'data' list of every paged result into a single list."""
    return list(itertools.chain.from_iterable(result['data'] for result in results))

user_objects = get_data(t.user_lookup(users=list_of_names, usernames=True))

# enumerate supplies a unique index per user, so every iteration writes its
# own CSV instead of overwriting the previous one.
for idx, user in enumerate(user_objects):
    following = get_data(t.following(user['id']))
    # Do something with the lists
    print(f"User: {user['username']} Follows {len(following)} -2")
    df = pd.read_json(json.dumps(following))
    df.to_csv(f'output_file{idx}.csv')

How to convert data to a pandas dataframe [duplicate]

REST API call response in bytes, how to convert the data in bytes to dataframe object
import requests
import pandas as pd
from io import StringIO

url = 'url_path with_ending_&format=csv'
response = requests.get(url, auth=(user, password), allow_redirects=True)
# Bug fix: str(bytes, 'utf-8') decodes the payload to text; the original
# str((response.content, 'utf-8')) only stringified a (bytes, str) tuple.
result = str(response.content, 'utf-8')
data = StringIO(result)
# Bug fix: read_csv parses the CSV text into rows and columns;
# pd.DataFrame(StringIO(...)) does not parse anything.
df = pd.read_csv(data)
I have solved the same issue with Pandas.read_csv();
# Decode the response body to text, wrap it in a file-like buffer, and let
# read_csv parse the CSV into a real DataFrame.
result = response.content.decode('utf-8')
data = StringIO(result)
df = pd.read_csv(data)

Python: request json gets error - If using all scalar values, you must pass an index

When attempting to request a JSON file from the API, I get an error after fetching the first result.
Does anyone have any idea why an index is being requested for a list?
Best regards :)
import requests
import json
import pandas as pd
import time
import datetime
### OCs List ids
OCs = ['1003473-1116-SE21', '1003473-1128-AG21', '1031866-12-CC21',
       '1057440-3184-AG21', '1070620-1832-CM21', '1070620-2219-SE21',
       '1070620-2499-CM21']

for oc in OCs:
    link = ("http://api.mercadopublico.cl/servicios/v1/publico/ordenesdecompra.json?codigo="
            + str(oc) + "&ticket=F8537A18-6766-4DEF-9E59-426B4FEE2844")
    response = requests.get(link, [])
    # NOTE(review): when the server rate-limits, it answers with an error
    # document whose values are all scalars; DataFrame.from_dict on that dict
    # raises "ValueError: If using all scalar values, you must pass an index".
    data = response.json()
    df = pd.DataFrame.from_dict(data)
    ### remove unnecessary columns
    df.drop(df.columns[[0, 1, 2]], axis=1, inplace=True)
    ### flat json to pandas dataframe
    df_detail = pd.json_normalize(df['Listado'])
ValueError: If using all scalar values, you must pass an index
The server detects too many requests and sends error response (and then your script throws an error). Solution is to wait for correct response, for example:
import requests
import json
import pandas as pd
import time
import datetime
### OCs List ids
OCs = [
    "1003473-1116-SE21",
    "1003473-1128-AG21",
    "1031866-12-CC21",
    "1057440-3184-AG21",
    "1070620-1832-CM21",
    "1070620-2219-SE21",
    "1070620-2499-CM21",
]

for i in OCs:
    link = (
        "http://api.mercadopublico.cl/servicios/v1/publico/ordenesdecompra.json?codigo="
        + str(i)
        + "&ticket=F8537A18-6766-4DEF-9E59-426B4FEE2844"
    )
    # Repeat the request until the payload really contains 'Listado'; the
    # server answers with an error document when it detects too many
    # requests in a short time.
    while True:
        print(link)
        response = requests.get(link, [])
        data = response.json()
        if "Listado" in data:
            break
        time.sleep(3)  # wait 3 seconds and try again
    df = pd.DataFrame.from_dict(data)
    ### remove unnecessary columns
    df.drop(df.columns[[0, 1, 2]], axis=1, inplace=True)
    ### flat json to pandas dataframe
    df_detail = pd.json_normalize(df["Listado"])
    # ...

I am having trouble converting my nested json into a dataframe. I am getting the json from an API and want it in a dataframe

This code is from Sportradar API. The API outputs the data as JSON or XML; below is my attempt at taking the JSON and making it into a dataframe.
import numpy as np
import pandas as pd
import http.client
import json
from pandas.io.json import json_normalize
#API Call including my key
conn = http.client.HTTPSConnection("api.sportradar.us")
conn.request("GET", "/nfl/official/trial/v5/en/players/0acdcd3b-5442-4311-a139-ae7c506faf88/profile.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
#conn.request("GET", "/nfl/official/trial/v5/en/games/b7aeb58f-7987-4202-bc41-3ad9a5b83fa4/pbp.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
#conn.request("GET", "/nfl/official/trial/v5/en/teams/0d855753-ea21-4953-89f9-0e20aff9eb73/full_roster.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
#conn.request("GET", "/nfl/official/trial/v5/en/games/030d37cf-b896-4f10-b16e-2a5120fef6cf/pbp.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
res = conn.getresponse()
data = res.read()
conn.close()  # release the socket once the body has been read
data_dec = data.decode("utf-8")
json_data = json.loads(data_dec)
# Bug fix: use pd.json_normalize — json_normalize from pandas.io.json is
# deprecated and removed in current pandas releases.
flat_data = pd.json_normalize(json_data)
print(json_data)
df = pd.DataFrame.from_records(flat_data)
df2 = pd.DataFrame.from_dict(json_data, orient='index')
df2.reset_index(level=0, inplace=True)
#The closest thing to a dataframe I can get
df.head()
Why not make use of a Python Wrapper that is publicly available and maintained.
See link.

"TypeError: Object of type 'bytes' is not JSON serializable" live streaming data

I am trying to live stream data into Power Bi from python. However I am encountering the error
TypeError: Object of type 'bytes' is not JSON serializable
I have put my code below, please indicate what I am doing wrong as I don't quite understand what the issue is.
import pandas as pd
from datetime import datetime
from datetime import timedelta
import requests
import json
import time
import random
# function for data_generation
def data_generation():
    """Return one random sample row: [surr_id, speed, date, time]."""
    return [
        random.randint(1, 3),                      # surr_id in 1..3
        random.randint(20, 200),                   # speed in 20..200
        datetime.today().strftime("%Y-%m-%d"),     # date, e.g. "2024-01-31"
        datetime.now().isoformat(),                # timestamp, ISO-8601
    ]
if __name__ == '__main__':
    REST_API_URL = 'api_url'
    # Stream one generated row every 3 seconds, forever.
    while True:
        data_raw = []
        for j in range(1):
            row = data_generation()
            data_raw.append(row)
        print("Raw data - ", data_raw)
        # set the header record
        HEADER = ["surr_id", "speed", "date", "time"]
        data_df = pd.DataFrame(data_raw, columns=HEADER)
        # to_json already produces a JSON string; encode it to UTF-8 bytes so
        # it can be sent as the raw request body.
        data_json = bytes(data_df.to_json(orient='records'), encoding='utf-8')
        print("JSON dataset", data_json)
        # Post the data on the Power BI API.
        # Bug fix: send the pre-encoded JSON bytes directly — json.dumps(data_json)
        # raises "TypeError: Object of type 'bytes' is not JSON serializable",
        # and HEADER is a list of column names, not an HTTP headers mapping.
        try:
            req = requests.post(url=REST_API_URL, data=data_json, timeout=5)
            print("Data posted in Power BI API")
        except requests.exceptions.ConnectionError:
            req = "No response"
        print(req)
        time.sleep(3)
Solved, Just changed req = requests.post(REST_API_URL, data=json.dumps(data_json), headers=HEADER, timeout=5) to req = requests.post(url=REST_API_URL, data=data_json)