How to read a large JSON file on S3 into a dataframe using SageMaker

I tried using the code:
from sagemaker import get_execution_role
import pandas as pd
bucket = 'xxx'
data_key = 'TV.json'
data_location = 's3://{}/{}'.format(bucket, data_key)
textfilereader = pd.read_json(data_location, lines=True, chunksize=1000)
dflist = []
for df in textfilereader:
    dflist.append(df)
df = pd.concat(dflist, sort=False)
but it fails with this error:
sequence item 0: expected str instance, bytes found
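One workaround (a sketch, not a confirmed fix from the thread) is to bypass pandas' built-in S3 handling entirely: fetch the object with boto3, decode it yourself, and hand pandas plain text. This assumes the same bucket/key as above and that TV.json is newline-delimited JSON; it does load the raw file into memory once before chunked parsing.
import io
import boto3
import pandas as pd

bucket = 'xxx'
data_key = 'TV.json'

# Fetch the raw object with boto3 and decode it ourselves, so pandas
# only ever sees text instead of bytes.
s3 = boto3.client('s3')
body = s3.get_object(Bucket=bucket, Key=data_key)['Body'].read().decode('utf-8')

# Parse in chunks (lines=True requires newline-delimited JSON).
dflist = []
for chunk in pd.read_json(io.StringIO(body), lines=True, chunksize=1000):
    dflist.append(chunk)
df = pd.concat(dflist, sort=False)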

Related

How to convert data to a pandas dataframe [duplicate]

A REST API call returns its response as bytes; how do I convert those bytes into a dataframe object?
import requests
import pandas as pd
from io import StringIO
url = 'url_path with_ending_&format=csv'
response = requests.get(url, auth=(user, password), allow_redirects=True)
result = str((response.content, 'utf-8'))
data = StringIO(result)
df = pd.DataFrame(data)
I solved the same issue with pandas.read_csv():
result = str(response.content, 'utf-8')
data = StringIO(result)
df = pd.read_csv(data)
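As a small simplification (not part of the original answer), requests already decodes the body for you via response.text, so the explicit str(..., 'utf-8') step can be dropped, assuming the endpoint really returns CSV and reusing the url/user/password from the snippet above:
import requests
import pandas as pd
from io import StringIO

response = requests.get(url, auth=(user, password), allow_redirects=True)
# response.text is the body decoded with the charset requests detects.
df = pd.read_csv(StringIO(response.text))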

How to create a PyArrow Table from a BytesIO stream of a JSON file in an S3 bucket

import io
from pyarrow.json import read_json
import boto3
session = boto3.Session(aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
s3 = session.resource('s3', endpoint_url=SCALITY_ENDPOINT, verify=r'C:\path\to\verification\certificate.crt')
bucket = s3.Bucket('my_bucket')
object = bucket.Object('my_object.json')
file_stream = io.BytesIO()
object.download_fileobj(file_stream)
table = read_json(file_stream.getvalue())
But I get the following error:
TypeError: Cannot convert bytes to pyarrow.lib.NativeFile
Is there a way to create a PyArrow table directly from the stream?
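A sketch of one way around this (not a confirmed answer from the thread): pyarrow.json.read_json accepts a file-like object, so you can rewind the BytesIO and pass it directly, or wrap the raw bytes in pyarrow.BufferReader, which is a NativeFile. This continues from the snippet above and reuses its object variable.
import io
import pyarrow as pa
from pyarrow.json import read_json

file_stream = io.BytesIO()
object.download_fileobj(file_stream)

# Option 1: rewind the buffer and pass the file-like object itself.
file_stream.seek(0)
table = read_json(file_stream)

# Option 2: wrap the raw bytes in a NativeFile.
table = read_json(pa.BufferReader(file_stream.getvalue()))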

How to merge multiple JSON files read from S3, convert them to a single .csv and store it in S3?

Input:
There are 5 part JSON files named test_part1.json, test_part2.json, test_part3.json, test_part4.json, test_part5.json in s3://test/json_files/data/.
Expected output:
A single CSV file.
Explanation: All of the JSON files have the same columns and the same structure; they are part files of the same source.
I want to merge/repartition all of them, convert them into a single CSV file, and store it in S3.
import pandas as pd
import os
import boto3
import numpy

# Boto3 clients
resource = boto3.resource('s3')
client = boto3.client('s3')
session = boto3.session.Session()

bucket = 'test'
path = 'json_files/data/'
delimiter = '/'
suffix = '.json'

# List the part files under the prefix and read each one.
json_files = client.list_objects(Bucket=bucket, Prefix=path, Delimiter=delimiter)
for obj in json_files['Contents']:
    obj = client.get_object(Bucket=bucket, Key=obj['Key'])
    df = pd.read_json(obj["Body"], lines=True)
    print(df)
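The loop above only prints each part. To actually produce a single CSV in S3, one approach (a sketch, assuming every part is newline-delimited JSON; the output key is a made-up example) is to collect the frames, concatenate them, and upload the result with put_object:
import io
import boto3
import pandas as pd

client = boto3.client('s3')
bucket = 'test'
path = 'json_files/data/'

# Read every .json part under the prefix into a list of frames.
listing = client.list_objects_v2(Bucket=bucket, Prefix=path)
frames = []
for obj in listing['Contents']:
    if obj['Key'].endswith('.json'):
        raw = client.get_object(Bucket=bucket, Key=obj['Key'])['Body'].read().decode('utf-8')
        frames.append(pd.read_json(io.StringIO(raw), lines=True))

# Concatenate and upload a single CSV back to the bucket.
merged = pd.concat(frames, ignore_index=True)
csv_buffer = io.StringIO()
merged.to_csv(csv_buffer, index=False)
client.put_object(Bucket=bucket, Key='json_files/output/merged.csv',
                  Body=csv_buffer.getvalue())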

I am having trouble converting my nested JSON into a dataframe. I get the JSON from an API and want it in a dataframe

This code calls the Sportradar API. The API outputs the data as JSON or XML; below is my attempt at turning the JSON into a dataframe.
import numpy as np
import pandas as pd
import http.client
import json
from pandas.io.json import json_normalize
#API Call including my key
conn = http.client.HTTPSConnection("api.sportradar.us")
conn.request("GET", "/nfl/official/trial/v5/en/players/0acdcd3b-5442-4311-a139-ae7c506faf88/profile.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
#conn.request("GET", "/nfl/official/trial/v5/en/games/b7aeb58f-7987-4202-bc41-3ad9a5b83fa4/pbp.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
#conn.request("GET", "/nfl/official/trial/v5/en/teams/0d855753-ea21-4953-89f9-0e20aff9eb73/full_roster.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
#conn.request("GET", "/nfl/official/trial/v5/en/games/030d37cf-b896-4f10-b16e-2a5120fef6cf/pbp.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
res = conn.getresponse()
data = res.read()
data_dec = data.decode("utf-8")
json_data = json.loads(data_dec)
flat_data = json_normalize(json_data)
print(json_data)
df = pd.DataFrame.from_records(flat_data)
df2 = pd.DataFrame.from_dict(json_data, orient='index')
df2.reset_index(level=0, inplace=True)
#The closest thing to a dataframe I can get
df.head()
Why not make use of a Python wrapper that is publicly available and maintained?
See the link.
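Independently of the wrapper suggestion, nested JSON can also be flattened by hand with json_normalize (pd.json_normalize in recent pandas, pandas.io.json.json_normalize in older versions), using record_path to unnest a list and meta to carry top-level fields along. The field names below are hypothetical, not the real Sportradar schema:
import pandas as pd

# Hypothetical payload shape, not the actual Sportradar response.
payload = {
    "id": "0acdcd3b",
    "name": "Player Name",
    "seasons": [
        {"year": 2018, "team": "XYZ", "games_played": 16},
        {"year": 2019, "team": "XYZ", "games_played": 15},
    ],
}

# One row per entry in "seasons", with the top-level id/name repeated.
df = pd.json_normalize(payload, record_path="seasons", meta=["id", "name"])
print(df)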

pyspark csv at url to dataframe, without writing to disk

How can I read a csv at a url into a dataframe in Pyspark without writing it to disk?
I've tried the following with no luck:
import urllib.request
from io import StringIO
url = "https://raw.githubusercontent.com/pandas-dev/pandas/master/pandas/tests/data/iris.csv"
response = urllib.request.urlopen(url)
data = response.read()
text = data.decode('utf-8')
f = StringIO(text)
df1 = sqlContext.read.csv(f, header = True, schema=customSchema)
df1.show()
TL;DR It is not possible, and in general transferring data through the driver is a dead end.
Before Spark 2.3 the csv reader can read only from a URI (and http is not supported).
In Spark 2.3 you can use an RDD:
spark.read.csv(sc.parallelize(text.splitlines()))
but the data will be written to disk.
You can createDataFrame from Pandas:
spark.createDataFrame(pd.read_csv(url))
but this once again writes to disk.
If the file is small, I'd just use SparkFiles:
from pyspark import SparkFiles
spark.sparkContext.addFile(url)
spark.read.csv(SparkFiles.get("iris.csv"), header=True)
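For completeness, a minimal self-contained sketch of the SparkFiles route, assuming a local SparkSession and the iris.csv URL from the question:
from pyspark.sql import SparkSession
from pyspark import SparkFiles

url = "https://raw.githubusercontent.com/pandas-dev/pandas/master/pandas/tests/data/iris.csv"

spark = SparkSession.builder.master("local[*]").appName("csv-from-url").getOrCreate()

# addFile downloads the URL once to a local temp dir on each node;
# SparkFiles.get returns that local path for the csv reader to open.
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("iris.csv"), header=True, inferSchema=True)
df.show(5)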