I tried using the code:
from sagemaker import get_execution_role
import pandas as pd

bucket = 'xxx'
data_key = 'TV.json'
data_location = 's3://{}/{}'.format(bucket, data_key)

textfilereader = pd.read_json(data_location, lines=True, chunksize=1000)
dflist = []
for df in textfilereader:
    dflist.append(df)
df = pd.concat(dflist, sort=False)
but I get this error: sequence item 0: expected str instance, bytes found
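One workaround, since the error suggests pandas is receiving raw bytes from the S3 reader: fetch the object with boto3 and decode it to text yourself before the chunked read. This is only a sketch, assuming the file is line-delimited JSON and the notebook's role can read the bucket; bucket and key are the placeholders from above.

import io
import boto3
import pandas as pd

bucket = 'xxx'
data_key = 'TV.json'

# Download the raw bytes and decode them so pandas only ever sees text
s3 = boto3.client('s3')
body = s3.get_object(Bucket=bucket, Key=data_key)['Body'].read()
buf = io.StringIO(body.decode('utf-8'))

# Same chunked read as before, but from an in-memory text buffer
dflist = []
for chunk in pd.read_json(buf, lines=True, chunksize=1000):
    dflist.append(chunk)
df = pd.concat(dflist, sort=False)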
REST API call response in bytes, how to convert the data in bytes to dataframe object
import requests
import pandas as pd
from io import StringIO

url = 'url_path with_ending_&format=csv'
response = requests.get(url, auth=(user, password), allow_redirects=True)
result = str((response.content, 'utf-8'))
data = StringIO(result)
df = pd.DataFrame(data)
I solved the same issue with pandas.read_csv():
result = str(response.content, 'utf-8')
data = StringIO(result)
df = pd.read_csv(data)
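Two equivalent shortcuts, assuming a recent requests/pandas, in case the manual decode feels clunky:

# requests already decodes the body to text for you
df = pd.read_csv(StringIO(response.text))

# or let pandas read the raw bytes directly
import io
df = pd.read_csv(io.BytesIO(response.content))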
import io
from pyarrow.json import read_json
import boto3
session = boto3.Session(aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
s3 = session.resource('s3', endpoint_url=SCALITY_ENDPOINT, verify=r'C:\path\to\verification\certificate.crt')
bucket = s3.Bucket('my_bucket')
object = bucket.Object('my_object.json')
file_stream = io.BytesIO()
object.download_fileobj(file_stream)
table = read_json(file_stream.getvalue())
But I get the following error:
TypeError: Cannot convert bytes to pyarrow.lib.NativeFile
Is there a way to create a PyArrow table directly from the stream?
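Yes — pyarrow.json.read_json accepts a path or a file-like object, just not raw bytes. A minimal sketch of two ways around the error: rewind the BytesIO and pass the stream itself, or wrap the bytes in a pyarrow.BufferReader (which is a NativeFile).

import pyarrow as pa

# Option 1: rewind the stream and pass it as a file-like object
file_stream.seek(0)
table = read_json(file_stream)

# Option 2: wrap the downloaded bytes in a NativeFile explicitly
table = read_json(pa.BufferReader(file_stream.getvalue()))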
Input:
There are 5 JSON part files named test_part1.json, test_part2.json, test_part3.json, test_part4.json, test_part5.json in s3://test/json_files/data/.
Expected Output:
A single CSV file.
Explanation: All of the JSON files have the same columns and the same structure; they are part files of the same source. I want to merge/repartition all of them, convert them into a single CSV file, and store it in S3.
import pandas as pd
import boto3

# Boto3 clients
resource = boto3.resource('s3')
client = boto3.client('s3')
session = boto3.session.Session()

bucket = 'test'
path = 'json_files/data/'
delimiter = '/'
suffix = '.json'

json_files = client.list_objects(Bucket=bucket, Prefix=path, Delimiter=delimiter)
#print(json_files)
for obj in json_files['Contents']:
    #print(obj)
    body = client.get_object(Bucket=bucket, Key=obj['Key'])["Body"]
    df = pd.read_json(body, lines=True)
    print(df)
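To get from here to the single CSV, one sketch is to collect the per-file frames, concatenate them, and upload the result with put_object; the output key merged.csv is my own placeholder, not something from the question.

import io

dfs = []
for obj in json_files['Contents']:
    if not obj['Key'].endswith(suffix):
        continue  # skip anything that isn't one of the JSON part files
    body = client.get_object(Bucket=bucket, Key=obj['Key'])['Body']
    dfs.append(pd.read_json(body, lines=True))

merged = pd.concat(dfs, ignore_index=True)

# Write the CSV into an in-memory buffer and upload it back to S3
csv_buffer = io.StringIO()
merged.to_csv(csv_buffer, index=False)
client.put_object(Bucket=bucket, Key=path + 'merged.csv', Body=csv_buffer.getvalue())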
This code calls the Sportradar API. The API returns the data as JSON or XML; below is my attempt at taking the JSON and turning it into a dataframe.
import numpy as np
import pandas as pd
import http.client
import json
from pandas.io.json import json_normalize
#API Call including my key
conn = http.client.HTTPSConnection("api.sportradar.us")
conn.request("GET", "/nfl/official/trial/v5/en/players/0acdcd3b-5442-4311-a139-ae7c506faf88/profile.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
#conn.request("GET", "/nfl/official/trial/v5/en/games/b7aeb58f-7987-4202-bc41-3ad9a5b83fa4/pbp.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
#conn.request("GET", "/nfl/official/trial/v5/en/teams/0d855753-ea21-4953-89f9-0e20aff9eb73/full_roster.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
#conn.request("GET", "/nfl/official/trial/v5/en/games/030d37cf-b896-4f10-b16e-2a5120fef6cf/pbp.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
res = conn.getresponse()
data = res.read()
data_dec = data.decode("utf-8")
json_data = json.loads(data_dec)
flat_data = json_normalize(json_data)
print(json_data)
df = pd.DataFrame.from_records(flat_data)
df2 = pd.DataFrame.from_dict(json_data, orient='index')
df2.reset_index(level=0, inplace=True)
#The closest thing to a dataframe I can get
df.head()
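If the goal is a tidy table rather than one very wide row, json_normalize can expand a nested list from the payload into rows. The keys below ('seasons', 'id', 'name') are hypothetical placeholders — substitute whatever your profile response actually contains.

# Hypothetical record_path: turns each element of a nested list into a row,
# carrying a couple of top-level fields along as metadata columns
df_seasons = json_normalize(
    json_data,
    record_path=['seasons'],   # assumed key; inspect your payload first
    meta=['id', 'name'],       # assumed top-level fields
    errors='ignore',
)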
Why not make use of a Python wrapper that is publicly available and maintained? See link.
How can I read a csv at a url into a dataframe in Pyspark without writing it to disk?
I've tried the following with no luck:
import urllib.request
from io import StringIO
url = "https://raw.githubusercontent.com/pandas-dev/pandas/master/pandas/tests/data/iris.csv"
response = urllib.request.urlopen(url)
data = response.read()
text = data.decode('utf-8')
f = StringIO(text)
df1 = sqlContext.read.csv(f, header = True, schema=customSchema)
df1.show()
TL;DR It is not possible, and in general transferring data through the driver is a dead end.
Before Spark 2.3, the csv reader can read only from a URI (and http is not supported).
In Spark 2.3 you can use an RDD:
spark.read.csv(sc.parallelize(text.splitlines()))
but the data will be written to disk.
You can createDataFrame from Pandas:
spark.createDataFrame(pd.read_csv(url))
but this once again writes to disk.
If the file is small, I'd just use SparkFiles:
from pyspark import SparkFiles
spark.sparkContext.addFile(url)
spark.read.csv(SparkFiles.get("iris.csv"), header=True)
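Putting the recommended approach together, a minimal end-to-end sketch (assuming a SparkSession named spark, e.g. the one the pyspark shell provides):

from pyspark import SparkFiles

url = "https://raw.githubusercontent.com/pandas-dev/pandas/master/pandas/tests/data/iris.csv"

# addFile downloads the file once into Spark's managed file store
spark.sparkContext.addFile(url)

df = spark.read.csv(SparkFiles.get("iris.csv"), header=True, inferSchema=True)
df.show()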