How to convert json file into table structure in redshift using python - json

How can I convert JSON file into a table structure in Redshift? I tried the below python code.
import boto3
import json
import os
import sys
import psycopg2
import csv
from collections import defaultdict
def jsonfile(path):
session = boto3.Session(
aws_access_key_id='dfjfkgj',
aws_secret_access_key='sdfg',
region_name='us-west-2')
s3 = session.resource('s3')
bucket= s3.Bucket('ag-redshift-poc')
with open(path, 'rb') as data:
res=json.load(data)
f = open('data.csv','wb')
output = csv.writer(f)
output.writerow(res[0].keys())
for row in res:
output.writerow(row.values())
bucket.put_object(Key=('C:\Python27\data.csv'),Body=res)
print 'success'
def redshift():
co=psycopg2.connect(dbname= 'redshiftpoc', host='shdjf',
port= '5439', user= 'admin', password= 'snd')
curr = co.cursor()
curr.execute("""copy sample from 's3://ag-redshift-poc/testfile/json.txt'
CREDENTIALS 'aws_access_key_id=fdfd;aws_secret_access_key=sxhd'
""")
co.commit()
print 'success'
curr.close()
co.close()
jsonfile('C:\Python27\json.txt')
redshift()

Redshift can directly absorb JSON to COPY into your table. (Though not very efficient).
In your case, modify the COPY query to,
COPY sample FROM 's3://<bucket_name>/<path_to_json>'
CREDENTIALS 'aws_access_key_id=xxxx;aws_secret_access_key=xxxx'
JSON 'auto' ACCEPTINVCHARS;
Please note JSON 'auto' in query. This maps every column in table with keys in JSON file.
More details here in the COPY examples

Related

How can I save some json files generated in a for loop as csv?

Sorry, I am new in coding in Python, I would need to save a json file generated in a for loop as csv for each iteration of the loop.
I wrote a code that works fine to generate the first csv file but then it is overwritten and I did not find a solution yet. Can anyone help me? many thanks
from twarc.client2 import Twarc2
import itertools
import pandas as pd
import csv
import json
import numpy as np
# Your bearer token here
t = Twarc2(bearer_token="AAAAAAAAAAAAAAAAAAAAA....WTW")
# Get a bunch of user handles you want to check:
list_of_names = np.loadtxt("usernames.txt",dtype="str")
# Get the `data` part of every request only, as one list
def get_data(results):
return list(itertools.chain(*[result['data'] for result in results]))
user_objects = get_data(t.user_lookup(users=list_of_names, usernames=True))
for user in user_objects:
following = get_data(t.following(user['id']))
# Do something with the lists
print(f"User: {user['username']} Follows {len(following)} -2")
json_string = json.dumps(following)
df = pd.read_json(json_string)
df.to_csv('output_file.csv')
You need to add a sequence number or some other unique identifier to the filename. The clearest example would be to keep track of a counter, or use a GUID. Below I've used a counter that is initialized before your loop, and is incremented in each iteration. This will produce a list of files like output_file_1.csv, output_file_2.csv, output_file_3.csv and so on.
counter = 0
for user in user_objects:
following = get_data(t.following(user['id']))
# Do something with the lists
print(f"User: {user['username']} Follows {len(following)} -2")
json_string = json.dumps(following)
df = pd.read_json(json_string)
df.to_csv('output_file_' + str(counter) + '.csv')
counter += 1
We convert the integer to a string, and paste it inbetween the name of your file and its extension.
from twarc.client2 import Twarc2
import itertools
import pandas as pd
import csv
import json
import numpy as np
# Your bearer token here
t = Twarc2(bearer_token="AAAAAAAAAAAAAAAAAAAAA....WTW")
# Get a bunch of user handles you want to check:
list_of_names = np.loadtxt("usernames.txt",dtype="str")
# Get the `data` part of every request only, as one list
def get_data(results):
return list(itertools.chain(*[result['data'] for result in results]))
user_objects = get_data(t.user_lookup(users=list_of_names, usernames=True))
for idx, user in enumerate(user_objects):
following = get_data(t.following(user['id']))
# Do something with the lists
print(f"User: {user['username']} Follows {len(following)} -2")
json_string = json.dumps(following)
df = pd.read_json(json_string)
df.to_csv(f'output_file{str(idx)}.csv')

How do I split / chunk Large JSON Files with AWS glueContext before converting them to JSON?

I'm trying to convert a 20GB JSON gzip file to parquet using AWS Glue.
I've setup a job using Pyspark with the code below.
I got this log WARN message:
LOG.WARN: Loading one large unsplittable file s3://aws-glue-data.json.gz with only one partition, because the file is compressed by unsplittable compression codec.
I was wondering if there was a way to split / chunk the file? I know I can do it with pandas, but unfortunately that takes far too long (12+ hours).
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
import pyspark.sql.functions
from pyspark.sql.functions import col, concat, reverse, translate
from awsglue.context import GlueContext
from awsglue.job import Job
glueContext = GlueContext(SparkContext.getOrCreate())
test = glueContext.create_dynamic_frame_from_catalog(
database="test_db",
table_name="aws-glue-test_table")
# Create Spark DataFrame, remove timestamp field and re-name other fields
reconfigure = test.drop_fields(['timestamp']).rename_field('name', 'FirstName').rename_field('LName', 'LastName').rename_field('type', 'record_type')
# Create pyspark DF
spark_df = reconfigure.toDF()
# Filter and only return 'a' record types
spark_df = spark_df.where("record_type == 'a'")
# Once filtered, remove the record_type column
spark_df = spark_df.drop('record_type')
spark_df = spark_df.withColumn("LastName", translate("LastName", "LName:", ""))
spark_df = spark_df.withColumn("FirstName", reverse("FirstName"))
spark_df.write.parquet("s3a://aws-glue-bucket/parquet/test.parquet")
Spark does not parallelize reading a single gzip file. However, you can do split it in chunks.
Also, Spark is really slow at reading gzip files(since its not paralleized). You can do this to speed it up:
file_names_rdd = sc.parallelize(list_of_files, 100)
lines_rdd = file_names_rdd.flatMap(lambda _: gzip.open(_).readlines())

How to read a csv file from S3 bucket using AWS lambda and write it as new CSV to another S3 bucket? Python boto3

Ok so I am a beginner to AWS in general. I am writing a lambda function to trigger based on file upload event in S3, remove some coulmns and write it to a new bucket. Been banging my head for the past two datas and I am getting different error each time. Can someone modify my code/fix it? outputlv will be my target bucket.. Currently I am getting '/outputlv/output.csv' path does not exist in the with open('/outputlv/output.csv', 'w') as output_file line. Thanks.
import json
import urllib.parse
import boto3
import csv
s3 = boto3.client('s3')
def lambda_handler(event, context):
bucket = event['Records'][0]['s3']['bucket']['name']
key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
file_name = s3.get_object(Bucket=bucket, Key=key)
csv_reader = csv.reader(file_name)
with open('/outputlv/output.csv', 'w') as output_file:
wtr = csv.writer(output_file)
for i in csv_reader:
wtr.writerow(i[0], i[2], i[3])
target_bucket = 'outputlv'
final_file = 'outputlv/output.csv'
s3.put_object(Bucket=target_bucket, Key=final_file)
Why don't you get the content, is it required to work with local files at all ?
response = s3.get_object(Bucket=bucket, Key=key)
# Get file content
content = response['Body'].read()
# Pass file content to csv reader
csv_reader = csv.reader(content)

How to merge multiple JSON files reading from S3, convert to single .csv and store in S3?

Input :
There are 5 part JSON files named as test_par1.json, test_part2.json, test_part3.json, test_part4.json, test_part5.json in s3://test/json_files/data/.
Expected Output :
Single csv file
Explanation : All of the json files are having same number of columns with same structure. They are basically part files of same source.
I want to merge/re partition all of them and convert them into a csv file and store it in S3.
import pandas as pd
import os
import boto3
import numpy
# Boto3 clients
resource = boto3.resource('s3')
client = boto3.client('s3')
session = boto3.session.Session()
bucket = 'test'
path = 'json_files/data/'
delimiter = '/'
suffix = '.json'
json_files = client.list_objects(Bucket=bucket, Prefix=path, Delimiter=delimiter)
#print(inter_files)
for obj in inter_files['Contents']:
#print(obj)
obj = client.get_object(Bucket=bucket, Key=obj['Key'])
#print(obj)
df = pd.read_json(obj["Body"], lines=True)
print(df)

Export JSON to CSV using Python

I wrote a code to extract some information from a website. the output is in JSON and I want to export it to CSV. So, I tried to convert it to a pandas dataframe and then export it to CSV in pandas. I can print the results but still, it doesn't convert the file to a pandas dataframe. Do you know what the problem with my code is?
# -*- coding: utf-8 -*-
# To create http request/session
import requests
import re, urllib
import pandas as pd
from BeautifulSoup import BeautifulSoup
url = "https://www.indeed.com/jobs?
q=construction%20manager&l=Houston&start=10"
# create session
s = requests.session()
html = s.get(url).text
# exctract job IDs
job_ids = ','.join(re.findall(r"jobKeysWithInfo\['(.+?)'\]", html))
ajax_url = 'https://www.indeed.com/rpc/jobdescs?jks=' +
urllib.quote(job_ids)
# do Ajax request and convert the response to json
ajax_content = s.get(ajax_url).json()
print(ajax_content)
#Convert to pandas dataframe
df = pd.read_json(ajax_content)
#Export to CSV
df.to_csv("c:\\users\\Name\desktop\\newcsv.csv")
The error message is:
Traceback (most recent call last):
File "C:\Users\Mehrdad\Desktop\Indeed 06.py", line 21, in
df = pd.read_json(ajax_content)
File "c:\python27\lib\site-packages\pandas\io\json\json.py", line 408, in read_json
path_or_buf, encoding=encoding, compression=compression,
File "c:\python27\lib\site-packages\pandas\io\common.py", line 218, in get_filepath_or_buffer
raise ValueError(msg.format(_type=type(filepath_or_buffer)))
ValueError: Invalid file path or buffer object type:
The problem was that nothing was going into the dataframe when you called read_json() because it was a nested JSON dict:
import requests
import re, urllib
import pandas as pd
from pandas.io.json import json_normalize
url = "https://www.indeed.com/jobs?q=construction%20manager&l=Houston&start=10"
s = requests.session()
html = s.get(url).text
job_ids = ','.join(re.findall(r"jobKeysWithInfo\['(.+?)'\]", html))
ajax_url = 'https://www.indeed.com/rpc/jobdescs?jks=' + urllib.quote(job_ids)
ajax_content= s.get(ajax_url).json()
df = json_normalize(ajax_content).transpose()
df.to_csv('your_output_file.csv')
Note that I called json_normalize() to collapse the nested columns from the JSON. I also called transpose() so that the rows were labelled with the job ID rather than columns. This will give you a dataframe that looks like this:
0079ccae458b4dcf <p><b>Company Environment: </b></p><p>Planet F...
0c1ab61fe31a5c62 <p><b>Commercial Construction Project Manager<...
0feac44386ddcf99 <div><div>Trendmaker Homes is currently seekin...
...
It's not really clear what your expected output is, though ... what are you expecting the DataFrame/CSV file to look like?. If you actually were looking for just a single row/Series with the job ID's as column labels, just remove the call to transpose()