Export JSON to CSV using Python - json

I wrote a code to extract some information from a website. the output is in JSON and I want to export it to CSV. So, I tried to convert it to a pandas dataframe and then export it to CSV in pandas. I can print the results but still, it doesn't convert the file to a pandas dataframe. Do you know what the problem with my code is?
# -*- coding: utf-8 -*-
# To create http request/session
import requests
import re, urllib
import pandas as pd
from BeautifulSoup import BeautifulSoup
url = "https://www.indeed.com/jobs?
q=construction%20manager&l=Houston&start=10"
# create session
s = requests.session()
html = s.get(url).text
# exctract job IDs
job_ids = ','.join(re.findall(r"jobKeysWithInfo\['(.+?)'\]", html))
ajax_url = 'https://www.indeed.com/rpc/jobdescs?jks=' +
urllib.quote(job_ids)
# do Ajax request and convert the response to json
ajax_content = s.get(ajax_url).json()
print(ajax_content)
#Convert to pandas dataframe
df = pd.read_json(ajax_content)
#Export to CSV
df.to_csv("c:\\users\\Name\desktop\\newcsv.csv")
The error message is:
Traceback (most recent call last):
File "C:\Users\Mehrdad\Desktop\Indeed 06.py", line 21, in
df = pd.read_json(ajax_content)
File "c:\python27\lib\site-packages\pandas\io\json\json.py", line 408, in read_json
path_or_buf, encoding=encoding, compression=compression,
File "c:\python27\lib\site-packages\pandas\io\common.py", line 218, in get_filepath_or_buffer
raise ValueError(msg.format(_type=type(filepath_or_buffer)))
ValueError: Invalid file path or buffer object type:

The problem was that nothing was going into the dataframe when you called read_json() because it was a nested JSON dict:
import requests
import re, urllib
import pandas as pd
from pandas.io.json import json_normalize
url = "https://www.indeed.com/jobs?q=construction%20manager&l=Houston&start=10"
s = requests.session()
html = s.get(url).text
job_ids = ','.join(re.findall(r"jobKeysWithInfo\['(.+?)'\]", html))
ajax_url = 'https://www.indeed.com/rpc/jobdescs?jks=' + urllib.quote(job_ids)
ajax_content= s.get(ajax_url).json()
df = json_normalize(ajax_content).transpose()
df.to_csv('your_output_file.csv')
Note that I called json_normalize() to collapse the nested columns from the JSON. I also called transpose() so that the rows were labelled with the job ID rather than columns. This will give you a dataframe that looks like this:
0079ccae458b4dcf <p><b>Company Environment: </b></p><p>Planet F...
0c1ab61fe31a5c62 <p><b>Commercial Construction Project Manager<...
0feac44386ddcf99 <div><div>Trendmaker Homes is currently seekin...
...
It's not really clear what your expected output is, though ... what are you expecting the DataFrame/CSV file to look like?. If you actually were looking for just a single row/Series with the job ID's as column labels, just remove the call to transpose()

Related

Pandas parallel URL downloads with pd.read_html

I know I can download a csv file from a web page by doing:
import pandas as pd
import numpy as np
from io import StringIO
URL = "http://www.something.com"
data = pd.read_html(URL)[0].to_csv(index=False, header=True)
file = pd.read_csv(StringIO(data), sep=',')
Now I would like to do the above for more URLs at the same time, like when you open different tabs in your browser. In other words, a way to parallelize this when you have different URLs, instead of looping through or doing it one at a time. So, I thought of having a series of URLs inside a dataframe, and then create a new column which contains the strings 'data', one for each URL.
list_URL = ["http://www.something.com", "http://www.something2.com",
"http://www.something3.com"]
df = pd.DataFrame(list_URL, columns =['URL'])
df['data'] = pd.read_html(df['URL'])[0].to_csv(index=False, header=True)
But it gives me error: cannot parse from 'Series'
Is there a better syntax, or does this mean I cannot do this in parallel for more than one URL?
You could try like this:
import pandas as pd
URLS = [
"https://en.wikipedia.org/wiki/Periodic_table#Presentation_forms",
"https://en.wikipedia.org/wiki/Planet#Planetary_attributes",
]
df = pd.DataFrame(URLS, columns=["URL"])
df["data"] = df["URL"].map(
lambda x: pd.read_html(x)[0].to_csv(index=False, header=True)
)
print(df)
# Output
URL data
0 https://en.wikipedia.org/wiki/Periodic_t... 0\r\nPart of a series on the\r\nPeriodic...
1 https://en.wikipedia.org/wiki/Planet#Pla... 0\r\n"The eight known planets of the Sol...

I am having trouble converting my nested json into a dataframe. I am getting the json from an API and want it in a dataframe

This code is from Sportradar API. The API outputs the data as JSON or XML; below is my attempt at taking the JSON and making it into a dataframe.
import numpy as np
import pandas as pd
import http.client
import json
from pandas.io.json import json_normalize
#API Call including my key
conn = http.client.HTTPSConnection("api.sportradar.us")
conn.request("GET", "/nfl/official/trial/v5/en/players/0acdcd3b-5442-4311-a139-ae7c506faf88/profile.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
#conn.request("GET", "/nfl/official/trial/v5/en/games/b7aeb58f-7987-4202-bc41-3ad9a5b83fa4/pbp.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
#conn.request("GET", "/nfl/official/trial/v5/en/teams/0d855753-ea21-4953-89f9-0e20aff9eb73/full_roster.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
#conn.request("GET", "/nfl/official/trial/v5/en/games/030d37cf-b896-4f10-b16e-2a5120fef6cf/pbp.json?api_key=99s3ewmn5rrdrd9r3v5wrfgd")
res = conn.getresponse()
data = res.read()
data_dec = data.decode("utf-8")
json_data = json.loads(data_dec)
flat_data = json_normalize(json_data)
print(json_data)
df = pd.DataFrame.from_records(flat_data)
df2 = pd.DataFrame.from_dict(json_data, orient='index')
df2.reset_index(level=0, inplace=True)
#The closest thing to a dataframe I can get
df.head()
Why not make use of a Python Wrapper that is publicly available and maintained.
See link.

Convert dataframe to JSON using Python

I have been trying to convert a dataframe to JSON using Python. I am able to do it successfully but i am not getting the required format of JSON.
Code -
df1 = df.rename_axis('CUST_ID').reset_index()
df.to_json('abc.json')
Here, abc.json is the filename of JSON and df is the required dataframe.
What I am getting -
{"CUST_LAST_UPDATED":
{"1000":1556879045879.0,"1001":1556879052416.0},
"CUST_NAME":{"1000":"newly
updated_3_file","1001":"heeloo1"}}
What I want -
[{"CUST_ID":1000,"CUST_NAME":"newly
updated_3_file","CUST_LAST_UPDATED":1556879045879},
{"CUST_ID":1001,"CUST_NAME":"heeloo1","CUST_LAST_UPDATED":1556879052416}]
Error -
Traceback (most recent call last):
File
"C:/Users/T/PycharmProject/test_pandas.py",
line 19, in <module>
df1 = df.rename_axis('CUST_ID').reset_index()
File "C:\Users\T\AppData\Local\Programs\Python\Python36\lib\site-
packages\pandas\core\frame.py", line 3379, in reset_index
new_obj.insert(0, name, level_values)
File "C:\Users\T\AppData\Local\Programs\Python\Python36\lib\site-
packages\pandas\core\frame.py", line 2613, in insert
allow_duplicates=allow_duplicates)
File "C:\Users\T\AppData\Local\Programs\Python\Python36\lib\site-
packages\pandas\core\internals.py", line 4063, in insert
raise ValueError('cannot insert {}, already exists'.format(item))
ValueError: cannot insert CUST_ID, already exists
df.head() Output -
CUST_ID CUST_LAST_UPDATED CUST_NAME
0 1000 1556879045879 newly updated_3_file
1 1001 1556879052416 heeloo1
How to change the format while converting dataframe to JSON?
Use DataFrame.rename_axis with DataFrame.reset_index for column from index and then DataFrame.to_json with orient='records':
df1 = df.rename_axis('CUST_ID').reset_index()
df1.to_json('abc.json', orient='records')
[{"CUST_ID":"1000",
"CUST_LAST_UPDATED":1556879045879.0,
"CUST_NAME":"newly updated_3_file"},
{"CUST_ID":"1001",
"CUST_LAST_UPDATED":1556879052416.0,
"CUST_NAME":"heeloo1"}]
EDIT:
Because there is default index in data, use:
df1.to_json('abc.json', orient='records')
Verify:
print (df1.to_json(orient='records'))
[{"CUST_ID":1000,
"CUST_LAST_UPDATED":1556879045879,
"CUST_NAME":"newly pdated_3_file"},
{"CUST_ID":1001,
"CUST_LAST_UPDATED":1556879052416,
"CUST_NAME":"heeloo1"}]
You can convert a dataframe to a jason format using to_dict:
df1.to_dict('records')
the outpit would the one that you need.
Suppose if dataframe has nan values in each row and you don't want them in your json file. Follow below code
import pandas as pd
from pprint import pprint
import json
import argparse
if __name__=="__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--csv")
parser.add_argument("--json")
args = parser.parse_args()
entities=pd.read_csv(args.csv)
json_data=[row.dropna().to_dict() for index,row in entities.iterrows()]
with open(args.json,"w") as file:
json.dump(json_data,file)

How to convert json file into table structure in redshift using python

How can I convert JSON file into a table structure in Redshift? I tried the below python code.
import boto3
import json
import os
import sys
import psycopg2
import csv
from collections import defaultdict
def jsonfile(path):
session = boto3.Session(
aws_access_key_id='dfjfkgj',
aws_secret_access_key='sdfg',
region_name='us-west-2')
s3 = session.resource('s3')
bucket= s3.Bucket('ag-redshift-poc')
with open(path, 'rb') as data:
res=json.load(data)
f = open('data.csv','wb')
output = csv.writer(f)
output.writerow(res[0].keys())
for row in res:
output.writerow(row.values())
bucket.put_object(Key=('C:\Python27\data.csv'),Body=res)
print 'success'
def redshift():
co=psycopg2.connect(dbname= 'redshiftpoc', host='shdjf',
port= '5439', user= 'admin', password= 'snd')
curr = co.cursor()
curr.execute("""copy sample from 's3://ag-redshift-poc/testfile/json.txt'
CREDENTIALS 'aws_access_key_id=fdfd;aws_secret_access_key=sxhd'
""")
co.commit()
print 'success'
curr.close()
co.close()
jsonfile('C:\Python27\json.txt')
redshift()
Redshift can directly absorb JSON to COPY into your table. (Though not very efficient).
In your case, modify the COPY query to,
COPY sample FROM 's3://<bucket_name>/<path_to_json>'
CREDENTIALS 'aws_access_key_id=xxxx;aws_secret_access_key=xxxx'
JSON 'auto' ACCEPTINVCHARS;
Please note JSON 'auto' in query. This maps every column in table with keys in JSON file.
More details here in the COPY examples

Python folder contents CSV writer

I'm trying to make a simple command line script with Python code that generates a CSV when it scans the contents of a directory, but I'm not sure if I'm doing it correctly, cause I keep getting errors. Can someone tell me what the heck I'm doing wrong?
import sys
import argparse
import os
import string
import fnmatch
import csv
from string import Template
from os import path
from os.path import basename
header = ["Title","VersionData","PathOnClient","OwnerId","FirstPublishLocationId","RecordTypeId","TagsCsv"]
if not sys.argv.len < 2:
with open(sys.argv[1], 'w') as f:
writer = csv.DictWriter(f, fieldnames = header, delimiter=',')
writer.writeheader()
if os.path.isdir(sys.argv[2]):
for d in os.scandir(sys.argv[2]):
row = Template('"$title","$path","$path"') #some default values in the template were omitted here
writer.writerow(row.substitute(title=basename(d.path)), path=path.abspath(d.path))
Right off the bat, csvwriter.writerow(row) takes only one argument. You need to wrap your arguments inside brackets and then join with comma.
Moreover, you cannot call other functions within the row object, which is what you are trying to do with row.substitute(args) etc.
Figured it out. For anyone else needing a quick CSV listing of folders, here's the code I got to work:
#!/usr/bin/env python3
import sys, os, csv
from string import Template
from pathlib import PurePath, PureWindowsPath
from os.path import basename
header = ["Title","Path","","","","",""] # insert what header you need, if any
if not len(sys.argv) < 2:
with open(sys.argv[1], 'w') as f:
writer = csv.DictWriter(f, fieldnames=header, dialect='excel', delimiter=',', quoting=csv.QUOTE_ALL)
writer.writeheader()
initPath = os.path.abspath(sys.argv[2])
if sys.platform.startswith('linux') or sys.platform.startswith('cygwin') or sys.platform.startswith('darwin'):
p = PurePath(initPath)
else:
if sys.platform.startswith('win32'):
p = PureWindowsPath(initPath)
if os.path.isdir(str(p)) and not str(p).startswith('.'):
for d in os.scandir(str(p)):
srow = Template('"$title","$path", "","","",""')
#s = srow.substitute({'title': basename(d.path), 'path': os.path.abspath(d.path)) #
#print(s) # this is for testing if the content produces what's expected
row = {'Title': basename(d.path), 'Path': os.path.abspath(d.path)} # the dictionary must have the same number of entries as the number of header fields your CSV is going to contain.
writer.writerow(row)