Python3: save JSON from the CoinGecko API to a CSV file - json

I am struggling to convert a json file to a csv file. Any help would be appreciated. I am using Python3
Code
import json
import urllib.request
url = 'https://api.coingecko.com/api/v3/coins/bitcoin/market_chart?vs_currency=usd&days=1&interval=daily&sparkline=false'
req = urllib.request.Request(url)
##parsing response
myfile=open("coingecko1.csv","w",encoding="utf8")
headers="Prices,MrkCap,TolVol \n"
myfile.write(headers)
r = urllib.request.urlopen(req).read()
cont = json.loads(r.decode('utf-8'))
print (cont)#Just to check json result
for market in cont:
    prices =(cont["prices"])
    market_caps = (cont["market_caps"])
    total_volumes = (cont["total_volumes"])
    content= prices+","+str(market_caps)+","+str(total_volumes)+" \n"
    myfile.write(content)
print("job complete")
Python Result
{'prices': [[1629331200000, 45015.46554608543], [1629361933000, 44618.52978218442]], 'market_caps': [[1629331200000, 847143004614.999], [1629361933000, 837151985590.3453]], 'total_volumes': [[1629331200000, 34668999387.83819], [1629361933000, 33367392889.386738]]}
Traceback (most recent call last):
  File "ma1.py", line 22, in <module>
    content= prices+","+str(market_caps)+","+str(total_volumes)+" \n"
TypeError: can only concatenate list (not "str") to list
CSV Result
Thank You

Your JSON is nested: each key maps to a list of [timestamp, value] pairs. To read it easily as CSV you must flatten it out first.
I've reformatted the code to dump to CSV. Check below.
import csv
import json
import urllib.request

url = 'https://api.coingecko.com/api/v3/coins/bitcoin/market_chart?vs_currency=usd&days=1&interval=daily&sparkline=false'
req = urllib.request.Request(url)
r = urllib.request.urlopen(req).read()
cont = json.loads(r.decode('utf-8'))

# flatten the JSON data to read csv easily: one dict per timestamp
flatten_data = {}
for key in cont:
    for value in cont[key]:
        if value[0] not in flatten_data:
            flatten_data[value[0]] = {}
        flatten_data[value[0]].update({key: value[1]})

# write csv with DictWriter (newline='' avoids blank rows on Windows)
with open('coingecko1.csv', 'w', encoding='utf-8', newline='') as csvfile:
    headers = ['Item', 'Prices', 'MrkCap', 'TolVol']
    writer = csv.DictWriter(csvfile, fieldnames=headers)
    writer.writeheader()
    for k, v in flatten_data.items():
        v.update({'Item': k})
        # renamed the columns as required
        v['Prices'] = v.pop('prices')
        v['MrkCap'] = v.pop('market_caps')
        v['TolVol'] = v.pop('total_volumes')
        writer.writerow(v)
print("job complete")

Related

Code Workbooks - File not found using hadoop_path

I have a python transform in code workbooks that is running this code:
import pandas as pd

def contents(dataset_with_files):
    fs = dataset_with_files.filesystem()
    filenames = [f.path for f in fs.ls()]
    fp = fs.hadoop_path + "/" + filenames[0]
    with open(fp, 'r') as f:
        t = f.read()
    rows = {"text": [t]}
    return pd.DataFrame(rows)
But I am getting the error FileNotFoundError: [Errno 2] No such file or directory:
My understanding is that this is the correct way to access a file in HDFS. Is this a Code Repositories versus Code Workbooks limitation?
This documentation helped me figure it out:
https://www.palantir.com/docs/foundry/code-workbook/transforms-unstructured/
It was actually a pretty small change. If you are using filesystem(), you only need the relative path.
import pandas as pd

def contents_old(pycel_test):
    fs = pycel_test.filesystem()
    filenames = [f.path for f in fs.ls()]
    with fs.open(filenames[0], 'r') as f:
        value = f.read()  # e.g. read the file contents, as in the original transform
    rows = {"values": [value]}
    return pd.DataFrame(rows)
There is also this option, but I found it 10x slower.
from pyspark.sql import Row

def contents(dataset_with_files):
    fs = dataset_with_files.filesystem()  # This is the FileSystem object.
    MyRow = Row("column")

    def process_file(file_status):
        with fs.open(file_status.path, 'r') as f:
            yield MyRow(f.read())  # e.g. one row per file with its contents

    rdd = fs.files().rdd
    rdd = rdd.flatMap(process_file)
    df = rdd.toDF()
    return df

How to read from excel and write in json file using python?

I am trying to create a JSON file from data read out of test.xlsx. My sample code is below.
Instead of hard-coding "WO-12345" and the other values, I want them to be read from the Excel sheet, i.e. each value should come from a particular cell.
import xlrd
from collections import OrderedDict
import simplejson as json
import json
jsonfile = open('data1.json', 'w')
data_list = []
data = OrderedDict()
data['workOrder'] = "WO-12345"
data['alternateStart'] = "2018-01-13T10:00:00Z"
data['mobileNumber'] = "(555) 555-5555"
data['officeNumber'] = "(555) 555-5554"
data['description'] = "Testing"
data['equipment'] = "Testing"
data_list.append(data)
j = json.dumps(data_list)
json.dump(data, jsonfile, indent=3, sort_keys=False)
jsonfile.write('\n')
If you want to read an Excel file there's pandas.read_excel; it returns a pandas.DataFrame, which has a to_json method.
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
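A minimal sketch of that approach, assuming test.xlsx has one row per work order and column headers matching the keys above:
import pandas as pd

# read the spreadsheet; each row becomes one record in the JSON array
df = pd.read_excel('test.xlsx')
df.to_json('data1.json', orient='records', indent=3)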

python3 read data from csv file and convert it into json

This is my code to convert CSV into JSON.
import csv
import json

with open('Documents/SampleCSVStory.csv', 'r') as f:
    reader = csv.reader(f, delimiter=';')
    data_list = list()
    for row in reader:
        data_list.append(row)

data = [dict(zip(data_list[0], row)) for row in data_list]
data.pop(0)
s = json.dumps(data)
print(s)
But the output is coming out like this:
[{"Id,Name,Description": "1,User 1,Python Developer"}
My expectation is:
[{"Id": "1", "Name": "User 1", "Description": "Python Developer"}]
Can anyone help me with this, please?
Use csv.DictReader. Note that the sample data is comma-separated, so passing delimiter=';' is what collapses each row into a single field; the default comma delimiter is the right one here:
import csv
import json

with open('Documents/SampleCSVStory.csv', 'r') as f:
    reader = csv.DictReader(f)  # default comma delimiter
    print(json.dumps([row for row in reader]))
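With the comma-delimited sample above, this prints something like:
[{"Id": "1", "Name": "User 1", "Description": "Python Developer"}]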

How to soup a browser response

I've got a program that sends a lot of requests to a website using RoboBrowser and gets the answers, but now I need to filter these answers to only the ones that don't contain the string " Case Status Not Available ". I tried to use BeautifulSoup for it, but it is returning an error.
Here's the code so far:
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import csv
import pickle
import requests
from robobrowser import RoboBrowser

def rename_files():
    file_list = os.listdir(r"C:\\PROJECT\\pdfs")
    print(file_list)
    saved_path = os.getcwd()
    print('Current working directory is '+saved_path)
    os.chdir(r'C:\\PROJECT\\pdfs')
    for file_name in file_list:
        os.rename(file_name, file_name.translate(None, " "))
    os.chdir(saved_path)
rename_files()

def run(command):
    if platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors

# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
    print "%s is not a directory" % base_directory
    exit(1)
# Change this to your pdf2htmlEX executable location
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
    print "Could not find %s" % bin_path
    exit(1)

for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
    for file_name in file_name_list:
        # If this is not a PDF file
        if not file_name.endswith('.pdf'):
            # Skip it
            continue
        file_path = os.path.join(dir_path, file_name)
        # Convert your PDF to HTML here
        args = (bin_path, file_name, file_path)
        success, output, errors = run("python %s -o %s.html %s " %args)
        if not success:
            print "Could not convert %s to HTML" % file_path
            print "%s" % errors

htmls_path = 'C:\\PROJECT'
with open('score.csv', 'w') as f:
    writer = csv.writer(f)
    for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
        for file_name in file_name_list:
            if not file_name.endswith('.html'):
                continue
            with open(file_name) as markup:
                soup = BeautifulSoup(markup.read())
                text = soup.get_text()
                match = re.findall("PA/(\S*)", text)#To remove the names that appear, just remove the last (\S*), to add them is just add the (\S*), before it there was a \s*
                print(match)
                writer.writerow(match)
                for item in match:
                    data = item.split('/')
                    case_number = data[0]
                    case_year = data[1]
                    browser = RoboBrowser()
                    browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
                    form = browser.get_forms()[0]  # Get the first form on the page
                    form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
                    form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year
                    browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])
                    # Use BeautifulSoup to parse this data
                    print(browser.response.text)
                    souptwo = BeautifulSoup(browser.response.text)
                    texttwo = soup.get_text()
                    matchtwo = soup.findall('<td class="fieldData">Case Status Not Available</TD>')
                    if not matchtwo:
                        soupthree = BeautifulSoup(browser.response.text)
                        print soupthree
The error that returns is:
Traceback (most recent call last):
  File "C:\PROJECT\pdfs\converterpluspa.py", line 87, in <module>
    matchtwo = soup.findall('<td class="fieldData">Case Status Not Available</TD>')
TypeError: 'NoneType' object is not callable
Line 87 attempts to call soup.findall, but BeautifulSoup 4 has no method named findall (the method is find_all). Unknown attribute names on a soup object fall through to a tag search, so soup.findall evaluates to None, and calling None raises TypeError: 'NoneType' object is not callable. Use find_all instead, call it on souptwo (the soup parsed from the browser response just above) rather than on soup, and search for the tag and its text rather than a raw HTML string.
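A minimal sketch of the corrected check, assuming BeautifulSoup 4 and the same response object (the exact cell text and surrounding whitespace may need adjusting):
souptwo = BeautifulSoup(browser.response.text, 'html.parser')
# find_all searches parsed tags; match the cell by class and text instead of a raw HTML string
matchtwo = souptwo.find_all('td', class_='fieldData', text=re.compile('Case Status Not Available'))
if not matchtwo:
    # no "Case Status Not Available" cell, so keep this response
    print(souptwo)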

Scraping Data from JSON

How to scrape this data,
http://jsonviewer.stack.hu/#http://91.134.133.185:5000/viaroute?loc=25.299919,55.376774&loc=25.298738,55.369181
and extract only "total_time" to a file?
It should be fairly easy to achieve this with a little searching.
You just have to find some modules to work with JSON, dataframes and text files, and learn how to use them.
Steps (see the sketch after the documentation links below):
1 - read the JSON data using pandas.read_json()
2 - set data = df['total_time']
3 - write data using DataFrame.to_csv()
Simple as py.
Documentation:
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
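A rough sketch of those steps. Because the response is a single nested object, pandas.json_normalize is used here instead of read_json to flatten it; the URL comes from the question and the output file name total_time.csv is just a placeholder:
import json
import urllib.request

import pandas as pd

url = 'http://91.134.133.185:5000/viaroute?loc=25.299919,55.376774&loc=25.298738,55.369181'
data = json.load(urllib.request.urlopen(url))

# flatten the nested dict; total_time ends up in the 'route_summary.total_time' column
df = pd.json_normalize(data)
df['route_summary.total_time'].to_csv('total_time.csv', index=False)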
import json
json_string = '''Json data here'''
data = json.loads(json_string)
total_time = data["route_summary"]["total_time"]
f = open("file_name_here.txt", "w+")
f.write(str(total_time))
f.close()
I've written this program for you:
import json, urllib2
url = 'http://91.134.133.185:5000/viaroute?loc=25.299919,55.376774&loc=25.298738,55.369181'
response = urllib2.urlopen(url)
data = json.load(response)
tot_time = str(data['route_summary']['total_time'])
s = tot_time + "\n"
outfile = "C:\\Users\\USER\\Desktop\\outfile.txt"
with open(outfile, "a+") as f:
    f.write(s)
It'll append each observation to the end of outfile.txt
Saving json data to a file and reading that file
import json, urllib2
url = 'http://91.134.133.185:5000/viaroute?loc=25.299919,55.376774&loc=25.298738,55.369181'
response = urllib2.urlopen(url)
data = json.load(response)
outfile = "C:\\Users\\USER\\Desktop\\outfile.txt"
#saving json to file
with open(outfile, "w") as f:
    f.write(str(data))
#reading file with json data
with open(outfile, 'r') as g:
    json_data = g.readline()
    print json_data
#Output:
{u'route_geometry': u'{_ego#m}|rhBpBaBvHuC`EuArEUtEtAlDvEnD`MlDvMli#hsEfFzn#QlTgNhwCs#fKwBhF', u'status': 0, u'via_indices': [0, 15], u'route_summary': {u'total_time': 101, u'end_point': u'', u'start_point': u'', u'total_distance': 871}, u'route_name': [u'', u''], u'hint_data': {u'checksum': 326195011, u'locations': [u'AXQDAP____8AAAAABwAAABEAAAAYAAAAIwIAAERwAgAAAAAADgyCAef7TAMCAAEB', u'bOsDAP____8AAAAAAwAAAAcAAADFAQAAFAAAAEJwAgAAAAAANQeCAd3dTAMFAAEB']}, u'via_points': [[25.299982, 55.376873], [25.29874, 55.369179]], u'status_message': u'Found route between points', u'found_alternative': False}
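Note that str(data) writes a Python repr (with u'' prefixes), not valid JSON, so it can't be parsed back with json.loads. A small variant that round-trips cleanly, reusing the same data and outfile from above:
#saving json to file
with open(outfile, "w") as f:
    json.dump(data, f)
#reading file with json data
with open(outfile, 'r') as g:
    json_data = json.load(g)
print(json_data['route_summary']['total_time'])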