Read nested data from JSON using .proto in Python

I want to read nested data from a JSON file. I created a .proto file based on the JSON below, but I am still not able to read the nested data.
nested.proto (compiled using protoc --python_out=$PWD nested.proto)
syntax = "proto2";

message Employee {
  required int32 EMPLOYEE_ID = 1;

  message ListItems {
    required string FULLADDRESS = 1;
  }

  repeated ListItems EMPLOYEE_ADDRESS = 2;
}
nested.json
{
  "EMPLOYEE_ID": 5044,
  "EMPLOYEE_ADDRESS": [
    {
      "FULLADDRESS": "Suite 762"
    }
  ]
}
parse.py
#!/usr/bin/env python3
import json
from google.protobuf.json_format import Parse
import nested_pb2 as np

input_file = "nested.json"

if __name__ == "__main__":
    # reading json file
    f = open(input_file, 'rb')
    content = json.load(f)
    # initialize emp_table here
    emp_table = np.Employee()
    employee = Parse(json.dumps(content), emp_table, True)
    print(employee.EMPLOYEE_ID)  # output: 5044
    emp_table = np.Employee().ListItems()
    items = Parse(json.dumps(content), emp_table, True)
    print(items.FULLADDRESS)  # output: NO OUTPUT (WHY?)

Couple of things:
The field's type is ListItems, but its name is EMPLOYEE_ADDRESS, so the parsed data lives under employee.EMPLOYEE_ADDRESS. Parsing the whole JSON into a bare ListItems message finds no top-level FULLADDRESS key, and with ignore_unknown_fields=True the other keys are silently dropped, which is why you get no output.
Python is awkward (!) with repeated fields.
You're writing more code than you need.
I recommend adhering to the protobuf style guide (CamelCase message names, lower_snake_case field names) if you can.
Try:
#!/usr/bin/env python3
import json
from google.protobuf.json_format import Parse
import nested_pb2 as np

input_file = "nested.json"

if __name__ == "__main__":
    # reading json file
    f = open(input_file, 'rb')
    content = json.load(f)
    # initialize emp_table here
    emp_table = np.Employee()
    employee = Parse(json.dumps(content), emp_table, True)
    print(employee.EMPLOYEE_ID)  # output: 5044
    for item in employee.EMPLOYEE_ADDRESS:
        print(item)
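Each element of the repeated field is itself a ListItems message, so if you only want the address strings you can read the nested field directly:

for item in employee.EMPLOYEE_ADDRESS:
    print(item.FULLADDRESS)  # output: Suite 762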

Related

Parsing JSON element (Iterate over list of elements)

I created a script that writes a JSON file storing the list of my server URLs and API keys. Now I need to read those elements (URL and API key) so I can get all the users on each server. I'm stuck at the point where I need to iterate over all the server URLs and use each one as a variable.
Below is my sample code.
import sys
import json
import testlink
import xmlrpc.client
import requests
import xml.etree.ElementTree as ET
import openpyxl
from openpyxl import Workbook

requests.packages.urllib3.disable_warnings()

# Create a JSON file composed of TESTLINK_API_PYTHON_SERVER_URL and TESTLINK_API_PYTHON_DEVKEY
def serverJson():
    serverDictionary = {
        "servers": [
            {
                "server_name": "https://firstInstance/lib/api/xmlrpc/v1/xmlrpc.php",
                "devKey": "1234567890abcdef"
            },
            {
                "server_name": "https://secondInstance/lib/api/xmlrpc/v1/xmlrpc.php",
                "devKey": "0987654321fedcba"
            }
        ]
    }
    # Create json file
    with open("server.json", "w") as server:
        json.dump(serverDictionary, server)

# Return TESTLINK_API_PYTHON_SERVER_URL and TESTLINK_API_PYTHON_DEVKEY
def serverList():
    serverJson()
    # Open json file
    server_file = open('server.json')
    # Return JSON object as dictionary
    data = json.load(server_file)
    # Iterate through the json list
    for servers in data['servers']:
        serverName = servers.get('server_name')
        devKey = servers.get('devKey')
        print(serverName, devKey)
    # Start Testlink-API-Call
    TESTLINK_API_PYTHON_SERVER_URL = str()
    TESTLINK_API_PYTHON_DEVKEY = str()
    tls = testlink.TestlinkAPIClient(TESTLINK_API_PYTHON_SERVER_URL, TESTLINK_API_PYTHON_DEVKEY)
    # If/else for each instance & devKey
    # First instance
    if TESTLINK_API_PYTHON_SERVER_URL == (firstServerURL) and TESTLINK_API_PYTHON_DEVKEY == (firstDevKey):
        print("----------User list for First Instance----------")
        tree = ET.parse('usersTLFirstInstance.xml')
        root = tree.getroot()
        for user in root.findall('user'):
            loginID = user.find('id').text
            for tl_first_user in tls.getUserByID(loginID):
                first_name = tl_first_user.get('firstName')
                print(loginID, first_name)
        print("----------Ending List for First Instance----------")
    # Second instance
    elif TESTLINK_API_PYTHON_SERVER_URL == (secondServerURL) and TESTLINK_API_PYTHON_DEVKEY == (secondDevKey):
        print("----------User list for Second Instance----------")
        tree = ET.parse('usersTLSecondInstance.xml')
        root = tree.getroot()
        for user in root.findall('user'):
            loginID = user.find('id').text
            for tl_second_user in tls.getUserByID(loginID):
                first_name = tl_second_user.get('firstName')
                print(loginID, first_name)
        print("----------Ending List for Second Instance----------")

serverList()
Here is the JSON file that the script creates.
{
  "servers": [
    {
      "server_name": "https://firstInstance/lib/api/xmlrpc/v1/xmlrpc.php",
      "devKey": "1234567890abcdef"
    },
    {
      "server_name": "https://secondInstance/lib/api/xmlrpc/v1/xmlrpc.php",
      "devKey": "0987654321fedcba"
    }
  ]
}
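Since each entry in the JSON already carries its own URL and devKey, the loop itself can drive the API calls instead of comparing against hard-coded variables. Here is a minimal sketch of that idea, assuming testlink.TestlinkAPIClient and getUserByID behave as in the code above; the xml_file key is a hypothetical addition that would store each server's user-list XML path alongside its URL:

import json
import xml.etree.ElementTree as ET
import testlink

def list_users(json_path='server.json'):
    with open(json_path) as server_file:
        data = json.load(server_file)
    for server in data['servers']:
        server_url = server['server_name']
        dev_key = server['devKey']
        xml_file = server.get('xml_file')  # hypothetical key, e.g. 'usersTLFirstInstance.xml'
        tls = testlink.TestlinkAPIClient(server_url, dev_key)
        print("----------User list for {}----------".format(server_url))
        root = ET.parse(xml_file).getroot()
        for user in root.findall('user'):
            loginID = user.find('id').text
            for tl_user in tls.getUserByID(loginID):
                print(loginID, tl_user.get('firstName'))
        print("----------Ending list for {}----------".format(server_url))

This removes the need to know firstServerURL/secondServerURL ahead of time; each iteration builds its own client from the JSON entry.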

Code Workbooks - File not found using hadoop_path

I have a Python transform in Code Workbooks that is running this code:
import pandas as pd

def contents(dataset_with_files):
    fs = dataset_with_files.filesystem()
    filenames = [f.path for f in fs.ls()]
    fp = fs.hadoop_path + "/" + filenames[0]
    with open(fp, 'r') as f:
        t = f.read()
    rows = {"text": [t]}
    return pd.DataFrame(rows)
But I am getting the error FileNotFoundError: [Errno 2] No such file or directory:
My understanding is that this is the correct way to access a file in HDFS. Is this a Code Repositories versus Code Workbooks limitation?
This documentation helped me figure it out:
https://www.palantir.com/docs/foundry/code-workbook/transforms-unstructured/
It was actually a pretty small change. If you are using the filesystem() you only need the relative path.
import pandas as pd

def contents_old(pycel_test):
    fs = pycel_test.filesystem()
    filenames = [f.path for f in fs.ls()]
    with fs.open(filenames[0], 'r') as f:
        value = ...
    rows = {"values": [value]}
    return pd.DataFrame(rows)
There is also this option, but I found it 10x slower.
from pyspark.sql import Row

def contents(dataset_with_files):
    fs = dataset_with_files.filesystem()  # This is the FileSystem object.
    MyRow = Row("column")

    def process_file(file_status):
        with fs.open(file_status.path, 'r') as f:
            ...

    rdd = fs.files().rdd
    rdd = rdd.flatMap(process_file)
    df = rdd.toDF()
    return df

Python 3: save a JSON to a CSV file from the CoinGecko API

I am struggling to convert a JSON file to a CSV file. Any help would be appreciated. I am using Python 3.
Code
import json
import urllib.request

url = 'https://api.coingecko.com/api/v3/coins/bitcoin/market_chart?vs_currency=usd&days=1&interval=daily&sparkline=false'
req = urllib.request.Request(url)

##parsing response
myfile=open("coingecko1.csv","w",encoding="utf8")
headers="Prices,MrkCap,TolVol \n"
myfile.write(headers)
r = urllib.request.urlopen(req).read()
cont = json.loads(r.decode('utf-8'))
print (cont)#Just to check json result
for market in cont:
    prices =(cont["prices"])
    market_caps = (cont["market_caps"])
    total_volumes = (cont["total_volumes"])
    content= prices+","+str(market_caps)+","+str(total_volumes)+" \n"
    myfile.write(content)
print("job complete")
Python Result
{'prices': [[1629331200000, 45015.46554608543], [1629361933000, 44618.52978218442]], 'market_caps': [[1629331200000, 847143004614.999], [1629361933000, 837151985590.3453]], 'total_volumes': [[1629331200000, 34668999387.83819], [1629361933000, 33367392889.386738]]}
Traceback (most recent call last):
  File "ma1.py", line 22, in <module>
    content= prices+","+str(market_caps)+","+str(total_volumes)+" \n"
TypeError: can only concatenate list (not "str") to list
CSV Result
Thank You
Your JSON is nested: each value is a list of lists. To read it easily as CSV you must flatten it out.
I've reworked the code to dump to CSV; check below.
import csv
import json
import urllib.request

url = 'https://api.coingecko.com/api/v3/coins/bitcoin/market_chart?vs_currency=usd&days=1&interval=daily&sparkline=false'
req = urllib.request.Request(url)
r = urllib.request.urlopen(req).read()
cont = json.loads(r.decode('utf-8'))

# flatten the JSON data, keyed by timestamp, to read csv easily
flatten_data = {}
for key in cont:
    for value in cont[key]:
        if value[0] not in flatten_data:
            flatten_data[value[0]] = {}
        flatten_data[value[0]].update({key: value[1]})

# write csv with DictWriter (newline='' avoids blank rows on Windows)
with open('coingecko1.csv', 'w', encoding='utf-8', newline='') as csvfile:
    headers = ['Item', 'Prices', 'MrkCap', 'TolVol']
    writer = csv.DictWriter(csvfile, fieldnames=headers)
    writer.writeheader()
    for k, v in flatten_data.items():
        v.update({'Item': k})
        # rename the columns as required
        v['Prices'] = v.pop('prices')
        v['MrkCap'] = v.pop('market_caps')
        v['TolVol'] = v.pop('total_volumes')
        writer.writerow(v)
print("job complete")

JSON to CSV using Python, json.loads and json_normalize

I am trying to convert a JSON file to CSV format using Python. I am using the json.loads() function and then json_normalize() to flatten the objects. I was wondering if there is a better way of doing this.
This is the input file; one row from it:
{"ID": "02","Date": "2019-08-01","Total": 400,"QTY": 12,"Item": [{"NM": "0000000001","CD": "item_CD1","SRL": "25","Disc": [{"CD": "discount_CD1","Amount": 2}],"TxLns": {"TX": [{"TXNM": "000001-001","TXCD": "TX_CD1"}]}},{"NM": "0000000002","CD": "item_CD2","SRL": "26","Disc": [{"CD": "discount_CD2","Amount": 4}],"TxLns": {"TX": [{"TXNM": "000002-001","TXCD": "TX_CD2"}]}},{"NM": "0000000003","CD": "item_CD3","SRL": "27"}],"Cust": {"CustID": 10,"Email": "01#abc.com"},"Address": [{"FirstName": "firstname","LastName": "lastname","Address": "address"}]}
Code
import json
import pandas as pd
from pandas.io.json import json_normalize

data_final = pd.DataFrame()
with open("sample.json") as f:
    for line in f:
        json_obj = json.loads(line)
        ID = json_obj['ID']
        Item = json_obj['Item']
        dataMain = json_normalize(json_obj)
        dataMain = dataMain.drop(['Item', 'Address'], axis=1)
        #dataMain.to_csv("main.csv", index=False)
        dataItem = json_normalize(json_obj, 'Item', ['ID'])
        dataItem = dataItem.drop(['Disc', 'TxLns.TX'], axis=1)
        #dataItem.to_csv("Item.csv", index=False)
        dataDisc = pd.DataFrame()
        dataTx = pd.DataFrame()
        for rt in Item:
            NM = rt['NM']
            rt['ID'] = ID
            if 'Disc' in rt:
                data = json_normalize(rt, 'Disc', ['NM', 'ID'])
                dataDisc = dataDisc.append(data, sort=False)
            if 'TxLns' in rt:
                tx = rt['TxLns']
                tx['NM'] = NM
                tx['ID'] = ID
                if 'TX' in tx:
                    data = json_normalize(tx, 'TX', ['NM', 'ID'])
                    dataTx = dataTx.append(data, sort=False)
        dataDIS = pd.merge(dataItem, dataDisc, on=['NM', 'ID'], how='left')
        dataTX = pd.merge(dataDIS, dataTx, on=['NM', 'ID'], how='left')
        dataAddress = json_normalize(json_obj, 'Address', ['ID'])
        data_IT = pd.merge(dataMain, dataTX, on=['ID'])
        data_merge = pd.merge(data_IT, dataAddress, on=['ID'])
        data_final = data_final.append(data_merge, sort=False)
data_final = data_final.drop_duplicates(keep='first')
data_final.to_csv("data_merged.csv", index=False)
This is the output:
ID,Date,Total,QTY,Cust.CustID,Cust.Email,NM,CD_x,SRL,CD_y,Amount,TXNM,TXCD,FirstName,LastName,Address
02,2019-08-01,400,12,10,01#abc.com,0000000001,item_CD1,25,discount_CD1,2.0,000001-001,TX_CD1,firstname,lastname,address
02,2019-08-01,400,12,10,01#abc.com,0000000002,item_CD2,26,discount_CD2,4.0,000002-001,TX_CD2,firstname,lastname,address
02,2019-08-01,400,12,10,01#abc.com,0000000003,item_CD3,27,,,,,firstname,lastname,address
The code is working fine for now. By "better" I mean:
Is it efficient in terms of time and space complexity? If this code has to process around 10K records in a file, is this an optimized solution?
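One thing that will hurt at 10K records is calling DataFrame.append inside the loop: each call copies the whole accumulated frame, so the loop is quadratic in the number of rows (and append is deprecated in recent pandas). Below is a sketch of the same pipeline that accumulates plain dicts and builds each DataFrame once. It assumes the input shape shown above (key names such as Disc and TxLns are taken from the sample row) and that every file contains at least one Disc and one TX entry; column order may differ from the original output.

import json
import pandas as pd

main_rows, item_rows, disc_rows, tx_rows, addr_rows = [], [], [], [], []

with open("sample.json") as f:
    for line in f:
        obj = json.loads(line)
        rec_id = obj['ID']
        cust = obj.get('Cust', {})
        main_rows.append({'ID': rec_id, 'Date': obj['Date'], 'Total': obj['Total'],
                          'QTY': obj['QTY'], 'Cust.CustID': cust.get('CustID'),
                          'Cust.Email': cust.get('Email')})
        for addr in obj.get('Address', []):
            addr_rows.append({'ID': rec_id, **addr})
        for item in obj.get('Item', []):
            nm = item['NM']
            item_rows.append({'ID': rec_id, 'NM': nm,
                              'CD': item.get('CD'), 'SRL': item.get('SRL')})
            for disc in item.get('Disc', []):
                disc_rows.append({'ID': rec_id, 'NM': nm, **disc})
            for tx in item.get('TxLns', {}).get('TX', []):
                tx_rows.append({'ID': rec_id, 'NM': nm, **tx})

# build each frame exactly once, then merge as before
data_final = (pd.DataFrame(item_rows)
              .merge(pd.DataFrame(disc_rows), on=['NM', 'ID'], how='left')
              .merge(pd.DataFrame(tx_rows), on=['NM', 'ID'], how='left')
              .merge(pd.DataFrame(main_rows), on='ID')
              .merge(pd.DataFrame(addr_rows), on='ID')
              .drop_duplicates(keep='first'))
data_final.to_csv("data_merged.csv", index=False)

Appending to a Python list is amortized O(1), and building DataFrames from lists of dicts at the end is a single allocation instead of one copy per record.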

Scraping Data from JSON

How do I scrape this data,
http://jsonviewer.stack.hu/#http://91.134.133.185:5000/viaroute?loc=25.299919,55.376774&loc=25.298738,55.369181
and extract only "total_time" to a file?
It should be fairly easy to achieve this with a little searching.
You just have to find some modules to work with JSON, dataframes and text files, and learn how to use them.
Steps:
1 - read the JSON data using pandas.read_json()
2 - set data = df['total_time']
3 - write data using pandas.to_csv()
Simple as py.
Documentation:
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html
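Note that total_time sits nested under route_summary, so a flat read will not expose it as a top-level column. A small sketch that flattens the response with pandas.json_normalize first (the output filename is just an example):

import json
import urllib.request

import pandas as pd

url = 'http://91.134.133.185:5000/viaroute?loc=25.299919,55.376774&loc=25.298738,55.369181'
with urllib.request.urlopen(url) as response:
    data = json.load(response)

# nested keys become dotted columns, e.g. route_summary.total_time
df = pd.json_normalize(data)
df[['route_summary.total_time']].to_csv("total_time.csv", index=False)

Alternatively, sticking to the standard library: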
import json
json_string = '''Json data here'''
data = json.loads(json_string)
total_time = data["route_summary"]["total_time"]
f = open("file_name_here.txt", "w+")
f.write(str(total_time))
f.close()
I've written this program for you:
import json, urllib2
url = 'http://91.134.133.185:5000/viaroute?loc=25.299919,55.376774&loc=25.298738,55.369181'
response = urllib2.urlopen(url)
data = json.load(response)
tot_time = str(data['route_summary']['total_time'])
s = tot_time + "\n"
outfile = "C:\\Users\\USER\\Desktop\\outfile.txt"
with open(outfile, "a+") as f:
    f.write(s)
It'll append each observation to the end of outfile.txt
Saving json data to a file and reading that file
import json, urllib2

url = 'http://91.134.133.185:5000/viaroute?loc=25.299919,55.376774&loc=25.298738,55.369181'
response = urllib2.urlopen(url)
data = json.load(response)
outfile = "C:\\Users\\USER\\Desktop\\outfile.txt"

# saving json to file
with open(outfile, "w") as f:
    f.write(str(data))

# reading file with json data
with open(outfile, 'r') as g:
    json_data = g.readline()

print json_data
#Output:
{u'route_geometry': u'{_ego#m}|rhBpBaBvHuC`EuArEUtEtAlDvEnD`MlDvMli#hsEfFzn#QlTgNhwCs#fKwBhF', u'status': 0, u'via_indices': [0, 15], u'route_summary': {u'total_time': 101, u'end_point': u'', u'start_point': u'', u'total_distance': 871}, u'route_name': [u'', u''], u'hint_data': {u'checksum': 326195011, u'locations': [u'AXQDAP____8AAAAABwAAABEAAAAYAAAAIwIAAERwAgAAAAAADgyCAef7TAMCAAEB', u'bOsDAP____8AAAAAAwAAAAcAAADFAQAAFAAAAEJwAgAAAAAANQeCAd3dTAMFAAEB']}, u'via_points': [[25.299982, 55.376873], [25.29874, 55.369179]], u'status_message': u'Found route between points', u'found_alternative': False}
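Note that urllib2 exists only in Python 2, and writing str(data) produces a Python repr rather than valid JSON (hence the u'' prefixes above). A rough Python 3 equivalent, assuming the endpoint is still reachable, that saves real JSON with json.dump:

import json
import urllib.request

url = 'http://91.134.133.185:5000/viaroute?loc=25.299919,55.376774&loc=25.298738,55.369181'
with urllib.request.urlopen(url) as response:
    data = json.load(response)

outfile = "outfile.json"

# saving json to file (json.dump writes valid JSON, unlike str(data))
with open(outfile, "w") as f:
    json.dump(data, f)

# reading the file back
with open(outfile) as g:
    json_data = json.load(g)

print(json_data['route_summary']['total_time'])  # e.g. 101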