I have a Python transform in Code Workbooks that runs this code:
import pandas as pd

def contents(dataset_with_files):
    fs = dataset_with_files.filesystem()
    filenames = [f.path for f in fs.ls()]
    fp = fs.hadoop_path + "/" + filenames[0]
    with open(fp, 'r') as f:
        t = f.read()
    rows = {"text": [t]}
    return pd.DataFrame(rows)
But I am getting the error FileNotFoundError: [Errno 2] No such file or directory:
My understanding was that this is the correct way to access a file in HDFS. Is this a repository versus Code Workbooks limitation?
This documentation helped me figure it out:
https://www.palantir.com/docs/foundry/code-workbook/transforms-unstructured/
It was actually a pretty small change. If you are using filesystem(), you only need the relative path:
import pandas as pd

def contents_old(pycel_test):
    fs = pycel_test.filesystem()
    filenames = [f.path for f in fs.ls()]
    # fs.open() takes the relative path returned by fs.ls()
    with fs.open(filenames[0], 'r') as f:
        value = f.read()
    rows = {"values": [value]}
    return pd.DataFrame(rows)
There is also this option, but I found it 10x slower.
from pyspark.sql import Row

def contents(dataset_with_files):
    fs = dataset_with_files.filesystem()  # This is the FileSystem object.
    MyRow = Row("column")

    def process_file(file_status):
        # Open each file by its relative path and emit one row per file
        with fs.open(file_status.path, 'r') as f:
            yield MyRow(f.read())

    rdd = fs.files().rdd
    rdd = rdd.flatMap(process_file)
    df = rdd.toDF()
    return df
import pandas as pd

df_reader = pd.read_json('Clothing_Shoes_and_Jewelry.json', lines=True, chunksize=1000000)
counter = 1
for chunk in df_reader:
    new_df = pd.DataFrame(chunk[['overall', 'reviewText', 'summary']])
    new_df1 = new_df[new_df['overall'] == 1].sample(4000)
    new_df2 = new_df[new_df['overall'] == 2].sample(4000)
    new_df3 = new_df[new_df['overall'] == 4].sample(4000)
    new_df4 = new_df[new_df['overall'] == 5].sample(4000)
    new_df5 = new_df[new_df['overall'] == 3].sample(8000)
    new_df6 = pd.concat([new_df1, new_df2, new_df3, new_df4, new_df5], axis=0, ignore_index=True)
    new_df6.to_csv(str(counter) + '.csv', index=False)
    counter = counter + 1
from glob import glob

# glob retrieves the files or pathnames matching a pattern
filenames = glob('*.csv')
# ['1.csv', '2.csv', .........., '33.csv']
dataframes = []
for f in filenames:
    dataframes.append(pd.read_csv(f))
finaldf = pd.concat(dataframes, axis=0, ignore_index=True)
finaldf.to_csv("balanced_reviews.csv", index=False)
# ---------------------------------
df = pd.read_csv('balanced_reviews.csv')
I get a ValueError: Expected object or value when getting a chunk from df_reader
The error usually occurs when either the file is not referenced correctly or the JSON itself is malformed. As @Cimbali mentioned above, if you are allowed to share a sample of the JSON, that would help. In the meantime, check the answers to this related Stack Overflow question: (ValueError: Expected object or value when reading json as pandas dataframe)
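If you want to find the offending input yourself, a minimal check (assuming the file is newline-delimited JSON, which is what lines=True expects) is to parse it line by line and report the first line that fails:

import json

with open('Clothing_Shoes_and_Jewelry.json') as f:
    for lineno, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue  # blank lines are not valid JSON documents
        try:
            json.loads(line)
        except ValueError as e:
            print(f"line {lineno} is not valid JSON: {e}")
            break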
I am trying to convert a JSON file to CSV format using Python. I am using json.loads() and then json_normalize() to flatten the objects. I was wondering if there is a better way of doing this.
This is the input file; one row from it:
{"ID": "02","Date": "2019-08-01","Total": 400,"QTY": 12,"Item": [{"NM": "0000000001","CD": "item_CD1","SRL": "25","Disc": [{"CD": "discount_CD1","Amount": 2}],"TxLns": {"TX": [{"TXNM": "000001-001","TXCD": "TX_CD1"}]}},{"NM": "0000000002","CD": "item_CD2","SRL": "26","Disc": [{"CD": "discount_CD2","Amount": 4}],"TxLns": {"TX": [{"TXNM": "000002-001","TXCD": "TX_CD2"}]}},{"NM": "0000000003","CD": "item_CD3","SRL": "27"}],"Cust": {"CustID": 10,"Email": "01#abc.com"},"Address": [{"FirstName": "firstname","LastName": "lastname","Address": "address"}]}
Code
import json
import pandas as pd
from pandas.io.json import json_normalize

data_final = pd.DataFrame()
with open("sample.json") as f:
    for line in f:
        json_obj = json.loads(line)
        ID = json_obj['ID']
        Item = json_obj['Item']
        dataMain = json_normalize(json_obj)
        dataMain = dataMain.drop(['Item', 'Address'], axis=1)
        #dataMain.to_csv("main.csv", index=False)
        dataItem = json_normalize(json_obj, 'Item', ['ID'])
        dataItem = dataItem.drop(['Disc', 'TxLns.TX'], axis=1)
        #dataItem.to_csv("Item.csv", index=False)
        dataDisc = pd.DataFrame()
        dataTx = pd.DataFrame()
        for rt in Item:
            NM = rt['NM']
            rt['ID'] = ID
            if 'Disc' in rt:
                data = json_normalize(rt, 'Disc', ['NM', 'ID'])
                dataDisc = dataDisc.append(data, sort=False)
            if 'TxLns' in rt:
                tx = rt['TxLns']
                tx['NM'] = NM
                tx['ID'] = ID
                if 'TX' in tx:
                    data = json_normalize(tx, 'TX', ['NM', 'ID'])
                    dataTx = dataTx.append(data, sort=False)
        dataDIS = pd.merge(dataItem, dataDisc, on=['NM', 'ID'], how='left')
        dataTX = pd.merge(dataDIS, dataTx, on=['NM', 'ID'], how='left')
        dataAddress = json_normalize(json_obj, 'Address', ['ID'])
        data_IT = pd.merge(dataMain, dataTX, on=['ID'])
        data_merge = pd.merge(data_IT, dataAddress, on=['ID'])
        data_final = data_final.append(data_merge, sort=False)

data_final = data_final.drop_duplicates(keep='first')
data_final.to_csv("data_merged.csv", index=False)
This is the output:
ID,Date,Total,QTY,Cust.CustID,Cust.Email,NM,CD_x,SRL,CD_y,Amount,TXNM,TXCD,FirstName,LastName,Address
02,2019-08-01,400,12,10,01#abc.com,0000000001,item_CD1,25,discount_CD1,2.0,000001-001,TX_CD1,firstname,lastname,address
02,2019-08-01,400,12,10,01#abc.com,0000000002,item_CD2,26,discount_CD2,4.0,000002-001,TX_CD2,firstname,lastname,address
02,2019-08-01,400,12,10,01#abc.com,0000000003,item_CD3,27,,,,,firstname,lastname,address
The code is working fine for now. By "better" I mean:
Is it efficient in terms of time and space complexity? If this code has to process around 10K records in a file, is this an optimized solution?
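Regarding the 10K-record question: the main scalability issue in this approach is DataFrame.append inside the loop, which copies the accumulated frame on every call, making the loop quadratic. The usual alternative is to collect the per-record pieces in a plain list and concatenate once at the end. A minimal sketch of that pattern (toy frames for illustration, not the full pipeline above):

import pandas as pd

frames = []
for i in range(10000):  # stands in for the per-line loop
    frames.append(pd.DataFrame({'ID': ['02'], 'record': [i]}))
data_final = pd.concat(frames, ignore_index=True, sort=False)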
I have to open a lot of JSON files in Python. The following code works fine for a small number of JSON files, but I have already been waiting six hours now and it's still not done. I'm sure there must be a faster way than this.
import os
import pandas as pd

base_dir = 'All Datasets EDIT/airlinesjson'
json_data_firstmonth2 = pd.DataFrame()
json_data_fmnoreset = pd.DataFrame()
for file in os.listdir(base_dir):
    if 'json' in file:
        json_path = os.path.join(base_dir, file)
        json_data = pd.read_json(json_path, lines=True)
        json_data_fmnoreset = pd.concat([json_data_fmnoreset, json_data], sort=False)
json_data_firstmonth2 = json_data_fmnoreset.reset_index()
Try this piece of code instead:
import os
import json

json_list = [f for f in os.listdir(base_dir) if f.endswith('.json')]
for i in json_list:
    with open(os.path.join(base_dir, i)) as json_file:
        data = json.load(json_file)
        ...
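Most of the waiting in the original loop likely comes from pd.concat being called once per file, which recopies everything accumulated so far. A sketch of the usual fix, reading every file first and concatenating once (same variable names as the question):

import os
import pandas as pd

base_dir = 'All Datasets EDIT/airlinesjson'
json_list = [f for f in os.listdir(base_dir) if f.endswith('.json')]
frames = [pd.read_json(os.path.join(base_dir, f), lines=True) for f in json_list]
json_data_fmnoreset = pd.concat(frames, sort=False)
json_data_firstmonth2 = json_data_fmnoreset.reset_index()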
How do I create a JSON object/CSV from the variable data below?
out = [['core java'],['angular js']]
skills = 'Java'
Can someone please tell me how I can get a JSON object/CSV as shown in the expected output?
Expected Output:
Java
0 core java
1 angular js
Try this example:
import json
data = {}
data['dynamic_col_name'] = 'dynamic_upcoming_value'
json_data = json.dumps(data)
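Applied to the variables in the question, that pattern would look like this (one key, the skill name, mapping to the list of values):

import json

out = [['core java'], ['angular js']]
skills = 'Java'

data = {skills: [row[0] for row in out]}
json_data = json.dumps(data)
print(json_data)  # {"Java": ["core java", "angular js"]}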
Use the csv module.
With a list:
import csv

out = [['Java'], ['core java'], ['angular js']]
with open('some.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(out)
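Reading some.csv back (using pandas just to display it) shows the shape from the expected output:

import pandas as pd

print(pd.read_csv('some.csv'))
#          Java
# 0   core java
# 1  angular js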
With a dict:
import csv

out = [['core java'], ['angular js']]
skills = ['Java']
with open('names.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=skills)
    writer.writeheader()
    rows = [{skills[idx]: cell for idx, cell in enumerate(row)} for row in out]
    writer.writerows(rows)
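names.csv ends up with the same shape: a Java header row from writeheader(), then one skill per line, so it loads back into the same two-row frame shown in the expected output.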