I have a Python transform in Code Workbooks that runs this code:
import pandas as pd

def contents(dataset_with_files):
    fs = dataset_with_files.filesystem()
    filenames = [f.path for f in fs.ls()]
    fp = fs.hadoop_path + "/" + filenames[0]
    with open(fp, 'r') as f:
        t = f.read()
    rows = {"text": [t]}
    return pd.DataFrame(rows)
But I am getting the error FileNotFoundError: [Errno 2] No such file or directory:
My understanding was that this is the correct way to access a file in HDFS. Is this a repository versus Code Workbooks limitation?
This documentation helped me figure it out:
https://www.palantir.com/docs/foundry/code-workbook/transforms-unstructured/
It was actually a pretty small change. If you are using filesystem(), you only need the relative path:
import pandas as pd

def contents_old(pycel_test):
    fs = pycel_test.filesystem()
    filenames = [f.path for f in fs.ls()]
    # fs.open() takes the relative path returned by fs.ls()
    with fs.open(filenames[0], 'r') as f:
        value = f.read()
    rows = {"values": [value]}
    return pd.DataFrame(rows)
There is also this option, but I found it 10x slower.
from pyspark.sql import Row

def contents(dataset_with_files):
    fs = dataset_with_files.filesystem()  # This is the FileSystem object.
    MyRow = Row("column")

    def process_file(file_status):
        # Open each file by its relative path and emit one row per file
        with fs.open(file_status.path, 'r') as f:
            yield MyRow(f.read())

    rdd = fs.files().rdd
    rdd = rdd.flatMap(process_file)
    df = rdd.toDF()
    return df
import pandas as pd

df_reader = pd.read_json('Clothing_Shoes_and_Jewelry.json', lines=True, chunksize=1000000)
counter = 1
for chunk in df_reader:
    new_df = pd.DataFrame(chunk[['overall', 'reviewText', 'summary']])
    new_df1 = new_df[new_df['overall'] == 1].sample(4000)
    new_df2 = new_df[new_df['overall'] == 2].sample(4000)
    new_df3 = new_df[new_df['overall'] == 4].sample(4000)
    new_df4 = new_df[new_df['overall'] == 5].sample(4000)
    new_df5 = new_df[new_df['overall'] == 3].sample(8000)
    new_df6 = pd.concat([new_df1, new_df2, new_df3, new_df4, new_df5], axis=0, ignore_index=True)
    new_df6.to_csv(str(counter) + '.csv', index=False)
    counter = counter + 1
from glob import glob

# glob retrieves the files or pathnames matching a pattern
filenames = glob('*.csv')
# ['1.csv', '2.csv', .........., '33.csv']
dataframes = []
for f in filenames:
    dataframes.append(pd.read_csv(f))
finaldf = pd.concat(dataframes, axis=0, ignore_index=True)
finaldf.to_csv("balanced_reviews.csv", index=False)
# ---------------------------------
df = pd.read_csv('balanced_reviews.csv')
I get a ValueError: Expected object or value when getting a chunk from df_reader
The error usually occurs when either the file is not referenced correctly or the JSON itself is malformed. As @Cimbali mentioned above, if you are allowed to share a sample of the JSON, that would help. In the meantime, check the answers to this related Stack Overflow question: (ValueError: Expected object or value when reading json as pandas dataframe)
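If you want to find the offending input yourself, a minimal check (assuming the file is newline-delimited JSON, which is what lines=True expects) is to parse it line by line and report the first line that fails:

import json

with open('Clothing_Shoes_and_Jewelry.json') as f:
    for lineno, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue  # blank lines are not valid JSON documents
        try:
            json.loads(line)
        except ValueError as e:
            print(f"line {lineno} is not valid JSON: {e}")
            break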
I am trying to convert a JSON file to CSV format using Python. I am using json.loads() and then json_normalize() to flatten the objects. I was wondering if there is a better way of doing this.
This is the input file; one row from it:
{"ID": "02","Date": "2019-08-01","Total": 400,"QTY": 12,"Item": [{"NM": "0000000001","CD": "item_CD1","SRL": "25","Disc": [{"CD": "discount_CD1","Amount": 2}],"TxLns": {"TX": [{"TXNM": "000001-001","TXCD": "TX_CD1"}]}},{"NM": "0000000002","CD": "item_CD2","SRL": "26","Disc": [{"CD": "discount_CD2","Amount": 4}],"TxLns": {"TX": [{"TXNM": "000002-001","TXCD": "TX_CD2"}]}},{"NM": "0000000003","CD": "item_CD3","SRL": "27"}],"Cust": {"CustID": 10,"Email": "01#abc.com"},"Address": [{"FirstName": "firstname","LastName": "lastname","Address": "address"}]}
Code
import json
import pandas as pd
from pandas.io.json import json_normalize

data_final = pd.DataFrame()
with open("sample.json") as f:
    for line in f:
        json_obj = json.loads(line)
        ID = json_obj['ID']
        Item = json_obj['Item']
        dataMain = json_normalize(json_obj)
        dataMain = dataMain.drop(['Item', 'Address'], axis=1)
        #dataMain.to_csv("main.csv", index=False)
        dataItem = json_normalize(json_obj, 'Item', ['ID'])
        dataItem = dataItem.drop(['Disc', 'TxLns.TX'], axis=1)
        #dataItem.to_csv("Item.csv", index=False)
        dataDisc = pd.DataFrame()
        dataTx = pd.DataFrame()
        for rt in Item:
            NM = rt['NM']
            rt['ID'] = ID
            if 'Disc' in rt:
                data = json_normalize(rt, 'Disc', ['NM', 'ID'])
                dataDisc = dataDisc.append(data, sort=False)
            if 'TxLns' in rt:
                tx = rt['TxLns']
                tx['NM'] = NM
                tx['ID'] = ID
                if 'TX' in tx:
                    data = json_normalize(tx, 'TX', ['NM', 'ID'])
                    dataTx = dataTx.append(data, sort=False)
        dataDIS = pd.merge(dataItem, dataDisc, on=['NM', 'ID'], how='left')
        dataTX = pd.merge(dataDIS, dataTx, on=['NM', 'ID'], how='left')
        dataAddress = json_normalize(json_obj, 'Address', ['ID'])
        data_IT = pd.merge(dataMain, dataTX, on=['ID'])
        data_merge = pd.merge(data_IT, dataAddress, on=['ID'])
        data_final = data_final.append(data_merge, sort=False)

data_final = data_final.drop_duplicates(keep='first')
data_final.to_csv("data_merged.csv", index=False)
This is the output:
ID,Date,Total,QTY,Cust.CustID,Cust.Email,NM,CD_x,SRL,CD_y,Amount,TXNM,TXCD,FirstName,LastName,Address
02,2019-08-01,400,12,10,01#abc.com,0000000001,item_CD1,25,discount_CD1,2.0,000001-001,TX_CD1,firstname,lastname,address
02,2019-08-01,400,12,10,01#abc.com,0000000002,item_CD2,26,discount_CD2,4.0,000002-001,TX_CD2,firstname,lastname,address
02,2019-08-01,400,12,10,01#abc.com,0000000003,item_CD3,27,,,,,firstname,lastname,address
The code is working fine for now. By "better" I mean:
Is it efficient in terms of time and space complexity? If this code has to process around 10K records in a file, is this an optimized solution?
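Regarding the 10K-record question: the main scalability issue in this approach is DataFrame.append inside the loop, which copies the accumulated frame on every call, making the loop quadratic. The usual alternative is to collect the per-record pieces in a plain list and concatenate once at the end. A minimal sketch of that pattern (toy frames for illustration, not the full pipeline above):

import pandas as pd

frames = []
for i in range(10000):  # stands in for the per-line loop
    frames.append(pd.DataFrame({'ID': ['02'], 'record': [i]}))
data_final = pd.concat(frames, ignore_index=True, sort=False)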
I have to open a lot of JSON files in Python. The following code works fine for a small number of JSON files, but I have already been waiting six hours now and it's still not done. I'm sure there must be a faster way than this.
import os
import pandas as pd

base_dir = 'All Datasets EDIT/airlinesjson'
json_data_firstmonth2 = pd.DataFrame()
json_data_fmnoreset = pd.DataFrame()
for file in os.listdir(base_dir):
    if 'json' in file:
        json_path = os.path.join(base_dir, file)
        json_data = pd.read_json(json_path, lines=True)
        json_data_fmnoreset = pd.concat([json_data_fmnoreset, json_data], sort=False)
json_data_firstmonth2 = json_data_fmnoreset.reset_index()
Try this piece of code instead:
import os
import json

json_list = [f for f in os.listdir(base_dir) if f.endswith('.json')]
for i in json_list:
    with open(os.path.join(base_dir, i)) as json_file:
        data = json.load(json_file)
        ...
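Most of the waiting in the original loop likely comes from pd.concat being called once per file, which recopies everything accumulated so far. A sketch of the usual fix, reading every file first and concatenating once (same variable names as the question):

import os
import pandas as pd

base_dir = 'All Datasets EDIT/airlinesjson'
json_list = [f for f in os.listdir(base_dir) if f.endswith('.json')]
frames = [pd.read_json(os.path.join(base_dir, f), lines=True) for f in json_list]
json_data_fmnoreset = pd.concat(frames, sort=False)
json_data_firstmonth2 = json_data_fmnoreset.reset_index()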
How do I create a JSON object/CSV from the variable data below?
out = [['core java'],['angular js']]
skills = 'Java'
Can someone please tell me how I can get a JSON object/CSV as shown in the expected output?
Expected Output:
Java
0 core java
1 angular js
Try this example:
import json
data = {}
data['dynamic_col_name'] = 'dynamic_upcoming_value'
json_data = json.dumps(data)
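Applied to the variables in the question, that pattern would look like this (one key, the skill name, mapping to the list of values):

import json

out = [['core java'], ['angular js']]
skills = 'Java'

data = {skills: [row[0] for row in out]}
json_data = json.dumps(data)
print(json_data)  # {"Java": ["core java", "angular js"]}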
Use the csv module.
With a list:
import csv

out = [['Java'], ['core java'], ['angular js']]
with open('some.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(out)
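Reading some.csv back (using pandas just to display it) shows the shape from the expected output:

import pandas as pd

print(pd.read_csv('some.csv'))
#          Java
# 0   core java
# 1  angular js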
With a dict:
import csv

out = [['core java'], ['angular js']]
skills = ['Java']
with open('names.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=skills)
    writer.writeheader()
    rows = [{skills[idx]: cell for idx, cell in enumerate(row)} for row in out]
    writer.writerows(rows)
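names.csv ends up with the same shape: a Java header row from writeheader(), then one skill per line, so it loads back into the same two-row frame shown in the expected output.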