I have the following folder structure:
Directory
- Subdirectory 1:
file.json
- Subdirectory 2:
file.json
- Subdirectory 3:
file.json
- Subdirectory 4:
file.json
How do I read these JSON files using Pandas?
Try this code:
import pandas as pd
from pathlib import Path
# Recursively collect every *.json file below "Directory" (sorted so the
# processing order is reproducible) and parse each one with pandas.
files = sorted(Path("Directory").glob("**/*.json"))

# Keep every frame: rebinding `df` alone would leave only the last file's data.
dfs = []
for file in files:
    df = pd.read_json(file)
    dfs.append(df)
To learn more about converting JSON string to Pandas object:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html
You could do the following:
import glob
import os

import pandas as pd

working_directory = os.getcwd()

# Full paths of the immediate sub-directories of the working directory.
sub_directories = [
    os.path.join(working_directory, name)
    for name in os.listdir(working_directory)
    if os.path.isdir(os.path.join(working_directory, name))
]

# Globbing with the full pattern means the process never has to chdir into
# each sub-directory (and back again) just to list its JSON files.
all_json_files = []
for sub_dir in sub_directories:
    all_json_files.extend(sorted(glob.glob(os.path.join(sub_dir, "*.json"))))

# One DataFrame per JSON file, in the same order as all_json_files.
list_of_dfs = [pd.read_json(x) for x in all_json_files]
From there, if all json files have the same structure, you could concatenate them to get one single dataframe:
# Combine the per-file frames from list_of_dfs into a single DataFrame.
final_df = pd.concat(list_of_dfs)
Related
I'm trying to extract some metadata and store it in a JSON file using Exiftool via Python.
If I run the following command (according to the documentation) in the CMD it works fine, generating a temp.json file:
exiftool -filename -createdate -json C:/Users/XXX/Desktop/test_folder > C:/Users/XXX/Desktop/test_folder/temp.json
When managing Exiftool from Python the data is extracted correctly but no JSON file is generated.
import os
import subprocess
# Attempt from the question: walk root_path and call exiftool once per file.
# The redirection problem is what the question asks about, so it is left as
# posted and only annotated.
root_path = 'C:/Users/XXX/Desktop/test_folder'
for path, dirs, files in os.walk(root_path):
    for file in files:
        file_path = path + os.sep + file
        exiftool_exe = 'C:/Users/XXX/Desktop/exiftool.exe'
        json_path = path + os.sep + 'temp.json'
        # NOTE: '>' is shell redirection syntax. Inside an argument list no
        # shell interprets it, so the whole "<dir> > <json>" text reaches
        # exiftool as one file name -- hence "File not found".
        export = os.path.join(path + ' > ' + json_path)
        exiftool_command = [exiftool_exe, '-filename', '-createdate', '-json', export]
        # stdout is not captured by this call, so process.stdout is None.
        process = subprocess.run(exiftool_command)
        print(process.stdout)
When I run the code it shows the error:
Error: File not found - C:/Users/XXX/Desktop/test_folder > C:/Users/XXX/Desktop/test_folder/temp.json
What am I missing, any ideas on how to get it to work? Thanks!
Edit with the solution:
I'm leaving the fixed code here in case it helps someone else:
import os
import subprocess
# Extract file name and creation date for every file under root_path and let
# exiftool collect the results in a single JSON file.
root_path = 'C:/Users/XXX/Desktop/test_folder'
exiftool_exe = 'C:/Users/XXX/Desktop/exiftool.exe'
# One shared output file in the root folder; it does not change per file, so
# it is computed once outside the loops.
export = os.path.join(root_path, 'temp.json')

for path, dirs, files in os.walk(root_path):
    for file in files:
        file_path = os.path.join(path, file)
        # Shell redirection ('>') is not available in an argument list, so the
        # output file is handed to exiftool through its -W (-tagOut) option.
        exiftool_command = [exiftool_exe, file_path, '-filename', '-createdate', '-json', '-W+!', export]
        # Capture stdout as text; without capturing, process.stdout is None.
        process = subprocess.run(exiftool_command, capture_output=True, text=True)
        print(process.stdout)
Thanks to StarGeek!
I believe the problem is that file redirection is a property of the command line and isn't available with subprocess.run. See this StackOverflow question.
For an exiftool solution, you would use the -W (-tagOut) option, specifically -W+! C:/Users/XXX/Desktop/test_folder/temp.json. See Note #3 under that link.
I have a folder with lots of *.dat files (which were created with the program IDL). I am able to take one single file, convert it to a *.csv file and save it in a different (already existing) folder:
import idlsave
import csv
# Read one IDL save file and write its "raw" variable to a CSV file.
input_file = idlsave.read("C:/Users/RAW/06211714.dat")
n = input_file["raw"]
# newline='' leaves line-ending handling to the csv writer.
with open("C:/Users/CSV/06211714.csv", "w", newline='') as f:
    writer = csv.writer(f)
    # One CSV row per record of the array.
    writer.writerows(n)
The line input_file = idlsave.read("C:/Users/RAW/06211714.dat") shows the following output:
Available variables: raw class ['numpy.recarray']
So, this works fine for just taking one file, but I am looking for a way to take all *.dat files at once and convert each of them to a *.csv file with their original name.
I was thinking of something like this, but it didn't work:
import glob
# Attempt from the question (reported as not working). The logic is left as
# posted; the notes mark the two problems.
for filename in glob.glob("C:/Users/RAW/*.dat"):
    # NOTE: filename is already one complete path string, so this inner loop
    # walks over its individual characters instead of over files.
    for element in filename:
        i = idlsave.read(element)
        n = i["raw"]
        # NOTE: open() does not expand wildcards; "*.csv" is taken literally
        # as the output file name.
        with open("C:/Users/CSV/*.csv", "w", newline='') as f:
            writer = csv.writer(f)
            writer.writerows(n)
Can someone please give me some advice?
Thanks.
import csv
import idlsave
from os import listdir
from os.path import isfile, join, splitext
# Convert every *.dat file in dat_folder to a CSV file of the same base name
# in csv_folder.
dat_folder = "/folder/to/dat/files/"
csv_folder = "/folder/to/save/new/csv/files/"

# Names of the regular files (no sub-directories) inside dat_folder.
onlyfilenames = [f for f in listdir(dat_folder) if isfile(join(dat_folder, f))]

for fullfilename in onlyfilenames:
    file_name, file_extension = splitext(fullfilename)
    if file_extension != ".dat":
        continue
    # join() works whether or not dat_folder ends with a path separator.
    input_file = idlsave.read(join(dat_folder, fullfilename))
    n = input_file["raw"]
    # newline='' leaves line-ending handling to the csv writer.
    with open(join(csv_folder, file_name + ".csv"), "w", newline='') as f:
        writer = csv.writer(f)
        writer.writerows(n)
Hello again StackExchange!
Attempting to print all files in a directory but this time I only want to print all of the .csv files that have the string ..."AMX_error"...csv somewhere in the filename. I have the "all .csv" working, but am missing that bit of search logic.
import glob
import pandas as pd
# A raw string literal cannot end with a single backslash, so the folder is
# written without the trailing separator; the "/" in the pattern supplies it.
path = r'C:\Users\Desktop\Experiment'
# Wanted (pseudo-code from the question, not valid Python): keep only the
# files whose name contains 'AMX_error', i.e. something like
#   glob.glob(path + "/*.csv") & (search filename 'AMX_error' = true)
# The working part below still lists every .csv file.
allFiles = glob.glob(path + "/*.csv")
for filename in allFiles:
    print(filename)
    # rest of code..
What is the notation to search for a string in a filename? Thanks!
Unless you have a reason for filtering the files first, you can simply check that the string of interest is in the filename while you're in the for loop.
import glob
import os

import pandas as pd

path = r'C:\Users\Desktop\Experiment'

# List every .csv file in the folder; the filtering happens inside the loop.
allFiles = glob.glob(path + "/*.csv")
for filename in allFiles:
    # glob returns the folder together with the file name; test only the
    # file's own name so a matching folder name cannot produce false hits.
    if 'AMX_error' in os.path.basename(filename):
        print(filename)
I am trying to open JSON files located in a directory other than the current working directory (cwd). My setting: Python3.5 on Windows (using Anaconda).
from pathlib import *
import json
# Code from the question, left as posted: collect the directory entries, then
# try to load each one as JSON.
path = Path("C:/foo/bar")
filelist = []
for f in path.iterdir():
    filelist.append(f)
for file in filelist:
    # NOTE: file.name is only the final path component (e.g. "0001.json"), so
    # open() looks for it in the current working directory -- this is the
    # FileNotFoundError reported in the question.
    with open(file.name) as data_file:
        data = json.load(data_file)
In this setting I have these values:
file >> C:\foo\bar\0001.json
file.name >> 0001.json
However, I get the following error message:
---> 13 with open(file.name) as data_file:
14 data = json.load(data_file)
FileNotFoundError: [Errno 2] No such file or directory: '0001.json'
Here is what I tried so far:
Use .joinpath() to add the directory to the file name in the open command:
# Attempt from the question: joinpath() returns a Path object, which is handed
# straight to open(); on the Python 3.5 setup described this raised a TypeError.
with open(path.joinpath(file.name)) as data_file:
    data = json.load(data_file)
TypeError: invalid file: WindowsPath:('C:/foo/bar/0001.json')
Used .resolve() as that works for me to load CSV files into Pandas. Did not work here.
# Attempt from the question (reported as not working).
for file in filelist:
    # resolve() still yields a Path object, so open() again receives a Path
    # rather than a string.
    j = Path(path, file.name).resolve()
    with open(j) as data_file:
        data = json.load(data_file)
Since I'm on Windows write path as (and yes, the file is in that directory):
path = Path("C:\\foo\\bar")  # resulted in the same FileNotFoundError as above
Instantiate path like this:
path = WindowsPath("C:/foo/bar")
# Same TypeError as above, with both '\\' and '/' as the separator.
The accepted answer has a lot of redundancy: it re-collects the generator into a list and mixes a plain open() with statement with pathlib.Path.
pathlib.Path is an awesome solution for handling paths, especially if we want to write scripts that work on both Linux and Windows.
# modules
from pathlib import Path
import json
# static values
JSON_SUFFIXES = [".json", ".js", ".other_suffix"]
folder_path = Path("C:/users/user/documents")

# Load every file in folder_path whose suffix marks it as JSON.
for file_path in folder_path.iterdir():
    # is_file() skips directories, which cannot be read even if their name
    # happens to end in one of the suffixes.
    if file_path.is_file() and file_path.suffix in JSON_SUFFIXES:
        # read_bytes() opens, reads and closes the file in one call.
        data = json.loads(file_path.read_bytes())
Just adding modification for new users. pathlib.Path works with Python3.
Complete solution; thanks #eryksun:
from pathlib import *
import json
# Load every entry of the directory as JSON.
path = Path("C:/foo/bar")
# iterdir() yields full paths (directory + name); materialize them once.
filelist = list(path.iterdir())
for file in filelist:
    # str(file) passes the complete path as a plain string, which open()
    # accepts on every Python 3 version.
    with open(str(file)) as data_file:
        data = json.load(data_file)
This line works as well:
with file.open() as data_file:
I have used the following command for feature extraction:
./build/tools/extract_features.bin models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel examples/_temp/imagenet_val.prototxt fc7 examples/_temp/features 10 leveldb GPU 0
Then I used the following code to read the leveldb features:
https://15519285443829437810.googlegroups.com/attach/b71d5c99c447fc2a/main.cpp?part=0.1&view=1&vt=ANaJVrHM26ydsY5Z2ognvhBaFtDzXnE_SiVf19DLkrNpf9Q34t5O4bJWy0nHH_HBnBAVx5wZusXd1joX93JBK0_r7XKEIc-5odz9_HPHV1RUo8MD3zNHgoY
everything is OK and I have one .ldb file now.
How can I read this .ldb file? Can I convert it to a .txt file?
I have used the following code :
import caffe
import leveldb
import numpy as np
from caffe.proto import caffe_pb2
# Iterate over the extracted-features LevelDB and turn each stored Datum into
# a NumPy array.
db = leveldb.LevelDB('/home/deep/rahim/caffe-master/examples/_temp/features')
datum = caffe_pb2.Datum()
for key, value in db.RangeIter():
    # Each value is a serialized Datum message.
    datum.ParseFromString(value)
    label = datum.label
    data = caffe.io.datum_to_array(datum)
    # Move axis 0 to the end -- presumably channels-first to channels-last;
    # TODO confirm against the Datum layout.
    image = np.transpose(data, (1, 2, 0))
    # np.save appends ".npy", so this writes "feature.txt.npy".
    # NOTE(review): the same name is used on every iteration, so only the
    # last entry remains on disk after the loop.
    np.save('feature.txt', image)
Then I pass feature.txt.npy to the following code, which converts .npy to .txt:
import struct
import numpy as np
import os
def parseNPY(path, fileJustName, fmt="%i", delimiter="\t"):
    """Split a saved NumPy array into one text file per leading-axis entry.

    Loads ``<path>/<fileJustName>.npy`` and writes entry ``m`` along the first
    axis to ``<path>/<fileJustName>-<m>.txt`` using ``numpy.savetxt``.

    Args:
        path: Directory that contains the ``.npy`` file; the text files are
            written to the same directory.
        fileJustName: File name without the ``.npy`` extension.
        fmt: Number format passed to ``numpy.savetxt``. The default ``"%i"``
            writes integers, so fractional values lose their decimals; pass
            e.g. ``"%.6f"`` to keep them.
        delimiter: Column separator passed to ``numpy.savetxt``.
    """
    # load from the file
    inputFile = os.path.join(path, fileJustName + ".npy")
    matrices = np.load(inputFile)

    outputfile = os.path.join(path, fileJustName)
    for m, matrix in enumerate(matrices):
        # file name for this matrix
        outFileFull = outputfile + "-" + str(m) + ".txt"
        # output matrix to a numbered file
        np.savetxt(outFileFull, matrix, fmt=fmt, delimiter=delimiter)
# Walk the whole tree below mypath and convert every .npy file found.
mypath = "/home/deep/rahim/caffe-master/python/"
for path, paths, filenames in os.walk(mypath):
    # translate all filenames.
    for filename in filenames:
        fileJustName, fileExtension = os.path.splitext(filename)
        if fileExtension == ".npy":
            # Show which file is being converted (path without extension).
            print(os.path.join(path, fileJustName))
            parseNPY(path, fileJustName)