Python 3 pandas directory search for a string in filename - csv

Hello again StackExchange!
Attempting to print all files in a directory, but this time I only want to print the .csv files that have the string "AMX_error" somewhere in the filename. I have the "all .csv files" part working, but am missing that bit of search logic.
import glob
import pandas as pd

path = r'C:\Users\Desktop\Experiment'
# Following command to search for string in the filename
allFiles = glob.glob(path + "/*.csv") & (search filename 'AMX_error' = true)  # pseudocode: the bit I'm missing
for filename in allFiles:
    print(filename)
    # rest of code..
What is the notation to search for a string in a filename? Thanks!

Unless you have a reason for filtering the files first, you can simply check that the string of interest is in the filename while you're in the for loop.
import glob
import pandas as pd

path = r'C:\Users\Desktop\Experiment'
# Following command to search for string in the filename
allFiles = glob.glob(path + "/*.csv")
for filename in allFiles:
    if 'AMX_error' in filename:
        print(filename)
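If you would rather filter up front, glob patterns can do the substring match themselves, since * matches any run of characters; a minimal sketch of the same idea:

import glob

path = r'C:\Users\Desktop\Experiment'
# '*AMX_error*.csv' matches any .csv whose name contains 'AMX_error'
for filename in glob.glob(path + "/*AMX_error*.csv"):
    print(filename)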

Related

Python read JSON files from all sub-directories

I have the following folder structure:
Directory
- Subdirectory 1:
    file.json
- Subdirectory 2:
    file.json
- Subdirectory 3:
    file.json
- Subdirectory 4:
    file.json
How do I read these JSON files using Pandas?
Try this code:
import pandas as pd
from pathlib import Path

files = Path("Directory").glob("**/*.json")
for file in files:
    df = pd.read_json(file)
To learn more about converting a JSON string to a Pandas object:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html
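Note that df is re-bound on every pass through the loop above, so only the last file's data survives. If the files share the same structure, you can collect and concatenate them instead; a small sketch of that variation:

import pandas as pd
from pathlib import Path

files = Path("Directory").glob("**/*.json")
dfs = [pd.read_json(file) for file in files]    # one DataFrame per file
final_df = pd.concat(dfs, ignore_index=True)    # one combined DataFrame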
You could do the following:
import glob, os
import pandas as pd

working_directory = os.getcwd()
sub_directories = [working_directory + "/" + x for x in os.listdir(working_directory) if os.path.isdir(working_directory + "/" + x)]

all_json_files = []
for sub_dir in sub_directories:
    os.chdir(sub_dir)
    for file in glob.glob("*.json"):
        all_json_files.append(sub_dir + "/" + file)
# Get back to original working directory
os.chdir(working_directory)

list_of_dfs = [pd.read_json(x) for x in all_json_files]
From there, if all the JSON files have the same structure, you can concatenate them to get one single DataFrame:
final_df = pd.concat(list_of_dfs)

Python: How to save *.dat files as *.csv files to a new folder

I have a folder with lots of *.dat files (which were created with the program IDL). I am able to take one single file, convert it to a *.csv file and save it in a different (already existing) folder:
import idlsave
import csv

input_file = idlsave.read("C:/Users/RAW/06211714.dat")
n = input_file["raw"]
with open("C:/Users/CSV/06211714.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerows(n)
The line input_file = idlsave.read("C:/Users/RAW/06211714.dat") shows the following output:
Available variables: raw class ['numpy.recarray']
So, this works fine for just taking one file, but I am looking for a way to take all *.dat files at once and convert each of them to a *.csv file with their original name.
I was thinking of something like this, but it didn't work:
import glob
for filename in glob.glob("C:/Users/RAW/*.dat"):
    for element in filename:
        i = idlsave.read(element)
        n = i["raw"]
        with open("C:/Users/CSV/*.csv", "w", newline='') as f:
            writer = csv.writer(f)
            writer.writerows(n)
Can someone please give me some advice?
Thanks.
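Two things go wrong in that attempt: for element in filename: loops over the characters of the filename string rather than over files, and open("C:/Users/CSV/*.csv", ...) tries to open a file literally named *.csv instead of substituting the original name. Something like the following avoids both: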
import csv
import idlsave
from os import listdir
from os.path import isfile, join, splitext

dat_folder = "/folder/to/dat/files/"
csv_folder = "/folder/to/save/new/csv/files/"

onlyfilenames = [f for f in listdir(dat_folder) if isfile(join(dat_folder, f))]
for fullfilename in onlyfilenames:
    file_name, file_extension = splitext(fullfilename)
    if file_extension == ".dat":
        input_file = idlsave.read(dat_folder + fullfilename)
        n = input_file["raw"]
        with open(join(csv_folder, file_name + ".csv"), "w", newline='') as f:
            writer = csv.writer(f)
            writer.writerows(n)
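Equivalently, glob can select the .dat files directly, which drops the extension check; a sketch assuming the same folder layout as above:

import csv
import glob
import os
import idlsave

dat_folder = "/folder/to/dat/files/"
csv_folder = "/folder/to/save/new/csv/files/"

for dat_path in glob.glob(os.path.join(dat_folder, "*.dat")):
    n = idlsave.read(dat_path)["raw"]
    base = os.path.splitext(os.path.basename(dat_path))[0]
    with open(os.path.join(csv_folder, base + ".csv"), "w", newline='') as f:
        csv.writer(f).writerows(n)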

Open JSON files in a different directory - Python 3, Windows, pathlib

I am trying to open JSON files located in a directory other than the current working directory (cwd). My setup: Python 3.5 on Windows (using Anaconda).
from pathlib import *
import json

path = Path("C:/foo/bar")
filelist = []
for f in path.iterdir():
    filelist.append(f)
for file in filelist:
    with open(file.name) as data_file:
        data = json.load(data_file)
In this setting I have these values:
file >> C:\foo\bar\0001.json
file.name >> 0001.json
However, I get the following error message:
---> 13 with open(file.name) as data_file:
14 data = json.load(data_file)
FileNotFoundError: [Errno 2] No such file or directory: '0001.json'
Here is what I tried so far:
Use .joinpath() to add the directory to the file name in the open command:
with open(path.joinpath(file.name)) as data_file:
    data = json.load(data_file)
TypeError: invalid file: WindowsPath:('C:/foo/bar/0001.json')
Used .resolve(), as that works for me when loading CSV files into Pandas. It did not work here.
for file in filelist:
    j = Path(path, file.name).resolve()
    with open(j) as data_file:
        data = json.load(data_file)
Since I'm on Windows, wrote the path as (and yes, the file is in that directory):
path = Path("C:\\foo\\bar") # resulted in the same FileNotFoundError as above
Instantiated path like this:
path = WindowsPath("C:/foo/bar")
# Same TypeError as above for both '\\' and '/'
The accepted answer has a lot of redundancy in it: a generator that is re-collected into a list, and open() mixed with pathlib.Path. The original error, incidentally, comes from open(file.name): file.name is just the base name ('0001.json'), which open() resolves against the current working directory rather than against path.
pathlib.Path is a great way to handle paths, especially for scripts that should work on both Linux and Windows.
# modules
from pathlib import Path
import json

# static values
JSON_SUFFIXES = [".json", ".js", ".other_suffix"]

folder_path = Path("C:/users/user/documents")
for file_path in folder_path.iterdir():
    if file_path.suffix in JSON_SUFFIXES:
        # json.loads accepts bytes directly on Python 3.6+
        data = json.loads(file_path.read_bytes())
Just adding this modification for new users: pathlib.Path works with Python 3.
Complete solution; thanks @eryksun:
from pathlib import *
import json

path = Path("C:/foo/bar")
filelist = []
for f in path.iterdir():
    filelist.append(f)
for file in filelist:
    with open(str(file)) as data_file:
        data = json.load(data_file)
This line works as well:
with file.open() as data_file:
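For what it's worth, on Python 3.6 and later open() accepts Path objects directly (PEP 519), so the str() conversion is only needed on older versions:

with open(file) as data_file:  # Python 3.6+; file is the full Path from iterdir()
    data = json.load(data_file)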

Read leveldb (.ldb) features extracted using Caffe

I have used the following command for feature extraction:
./build/tools/extract_features.bin models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel examples/_temp/imagenet_val.prototxt fc7 examples/_temp/features 10 leveldb GPU 0
Then I used the following code to read the leveldb features:
https://15519285443829437810.googlegroups.com/attach/b71d5c99c447fc2a/main.cpp?part=0.1&view=1&vt=ANaJVrHM26ydsY5Z2ognvhBaFtDzXnE_SiVf19DLkrNpf9Q34t5O4bJWy0nHH_HBnBAVx5wZusXd1joX93JBK0_r7XKEIc-5odz9_HPHV1RUo8MD3zNHgoY
Everything is OK and I have one .ldb file now.
How can I read this .ldb file? Can I convert it to .txt?
I have used the following code:
import caffe
import leveldb
import numpy as np
from caffe.proto import caffe_pb2

db = leveldb.LevelDB('/home/deep/rahim/caffe-master/examples/_temp/features')
datum = caffe_pb2.Datum()
for key, value in db.RangeIter():
    datum.ParseFromString(value)
    label = datum.label
    data = caffe.io.datum_to_array(datum)
    image = np.transpose(data, (1, 2, 0))
    # np.save appends .npy, producing feature.txt.npy
    # (note: this is overwritten on every iteration, so only the last datum survives)
    np.save('feature.txt', image)
Then pass the feature.txt.npy to the following code, which converts .npy to .txt:
import struct
import numpy as np
import os

def parseNPY(path, fileJustName):
    # load from the file
    inputFile = os.path.join(path, fileJustName + ".npy")
    matrices = np.load(inputFile)
    outputfile = os.path.join(path, fileJustName)
    for m in range(matrices.shape[0]):
        # file name for this matrix
        outFileFull = outputfile + "-" + str(m) + ".txt"
        # output matrix to a numbered file
        np.savetxt(outFileFull, matrices[m], fmt="%i", delimiter="\t")

mypath = "/home/deep/rahim/caffe-master/python/"
for path, paths, filenames in os.walk(mypath):
    # translate all filenames.
    for filename in filenames:
        fileJustName, fileExtension = os.path.splitext(filename)
        if fileExtension == ".npy":
            print(os.path.join(path, fileJustName))
            parseNPY(path, fileJustName)
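If plain text is the end goal, it may be simpler to skip the intermediate .npy step and write each datum straight to a text file. This is only a sketch built from the same calls used above; the numbered file naming and the (C, H, W) shape assumption are mine:

import caffe
import leveldb
import numpy as np
from caffe.proto import caffe_pb2

db = leveldb.LevelDB('/home/deep/rahim/caffe-master/examples/_temp/features')
datum = caffe_pb2.Datum()
for i, (key, value) in enumerate(db.RangeIter()):
    datum.ParseFromString(value)
    data = caffe.io.datum_to_array(datum)
    # flatten the (C, H, W) array to 2-D so np.savetxt accepts it;
    # numbered output files are an assumption, not part of the original
    np.savetxt('feature-%d.txt' % i, data.reshape(data.shape[0], -1), delimiter='\t')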

Download files from FTP or HTTP using filenames in CSV - Python 3

I have a csv file of products for an ecommerce site I'm working on, as well as FTP access to the corresponding images for each product (~15K products).
I would like to use Python to pull only the images listed in the csv from either the FTP or HTTP and save them locally.
import urllib.request
import urllib.parse
import re

url = 'http://www.fakesite.com/pimages/filename.jpg'
split = urllib.parse.urlsplit(url)
filename = split.path.split("/")[-1]
urllib.request.urlretrieve(url, filename)
print(filename)
saveFile = open(filename, 'r')
saveFile.close()
import csv

with open('test.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=",")
    images = []
    for row in readCSV:
        image = row[14]
        print(image)
The code I currently have can pull the filename from the URL and save the file under that filename. It can also pull the filename of the image from the CSV file (filename and image are exactly the same). What I need it to do is insert the filename from the CSV at the end of the URL, and then save that file under the filename.
I have graduated to this:
import urllib.request
import urllib.parse
import re
import os
import csv

with open('test.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=",")
    images = []
    for row in readCSV:
        image = row[14]
        images.append(image)

x = 'http://www.fakesite.com/pimages/'
url = os.path.join(x, image)
split = urllib.parse.urlsplit(url)
filename = split.path.split("/")[-1]
urllib.request.urlretrieve(url, filename)
saveFile = open(filename, 'r')
saveFile.close()
Now this is great. It works perfectly. It is pulling the correct filename out of the CSV file, adding it on to the end of the URL, downloading the file, and saving it as the filename.
However, I can't seem to figure out how to make this work for more than one line of the CSV file. As of now, it only acts on the last line, pulling just that image. Ideally, I would use the CSV file with all of the products on it, and it would go through and download every single one, not just the last image.
You're doing strange things ...
import urllib.request
import csv

# the images list should be outside the with block
images = []
IMAGE_COLUMN = 14

with open('test.csv') as csvfile:
    # read csv
    readCSV = csv.reader(csvfile, delimiter=",")
    for row in readCSV:
        # I guess 14 is the column-index of the image-name like 'image.jpg'
        # I've put it in some constant
        # now append all the image-names into the list
        images.append(row[IMAGE_COLUMN])
        # no need for the following
        # image = row[14]
        # images.append(image)

# make sure, root_url ends with a slash
# x was some strange name for an url
root_url = 'http://www.fakesite.com/pimages/'

# iterate through the list
for image in images:
    # you don't need os.path.join, because that's operating system dependent.
    # you don't need to urlsplit, because you have created the url yourself.
    # you don't need to split the filename as it is the image name
    # with the following line, the root_url must end with a slash
    url = root_url + image
    # urlretrieve saves the file as whatever image is into the current directory
    urllib.request.urlretrieve(url, image)
or in short, that's all you need:
import urllib.request
import csv

IMAGE_COLUMN = 14
ROOT_URL = 'http://www.fakesite.com/pimages/'

images = []
with open('test.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=",")
    for row in readCSV:
        images.append(row[IMAGE_COLUMN])

for image in images:
    url = ROOT_URL + image
    urllib.request.urlretrieve(url, image)
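With roughly 15K products, one missing image would otherwise stop the entire run, so it can be worth catching download errors and moving on; a sketch of that, assuming the same CSV layout:

import urllib.request
import urllib.error
import csv

IMAGE_COLUMN = 14
ROOT_URL = 'http://www.fakesite.com/pimages/'

images = []
with open('test.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=",")
    for row in readCSV:
        images.append(row[IMAGE_COLUMN])

for image in images:
    try:
        urllib.request.urlretrieve(ROOT_URL + image, image)
    except urllib.error.URLError as e:
        # skip unreachable or missing images instead of aborting the run
        print("failed:", image, e)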