Splitting a csv file into multiple files - csv

I have a CSV file of 150,500 rows and I want to split it into multiple files containing 500 rows (entries) each.
I'm using Jupyter and I know how to open and read the file. However, I don't know how to specify an output_path to record the newly created files produced by splitting the big one.
I have found the code below online, but since I don't know what my output_path should be, I don't know how to use it. Moreover, I don't understand how this block of code specifies the input file.
import os


def split(filehandler, delimiter=',', row_limit=1000,
          output_name_template='output_%s.csv', output_path='.', keep_headers=True):
    import csv
    reader = csv.reader(filehandler, delimiter=delimiter)
    current_piece = 1
    current_out_path = os.path.join(
        output_path,
        output_name_template % current_piece
    )
    current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
    current_limit = row_limit
    if keep_headers:
        headers = next(reader)  # reader.next() is Python 2 only; use next(reader) in Python 3
        current_out_writer.writerow(headers)
    for i, row in enumerate(reader):
        if i + 1 > current_limit:
            current_piece += 1
            current_limit = row_limit * current_piece
            current_out_path = os.path.join(
                output_path,
                output_name_template % current_piece
            )
            current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
            if keep_headers:
                current_out_writer.writerow(headers)
        current_out_writer.writerow(row)
My file name is DataSet2.csv and it's in the same folder as the Jupyter notebook (.ipynb) I'm running.
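(For reference, a minimal sketch of how the function above could be called from the notebook, assuming DataSet2.csv sits next to the notebook and the pieces should be written to the same directory; the output_name_template value here is just an example:)

# Open the input file and let split() write the 500-row pieces next to the notebook.
with open('DataSet2.csv', 'r', newline='') as f:
    split(f, row_limit=500, output_name_template='DataSet2_part_%s.csv', output_path='.')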

number_of_small_files = 301
lines_per_small_file = 500

largeFile = open('large.csv', 'r')
header = largeFile.readline()
for i in range(number_of_small_files):
    smallFile = open(str(i) + '_small.csv', 'w')
    smallFile.write(header)  # This line copies the header to all small files
    for x in range(lines_per_small_file):
        line = largeFile.readline()
        smallFile.write(line)
    smallFile.close()
largeFile.close()
This will create many small files in the same directory. About 301 of them. They will be named from 0_small.csv to 300_small.csv.
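If you'd rather not hard-code the number of small files, a small variation (an untested sketch along the same lines, which reads the whole file into memory — fine at 150,500 rows) can derive the chunk count from the data:

import math

lines_per_small_file = 500

with open('large.csv', 'r') as largeFile:
    header = largeFile.readline()
    body = largeFile.readlines()   # everything after the header

# Derive the number of chunks from the data instead of hard-coding 301.
number_of_small_files = math.ceil(len(body) / lines_per_small_file)

for i in range(number_of_small_files):
    chunk = body[i * lines_per_small_file:(i + 1) * lines_per_small_file]
    with open(str(i) + '_small.csv', 'w') as smallFile:
        smallFile.write(header)
        smallFile.writelines(chunk)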

Using standard unix utilities:
cat DataSet2.csv | tail -n +2 | split -l 500 --additional-suffix=.csv - output_
This pipeline takes the original file, strips off the first line with 'tail -n +2', and then splits the rest into 500-line chunks that are written to files whose names start with 'output_' and end with '.csv'. The lone '-' tells split to read from standard input so that 'output_' is treated as the filename prefix.

Related

Loading Multiple CSV files across all subfolder levels with Wildcard file name

I want to load multiple CSV files matching certain names into a dataframe. Currently I am looping through the whole folder, creating a list of filenames, loading those CSVs into a list of dataframes, and then concatenating them.
The approach I want to use (if possible) is to bypass all that code and read all the files in a one-liner kind of approach.
I know this can be done easily for a single level of subfolders, but my subfolder structure is as follows:
Root Folder
|
|-- Subfolder1
|   |-- Subfolder 2
|       |-- X01.csv
|       |-- Y01.csv
|       |-- Z01.csv
|
|-- Subfolder3
|   |-- Subfolder4
|       |-- X01.csv
|       |-- Y01.csv
|
|-- Subfolder5
    |-- X01.csv
    |-- Y01.csv
I want to read all the "X01.csv" files while reading from the Root Folder.
Is there a way I can read all the required files with code something like the below?
filepath = "rootpath" + "/**/X*.csv"
df = spark.read.format("com.databricks.spark.csv").option("recursiveFileLookup","true").option("header","true").load(filepath)
This code works fine for a single level of subfolders; is there any equivalent of this for multi-level folders? I thought the "recursiveFileLookup" option would look across all levels of subfolders, but apparently that is not how it works.
Currently I am getting a
Path not found ... filepath
exception.
Any help, please?
Have you tried using the glob.glob function?
You can use it to search for files that match certain criteria inside a root path, and pass the list of files it finds to the spark.read.csv function.
For example, I've recreated the folder structure from your example inside a Google Colab environment.
To get a list of all CSV files matching the criteria you've specified, you can use the following code:
import glob

rootpath = './Root Folder/'

# The following line of code looks through all files
# inside the rootpath recursively, trying to match the
# pattern specified. In this case, it tries to find any
# CSV file that starts with the letter X, Y, or Z,
# and ends with 2 digits (ranging from 0 to 9).
glob.glob(rootpath + "**/[XYZ][0-9][0-9].csv", recursive=True)

# Returns:
# ['./Root Folder/Subfolder5/Y01.csv',
#  './Root Folder/Subfolder5/X01.csv',
#  './Root Folder/Subfolder1/Subfolder 2/Y01.csv',
#  './Root Folder/Subfolder1/Subfolder 2/Z01.csv',
#  './Root Folder/Subfolder1/Subfolder 2/X01.csv']
Now you can combine this with spark.read.csv's ability to read a list of files to get the answer you're looking for:
import glob
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

rootpath = './Root Folder/'
spark.read.csv(glob.glob(rootpath + "**/[XYZ][0-9][0-9].csv", recursive=True), inferSchema=True, header=True)
Note
You can specify more general patterns like:
glob.glob(rootpath + "**/*.csv", recursive=True)
to return a list of all CSV files inside any subdirectory of rootpath.
Additionally, to consider only the files that sit directly inside rootpath, you could use something like:
glob.glob(rootpath + "*.csv", recursive=True)
Edit
Based on your comments to this answer, does something like this work on Databricks?
from notebookutils import mssparkutils as ms
from py4j.protocol import Py4JJavaError

# Databricks has a module called dbutils.fs.ls
# that works similarly to mssparkutils.fs, based on
# the following page of its documentation:
# https://docs.databricks.com/dev-tools/databricks-utils.html#ls-command-dbutilsfsls


def scan_dir(
    initial_path: str,
    search_str: str,
    account_name: str = '',
):
    """Scan a directory and subdirectories for a string.

    Parameters
    ----------
    initial_path : str
        The path to start the search. Accepts either a valid container name,
        or the entire connection string.
    search_str : str
        The string to search.
    account_name : str
        The name of the account to access the container folders.
        This value is only used when `initial_path` doesn't
        conform with the format: "abfss://<initial_path>@<account_name>.dfs.core.windows.net/"

    Raises
    ------
    FileNotFoundError
        If the `initial_path` informed doesn't exist.
    ValueError
        If `initial_path` is not a string.
    """
    if not isinstance(initial_path, str):
        raise ValueError(
            f'`initial_path` needs to be of type string, not {type(initial_path)}'
        )
    elif not initial_path.startswith('abfss'):
        initial_path = f'abfss://{initial_path}@{account_name}.dfs.core.windows.net/'

    try:
        fdirs = ms.fs.ls(initial_path)
    except Py4JJavaError as exc:
        raise FileNotFoundError(
            f'The path you informed "{initial_path}" doesn\'t exist'
        ) from exc

    found = []
    for path in fdirs:
        p = path.path
        if path.isDir:
            found = [*found, *scan_dir(p, search_str)]
        if search_str.lower() in path.name.lower():
            # print(p.split('.net')[-1])
            found = [*found, p.replace(path.name, "")]

    return list(set(found))
Example:
# Change .parquet to .csv
spark.read.parquet(*scan_dir("abfss://CONTAINER_NAME@ACCOUNTNAME.dfs.core.windows.net/ROOT/FOLDER/", ".parquet"))
The method above worked for me on Azure Synapse.

writer.writerow() doesn't write to the correct column

I have three DynamoDB tables. Two tables hold instance IDs that are part of an application, and the other is a master table of all instances across all of my accounts along with their tag metadata. I scan the two tables to get the instance IDs and then query the master table for the tag metadata. However, when I write this to the CSV file, I want two separate header sections, one for each DynamoDB table's output. Once the first iteration is done, the second set of writes continues from the row where the first iteration left off instead of starting over at the top under the second header section. Below is my code and an output example to make it clear.
CODE:
import boto3
import csv
import json
from boto3.dynamodb.conditions import Key, Attr

dynamo = boto3.client('dynamodb')
dynamodb = boto3.resource('dynamodb')
s3 = boto3.resource('s3')

# Required resource and client calls
all_instances_table = dynamodb.Table('Master')
missing_response = dynamo.scan(TableName='T1')
installed_response = dynamo.scan(TableName='T2')

# Creates CSV DictWriter object and fieldnames
with open('file.csv', 'w') as csvfile:
    fieldnames = ['Agent Not Installed', 'Not Installed Account', 'Not Installed Tags', 'Not Installed Environment', " ", 'Agent Installed', 'Installed Account', 'Installed Tags', 'Installed Environment']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Find instance IDs from the missing table in the master table to pull tag metadata
    for instances in missing_response['Items']:
        instance_missing = instances['missing_instances']['S']
        #print("Missing:" + instance_missing)

        query_missing = all_instances_table.query(KeyConditionExpression=Key('ID').eq(instance_missing))
        for item_missing in query_missing['Items']:
            missing_id = item_missing['ID']
            missing_account = item_missing['Account']
            missing_tags = item_missing['Tags']
            missing_env = item_missing['Environment']

            # Write the data to the CSV file
            writer.writerow({'Agent Not Installed': missing_id, 'Not Installed Account': missing_account, 'Not Installed Tags': missing_tags, 'Not Installed Environment': missing_env})

    # Find instance IDs from the installed table in the master table to pull tag metadata
    for instances in installed_response['Items']:
        instance_installed = instances['installed_instances']['S']
        #print("Installed:" + instance_installed)

        query_installed = all_instances_table.query(KeyConditionExpression=Key('ID').eq(instance_installed))
        for item_installed in query_installed['Items']:
            installed_id = item_installed['ID']
            print(installed_id)
            installed_account = item_installed['Account']
            installed_tags = item_installed['Tags']
            installed_env = item_installed['Environment']

            # Write the data to the CSV file
            writer.writerow({'Agent Installed': installed_id, 'Installed Account': installed_account, 'Installed Tags': installed_tags, 'Installed Environment': installed_env})
OUTPUT:
This is what the columns/rows look like in the file.
I need all of the output to be on the same line for each header section.
DATA:
Here is a sample of what both tables look like.
SAMPLE OUTPUT:
Here is what the for loops print out and appends to the lists.
Missing:
i-0xxxxxx 333333333 foo#bar.com int
i-0yyyyyy 333333333 foo1#bar.com int
Installed:
i-0zzzzzz 44444444 foo2#bar.com int
i-0aaaaaa 44444444 foo3#bar.com int
You want to collect related rows together into a single list to write on a single row, something like:
missing = []    # collection for missing_response rows
installed = []  # collection for installed_response rows

# Find instance IDs from the missing table in the master table to pull tag metadata
for instances in missing_response['Items']:
    instance_missing = instances['missing_instances']['S']
    #print("Missing:" + instance_missing)

    query_missing = all_instances_table.query(KeyConditionExpression=Key('ID').eq(instance_missing))
    for item_missing in query_missing['Items']:
        missing_id = item_missing['ID']
        missing_account = item_missing['Account']
        missing_tags = item_missing['Tags']
        missing_env = item_missing['Environment']

        # Build the first half of the row and add it to the missing list
        missing.append([missing_id, missing_account, missing_tags, missing_env])

# Find instance IDs from the installed table in the master table to pull tag metadata
for instances in installed_response['Items']:
    instance_installed = instances['installed_instances']['S']
    #print("Installed:" + instance_installed)

    query_installed = all_instances_table.query(KeyConditionExpression=Key('ID').eq(instance_installed))
    for item_installed in query_installed['Items']:
        installed_id = item_installed['ID']
        print(installed_id)
        installed_account = item_installed['Account']
        installed_tags = item_installed['Tags']
        installed_env = item_installed['Environment']

        # Build the second half of the row and add it to the installed list
        installed.append([installed_id, installed_account, installed_tags, installed_env])

# Combine your two lists outside the loops and write one CSV row per pair
i = 0
for m in missing:
    # Concatenate the first half with the matching second half; the empty
    # string in the middle keeps the blank spacer column from the header
    # (this assumes a plain csv.writer rather than the DictWriter above)
    writer.writerow(m + [""] + installed[i])
    i = i + 1
This will work if your installed and missing tables operate on a relatable field - like a timestamp or an account ID, something that you can ensure keeps the rows being concatenated in the same order. A data sample would be useful to really answer the question.
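If the two lists can end up with different lengths, one hedged variation (not part of the original answer) is to pair them with itertools.zip_longest so the shorter side is padded with blanks instead of raising an IndexError:

from itertools import zip_longest

# Pad the shorter list with an empty four-column half so every CSV row keeps the same width.
blank_half = ["", "", "", ""]
for m, ins in zip_longest(missing, installed, fillvalue=blank_half):
    writer.writerow(m + [""] + ins)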

Automating a process for multiple CSV files

I've been looking around and couldn't find the answer so here it is.
I'm trying to find a way to automate changing the content of a CSV file into something else for machine learning purposes. I have the content of a single line like this:
0, 0, 0, -2.3145, 5.567...... 65, 65, 125, 70.
(516 columns)
And trying to change it to this:
0,
0,
-2.3145,
5.567
....
65,
65,
125,
70.
(516 rows)
So basically transposing the data from horizontal to vertical (a single row to a single column).
It's easily done using Excel, but the problem is I have 4000+ of these CSV files, so it takes a lot of time.
On top of that, I have to store the first 512 rows in a CSV in one folder and the last 4 rows in a CSV in another folder, with both files keeping the same name.
Eg:
features(folder)
1.CSV
2.CSV
.....
4000+.CSV
labels(folder)
1.CSV
2.CSV
.....
4000+.CSV
Any suggestions on how I can speed things up? I tried writing my own program, but I'm stumped on changing it from row to column. I've only managed to split the single CSV file into its 4000+ pieces.
EDIT:
I've tested putting the CSV rows into a list and then writing the list back out to CSV; the code looks like this:
with open('FFTMIM16_512L1H1S0D0_1194.csv', 'r') as f:
    reader = csv.reader(f)
    your_list = list(reader)
    print(your_list[0:512])
    print(your_list[512:516])
    print(your_list)

with open('test.csv', 'w', newline='') as fa:
    writer = csv.writer(fa)
    writer.writerows(your_list[0:511])

with open('test1.csv', 'w', newline='') as fb:
    writer = csv.writer(fb)
    writer.writerows(your_list[512:516])
It works, but I just need to run it in a loop. One thing I don't understand: if I save the values from 0 to 512 into test.csv it shows 512 rows, but when I store rows 513 to 516 into test1.csv it only shows three rows instead of the four I need. Changing the fb slice from 512 to 516 works, which doesn't make sense to me, because the value at 512 in test.csv is 0 while in test1.csv it is 69. Why is that? From what I understand, the index of the list starts from 0 and goes up to the number I need, or is that not the case in Python?
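(As a side note that may explain part of the confusion: Python slices are half-open, so a slice like [a:b] includes index a but stops just before index b. A tiny illustration:)

rows = list(range(516))        # stand-in for 516 CSV rows
first = rows[0:512]            # indices 0 through 511 -> 512 items
last = rows[512:516]           # indices 512 through 515 -> 4 items
print(len(first), len(last))   # 512 4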
EDIT 2:
My new code is as follows:
import csv
import os
import glob
#import itertools

directory = input("INPUT FOLDER: ")
output1 = input("FEATURES FOLDER: ")
output2 = input("LABELS FOLDER: ")

in_files = os.path.join(directory, '*.csv')
for in_file in glob.glob(in_files):
    with open(in_file) as input_file:
        reader = csv.reader(input_file)
        your_list = list(reader)

    filename = os.path.splitext(os.path.basename(in_file))[0] + '.csv'

    with open(os.path.join(output1, filename), 'w', newline='') as output_file1:
        writer = csv.writer(output_file1)
        writer.writerow(your_list[0:512])

    with open(os.path.join(output2, filename), 'w', newline='') as output_file2:
        writer = csv.writer(output_file2)
        writer.writerow(your_list[512:516])
It shows the output as I wanted, but now it also stores apostrophes and brackets, e.g. ['0.0'], ['2.321223']. How do I remove these?
I don't understand why you can't do it programmatically if you already have your 4000+ pieces; just write every value on a new line.
In my opinion the easiest way, though not automatic, would be an editor like Notepad++.
There you can replace "," with "\r\n", or if you want to keep the "," you can replace it with ",\r\n".
If you want it automated, I don't see a non-programmatic way around writing code.
By the way, if you use Python with numpy/scipy you can just use the .transpose() function.
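For example, a rough sketch of that numpy route (the folder names are placeholders matching the question, and each input CSV is assumed to hold a single row of 516 numeric values):

import os
import glob
import numpy as np

input_dir = "input"        # assumed folder names; adjust to your setup
features_dir = "features"
labels_dir = "labels"

for path in glob.glob(os.path.join(input_dir, "*.csv")):
    row = np.loadtxt(path, delimiter=",")   # shape (516,)
    column = row.reshape(-1, 1)              # transpose the single row into a column
    name = os.path.basename(path)
    # first 512 values become the features file, last 4 the labels file
    np.savetxt(os.path.join(features_dir, name), column[:512], delimiter=",", fmt="%g")
    np.savetxt(os.path.join(labels_dir, name), column[512:], delimiter=",", fmt="%g")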
*Edit to your comment:
What do you mean by "split from the first to the 512"? If you want parts of size 512, it would be something like:
new_array = []
temp_array = []
k = 0
for num in your_array:
    temp_array.append(num)
    k += 1
    if k % 512 == 0:
        new_array.append(temp_array)
        k = 0
        temp_array = []

# to append the last block, which might not be 512 sized
if len(temp_array) > 0:
    new_array.append(temp_array)

# Save arrays
for i in range(len(new_array)):
    saveToCsv(array=new_array[i], name="csv_" + str(i))

Your new_array would now be a list filled with 512-sized lists.
There might be mistakes here, I did not test the code. To save, you only need a function saveToCsv(array, name) which saves an array into a file.
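A minimal sketch of such a saveToCsv helper (hypothetical, not part of the original code), writing one value per row:

import csv

def saveToCsv(array, name):
    # Write each value of `array` on its own line of "<name>.csv",
    # matching the one-value-per-row layout asked for in the question.
    with open(name + ".csv", "w", newline="") as f:
        writer = csv.writer(f)
        for value in array:
            writer.writerow([value])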

blank file while copying a file in python

I have a function that takes a file as input, prints certain statistics, and also copies the file into a file name provided by the user. Here is my current code:
def copy_file(option):
    infile_name = input("Please enter the name of the file to copy: ")
    infile = open(infile_name, 'r')
    outfile_name = input("Please enter the name of the new copy: ")
    outfile = open(outfile_name, 'w')
    slist = infile.readlines()

    if option == 'statistics':
        for line in infile:
            outfile.write(line)
        infile.close()
        outfile.close()

        result = []
        blank_count = slist.count('\n')
        for item in slist:
            result.append(len(item))

        print('\n{0:<5d} lines in the list\n{1:>5d} empty lines\n{2:>7.1f} average character per line\n{3:>7.1f} average character per non-empty line'.format(
            len(slist), blank_count, sum(result)/len(slist), (sum(result)-blank_count)/(len(slist)-blank_count)))


copy_file('statistics')
It prints the statistics of the file correctly; however, the copy it makes of the file is empty. If I remove the readlines() part and the statistics part, the function seems to make a copy of the file correctly. How can I correct my code so that it does both? It's a minor problem, but I can't seem to get it.
The reason the file is blank is that
slist = infile.readlines()
is reading the entire contents of the file, so when it gets to
for line in infile:
there is nothing left to read, so it just closes the newly truncated (mode 'w') file, leaving you with a blank file.
I think the answer here is to change your for line in infile: to for line in slist:
def copy_file(option):
    infile_name = input("Please enter the name of the file to copy: ")
    infile = open(infile_name, 'r')
    outfile_name = input("Please enter the name of the new copy: ")
    outfile = open(outfile_name, 'w')
    slist = infile.readlines()

    if option == 'statistics':
        for line in slist:
            outfile.write(line)
        infile.close()
        outfile.close()

        result = []
        blank_count = slist.count('\n')
        for item in slist:
            result.append(len(item))

        print('\n{0:<5d} lines in the list\n{1:>5d} empty lines\n{2:>7.1f} average character per line\n{3:>7.1f} average character per non-empty line'.format(
            len(slist), blank_count, sum(result)/len(slist), (sum(result)-blank_count)/(len(slist)-blank_count)))


copy_file('statistics')
Having said all that, consider whether it's worth writing your own copy routine rather than using shutil.copy - it's always better to delegate the task to your OS, as it will be quicker and probably safer (thanks to NightShadeQueen for the reminder)!
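For example, a rough sketch of that approach, delegating the copy to shutil.copy while keeping the statistics part (which still needs to read the lines):

import shutil

def copy_file(option):
    infile_name = input("Please enter the name of the file to copy: ")
    outfile_name = input("Please enter the name of the new copy: ")

    # Let the standard library handle the actual copy.
    shutil.copy(infile_name, outfile_name)

    if option == 'statistics':
        with open(infile_name, 'r') as infile:
            slist = infile.readlines()
        blank_count = slist.count('\n')
        result = [len(item) for item in slist]
        print('\n{0:<5d} lines in the list\n{1:>5d} empty lines\n{2:>7.1f} average character per line\n{3:>7.1f} average character per non-empty line'.format(
            len(slist), blank_count, sum(result)/len(slist), (sum(result)-blank_count)/(len(slist)-blank_count)))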

Filtering rows in a csv file with Python

I have a script that opens and modifies a text file containing personnel info and a lunch account balance. My script takes the text file, removes the quotes, and only writes rows that contain the value D, F, or R in column 8. It writes this filtered data to two files: a CSV import file called lunchimport.csv for a separate program, and a CSV temp file used for further filtering. The second stage of the script uses the CSV temp file to generate two additional CSV files. One file, negativebal.csv, contains only rows with a negative value in column 14. The other file, lowbal.csv, contains rows with a value between 0 and 5 in column 14. My issue is that I can't get the script to filter "between" values properly. When using the code below to write only rows with values in column 14 between 0 and 5, nothing filters out. If I use values between 0 and 1.99 it works; anything greater than 1.99 and the code doesn't filter anything:
if row[13] > "0" and row[13] < "1.99":
    lowwriter.writerow([row[0], row[13]])
I have pasted my entire code below. I do use a lot of temp files to accomplish my tasks. There probably is a better way, but I'm just interested in getting my filters to work properly.
import os
import csv

infile = open("\\\\comalexsrv\\export\\update.txt", "r")
outfile1 = open("casttemp1.csv", "w")
infile2 = open("casttemp1.csv", "r")
outfile2 = open("casttemp2.csv", "w")
infile3 = open("casttemp2.csv", "r")
outfile3 = open("casttemp3.csv", "w")
infile4 = open("casttemp3.csv", "r")
inowcsv = open("F:\zbennett\Lunch_Imports\lunchimport.csv", "w")
negcastcsv = open("\\\\tcdc\\inow_transfer$\\negativebal.csv", "w")
lowcastcsv = open("\\\\tcdc\\inow_transfer$\\lowbal.csv", "w")

# Remove quotes in update.txt, write to outfile1 (casttemp1.csv)
string = infile.read()
outfile1.write(string.replace("\"", ''))

# Open infile2 (casttemp1.csv), write rows with D,F,R in column 8 to outfile2 (casttemp2.csv)
# Open infile2 (casttemp1.csv), write rows with D,F,R in column 8 to inowcsv (F:\zbennett\Lunch_Imports\lunchimport.csv)
# Open infile2 (casttemp1.csv), write rows with D,R in column 8 to outfile3 (casttemp3.csv)
tempwriter = csv.writer(outfile2, delimiter=',', lineterminator='\n')
importwriter = csv.writer(inowcsv, delimiter=',', lineterminator='\n')
lowtemp = csv.writer(outfile3, delimiter=',', lineterminator='\n')
for row in csv.reader(infile2, delimiter=','):
    if row[7] == "D":
        tempwriter.writerow(row)
        importwriter.writerow(row)
        lowtemp.writerow(row)
    if row[7] == "F":
        tempwriter.writerow(row)
        importwriter.writerow(row)
    if row[7] == "R":
        tempwriter.writerow(row)
        importwriter.writerow(row)
        lowtemp.writerow(row)

# Open infile3 (casttemp2.csv), write columns 1,14 for rows with less than 0 in column 14 to negcastcsv (\\tcdc\inow_transfer$\negativebal.csv)
negwriter = csv.writer(negcastcsv, delimiter=',', lineterminator='\n')
for row in csv.reader(infile3, delimiter=','):
    if row[13] < "0":
        negwriter.writerow([row[0], row[13]])

# Open infile4 (casttemp3.csv), write columns 1,14 for rows with column 14 greater than 0 and less than 1.75 to lowcastcsv (\\tcdc\inow_transfer$\lowbal.csv)
lowwriter = csv.writer(lowcastcsv, delimiter=',', lineterminator='\n')
for row in csv.reader(infile4, delimiter=','):
    if row[13] > "0" and row[13] < "1.99":
        lowwriter.writerow([row[0], row[13]])

infile.close()
outfile1.close()
infile2.close()
outfile2.close()
inowcsv.close()
outfile3.close()
infile3.close()
infile4.close()
negcastcsv.close()
lowcastcsv.close()

# Delete casttemp temp files
os.remove("casttemp1.csv")
os.remove("casttemp2.csv")
os.remove("casttemp3.csv")
Comparison is happening using strings, when you probably want numeric comparison:
if 0. < float(row[13]) < 1.99:
    lowwriter.writerow([row[0], row[13]])
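To see why the string version misbehaves (an illustrative aside): strings compare character by character, so "10.50" sorts before "5" even though 10.5 > 5. The negative-balance filter compares strings too, so it likely needs the same float() treatment:

print("10.50" < "5")        # True  -- '1' sorts before '5' lexicographically
print(float("10.50") < 5)   # False -- numeric comparison behaves as expected

# The same fix applied to the negative-balance filter, assuming column 14
# always holds a parseable number:
# if float(row[13]) < 0:
#     negwriter.writerow([row[0], row[13]])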