How to handle newlines when loading a CSV into Apache Beam?

I am running into an issue where some of my fields contain newlines within the text.
My current code is as follows:
# Python's regular expression library
import re
import sys
# CSV parsing
import csv
# Beam and interactive Beam imports
import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
import apache_beam.runners.interactive.interactive_beam as ib

p = beam.Pipeline(InteractiveRunner())

def print_row(element):
    print(element)

def parse_file(element):
    # parse a single line of text as one CSV record
    for line in csv.reader([element], quotechar='"', delimiter=',',
                           lineterminator='\n', quoting=csv.QUOTE_ALL,
                           skipinitialspace=True):
        return line

parsed_csv = (p
              | 'Read input file' >> beam.io.ReadFromText("gs://ny-data/AB_NYC_2019.csv")
              | 'Parse file' >> beam.Map(parse_file))
split = parsed_csv | beam.Map(lambda x: x[0]) | beam.Map(print)
p.run()
I am running into issues because some of the text appears like so:
The BLUE OWL:
VEGETARIAN WBURG W PATIO & BACKYARD!
Any thoughts on how to proceed?

ReadFromText reads inputs one line at a time, so records containing embedded newlines get split apart. As suggested before, you can use the DataFrame read_csv, or you could create a PCollection of paths and open/read them in a DoFn.
For example, you could write
import csv
import io

def read_csv_file(file_metadata):
    # FileSystems.open returns a binary stream, so wrap it for text-mode CSV parsing
    with beam.io.filesystems.FileSystems.open(file_metadata.path) as fin:
        for row in csv.reader(io.TextIOWrapper(fin)):
            yield row

rows = (
    p
    | beam.io.fileio.MatchFiles('/pattern/to/files/*.csv')  # emits FileMetadatas
    | beam.FlatMap(read_csv_file))  # emits rows
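For reference, here is a minimal sketch of the DataFrame read_csv route mentioned above (this assumes a Beam version that ships the DataFrame API; the gs:// path is the one from the question). Because read_csv delegates parsing to pandas, quoted fields containing embedded newlines are handled correctly:
import apache_beam as beam
from apache_beam.dataframe.io import read_csv
from apache_beam.dataframe.convert import to_pcollection

with beam.Pipeline() as p:
    # a deferred pandas-style DataFrame; pandas' parser copes with
    # newlines inside quoted fields
    df = p | read_csv("gs://ny-data/AB_NYC_2019.csv")
    # convert back to a plain PCollection of rows for further transforms
    rows = to_pcollection(df)
    rows | beam.Map(print)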

Related

How can I save some json files generated in a for loop as csv?

Sorry, I am new to coding in Python. I need to save the JSON generated in each iteration of a for loop as a separate CSV file.
I wrote code that works fine to generate the first CSV file, but it then gets overwritten on every iteration and I have not found a solution yet. Can anyone help me? Many thanks.
from twarc.client2 import Twarc2
import itertools
import pandas as pd
import csv
import json
import numpy as np

# Your bearer token here
t = Twarc2(bearer_token="AAAAAAAAAAAAAAAAAAAAA....WTW")

# Get a bunch of user handles you want to check:
list_of_names = np.loadtxt("usernames.txt", dtype="str")

# Get the `data` part of every request only, as one list
def get_data(results):
    return list(itertools.chain(*[result['data'] for result in results]))

user_objects = get_data(t.user_lookup(users=list_of_names, usernames=True))

for user in user_objects:
    following = get_data(t.following(user['id']))
    # Do something with the lists
    print(f"User: {user['username']} Follows {len(following)} -2")
    json_string = json.dumps(following)
    df = pd.read_json(json_string)
    df.to_csv('output_file.csv')
You need to add a sequence number or some other unique identifier to the filename. The clearest options are to keep track of a counter, or to use a GUID. Below I've used a counter that is initialized before your loop and incremented in each iteration. This will produce files like output_file_0.csv, output_file_1.csv, output_file_2.csv and so on.
counter = 0
for user in user_objects:
    following = get_data(t.following(user['id']))
    # Do something with the lists
    print(f"User: {user['username']} Follows {len(following)} -2")
    json_string = json.dumps(following)
    df = pd.read_json(json_string)
    df.to_csv('output_file_' + str(counter) + '.csv')
    counter += 1
We convert the integer to a string and insert it between the name of your file and its extension. The same idea, using enumerate to manage the counter for you:
from twarc.client2 import Twarc2
import itertools
import pandas as pd
import csv
import json
import numpy as np

# Your bearer token here
t = Twarc2(bearer_token="AAAAAAAAAAAAAAAAAAAAA....WTW")

# Get a bunch of user handles you want to check:
list_of_names = np.loadtxt("usernames.txt", dtype="str")

# Get the `data` part of every request only, as one list
def get_data(results):
    return list(itertools.chain(*[result['data'] for result in results]))

user_objects = get_data(t.user_lookup(users=list_of_names, usernames=True))

for idx, user in enumerate(user_objects):
    following = get_data(t.following(user['id']))
    # Do something with the lists
    print(f"User: {user['username']} Follows {len(following)} -2")
    json_string = json.dumps(following)
    df = pd.read_json(json_string)
    df.to_csv(f'output_file_{idx}.csv')
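If you would rather not manage a counter at all, the GUID option mentioned above can look like this (a minimal sketch using the standard uuid module, continuing from the variables defined in the snippet above; the filenames are unique but not ordered):
import uuid

for user in user_objects:
    following = get_data(t.following(user['id']))
    df = pd.read_json(json.dumps(following))
    # uuid4() yields a random unique identifier per iteration
    df.to_csv(f'output_file_{uuid.uuid4()}.csv')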

Special Characters and Converting Problems Using Tabula for PDF to Proper CSV

The code:
#Import the required Module
import tabula

# Read a PDF File
# NOTE: tabula.read_pdf returns a *list* of DataFrames, one per detected table
df = tabula.read_pdf("C:/Users/Desktop/abstract/abstract.pdf", encoding='cp1252', pages='all')
# Total page number can change. All pages must be taken. (to be generic)

# convert PDF into CSV
# NOTE: this fails: df is a list, and the output filename should end in .csv
df1 = df.to_csv('C:/Users/Desktop/abstract.pdf')
print(df1)
Hello friends, I have a monthly account statement as a PDF. I want to extract the name and period information as text, and save the date, description, amount, and gift information as CSV so I can read it back.
I tried tabula's read_pdf but couldn't get a file the way I wanted. In addition, there are special characters in the PDF file (ğ, ü, ç, etc.), and these are decoded incorrectly.
How can I get the format I want? In other words, I can't reach the İşlem tarihi (transaction date), Açıklama (description), Tutar (amount), and Bankomat Para columns in the CSV file. Or is there a better way to convert the PDF to CSV?
[screenshot: the original statement (2 pages)]
[screenshot: the end of the original statement page]
The code:
# libraries
import io
import os

import fitz  # PyMuPDF
import pandas as pd

def set_texts(pdf_files: list):
    # This function reads each PDF and returns its contents as text
    print("starting to text process")
    for pdf_file in pdf_files:
        with fitz.open(pdf_file) as doc:
            text = ""
            for page in doc:
                text += page.get_text()
    return text

file_names = [r"C:/Users/Desktop/abstract/Alışveris_Özeti.pdf"]
text = set_texts(file_names)

# keep only the lines between the "Bankomat Para (TL)" header and the
# "Bankomat Para Bilgileriniz" / "Vakıfbank" footers
buffer = io.StringIO(text)
new_text = ""
flag = False
for line in buffer.readlines():
    if "Bankomat Para Bilgileriniz" in line:
        flag = False
    elif "Bankomat Para (TL)" in line:
        flag = True
    elif "Vakıfbank" in line:
        flag = False
    elif flag:
        new_text += line
    elif "Sayın" in line:
        name = (line.lstrip("Sayın ")).replace(",", "")
        print(name)

buffer = io.StringIO(new_text)
text_list = buffer.readlines()
# \n correction
a = list(map(lambda x: x.replace('\n', ''), text_list))
# converting 4 spaces to a single space
b = list(map(lambda x: x.replace('    ', ' '), a))
# card vocabulary
c = list(map(lambda x: x.replace('BANKOMAT KART ', 'BANKOMAT KART'), b))
# undesired words
stopwords = ['BANKOMAT KART', 'İŞLEMİ', 'ALIŞVERİŞ EKSTRESİ',
             'Dekont yerine kullanılmaz. Uyuşmazlık halinde Banka kayıtları esas alınacaktır',
             'www.vakifbank.com.tr I 0850 222 0 724',
             'Türkiye Vakıflar Bankası T.A.O. Büyük Mükellefler V.D. 9220034970 Sicil Numarası: 776444',
             'Saray Mahallesi Dr. Adnan Büyükdeniz Caddesi No :7 / A-B Ümraniye /İSTANBUL Mersis: 0922003497000017',
             'Saray Mahallesi Dr. Adnan Büyükdeniz Caddesi No :7 / A-B Ümraniye /İSTANBUL Mersis: 0922003497000017 Sf 2 \\ 3 ']
d = list(filter(lambda w: w not in stopwords, c))
e = list(map(lambda x: x.replace('CÜZDANDAN HESABA TRANSFER ', 'CÜZDANDAN HESABA TRANSFER İŞLEMİ'), d))

# group the list elements by 4, in order, to match the 4 columns of the df
z = []
for i in range(int(len(e) / 4)):
    y = e[i * 4:i * 4 + 4]
    z.append(y)
df = pd.DataFrame(z, columns=['ISLEM TARIHI', 'ACIKLAMA', 'TUTAR', 'BANKOMAT PARA'])

# creating the csv file
is_file = os.path.isfile('C:/Users/Desktop/abstract/Alışveris_Özeti.csv')
if not is_file:
    df.to_csv("Alışveris_Özeti.csv", index=False)
    print('CSV file has been created...')
else:
    print("CSV file already exists.")

How can I group every n rows into n columns?

I have a .txt file with 683,500 rows; every 7 rows is a different person, containing:
ID
Name
Work position
Date 1 (year - month)
Date 2 (year - month)
Gross payment
Service time
I would like to read that .txt and output (could be JSON, CSV, TXT, or even a database) every person as one row of 7 columns, for example:
ID Name Work position Date 1 Date 2 Gross payment Service time
ID Name Work position Date 1 Date 2 Gross payment Service time
ID Name Work position Date 1 Date 2 Gross payment Service time
ID Name Work position Date 1 Date 2 Gross payment Service time
Example in the txt:
00000000886
MANUEL DE JESUS SUBERVI PEÑA
MAESTRO MEDIA GENERAL
2006-08
2021-09
30,556.04
15.7
00000000086
MANUEL DE JESUS SUBERVI PEÑA
MAESTRO MEDIA GENERAL
2006-01
2021-09
30,556.04
15.7
00100000086
MANUEL DE JESUS SUBERVI PEÑA
MAESTRO MEDIA GENERAL
2006-01
2021-09
30,556.04
15.7
import csv

# opening file
file = open(r"C:\Users\Redford\Documents\Proyecto automatizacion\data1.txt")  # open file
counter = 0
total_lines = len(file.readlines())  # count lines
# print('Total lines:', total_lines)

# reading from file
content = file.read()
colist = content.split()
print(colist)

# read data from data1.txt and write it to data2.txt
lines = open(r"C:\Users\Redford\Documents\Proyecto automatizacion\data1.txt")
arr = []
with open('data2.txt', 'w') as f:
    for line in lines:
        # arr.append(line)
        f.write(line)
I'm new to programming and I don't know how to translate my logic to code.
Your code does not collect multiple lines to write them into one.
Use this approach:
read your file line by line
collect each line, without its \n, into a list
when the list reaches length 7, write it to the CSV and clear the list
repeat until done
Create data file:
with open ("t.txt","w") as f:
f.write("""00000000886\nMANUEL DE JESUS SUBERVI PEÑA\nMAESTRO MEDIA GENERAL\n2006-08\n2021-09\n30,556.04\n15.7
00000000086\nMANUEL DE JESUS SUBERVI PEÑA\nMAESTRO MEDIA GENERAL\n2006-01\n2021-09\n30,556.04\n15.7
00100000086\nMANUEL DE JESUS SUBERVI PEÑA\nMAESTRO MEDIA GENERAL\n2006-01\n2021-09\n30,556.04\n15.7""")
Program:
import csv

with open("t.csv", "w", newline="") as wr, open("t.txt") as r:
    # create a csv writer
    writer = csv.writer(wr)
    # uncomment if you want a header over your data
    # h = ["ID", "Name", "Work position", "Date 1", "Date 2",
    #      "Gross payment", "Service time"]
    # writer.writerow(h)
    person = []
    for line in r:  # could use enumerate as well, this works ok
        # collect line data minus the \n into the list
        person.append(line.strip())
        # this person is finished: write, then clear the list
        if len(person) == 7:
            # leveraging the csv module's writer; look it up if you need
            # to customize it further regarding quoting etc.
            writer.writerow(person)
            person = []  # reset list for next person
    # something went wrong / your file is inconsistent: write the remainder
    if person:
        writer.writerow(person)

print(open("t.csv").read())
Output:
00000000886,MANUEL DE JESUS SUBERVI PEÑA,MAESTRO MEDIA GENERAL,2006-08,2021-09,"30,556.04",15.7
00000000086,MANUEL DE JESUS SUBERVI PEÑA,MAESTRO MEDIA GENERAL,2006-01,2021-09,"30,556.04",15.7
00100000086,MANUEL DE JESUS SUBERVI PEÑA,MAESTRO MEDIA GENERAL,2006-01,2021-09,"30,556.04",15.7
Readup: csv module - writer
The "Gross payment" needs to be quoted because it contain s a ',' wich is the delimiter for csv - the module does this automagically.
On top of the excellent answer from @PatrickArtner, I would like to propose an itertools-based solution:
import csv
import itertools

def file_grouper_itertools(
        in_filepath="t.txt",
        out_filepath="t.csv",
        size=7):
    with open(in_filepath, 'r') as in_file,\
            open(out_filepath, 'w', newline='') as out_file:
        writer = csv.writer(out_file)
        # the standard "grouper" idiom: size references to the same iterator
        args = [iter(in_file)] * size
        for block in itertools.zip_longest(*args, fillvalue=' '):
            # equivalent, for the given input, to:
            # block = [x.rstrip('\n') for x in block]
            block = ''.join(block).rstrip('\n').split('\n')
            writer.writerow(block)
The idea there is to loop in blocks of the required size.
For larger group sizes this gets faster simply because the main loop executes fewer iterations.
Running some micro-benchmarking shows that your use case should benefit from this approach compared to the manual looping (adapted into a function):
import csv

def file_grouper_manual(
        in_filepath="t.txt",
        out_filepath="t.csv",
        size=7):
    with open(in_filepath, 'r') as in_file,\
            open(out_filepath, 'w', newline='') as out_file:
        writer = csv.writer(out_file)
        block = []
        for line in in_file:
            block.append(line.rstrip('\n'))
            if len(block) == size:
                writer.writerow(block)
                block = []
        if block:
            writer.writerow(block)
Benchmarking:
n = 100_000
k = 7
with open("t.txt", "w") as f:
    for i in range(n):
        # trailing newline so consecutive groups do not run together
        f.write("\n".join(["0123456"] * k) + "\n")

%timeit file_grouper_manual()
# 1 loop, best of 5: 325 ms per loop
%timeit file_grouper_itertools()
# 1 loop, best of 5: 230 ms per loop
Alternatively, you could use Pandas, which is very convenient, but requires that all the input fit into available memory (which should not be a problem in your case, but can be for larger inputs):
import numpy as np
import pandas as pd

def file_grouper_pandas(in_filepath="t.txt", out_filepath="t.csv", size=7):
    with open(in_filepath) as in_file:
        data = [x.rstrip('\n') for x in in_file.readlines()]
    df = pd.DataFrame(np.array(data).reshape((-1, size)), columns=list(range(size)))
    # consistent with the other solutions
    df.to_csv(out_filepath, header=False, index=False)
%timeit file_grouper_pandas()
# 1 loop, best of 5: 666 ms per loop
If you do a lot of work with tables and data, NumPy and Pandas are really useful libraries to get comfortable with.
import numpy as np
import pandas as pd

columns = ['ID', 'Name', 'Work position', 'Date 1 (year - month)',
           'Date 2 (year - month)', 'Gross payment', 'Service time']

with open('oldfile.txt', 'r') as stream:
    # read file into a list of lines
    lines = stream.readlines()

# remove the newline character from each element of the list
lines = [line.strip('\n') for line in lines]

# figure out how many rows there will be in the table
number_of_people = len(lines) // 7

# split the data into rows
data = np.array_split(lines, number_of_people)

# convert the data to a pandas DataFrame
df = pd.DataFrame(data, columns=columns)
Once you have converted the data to a Pandas Dataframe, you can easily output it to any of the formats you listed. For example to output to csv you can do:
df.to_csv('newfile.csv')
Or for JSON it would be:
df.to_json('newfile.json')

Single CSV output with data in different columns

I have a number of CSV files with data in the first three columns only. I want to copy data from each CSV file and paste it into one single CSV file in column order. For example data from the first CSV file goes into columns 1,2 and 3 in the output file. Similarly, data from the 2nd CSV goes to columns 4,5, and 6 of the same output CSV file and so on. Any help would be highly appreciated. Thanks.
I have tried the following code, but it puts the output in the same columns only.
import glob
import time

import numpy as np
import pandas as pd

start = time.time()
Filename = 'Combined_Data.csv'
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

for i in range(len(all_filenames)):
    data = pd.read_csv(all_filenames[i], skiprows=23)
    data = data.rename({'G1': 'CH1', 'G2': 'CH2', 'Dis': 'CH3'}, axis=1)
    data = data[['CH1', 'CH2', 'CH3']]
    data = data.apply(pd.to_numeric, errors='coerce')
    print(all_filenames[i])
    if i == 0:
        data.to_csv(Filename, sep=',', index=False, header=True, mode='a')
    else:
        data.to_csv(Filename, sep=',', index=False, header=False, mode='a')

end = time.time()
print((end - start), 'Seconds(Execution Time)')
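Since the question's code already uses pandas, one direct fix is to collect the frames and concatenate them along the column axis instead of appending them vertically. A sketch under the same assumptions as the original code (skiprows=23 and the G1/G2/Dis column names):
import glob
import pandas as pd

frames = []
for name in glob.glob('*.csv'):
    data = pd.read_csv(name, skiprows=23)
    data = data.rename({'G1': 'CH1', 'G2': 'CH2', 'Dis': 'CH3'}, axis=1)
    frames.append(data[['CH1', 'CH2', 'CH3']])

# axis=1 places each file's three columns side by side;
# shorter files are padded with NaN automatically
pd.concat(frames, axis=1).to_csv('Combined_Data.csv', index=False)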
If you don't need to write your own code for this, I'd recommend GoCSV's zip command; it can also handle the CSVs having different numbers of rows.
I have three CSV files:
file1.csv
Dig1,Dig2,Dig3
1,2,3
4,5,6
7,8,9
file2.csv
Letter1,Letter2,Letter3
a,b,c
d,e,f
and file3.csv
RomNum1,RomNum2,RomNum3
I,II,III
When I run gocsv zip file2.csv file1.csv file3.csv I get:
Letter1,Letter2,Letter3,Dig1,Dig2,Dig3,RomNum1,RomNum2,RomNum3
a,b,c,1,2,3,I,II,III
d,e,f,4,5,6,,,
,,,7,8,9,,,
GoCSV is pre-built for a number of different OSes.
Here's how to do it with Python's csv module, using the same file1.csv, file2.csv, and file3.csv as above.
The more-memory-intensive option
This accumulates the final CSV one file at a time, expanding a list that represents the final CSV with each new input CSV.
#!/usr/bin/env python3
import csv
import sys

csv_files = [
    'file2.csv',
    'file1.csv',
    'file3.csv',
]

all_rows = []
for csv_file in csv_files:
    with open(csv_file) as f:
        reader = csv.reader(f)
        rows = list(reader)

    len_all = len(all_rows)
    # First file: initialize all_rows and continue (skip)
    if len_all == 0:
        all_rows = rows
        continue

    # The number of columns in all_rows so far
    len_cols = len(all_rows[0])

    # Extend all_rows with the new rows
    for i, row in enumerate(rows):
        # Make sure all_rows has as many rows as this file
        if i >= len_all:
            all_rows.append([''] * len_cols)
        all_rows[i].extend(row)

# Finally, pad all rows on the right
len_cols = len(all_rows[0])
for i in range(len(all_rows)):
    len_row = len(all_rows[i])
    if len_row < len_cols:
        col_diff = len_cols - len_row
        all_rows[i].extend([''] * col_diff)

writer = csv.writer(sys.stdout)
writer.writerows(all_rows)
The streaming option
This reads-and-writes a line/row at a time.
(this is basically a Python port of the Go code from GoCSV's zip, from above)
import csv
import sys

fnames = [
    'file2.csv',
    'file1.csv',
    'file3.csv',
]
num_files = len(fnames)
readers = [csv.reader(open(x)) for x in fnames]

# Collect "header" lines; each header defines the number
# of columns for its file
headers = []
num_cols = 0
offsets = [0]
for reader in readers:
    header = next(reader)
    headers.append(header)
    num_cols += len(header)
    offsets.append(num_cols)

writer = csv.writer(sys.stdout)

# With all headers counted, every row must have this many columns
shell_row = [''] * num_cols
for i, header in enumerate(headers):
    start = offsets[i]
    end = offsets[i + 1]
    shell_row[start:end] = header

# Write headers
writer.writerow(shell_row)

# Expect that not all CSVs have the same number of rows; some will "finish" ahead of others
file_is_complete = [False] * num_files
num_complete = 0

# Loop a row at a time...
while True:
    # ... for each CSV
    for i, reader in enumerate(readers):
        if file_is_complete[i]:
            continue
        start = offsets[i]
        end = offsets[i + 1]
        try:
            row = next(reader)
            # Put this row in its place in the main row
            shell_row[start:end] = row
        except StopIteration:
            file_is_complete[i] = True
            num_complete += 1

    if num_complete == num_files:
        break

    # Done iterating CSVs (for this row), write it
    writer.writerow(shell_row)
    # Reset for the next main row
    shell_row = [''] * num_cols
For either, I get:
Letter1,Letter2,Letter3,Dig1,Dig2,Dig3,RomNum1,RomNum2,RomNum3
a,b,c,1,2,3,I,II,III
d,e,f,4,5,6,,,
,,,7,8,9,,,

How to read, save and display images, encoded in csv format

I've got some images for training and testing a tensorflow model, encoded in CSV format. Is there a way to extract those images and/or save them in a JPG-like format?
Part of the file was shared as a screenshot, opened in Excel. If you prefer text to hyperlinks, here is part of it as text:
label pixel1 pixel2 ...
6 149 149 ...
5 126 128 ...
10 85 88 ...
0 203 205 ...
There are 785 columns and 7173 rows in total. I have no idea how to deal with that.
You can do it like this
# imports assumed for this example
import tensorflow as tf
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# first I create a dummy dataset to work on
data = make_classification(10000, n_features=784, random_state=1234)
df = pd.DataFrame(data[0], columns=[f'col_{i}' for i in range(784)])
df['label'] = data[1]

# Now we create an img_vector and labels array from the dataframe
img_vector = df[[f'col_{i}' for i in range(784)]].values
labels = df['label'].values

# splitting the data; train_test_split is one way to produce the
# train/valid arrays used below
train_mat, valid_mat, train_label, valid_label = train_test_split(
    img_vector, labels, test_size=0.2, random_state=1234)

# Now we create the dataset
def get_img(inputs, labels):
    # here you have 784 pixels, which usually represent a 28*28 image
    # with 1 channel, hence the reshape
    img = tf.reshape(inputs, (28, 28, 1))
    # you can also add some augmentation
    img = tf.image.flip_left_right(img)
    img = tf.image.flip_up_down(img)
    return img, labels

# We pass the img_vector and labels to make the dataset
train_dataset = tf.data.Dataset.from_tensor_slices((train_mat, train_label))
# Map the dataset to get images from it
train_dataset = train_dataset.map(get_img).batch(16)
# same for the valid dataset
valid_dataset = tf.data.Dataset.from_tensor_slices((valid_mat, valid_label))
valid_dataset = valid_dataset.map(get_img).batch(16)

# A sanity check
import matplotlib.pyplot as plt
sample = None
for i in train_dataset:
    sample = i
    break
plt.imshow(sample[0][0])

# Creating a model
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(3, 3, input_shape=(28, 28, 1)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Finally, train the model
model.fit(train_dataset,
          epochs=10,
          validation_data=valid_dataset)
Also, if you ever take a dataset from Kaggle you will usually find a sample notebook for that dataset in the code section.
You can read any row, plot it, and save it as an image like this:
import numpy as np
import pandas as pd
# import plt for displaying images
from matplotlib import pyplot as plt

# read the csv file
df = pd.read_csv("data.csv")

# read pixels and labels
images = np.array(df.iloc[:, 1:])
labels = np.array(df.iloc[:, 0])

# select any index between 0 and 7172
index = 2

# reshape the 784 values to 28 height x 28 width
sample_image = images[index, :].reshape(28, 28)

# plot the image
plt.imshow(sample_image)
plt.axis('off')

# print its label
print(labels[index])

# save the image
plt.savefig("./image{}_label{}".format(index, labels[index]))
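If you specifically want JPEG output rather than the PNG that plt.savefig produces by default, Pillow can write the pixel array directly (a minimal sketch continuing from the variables above; it assumes the pixel values are 0-255 grayscale, as in the sample data):
from PIL import Image

# mode "L" is 8-bit grayscale; Image.fromarray expects uint8 here
img = Image.fromarray(sample_image.astype(np.uint8), mode="L")
img.save("image{}_label{}.jpg".format(index, labels[index]))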