Splitting the lines of a CSV column across threads - csv

I'm trying to split the rows of a column in a CSV file so I can speed up the processing with threads, but I don't know how to do the splitting.
import csv
import pandas as pd
import threading

df = pd.read_csv("clearData.csv")
a = df["Product"].head(50)

def halfSplit():
    for i in a:
        for k in a:
            result = simTest(i,k)
            print(result,i,k)

def otherHalfSplit():
    for i in a:
        for k in a:
            result = simTest(i,k)
            print(result,i,k)

if __name__ =="__main__":
    t1 = threading.Thread(target=halfSplit, args=(10,))
    t2 = threading.Thread(target=otherHalfSplit, args=(10,))
    t1.start()
    t2.start()
    t1.join()
    t2.join()
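One way to do the split (a sketch, assuming simTest is defined elsewhere): give each thread its own half of the Series through args instead of two copies of the same full loop. Note that halfSplit and otherHalfSplit above take no parameters, so args=(10,) would raise a TypeError as written.

import threading
import pandas as pd

def compare_chunk(chunk, full_series):
    # Compare every item in this thread's chunk against the full column.
    for i in chunk:
        for k in full_series:
            result = simTest(i, k)  # simTest is assumed to be defined elsewhere
            print(result, i, k)

if __name__ == "__main__":
    df = pd.read_csv("clearData.csv")
    a = df["Product"].head(50)

    # Split the column in half; each worker gets one half plus the full column.
    mid = len(a) // 2
    t1 = threading.Thread(target=compare_chunk, args=(a.iloc[:mid], a))
    t2 = threading.Thread(target=compare_chunk, args=(a.iloc[mid:], a))
    t1.start()
    t2.start()
    t1.join()
    t2.join()

Also note that for a CPU-bound simTest the GIL usually prevents threads from giving a real speedup; multiprocessing or concurrent.futures.ProcessPoolExecutor may be worth trying instead.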


Code Workbooks - File not found using hadoop_path

I have a Python transform in Code Workbooks that is running this code:
import pandas as pd

def contents(dataset_with_files):
    fs = dataset_with_files.filesystem()
    filenames = [f.path for f in fs.ls()]
    fp = fs.hadoop_path + "/" + filenames[0]
    with open(fp, 'r') as f:
        t = f.read()
    rows = {"text": [t]}
    return pd.DataFrame(rows)
But I am getting the error FileNotFoundError: [Errno 2] No such file or directory:
My understanding is that this is the correct way to access a file in HDFS. Is this a Code Repositories versus Code Workbooks limitation?
This documentation helped me figure it out:
https://www.palantir.com/docs/foundry/code-workbook/transforms-unstructured/
It was actually a pretty small change: if you are using filesystem(), you only need the relative path.
import pandas as pd

def contents_old(pycel_test):
    fs = pycel_test.filesystem()
    filenames = [f.path for f in fs.ls()]
    with fs.open(filenames[0], 'r') as f:
        value = ...
    rows = {"values": [value]}
    return pd.DataFrame(rows)
There is also this option, but I found it 10x slower.
from pyspark.sql import Row

def contents(dataset_with_files):
    fs = dataset_with_files.filesystem()  # This is the FileSystem object.
    MyRow = Row("column")

    def process_file(file_status):
        with fs.open(file_status.path, 'r') as f:
            ...

    rdd = fs.files().rdd
    rdd = rdd.flatMap(process_file)
    df = rdd.toDF()
    return df

json to csv using python, json.loads and json_normalize function

I am trying to convert a JSON file to CSV format using Python. I am using the json.loads() function and then json_normalize() to flatten the objects. I was wondering if there is a better way of doing this.
This is the input file, one row from it:
{"ID": "02","Date": "2019-08-01","Total": 400,"QTY": 12,"Item": [{"NM": "0000000001","CD": "item_CD1","SRL": "25","Disc": [{"CD": "discount_CD1","Amount": 2}],"TxLns": {"TX": [{"TXNM": "000001-001","TXCD": "TX_CD1"}]}},{"NM": "0000000002","CD": "item_CD2","SRL": "26","Disc": [{"CD": "discount_CD2","Amount": 4}],"TxLns": {"TX": [{"TXNM": "000002-001","TXCD": "TX_CD2"}]}},{"NM": "0000000003","CD": "item_CD3","SRL": "27"}],"Cust": {"CustID": 10,"Email": "01#abc.com"},"Address": [{"FirstName": "firstname","LastName": "lastname","Address": "address"}]}
Code
import json
import pandas as pd
from pandas.io.json import json_normalize

data_final = pd.DataFrame()
with open("sample.json") as f:
    for line in f:
        json_obj = json.loads(line)
        ID = json_obj['ID']
        Item = json_obj['Item']
        dataMain = json_normalize(json_obj)
        dataMain = dataMain.drop(['Item','Address'], axis=1)
        #dataMain.to_csv("main.csv",index=False)
        dataItem = json_normalize(json_obj, 'Item', ['ID'])
        dataItem = dataItem.drop(['Disc','TxLns.TX'], axis=1)
        #dataItem.to_csv("Item.csv",index=False)
        dataDisc = pd.DataFrame()
        dataTx = pd.DataFrame()
        for rt in Item:
            NM = rt['NM']
            rt['ID'] = ID
            if 'Disc' in rt:
                data = json_normalize(rt, 'Disc', ['NM','ID'])
                dataDisc = dataDisc.append(data, sort=False)
            if 'TxLns' in rt:
                tx = rt['TxLns']
                tx['NM'] = NM
                tx['ID'] = ID
                if 'TX' in tx:
                    data = json_normalize(tx, 'TX', ['NM','ID'])
                    dataTx = dataTx.append(data, sort=False)
        dataDIS = pd.merge(dataItem, dataDisc, on=['NM','ID'], how='left')
        dataTX = pd.merge(dataDIS, dataTx, on=['NM','ID'], how='left')
        dataAddress = json_normalize(json_obj, 'Address', ['ID'])
        data_IT = pd.merge(dataMain, dataTX, on=['ID'])
        data_merge = pd.merge(data_IT, dataAddress, on=['ID'])
        data_final = data_final.append(data_merge, sort=False)

data_final = data_final.drop_duplicates(keep='first')
data_final.to_csv("data_merged.csv", index=False)
this is the output:
ID,Date,Total,QTY,Cust.CustID,Cust.Email,NM,CD_x,SRL,CD_y,Amount,TXNM,TXCD,FirstName,LastName,Address
02,2019-08-01,400,12,10,01#abc.com,0000000001,item_CD1,25,discount_CD1,2.0,000001-001,TX_CD1,firstname,lastname,address
02,2019-08-01,400,12,10,01#abc.com,0000000002,item_CD2,26,discount_CD2,4.0,000002-001,TX_CD2,firstname,lastname,address
02,2019-08-01,400,12,10,01#abc.com,0000000003,item_CD3,27,,,,,firstname,lastname,address
The code is working fine for now. By better I mean: is it efficient in terms of time and space complexity? If this code has to process around 10K records in a file, is this an optimized solution?
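A possible improvement (a sketch, assuming the record layout of the sample row above; the helper lists are illustrative): DataFrame.append copies the whole accumulated frame on every call, so appending inside the loop grows roughly quadratically with the number of records. Collecting plain dicts in lists and building each DataFrame exactly once keeps the work linear, and pd.json_normalize (the non-deprecated spelling of json_normalize) still handles the Cust.* flattening:

import json
import pandas as pd

main_rows, item_rows, disc_rows, tx_rows, addr_rows = [], [], [], [], []

with open("sample.json") as f:
    for line in f:
        obj = json.loads(line)
        rec_id = obj["ID"]
        # Header-level fields; Item and Address are handled separately below.
        main_rows.append({k: v for k, v in obj.items() if k not in ("Item", "Address")})
        addr_rows.extend({"ID": rec_id, **a} for a in obj.get("Address", []))
        for item in obj.get("Item", []):
            item_rows.append({"ID": rec_id, "NM": item["NM"],
                              "CD": item.get("CD"), "SRL": item.get("SRL")})
            disc_rows.extend({"ID": rec_id, "NM": item["NM"], **d}
                             for d in item.get("Disc", []))
            tx_rows.extend({"ID": rec_id, "NM": item["NM"], **t}
                           for t in item.get("TxLns", {}).get("TX", []))

# Each DataFrame is built exactly once, then merged as before.
dataMain = pd.json_normalize(main_rows)
dataItem = pd.DataFrame(item_rows)
dataDisc = pd.DataFrame(disc_rows)
dataTx = pd.DataFrame(tx_rows)
dataAddress = pd.DataFrame(addr_rows)

data_final = (dataItem
              .merge(dataDisc, on=["NM", "ID"], how="left")
              .merge(dataTx, on=["NM", "ID"], how="left")
              .merge(dataMain, on="ID")
              .merge(dataAddress, on="ID"))
data_final.to_csv("data_merged.csv", index=False)

For around 10K records either version will probably finish quickly, but the list-then-build pattern avoids the per-iteration copying and the deprecated DataFrame.append API.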

how to print output results in HDBSCAN

I have ASCII data and I need to cluster it using HDBSCAN.
I got the labels, but I don't know how to print the output cluster results, i.e. the unique and segregated results from hdbscan.
snippet:
import hdbscan
import numpy as np
datafile = "ascii.txt"
data = np.loadtxt(datafile, dtype = np.uint8)
clusterer = hdbscan.HDBSCAN(min_cluster_size = 20)
clusterer.fit(data)
print (np.unique(clusterer.labels_, return_counts = True))
You can use Pandas to read the file and then print out the cluster labels along with the dataset you have as the input. Try something like:
import hdbscan
import pandas as pd

df = pd.read_csv("ascii.txt")
# HDBSCAN expects 2-D input, hence the double brackets around the column name.
labels = hdbscan.HDBSCAN().fit_predict(df[["ColumnName"]])
df_pd = pd.DataFrame({'Datapoints': df["ColumnName"], 'Cluster Labels': labels})
import hdbscan
import numpy as np
import pandas as pd

datafile = "ascii.txt"
data = np.loadtxt(datafile, dtype=np.uint8)
Modified_data = pd.DataFrame(data)
clusterer = hdbscan.HDBSCAN(min_cluster_size=20)
clusterer.fit(Modified_data)
Modified_data['Clusters'] = clusterer.labels_
Now Modified_data is a pandas DataFrame with a column named "Clusters" that holds the cluster assigned to each instance.
You can manipulate this DataFrame however you need.
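For instance, to print the unique clusters and the segregated rows belonging to each one (a minimal sketch building on the Modified_data DataFrame above; label -1 is HDBSCAN's noise label):

# Unique cluster labels and their sizes (-1 denotes noise points).
print(Modified_data['Clusters'].value_counts().sort_index())

# Segregated results: the rows that fall into each cluster.
for label, members in Modified_data.groupby('Clusters'):
    name = "noise" if label == -1 else f"cluster {label}"
    print(f"--- {name} ({len(members)} points) ---")
    print(members.drop(columns='Clusters'))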

Can I export a tensorflow summary to CSV?

Is there a way to extract scalar summaries to CSV (preferably from within tensorboard) from tfevents files?
Example code
The following code generates tfevent files in a summary_dir within the same directory. Suppose you let it run and you find something interesting. You want to get the raw data for further investigation. How would you do that?
#!/usr/bin/env python
"""A very simple MNIST classifier."""
import argparse
import sys

from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf

ce_with_logits = tf.nn.softmax_cross_entropy_with_logits

FLAGS = None


def inference(x):
    """
    Build the inference graph.

    Parameters
    ----------
    x : placeholder

    Returns
    -------
    Output tensor with the computed logits.
    """
    W = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    y = tf.matmul(x, W) + b
    return y


def loss(logits, labels):
    """
    Calculate the loss from the logits and the labels.

    Parameters
    ----------
    logits : Logits tensor, float - [batch_size, NUM_CLASSES].
    labels : Labels tensor, int32 - [batch_size]
    """
    cross_entropy = tf.reduce_mean(ce_with_logits(labels=labels,
                                                  logits=logits))
    return cross_entropy


def training(loss, learning_rate=0.5):
    """
    Set up the training Ops.

    Parameters
    ----------
    loss : Loss tensor, from loss().
    learning_rate : The learning rate to use for gradient descent.

    Returns
    -------
    train_op: The Op for training.
    """
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_step = optimizer.minimize(loss)
    return train_step


def main(_):
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

    # Create the model
    x = tf.placeholder(tf.float32, [None, 784])
    y = inference(x)

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 10])
    loss_ = loss(logits=y, labels=y_)
    train_step = training(loss_)

    # Test trained model
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    with tf.name_scope('accuracy'):
        tf.summary.scalar('accuracy', accuracy)
    merged = tf.summary.merge_all()

    sess = tf.InteractiveSession()
    train_writer = tf.summary.FileWriter('summary_dir/train', sess.graph)
    test_writer = tf.summary.FileWriter('summary_dir/test', sess.graph)
    tf.global_variables_initializer().run()

    for train_step_i in range(100000):
        if train_step_i % 100 == 0:
            summary, acc = sess.run([merged, accuracy],
                                    feed_dict={x: mnist.test.images,
                                               y_: mnist.test.labels})
            test_writer.add_summary(summary, train_step_i)
            summary, acc = sess.run([merged, accuracy],
                                    feed_dict={x: mnist.train.images,
                                               y_: mnist.train.labels})
            train_writer.add_summary(summary, train_step_i)
        batch_xs, batch_ys = mnist.train.next_batch(100)
        sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

    print(sess.run(accuracy, feed_dict={x: mnist.test.images,
                                        y_: mnist.test.labels}))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir',
                        type=str,
                        default='/tmp/tensorflow/mnist/input_data',
                        help='Directory for storing input data')
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
While the answer here does what was asked within TensorBoard, it only lets you download a CSV for a single run of a single tag.
If you have, for example, 10 tags and 20 runs (which is not much at all), you would need to repeat that step 200 times (that alone will probably take you more than an hour).
If you then want to actually do something with the data for all runs of a single tag, you would need to write some awkward CSV-accumulation script or copy everything by hand (which will probably cost you more than a day).
Therefore I would like to add a solution that extracts a CSV file for every tag, with all runs contained. Column headers are the run path names and row indices are the run step numbers.
import os
import numpy as np
import pandas as pd

from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator


def tabulate_events(dpath):
    summary_iterators = [EventAccumulator(os.path.join(dpath, dname)).Reload() for dname in os.listdir(dpath)]

    tags = summary_iterators[0].Tags()['scalars']

    for it in summary_iterators:
        assert it.Tags()['scalars'] == tags

    out = defaultdict(list)
    steps = []

    for tag in tags:
        steps = [e.step for e in summary_iterators[0].Scalars(tag)]

        for events in zip(*[acc.Scalars(tag) for acc in summary_iterators]):
            assert len(set(e.step for e in events)) == 1

            out[tag].append([e.value for e in events])

    return out, steps


def to_csv(dpath):
    dirs = os.listdir(dpath)

    d, steps = tabulate_events(dpath)
    tags, values = zip(*d.items())
    np_values = np.array(values)

    for index, tag in enumerate(tags):
        df = pd.DataFrame(np_values[index], index=steps, columns=dirs)
        df.to_csv(get_file_path(dpath, tag))


def get_file_path(dpath, tag):
    file_name = tag.replace("/", "_") + '.csv'
    folder_path = os.path.join(dpath, 'csv')
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    return os.path.join(folder_path, file_name)


if __name__ == '__main__':
    path = "path_to_your_summaries"
    to_csv(path)
My solution builds upon: https://stackoverflow.com/a/48774926/2230045
EDIT:
I created a more sophisticated version and released it on GitHub: https://github.com/Spenhouet/tensorboard-aggregator
This version aggregates multiple tensorboard runs and is able to save the aggregates to a new tensorboard summary or as a .csv file.
Just check the "Data download links" option on the upper-left in TensorBoard, and then click on the "CSV" button that will appear under your scalar summary.
Here is my solution, which builds on the previous solutions but can scale up.
import os
import numpy as np
import pandas as pd

from tensorboard.backend.event_processing.event_accumulator import EventAccumulator


def tabulate_events(dpath):
    final_out = {}
    for dname in os.listdir(dpath):
        print(f"Converting run {dname}", end="")
        ea = EventAccumulator(os.path.join(dpath, dname)).Reload()
        tags = ea.Tags()['scalars']

        out = {}

        for tag in tags:
            tag_values = []
            wall_time = []
            steps = []

            for event in ea.Scalars(tag):
                tag_values.append(event.value)
                wall_time.append(event.wall_time)
                steps.append(event.step)

            out[tag] = pd.DataFrame(data=dict(zip(steps, np.array([tag_values, wall_time]).transpose())),
                                    columns=steps, index=['value', 'wall_time'])

        if len(tags) > 0:
            df = pd.concat(out.values(), keys=out.keys())
            df.to_csv(f'{dname}.csv')
            print("- Done")
            final_out[dname] = df  # only runs that actually have scalars end up in the result
        else:
            print('- No scalars to write')

    return final_out


if __name__ == '__main__':
    path = "your/path/here"
    steps = tabulate_events(path)
    pd.concat(steps.values(), keys=steps.keys()).to_csv('all_result.csv')
Very minimal example:
import pandas as pd
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
log_dir = "lightning_logs/version_1"
event_accumulator = EventAccumulator(log_dir)
event_accumulator.Reload()
events = event_accumulator.Scalars("train_loss")
x = [x.step for x in events]
y = [x.value for x in events]
df = pd.DataFrame({"step": x, "train_loss": y})
df.to_csv("train_loss.csv")
print(df)
   step  train_loss
0     0  700.491516
1     1  163.593246
2     2  146.365448
3     3  153.830215
...
Plotting loss vs epochs example:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
log_dir = "lightning_logs/version_1"
y_key = "val_loss"
event_accumulator = EventAccumulator(log_dir)
event_accumulator.Reload()
steps = {x.step for x in event_accumulator.Scalars("epoch")}
x = list(range(len(steps)))
y = [x.value for x in event_accumulator.Scalars(y_key) if x.step in steps]
df = pd.DataFrame({"epoch": x, y_key: y})
df.to_csv(f"{y_key}.csv")
fig, ax = plt.subplots()
sns.lineplot(data=df, x="epoch", y=y_key)
fig.savefig("plot.png", dpi=300)
Just to add to @Spen's answer, in case you want to export the data when you have varying numbers of steps.
This will make one large CSV file.
You might need to change the keys around for it to work for you.
import glob
import os

import numpy as np
import pandas as pd
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

listOutput = glob.glob("*/")

listDF = []

for tb_output_folder in listOutput:
    print(tb_output_folder)
    x = EventAccumulator(path=tb_output_folder)
    x.Reload()
    x.FirstEventTimestamp()
    keys = ['loss', 'mean_absolute_error', 'val_loss', 'val_mean_absolute_error']

    listValues = {}

    steps = [e.step for e in x.Scalars(keys[0])]
    wall_time = [e.wall_time for e in x.Scalars(keys[0])]
    n_steps = len(steps)
    listRun = [tb_output_folder] * n_steps
    printOutDict = {}

    data = np.zeros((n_steps, len(keys)))
    for i in range(len(keys)):
        data[:, i] = [e.value for e in x.Scalars(keys[i])]

    printOutDict = {keys[0]: data[:, 0], keys[1]: data[:, 1],
                    keys[2]: data[:, 2], keys[3]: data[:, 3]}
    printOutDict['Name'] = listRun

    DF = pd.DataFrame(data=printOutDict)

    listDF.append(DF)

df = pd.concat(listDF)
df.to_csv('Output.csv')

Python folder contents CSV writer

I'm trying to make a simple command line script with Python that generates a CSV when it scans the contents of a directory, but I'm not sure if I'm doing it correctly, because I keep getting errors. Can someone tell me what the heck I'm doing wrong?
import sys
import argparse
import os
import string
import fnmatch
import csv
from string import Template
from os import path
from os.path import basename

header = ["Title","VersionData","PathOnClient","OwnerId","FirstPublishLocationId","RecordTypeId","TagsCsv"]

if not sys.argv.len < 2:
    with open(sys.argv[1], 'w') as f:
        writer = csv.DictWriter(f, fieldnames = header, delimiter=',')
        writer.writeheader()
        if os.path.isdir(sys.argv[2]):
            for d in os.scandir(sys.argv[2]):
                row = Template('"$title","$path","$path"') #some default values in the template were omitted here
                writer.writerow(row.substitute(title=basename(d.path)), path=path.abspath(d.path))
Right off the bat, csvwriter.writerow(row) takes only one argument, so you need to wrap your values into a single row object rather than passing them separately; as written, the closing parenthesis is misplaced and path=... is handed to writerow() instead of to substitute().
Moreover, you cannot call other functions within the row object, which is what you are trying to do with row.substitute(args) etc.
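With csv.DictWriter each row is just one dict keyed by the fieldnames, so the string Template isn't needed at all. A minimal sketch of the loop (mapping $title and $path onto the Title, VersionData and PathOnClient columns is only a guess based on the template above):

import csv
import os
import sys
from os.path import basename

header = ["Title", "VersionData", "PathOnClient", "OwnerId",
          "FirstPublishLocationId", "RecordTypeId", "TagsCsv"]

with open(sys.argv[1], 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=header)
    writer.writeheader()
    for d in os.scandir(sys.argv[2]):
        # One dict per row; keys must match the fieldnames, and any
        # missing fields are filled with restval (an empty string by default).
        writer.writerow({
            "Title": basename(d.path),
            "VersionData": os.path.abspath(d.path),
            "PathOnClient": os.path.abspath(d.path),
        })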
Figured it out. For anyone else needing a quick CSV listing of folders, here's the code I got to work:
#!/usr/bin/env python3
import sys, os, csv
from string import Template
from pathlib import PurePath, PureWindowsPath
from os.path import basename

header = ["Title","Path","","","","",""] # insert what header you need, if any

if not len(sys.argv) < 2:
    with open(sys.argv[1], 'w') as f:
        writer = csv.DictWriter(f, fieldnames=header, dialect='excel', delimiter=',', quoting=csv.QUOTE_ALL)
        writer.writeheader()
        initPath = os.path.abspath(sys.argv[2])
        if sys.platform.startswith('linux') or sys.platform.startswith('cygwin') or sys.platform.startswith('darwin'):
            p = PurePath(initPath)
        else:
            if sys.platform.startswith('win32'):
                p = PureWindowsPath(initPath)
        if os.path.isdir(str(p)) and not str(p).startswith('.'):
            for d in os.scandir(str(p)):
                srow = Template('"$title","$path", "","","",""')
                #s = srow.substitute({'title': basename(d.path), 'path': os.path.abspath(d.path)) #
                #print(s) # this is for testing if the content produces what's expected
                row = {'Title': basename(d.path), 'Path': os.path.abspath(d.path)} # the dictionary must have the same number of entries as the number of header fields your CSV is going to contain.
                writer.writerow(row)
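For reference, the script takes the output CSV as its first argument and the directory to scan as its second, so a run looks something like this (the script name is just a placeholder):

python3 folder_contents.py output.csv /path/to/folder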