What's wrong with this Python script (function)?

I have a simple script which returns a lot of errors:
import numpy as np
def test(array):
ncol=np.shape(array)[1]
return ncol
which is supposed to return the number of columns of array. What is wrong with it?
array is numpy array. Here is the output:
ncol=np.shape(array)[1]
Display all 195 possibilities? (y or n)
ArithmeticError( continue
AssertionError( copyright(
AttributeError( credits(
BaseException( def
BlockingIOError( del

You need to wrap ncol=np.shape(array)[1] in a try/except block, because it fails when the array is one-dimensional:
import numpy as np
arr = np.random.normal(size=10)
arr1 = np.random.normal(size=(10,5))
def test(array):
try:
ncol=np.shape(array)[1]
return ncol
except Exception as e:
print("no columns in array")
return None
print(test(arr))
# output:
# no columns in array
# None
print(test(arr1))
# output:
# 5

Related

TypeError: document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, a type that inherits from collections.MutableMapping

I am trying to write data into MongoDB with pymongo and this is the TypeError that I am getting. The type of mydict1 is list. Do I have to convert my data into JSON or BSON before I write it with pymongo? Kindly help.
Thanks.
from numpy.polynomial import Polynomial as poly
import numpy as np
import matplotlib.pyplot as plt
import pymongo
import json
import pandas as pd
# Load the sample points; the columns 'Wavelength(A)' and 'Level(A)' are
# read from the CSV below.
df = pd.read_csv(r'D:\polynomial\points.csv')
print(df)
x = np.array(df['Wavelength(A)'].tolist())
x = np.divide([299792.458], x)
y = np.array(df['Level(A)'].tolist())

# Drop every sample whose level is below 1e-4 before fitting.
below_threshold = np.where(y < 1e-4)
x_trimmed = np.delete(x, below_threshold)
y_trimmed = np.delete(y, below_threshold)

# Fit a degree-10 polynomial and extract its coefficients.
test = poly.fit(x_trimmed, y_trimmed, 10)
print(test)
list1 = test.convert().coef
print(list1)
print(len(list1))
to_list = list1.tolist()
l = len(to_list)

# One single-key dict per coefficient: [{"a0": c0}, {"a1": c1}, ...].
mydict1 = []
for i, coefficient in enumerate(to_list):
    mydict = {"a" + str(i): coefficient}
    mydict1.append(mydict)
print(mydict1)

myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["mydatabase"]
mycol = mydb["coefficients"]
# NOTE: mydict1 is a list, not a single mapping; this is the call the
# question is about (it raises the TypeError quoted in the title).
x = mycol.insert_one(mydict1)
This is mydict1=
[{'a0': -2.3373800910827825e+34}, {'a1': 1.2084654060419298e+33}, {'a2': -2.811587585787653e+31}, {'a3': 3.876370042231405e+29}, {'a4': -3.507261557232249e+27}, {'a5': 2.1759768836934694e+25}, {'a6': -9.37514311649608e+22}, {'a7': 2.7697765301392782e+20}, {'a8': -5.370081422614614e+17}, {'a9': 616983041924503.2}, {'a10': -318990754999.1472}]
The problem is that MongoDB's insert_one method inserts a single document that is represented by a dictionary, not a list.
The possible solutions are:
use insert_many instead. In this case, you will have every list item as a separate mongodb document
make a dict with your list values. You can use something like {"items": mydict1}, or reduce(lambda x, y: x | y, mydict1) depending on the document structure that will be better for your needs

@cuda.jit(device=True) returns 'DeviceFunctionTemplate' object is not callable

I'm working on a math method, and to reduce execution time I use the numba decorator:
@numba.jit(nopython=True, nogil=True, cache=True)
def analize_tick(data: np.ndarray, index: int, result_signal: np.ndarray) -> None:
    """Analyse one tick and report the outcome through ``result_signal``.

    Nothing is returned; the result is written in place into
    ``result_signal[0]``.
    """
    # The actual analysis is omitted in this sample; only the result
    # hand-off is shown.
    result_signal[0] = 1
It works OK, but when I changed the decorator from @numba.jit(nopython=True, nogil=True, cache=True) to @cuda.jit(device=True) I got the error: 'DeviceFunctionTemplate' object is not callable
Could you advise me how to fix this issue?
BTW, the method receives three arguments:
numpy 2 dimensional float array
int index
numpy 1 dimensional int array where I return result
UPDATED to add code sample:
import unittest
import pandas as pd
import numpy as np
import numba
from numba import cuda
@numba.jit(nopython=True, nogil=True, cache=True)
# @cuda.jit(device=True)
def calculate(data: np.ndarray, index: int, options: np.ndarray, result_signal: np.ndarray) -> None:
    """Store ``data[0] + data[1]`` in ``result_signal[0]``.

    ``index`` and ``options`` are accepted but not used by this sample.
    The commented-out decorator above is the CUDA variant that triggers
    the error discussed in the question.
    """
    i = data[0]
    b = data[1]
    result_signal[0] = i + b
@numba.jit(nopython=True, nogil=True, cache=True)
# @cuda.jit(device=True)
def for_each(data: np.ndarray, options: np.ndarray, result: np.ndarray) -> None:
    """Call ``calculate`` once for every row of ``data``.

    Each call writes into the same ``result`` array, so after the loop it
    holds the value produced for the last row.
    """
    for index, r in enumerate(data):
        calculate(r, index, options, result)
        # print(result[0])
class cuda_test(unittest.TestCase):
    """Drives ``for_each`` over a tiny two-row data set."""

    def test_numba_call(self):
        """Run ``for_each`` once for each sigma value in 0..9."""
        frame = pd.DataFrame([[1, 1], [2, 2]], columns=['c0', 'c1'])
        data = frame.to_numpy()
        result = np.array([0], dtype=float)
        options = np.array([0], dtype=float)
        for sigma in range(10):
            options[0] = sigma
            for_each(data, options, result)
Could you advise me how to fix this issue?
There is no way to fix this. What you are trying to do is impossible.
When you decorate a function like this:
@cuda.jit(device=True)
def for_each(data:np.array,options:np.array, result:np.array) -> None:
    # Decorated as a CUDA *device* function: it can only be called from a
    # CUDA kernel or from another device function, never from host code.
    for index, r in enumerate(data):
        calculate(r, index, options, result)
you are denoting that the function is only available to be called by CUDA kernels or other device functions. You are not calling it within a CUDA kernel or device function. There is no way to change this behaviour, it is a limitation of the language.

How to convert this json file to pandas dataframe

The format in the file looks like this
{ 'match' : 'a', 'score' : '2'},{......}
I've tried pd.DataFrame and I've also tried reading it by line but it gives me everything in one cell
I'm new to python
Thanks in advance
Expected result is a pandas dataframe
Try using the json_normalize() function.
Example:
import pandas as pd

# json_normalize is exposed at the top level of pandas; the older
# `pandas.io.json` import path is deprecated.
# Each dict becomes one row and the dict keys become the column names.
values = [{'match': 'a', 'score': '2'}, {'match': 'b', 'score': '3'}, {'match': 'c', 'score': '4'}]
df = pd.json_normalize(values)
print(df)
Output:
If one line of your file corresponds to one JSON object, you can do the following:
# import library for working with JSON and pandas
import json
import pandas as pd
# parse every non-blank line of the file as one JSON object
# (blank lines, e.g. a trailing newline at the end of the file, are
# skipped because json.loads() would raise on them)
with open("/path/to/your/file", "r", encoding="utf-8") as file:
    data = [json.loads(line) for line in file if line.strip()]
# make a pandas data frame: one row per JSON object
df = pd.DataFrame(data)
If there is more than only one JSON object on one row of your file, then you should find those JSON objects, for example here are two possible options. The solution with the second option would look like this:
# import all you will need
import pandas as pd
import json
from json import JSONDecoder
# define function
def extract_json_objects(text, decoder=JSONDecoder()):
    """Yield every JSON object embedded in ``text``, in order of appearance.

    The text is scanned for ``{``; at each candidate the decoder tries to
    parse a complete JSON value. On success the parsed object is yielded
    and scanning resumes right after it; on failure the scan moves on to
    the next ``{``. Anything between objects is ignored.

    Parameters
    ----------
    text : str
        Text that may contain zero or more JSON objects.
    decoder : json.JSONDecoder
        Decoder used for parsing.

    Yields
    ------
    dict
        Each successfully decoded object.
    """
    pos = 0
    while True:
        match = text.find('{', pos)
        if match == -1:
            break
        try:
            # Decode in place starting at ``match`` rather than slicing
            # ``text[match:]``, which would copy the rest of the string on
            # every hit. The returned end index is absolute within ``text``.
            result, pos = decoder.raw_decode(text, match)
        except ValueError:
            # Not a valid object here; look for the next opening brace.
            pos = match + 1
        else:
            yield result
# collect every JSON object found in the file, line by line
data = []
with open("/path/to/your/file", "r") as file:
    for line in file:
        data.extend(extract_json_objects(line))
# make a pandas data frame: one row per JSON object
df = pd.DataFrame(data)

Convert json file into csv encode/decode problems

My semester project is about classification using Naive Bayes. I've decided to use the Yelp dataset. While converting the JSON file into a CSV file I ran into a couple of problems, such as:
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
It's because of the wrong usage of json.loads(). I tried a couple of different ways of calling the function to handle this part of the program. Unfortunately, none of them worked. I put my code down below; if you have any idea how to handle this, can you please explain it to me?
`
import json
import pandas as pd
from glob import glob
import codecs
global df
global s
global count
def convert(x):
    """Parse the JSON object in ``x`` and flatten it by one level.

    * list values are joined into one comma-separated string;
    * dict values are expanded into ``<key>_<subkey>`` entries and the
      original key is dropped;
    * every other value is kept unchanged.

    Parameters
    ----------
    x : str
        JSON text holding a single object.

    Returns
    -------
    dict
        The flattened object.

    Raises
    ------
    json.JSONDecodeError
        If ``x`` is not valid JSON (this includes the empty string).
    """
    # Build a new dict instead of adding/deleting keys on the parsed one
    # while iterating over it, which raises RuntimeError.
    flat = {}
    for key, value in json.loads(x).items():
        if isinstance(value, list):
            # str() each item so lists of numbers can be joined as well.
            flat[key] = ','.join(map(str, value))
        elif isinstance(value, dict):
            for sub_key, sub_value in value.items():
                flat['%s_%s' % (key, sub_key)] = sub_value
        else:
            flat[key] = value
    return flat
# Script part of the question: converts JSON input to CSV through convert().
# NOTE(review): the original indentation of this snippet was lost, so the
# nesting of the statements below cannot be confirmed from the text alone.
s = ""
count = 0
for json_filename in glob('*.json'):
# Output name: the input name with its trailing ".json" replaced by ".csv".
csv_filename = '%s.csv' % json_filename[:-5]
print('Converting %s to %s' % (json_filename, csv_filename))
# NOTE(review): a fixed file name is opened here rather than json_filename.
with open('yelp_dataset_challenge_round9.json','rb') as f: #open in binary mode
for line in f:
# NOTE(review): cp is never used; the line is always decoded as UTF-8.
for cp in ('cp1252', 'cp850'):
try:
# The first pass only flips count and leaves s unchanged (initially "").
# NOTE(review): `is` compares identity; `==` is the usual integer test.
if count is 0:
count = 1
else:
s = str(line.decode('utf-8'))
except UnicodeDecodeError:
# Undecodable lines are skipped silently; s keeps its previous value.
pass
# NOTE(review): if s is still "" when this runs, json.loads("") inside
# convert() raises "Expecting value: line 1 column 1 (char 0)" -- presumably
# the error quoted in the question; confirm against the original nesting.
df = pd.DataFrame([convert(s)])
df.to_csv(csv_filename, encoding='utf-8', index=False)
`
Thanks in advance :)

Can I export a tensorflow summary to CSV?

Is there a way to extract scalar summaries to CSV (preferably from within tensorboard) from tfevents files?
Example code
The following code generates tfevent files in a summary_dir within the same directory. Suppose you let it run and you find something interesting. You want to get the raw data for further investigation. How would you do that?
#!/usr/bin/env python
"""A very simple MNIST classifier."""
import argparse
import sys
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
ce_with_logits = tf.nn.softmax_cross_entropy_with_logits
FLAGS = None
def inference(x):
    """
    Build the inference graph: a single linear layer ``x @ W + b``.

    Parameters
    ----------
    x : placeholder
        Input that is multiplied with a zero-initialised [784, 10] weight
        variable.

    Returns
    -------
    Output tensor with the computed logits.
    """
    weights = tf.Variable(tf.zeros([784, 10]))
    bias = tf.Variable(tf.zeros([10]))
    return tf.matmul(x, weights) + bias
def loss(logits, labels):
    """
    Calculate the mean softmax cross-entropy between logits and labels.

    Parameters
    ----------
    logits : Logits tensor.
    labels : Labels tensor, forwarded as ``labels=`` to
        ``tf.nn.softmax_cross_entropy_with_logits``.

    Returns
    -------
    Scalar tensor holding the mean cross-entropy.
    """
    per_example = ce_with_logits(labels=labels, logits=logits)
    return tf.reduce_mean(per_example)
def training(loss, learning_rate=0.5):
    """
    Set up the training Ops.

    Parameters
    ----------
    loss : Loss tensor, from loss().
    learning_rate : The learning rate to use for gradient descent.

    Returns
    -------
    train_op: The Op that performs one gradient-descent step on ``loss``.
    """
    return tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
def main(_):
    """Train the classifier on MNIST and log accuracy summaries.

    Every 100th step the accuracy summary is evaluated on the test and on
    the train images and written to ``summary_dir/test`` and
    ``summary_dir/train``. Each step then trains on one batch of 100
    samples. The final test accuracy is printed at the end.

    NOTE(review): the snippet's indentation was lost; the nesting of the
    training loop below is the most plausible reconstruction.
    """
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

    # Create the model
    x = tf.placeholder(tf.float32, [None, 784])
    y = inference(x)

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 10])
    loss_ = loss(logits=y, labels=y_)
    train_step = training(loss_)

    # Test trained model
    is_correct = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))
    with tf.name_scope('accuracy'):
        tf.summary.scalar('accuracy', accuracy)
    merged = tf.summary.merge_all()

    sess = tf.InteractiveSession()
    train_writer = tf.summary.FileWriter('summary_dir/train', sess.graph)
    test_writer = tf.summary.FileWriter('summary_dir/test', sess.graph)
    tf.global_variables_initializer().run()

    for train_step_i in range(100000):
        if train_step_i % 100 == 0:
            # Test split first, then train split.
            for writer, split in ((test_writer, mnist.test),
                                  (train_writer, mnist.train)):
                summary, acc = sess.run([merged, accuracy],
                                        feed_dict={x: split.images,
                                                   y_: split.labels})
                writer.add_summary(summary, train_step_i)
        batch_xs, batch_ys = mnist.train.next_batch(100)
        sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

    final_feed = {x: mnist.test.images, y_: mnist.test.labels}
    print(sess.run(accuracy, feed_dict=final_feed))
if __name__ == '__main__':
    # Only --data_dir is parsed here; unrecognised arguments are handed on
    # to tf.app.run together with the program name.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--data_dir',
        type=str,
        default='/tmp/tensorflow/mnist/input_data',
        help='Directory for storing input data',
    )
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0], *unparsed])
While the answer here is as requested within tensorboard it only allows to download a csv for a single run of a single tag.
If you have, for example, 10 tags and 20 runs (which is not much at all), you would need to do the above step 200 times (that alone will probably take you more than an hour).
If you then want to actually do something with the data for all runs of a single tag, you would need to write some awkward CSV accumulation script or copy everything by hand (which will probably cost you more than a day).
Therefore I would like to add a solution that extracts a CSV file for every tag with all runs contained. Column headers are the run path names and row indices are the run step numbers.
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
def tabulate_events(dpath):
    """Collect the scalar values of every run found directly under ``dpath``.

    Every entry of ``dpath`` is loaded as one run. All runs must expose the
    same scalar tags, and for each tag the i-th events of all runs must
    share the same step (both are checked with ``assert``).

    Returns
    -------
    (out, steps)
        ``out`` maps each tag to a list of rows, one row per event index,
        each row holding one value per run. ``steps`` holds the step
        numbers of the first run for the last tag processed.
    """
    accumulators = []
    for dname in os.listdir(dpath):
        accumulators.append(EventAccumulator(os.path.join(dpath, dname)).Reload())

    tags = accumulators[0].Tags()['scalars']
    for acc in accumulators:
        assert acc.Tags()['scalars'] == tags

    out = defaultdict(list)
    steps = []
    for tag in tags:
        steps = [e.step for e in accumulators[0].Scalars(tag)]
        per_run_events = [acc.Scalars(tag) for acc in accumulators]
        # zip() walks the runs in lock-step: one tuple of events per index.
        for events in zip(*per_run_events):
            assert len(set(e.step for e in events)) == 1
            out[tag].append([e.value for e in events])
    return out, steps
def to_csv(dpath):
    """Write one CSV file per scalar tag for the runs under ``dpath``.

    In each file the columns are the entries of ``dpath`` and the row index
    is the list of steps returned by ``tabulate_events``.
    """
    dirs = os.listdir(dpath)
    d, steps = tabulate_events(dpath)
    tags, values = zip(*d.items())
    # Stacking gives one 2-D table (events x runs) per tag.
    for tag, table in zip(tags, np.array(values)):
        frame = pd.DataFrame(table, index=steps, columns=dirs)
        frame.to_csv(get_file_path(dpath, tag))
def get_file_path(dpath, tag):
    """Return the CSV path for ``tag`` inside ``<dpath>/csv``.

    Slashes in the tag are replaced by underscores so the tag maps to a
    single file name. The ``csv`` folder is created if it does not exist.
    """
    file_name = tag.replace("/", "_") + '.csv'
    folder_path = os.path.join(dpath, 'csv')
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder_path, exist_ok=True)
    return os.path.join(folder_path, file_name)
if __name__ == '__main__':
path = "path_to_your_summaries"
to_csv(path)
My solution builds upon: https://stackoverflow.com/a/48774926/2230045
EDIT:
I created a more sophisticated version and released it on GitHub: https://github.com/Spenhouet/tensorboard-aggregator
This version aggregates multiple tensorboard runs and is able to save the aggregates to a new tensorboard summary or as a .csv file.
Just check the "Data download links" option on the upper-left in TensorBoard, and then click on the "CSV" button that will appear under your scalar summary.
Here is my solution, which is based on the previous solutions but scales better.
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
def tabulate_events(dpath):
    """Export the scalars of every run under ``dpath`` to ``<run>.csv``.

    For each entry of ``dpath`` the scalar events are loaded and one
    DataFrame per tag is built, with rows ``value`` and ``wall_time`` and
    one column per step. The per-tag frames are concatenated (keyed by tag)
    and written to ``<run>.csv`` in the current working directory.

    Returns
    -------
    dict
        Maps each run name that contained scalars to its concatenated
        DataFrame. Runs without scalars are reported and left out, instead
        of being stored with an undefined or previous run's DataFrame.
    """
    final_out = {}
    for dname in os.listdir(dpath):
        print(f"Converting run {dname}", end="")
        ea = EventAccumulator(os.path.join(dpath, dname)).Reload()
        tags = ea.Tags()['scalars']
        if not tags:
            print('- Not scalers to write')
            continue

        out = {}
        for tag in tags:
            events = ea.Scalars(tag)
            tag_values = [event.value for event in events]
            wall_time = [event.wall_time for event in events]
            steps = [event.step for event in events]
            # One column per step, each holding [value, wall_time].
            per_step = dict(zip(steps, np.array([tag_values, wall_time]).transpose()))
            out[tag] = pd.DataFrame(data=per_step, columns=steps, index=['value', 'wall_time'])

        df = pd.concat(out.values(), keys=out.keys())
        df.to_csv(f'{dname}.csv')
        print("- Done")
        final_out[dname] = df
    return final_out
if __name__ == '__main__':
path = "youre/path/here"
steps = tabulate_events(path)
pd.concat(steps.values(),keys=steps.keys()).to_csv('all_result.csv')
Very minimal example:
import pandas as pd
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
# Load the events of one run and pull out the "train_loss" scalar.
log_dir = "lightning_logs/version_1"
event_accumulator = EventAccumulator(log_dir)
event_accumulator.Reload()
events = event_accumulator.Scalars("train_loss")

# One row per event: its step and its value.
x = [event.step for event in events]
y = [event.value for event in events]
df = pd.DataFrame({"step": x, "train_loss": y})
df.to_csv("train_loss.csv")
print(df)
step train_loss
0 0 700.491516
1 1 163.593246
2 2 146.365448
3 3 153.830215
...
Plotting loss vs epochs example:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
log_dir = "lightning_logs/version_1"
y_key = "val_loss"

event_accumulator = EventAccumulator(log_dir)
event_accumulator.Reload()

# Steps at which an "epoch" scalar was logged; only y_key events recorded
# at one of those steps are kept.
steps = {event.step for event in event_accumulator.Scalars("epoch")}
x = list(range(len(steps)))
y = [event.value for event in event_accumulator.Scalars(y_key) if event.step in steps]

df = pd.DataFrame({"epoch": x, y_key: y})
df.to_csv(f"{y_key}.csv")

fig, ax = plt.subplots()
sns.lineplot(data=df, x="epoch", y=y_key)
fig.savefig("plot.png", dpi=300)
Just to add to @Spen's answer,
in case you want to export the data when you have varying numbers of steps.
This will make one large csv file.
Might need to change around the keys for it to work for you.
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
import glob
import pandas as pd
# Every sub-directory of the current directory is treated as one run.
listOutput = glob.glob("*/")
listDF = []
# Scalar tags to export; adjust these to the tags your runs contain.
keys = ['loss', 'mean_absolute_error', 'val_loss', 'val_mean_absolute_error']
for tb_output_folder in listOutput:
    print(tb_output_folder)
    x = EventAccumulator(path=tb_output_folder)
    x.Reload()
    x.FirstEventTimestamp()

    # The number of rows is taken from the first tag.
    n_steps = len(x.Scalars(keys[0]))

    # One column per tag. The assignment raises if a tag has a different
    # number of events than the first one.
    data = np.zeros((n_steps, len(keys)))
    for i, key in enumerate(keys):
        data[:, i] = [e.value for e in x.Scalars(key)]

    printOutDict = {key: data[:, i] for i, key in enumerate(keys)}
    # Tag every row with the run it came from so the runs stay
    # distinguishable after concatenation.
    printOutDict['Name'] = [tb_output_folder] * n_steps
    listDF.append(pd.DataFrame(data=printOutDict))

df = pd.concat(listDF)
df.to_csv('Output.csv')