How to use binary data in SQLAlchemy? - sqlalchemy

How does one use binary data (BLOB type column) in SQLAlchemy.
I just created a table with fields key, val, where val is BLOB and when I query the table, SQLAlchemy returns:
<read-only buffer for 0x83c3040, size -1, offset 0 at 0x83c3120>
How do I use this read-only buffer?

You can iterate over it (e.g. for streaming it) or convert it to a string/binary if you want to have the whole binary in memory (which shouldn't be a problem as long as you are not dealing with movies in the database...)
>>> from sqlalchemy.util import buffer
>>> var = buffer('foo')
>>> var
<read-only buffer for 0xb727fb00, size -1, offset 0 at 0xb727fa80>
>>> str(var)
'foo'
>>> for i in var:
... print i
...
f
o
o
Regards,
Christoph

In SQLAlchemy 1.3, using Python2, binary data is returned as str, in Python 3 it is bytes:
# -*- coding: utf-8 -*-
import zlib
import sqlalchemy as sa
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import orm
Base = declarative_base()
class Blobby(Base):
__tablename__ = 'blobby'
id = sa.Column(sa.Integer, primary_key=True)
blob = sa.Column(sa.BLOB)
engine = sa.create_engine('mysql+pymysql:///test', echo=True)
Base.metadata.drop_all(bind=engine, checkfirst=True)
Base.metadata.create_all(bind=engine)
Session = orm.sessionmaker(bind=engine)
session = Session()
data = zlib.compress('Hello world'.encode('ascii'))
session.add(Blobby(blob=data))
session.commit()
blob, = session.query(Blobby.blob).first()
print(type(blob), blob)
session.close()
Python2 output
(<type 'str'>, 'x\x9c\xf3H\xcd\xc9\xc9W(\xcf/\xcaI\x01\x00\x18\xab\x04=')
Python3 output:
<class 'bytes'> b'x\x9c\xf3H\xcd\xc9\xc9W(\xcf/\xcaI\x01\x00\x18\xab\x04='

Related

How to download a trained model as a pickle file using Streamlit Download Button?

How do I download a trained model as a pickle file using Streamlit Download Button?
You can use io.BytesIO to store the pickled data inside bytes in RAM. Then, give these bytes as data argument in the st.download_button function.
import io
import pickle
import streamlit as st
def create_model():
"""Create an sklearn model so that we will have
something interesting to pickle.
Example taken from here:
https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/
"""
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ["preg", "plas", "pres", "skin", "test", "mass", "pedi", "age", "class"]
dataframe = pd.read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
test_size = 0.33
seed = 7
X_train, _, Y_train, _ = model_selection.train_test_split(X, Y, test_size=test_size, random_state=seed)
# Fit the model on training set
model = LogisticRegression()
model.fit(X_train, Y_train)
return model
def pickle_model(model):
"""Pickle the model inside bytes. In our case, it is the "same" as
storing a file, but in RAM.
"""
f = io.BytesIO()
pickle.dump(model, f)
return f
st.title("My .pkl downloader")
model = create_model()
data = pickle_model(model)
st.download_button("Download .pkl file", data=data, file_name="my-pickled-model.pkl")

How do I split / chunk Large JSON Files with AWS glueContext before converting them to JSON?

I'm trying to convert a 20GB JSON gzip file to parquet using AWS Glue.
I've setup a job using Pyspark with the code below.
I got this log WARN message:
LOG.WARN: Loading one large unsplittable file s3://aws-glue-data.json.gz with only one partition, because the file is compressed by unsplittable compression codec.
I was wondering if there was a way to split / chunk the file? I know I can do it with pandas, but unfortunately that takes far too long (12+ hours).
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
import pyspark.sql.functions
from pyspark.sql.functions import col, concat, reverse, translate
from awsglue.context import GlueContext
from awsglue.job import Job
glueContext = GlueContext(SparkContext.getOrCreate())
test = glueContext.create_dynamic_frame_from_catalog(
database="test_db",
table_name="aws-glue-test_table")
# Create Spark DataFrame, remove timestamp field and re-name other fields
reconfigure = test.drop_fields(['timestamp']).rename_field('name', 'FirstName').rename_field('LName', 'LastName').rename_field('type', 'record_type')
# Create pyspark DF
spark_df = reconfigure.toDF()
# Filter and only return 'a' record types
spark_df = spark_df.where("record_type == 'a'")
# Once filtered, remove the record_type column
spark_df = spark_df.drop('record_type')
spark_df = spark_df.withColumn("LastName", translate("LastName", "LName:", ""))
spark_df = spark_df.withColumn("FirstName", reverse("FirstName"))
spark_df.write.parquet("s3a://aws-glue-bucket/parquet/test.parquet")
Spark does not parallelize reading a single gzip file. However, you can do split it in chunks.
Also, Spark is really slow at reading gzip files(since its not paralleized). You can do this to speed it up:
file_names_rdd = sc.parallelize(list_of_files, 100)
lines_rdd = file_names_rdd.flatMap(lambda _: gzip.open(_).readlines())

how to read large json file on s3 to dataframe using sagemaker

I tried using the code:
from sagemaker import get_execution_role
import pandas as pd
bucket = 'xxx'
data_key = 'TV.json'
data_location = 's3://{}/{}'.format(bucket, data_key)
textfilereader=pd.read_json(
data_location,lines=True,chunksize=1000)
dflist=[]
for df in textfilereader:
dfList.append(df)
df=pd.concat(dflist,sort=False)
error:sequence item 0: expected str instance, bytes found

how to print output results in HDBSCAN

I have ASCII data and i need to cluster the data using HDBSCAN.
I got the lables but i don't know how to print the output cluster results i.e unique and segregated results from hdbscan.
snippet:
import hdbscan
import numpy as np
datafile = "ascii.txt"
data = np.loadtxt(datafile, dtype = np.uint8)
clusterer = hdbscan.HDBSCAN(min_cluster_size = 20)
clusterer.fit(data)
print (np.unique(clusterer.labels_, return_counts = True))
You can use Pandas to read the file and then print out the cluster labels along with the dataset you have as the input. Try something like:
import pandas as pd
df = pd.read_csv("ascii.txt")
clusterer = hdbscan.HDBSCAN().fit_predict(df.ColumnName)
df_pd = pd.DataFrame({'Datapoints:' df.ColumnName, 'Cluster Labels:' clusterer)
import hdbscan
import numpy as np
datafile = "ascii.txt"
data = np.loadtxt(datafile, dtype = np.uint8)
Modified_data=pd.DataFrame(data)
clusterer = hdbscan.HDBSCAN(min_cluster_size = 20)
clusterer.fit(Modified_data)
Modified_data['Clusters']=clusterer.labels_
Now Modified_data returns a pandas dataframe where you have a column named "Clusters" and cluster corresponding to each instance will be specified in the Clusters column.
You can manipulate this dataframe as per your requirement

Can I export a tensorflow summary to CSV?

Is there a way to extract scalar summaries to CSV (preferably from within tensorboard) from tfevents files?
Example code
The following code generates tfevent files in a summary_dir within the same directory. Suppose you let it run and you find something interesting. You want to get the raw data for further investigation. How would you do that?
#!/usr/bin/env python
"""A very simple MNIST classifier."""
import argparse
import sys
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
ce_with_logits = tf.nn.softmax_cross_entropy_with_logits
FLAGS = None
def inference(x):
"""
Build the inference graph.
Parameters
----------
x : placeholder
Returns
-------
Output tensor with the computed logits.
"""
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
y = tf.matmul(x, W) + b
return y
def loss(logits, labels):
"""
Calculate the loss from the logits and the labels.
Parameters
----------
logits : Logits tensor, float - [batch_size, NUM_CLASSES].
labels : Labels tensor, int32 - [batch_size]
"""
cross_entropy = tf.reduce_mean(ce_with_logits(labels=labels,
logits=logits))
return cross_entropy
def training(loss, learning_rate=0.5):
"""
Set up the training Ops.
Parameters
----------
loss : Loss tensor, from loss().
learning_rate : The learning rate to use for gradient descent.
Returns
-------
train_op: The Op for training.
"""
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_step = optimizer.minimize(loss)
return train_step
def main(_):
# Import data
mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
# Create the model
x = tf.placeholder(tf.float32, [None, 784])
y = inference(x)
# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, 10])
loss_ = loss(logits=y, labels=y_)
train_step = training(loss_)
# Test trained model
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
with tf.name_scope('accuracy'):
tf.summary.scalar('accuracy', accuracy)
merged = tf.summary.merge_all()
sess = tf.InteractiveSession()
train_writer = tf.summary.FileWriter('summary_dir/train', sess.graph)
test_writer = tf.summary.FileWriter('summary_dir/test', sess.graph)
tf.global_variables_initializer().run()
for train_step_i in range(100000):
if train_step_i % 100 == 0:
summary, acc = sess.run([merged, accuracy],
feed_dict={x: mnist.test.images,
y_: mnist.test.labels})
test_writer.add_summary(summary, train_step_i)
summary, acc = sess.run([merged, accuracy],
feed_dict={x: mnist.train.images,
y_: mnist.train.labels})
train_writer.add_summary(summary, train_step_i)
batch_xs, batch_ys = mnist.train.next_batch(100)
sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
print(sess.run(accuracy, feed_dict={x: mnist.test.images,
y_: mnist.test.labels}))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--data_dir',
type=str,
default='/tmp/tensorflow/mnist/input_data',
help='Directory for storing input data')
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
While the answer here is as requested within tensorboard it only allows to download a csv for a single run of a single tag.
If you have for example 10 tags and 20 runs (what is not at all much) you would need to do the above step 200 times (that alone will probably take you more than a hour).
If now you for some reason would like to actually do something with the data for all runs for a single tag you would need to write some weird CSV accumulation script or copy everything by hand (what will probably cost you more than a day).
Therefore I would like to add a solution that extracts a CSV file for every tag with all runs contained. Column headers are the run path names and row indices are the run step numbers.
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
def tabulate_events(dpath):
summary_iterators = [EventAccumulator(os.path.join(dpath, dname)).Reload() for dname in os.listdir(dpath)]
tags = summary_iterators[0].Tags()['scalars']
for it in summary_iterators:
assert it.Tags()['scalars'] == tags
out = defaultdict(list)
steps = []
for tag in tags:
steps = [e.step for e in summary_iterators[0].Scalars(tag)]
for events in zip(*[acc.Scalars(tag) for acc in summary_iterators]):
assert len(set(e.step for e in events)) == 1
out[tag].append([e.value for e in events])
return out, steps
def to_csv(dpath):
dirs = os.listdir(dpath)
d, steps = tabulate_events(dpath)
tags, values = zip(*d.items())
np_values = np.array(values)
for index, tag in enumerate(tags):
df = pd.DataFrame(np_values[index], index=steps, columns=dirs)
df.to_csv(get_file_path(dpath, tag))
def get_file_path(dpath, tag):
file_name = tag.replace("/", "_") + '.csv'
folder_path = os.path.join(dpath, 'csv')
if not os.path.exists(folder_path):
os.makedirs(folder_path)
return os.path.join(folder_path, file_name)
if __name__ == '__main__':
path = "path_to_your_summaries"
to_csv(path)
My solution builds upon: https://stackoverflow.com/a/48774926/2230045
EDIT:
I created a more sophisticated version and released it on GitHub: https://github.com/Spenhouet/tensorboard-aggregator
This version aggregates multiple tensorboard runs and is able to save the aggregates to a new tensorboard summary or as a .csv file.
Just check the "Data download links" option on the upper-left in TensorBoard, and then click on the "CSV" button that will appear under your scalar summary.
Here is my solution which bases on the previous solutions but can scale up.
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
def tabulate_events(dpath):
final_out = {}
for dname in os.listdir(dpath):
print(f"Converting run {dname}",end="")
ea = EventAccumulator(os.path.join(dpath, dname)).Reload()
tags = ea.Tags()['scalars']
out = {}
for tag in tags:
tag_values=[]
wall_time=[]
steps=[]
for event in ea.Scalars(tag):
tag_values.append(event.value)
wall_time.append(event.wall_time)
steps.append(event.step)
out[tag]=pd.DataFrame(data=dict(zip(steps,np.array([tag_values,wall_time]).transpose())), columns=steps,index=['value','wall_time'])
if len(tags)>0:
df= pd.concat(out.values(),keys=out.keys())
df.to_csv(f'{dname}.csv')
print("- Done")
else:
print('- Not scalers to write')
final_out[dname] = df
return final_out
if __name__ == '__main__':
path = "youre/path/here"
steps = tabulate_events(path)
pd.concat(steps.values(),keys=steps.keys()).to_csv('all_result.csv')
Very minimal example:
import pandas as pd
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
log_dir = "lightning_logs/version_1"
event_accumulator = EventAccumulator(log_dir)
event_accumulator.Reload()
events = event_accumulator.Scalars("train_loss")
x = [x.step for x in events]
y = [x.value for x in events]
df = pd.DataFrame({"step": x, "train_loss": y})
df.to_csv("train_loss.csv")
print(df)
step train_loss
0 0 700.491516
1 1 163.593246
2 2 146.365448
3 3 153.830215
...
Plotting loss vs epochs example:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
log_dir = "lightning_logs/version_1"
y_key = "val_loss"
event_accumulator = EventAccumulator(log_dir)
event_accumulator.Reload()
steps = {x.step for x in event_accumulator.Scalars("epoch")}
x = list(range(len(steps)))
y = [x.value for x in event_accumulator.Scalars(y_key) if x.step in steps]
df = pd.DataFrame({"epoch": x, y_key: y})
df.to_csv(f"{y_key}.csv")
fig, ax = plt.subplots()
sns.lineplot(data=df, x="epoch", y=y_key)
fig.savefig("plot.png", dpi=300)
Just to add to #Spen
in case you want to export the data when you have varying numbers of steps.
This will make one large csv file.
Might need to change around the keys for it to work for you.
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
import glob
import pandas as pd
listOutput = (glob.glob("*/"))
listDF = []
for tb_output_folder in listOutput:
print(tb_output_folder)
x = EventAccumulator(path=tb_output_folder)
x.Reload()
x.FirstEventTimestamp()
keys = ['loss', 'mean_absolute_error', 'val_loss', 'val_mean_absolute_error']
listValues = {}
steps = [e.step for e in x.Scalars(keys[0])]
wall_time = [e.wall_time for e in x.Scalars(keys[0])]
index = [e.index for e in x.Scalars(keys[0])]
count = [e.count for e in x.Scalars(keys[0])]
n_steps = len(steps)
listRun = [tb_output_folder] * n_steps
printOutDict = {}
data = np.zeros((n_steps, len(keys)))
for i in range(len(keys)):
data[:,i] = [e.value for e in x.Scalars(keys[i])]
printOutDict = {keys[0]: data[:,0], keys[1]: data[:,1],keys[2]: data[:,2],keys[3]: data[:,3]}
printOutDict['Name'] = listRun
DF = pd.DataFrame(data=printOutDict)
listDF.append(DF)
df = pd.concat(listDF)
df.to_csv('Output.csv')