How to plot a 2 dimensional density / heatmap in plotnine? - function

I am trying to recreate the following graph in plotnine. It's asking me for more details but I don't want to distract from the example. I think it's pretty obvious what I'm trying to do. I have been given a function by a colleague. I'm not interested in rewriting the function. I want to take sm and use plotnine to plot it instead of matplotlib. I plot lots of dataframes with plotnine but I'm not sure how to use it in this case. I have tried on my own to figure it out and I keep getting lost. I hope that for someone more experienced I am overlooking something simple.
import matplotlib.pyplot as plt
def getSuccess(y,x):
return((y*(-x))*.5+.5)
steps = 100
stepSize = 1/steps
sm = []
for y in range(steps*2+1):
sm.append([getSuccess((y-steps)*stepSize,(x-steps)*stepSize) for x in range(steps*2+1)])
plt.imshow(sm)
plt.ylim(-1, 1)
plt.colorbar()
plt.yticks([0,steps,steps*2],[str(y) for y in [-1.0,0.0,1.0]])
plt.xticks([0,steps,steps*2],[str(x) for x in [-1.0,0.0,1.0]])
plt.show()

You could try geom_raster.
I have taken your synthetic data sm and converted to a dataframe as plotnine will need this.
import pandas as pd
import numpy as np
from plotnine import *
df = pd.DataFrame(sm).melt()
df.rename(columns={'variable':'x','value':'density'}, inplace=True)
df.insert(1,'y',df.index % 201)
p = (ggplot(df, aes('x','y'))
+ geom_raster(aes(fill='density'), interpolate=True)
+ labs(x=None,y=None)
+ scale_x_continuous(expand=(0,0), breaks=[0,100,200], labels=[-1,0,1])
+ scale_y_continuous(expand=(0,0), breaks=[0,100,200], labels=[-1,0,1])
+ theme_matplotlib()
+ theme(
text = element_text(family="Calibri", size=9),
legend_title = element_blank(),
axis_ticks = element_blank(),
legend_key_height = 29.6,
legend_key_width = 6,
)
)
p.save(filename='C:\\Users\\BRB\\geom_raster.png', height=10, width=10, units = 'cm', dpi=400)
This result is:

Related

Issues in creating pysimplegui or Tkinter graph GUI by reading csv file , cleaning it and plotting graph (histogram+PDF)

I want to create GUI which should automatically clean data in csv file once selected and plot superimposed PDF & histogram graph. I have uploaded basic python program which generates the required graph but I am unbale to convert it into interface. I guess, only "open file" & "plot" buttons would suffice the requirement. image- want to retrieve data from 'N'th column (13) only with skipping top 4 rows
I am basically from metallurgy background and trying my hands in this field.
Any help would be greatly appreciated
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
raw_data = pd.read_csv("D:/Project/Python/NDC/Outlier_ND/800016_DAT.csv",skiprows=4,header=None)
clean = pd.DataFrame(raw_data)
data1 = clean.iloc[:, [13]]
Q1 = data1.quantile(0.25)
Q3 = data1.quantile(0.75)
IQR = Q3 - Q1
data_IQR = data1[~((data1 < (Q1 - 1.5 * IQR)) |(data1 > (Q3 + 1.5 * IQR))).any(axis=1)]
data_IQR.shape
print(data1.shape)
print(data_IQR.shape)
headerList = ['Actual_MR']
data_IQR.to_csv(r'D:\Project\Python\NDC\Outlier_ND\800016_DAT_IQR.csv', header=headerList, index=False)
data = pd.read_csv("D:/Project/Python/NDC/Outlier_ND/800016_DAT_IQR.csv")
mean, sd = norm.fit(data)
plt.hist(data, bins=25, density=True, alpha=0.6, facecolor = '#2ab0ff', edgecolor='#169acf', linewidth=0.5)
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mean, sd)
plt.plot(x, p, 'red', linewidth=2)
title = " Graph \n mean: {:.2f} and SD: {:.2f}".format(mean, sd)
plt.title(title)
plt.xlabel('MR')
plt.ylabel('Pr')
plt.show()
Following code demo how PySimpleGUI to work with matplotlib, detail please find all remark in script.
import math, random
from pathlib import Path
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
from matplotlib.figure import Figure
import PySimpleGUI as sg
# 1. Define the class as the interface between matplotlib and PySimpleGUI
class Canvas(FigureCanvasTkAgg):
"""
Create a canvas for matplotlib pyplot under tkinter/PySimpleGUI canvas
"""
def __init__(self, figure=None, master=None):
super().__init__(figure=figure, master=master)
self.canvas = self.get_tk_widget()
self.canvas.pack(side='top', fill='both', expand=1)
# 2. create PySimpleGUI window, a fixed-size Frame with Canvas which expand in both x and y.
font = ("Courier New", 11)
sg.theme("DarkBlue3")
sg.set_options(font=font)
layout = [
[sg.Input(expand_x=True, key='Path'),
sg.FileBrowse(file_types=(("ALL CSV Files", "*.csv"), ("ALL Files", "*.*"))),
sg.Button('Plot')],
[sg.Frame("", [[sg.Canvas(background_color='green', expand_x=True, expand_y=True, key='Canvas')]], size=(640, 480))],
[sg.Push(), sg.Button('Exit')]
]
window = sg.Window('Matplotlib', layout, finalize=True)
# 3. Create a matplotlib canvas under sg.Canvas or sg.Graph
fig = Figure(figsize=(5, 4), dpi=100)
ax = fig.add_subplot()
canvas = Canvas(fig, window['Canvas'].Widget)
# 4. initial for figure
ax.set_title(f"Sensor Data")
ax.set_xlabel("X axis")
ax.set_ylabel("Y axis")
ax.set_xlim(0, 1079)
ax.set_ylim(-1.1, 1.1)
ax.grid()
canvas.draw() # do Update to GUI canvas
# 5. PySimpleGUI event loop
while True:
event, values = window.read()
if event in (sg.WINDOW_CLOSED, 'Exit'):
break
elif event == 'Plot':
"""
path = values['Path']
if not Path(path).is_file():
continue
"""
# 6. Get data from path and plot from here
ax.cla() # Clear axes first if required
ax.set_title(f"Sensor Data")
ax.set_xlabel("X axis")
ax.set_ylabel("Y axis")
ax.grid()
theta = random.randint(0, 359)
x = [degree for degree in range(1080)]
y = [math.sin((degree+theta)/180*math.pi) for degree in range(1080)]
ax.plot(x, y)
canvas.draw() # do Update to GUI canvas
# 7. Close window to exit
window.close()

How to make DataLoader load correct batch_size data while using pytorch_geometric Data object?

I am new to pytorch_geometric.
Here is my code to create a DataLoader which contains 5 samples, and each sample was represented by a 1 * 10 matrix.
import torch
import numpy as np
from torch_geometric.data import Data, DataLoader
x = torch.tensor(np.random((5, 10))
y = torch.tensor(np.random((5, 1))
data = Data(x = x, y = y)
loader = DataLoader([data], batch_size=2)
When I set batch_size to 2, I'd like the loader to load my 5 samples by 2 2 1, but now it pushes out all data in one batch, below code is the print result.
for batch in loader:
print(batch)
# Batch(batch=[5], x=[5, 10], y=[5, 1])
Please let me know how can I achieve the result I want if you are familiar with that.
By the way, I learned this from pytorch_geometric official repository dataset like this.

Dumping numpy arrays into neural network

I am trying to solve the titanic machine learning challenge from kaggle using a neural network. I removed most of the irrelevant data and converted the useful data into a 2D numpy array while the survival is converted into a 1D numpy array. For some reason it throws an error saying dimension 0 in both shape must be equal, I've been trying to solve it for quite a while and I hope that you guys can help out.
TensorFlowNumpy.py
import tensorflow as tf
def numpy2tensor(numpy):
sess = tf.Session()
with sess.as_default():
return tf.constant(numpy)
def tensor2numpy(tensor):
sess = tf.Session()
with sess.as_default():
return tensor.eval()
Dataset.py
import pandas
import numpy as np
dataset = pandas.read_csv('train.csv')
dataset2= dataset.drop(['PassengerId','Survived','Name','Ticket','Fare','Cabin','Embarked'],axis=1)
dataset3= dataset2.fillna(0)
survive = pandas.read_csv('train.csv')
survival = np.float32(survive.Survived)
dataset4 = np.float32(dataset3)
MainCode.py
import tensorflow as tf
import numpy
from dataset import dataset4,survival
from sklearn.model_selection import train_test_split
from TensorFlowNumpy import numpy2tensor
train_x,test_x,train_y,test_y = train_test_split(dataset4,survival,test_size
= 0.2)
tensor_train_x = numpy2tensor(train_x)
tensor_train_y = numpy2tensor(train_y)
tensor_test_x = numpy2tensor(test_x)
tensor_test_y = numpy2tensor(test_y)
n_nodes_hl1 = 10
n_nodes_hl2 = 10
n_classes = 2
x = tf.placeholder(tf.float32)
y = tf.placeholder(tf.float32)
def neural_network_model(data):
hidden_1_layer = {'weights':tf.Variable(tf.random_normal([5,
n_nodes_hl1])),
'biases':tf.Variable(tf.random_normal([n_nodes_hl1]))}
hidden_2_layer = {'weights':tf.Variable(tf.random_normal([n_nodes_hl1,
n_nodes_hl2])),
'biases':tf.Variable(tf.random_normal([n_nodes_hl2]))}
output_layer = {'weights':tf.Variable(tf.random_normal([n_nodes_hl2,
n_classes])),
'biases':tf.Variable(tf.random_normal([n_classes]))}
l1 = tf.add(tf.matmul(data,hidden_1_layer['weights']),
hidden_1_layer['biases'])
l1 = tf.nn.relu(l1)
l2 = tf.add(tf.matmul(l1,hidden_2_layer['weights']),
hidden_2_layer['biases'])
l2 = tf.nn.relu(l2)
output = tf.matmul(l2,output_layer['weights']) + output_layer['biases']
return output
def train_neural_network(x):
prediction = neural_network_model(x)
cost =
tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction,
labels=tensor_train_y))
optimizer1 = tf.train.GradientDescentOptimizer(0.001).minimize(cost)
hm_epochs = 100
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for epoch in range(hm_epochs):
epoch_loss = 0
_, c = sess.run([optimizer1, cost], feed_dict={x:tensor_train_x,
y:tensor_train_y})
epoch_loss += c
print('Epoch', epoch+1, 'completed out
of',hm_epochs,'loss:',epoch_loss)
correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
print('Accuracy:',accuracy.eval({x:test_x, y:test_y}))
train_neural_network(tensor_train_x)
I have faced this error several times, problem is obviously in our code. I didn't look through your code thoroughly as i am leaving for the day, but i smell that your dependent variable/ output variable shape is [1,712] which should be [712,1] so some where in the code try to fix it. Basically what it meant is you are having one row with 712 columns but it should be 712 rows with 1 column(output). Please mark this as answer if it helps. Ping me tomorrow if problem still exists. I will take a look at it.

Can I export a tensorflow summary to CSV?

Is there a way to extract scalar summaries to CSV (preferably from within tensorboard) from tfevents files?
Example code
The following code generates tfevent files in a summary_dir within the same directory. Suppose you let it run and you find something interesting. You want to get the raw data for further investigation. How would you do that?
#!/usr/bin/env python
"""A very simple MNIST classifier."""
import argparse
import sys
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
ce_with_logits = tf.nn.softmax_cross_entropy_with_logits
FLAGS = None
def inference(x):
"""
Build the inference graph.
Parameters
----------
x : placeholder
Returns
-------
Output tensor with the computed logits.
"""
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
y = tf.matmul(x, W) + b
return y
def loss(logits, labels):
"""
Calculate the loss from the logits and the labels.
Parameters
----------
logits : Logits tensor, float - [batch_size, NUM_CLASSES].
labels : Labels tensor, int32 - [batch_size]
"""
cross_entropy = tf.reduce_mean(ce_with_logits(labels=labels,
logits=logits))
return cross_entropy
def training(loss, learning_rate=0.5):
"""
Set up the training Ops.
Parameters
----------
loss : Loss tensor, from loss().
learning_rate : The learning rate to use for gradient descent.
Returns
-------
train_op: The Op for training.
"""
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_step = optimizer.minimize(loss)
return train_step
def main(_):
# Import data
mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
# Create the model
x = tf.placeholder(tf.float32, [None, 784])
y = inference(x)
# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, 10])
loss_ = loss(logits=y, labels=y_)
train_step = training(loss_)
# Test trained model
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
with tf.name_scope('accuracy'):
tf.summary.scalar('accuracy', accuracy)
merged = tf.summary.merge_all()
sess = tf.InteractiveSession()
train_writer = tf.summary.FileWriter('summary_dir/train', sess.graph)
test_writer = tf.summary.FileWriter('summary_dir/test', sess.graph)
tf.global_variables_initializer().run()
for train_step_i in range(100000):
if train_step_i % 100 == 0:
summary, acc = sess.run([merged, accuracy],
feed_dict={x: mnist.test.images,
y_: mnist.test.labels})
test_writer.add_summary(summary, train_step_i)
summary, acc = sess.run([merged, accuracy],
feed_dict={x: mnist.train.images,
y_: mnist.train.labels})
train_writer.add_summary(summary, train_step_i)
batch_xs, batch_ys = mnist.train.next_batch(100)
sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
print(sess.run(accuracy, feed_dict={x: mnist.test.images,
y_: mnist.test.labels}))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--data_dir',
type=str,
default='/tmp/tensorflow/mnist/input_data',
help='Directory for storing input data')
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
While the answer here is as requested within tensorboard it only allows to download a csv for a single run of a single tag.
If you have for example 10 tags and 20 runs (what is not at all much) you would need to do the above step 200 times (that alone will probably take you more than a hour).
If now you for some reason would like to actually do something with the data for all runs for a single tag you would need to write some weird CSV accumulation script or copy everything by hand (what will probably cost you more than a day).
Therefore I would like to add a solution that extracts a CSV file for every tag with all runs contained. Column headers are the run path names and row indices are the run step numbers.
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
def tabulate_events(dpath):
summary_iterators = [EventAccumulator(os.path.join(dpath, dname)).Reload() for dname in os.listdir(dpath)]
tags = summary_iterators[0].Tags()['scalars']
for it in summary_iterators:
assert it.Tags()['scalars'] == tags
out = defaultdict(list)
steps = []
for tag in tags:
steps = [e.step for e in summary_iterators[0].Scalars(tag)]
for events in zip(*[acc.Scalars(tag) for acc in summary_iterators]):
assert len(set(e.step for e in events)) == 1
out[tag].append([e.value for e in events])
return out, steps
def to_csv(dpath):
dirs = os.listdir(dpath)
d, steps = tabulate_events(dpath)
tags, values = zip(*d.items())
np_values = np.array(values)
for index, tag in enumerate(tags):
df = pd.DataFrame(np_values[index], index=steps, columns=dirs)
df.to_csv(get_file_path(dpath, tag))
def get_file_path(dpath, tag):
file_name = tag.replace("/", "_") + '.csv'
folder_path = os.path.join(dpath, 'csv')
if not os.path.exists(folder_path):
os.makedirs(folder_path)
return os.path.join(folder_path, file_name)
if __name__ == '__main__':
path = "path_to_your_summaries"
to_csv(path)
My solution builds upon: https://stackoverflow.com/a/48774926/2230045
EDIT:
I created a more sophisticated version and released it on GitHub: https://github.com/Spenhouet/tensorboard-aggregator
This version aggregates multiple tensorboard runs and is able to save the aggregates to a new tensorboard summary or as a .csv file.
Just check the "Data download links" option on the upper-left in TensorBoard, and then click on the "CSV" button that will appear under your scalar summary.
Here is my solution which bases on the previous solutions but can scale up.
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
def tabulate_events(dpath):
final_out = {}
for dname in os.listdir(dpath):
print(f"Converting run {dname}",end="")
ea = EventAccumulator(os.path.join(dpath, dname)).Reload()
tags = ea.Tags()['scalars']
out = {}
for tag in tags:
tag_values=[]
wall_time=[]
steps=[]
for event in ea.Scalars(tag):
tag_values.append(event.value)
wall_time.append(event.wall_time)
steps.append(event.step)
out[tag]=pd.DataFrame(data=dict(zip(steps,np.array([tag_values,wall_time]).transpose())), columns=steps,index=['value','wall_time'])
if len(tags)>0:
df= pd.concat(out.values(),keys=out.keys())
df.to_csv(f'{dname}.csv')
print("- Done")
else:
print('- Not scalers to write')
final_out[dname] = df
return final_out
if __name__ == '__main__':
path = "youre/path/here"
steps = tabulate_events(path)
pd.concat(steps.values(),keys=steps.keys()).to_csv('all_result.csv')
Very minimal example:
import pandas as pd
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
log_dir = "lightning_logs/version_1"
event_accumulator = EventAccumulator(log_dir)
event_accumulator.Reload()
events = event_accumulator.Scalars("train_loss")
x = [x.step for x in events]
y = [x.value for x in events]
df = pd.DataFrame({"step": x, "train_loss": y})
df.to_csv("train_loss.csv")
print(df)
step train_loss
0 0 700.491516
1 1 163.593246
2 2 146.365448
3 3 153.830215
...
Plotting loss vs epochs example:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
log_dir = "lightning_logs/version_1"
y_key = "val_loss"
event_accumulator = EventAccumulator(log_dir)
event_accumulator.Reload()
steps = {x.step for x in event_accumulator.Scalars("epoch")}
x = list(range(len(steps)))
y = [x.value for x in event_accumulator.Scalars(y_key) if x.step in steps]
df = pd.DataFrame({"epoch": x, y_key: y})
df.to_csv(f"{y_key}.csv")
fig, ax = plt.subplots()
sns.lineplot(data=df, x="epoch", y=y_key)
fig.savefig("plot.png", dpi=300)
Just to add to #Spen
in case you want to export the data when you have varying numbers of steps.
This will make one large csv file.
Might need to change around the keys for it to work for you.
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
import glob
import pandas as pd
listOutput = (glob.glob("*/"))
listDF = []
for tb_output_folder in listOutput:
print(tb_output_folder)
x = EventAccumulator(path=tb_output_folder)
x.Reload()
x.FirstEventTimestamp()
keys = ['loss', 'mean_absolute_error', 'val_loss', 'val_mean_absolute_error']
listValues = {}
steps = [e.step for e in x.Scalars(keys[0])]
wall_time = [e.wall_time for e in x.Scalars(keys[0])]
index = [e.index for e in x.Scalars(keys[0])]
count = [e.count for e in x.Scalars(keys[0])]
n_steps = len(steps)
listRun = [tb_output_folder] * n_steps
printOutDict = {}
data = np.zeros((n_steps, len(keys)))
for i in range(len(keys)):
data[:,i] = [e.value for e in x.Scalars(keys[i])]
printOutDict = {keys[0]: data[:,0], keys[1]: data[:,1],keys[2]: data[:,2],keys[3]: data[:,3]}
printOutDict['Name'] = listRun
DF = pd.DataFrame(data=printOutDict)
listDF.append(DF)
df = pd.concat(listDF)
df.to_csv('Output.csv')

Linear Regression Function for Sin Graph

I am currently working on a project where I must analyze data and find a period for the graph. The data contains outliers. I need a function that will make a line of best fit for the function.
I attempted to simply get a sin graph on the plot, but I could not even do that. Can anyone give me a starting hint?
import os
import pyfits as fits
import numpy as np
import pylab
import random
import scipy.optimize
import scipy.signal
from numpy import arange
from matplotlib import pyplot
from scipy.optimize import curve_fit
filename = 'C:\Users\Ken Preiser\Desktop\Space thing\Snapshots\BAT_70m_snapshot_SWIFT_J1647.9-4511B.lc'
namePortion = filename[-39:]
hdulist = fits.open(filename, 'readonly', None, False) #{unpacks file) name, mode, memorymap, savebackup
data = hdulist[1].data
datapoints = 23310
def sinfunc(a, b, c): #I tried graphing a sinfunction, but it did not work...
return a*np.sin(bx-c)
time = data.field('TIME')
time = time / 86400.0
timeViewingThreshold = 10
rateViewingThreshold = .01
rate = np.sum(data['RATE'][:,:4], axis=1)
average = np.sum(rate)/23310
error = data.field('ERROR')
error = np.sqrt(np.sum(data['ERROR'][:,:4]**2, axis=1))
print rate.size,(", rate")
print time.size,(", time")
fig = pylab.figure()
ax = fig.add_subplot(111)
ax.set_xlabel('Time')
ax.set_ylabel('Rate')
ax.set_title('Rate vs Time graph: ' + namePortion)
pylab.plot(time, rate, 'o')
pyplot.xlim(min(time) - timeViewingThreshold, max(time) + timeViewingThreshold)
pyplot.ylim(min(rate) - rateViewingThreshold, max(rate) + rateViewingThreshold)
ax.errorbar(time, rate, xerr=0, yerr=error)
pylab.show()
(the outputs)
http://imgur.com/jbfuxOA
You're trying to fit points to the model: y = sin(ax + b). Since you're using linear regression, you need a linear model. So one way to do that is compute arcsin for each point and now compute the linear regression. The model is now: arcsin(y) = ax + b. The regression model gives you a and b which is what you're after. You should be able to test this out pretty quickly in excel, then code it up once the nuances are figured out.