Access variables from a parallel joblib function

I have set up joblib for parallel computing and, so far, I have been able to use it to compute several metrics. I intend to compute the G-mean after the ROC AUC. However, I am unable to retrieve y_test and svm_probs from the function; it raises an error when I try to access them.
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.datasets import make_classification
import pandas as pd
from sklearn.model_selection import KFold
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn import svm
from sklearn import datasets
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC
from joblib import Parallel, delayed
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = data.data
y = data.target
skf = StratifiedKFold(n_splits=5)
clf = svm.SVC(kernel='rbf', probability=True)
def train(train_index, test_index):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)
    r_probs = [0 for _ in range(len(y_test))]
    svm_probs = clf.predict_proba(X_test)
    svm_probs = svm_probs[:, 1]
    svm_auc = roc_auc_score(y_test, svm_probs)
    return dict(svm_auc=svm_auc)
out = Parallel(n_jobs=2, verbose=100, pre_dispatch='1.5*n_jobs')(
    delayed(train)(train_index, test_index) for train_index, test_index in skf.split(X, y))
svm_auc = [d['svm_auc'] for d in out]
print(np.mean(svm_auc))
# The next line fails: y_test and svm_probs are local to train() and are not
# returned, so they do not exist here.
rf_fpr, rf_tpr, _ = roc_curve(y_test, svm_probs)
gmeans_rf = np.sqrt(rf_tpr * (1 - rf_fpr))
ix_rf = np.argmax(gmeans_rf)
print("%.3f" % gmeans_rf[ix_rf])

Related

How can I fix the Xception model?

I would like to train a model that can separate different cats, but it seems to go wrong and I don't know why the model can't predict correctly.
We made the dataset ourselves and took the photos from different angles.
I'm not sure which part is wrong.
Would you please help me see how I can get the model working?
Here is my code:
import keras.backend
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers import Input, Dense, GlobalAveragePooling2D, BatchNormalization, Flatten, Dropout
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
import matplotlib.pyplot as plt
from keras.models import Model
import tensorflow as tf
from keras.applications import Xception
train_path = './train'
test_path = './test'
batch_size = 16
image_size = (224,224)
epoch = 30
FREEZE_LAYERS = 2
model = Xception(include_top=False,
                 weights='imagenet',
                 input_shape=(224, 224, 3))
x = model.output
x = GlobalAveragePooling2D()(x)
x = Flatten()(x)
x = Dropout(0.5)(x)
predictions = Dense(26, activation='softmax')(x)
model = Model(inputs=model.input, outputs=predictions)
model.compile(optimizer=Adam(lr=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
estop = EarlyStopping(monitor='val_loss', patience=10, mode='min', verbose=1)
checkpoint = ModelCheckpoint('Xception_checkpoint.h5', verbose=1,
                             monitor='val_loss', save_best_only=True,
                             mode='max')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                              patience=5, mode='min', verbose=1,
                              min_lr=1e-4)
train_datagen = ImageDataGenerator(rescale=1.0/255,
                                   rotation_range=30,
                                   width_shift_range=0.2,
                                   height_shift_range=0.2,
                                   shear_range=0.2,
                                   zoom_range=0.2,
                                   channel_shift_range=10,
                                   horizontal_flip=True,
                                   fill_mode='nearest')
val_datagen = ImageDataGenerator(rescale=1.0/255)
train_generator = train_datagen.flow_from_directory(train_path,
                                                    target_size=image_size,
                                                    class_mode='categorical',
                                                    shuffle=True,
                                                    batch_size=batch_size)
valid_generator = val_datagen.flow_from_directory(test_path,
                                                  target_size=image_size,
                                                  class_mode='categorical',
                                                  shuffle=False,
                                                  batch_size=batch_size)
history = model.fit_generator(train_generator,
                              epochs=epoch, verbose=1,
                              steps_per_epoch=train_generator.samples//batch_size,
                              validation_data=valid_generator,
                              validation_steps=valid_generator.samples//batch_size,
                              callbacks=[checkpoint, estop, reduce_lr])
                              # class_weight=class_weights)
model.save('./Xception_retrained_v2.h5')
print('saved Xception_retrained_v2.h5')
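Two things jump out that may be worth checking; below is a rough sketch of possible fixes, assuming 26 cat classes and the directory layout above. The ModelCheckpoint monitors val_loss but uses mode='max', so it keeps the checkpoint with the highest validation loss, and FREEZE_LAYERS is defined but the pretrained base is never actually frozen (GlobalAveragePooling2D followed by Flatten is also redundant).
# Sketch only: same data layout as above, 26 classes assumed.
checkpoint = ModelCheckpoint('Xception_checkpoint.h5', verbose=1,
                             monitor='val_loss', save_best_only=True,
                             mode='min')          # val_loss should be minimised

base = Xception(include_top=False, weights='imagenet', input_shape=(224, 224, 3))
for layer in base.layers:
    layer.trainable = False                       # freeze the pretrained backbone first

x = GlobalAveragePooling2D()(base.output)         # already flat, no Flatten() needed
x = Dropout(0.5)(x)
predictions = Dense(26, activation='softmax')(x)
model = Model(inputs=base.input, outputs=predictions)
model.compile(optimizer=Adam(lr=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
It is also worth confirming that train_generator.num_classes really is 26 and that the class indices of the train and validation generators match.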

Regarding Implementation of Gradient Descent for Polynomial Regression

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats
from numpy.linalg import inv
import seaborn as sns
url = r'C:\Users\pchan\kc_house_train_data.csv'
df = pd.read_csv(url,index_col=0)
features_1 = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
x=df.filter(features_1)
x = np.c_[np.ones((x.shape[0], 1)), x]
x=pd.DataFrame(x)
y=df.filter(['price'])
y=y.reset_index(drop=True)
x_new=x.T
y.rename(columns = {'price':0}, inplace = True)
w=pd.DataFrame([0]*(x_new.shape[0]))
cost=[]
i=0
a=0.00001
while i < 50:
    temp = x.T @ (y - x @ w)
    w = w + (a * temp)
    i += 1
print(w)
from sklearn.linear_model import LinearRegression
reg=LinearRegression().fit(x,y)
res=reg.coef_
print(res)
w_closed = np.linalg.inv(x.T @ x) @ x.T @ y
print(w_closed)
The closed-form solution and sklearn's LinearRegression both recover the correct weights, but the gradient descent approach (using matrix notation) does not.
What's wrong with the gradient descent approach here?
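A sketch of how this could be debugged, under a few assumptions (same CSV and feature list): the update direction x.T @ (y - x @ w) is the right one for least squares, but on the raw, unscaled features (sqft_living in the thousands, price in the hundreds of thousands) a fixed learning rate either blows up or crawls, and doing the algebra on pandas DataFrames makes it easy to pick up silent index/column alignment surprises. Working in plain NumPy on standardized features makes the same update converge:
import numpy as np
import pandas as pd

df = pd.read_csv(r'C:\Users\pchan\kc_house_train_data.csv', index_col=0)
features_1 = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']

X = df[features_1].to_numpy(dtype=float)
y = df['price'].to_numpy(dtype=float).reshape(-1, 1)

# Standardize so one learning rate suits every feature, then add the intercept.
X = (X - X.mean(axis=0)) / X.std(axis=0)
X = np.c_[np.ones((X.shape[0], 1)), X]

w = np.zeros((X.shape[1], 1))
a = 0.1                       # usable only because the features are standardized
n = X.shape[0]

for _ in range(5000):
    grad = X.T @ (y - X @ w) / n   # averaged gradient of the squared error (up to a factor of 2)
    w = w + a * grad

print(w.ravel())

# Closed form on the same standardized design matrix, for comparison.
w_closed = np.linalg.inv(X.T @ X) @ X.T @ y
print(w_closed.ravel())
Note that because the features are standardized here, the weights are in standardized units; to compare against sklearn, fit LinearRegression on the same standardized X.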

Facing an issue while fitting my model (bi-LSTM + CRF): ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list)

I am trying to solve a problem with a bi-LSTM and a CRF. While fitting the model, I get this error: ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list). Below is the structure of the dataframe.
The column "CompanyId" contains integers, "Name" contains strings, "TableTypeCode" is a constant string equal to "BS", and the final column is "BlockName". I want to train a model using a bidirectional LSTM and a CRF, with "CompanyId", "Name", and "TableTypeCode" as inputs, predicting "BlockName".
import numpy as np
import pandas as pd
df=pd.read_excel("data.xlsx")
from keras.layers import TimeDistributed
from keras.layers import Dense
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed, Bidirectional
from keras.models import Model
!pip install tensorflow-addons==0.16.1
import tensorflow_addons as tfa
X = df[['CompanyId', 'Name', 'TableTypeCode']]
y = df['BlockName']
# Preprocess the data
# One-hot encode the 'CompanyId' and 'TableTypeCode' columns
X = pd.get_dummies(X, columns=['CompanyId', 'TableTypeCode'])
# Tokenize the 'Name' column
X['Name'] = X['Name'].apply(str)
tokenizer = Tokenizer()
X['Name'] = X['Name'].apply(lambda x: x.split())
X['Name'] = tokenizer.texts_to_sequences(X['Name'])
# Encode the target column
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y = to_categorical(y)
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
n_classes = df['BlockName'].nunique()
# Define the model architecture
input_ = Input(shape=(X.shape[1],))
embedding = Embedding(input_dim=X.shape[1], output_dim=50)(input_)
lstm = Bidirectional(LSTM(units=100))(embedding)
output = Dense(n_classes, activation='softmax')(lstm)
model = Model(input_, output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Train the model
model.fit(X_train, y_train)
There was no issue until the last line of code. Please help me fix this and train my model.
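A sketch of one way past the error, assuming the dataframe described above and reusing the imports from the code block (only pad_sequences and Concatenate are new): the error comes from the 'Name' column holding Python lists of varying length inside an otherwise numeric dataframe, and texts_to_sequences is also called before the tokenizer has been fitted. Padding the token sequences to a fixed length and feeding them through an Embedding, with the one-hot columns as a second input, is one workable layout (the CRF part is left out here, as in the original code):
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Concatenate

# Fit the tokenizer before converting, then pad so the sequences become a
# plain 2-D integer array instead of a column of ragged lists.
names = df['Name'].astype(str).tolist()
tokenizer = Tokenizer()
tokenizer.fit_on_texts(names)
name_seqs = tokenizer.texts_to_sequences(names)
max_len = max(len(s) for s in name_seqs)
name_input = pad_sequences(name_seqs, maxlen=max_len, padding='post')

# One-hot features stay in their own array.
other_input = pd.get_dummies(df[['CompanyId', 'TableTypeCode']],
                             columns=['CompanyId', 'TableTypeCode']).to_numpy(dtype='float32')

y = to_categorical(LabelEncoder().fit_transform(df['BlockName']))
n_classes = y.shape[1]

# Two-input model: Embedding + bi-LSTM for the name tokens, dense branch for the rest.
seq_in = Input(shape=(max_len,))
emb = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=50)(seq_in)
lstm = Bidirectional(LSTM(units=100))(emb)

oth_in = Input(shape=(other_input.shape[1],))
merged = Concatenate()([lstm, oth_in])
out = Dense(n_classes, activation='softmax')(merged)

model = Model([seq_in, oth_in], out)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit([name_input, other_input], y, epochs=5, batch_size=32)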

How to fix "ValueError: Error when checking target: expected dense_39 to have 4 dimensions, but got array with shape (10, 2)"?

I am new to transfer learning and CNNs and was just playing around with a CNN when I got this error. I tried many solutions but none of them work.
import numpy as np
import keras
from keras import backend as k
from keras.layers.core import Dense
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling2D
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import image
from keras.models import Model
from keras.applications import imagenet_utils
from sklearn.metrics import confusion_matrix
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
mobile = keras.applications.mobilenet.MobileNet()
#mobile.summary()
train_path = 'chest_xray/train'
val_path = 'chest_xray/val'
test_path = 'chest_xray/test'
train_batch = ImageDataGenerator(preprocessing_function=keras.applications.mobilenet.preprocess_input).flow_from_directory(
    train_path,
    target_size=(224, 224),
    batch_size=10)
test_batch = ImageDataGenerator(preprocessing_function=keras.applications.mobilenet.preprocess_input).flow_from_directory(
    test_path,
    target_size=(224, 224),
    batch_size=10,
    shuffle=False)
val_batch = ImageDataGenerator(preprocessing_function=keras.applications.mobilenet.preprocess_input).flow_from_directory(
    val_path,
    target_size=(224, 224),
    batch_size=10)
def prepare_image(file):
    image_path = ''
    img = image.load_img(image_path + file, target_size=(224, 224))
    img_array = image.img_to_array(img)
    img_array_dims = np.expand_dims(img_array, axis=0)
    return keras.applications.mobilenet.preprocess_input(img_array_dims)
x = mobile.layers[-60].output
predictions = Dense(1, activation='softmax')(x)
model = Model(inputs=mobile.input, outputs=predictions)
print(mobile.input)
#model.summary()
for layer in model.layers[:-5]:
    layer.trainable = False
model.compile(Adam(lr=.0001), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit_generator(train_batch,
                    steps_per_epoch=4,
                    validation_data=val_batch,
                    validation_steps=2,
                    epochs=30)
I am using MobileNet for transfer learning and the error appears every time. None of the solutions seem to work. I tried playing with Flatten() and GlobalMaxPooling2D(), but no results.
ERROR:
ValueError                                Traceback (most recent call last)
<ipython-input-187-08820ea8d15a> in <module>()
      3             validation_data=val_batch,
      4             validation_steps=2,
----> 5             epochs = 30)

ValueError: Error when checking target: expected dense_39 to have 4 dimensions, but got array with shape (10, 2)
The MobileNet layer at which you are chopping off (-60) is conv_dw_5_relu, which has output shape (None, 28, 28, 256), so you will have to flatten it before connecting a Dense layer to it.
Working code
mobile = keras.applications.mobilenet.MobileNet()
x = mobile.layers[-60].output
x = Flatten()(x)
predictions = Dense(2,activation='softmax')(x)
model = Model(inputs = mobile.input,outputs = predictions)
#model.summary()
model.compile(Adam(lr=.0001),loss='categorical_crossentropy',metrics=['accuracy'])
model.fit(np.random.rand(10, 224, 224, 3), np.random.rand(10,2))
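A small follow-up, as a sketch under the same setup: the original Dense(1, activation='softmax') always outputs 1.0 (a softmax over a single unit), so you need two softmax units matching the two classes (or one sigmoid unit with binary_crossentropy), and the final fit would presumably use the real generators rather than random arrays:
# Sketch: fit on the actual generators from the question instead of random data.
model.fit_generator(train_batch,
                    steps_per_epoch=4,
                    validation_data=val_batch,
                    validation_steps=2,
                    epochs=30)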

Can I export a tensorflow summary to CSV?

Is there a way to extract scalar summaries to CSV (preferably from within TensorBoard) from tfevents files?
Example code
The following code generates tfevent files in a summary_dir within the same directory. Suppose you let it run and you find something interesting. You want to get the raw data for further investigation. How would you do that?
#!/usr/bin/env python
"""A very simple MNIST classifier."""
import argparse
import sys

from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf

ce_with_logits = tf.nn.softmax_cross_entropy_with_logits

FLAGS = None


def inference(x):
    """
    Build the inference graph.

    Parameters
    ----------
    x : placeholder

    Returns
    -------
    Output tensor with the computed logits.
    """
    W = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    y = tf.matmul(x, W) + b
    return y


def loss(logits, labels):
    """
    Calculate the loss from the logits and the labels.

    Parameters
    ----------
    logits : Logits tensor, float - [batch_size, NUM_CLASSES].
    labels : Labels tensor, int32 - [batch_size]
    """
    cross_entropy = tf.reduce_mean(ce_with_logits(labels=labels,
                                                  logits=logits))
    return cross_entropy


def training(loss, learning_rate=0.5):
    """
    Set up the training Ops.

    Parameters
    ----------
    loss : Loss tensor, from loss().
    learning_rate : The learning rate to use for gradient descent.

    Returns
    -------
    train_op: The Op for training.
    """
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_step = optimizer.minimize(loss)
    return train_step


def main(_):
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

    # Create the model
    x = tf.placeholder(tf.float32, [None, 784])
    y = inference(x)

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 10])
    loss_ = loss(logits=y, labels=y_)
    train_step = training(loss_)

    # Test trained model
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    with tf.name_scope('accuracy'):
        tf.summary.scalar('accuracy', accuracy)
    merged = tf.summary.merge_all()

    sess = tf.InteractiveSession()
    train_writer = tf.summary.FileWriter('summary_dir/train', sess.graph)
    test_writer = tf.summary.FileWriter('summary_dir/test', sess.graph)
    tf.global_variables_initializer().run()

    for train_step_i in range(100000):
        if train_step_i % 100 == 0:
            summary, acc = sess.run([merged, accuracy],
                                    feed_dict={x: mnist.test.images,
                                               y_: mnist.test.labels})
            test_writer.add_summary(summary, train_step_i)
            summary, acc = sess.run([merged, accuracy],
                                    feed_dict={x: mnist.train.images,
                                               y_: mnist.train.labels})
            train_writer.add_summary(summary, train_step_i)
        batch_xs, batch_ys = mnist.train.next_batch(100)
        sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

    print(sess.run(accuracy, feed_dict={x: mnist.test.images,
                                        y_: mnist.test.labels}))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir',
                        type=str,
                        default='/tmp/tensorflow/mnist/input_data',
                        help='Directory for storing input data')
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
While the other answer here does what was asked from within TensorBoard, it only lets you download a CSV for a single run of a single tag.
If you have, for example, 10 tags and 20 runs (which is not much at all), you would need to repeat that step 200 times (that alone will probably take you more than an hour).
If you then want to actually do something with the data across all runs for a single tag, you would need to write some awkward CSV-accumulation script or copy everything by hand (which will probably cost you more than a day).
Therefore I would like to add a solution that extracts one CSV file per tag, with all runs contained. Column headers are the run path names and row indices are the run step numbers.
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
def tabulate_events(dpath):
    summary_iterators = [EventAccumulator(os.path.join(dpath, dname)).Reload() for dname in os.listdir(dpath)]

    tags = summary_iterators[0].Tags()['scalars']

    for it in summary_iterators:
        assert it.Tags()['scalars'] == tags

    out = defaultdict(list)
    steps = []

    for tag in tags:
        steps = [e.step for e in summary_iterators[0].Scalars(tag)]

        for events in zip(*[acc.Scalars(tag) for acc in summary_iterators]):
            assert len(set(e.step for e in events)) == 1

            out[tag].append([e.value for e in events])

    return out, steps


def to_csv(dpath):
    dirs = os.listdir(dpath)

    d, steps = tabulate_events(dpath)
    tags, values = zip(*d.items())
    np_values = np.array(values)

    for index, tag in enumerate(tags):
        df = pd.DataFrame(np_values[index], index=steps, columns=dirs)
        df.to_csv(get_file_path(dpath, tag))


def get_file_path(dpath, tag):
    file_name = tag.replace("/", "_") + '.csv'
    folder_path = os.path.join(dpath, 'csv')
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    return os.path.join(folder_path, file_name)


if __name__ == '__main__':
    path = "path_to_your_summaries"
    to_csv(path)
My solution builds upon: https://stackoverflow.com/a/48774926/2230045
EDIT:
I created a more sophisticated version and released it on GitHub: https://github.com/Spenhouet/tensorboard-aggregator
This version aggregates multiple tensorboard runs and is able to save the aggregates to a new tensorboard summary or as a .csv file.
Just check the "Data download links" option on the upper-left in TensorBoard, and then click on the "CSV" button that will appear under your scalar summary.
Here is my solution, which builds on the previous solutions but can scale up.
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
def tabulate_events(dpath):
    final_out = {}
    for dname in os.listdir(dpath):
        print(f"Converting run {dname}", end="")
        ea = EventAccumulator(os.path.join(dpath, dname)).Reload()
        tags = ea.Tags()['scalars']

        out = {}
        for tag in tags:
            tag_values = []
            wall_time = []
            steps = []

            for event in ea.Scalars(tag):
                tag_values.append(event.value)
                wall_time.append(event.wall_time)
                steps.append(event.step)

            out[tag] = pd.DataFrame(data=dict(zip(steps, np.array([tag_values, wall_time]).transpose())),
                                    columns=steps, index=['value', 'wall_time'])

        if len(tags) > 0:
            df = pd.concat(out.values(), keys=out.keys())
            df.to_csv(f'{dname}.csv')
            print("- Done")
            final_out[dname] = df   # only keep runs that actually contained scalars
        else:
            print('- No scalars to write')

    return final_out


if __name__ == '__main__':
    path = "your/path/here"
    steps = tabulate_events(path)
    pd.concat(steps.values(), keys=steps.keys()).to_csv('all_result.csv')
Very minimal example:
import pandas as pd
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
log_dir = "lightning_logs/version_1"
event_accumulator = EventAccumulator(log_dir)
event_accumulator.Reload()
events = event_accumulator.Scalars("train_loss")
x = [x.step for x in events]
y = [x.value for x in events]
df = pd.DataFrame({"step": x, "train_loss": y})
df.to_csv("train_loss.csv")
print(df)
   step  train_loss
0     0  700.491516
1     1  163.593246
2     2  146.365448
3     3  153.830215
...
Plotting loss vs epochs example:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
log_dir = "lightning_logs/version_1"
y_key = "val_loss"
event_accumulator = EventAccumulator(log_dir)
event_accumulator.Reload()
steps = {x.step for x in event_accumulator.Scalars("epoch")}
x = list(range(len(steps)))
y = [x.value for x in event_accumulator.Scalars(y_key) if x.step in steps]
df = pd.DataFrame({"epoch": x, y_key: y})
df.to_csv(f"{y_key}.csv")
fig, ax = plt.subplots()
sns.lineplot(data=df, x="epoch", y=y_key)
fig.savefig("plot.png", dpi=300)
Just to add to @Spen's answer, in case you want to export the data when you have varying numbers of steps: this will make one large CSV file.
You might need to change the keys around for it to work for you.
import os
import glob
from collections import defaultdict

import numpy as np
import pandas as pd
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

listOutput = glob.glob("*/")

listDF = []

for tb_output_folder in listOutput:
    print(tb_output_folder)
    x = EventAccumulator(path=tb_output_folder)
    x.Reload()
    x.FirstEventTimestamp()
    keys = ['loss', 'mean_absolute_error', 'val_loss', 'val_mean_absolute_error']

    listValues = {}

    steps = [e.step for e in x.Scalars(keys[0])]
    wall_time = [e.wall_time for e in x.Scalars(keys[0])]
    index = [e.index for e in x.Scalars(keys[0])]
    count = [e.count for e in x.Scalars(keys[0])]
    n_steps = len(steps)
    listRun = [tb_output_folder] * n_steps
    printOutDict = {}

    data = np.zeros((n_steps, len(keys)))
    for i in range(len(keys)):
        data[:, i] = [e.value for e in x.Scalars(keys[i])]

    printOutDict = {keys[0]: data[:, 0], keys[1]: data[:, 1],
                    keys[2]: data[:, 2], keys[3]: data[:, 3]}
    printOutDict['Name'] = listRun

    DF = pd.DataFrame(data=printOutDict)

    listDF.append(DF)

df = pd.concat(listDF)
df.to_csv('Output.csv')