How to convert BatchNorm weights of caffe to pytorch BatchNorm? - caffe

The BatchNorm and Scale weights of a Caffe model can be read with pycaffe: the BatchNorm layer has three weight blobs and the Scale layer has two. I tried to copy those weights into a PyTorch BatchNorm with code like this:
if 'conv3_final_bn' == name:
    assert len(blobs) == 3, '{} layer blob count: {}'.format(name, len(blobs))
    torch_mod['conv3_final_bn.running_mean'] = blobs[0].data
    torch_mod['conv3_final_bn.running_var'] = blobs[1].data
elif 'conv3_final_scale' == name:
    assert len(blobs) == 2, '{} layer blob count: {}'.format(name, len(blobs))
    torch_mod['conv3_final_bn.weight'] = blobs[0].data
    torch_mod['conv3_final_bn.bias'] = blobs[1].data
The two BatchNorms behave differently. I also tried setting conv3_final_bn.weight=1 and conv3_final_bn.bias=0 to check the Caffe BN layer on its own, but the results didn't match either.
How should I fix the mismatch?

Got it! There is a third parameter in Caffe's BatchNorm layer. The code should be:
if 'conv3_final_bn' == name:
    assert len(blobs) == 3, '{} layer blob count: {}'.format(name, len(blobs))
    torch_mod['conv3_final_bn.running_mean'] = blobs[0].data / blobs[2].data[0]
    torch_mod['conv3_final_bn.running_var'] = blobs[1].data / blobs[2].data[0]
elif 'conv3_final_scale' == name:
    assert len(blobs) == 2, '{} layer blob count: {}'.format(name, len(blobs))
    torch_mod['conv3_final_bn.weight'] = blobs[0].data
    torch_mod['conv3_final_bn.bias'] = blobs[1].data
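The third blob in Caffe's BatchNorm layer is a moving-average scale factor, which is why the stored mean and variance have to be divided by blobs[2].data[0]. Another detail that is easy to miss when comparing the two frameworks: the PyTorch model has to be switched to eval() mode, otherwise BatchNorm uses the current batch statistics instead of the copied running statistics. A minimal comparison sketch (the names model, model_up_to_bn, caffe_net and the input array x are placeholders for your own model and data, not part of the original code):

import torch

model.load_state_dict(torch_mod)
model.eval()  # use running_mean/running_var instead of batch statistics
with torch.no_grad():
    torch_out = model_up_to_bn(torch.from_numpy(x))   # forward pass up to conv3_final_bn
caffe_net.blobs['data'].data[...] = x
caffe_net.forward()
caffe_out = caffe_net.blobs['conv3_final'].data       # top blob of the BN + Scale pair; name depends on your prototxt
print(abs(torch_out.numpy() - caffe_out).max())       # should be on the order of 1e-5

Both frameworks default to eps = 1e-5, so the epsilon normally does not need adjusting.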

Related

PyTorch: Target 1 is out of bounds

I am new to deep learning and wondering how to modify my model to fix this error.
It says "Target 1 is out of bounds", so which parameter should I change to make it work? When the output size is changed to 2, the error goes away; the goal of the model is 2-class classification, but with an output size of 2 the training loss becomes nan.
The data is a dataframe with shape (15958, 4) transformed into tensor format.
SplitNN is a class:
# SplitNN
# to protect privacy and split
class SplitNN:
    def __init__(self, models, optimizers):
        self.models = models
        self.optimizers = optimizers
        self.data = []
        self.remote_tensors = []

    def forward(self, x):
        data = []
        remote_tensors = []
        data.append(self.models[0](x))
        if data[-1].location == self.models[1].location:
            remote_tensors.append(data[-1].detach().requires_grad_())
        else:
            remote_tensors.append(
                data[-1].detach().move(self.models[1].location).requires_grad_()
            )
        i = 1
        while i < (len(models) - 1):
            data.append(self.models[i](remote_tensors[-1]))
            if data[-1].location == self.models[i + 1].location:
                remote_tensors.append(data[-1].detach().requires_grad_())
            else:
                remote_tensors.append(
                    data[-1].detach().move(self.models[i + 1].location).requires_grad_()
                )
            i += 1
        data.append(self.models[i](remote_tensors[-1]))
        self.data = data
        self.remote_tensors = remote_tensors
        return data[-1]

    def backward(self):
        for i in range(len(models) - 2, -1, -1):
            if self.remote_tensors[i].location == self.data[i].location:
                grads = self.remote_tensors[i].grad.copy()
            else:
                grads = self.remote_tensors[i].grad.copy().move(self.data[i].location)
            self.data[i].backward(grads)

    def zero_grads(self):
        for opt in self.optimizers:
            opt.zero_grad()

    def step(self):
        for opt in self.optimizers:
            opt.step()
Below is the code:
Model setup: the model is a sequential deep learning model in which I use nn.Linear layers to generate a binary prediction.
torch.manual_seed(0)
# Define our model segments
input_size = 3
hidden_sizes = [128, 640]
output_size = 1
# original models
models = [
    nn.Sequential(
        nn.Linear(input_size, hidden_sizes[0]),
        nn.ReLU(),
        nn.Linear(hidden_sizes[0], hidden_sizes[1]),
        nn.ReLU(),
    ),
    nn.Sequential(nn.Linear(hidden_sizes[1], output_size), nn.LogSoftmax(dim=1)),
]
# Create optimisers for each segment and link to them
optimizers = [
    optim.SGD(model.parameters(), lr=0.03,)
    for model in models
]
The train function is here:
def train(x, target, splitNN):
    #1) Zero our grads
    splitNN.zero_grads()
    #2) Make a prediction
    pred = splitNN.forward(x)
    #3) Figure out how much we missed by
    criterion = nn.NLLLoss()
    loss = criterion(pred, target)
    #4) Backprop the loss on the end layer
    loss.backward()
    #5) Feed gradients backward through the network
    splitNN.backward()
    #6) Change the weights
    splitNN.step()
    return loss, pred
Finally, the training loop, which is also where the problem happens:
The send function assigns the data and models to the nodes, because this is set up to simulate federated learning.
for i in range(epochs):
    running_loss = 0
    correct_preds = 0
    total_preds = 0
    for (data, ids1), (labels, ids2) in dataloader:
        # Train a model
        data = data.send(models[0].location)
        data = data.view(data.shape[0], -1)
        labels = labels.send(models[-1].location)
        # Call model
        loss, preds = train(data.float(), labels, splitNN)
        # Collect statistics
        running_loss += loss.get()
        correct_preds += preds.max(1)[1].eq(labels).sum().get().item()
        total_preds += preds.get().size(0)
    print(f"Epoch {i} - Training loss: {running_loss/len(dataloader):.3f} - Accuracy: {100*correct_preds/total_preds:.3f}")
The traceback shows the problem occurs at loss, preds = train(data.float(), labels, splitNN).
The actual error message:
During handling of the above exception, another exception occurred:
IndexError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
1836 .format(input.size(0), target.size(0)))
1837 if dim == 2:
-> 1838 ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
1839 elif dim == 4:
1840 ret = torch._C._nn.nll_loss2d(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
IndexError: Target 1 is out of bounds.
Please help me. Thank you
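nn.NLLLoss expects class indices in the range [0, C-1], where C is the size of the model's output dimension. With output_size = 1 the only valid target is 0, so any label equal to 1 raises exactly this IndexError; a two-class LogSoftmax + NLLLoss setup needs two output units. A minimal sketch of that behavior, separate from the SplitNN/federated setup above:

import torch
import torch.nn as nn

criterion = nn.NLLLoss()
targets = torch.tensor([0, 1, 1, 0])
log_probs_1 = torch.log_softmax(torch.randn(4, 1), dim=1)  # one output unit: only target 0 is valid (and log-softmax of a single unit is always 0)
log_probs_2 = torch.log_softmax(torch.randn(4, 2), dim=1)  # two output units: targets 0 and 1 are valid

# criterion(log_probs_1, targets)  # IndexError: Target 1 is out of bounds.
print(criterion(log_probs_2, targets))  # works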

What should I do if my regression model is stuck at a high loss value?

I'm using neural nets for a regression problem where I have 3 features and I'm trying to predict one continuous value. I noticed that my neural net starts learning well, but after 10 epochs it gets stuck on a high loss value and cannot improve any more.
I tried Adam and other adaptive optimizers instead of SGD, but that didn't work. I tried more complex architectures, adding layers, neurons, batch normalization and other activations, etc., and that also didn't work.
I tried to debug and find out whether something is wrong with the implementation, but when I use only 10 examples of the data my model learns fast, so there don't seem to be obvious errors. I then increased the number of examples while monitoring the results; when I reach 3000 examples, the model starts to get stuck on a high loss value.
I tried increasing layers and neurons, and also tried other activations and batch normalization. My data are normalized to [-1, 1]; my target value is not normalized, since it is regression and I'm predicting a continuous value. I also tried Keras and got the same result.
My real dataset has 40000 samples. I don't know what else to try; I have tried almost everything I know for optimization, but none of it worked. I would appreciate it if someone could guide me on this. I'll post my code, though it may be messy to read; I'm fairly sure there is no problem with the implementation itself. I'm using skorch/PyTorch and some scikit-learn functions:
# take all features as independent variables except the bearing and distance
# when I start small the model learns well, but from 3000 data points, as you can see, the model gets
# stuck at a high value: the loss starts at 15 and decreases nicely, but once it reaches 9 it stays there
# and if I try to use the whole dataset for training, the loss starts at 47, decreases to 36, and then gets stuck there too
X = dataset.iloc[:3000, 0:-2].reset_index(drop=True).to_numpy().astype(np.float32)
# take distance and bearing as the output values:
y = dataset.iloc[:3000, -2:].reset_index(drop=True).to_numpy().astype(np.float32)
y_bearing = y[:, 0].reshape(-1, 1)
y_distance = y[:, 1].reshape(-1, 1)

# normalize the input values
scaler = StandardScaler()
X_norm = scaler.fit_transform(X, y)

X_br_train, X_br_test, y_br_train, y_br_test = train_test_split(X_norm,
                                                                y_bearing,
                                                                test_size=0.1,
                                                                random_state=42,
                                                                shuffle=True)
X_dis_train, X_dis_test, y_dis_train, y_dis_test = train_test_split(X_norm,
                                                                    y_distance,
                                                                    test_size=0.1,
                                                                    random_state=42,
                                                                    shuffle=True)
bearing_trainset = Dataset(X_br_train, y_br_train)
bearing_testset = Dataset(X_br_test, y_br_test)
distance_trainset = Dataset(X_dis_train, y_dis_train)
distance_testset = Dataset(X_dis_test, y_dis_test)

def root_mse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        return torch.sqrt(self.mse(yhat, y))

class AED(nn.Module):
    """custom average euclidean distance loss"""
    def __init__(self):
        super().__init__()

    def forward(self, yhat, y):
        return torch.dist(yhat, y)

def train(on_target,
          hidden_units,
          batch_size,
          epochs,
          optimizer,
          lr,
          regularisation_factor,
          train_shuffle):
    network = None
    trainset = distance_trainset if on_target.lower() == 'distance' else bearing_trainset
    testset = distance_testset if on_target.lower() == 'distance' else bearing_testset
    print(f"shape of trainset.X = {trainset.X.shape}, shape of trainset.y = {trainset.y.shape}")
    print(f"shape of testset.X = {testset.X.shape}, shape of testset.y = {testset.y.shape}")

    mse = EpochScoring(scoring=mean_squared_error, lower_is_better=True, name='MSE')
    r2 = EpochScoring(scoring=r2_score, lower_is_better=False, name='R2')
    rmse = EpochScoring(scoring=make_scorer(root_mse), lower_is_better=True, name='RMSE')
    checkpoint = Checkpoint(dirname=f'results/{on_target}/checkpoints')
    train_end_checkpoint = TrainEndCheckpoint(dirname=f'results/{on_target}/checkpoints')

    if on_target.lower() == 'bearing':
        network = BearingNetwork(n_features=X_norm.shape[1],
                                 n_hidden=hidden_units,
                                 n_out=y_distance.shape[1])
    elif on_target.lower() == 'distance':
        network = DistanceNetwork(n_features=X_norm.shape[1],
                                  n_hidden=hidden_units,
                                  n_out=1)

    model = NeuralNetRegressor(
        module=network,
        criterion=RMSELoss,
        device='cpu',
        batch_size=batch_size,
        lr=lr,
        optimizer=optim.Adam if optimizer.lower() == 'adam' else optim.SGD,
        optimizer__weight_decay=regularisation_factor,
        max_epochs=epochs,
        iterator_train__shuffle=train_shuffle,
        train_split=predefined_split(testset),
        callbacks=[mse, r2, rmse, checkpoint, train_end_checkpoint]
    )

    print(f"{'*' * 10} start training the {on_target} model {'*' * 10}")
    history = model.fit(trainset, y=None)
    print(f"{'*' * 10} End Training the {on_target} Model {'*' * 10}")

if __name__ == '__main__':
    args = parser.parse_args()
    train(on_target=args.on_target,
          hidden_units=args.hidden_units,
          batch_size=args.batch_size,
          epochs=args.epochs,
          optimizer=args.optimizer,
          lr=args.learning_rate,
          regularisation_factor=args.regularisation_lambda,
          train_shuffle=args.shuffle)
and this is my network declaration:
class DistanceNetwork(nn.Module):
    """separate NN for predicting distance"""
    def __init__(self, n_features=5, n_hidden=16, n_out=1):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(n_features, n_hidden),
            nn.LeakyReLU(),
            nn.Linear(n_hidden, 5),
            nn.LeakyReLU(),
            nn.Linear(5, n_out)
        )
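The class as posted ends after __init__; presumably the forward method was cut off when pasting. With skorch the module still needs one, and it would most likely just delegate to the sequential block, along these lines:

    def forward(self, x):
        # assumed continuation of DistanceNetwork, not shown in the original post
        return self.model(x)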
here is the log while training:

How do I get CSV files into an Estimator in Tensorflow 1.6

I am new to TensorFlow (and this is my first question on Stack Overflow).
As a learning tool, I am trying to do something simple. (4 days later I am still confused)
I have one CSV file with 36 columns (3500 records) with 0s and 1s.
I am envisioning this file as a flattened 6x6 matrix.
I have another CSV file with 1 column of ground truth 0 or 1 (3500 records), which indicates whether at least 4 of the 6 elements on the 6x6 matrix's diagonal are 1's.
I am not sure I have processed the CSV files correctly.
I am confused as to how I create the features dictionary and Labels and how that fits into the DNNClassifier
I am using TensorFlow 1.6, Python 3.6
Below is the small amount of code I have so far.
import tensorflow as tf
import os

def x_map(line):
    rDefaults = [[] for cl in range(36)]
    x_row = tf.decode_csv(line, record_defaults=rDefaults)
    return x_row

def y_map(line):
    line = tf.string_to_number(line, out_type=tf.int32)
    y_row = tf.one_hot(line, depth=2)
    return y_row

x_path_file = os.path.join('D:', 'Diag', '6x6_train.csv')
y_path_file = os.path.join('D:', 'Diag', 'HasDiag_train.csv')

filenames = [x_path_file]
x_dataset = tf.data.TextLineDataset(filenames)
x_dataset = x_dataset.map(x_map)
x_dataset = x_dataset.batch(1)
x_iter = x_dataset.make_one_shot_iterator()
x_next_el = x_iter.get_next()

filenames = [y_path_file]
y_dataset = tf.data.TextLineDataset(filenames)
y_dataset = y_dataset.map(y_map)
y_dataset = y_dataset.batch(1)
y_iter = y_dataset.make_one_shot_iterator()
y_next_el = y_iter.get_next()

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    x_el = (sess.run(x_next_el))
    y_el = (sess.run(y_next_el))
The output for x_el is:
(array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([0.] ... it goes on...
The output for y_el is:
[[1. 0.]]
You're pretty much there for a minimal working model. The main issue I see is that tf.decode_csv returns a tuple of tensors, whereas I expect you want a single tensor with all values. Easy fix:
x_row = tf.stack(tf.decode_csv(line, record_defaults=rDefaults))
That should work... but it fails to take advantage of many of the awesome things the tf.data.Dataset API has to offer, like shuffling, parallel threading etc. For example, if you shuffle each dataset, those shuffling operations won't be consistent. This is because you've created two separate datasets and manipulated them independently. If you create them independently, zip them together then manipulate, those manipulations will be consistent.
Try something along these lines:
def get_inputs(
        count=None, shuffle=True, buffer_size=1000, batch_size=32,
        num_parallel_calls=8, n_dims=6,
        x_paths=[x_path_file], y_paths=[y_path_file]):
    """
    Get x, y inputs.

    Args:
        count: number of epochs. None indicates infinite epochs.
        shuffle: whether or not to shuffle the dataset
        buffer_size: used in shuffle
        batch_size: size of batch. See outputs below
        num_parallel_calls: used in map. Note if > 1, intra-batch ordering
            will be shuffled
        n_dims: side length of the matrix; each x row holds n_dims**2 values
        x_paths: list of paths to x-value files.
        y_paths: list of paths to y-value files.

    Returns:
        x: (batch_size, n_dims**2) tensor of flattened matrix values
        y: (batch_size, 2) tensor of 1-hot labels
    """
    def x_map(line):
        rDefaults = [[] for cl in range(n_dims**2)]
        x_row = tf.stack(tf.decode_csv(line, record_defaults=rDefaults))
        return x_row

    def y_map(line):
        line = tf.string_to_number(line, out_type=tf.int32)
        y_row = tf.one_hot(line, depth=2)
        return y_row

    def xy_map(x, y):
        return x_map(x), y_map(y)

    x_ds = tf.data.TextLineDataset(x_paths)
    y_ds = tf.data.TextLineDataset(y_paths)
    combined = tf.data.Dataset.zip((x_ds, y_ds))
    combined = combined.repeat(count=count)
    if shuffle:
        combined = combined.shuffle(buffer_size)
    combined = combined.map(xy_map, num_parallel_calls=num_parallel_calls)
    combined = combined.batch(batch_size)
    x, y = combined.make_one_shot_iterator().get_next()
    return x, y
To experiment/debug,
x, y = get_inputs()
with tf.Session() as sess:
    xv, yv = sess.run((x, y))
print(xv.shape, yv.shape)
For use in an estimator, pass the function itself.
estimator.train(get_inputs, max_steps=10000)

def get_eval_inputs():
    return get_inputs(
        count=1, shuffle=False,
        x_paths=[x_eval_paths],
        y_paths=[y_eval_paths])

estimator.evaluate(get_eval_inputs)
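To connect this back to the features-dictionary question: tf.estimator.DNNClassifier expects the input function to return a dict of named feature tensors plus a label tensor, and with n_classes=2 it wants integer class labels rather than one-hot vectors. A rough sketch under those assumptions (the feature name 'cells' and the hidden_units values are made up for illustration):

feature_columns = [tf.feature_column.numeric_column('cells', shape=[36])]

def get_train_inputs():
    x, y = get_inputs()
    # wrap the flattened 36 values in a dict and convert one-hot labels back to class indices
    return {'cells': x}, tf.argmax(y, axis=1)

estimator = tf.estimator.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=[32, 16],
    n_classes=2)
estimator.train(get_train_inputs, max_steps=10000)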

How to test the correctness of a Keras custom layer?

After creating a Keras custom layer with trainable weights, how can one test the correctness of the code? It does not seem to be described in Keras' manual.
For example, to test the expected behavior of a function, one can write a unit test. How can we do this for a Keras custom layer?
You can still do something like a unit test by getting the output of the custom layer for a given input and verifying it against a manually calculated output.
Let's say your custom layer Custom takes (None, 3, 200) as input shape and returns (None, 3):
from keras.layers import Input
from keras.models import Model
inp = Input(shape=(3, 200))
out = Custom()(inp)
model = Model(inp, out)
output = model.predict(your_input)
You can verify the layer output output with your expected output for a known input your_input.
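Wrapped in an actual unit test, with the expected output computed by a hand-written NumPy reference, this could look like the sketch below (Custom, your_input and reference_numpy_implementation are placeholders for your own layer, data and reference code):

import numpy as np
from keras.layers import Input
from keras.models import Model

def test_custom_layer():
    inp = Input(shape=(3, 200))
    out = Custom()(inp)                 # the layer under test
    model = Model(inp, out)
    your_input = np.random.random((4, 3, 200)).astype('float32')
    expected = reference_numpy_implementation(your_input)  # hand-computed expected output
    np.testing.assert_allclose(model.predict(your_input), expected, rtol=1e-5)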
layer_test in keras utils.
https://github.com/keras-team/keras/blob/master/keras/utils/test_utils.py
They provide the following code, which tests the shape, the actual result, serialization and training:
def layer_test(layer_cls, kwargs={}, input_shape=None, input_dtype=None,
               input_data=None, expected_output=None,
               expected_output_dtype=None, fixed_batch_size=False):
    """Test routine for a layer with a single input tensor
    and single output tensor.
    """
    # generate input data
    if input_data is None:
        assert input_shape
        if not input_dtype:
            input_dtype = K.floatx()
        input_data_shape = list(input_shape)
        for i, e in enumerate(input_data_shape):
            if e is None:
                input_data_shape[i] = np.random.randint(1, 4)
        input_data = (10 * np.random.random(input_data_shape))
        input_data = input_data.astype(input_dtype)
    else:
        if input_shape is None:
            input_shape = input_data.shape
        if input_dtype is None:
            input_dtype = input_data.dtype
    if expected_output_dtype is None:
        expected_output_dtype = input_dtype

    # instantiation
    layer = layer_cls(**kwargs)

    # test get_weights , set_weights at layer level
    weights = layer.get_weights()
    layer.set_weights(weights)

    expected_output_shape = layer.compute_output_shape(input_shape)

    # test in functional API
    if fixed_batch_size:
        x = Input(batch_shape=input_shape, dtype=input_dtype)
    else:
        x = Input(shape=input_shape[1:], dtype=input_dtype)
    y = layer(x)
    assert K.dtype(y) == expected_output_dtype

    # check with the functional API
    model = Model(x, y)

    actual_output = model.predict(input_data)
    actual_output_shape = actual_output.shape
    for expected_dim, actual_dim in zip(expected_output_shape,
                                        actual_output_shape):
        if expected_dim is not None:
            assert expected_dim == actual_dim
    if expected_output is not None:
        assert_allclose(actual_output, expected_output, rtol=1e-3)

    # test serialization, weight setting at model level
    model_config = model.get_config()
    recovered_model = model.__class__.from_config(model_config)
    if model.weights:
        weights = model.get_weights()
        recovered_model.set_weights(weights)
        _output = recovered_model.predict(input_data)
        assert_allclose(_output, actual_output, rtol=1e-3)

    # test training mode (e.g. useful when the layer has a
    # different behavior at training and testing time).
    if has_arg(layer.call, 'training'):
        model.compile('rmsprop', 'mse')
        model.train_on_batch(input_data, actual_output)

    # test instantiation from layer config
    layer_config = layer.get_config()
    layer_config['batch_input_shape'] = input_shape
    layer = layer.__class__.from_config(layer_config)

    # for further checks in the caller function
    return actual_output
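As a usage example (the exact call depends on your layer's constructor, so treat this as a sketch): for a custom layer Custom that takes (batch, 3, 200) inputs, you could exercise it the same way Keras' own tests exercise built-in layers:

from keras.utils.test_utils import layer_test

layer_test(Custom, kwargs={}, input_shape=(2, 3, 200))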

Simple Multilayer Perceptron model does not converge in TensorFlow

I am new to TensorFlow. Today I tried to implement my first model in TF but it returned strange results. I know that I am missing something here but I was not able to figure it out. Here is the story.
Model
I have a simple Multilayer Perceptron model with only a single hidden layer, applied to the MNIST database. The layers are defined as [input(784), hidden_layer(470), output_layer(10)], with tanh as the non-linearity for the hidden layer and softmax as the loss for the output layer. The optimizer I am using is gradient descent with a learning rate of 0.01. My mini-batch size is 1 (I am training the model sample by sample).
My implementations :
First I implemented my model in C++ and got around 96% accuracy. Here is the repository: https://github.com/amin2ros/Artificog
Then I implemented the exact same model in TensorFlow, but surprisingly the model didn't converge at all. Here is the code.
Code:
import sys
import input_data
import matplotlib.pyplot as plt
from pylab import *
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
import tensorflow as tf

# Parameters
learning_rate = 0.1
training_epochs = 1
batch_size = 1
display_step = 1

# Network Parameters
n_hidden_1 = 470 # 1st layer num features
n_input = 784    # MNIST data input (img shape: 28*28)
n_classes = 10   # MNIST total classes (0-9 digits)

# tf Graph input
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])

# Create model
def multilayer_perceptron(_X, _weights, _biases):
    layer_1 = tf.tanh(tf.add(tf.matmul(_X, _weights['h1']), _biases['b1']))
    return tf.matmul(layer_1, _weights['out']) + _biases['out']

# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'out': tf.Variable(tf.random_normal([n_hidden_1, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

# Construct model
pred = multilayer_perceptron(x, weights, biases)

# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax(pred)) # Softmax loss
optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(cost)

# Initializing the variables
init = tf.initialize_all_variables()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        m = 0
        total_batch = int(mnist.train.num_examples/batch_size)
        counter = 0
        #print 'count = ' , total_batch
        #sys.stdin.read(1)
        # Loop over all batches
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            label = tf.argmax(batch_ys, 1).eval()[0]
            counter += 1
            sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})
            wrong_prediction = tf.not_equal(tf.argmax(pred, 1), tf.argmax(y, 1))
            missed = tf.cast(wrong_prediction, "float")
            m += missed.eval({x: batch_xs, y: batch_ys})[0]
            print "Sample #", counter, " - Label : ", label, " - Prediction :", tf.argmax(pred, 1).eval({x: batch_xs, y: batch_ys})[0], \
                "- Missed = ", m, " - Error Rate = ", 100 * float(m)/counter
    print "Optimization Finished!"
I am very curious why this happens. Any help is appreciated.
Edit:
As commented below, the definition of the cost function was incorrect; it should be:
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
Now model converges :)
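For the record, the original cost = tf.reduce_mean(tf.nn.softmax(pred)) could never train anything: every row of a softmax sums to 1, so averaging it over 10 classes gives exactly 0.1 no matter what the weights are, and the gradient of a constant is zero. A quick standalone check of that fact (plain NumPy, not from the original post):

import numpy as np

logits = np.random.randn(5, 10)
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
print(probs.mean())  # always 0.1 for 10 classes, independent of the logits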