Why does loss history go to zero for momentum - deep-learning

I am trying to implement momentum for a linear regression model, I will like to save my plus at each point and plot against the iterations, but the loss decreases in the first few epochs and goes to zero, how do I solve this
''' def compute_cost(self, X, y):
loss = float((np.sum((X.dot(self.theta) - y) ** 2) ))/ 2
return loss
def fitmomentum(self, X, y,momentum = 0.9 ):
X = self.add_ones(X.values)
self.theta = np.zeros(X.shape[1])
self.cost_history = np.zeros(self.epoch)
self.momentum = momentum
self.velocity = np.zeros(self.epoch)
self.theta = self.momentum(g, self.velocity)
for i in range(len((X.T # (X # self.theta - y)))):
self.velocity[i] = self.momentum * self.velocity[i] + self.lr * ((X.T # (X # self.theta - y)))[i]
self.theta[i] += self.velocity[i]
self.cost_history[i] = self.compute_cost(X, y)
return self.theta, self.cost_history '''
This is what I see after printing the loss
'''array([4.20078101e+13, 6.06945342e+13, 7.01728125e+17, ...,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00])'''
This is what the graph looks like..
plot of epochs against loss history

Related

Poor results fine-tunned GoogLeNet, how to improve them?

I've trained the GooogLeNet from scratch on the MNIST dataset. It achieved very good results (top-1 accuracy of 99% on test set).
Now I want to do transfer learning in order to adapt it to the FashionMNIST dataset. For that I'm doing the following:
# Loading trained model on MNIST
googlenet = torch.load('googlenet-mnist.pth')
# Freeze the network
def freeze(net):
for param in net.parameters():
param.requires_grad = False
return net
# Override all the Linear layers and initialize them
# (including the ones that produce auxiliarity logits)
def forget_FC(net):
net.aux1.fc1 = nn.Linear(in_features=net.aux1.fc1.in_features, out_features=net.aux1.fc1.out_features, bias=True)
net.aux1.fc2 = nn.Linear(in_features=net.aux1.fc2.in_features, out_features=net.aux1.fc2.out_features, bias=True)
net.aux2.fc1 = nn.Linear(in_features=net.aux2.fc1.in_features, out_features=net.aux2.fc1.out_features, bias=True)
net.aux2.fc2 = nn.Linear(in_features=net.aux2.fc2.in_features, out_features=net.aux2.fc2.out_features, bias=True)
# Override the classification layer
net.fc = nn.Sequential(
nn.Linear(num_in_features, num_in_features),
nn.Linear(num_in_features, num_in_features),
nn.Linear(num_in_features, 10))
# Initialize weights auxiliarity logits branches
torch.nn.init.trunc_normal_(net.aux1.fc1.weight, mean=0.0, std=0.01, a=-2, b=2)
torch.nn.init.trunc_normal_(net.aux1.fc2.weight, mean=0.0, std=0.01, a=-2, b=2)
torch.nn.init.trunc_normal_(net.aux2.fc1.weight, mean=0.0, std=0.01, a=-2, b=2)
torch.nn.init.trunc_normal_(net.aux2.fc2.weight, mean=0.0, std=0.01, a=-2, b=2)
# Initialize weights each Linear module in the classification layer
for module in net.fc.modules():
if isinstance(module, nn.Linear):
torch.nn.init.trunc_normal_(module.weight, mean=0.0, std=0.01, a=-2, b=2)
return net
# The training algorithm
def train(net, train_iter, test_iter, num_epochs, lr, device, plot_title, fine_tune=False):
"""Train a model with a GPU.
"""
# def init_weights(m):
# if type(m) == nn.Linear or type(m) == nn.Conv2d:
# nn.init.xavier_uniform_(m.weight)
# net.apply(init_weights)
print('training on', device)
progress = ""
net.to(device)
if fine_tune:
optimizer = torch.optim.SGD(net.fc.parameters(), lr=lr, momentum=.9)
optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=.9)
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=1, verbose=True)
loss = nn.CrossEntropyLoss()
animator = Animator(xlabel='epoch', xlim=[1, num_epochs], title=plot_title, ylim=[0, 1], figsize=(5,5),
legend=['train loss', 'train acc', 'val acc'])
timer, num_batches = d2l.Timer(), len(train_iter)
for epoch in range(num_epochs):
# Sum of training loss, sum of training accuracy, sum of top 5 training accuracy, no. of examples
metric = d2l.Accumulator(4)
net.train()
# Training
for i, (X, y) in enumerate(train_iter):
timer.start()
optimizer.zero_grad()
X, y = X.to(device), y.to(device)
# Mini-batch inference
y_hat = net(X)
# Take into account the auxiliarity logits (see link cell above)
if isinstance(y_hat, GoogLeNetOutputs):
aux_logit1, aux_logit2, y_hat = y_hat
l1 = loss(y_hat, y)
l2 = loss(aux_logit1, y)
l3 = loss(aux_logit2, y)
l = l1 + .3 * (l2 + l3)
else:
l = loss(y_hat, y)
l.backward()
optimizer.step()
# Training accuracies
with torch.no_grad():
acc_1, acc_5 = accuracy(y_hat, y)
metric.add(l * X.shape[0], acc_1, acc_5, X.shape[0])
timer.stop()
train_l = metric[0] / metric[3]
train_acc_1 = metric[1] / metric[3]
train_acc_5 = metric[2] / metric[3]
if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
animator.add(epoch + (i + 1) / num_batches,
(train_l, train_acc_1, None), plot_title)
# Validation, (validation loss computed when model in eval mode, is that correct?)
val_l, test_acc_1, test_acc_5 = evaluate_accuracy_gpu(net, test_iter)
scheduler.step(val_l)
animator.add(epoch + 1, (None, None, test_acc_1), plot_title)
# Un-comment to see memory consumption, modify batch size to see effects
# print(os.popen('nvidia-smi').read())
# break
progress += f"----\nEpoch {epoch}/{num_epochs}\n\ttrain loss={train_l}[{train_acc_1}]\tval loss={val_l} [{test_acc_1}]\n----"
print(progress)
print(f'loss={train_l:.3f}, train=[1-acc {train_acc_1:.3f}, 5-acc {train_acc_5:.3f}]'
f'test=[1-acc {test_acc_1:.3f}, 5-acc {test_acc_5:.3f}]')
print(f'{metric[3] * num_epochs / timer.sum():.1f} examples/sec '
f'on {str(device)}')
print(f'total training time: {timer.sum()} seconds')
With this approach 34% training accuracy is achieved. Honestly, I was expecting more close to results obtained in MNIST. What is wrong with my current approach?

Is it possible to combine 2 neural networks?

I have a NET like (exemple from here)
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
# 1 input image channel, 6 output channels, 5x5 square convolution
# kernel
self.conv1 = nn.Conv2d(1, 6, 5)
self.conv2 = nn.Conv2d(6, 16, 5)
# an affine operation: y = Wx + b
self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5*5 from image dimension
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)
def forward(self, x):
# Max pooling over a (2, 2) window
x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
# If the size is a square, you can specify with a single number
x = F.max_pool2d(F.relu(self.conv2(x)), 2)
x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
net = Net()
and another net like (exemple from here)
class binaryClassification(nn.Module):
def __init__(self):
super(binaryClassification, self).__init__()
# Number of input features is 12.
self.layer_1 = nn.Linear(12, 64)
self.layer_2 = nn.Linear(64, 64)
self.layer_out = nn.Linear(64, 1)
self.relu = nn.ReLU()
self.dropout = nn.Dropout(p=0.1)
self.batchnorm1 = nn.BatchNorm1d(64)
self.batchnorm2 = nn.BatchNorm1d(64)
def forward(self, inputs):
x = self.relu(self.layer_1(inputs))
x = self.batchnorm1(x)
x = self.relu(self.layer_2(x))
x = self.batchnorm2(x)
x = self.dropout(x)
x = self.layer_out(x)
return x
I'd like to change, for exemple "self.fc2 = nn.Linear(120, 84)" in order to have 121 inputs, where the 121th is the x (output) of the binaryClassification network.
The idea is: I'd like to use in the same time, CNN network, and not-CNN network, to train both, with influence one on the other.
Is it possible? How can I perform that? (Keras or Pytorch examples are both ok).
Or maybe the idea is crazy and there is easier way to mix data and image as input of an unique network?
It is a perfectly valid approach, you are taking two different input data sources, processing them and combining the result to solve a common goal (in this case it seems like a 10-class image classification). You can define the input to your Net network to be a tuple of the image you need for the original Net and the features 12-value vector for your BinaryClassificator. An example code would be:
import torch
import torch.nn as nn
class binaryClassification(nn.Module):
#> ...same as above
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
# 1 input image channel, 6 output channels, 5x5 square convolution
# kernel
self.conv1 = nn.Conv2d(1, 6, 5)
self.conv2 = nn.Conv2d(6, 16, 5)
# an affine operation: y = Wx + b
self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5*5 from image dimension
self.binClas = binaryClassification()
self.fc2 = nn.Linear(121, 84)
self.fc3 = nn.Linear(84, 10)
def forward(self, inputs):
x, features = inputs # split tuple
# Max pooling over a (2, 2) window
x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
# If the size is a square, you can specify with a single number
x = F.max_pool2d(F.relu(self.conv2(x)), 2)
x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
# Concatenate with BinaryClassification
x = torch.cat([F.relu(self.fc1(x)), self.binClas(features)])
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
net = Net()
However! Be careful about training them together, it is hard to balance both branches in the network to make them learn. I would recommend you to train them separately for a while before plugging them together (generally speaking, the hyperparameters of one part of the network will probably not be optimal for the other). To do this, you could freeze one part of the network while training the other, and viceversa. (check this link to see how to freeze parts of a torch nn)
The most naive way to do it would be to instantiate both models, sum the two predictions and compute the loss with it. This will backpropagate through both models:
net1 = Net1()
net2 = Net2()
bce = torch.nn.BCEWithLogitsLoss()
params = list(net1.parameters()) + list(net2.parameters())
optimizer = optim.SGD(params)
for (x, ground_truth) in enumerate(your_data_loader):
optimizer.zero_grad()
prediction = net1(x) + net2(x) # the 2 models must output tensors of same shape
loss = bce(prediction, ground_truth)
train_loss.backward()
optimizer.step()
You could also e.g.
implement the layers of Net1 and Net2 in a single model
train Net1 and Net2 separately and ensemble them later

Pytorch - Loss for Object Localization

I am trying to perform an object localization task with MNIST based on Andrew Ng's lecture here. I am taking the MNIST digits and randomly placing them into a 90x90 shaped image and predicting the digit and it's center point. When I train, I am getting very poor results and my question is about whether or not my loss function is set up correctly. I basically just take the CrossEntropy for the digit, the MSE for the coordinates, and then add them all up. Is this correct? I don't get any errors, but the performance is just horrendous.
My dataset is defined as follows (which returns the label and the x y coordinates of the center of the digit):
class CustomMnistDataset_OL(Dataset):
def __init__(self, df, test=False):
'''
df is a pandas dataframe with 28x28 columns for each pixel value in MNIST
'''
self.df = df
self.test = test
def __len__(self):
return len(self.df)
def __getitem__(self, idx):
if self.test:
image = np.reshape(np.array(self.df.iloc[idx,:]), (28,28)) / 255.
else:
image = np.reshape(np.array(self.df.iloc[idx,1:]), (28,28)) / 255.
# create the new image
new_img = np.zeros((90, 90)) # images will be 90x90
# randomly select a bottom left corner to use for img
x_min, y_min = randrange(90 - image.shape[0]), randrange(90 - image.shape[0])
x_max, y_max = x_min + image.shape[0], y_min + image.shape[0]
x_center = x_min + (x_max-x_min)/2
y_center = y_min + (y_max-x_min)/2
new_img[x_min:x_max, y_min:y_max] = image
label = [int(self.df.iloc[idx,0]), x_center, y_center] # the label consists of the digit and the center of the number
sample = {"image": new_img, "label": label}
return sample['image'], sample['label']
My training function is set up as follows:
loss_fn = nn.CrossEntropyLoss()
loss_mse = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
def train(dataloader, model, loss_fn, loss_mse, optimizer):
model.train() # very important... This turns the model back to training mode
size = len(train_dataloader.dataset)
for batch, (X, y) in enumerate(dataloader):
X, y0, y1, y2 = X.to(device), y[0].to(device), y[1].to(device), y[2].to(device)
pred = model(X.float())
# DEFINE LOSS HERE -------
loss = loss_fn(pred[0], y0) + loss_mse(pred[1], y1.float()) + loss_mse(pred[2], y2.float())
optimizer.zero_grad()
loss.backward()
optimizer.step()
if batch % 100 == 0:
loss, current = loss.item(), batch*len(X)
print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")

Losses are increasing in Binary classification using gradient descent optimization method

This my program for Binary classification using gradient descent optimization method. I am not sure about my loss function. The error in my case is incresing when plotted
def sigmoid_activation(x):
return 1.0 / (1 + np.exp(-x))
def predict(testX, W):
preds= sigmoid_activation(np.dot(testX,W))
# apply a step function to threshold (=0.5) the outputs to binary class
#labels
#start your code here
for i in range(len(preds)):
if preds[i]<0.5:
p.append(0)
if preds[i]>=0.5:
p.append(1)
return p
epochs = 50
alpha = 0.01
(X,y)=make_moons(n_samples=1000, noise = 0.15)
y=y.reshape(y.shape[0],1)
X = np.c_[X, np.ones((X.shape[0]))]
(trainX, testX, trainY, testY) = train_test_split(X, y,
test_size=0.5, random_state=42)
print("[INFO] training...")
W = np.random.randn(X.shape[1], 1)
losses = []
for epoch in np.arange(0, epochs):
#start your code here
Z=np.dot(trainX, W)
yhat= sigmoid_activation(Z)
error=trainY-yhat
loss = np.sum(error ** 2)
losses.append(loss)
gradient = trainX.T.dot(error) / trainX.shape[0]
W= W-alpha*gradient #moving in -ve direction
# check to see if an update should be displayed
if epoch == 0 or (epoch + 1) % 5 == 0:
print("[INFO] epoch={}, loss={:.7f}".format(int(epoch + 1),
loss))
# evaluate our model
print("[INFO] evaluating...")
preds = predict(testX, W)
print(classification_report(testY, preds))
# plot the (testing) classification data
plt.style.use("ggplot")
plt.figure()
plt.title("Data")
plt.scatter(testX[:, 0], testX[:, 1], marker="o", c=testY[:,0], s=30)
# construct a figure that plots the loss over time
plt.style.use("ggplot")
plt.figure()
plt.plot(np.arange(0, epochs), losses)
plt.title("Training Loss")
plt.xlabel("Epoch #")
plt.ylabel("Loss")
plt.show()

Getting polynomial regression to overfit with TensorFlow

The Sklearn documentation contains an example of a polynomial regression which beautifully illustrates the idea of overfitting (link).
The third plot shows a 15th order polynomial that overfits the simulated data. I replicated this model in TensorFlow, but I cannot get it to overfit.
Even when tuning the learning rate and the numbers of learning epochs, I cannot get the model to overfit. What am I missing?
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
def true_fun(X):
return np.cos(1.5 * np.pi * X)
# Generate dataset
n_samples = 30
np.random.seed(0)
x_train = np.sort(np.random.rand(n_samples)) # Draw from uniform distribution
y_train = true_fun(x_train) + np.random.randn(n_samples) * 0.1
x_test = np.linspace(0, 1, 100)
y_true = true_fun(x_test)
# Helper function
def run_dir(base_dir, dirname='run'):
"Number log directories incrementally"
import os
import re
pattern = re.compile(dirname+'_(\d+)')
try:
previous_runs = os.listdir(base_dir)
except FileNotFoundError:
previous_runs = []
run_number = 0
for name in previous_runs:
match = pattern.search(name)
if match:
number = int(match.group(1))
if number > run_number:
run_number = number
run_number += 1
logdir = os.path.join(base_dir, dirname + '_%02d' % run_number)
return(logdir)
# Define the polynomial model
def model(X, w):
"""Polynomial model
param X: data
param y: coeficients in the polynomial regression
returns: Polynomial function Y(X, w)
"""
terms = []
for i in range(int(w.shape[0])):
term = tf.multiply(w[i], tf.pow(X, i))
terms.append(term)
return(tf.add_n(terms))
# Create the computation graph
order = 15
tf.reset_default_graph()
X = tf.placeholder("float")
Y = tf.placeholder("float")
w = tf.Variable([0.]*order, name="parameters")
lambda_reg = tf.placeholder('float', shape=[])
learning_rate_ph = tf.placeholder('float', shape=[])
y_model = model(X, w)
loss = tf.div(tf.reduce_mean(tf.square(Y-y_model)), 2) # Square error
loss_rg = tf.multiply(lambda_reg, tf.reduce_sum(tf.square(w))) # L2 pentalty
loss_total = tf.add(loss, loss_rg)
loss_hist1 = tf.summary.scalar('loss', loss)
loss_hist2 = tf.summary.scalar('loss_rg', loss_rg)
loss_hist3 = tf.summary.scalar('loss_total', loss_total)
summary = tf.summary.merge([loss_hist1, loss_hist2, loss_hist3])
train_op = tf.train.GradientDescentOptimizer(learning_rate_ph).minimize(loss_total)
init = tf.global_variables_initializer()
def train(sess, x_train, y_train, lambda_val=0, epochs=2000, learning_rate=0.01):
feed_dict={X: x_train, Y: y_train, lambda_reg: lambda_val, learning_rate_ph: learning_rate}
logdir = run_dir("logs/polynomial_regression2/")
writer = tf.summary.FileWriter(logdir)
sess.run(init)
for epoch in range(epochs):
_, summary_str = sess.run([train_op, summary], feed_dict=feed_dict)
writer.add_summary(summary_str, global_step=epoch)
final_cost, final_cost_rg, w_learned = sess.run([loss, loss_rg, w], feed_dict=feed_dict)
return final_cost, final_cost_rg, w_learned
def plot_test(w_learned, x_test, x_train, y_train):
y_learned = calculate_y(x_test, w_learned)
plt.scatter(x_train, y_train)
plt.plot(x_test, y_true, label="true function")
plt.plot(x_test, y_learned,'r', label="learned function")
#plt.title('$\lambda = {:03.2f}$'.format(lambda_values[i]))
plt.ylabel('y')
plt.xlabel('x')
plt.legend()
plt.show()
def calculate_y(x, w):
y = 0
for i in range(w.shape[0]):
y += w[i] * np.power(x, i)
return y
sess = tf.Session()
final_cost, final_cost_rg, w_learned = train(sess, x_train, y_train, lambda_val=0,
learning_rate=0.3, epochs=2000)
sess.close()
plot_test(w_learned, x_test, x_train, y_train)
I have same problem about this. When I do polynomial regression, I also can't overfit the data by using GD in Tensorflow.
Then I compare the coefficients(weights) of the model by using sklearn LinearRegression, I found when the polynomial degree is larger the coefficient of high order is very smaller(i.e. 1e-4), and the low order is relative large(i.e. 0.1).
That's mean when you using GD algorithm for searching the best value of weights, the high order coefficient become extreme sensitive about the value change, and the low order coefficient is not.
And I guess the best coefficient(overfit with data) of low order term is large, and of high order term is tiny. When you set large learning rate, it's impossible to find the right answer, and when you set tiny learning rate, you need lots of iterations.
It's obvious when you using GD algorithm with small data set to make overfit.