LSTM layer returns nan when fed by its own output in PyTorch - deep-learning

I’m trying to generate time-series data with an LSTM and a Mixture Density Network as described in https://arxiv.org/pdf/1308.0850.pdf
Here is a link to my implementation: https://github.com/NeoVand/MDNLSTM
The repository contains a toy dataset to train the network.
During training, the LSTM layer returns nan for its hidden state after one iteration. A similar issue is reported here.
For your convenience, here is the code:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import numpy.random as npr
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
ts = torch.load('LDS_Toy_Data.pt')
def detach(states):
    return [state.detach() for state in states]
class MDNLSTM(nn.Module):
    def __init__(self, d_obs, d_lat=2, n_gaussians=2, n_layers=1):
        super(MDNLSTM, self).__init__()
        self.d_obs = d_obs
        self.d_lat = d_lat
        self.n_gaussians = n_gaussians
        self.n_layers = n_layers
        self.lstm = nn.LSTM(d_obs, d_lat, n_layers, batch_first=True)
        self.fcPi = nn.Linear(d_lat, n_gaussians*d_obs)
        self.fcMu = nn.Linear(d_lat, n_gaussians*d_obs)
        self.fcSigma = nn.Linear(d_lat, n_gaussians*d_obs)

    def get_mixture_coef(self, y):
        time_steps = y.size(1)
        pi, mu, sigma = self.fcPi(y), self.fcMu(y), self.fcSigma(y)
        pi = pi.view(-1, time_steps, self.n_gaussians, self.d_obs)
        mu = mu.view(-1, time_steps, self.n_gaussians, self.d_obs)
        sigma = sigma.view(-1, time_steps, self.n_gaussians, self.d_obs)
        pi = F.softmax(pi, 2)
        sigma = torch.exp(sigma)
        return pi, mu, sigma

    def forward(self, x, h):
        y, (h, c) = self.lstm(x, h)
        # print(h)
        pi, mu, sigma = self.get_mixture_coef(y)
        return (pi, mu, sigma), (h, c)

    def init_hidden(self, bsz):
        return (torch.zeros(self.n_layers, bsz, self.d_lat).to(device),
                torch.zeros(self.n_layers, bsz, self.d_lat).to(device))
def mdn_loss_fn(y, pi, mu, sigma):
    m = torch.distributions.Normal(loc=mu, scale=sigma)
    loss = torch.exp(m.log_prob(y))
    loss = torch.sum(loss * pi, dim=2)
    loss = -torch.log(loss)
    return loss.mean()

def criterion(y, pi, mu, sigma):
    y = y.unsqueeze(2)
    return mdn_loss_fn(y, pi, mu, sigma)
DOBS = 10
DLAT = 2
INSTS = 100
seqlen = 30
epochs = 200
mdnlstm = MDNLSTM(DOBS, DLAT).to(device)
optimizer = torch.optim.Adam(mdnlstm.parameters())
z = torch.from_numpy(ts[:INSTS,:,:]).float().to(device)
# hiddens=[]
# Train the model
for epoch in range(epochs):
    # Set initial hidden and cell states
    hidden = mdnlstm.init_hidden(INSTS)
    for i in range(0, z.size(1) - seqlen, seqlen):
        # Get mini-batch inputs and targets
        inputs = z[:, i:i+seqlen, :]
        targets = z[:, (i+1):(i+1)+seqlen, :]
        hidden = detach(hidden)
        # hiddens.append(hidden)
        (pi, mu, sigma), hidden = mdnlstm(inputs, hidden)
        loss = criterion(targets, pi, mu, sigma)
        mdnlstm.zero_grad()
        loss.backward()
        optimizer.step()
    if epoch % 100 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'
              .format(epoch, epochs, loss.item()))
I would appreciate any help on this.

The issue was caused by the log-sum-exp operation not being done in a numerically stable way. Here is an implementation of a weighted log-sum-exp trick that I used, which fixed the problem:
def weighted_logsumexp(x, w, dim=None, keepdim=False):
    if dim is None:
        x, dim = x.view(-1), 0
    xm, _ = torch.max(x, dim, keepdim=True)
    x = torch.where(
        # to prevent nasty nan's
        (xm == float('inf')) | (xm == float('-inf')),
        xm,
        xm + torch.log(torch.sum(torch.exp(x - xm) * w, dim, keepdim=True)))
    return x if keepdim else x.squeeze(dim)
Using that, I implemented the stable loss function:
def mdn_loss_stable(y, pi, mu, sigma):
    m = torch.distributions.Normal(loc=mu, scale=sigma)
    m_lp_y = m.log_prob(y)
    loss = -weighted_logsumexp(m_lp_y, pi, dim=2)
    return loss.mean()
This worked like a charm. In general, the problem is that torch won't report underflows.
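To see why the naive version blows up, here is a small sketch (made-up values, reusing the weighted_logsumexp defined above) comparing the two computations on very negative log-probabilities:
import torch

# Log-probabilities this negative are common early in MDN training.
log_probs = torch.tensor([-1000.0, -1001.0])
weights = torch.tensor([0.5, 0.5])

# Naive computation: exp() underflows to 0, log(0) = -inf, and the backward pass produces nan.
naive = torch.log(torch.sum(torch.exp(log_probs) * weights))
print(naive)  # tensor(-inf)

# Stable computation of the same quantity; equivalent to
# torch.logsumexp(log_probs + torch.log(weights), dim=0).
stable = weighted_logsumexp(log_probs, weights, dim=0)
print(stable)  # approximately tensor(-1000.3799)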

Related

issue with arcface (0 accuracy)

Hello guys, I've joined a university-level image recognition competition.
In the test, they will give two images (people's faces) and my model needs to detect whether the pair of images shows the same person or not.
My model is a ResNet-18 with IR blocks and SE blocks, and it uses ArcFace loss.
I can use only the MS1M dataset, which has a total of 86,876 classes.
The problem is that the loss is getting better, but the accuracy is 0 and not changing.
Here's the part of the code I'm working on.
Train
def train_model(model, net, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        for phase in ['train']:
            if phase == 'train':
                model.train()  # Set model to training mode
            running_loss = 0.0
            running_corrects = 0
            # Iterate over data.
            for inputs, labels in notebook.tqdm(dataloader):
                inputs = inputs.to(device)
                labels = labels.to(device).long()
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    features = model(inputs)
                    outputs = net(features, labels)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                scheduler.step()
            epoch_loss = running_loss / len(dataloader)
            epoch_acc = running_corrects.double() / len(dataloader)
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
            # deep copy the model
            if phase == 'train' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
        torch.save({'epoch': epoch,
                    'mode_state_dict': model.state_dict(),
                    'fc_state_dict': net.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict(),  # HERE IS THE CHANGE
                    }, f'/content/drive/MyDrive/inha_data/training_saver/training_stat{epoch}.pth')
        print(f'finished {epoch} and saved model_save_{epoch}.pt')
        print()
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best train Acc: {:4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)
    torch.save(model.state_dict(), 'model_save.pt')
    return model
Parameters
train_dataset = MS1MDataset('train')
dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True,num_workers=4)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # set the device
num_classes = 86876
# normal classifier
# net = nn.Sequential(nn.Linear(512, num_classes))
# Feature extractor backbone, input is 112x112 image output is 512 feature vector
model_ft = resnet18(True)
#set metric
metric_fc = metrics.ArcMarginProduct(512, num_classes, s = 30.0, m = 0.50, easy_margin = False)
metric_fc.to(device)
# net = net.to(device)
model_ft = model_ft.to(device)
criterion = nn.CrossEntropyLoss()
# Observe that all parameters are being optimized
optimizer_ft = torch.optim.Adam([{'params': model_ft.parameters()}, {'params': metric_fc.parameters()}],
                                lr=0.1)
# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=4, gamma=0.1)
Arcface
from __future__ import print_function
from __future__ import division
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
import math
class ArcMarginProduct(nn.Module):
    r"""Implement of large margin arc distance: :
        Args:
            in_features: size of each input sample
            out_features: size of each output sample
            s: norm of input feature
            m: margin
            cos(theta + m)
        """
    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # one_hot = torch.zeros(cosine.size(), requires_grad=True, device='cuda')
        one_hot = torch.zeros(cosine.size(), device='cuda')
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        # -------------torch.where(out_i = {x_i if condition_i else y_i) -------------
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)  # you can use torch.where if your torch.__version__ is 0.4
        output *= self.s
        # print(output)
        return output
dataset
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.125, contrast=0.125, saturation=0.125),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

#train_ms1_data = torchvision.datasets.ImageFolder('/content/drive/MyDrive/inha_data/train', transform = data_transforms)
class MS1MDataset(Dataset):
    def __init__(self, split):
        self.file_list = '/content/drive/MyDrive/inha_data/ID_List.txt'
        self.images = []
        self.labels = []
        self.transformer = data_transforms['train']
        with open(self.file_list) as f:
            files = f.read().splitlines()
        for i, fi in enumerate(files):
            fi = fi.split()
            image = "/content/" + fi[1]
            label = int(fi[0])
            self.images.append(image)
            self.labels.append(label)

    def __getitem__(self, index):
        img = Image.open(self.images[index])
        img = self.transformer(img)
        label = self.labels[index]
        return img, label

    def __len__(self):
        return len(self.images)
You can try using a smaller m in ArcFace, even a negative value.
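For example, a minimal sketch of that change, reusing the setup from the question (the exact margin values here are just illustrative, not a recipe):
# Same setup as in the question, but with a smaller margin than m = 0.50.
metric_fc = metrics.ArcMarginProduct(512, num_classes, s=30.0, m=0.1, easy_margin=False)
metric_fc.to(device)

# A negative margin makes the target logit easier instead of harder, which can help
# training get moving; the margin can be raised again once the accuracy starts improving.
# metric_fc = metrics.ArcMarginProduct(512, num_classes, s=30.0, m=-0.1, easy_margin=False)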

How to output the accuracy alongside with the loss when training the MNIST dataset after each epoch

from __future__ import print_function
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
from tensorflow.examples.tutorials.mnist import input_data
import torch.optim as optim
import tensorflow.python.util.deprecation as deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False
import matplotlib.pyplot as plt
%matplotlib inline
from plot import plot_loss_and_acc
mnist = input_data.read_data_sets("MNIST_data", one_hot=False)
batch_size = 250
epoch_num = 10
lr = 0.0001
disp_freq = 20
def next_batch(train=True):
    # Reads the next batch of MNIST images and labels and returns them
    if train:
        batch_img, batch_label = mnist.train.next_batch(batch_size)
    else:
        batch_img, batch_label = mnist.test.next_batch(batch_size)
    batch_label = torch.from_numpy(batch_label).long()  # convert the numpy array into torch tensor
    batch_label = Variable(batch_label)                 # create a torch variable
    batch_img = torch.from_numpy(batch_img).float()     # convert the numpy array into torch tensor
    batch_img = Variable(batch_img)                     # create a torch variable
    return batch_img, batch_label
class MLP(nn.Module):
    def __init__(self, n_features, n_classes):
        super(MLP, self).__init__()
        self.layer1 = nn.Linear(n_features, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_classes)

    def forward(self, x, training=True):
        # a neural network with 2 hidden layers
        # x -> FC -> relu -> dropout -> FC -> relu -> dropout -> FC -> output
        x = F.relu(self.layer1(x))
        x = F.dropout(x, 0.5, training=training)
        x = F.relu(self.layer2(x))
        x = F.dropout(x, 0.5, training=training)
        x = self.layer3(x)
        return x

    def predict(self, x):
        # a function to predict the labels of a batch of inputs
        x = F.softmax(self.forward(x, training=False))
        return x

    def accuracy(self, x, y):
        # a function to calculate the accuracy of label prediction for a batch of inputs
        # x: a batch of inputs
        # y: the true labels associated with x
        prediction = self.predict(x)
        maxs, indices = torch.max(prediction, 1)
        acc = 100 * torch.sum(torch.eq(indices.float(), y.float()).float())/y.size()[0]
        print(acc.data)
        return acc.data
# define the neural network (multilayer perceptron)
net = MLP(784, 10)
# calculate the number of batches per epoch
batch_per_ep = mnist.train.num_examples // batch_size
# define the loss (criterion) and create an optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=lr)
print(' ')
print("__________Training__________________")
xArray = []
yLoss = []
yAcc = []
for ep in range(epoch_num):  # epochs loop
    for batch_n in range(batch_per_ep):  # batches loop
        features, labels = next_batch()
        # Reset gradients
        optimizer.zero_grad()
        # Forward pass
        output = net(features)
        loss = criterion(output, labels)
        # Backward pass and updates
        loss.backward()   # calculate the gradients (backpropagation)
        optimizer.step()  # update the weights
        if batch_n % disp_freq == 0:
            print('epoch: {} - batch: {}/{} '.format(ep, batch_n, batch_per_ep))
            xArray.append(ep)
            yLoss.append(loss.data)
            #yAcc.append(acc.data)
            print('loss: ', loss.data)
            print('__________________________________')
# test the accuracy on a batch of test data
features, labels = next_batch(train=False)
print("Result")
print('Test accuracy: ', net.accuracy(features, labels))
print('loss: ', loss.data)
accuracy = net.accuracy(features, labels)
#Loss Plot
# plotting the points
plt.plot(xArray, yLoss)
# naming the x axis
plt.xlabel('epoch')
# naming the y axis
plt.ylabel('loss')
# giving a title to my graph
plt.title('Loss Plot')
# function to show the plot
plt.show()
#Accuracy Plot
# plotting the points
plt.plot(xArray, yAcc)
# naming the x axis
plt.xlabel('epoch')
# naming the y axis
plt.ylabel(' accuracy')
# giving a title to my graph
plt.title('Accuracy Plot ')
# function to show the plot
plt.show()
I want to display the accuracy on my training dataset. I have managed to display and plot the loss, but I haven't managed to do it for the accuracy. I know I am only missing one or two lines of code, but I don't know how to write them.
I mean, if I can display the accuracy alongside each epoch like the loss, I can do the plotting myself.
Hi, replace this line:
print('epoch: {} - batch: {}/{} '.format(ep, batch_n, batch_per_ep))
with
print('epoch: {} - batch: {}/{} - accuracy: {}'.format(ep, batch_n, batch_per_ep, net.accuracy(features, labels)))
Hope this helps.
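If you also want to plot the accuracy, here is a minimal sketch of the training loop (reusing the variable names from the question, and assuming PyTorch >= 0.4 so that .item() is available) that records one loss and one accuracy value per epoch:
for ep in range(epoch_num):
    for batch_n in range(batch_per_ep):
        features, labels = next_batch()
        optimizer.zero_grad()
        output = net(features)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        if batch_n % disp_freq == 0:
            acc = float(net.accuracy(features, labels))
            print('epoch: {} - batch: {}/{} - loss: {:.4f} - accuracy: {:.2f}'
                  .format(ep, batch_n, batch_per_ep, loss.item(), acc))
    # one point per epoch, so the loss and accuracy plots share the same x axis
    xArray.append(ep)
    yLoss.append(loss.item())
    yAcc.append(float(net.accuracy(features, labels)))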

Pytorch Fully Connected Feed Forward Network for a regression problem - Gives same result for all inputs

I built a neural network model in PyTorch for a simple regression problem (w1x1 + w2x2 + w3x3 = y), where I generated 2000 records of training data with random values for x1, x2, x3 and W1 = 4, W2 = 6, W3 = 2. I created a test dataset of 20 records with just values for x1, x2, x3, and I was hoping to get the predicted y values. But the model returns the same value for all 20 input rows. I don't know where the issue is. Below is the code snippet.
inputs = df[['x1', 'x2', 'x3']]
target = df['y']
inputs = torch.tensor(inputs.values).float()
target = torch.tensor(target.values).float()
test_data = torch.tensor(test_data.values).float()
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        hidden1 = 10
        hidden2 = 15
        self.fc1 = nn.Linear(3, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
#instantiate the model
model = Net()
print(model)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(),lr=0.01)
model.train()
#epochs
epochs = 500
for x in range(epochs):
    #initialize the training loss to 0
    train_loss = 0
    #clear out gradients
    optimizer.zero_grad()
    #calculate the output
    output = model(inputs)
    #calculate loss
    loss = criterion(output, target)
    #backpropagate
    loss.backward()
    #update parameters
    optimizer.step()
    if ((x%5)==0):
        print('Training Loss after epoch {:2d} is {:2.6f}'.format(x, loss))
#set the model in evaluation mode
model.eval()
#Test the model on unseen data
test_output = model(test_data)
print(test_output)
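For reference, here is a minimal sketch (mine, not from the original post) of the data-generation step described above; the value range drawn from is an assumption:
import numpy as np
import pandas as pd

# 2000 training rows with random x1, x2, x3 and the target y = 4*x1 + 6*x2 + 2*x3
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.uniform(0, 100, size=(2000, 3)), columns=['x1', 'x2', 'x3'])
df['y'] = 4*df['x1'] + 6*df['x2'] + 2*df['x3']

# 20 test rows with only the inputs, as described in the question
test_data = pd.DataFrame(rng.uniform(0, 100, size=(20, 3)), columns=['x1', 'x2', 'x3'])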

MXNET CNN+LSTM save/serialize to json

I'm having a hard time figuring out how to correctly define an MXNet net so that I can serialize/convert this model to a json file.
The pipeline is composed of a CNN + biLSTM + CTC.
I know I must use HybridBlock and hybridize(), but I can't seem to make it work, or tell whether it is even possible or whether there is another way around it.
I'm sure it's a lack of knowledge on my part and I wonder if anyone can help.
Here is the net definition in Python:
NUM_HIDDEN = 200
NUM_CLASSES = 13550
NUM_LSTM_LAYER = 1
p_dropout = 0.5
SEQ_LEN = 32
def get_featurizer():
    featurizer = gluon.nn.HybridSequential()
    # conv layer
    featurizer.add(gluon.nn.Conv2D(kernel_size=(3,3), padding=(1,1), channels=32, activation="relu"))
    featurizer.add(gluon.nn.BatchNorm())
    ....
    featurizer.hybridize()
    return featurizer

class EncoderLayer(gluon.Block):
    def __init__(self, **kwargs):
        super(EncoderLayer, self).__init__(**kwargs)
        with self.name_scope():
            self.lstm = mx.gluon.rnn.LSTM(NUM_HIDDEN, NUM_LSTM_LAYER, bidirectional=True)

    def forward(self, x):
        x = x.transpose((0,3,1,2))
        x = x.flatten()
        x = x.split(num_outputs=SEQ_LEN, axis=1)  # (SEQ_LEN, N, CHANNELS)
        x = nd.concat(*[elem.expand_dims(axis=0) for elem in x], dim=0)
        x = self.lstm(x)
        x = x.transpose((1, 0, 2))  # (N, SEQ_LEN, HIDDEN_UNITS)
        return x

def get_encoder():
    encoder = gluon.nn.Sequential()
    encoder.add(EncoderLayer())
    encoder.add(gluon.nn.Dropout(p_dropout))
    return encoder

def get_decoder():
    decoder = mx.gluon.nn.Dense(units=ALPHABET_SIZE, flatten=False)
    decoder.hybridize()
    return decoder

def get_net():
    net = gluon.nn.Sequential()
    with net.name_scope():
        net.add(get_featurizer())
        net.add(get_encoder())
        net.add(get_decoder())
    return net
Any help would be highly appreciated.
Thank you very much.
There are a few requirements for a model in Gluon to be exportable to json:
It needs to be hybridizable, meaning that each child block should be hybridizable as well and the model works in both modes.
All parameters should be initialized. Since Gluon uses deferred parameter initialization, this means you should do a forward pass at least once before you can save the model.
I made some fixes to your code, introducing new constants where I needed them. The most significant changes are:
Don't use split if you can avoid it, because it returns a list of NDArrays. Use reshape, which works seamlessly with Symbol as well.
Starting from version 1.3.0 of MXNet, LSTM is also hybridizable, so you can wrap it in a HybridBlock instead of just a Block.
Use HybridSequential.
Here is the adjusted code, with an example at the bottom of how to save the model and how to load it back. You can find more information in this tutorial.
import mxnet as mx
from mxnet import gluon
from mxnet import nd
BATCH_SIZE = 1
CHANNELS = 100
ALPHABET_SIZE = 1000
NUM_HIDDEN = 200
NUM_CLASSES = 13550
NUM_LSTM_LAYER = 1
p_dropout = 0.5
SEQ_LEN = 32
HEIGHT = 100
WIDTH = 100
def get_featurizer():
    featurizer = gluon.nn.HybridSequential()
    featurizer.add(
        gluon.nn.Conv2D(kernel_size=(3, 3), padding=(1, 1), channels=32, activation="relu"))
    featurizer.add(gluon.nn.BatchNorm())
    return featurizer

class EncoderLayer(gluon.HybridBlock):
    def __init__(self, **kwargs):
        super(EncoderLayer, self).__init__(**kwargs)
        with self.name_scope():
            self.lstm = mx.gluon.rnn.LSTM(NUM_HIDDEN, NUM_LSTM_LAYER, bidirectional=True)

    def hybrid_forward(self, F, x):
        x = x.transpose((0, 3, 1, 2))
        x = x.flatten()
        x = x.reshape(shape=(SEQ_LEN, -1, CHANNELS))  # was x.split(num_outputs=SEQ_LEN, axis=1); (SEQ_LEN, N, CHANNELS)
        x = self.lstm(x)
        x = x.transpose((1, 0, 2))  # (N, SEQ_LEN, HIDDEN_UNITS)
        return x

def get_encoder():
    encoder = gluon.nn.HybridSequential()
    encoder.add(EncoderLayer())
    encoder.add(gluon.nn.Dropout(p_dropout))
    return encoder

def get_decoder():
    decoder = mx.gluon.nn.Dense(units=ALPHABET_SIZE, flatten=False)
    return decoder

def get_net():
    net = gluon.nn.HybridSequential()
    with net.name_scope():
        net.add(get_featurizer())
        net.add(get_encoder())
        net.add(get_decoder())
    return net

if __name__ == '__main__':
    net = get_net()
    net.initialize()
    net.hybridize()

    fake_data = mx.random.uniform(shape=(BATCH_SIZE, HEIGHT, WIDTH, CHANNELS))
    out = net(fake_data)

    net.export("mymodel")
    deserialized_net = gluon.nn.SymbolBlock.imports("mymodel-symbol.json", ['data'],
                                                    "mymodel-0000.params", ctx=mx.cpu())
    out2 = deserialized_net(fake_data)
    # just to check that we get the same results
    assert (out - out2).sum().asscalar() == 0

Getting polynomial regression to overfit with TensorFlow

The Sklearn documentation contains an example of a polynomial regression which beautifully illustrates the idea of overfitting (link).
The third plot shows a 15th order polynomial that overfits the simulated data. I replicated this model in TensorFlow, but I cannot get it to overfit.
Even when tuning the learning rate and the numbers of learning epochs, I cannot get the model to overfit. What am I missing?
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
def true_fun(X):
return np.cos(1.5 * np.pi * X)
# Generate dataset
n_samples = 30
np.random.seed(0)
x_train = np.sort(np.random.rand(n_samples)) # Draw from uniform distribution
y_train = true_fun(x_train) + np.random.randn(n_samples) * 0.1
x_test = np.linspace(0, 1, 100)
y_true = true_fun(x_test)
# Helper function
def run_dir(base_dir, dirname='run'):
    "Number log directories incrementally"
    import os
    import re
    pattern = re.compile(dirname+'_(\d+)')
    try:
        previous_runs = os.listdir(base_dir)
    except FileNotFoundError:
        previous_runs = []
    run_number = 0
    for name in previous_runs:
        match = pattern.search(name)
        if match:
            number = int(match.group(1))
            if number > run_number:
                run_number = number
    run_number += 1
    logdir = os.path.join(base_dir, dirname + '_%02d' % run_number)
    return(logdir)
# Define the polynomial model
def model(X, w):
    """Polynomial model
    param X: data
    param w: coefficients of the polynomial regression
    returns: Polynomial function Y(X, w)
    """
    terms = []
    for i in range(int(w.shape[0])):
        term = tf.multiply(w[i], tf.pow(X, i))
        terms.append(term)
    return(tf.add_n(terms))
# Create the computation graph
order = 15
tf.reset_default_graph()
X = tf.placeholder("float")
Y = tf.placeholder("float")
w = tf.Variable([0.]*order, name="parameters")
lambda_reg = tf.placeholder('float', shape=[])
learning_rate_ph = tf.placeholder('float', shape=[])
y_model = model(X, w)
loss = tf.div(tf.reduce_mean(tf.square(Y-y_model)), 2) # Square error
loss_rg = tf.multiply(lambda_reg, tf.reduce_sum(tf.square(w))) # L2 penalty
loss_total = tf.add(loss, loss_rg)
loss_hist1 = tf.summary.scalar('loss', loss)
loss_hist2 = tf.summary.scalar('loss_rg', loss_rg)
loss_hist3 = tf.summary.scalar('loss_total', loss_total)
summary = tf.summary.merge([loss_hist1, loss_hist2, loss_hist3])
train_op = tf.train.GradientDescentOptimizer(learning_rate_ph).minimize(loss_total)
init = tf.global_variables_initializer()
def train(sess, x_train, y_train, lambda_val=0, epochs=2000, learning_rate=0.01):
    feed_dict = {X: x_train, Y: y_train, lambda_reg: lambda_val, learning_rate_ph: learning_rate}
    logdir = run_dir("logs/polynomial_regression2/")
    writer = tf.summary.FileWriter(logdir)
    sess.run(init)
    for epoch in range(epochs):
        _, summary_str = sess.run([train_op, summary], feed_dict=feed_dict)
        writer.add_summary(summary_str, global_step=epoch)
    final_cost, final_cost_rg, w_learned = sess.run([loss, loss_rg, w], feed_dict=feed_dict)
    return final_cost, final_cost_rg, w_learned

def plot_test(w_learned, x_test, x_train, y_train):
    y_learned = calculate_y(x_test, w_learned)
    plt.scatter(x_train, y_train)
    plt.plot(x_test, y_true, label="true function")
    plt.plot(x_test, y_learned, 'r', label="learned function")
    #plt.title('$\lambda = {:03.2f}$'.format(lambda_values[i]))
    plt.ylabel('y')
    plt.xlabel('x')
    plt.legend()
    plt.show()

def calculate_y(x, w):
    y = 0
    for i in range(w.shape[0]):
        y += w[i] * np.power(x, i)
    return y
sess = tf.Session()
final_cost, final_cost_rg, w_learned = train(sess, x_train, y_train, lambda_val=0,
                                             learning_rate=0.3, epochs=2000)
sess.close()
plot_test(w_learned, x_test, x_train, y_train)
I have the same problem: when I do polynomial regression, I also can't overfit the data using GD in TensorFlow.
Then I compared the coefficients (weights) of the model with those from sklearn's LinearRegression, and I found that when the polynomial degree is larger, the high-order coefficients are very small (e.g. 1e-4) while the low-order ones are relatively large (e.g. 0.1).
That means that when you use the GD algorithm to search for the best values of the weights, the high-order coefficients become extremely sensitive to value changes, while the low-order coefficients do not.
And I guess the best (overfitting) coefficients of the low-order terms are large, while those of the high-order terms are tiny. If you set a large learning rate, it's impossible to find the right answer, and if you set a tiny learning rate, you need lots of iterations.
This is especially apparent when you use the GD algorithm on a small dataset to try to overfit.
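As a minimal sketch of that comparison (my own illustration, reusing the question's data generation), you can fit the 15th-order polynomial with sklearn's closed-form least-squares solver and inspect how the coefficient magnitudes spread across the orders:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

def true_fun(X):
    return np.cos(1.5 * np.pi * X)

# Same toy data as in the question
np.random.seed(0)
x_train = np.sort(np.random.rand(30))
y_train = true_fun(x_train) + np.random.randn(30) * 0.1

# Design matrix [x, x^2, ..., x^15], solved by ordinary least squares
poly = PolynomialFeatures(degree=15, include_bias=False)
X_poly = poly.fit_transform(x_train.reshape(-1, 1))
reg = LinearRegression().fit(X_poly, y_train)

# The wide spread of magnitudes across orders is what makes a single
# gradient-descent learning rate a poor fit for all coefficients at once.
for order, coef in enumerate(reg.coef_, start=1):
    print("order {:2d}: {: .3e}".format(order, coef))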