Cannot properly load PyTorch Lightning model from checkpoint - deep-learning

I have trained a Pytorch lightning model of the following class:
class LSTMClassifier(pl.LightningModule):
def __init__(self, n_features, hidden_size, batch_size, num_layers, dropout, learning_rate):
super(LSTMClassifier, self).__init__()
self.save_hyperparameters()
# Params
self.n_features = n_features
self.hidden_size = hidden_size
self.batch_size = batch_size
self.num_layers = num_layers
self.dropout = dropout
self.learning_rate = learning_rate
# Architecture Baseline
self.lstm = nn.LSTM(input_size=n_features,
hidden_size=hidden_size,
num_layers=num_layers,
dropout=dropout,
batch_first=True)
self.relu = nn.ReLU()
self.fc = nn.Linear(hidden_size, 2)
self.sigmoid = nn.Sigmoid()
It gives me a test set accuracy of 0.76 when I call the trainer.test() function directly after training:
# Init PyTorch model
model = LSTMClassifier(
n_features=p['n_features'],
hidden_size=p['hidden_size'],
batch_size=p['batch_size'],
num_layers=p['num_layers'],
dropout=p['dropout'],
learning_rate=p['learning_rate']
)
model_checkpoint = ModelCheckpoint(
filename='[PATH.ckpt]'
)
# Trainer GPU
trainer = Trainer(max_epochs=p['max_epochs'], callbacks=[model_checkpoint], gpus=int(GPU))
trainer.fit(model, dm)
trainer.test(model, test_dataloaders=dm.test_dataloader())
However, when I load the checkpoint at a later time with the exact same dataloader, it gives me an accuracy of 0.48:
model_checkpoint = ModelCheckpoint(
filename='LSTM-batch-{batch_size}-epoch-{max_epochs}-hidden-{hidden_size}-layers-{'
'num_layers}-dropout-{dropout}-lr-{learning_rate}'.format(**p)
)
# Trainer GPU
trainer = Trainer(max_epochs=p['max_epochs'], callbacks=[model_checkpoint], gpus=int(GPU))
model = LSTMClassifier.load_from_checkpoint([PATH TO CHECKPOINT])
model.eval()
trainer.test(model, test_dataloaders=dm.test_dataloader())
I suspect the model does not load correctly, but I cannot figure out what to do differently. Any ideas?
Using PyTorch Lightning 1.4.4

Turns out, that trainer.test(model, test_dataloaders=dm.test_dataloader()) was the issue. Once I replaced it with trainer.test(model, datamodule=dm) as per the updated documentation, it works.

Related

LSTM for time series forecasting

I have created a LSTM model in Pytorch which looks like this:
LSTMNet
Now I want to build another LSTM model (NewLSTMNet) on top of it (LSTMNet) by freezing the fc1 layer. I used:
model.fc1.weight.requires_grad = False
model.fc1.bias.requires_grad = False
and then I changed fc2 layer with a linear layer with input features = 40 and output features = 40.
So far I did:
class NewLSTMNet(nn.Module):
def __init__(self, model, input_size, hidden_size, num_layers):
super(NewLSTMNet, self).__init__()
self.model = model
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.model.fc2 = nn.Linear(40, 40)
# self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
self.fc3 = nn.Sequential(
# nn.ReLU(),
nn.Linear (40 , 128),
nn.ReLU(),
nn.Linear(128, 40),
nn.ReLU(),
nn.Linear(40,1),
nn.ReLU(),
)
def forward(self,x):
# input = self.model(x)
# h0 = Variable(torch.zeros(self.num_layers, input.size(0), self.hidden_size))
# c0 = Variable(torch.zeros(self.num_layers, input.size(0), self.hidden_size))
# _, (h_out,_) = self.lstm(input, (h0,c0))
# h_out = h_out.view(-1, self.hidden_size)
# print(h_out.shape)
# out = self.fc3(out)
out = self.model(x)
out = self.fc3(out)
return out
Now my new LSTM model looks like this:
NewLSTMNet
My training loop looks like this:
for epoch in range(EPOCHS):
model.train()
output = model(X_train)
train_loss = criterion(output, y_train)
optimizer.zero_grad()
train_loss.backward()
optimizer.step()
with torch.no_grad():
model.eval()
output_val = model(X_valid)
valid_loss = criterion(output_val, y_valid)
if valid_loss <= valid_loss_min:
torch.save(model.state_dict(), './state_dict_new.pt')
print(
f'Epoch {epoch + 0:01}: Validation loss decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model ...')
valid_loss_min = valid_loss
early_stopping_counter = 0 # reset counter if validation loss decreases
else:
print(f'Epoch {epoch + 0:01}: Validation loss did not decrease')
early_stopping_counter += 1
if early_stopping_counter > early_stopping_patience:
print('Early stopped at epoch :', epoch)
break
print(f'\t Train_Loss: {train_loss:.4f} Val_Loss: {valid_loss:.4f} BEST VAL Loss: {valid_loss_min:.4f}\n')
Now the model is working fine. But I want to create a LSTM layer in the NewLSTMNet model. I already tried to add a LSTM layer but I was expecting a vector in the output but I am getting a matrix in the output of the prediction. So there is shape mismatch!
How should I modify my code? Any help is appreciated. Thanks in advance!

Resolve overfitting in a convolutional network

I've written a snippet to classify Omniglot images. I calculate the training and validation losses in each epoch, where the latter is computed using images that were not seen by the network before. The two plots are as below:
Since the training loss decreases while the validation loss increases, I have concluded that my model overfits. I've tried several suggestions (e.g. here) to overcome this, including:
Increasing the size of the training set.
shuffling the data.
Adding dropout layers (up to p=0.9).
Using smaller model.
Altering the architecture.
Changing the learning rate.
Reducing the batch size.
Adding weight decay.
However, the validation loss still increases. I wonder if there are any other suggestions to improve this behavior or if this is not overfitting, but the problem is something else. Below is the snippet used in this question.
import torch
import torchvision
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
class MyModel(nn.Module):
def __init__(self):
super(MyModel, self).__init__()
dim_out = 964
# -- embedding params
self.cn1 = nn.Conv2d(1, 16, 7)
self.cn2 = nn.Conv2d(16, 32, 4)
self.cn3 = nn.Conv2d(32, 64, 3)
self.pool = nn.MaxPool2d(2)
self.bn1 = nn.BatchNorm2d(16)
self.bn2 = nn.BatchNorm2d(32)
self.bn3 = nn.BatchNorm2d(64)
# -- prediction params
self.fc1 = nn.Linear(256, 170)
self.fc2 = nn.Linear(170, 50)
self.fc3 = nn.Linear(50, dim_out)
# -- non-linearity
self.relu = nn.ReLU()
self.Beta = 10
self.sopl = nn.Softplus(beta=self.Beta)
def forward(self, x):
y1 = self.pool(self.bn1(self.relu(self.cn1(x))))
y2 = self.pool(self.bn2(self.relu(self.cn2(y1))))
y3 = self.relu(self.bn3(self.cn3(y2)))
y3 = y3.view(y3.size(0), -1)
y5 = self.sopl(self.fc1(y3))
y6 = self.sopl(self.fc2(y5))
return self.fc3(y6)
class Train:
def __init__(self):
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# -- data
dim = 28
batch_size = 400
my_transforms = transforms.Compose([transforms.Resize((dim, dim)), transforms.ToTensor()])
trainset = torchvision.datasets.Omniglot(root="./data/omniglot_train/", download=False, transform=my_transforms)
validset = torchvision.datasets.Omniglot(root="./data/omniglot_train/", background=False, download=False,
transform=my_transforms)
self.TrainDataset = DataLoader(dataset=trainset, batch_size=batch_size, shuffle=True)
self.ValidDataset = DataLoader(dataset=validset, batch_size=len(validset), shuffle=False)
self.N_train = len(trainset)
self.N_valid = len(validset)
# -- model
self.model = MyModel().to(self.device)
# -- train
self.epochs = 3000
self.loss = nn.CrossEntropyLoss()
self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
def train_epoch(self):
self.model.train()
train_loss = 0
for batch_idx, data_batch in enumerate(self.TrainDataset):
# -- predict
predict = self.model(data_batch[0].to(self.device))
# -- loss
loss = self.loss(predict, data_batch[1].to(self.device))
# -- optimize
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
train_loss += loss.item()
return train_loss/(batch_idx+1)
def valid_epoch(self):
with torch.no_grad():
self.model.eval()
for data_batch in self.ValidDataset:
# -- predict
predict = self.model(data_batch[0].to(self.device))
# -- loss
loss = self.loss(predict, data_batch[1].to(self.device))
return loss.item()
def __call__(self):
for epoch in range(self.epochs):
train_loss = self.train_epoch()
valid_loss = self.valid_epoch()
print('Epoch {}: Training loss = {:.5f}, Validation loss = {:.5f}.'.format(epoch, train_loss, valid_loss))
torch.save(self.model.state_dict(), './model_stat.pth')
if __name__ == '__main__':
my_train = Train()
my_train()
If your train accuracy is good but testing (data not used in training) accuracy is bad then you have an overfitting problem. I had the same problem with a CNN model. You can use two methods to overcome overfitting. First is early stopping for your train and second is regularization. Check the below example:
# L2 regularizers for layers
model = keras.Sequential([
keras.layers.InputLayer(input_shape=(32, 32)),
keras.layers.Reshape(target_shape=(32, 32, 1)),
keras.layers.Conv2D(filters=12, kernel_size=(3, 3), activation=tf.nn.relu, use_bias=True , kernel_regularizer =tf.keras.regularizers.l2( l=0.01)),
keras.layers.MaxPooling2D(pool_size=(2, 2)),
keras.layers.Flatten(),
keras.layers.Dense(10, activation = 'softmax', use_bias=True)
])
model.compile(optimizer='adam',
loss=tf.keras.losses.SparseCategoricalCrossentropy(),
metrics=['accuracy'])
#Early Stopping
history = model.fit(X_train, Y_train,
validation_data=(X_dev, Y_dev),
epochs=4000,
callbacks=EarlyStopping(monitor='val_loss'))
Do not forget to import for early stopping.
from tensorflow.keras.callbacks import EarlyStopping

Pytorch:Network not learning at all + Weights are too low

About the input. Sorry for the bad formatting. The for each two rows first row is the key and second row is the value. 18~20_ride is the label and is not included in the input. Below is one input. And train set consists of 400000 of these.
bus_route_id station_code latitude longitude 6~7_ride
0 4270000 344 33.48990 126.49373
7~8_ride 8~9_ride 9~10_ride 10~11_ride 11~12_ride 6~7_takeoff
0.0 1.0 2.0 5.0 2.0 6.0
7~8_takeoff 8~9_takeoff 9~10_takeoff 10~11_takeoff 11~12_takeoff
0.0 0.0 0.0 0.0 0.0
18~20_ride weekday dis_jejusi dis_seoquipo
0.0 6 2.954920 26.256744
Example weights: Captured at 4th epoch. After 20 epochs of training I got much smaller values (ex. -7e-44 or 1e-55)
2.3937e-11, -2.6920e-12, -1.0445e-11, ..., -1.0754e-11, 1.1128e-11, -1.4814e-11
The model's prediction and target
#Target
[2.],
[0.],
[0.]
#Prediction
[1.4187],
[1.4187],
[1.4187]
MyDataset.py
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import torch
import os
class MyDataset(Dataset):
def __init__(self, csv_filename):
self.dataset = pd.read_csv(csv_filename, index_col=0)
self.labels = self.dataset.pop("18~20_ride")
self.dataset = self.dataset.values
self.labels = np.reshape(self.labels.values,(-1,1))
def __len__(self):
return len(self.dataset)
def __getitem__(self, idx):
return self.dataset[idx], self.labels[idx]
Model
class Network(nn.Module):
def __init__(self, input_num):
super(Network, self).__init__()
self.fc1 = nn.Sequential(
nn.Linear(input_num, 64),
nn.BatchNorm1d(64),
GELU()
)
self.fc2 = nn.Sequential(
nn.Linear(64, 64),
nn.BatchNorm1d(64),
GELU()
)
self.fc3 = nn.Sequential(
nn.Linear(64, 64),
nn.BatchNorm1d(64),
GELU()
)
self.fc4 = nn.Sequential(
nn.Linear(64, 64),
nn.BatchNorm1d(64),
GELU()
)
self.fc5 = nn.Sequential(
nn.Linear(64, 64),
nn.BatchNorm1d(64),
GELU()
)
self.fc6 = nn.Sequential(
nn.Linear(64, 64),
nn.BatchNorm1d(64),
GELU)
)
self.fc7 = nn.Sequential(
nn.Linear(64, 64),
nn.BatchNorm1d(64),
GELU()
)
self.fc8 = nn.Sequential(
nn.Linear(64, 64),
nn.BatchNorm1d(64),
GELU())
)
self.fc9 = nn.Linear(64, 1)
The training and validation
def train(model, device, train_loader, optimizer, loss_fn, log_interval, epoch):
print("Training")
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.float().to(device), target.float().to(device)
optimizer.zero_grad()
output = model(data)
loss = loss_fn(output, target)
loss.backward()
optimizer.step()
if batch_idx % log_interval == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, (batch_idx+1) * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
def validate(model, device, loader, loss_fn):
print("\nValidating")
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for batch_idx, (data, target) in enumerate(loader):
data, target = data.float().to(device), target.float().to(device)
output = model(data)
test_loss += loss_fn(output, target).item() # sum up batch loss
test_loss /= len(loader)
print('Validation average loss: {:.4f}\n'.format(
test_loss))
return test_loss
Entire process of training and validation
from MyDataset import MyDataset
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR
from datetime import datetime
train_dataset_path = "/content/drive/My Drive/root/bus/dataset/train.csv"
val_dataset_path = "/content/drive/My Drive/root/bus/dataset/val.csv"
model_base_path = "/content/drive/My Drive/root/bus/models/"
model_file = "/content/drive/My Drive/root/bus/models/checkpoints/1574427776.202017.pt"
"""
Training Config
"""
epochs = 10
batch_size = 32
learning_rate = 0.5
check_interval = 4
log_interval = int(40000/batch_size)
gamma = 0.1
load_model = False
save_model = True
make_checkpoint = True
"""
End of config
"""
# Read test set
train_set = MyDataset(train_dataset_path)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_set = MyDataset(val_dataset_path)
val_loader = DataLoader(val_set, batch_size=1)
print("Data READY")
device = torch.device("cuda")
net = Network(19).float().to(device)
if load_model:
net.load_state_dict(torch.load(model_file))
loss_fn = torch.nn.MSELoss()
optimizer = optim.AdamW(net.parameters(), lr=learning_rate)
best_loss = float('inf')
isAbort = False
for epoch in range(1, epochs+1):
train(net, device, train_loader, optimizer, loss_fn, log_interval, epoch)
val_loss = validate(net, device, val_loader, loss_fn)
if epoch%check_interval==0:
if make_checkpoint:
print("Saving new checkpoint")
torch.save(net.state_dict(), model_base_path+"checkpoints/"+str(datetime.today().timestamp())+".pt")
"""
if val_loss < best_loss and epoch%check_interval==0:
best_loss = val_loss
if make_checkpoint:
print("Saving new checkpoint")
torch.save(net.state_dict(), model_base_path+"checkpoints/"+str(datetime.today().timestamp())+".pt")
else:
print("Model overfit detected. Aborting training")
isAbort = True
break
"""
if save_model and not isAbort:
torch.save(net.state_dict(), model_base_path+"finals/"+str(datetime.today().timestamp())+".pt")
So I tried to train a fully connected model for a regression problem, with google colab. But it did not get trained well; The loss absolutely did not decrease. So I dug down and found out that the weights were really small. Any idea why this is happening and how I could avoid this? Thank you
I used MSE for loss and used ADaW optimizer. Below are the things I have tried
Tried other architectures (Changing number of layers sizes, Changed activation function ReLU, GELU)but the loss did not decrease
Tried changing the learning rate from 3e-1~1e-3, even tried 1
Tried other pre-processing(Used day/month/year instead of weekday) for the data
Given the label in the input data but loss did not decrease
Tried different batch_sizes(4, 10, 32, 64)
Removed batch_normalization
Other kinds of optimizer such as SGD, Adam
Training 20 epochs but loss did not decrease at all
The weights do change at loss.backward()
TL;DR: Invalid input data!! Check for NaN or NULL
Well it has been sometime since the question. Tried almost everything and though maybe messed up the project setup. So I deleted the project and tried it again: same. Delete again and migrate to TF2: THE SAME RESULT! So I found out that there wasn't any problem with the setup. So I searched other places. In the end I did find the reason. The input columns were actually modified by myself. (To remove some highly correlated features). It was not original. During the modification I messed up some float values and it ended up having NaN values. So check if you're dataset contains invalid values.

Passing word2vec embedding to a custom LSTM pytorch model

I have a set of input sentences. I am using the pretrained word2vec model from gensim to get the embedding of the input sentences. I want to pass these embeddings as input to a custom pytorch LSTM model
hidden_size = 32
num_layers = 1
num_classes = 2
class customModel(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(customModel, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.bilstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=False, bidirectional=True)
self.fcl = nn.Linear(hidden_size*2, num_classes)
def forward(self, x):
# Set initial hidden and cell states
h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
# Forward propagate LSTM
out, hidden = self.bilstm(x, (h0, c0))
fw_bilstm = out[-1, :, :self.hidden_size]
bk_bilstm = out[0, :, :self.hidden_size]
concat_fw_bw = torch.cat((fw_bilstm, bk_bilstm), dim = 1)
fc = self.fcl(concat_fw_bw)
x = F.softmax(F.relu(fc))
return x
Now I initialize the model object.
model = customModel(300, hidden_size, num_layers, num_classes)
Get embedding for the input sentences
sentences = [['my', 'name', 'is', 'nad'], ['i', 'love', 'nlp', 'proc']]
embedding = create_embedding(sentences)
embedding_torch = torch.FloatTensor(embedding)
Now I want to pass these embeddings to the model to get the prediction
for item in embedding_torch:
item = item.view((1, item.size()[0], item.size()[1]))
for epoch in range(1):
tag_scores = model(item)
print (tag_scores)
Which throws me runtime error
RuntimeError: Expected hidden[0] size (2, 4, 32), got (2, 1, 32)
I am not sure why this is happening. My understanding is h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device) line is calculating the hidden dimension properly.
What am I missing? Please suggest.
The backbone of your model is nn.LSTM which expects inputs with size [sequence_length, batch_size, embedding_size]. On the other hand, the inputs you are providing the model have size [1, sequence_lenth, embedding_size]. What I would do is create the nn.LSTM as:
# With batch_first=True
self.bilstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
That way, the model would expect the inputs to be of size [batch_size, sequence_length, embedding_size]. Then, instead of going through each element in the batch separately, do:
tag_scores = model(embedding_torch)

Highway networks in keras and lasagne - significant perfomance difference

I implemented highway networks with keras and with lasagne, and the keras version consistently underperforms to the lasagne version. I am using the same dataset and metaparameters in both of them. Here is the keras version's code:
X_train, y_train, X_test, y_test, X_all = hacking_script.load_all_data()
data_dim = 144
layer_count = 32
dropout = 0.04
hidden_units = 32
nb_epoch = 10
model = Sequential()
model.add(Dense(hidden_units, input_dim=data_dim))
model.add(Dropout(dropout))
for index in range(layer_count):
model.add(Highway(activation = 'relu'))
model.add(Dropout(dropout))
model.add(Dropout(dropout))
model.add(Dense(2, activation='softmax'))
print 'compiling...'
model.compile(loss='binary_crossentropy', optimizer='adagrad')
model.fit(X_train, y_train, batch_size=100, nb_epoch=nb_epoch,
show_accuracy=True, validation_data=(X_test, y_test), shuffle=True, verbose=0)
predictions = model.predict_proba(X_test)
And here is the lasagne version's code:
class MultiplicativeGatingLayer(MergeLayer):
def __init__(self, gate, input1, input2, **kwargs):
incomings = [gate, input1, input2]
super(MultiplicativeGatingLayer, self).__init__(incomings, **kwargs)
assert gate.output_shape == input1.output_shape == input2.output_shape
def get_output_shape_for(self, input_shapes):
return input_shapes[0]
def get_output_for(self, inputs, **kwargs):
return inputs[0] * inputs[1] + (1 - inputs[0]) * inputs[2]
def highway_dense(incoming, Wh=Orthogonal(), bh=Constant(0.0),
Wt=Orthogonal(), bt=Constant(-4.0),
nonlinearity=rectify, **kwargs):
num_inputs = int(np.prod(incoming.output_shape[1:]))
l_h = DenseLayer(incoming, num_units=num_inputs, W=Wh, b=bh, nonlinearity=nonlinearity)
l_t = DenseLayer(incoming, num_units=num_inputs, W=Wt, b=bt, nonlinearity=sigmoid)
return MultiplicativeGatingLayer(gate=l_t, input1=l_h, input2=incoming)
# ==== Parameters ====
num_features = X_train.shape[1]
epochs = 10
hidden_layers = 32
hidden_units = 32
dropout_p = 0.04
# ==== Defining the neural network shape ====
l_in = InputLayer(shape=(None, num_features))
l_hidden1 = DenseLayer(l_in, num_units=hidden_units)
l_hidden2 = DropoutLayer(l_hidden1, p=dropout_p)
l_current = l_hidden2
for k in range(hidden_layers - 1):
l_current = highway_dense(l_current)
l_current = DropoutLayer(l_current, p=dropout_p)
l_dropout = DropoutLayer(l_current, p=dropout_p)
l_out = DenseLayer(l_dropout, num_units=2, nonlinearity=softmax)
# ==== Neural network definition ====
net1 = NeuralNet(layers=l_out,
update=adadelta, update_rho=0.95, update_learning_rate=1.0,
objective_loss_function=categorical_crossentropy,
train_split=TrainSplit(eval_size=0), verbose=0, max_epochs=1)
net1.fit(X_train, y_train)
predictions = net1.predict_proba(X_test)[:, 1]
Now the keras version barely outperforms logistic regression, while the lasagne version is the best scoring algorithm so far. Any ideas as to why?
Here are some suggestions (I'm not sure if they will actually close the performance gap you are observing):
According to the Keras documentation the Highway layer is initialized using Glorot Uniform weights while in your Lasagne code you are using Orthogonal weight initialization. Unless you have another part of your code where you set the weight initialization to Orthogonal for the Keras Highway layer, this could be a source of the performance gap.
It also seems like you are using Adagrad for your Keras model, but you are using Adadelta for your Lasagne model.
Also I am not 100% sure about this, but you may also want to verify that your transform bias terms are initialized the same way.