I have created a LSTM model in Pytorch which looks like this:
LSTMNet
Now I want to build another LSTM model (NewLSTMNet) on top of it (LSTMNet) by freezing the fc1 layer. I used:
model.fc1.weight.requires_grad = False
model.fc1.bias.requires_grad = False
and then I changed fc2 layer with a linear layer with input features = 40 and output features = 40.
So far I did:
class NewLSTMNet(nn.Module):
def __init__(self, model, input_size, hidden_size, num_layers):
super(NewLSTMNet, self).__init__()
self.model = model
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.model.fc2 = nn.Linear(40, 40)
# self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
self.fc3 = nn.Sequential(
# nn.ReLU(),
nn.Linear (40 , 128),
nn.ReLU(),
nn.Linear(128, 40),
nn.ReLU(),
nn.Linear(40,1),
nn.ReLU(),
)
def forward(self,x):
# input = self.model(x)
# h0 = Variable(torch.zeros(self.num_layers, input.size(0), self.hidden_size))
# c0 = Variable(torch.zeros(self.num_layers, input.size(0), self.hidden_size))
# _, (h_out,_) = self.lstm(input, (h0,c0))
# h_out = h_out.view(-1, self.hidden_size)
# print(h_out.shape)
# out = self.fc3(out)
out = self.model(x)
out = self.fc3(out)
return out
Now my new LSTM model looks like this:
NewLSTMNet
My training loop looks like this:
for epoch in range(EPOCHS):
model.train()
output = model(X_train)
train_loss = criterion(output, y_train)
optimizer.zero_grad()
train_loss.backward()
optimizer.step()
with torch.no_grad():
model.eval()
output_val = model(X_valid)
valid_loss = criterion(output_val, y_valid)
if valid_loss <= valid_loss_min:
torch.save(model.state_dict(), './state_dict_new.pt')
print(
f'Epoch {epoch + 0:01}: Validation loss decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model ...')
valid_loss_min = valid_loss
early_stopping_counter = 0 # reset counter if validation loss decreases
else:
print(f'Epoch {epoch + 0:01}: Validation loss did not decrease')
early_stopping_counter += 1
if early_stopping_counter > early_stopping_patience:
print('Early stopped at epoch :', epoch)
break
print(f'\t Train_Loss: {train_loss:.4f} Val_Loss: {valid_loss:.4f} BEST VAL Loss: {valid_loss_min:.4f}\n')
Now the model is working fine. But I want to create a LSTM layer in the NewLSTMNet model. I already tried to add a LSTM layer but I was expecting a vector in the output but I am getting a matrix in the output of the prediction. So there is shape mismatch!
How should I modify my code? Any help is appreciated. Thanks in advance!
Related
I am using LSTM for time series forecasting. I used a LSTM model
LSTMNet(
(lstm): LSTM(1, 128, batch_first=True)
(fc1): Linear(in_features=128, out_features=40, bias=True)
(fc2): Linear(in_features=40, out_features=1, bias=True)
(relu): ReLU()
)
Then I am freezing the last layer fc2 and trying to build another lstm layer and then train my model with the Input size[841, 14, 1]. However I am getting error:
"input.size(-1) must be equal to input_size. Expected 40, got 1."
My code is
early_stopping_patience = 150
early_stopping_counter = 0
valid_loss_min=np.inf
for epoch in range(EPOCHS):
model.train()
output = model(X_train)
train_loss = criterion(output, y_train)
optimizer.zero_grad()
train_loss.backward()
optimizer.step()
with torch.no_grad():
model.eval()
output_val = model(X_valid) #/edited here with reshape
valid_loss = criterion(output_val, y_valid)
if valid_loss <= valid_loss_min:
torch.save(model.state_dict(), './state_dict_new.pt')
print(f'Epoch {epoch + 0:01}: Validation loss decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model ...')
valid_loss_min = valid_loss
early_stopping_counter=0 #reset counter if validation loss decreases
else:
print(f'Epoch {epoch + 0:01}: Validation loss did not decrease')
early_stopping_counter+=1
if early_stopping_counter > early_stopping_patience:
print('Early stopped at epoch :', epoch)
break
print(f'\t Train_Loss: {train_loss:.4f} Val_Loss: {valid_loss:.4f} BEST VAL Loss: {valid_loss_min:.4f}\n')
My Model is
class NewLSTMNet(nn.Module):
def __init__(self, base_model, input_size, hidden_size, num_layers):
super(NewLSTMNet, self).__init__()
self.num_layers = num_layers
self.input_size = input_size
self.base_model = base_model
self.hidden_size = hidden_size
for param in self.base_model.parameters():
param.requires_grad = False
self.lstm = nn.LSTM(
input_size=40, hidden_size=hidden_size,
num_layers=num_layers, batch_first=True
)
self.fc1 = nn.Linear(hidden_size,64)
self.fc2 = nn.Linear(64,1)
self.relu = nn.ReLU()
def forward(self,x):
out = self.base_model(x)
out = out.view(out.shape[0], -1, 1)
h0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size))
c0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size))
_, (h_out, _) = self.lstm(out, (h0, c0))
h_out = h_out.view(-1, self.hidden_size)
out = self.fc2(self.relu(self.fc1(h_out)))
return out
Could anyone please look into it and help me please?
I've written a snippet to classify Omniglot images. I calculate the training and validation losses in each epoch, where the latter is computed using images that were not seen by the network before. The two plots are as below:
Since the training loss decreases while the validation loss increases, I have concluded that my model overfits. I've tried several suggestions (e.g. here) to overcome this, including:
Increasing the size of the training set.
shuffling the data.
Adding dropout layers (up to p=0.9).
Using smaller model.
Altering the architecture.
Changing the learning rate.
Reducing the batch size.
Adding weight decay.
However, the validation loss still increases. I wonder if there are any other suggestions to improve this behavior or if this is not overfitting, but the problem is something else. Below is the snippet used in this question.
import torch
import torchvision
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
class MyModel(nn.Module):
def __init__(self):
super(MyModel, self).__init__()
dim_out = 964
# -- embedding params
self.cn1 = nn.Conv2d(1, 16, 7)
self.cn2 = nn.Conv2d(16, 32, 4)
self.cn3 = nn.Conv2d(32, 64, 3)
self.pool = nn.MaxPool2d(2)
self.bn1 = nn.BatchNorm2d(16)
self.bn2 = nn.BatchNorm2d(32)
self.bn3 = nn.BatchNorm2d(64)
# -- prediction params
self.fc1 = nn.Linear(256, 170)
self.fc2 = nn.Linear(170, 50)
self.fc3 = nn.Linear(50, dim_out)
# -- non-linearity
self.relu = nn.ReLU()
self.Beta = 10
self.sopl = nn.Softplus(beta=self.Beta)
def forward(self, x):
y1 = self.pool(self.bn1(self.relu(self.cn1(x))))
y2 = self.pool(self.bn2(self.relu(self.cn2(y1))))
y3 = self.relu(self.bn3(self.cn3(y2)))
y3 = y3.view(y3.size(0), -1)
y5 = self.sopl(self.fc1(y3))
y6 = self.sopl(self.fc2(y5))
return self.fc3(y6)
class Train:
def __init__(self):
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# -- data
dim = 28
batch_size = 400
my_transforms = transforms.Compose([transforms.Resize((dim, dim)), transforms.ToTensor()])
trainset = torchvision.datasets.Omniglot(root="./data/omniglot_train/", download=False, transform=my_transforms)
validset = torchvision.datasets.Omniglot(root="./data/omniglot_train/", background=False, download=False,
transform=my_transforms)
self.TrainDataset = DataLoader(dataset=trainset, batch_size=batch_size, shuffle=True)
self.ValidDataset = DataLoader(dataset=validset, batch_size=len(validset), shuffle=False)
self.N_train = len(trainset)
self.N_valid = len(validset)
# -- model
self.model = MyModel().to(self.device)
# -- train
self.epochs = 3000
self.loss = nn.CrossEntropyLoss()
self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
def train_epoch(self):
self.model.train()
train_loss = 0
for batch_idx, data_batch in enumerate(self.TrainDataset):
# -- predict
predict = self.model(data_batch[0].to(self.device))
# -- loss
loss = self.loss(predict, data_batch[1].to(self.device))
# -- optimize
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
train_loss += loss.item()
return train_loss/(batch_idx+1)
def valid_epoch(self):
with torch.no_grad():
self.model.eval()
for data_batch in self.ValidDataset:
# -- predict
predict = self.model(data_batch[0].to(self.device))
# -- loss
loss = self.loss(predict, data_batch[1].to(self.device))
return loss.item()
def __call__(self):
for epoch in range(self.epochs):
train_loss = self.train_epoch()
valid_loss = self.valid_epoch()
print('Epoch {}: Training loss = {:.5f}, Validation loss = {:.5f}.'.format(epoch, train_loss, valid_loss))
torch.save(self.model.state_dict(), './model_stat.pth')
if __name__ == '__main__':
my_train = Train()
my_train()
If your train accuracy is good but testing (data not used in training) accuracy is bad then you have an overfitting problem. I had the same problem with a CNN model. You can use two methods to overcome overfitting. First is early stopping for your train and second is regularization. Check the below example:
# L2 regularizers for layers
model = keras.Sequential([
keras.layers.InputLayer(input_shape=(32, 32)),
keras.layers.Reshape(target_shape=(32, 32, 1)),
keras.layers.Conv2D(filters=12, kernel_size=(3, 3), activation=tf.nn.relu, use_bias=True , kernel_regularizer =tf.keras.regularizers.l2( l=0.01)),
keras.layers.MaxPooling2D(pool_size=(2, 2)),
keras.layers.Flatten(),
keras.layers.Dense(10, activation = 'softmax', use_bias=True)
])
model.compile(optimizer='adam',
loss=tf.keras.losses.SparseCategoricalCrossentropy(),
metrics=['accuracy'])
#Early Stopping
history = model.fit(X_train, Y_train,
validation_data=(X_dev, Y_dev),
epochs=4000,
callbacks=EarlyStopping(monitor='val_loss'))
Do not forget to import for early stopping.
from tensorflow.keras.callbacks import EarlyStopping
TRAINING CODE
if os.path.isfile(PATH):
print("checkpoint training '{}' ...".format(PATH))
checkpoint = torch.load(PATH)
start_epoch = checkpoint['epoch']
start_i = checkpoint['i']
net.load_state_dict(checkpoint['state_dict'])
print("=> loaded checkpoint '{}' (trained for {} epochs, {} i)".format(PATH, checkpoint['epoch'],
checkpoint['i']))
else:
print('new training')
for epoch in range(num_epochs): # loop over the dataset multiple times
running_loss = 0.0
for i in range(len(train_folder_list2)):
# get the inputs; data is a list of [inputs, labels]
# net.train()
inputs, labels = train_input[i], train_list[i]
inputs = torch.as_tensor(inputs).cuda()
inputs = inputs.transpose(1, 3)
labels = torch.as_tensor(labels).cuda()
# forward + backward + optimize
outputs = net(inputs)
loss = criterion(outputs, labels)
# zero the parameter gradients
optimizer.zero_grad()
loss.backward()
optimizer.step()
running_loss += loss.item()
if i % 100 == 1:
save_checkpoint({
'epoch': start_epoch + epoch + 1,
'i': start_i + i + 1,
'state_dict': net.state_dict(),
})
TEST CODE
PATH = './checkpoint.pth'
model = Net().cuda()
if os.path.isfile(PATH):
print('checkpoint check!')
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['state_dict'])
model.eval()
for k in range(len(train_folder_list2)):
inputs = train_input[k]
inputs = torch.as_tensor(inputs).cuda()
inputs = inputs.transpose(1, 3)
outputs = model(inputs)
result = outputs.cpu().detach().numpy()
This is the code to find the edges of the image.
If I run the training code, train it, and test it with the test code, it doesn't seem to find any edges in the image. The edges are on the same side, whatever image i put.
**ADD
CNN CODE
In addition, we added cnn code to give you information. Data input was put in the list separately from the image and label.
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(3, 6, 5)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.fc1 = nn.Linear(293904, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 18)
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = x.view(-1, 293904)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
x = x.view(18)
return x
About the input. Sorry for the bad formatting. The for each two rows first row is the key and second row is the value. 18~20_ride is the label and is not included in the input. Below is one input. And train set consists of 400000 of these.
bus_route_id station_code latitude longitude 6~7_ride
0 4270000 344 33.48990 126.49373
7~8_ride 8~9_ride 9~10_ride 10~11_ride 11~12_ride 6~7_takeoff
0.0 1.0 2.0 5.0 2.0 6.0
7~8_takeoff 8~9_takeoff 9~10_takeoff 10~11_takeoff 11~12_takeoff
0.0 0.0 0.0 0.0 0.0
18~20_ride weekday dis_jejusi dis_seoquipo
0.0 6 2.954920 26.256744
Example weights: Captured at 4th epoch. After 20 epochs of training I got much smaller values (ex. -7e-44 or 1e-55)
2.3937e-11, -2.6920e-12, -1.0445e-11, ..., -1.0754e-11, 1.1128e-11, -1.4814e-11
The model's prediction and target
#Target
[2.],
[0.],
[0.]
#Prediction
[1.4187],
[1.4187],
[1.4187]
MyDataset.py
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import torch
import os
class MyDataset(Dataset):
def __init__(self, csv_filename):
self.dataset = pd.read_csv(csv_filename, index_col=0)
self.labels = self.dataset.pop("18~20_ride")
self.dataset = self.dataset.values
self.labels = np.reshape(self.labels.values,(-1,1))
def __len__(self):
return len(self.dataset)
def __getitem__(self, idx):
return self.dataset[idx], self.labels[idx]
Model
class Network(nn.Module):
def __init__(self, input_num):
super(Network, self).__init__()
self.fc1 = nn.Sequential(
nn.Linear(input_num, 64),
nn.BatchNorm1d(64),
GELU()
)
self.fc2 = nn.Sequential(
nn.Linear(64, 64),
nn.BatchNorm1d(64),
GELU()
)
self.fc3 = nn.Sequential(
nn.Linear(64, 64),
nn.BatchNorm1d(64),
GELU()
)
self.fc4 = nn.Sequential(
nn.Linear(64, 64),
nn.BatchNorm1d(64),
GELU()
)
self.fc5 = nn.Sequential(
nn.Linear(64, 64),
nn.BatchNorm1d(64),
GELU()
)
self.fc6 = nn.Sequential(
nn.Linear(64, 64),
nn.BatchNorm1d(64),
GELU)
)
self.fc7 = nn.Sequential(
nn.Linear(64, 64),
nn.BatchNorm1d(64),
GELU()
)
self.fc8 = nn.Sequential(
nn.Linear(64, 64),
nn.BatchNorm1d(64),
GELU())
)
self.fc9 = nn.Linear(64, 1)
The training and validation
def train(model, device, train_loader, optimizer, loss_fn, log_interval, epoch):
print("Training")
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.float().to(device), target.float().to(device)
optimizer.zero_grad()
output = model(data)
loss = loss_fn(output, target)
loss.backward()
optimizer.step()
if batch_idx % log_interval == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, (batch_idx+1) * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
def validate(model, device, loader, loss_fn):
print("\nValidating")
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for batch_idx, (data, target) in enumerate(loader):
data, target = data.float().to(device), target.float().to(device)
output = model(data)
test_loss += loss_fn(output, target).item() # sum up batch loss
test_loss /= len(loader)
print('Validation average loss: {:.4f}\n'.format(
test_loss))
return test_loss
Entire process of training and validation
from MyDataset import MyDataset
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR
from datetime import datetime
train_dataset_path = "/content/drive/My Drive/root/bus/dataset/train.csv"
val_dataset_path = "/content/drive/My Drive/root/bus/dataset/val.csv"
model_base_path = "/content/drive/My Drive/root/bus/models/"
model_file = "/content/drive/My Drive/root/bus/models/checkpoints/1574427776.202017.pt"
"""
Training Config
"""
epochs = 10
batch_size = 32
learning_rate = 0.5
check_interval = 4
log_interval = int(40000/batch_size)
gamma = 0.1
load_model = False
save_model = True
make_checkpoint = True
"""
End of config
"""
# Read test set
train_set = MyDataset(train_dataset_path)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_set = MyDataset(val_dataset_path)
val_loader = DataLoader(val_set, batch_size=1)
print("Data READY")
device = torch.device("cuda")
net = Network(19).float().to(device)
if load_model:
net.load_state_dict(torch.load(model_file))
loss_fn = torch.nn.MSELoss()
optimizer = optim.AdamW(net.parameters(), lr=learning_rate)
best_loss = float('inf')
isAbort = False
for epoch in range(1, epochs+1):
train(net, device, train_loader, optimizer, loss_fn, log_interval, epoch)
val_loss = validate(net, device, val_loader, loss_fn)
if epoch%check_interval==0:
if make_checkpoint:
print("Saving new checkpoint")
torch.save(net.state_dict(), model_base_path+"checkpoints/"+str(datetime.today().timestamp())+".pt")
"""
if val_loss < best_loss and epoch%check_interval==0:
best_loss = val_loss
if make_checkpoint:
print("Saving new checkpoint")
torch.save(net.state_dict(), model_base_path+"checkpoints/"+str(datetime.today().timestamp())+".pt")
else:
print("Model overfit detected. Aborting training")
isAbort = True
break
"""
if save_model and not isAbort:
torch.save(net.state_dict(), model_base_path+"finals/"+str(datetime.today().timestamp())+".pt")
So I tried to train a fully connected model for a regression problem, with google colab. But it did not get trained well; The loss absolutely did not decrease. So I dug down and found out that the weights were really small. Any idea why this is happening and how I could avoid this? Thank you
I used MSE for loss and used ADaW optimizer. Below are the things I have tried
Tried other architectures (Changing number of layers sizes, Changed activation function ReLU, GELU)but the loss did not decrease
Tried changing the learning rate from 3e-1~1e-3, even tried 1
Tried other pre-processing(Used day/month/year instead of weekday) for the data
Given the label in the input data but loss did not decrease
Tried different batch_sizes(4, 10, 32, 64)
Removed batch_normalization
Other kinds of optimizer such as SGD, Adam
Training 20 epochs but loss did not decrease at all
The weights do change at loss.backward()
TL;DR: Invalid input data!! Check for NaN or NULL
Well it has been sometime since the question. Tried almost everything and though maybe messed up the project setup. So I deleted the project and tried it again: same. Delete again and migrate to TF2: THE SAME RESULT! So I found out that there wasn't any problem with the setup. So I searched other places. In the end I did find the reason. The input columns were actually modified by myself. (To remove some highly correlated features). It was not original. During the modification I messed up some float values and it ended up having NaN values. So check if you're dataset contains invalid values.
I have a set of input sentences. I am using the pretrained word2vec model from gensim to get the embedding of the input sentences. I want to pass these embeddings as input to a custom pytorch LSTM model
hidden_size = 32
num_layers = 1
num_classes = 2
class customModel(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(customModel, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.bilstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=False, bidirectional=True)
self.fcl = nn.Linear(hidden_size*2, num_classes)
def forward(self, x):
# Set initial hidden and cell states
h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
# Forward propagate LSTM
out, hidden = self.bilstm(x, (h0, c0))
fw_bilstm = out[-1, :, :self.hidden_size]
bk_bilstm = out[0, :, :self.hidden_size]
concat_fw_bw = torch.cat((fw_bilstm, bk_bilstm), dim = 1)
fc = self.fcl(concat_fw_bw)
x = F.softmax(F.relu(fc))
return x
Now I initialize the model object.
model = customModel(300, hidden_size, num_layers, num_classes)
Get embedding for the input sentences
sentences = [['my', 'name', 'is', 'nad'], ['i', 'love', 'nlp', 'proc']]
embedding = create_embedding(sentences)
embedding_torch = torch.FloatTensor(embedding)
Now I want to pass these embeddings to the model to get the prediction
for item in embedding_torch:
item = item.view((1, item.size()[0], item.size()[1]))
for epoch in range(1):
tag_scores = model(item)
print (tag_scores)
Which throws me runtime error
RuntimeError: Expected hidden[0] size (2, 4, 32), got (2, 1, 32)
I am not sure why this is happening. My understanding is h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device) line is calculating the hidden dimension properly.
What am I missing? Please suggest.
The backbone of your model is nn.LSTM which expects inputs with size [sequence_length, batch_size, embedding_size]. On the other hand, the inputs you are providing the model have size [1, sequence_lenth, embedding_size]. What I would do is create the nn.LSTM as:
# With batch_first=True
self.bilstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
That way, the model would expect the inputs to be of size [batch_size, sequence_length, embedding_size]. Then, instead of going through each element in the batch separately, do:
tag_scores = model(embedding_torch)