PyTorch CNN test result is stuck - deep-learning

TRAINING CODE
if os.path.isfile(PATH):
    print("checkpoint training '{}' ...".format(PATH))
    checkpoint = torch.load(PATH)
    start_epoch = checkpoint['epoch']
    start_i = checkpoint['i']
    net.load_state_dict(checkpoint['state_dict'])
    print("=> loaded checkpoint '{}' (trained for {} epochs, {} i)".format(
        PATH, checkpoint['epoch'], checkpoint['i']))
else:
    print('new training')
    start_epoch, start_i = 0, 0  # nothing to resume; avoids a NameError below

for epoch in range(num_epochs):  # loop over the dataset multiple times
    running_loss = 0.0
    for i in range(len(train_folder_list2)):
        # get the inputs; data is a list of [inputs, labels]
        # net.train()
        inputs, labels = train_input[i], train_list[i]
        inputs = torch.as_tensor(inputs).cuda()
        inputs = inputs.transpose(1, 3)
        labels = torch.as_tensor(labels).cuda()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)

        # zero the parameter gradients
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 100 == 1:
            save_checkpoint({
                'epoch': start_epoch + epoch + 1,
                'i': start_i + i + 1,
                'state_dict': net.state_dict(),
            })
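save_checkpoint is called above but not shown; a minimal sketch, assuming it simply wraps torch.save with the same path the loading code uses:

import torch

# Hypothetical helper; the path is assumed to match the one loaded above.
def save_checkpoint(state, path='./checkpoint.pth'):
    torch.save(state, path)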
TEST CODE
PATH = './checkpoint.pth'
model = Net().cuda()
if os.path.isfile(PATH):
    print('checkpoint check!')
    checkpoint = torch.load(PATH)
    model.load_state_dict(checkpoint['state_dict'])
model.eval()

for k in range(len(train_folder_list2)):
    inputs = train_input[k]
    inputs = torch.as_tensor(inputs).cuda()
    inputs = inputs.transpose(1, 3)
    outputs = model(inputs)
    result = outputs.cpu().detach().numpy()
This is code to find the edges of an image.
If I run the training code to train the model and then test it with the test code, it doesn't seem to find any edges in the image: the predicted edges come out on the same side, whichever image I put in.
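One quick sanity check is whether the network's outputs differ at all between two different inputs; a minimal diagnostic sketch, reusing the variable names from the test code above:

import torch

# If two different images produce (nearly) identical outputs, the model has
# collapsed to a constant prediction rather than the data being at fault.
with torch.no_grad():
    out_a = model(torch.as_tensor(train_input[0]).cuda().transpose(1, 3))
    out_b = model(torch.as_tensor(train_input[1]).cuda().transpose(1, 3))
    print(torch.allclose(out_a, out_b, atol=1e-4))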
ADDED:
CNN CODE
In addition, here is the CNN code for more information. The input data was stored in lists, with the images and labels kept separate.
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(293904, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 18)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 293904)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        x = x.view(18)
        return x
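A note on the shapes in forward: x.view(18) only succeeds when the whole batch holds exactly 18 values, i.e. batch size 1, and it also drops the batch dimension. A tiny sketch of that behavior:

import torch

x = torch.randn(1, 18)
print(x.view(18).shape)      # torch.Size([18]) - works, but the batch dim is gone
x = torch.randn(2, 18)
# x.view(18) would raise a RuntimeError here (36 elements cannot be viewed as 18);
# keeping the batch dimension sidesteps both issues:
print(x.view(-1, 18).shape)  # torch.Size([2, 18])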

Related

Temporal sequence feature extraction CNN, batches with different dimensions

I am using a CNN to extract features from temporal data of different lengths. I am using pad_sequence to pad the data in a batch. However, as the max length in a batch changes, the padded sequence length differs by batch. This creates errors when I flatten the data for the FCN layer (the dimension of the flattened vector changes). I am currently handling this with an adaptive average pooling layer before the FCN layers. As this is a global averaging, it fixes the output dimension for the FCN. However, I am not sure if this is the correct thing to do.
Code is:
## pad tensors
def pad_collate(batch):
    sequences = [item[0] for item in batch]
    lengths = [len(seq) for seq in sequences]
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded_sequences, lengths

## Create dataloader
trainData = Sequence(root=path)
trainDataLoader = DataLoader(trainData, batch_size=BATCH_SIZE, collate_fn=pad_collate)

## CNN model
class FeatureExtractor(nn.Module):
    def __init__(self, block, layers):
        super(FeatureExtractor, self).__init__()
        self.inplanes = 6
        ## 1st CONV layers
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3, stride=2, padding=4)
        self.bn1 = nn.BatchNorm2d(6)
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        ## residual blocks
        self.layer0 = self._make_layer(block, 12, layers[0], stride=1)
        self.layer1 = self._make_layer(block, 24, layers[1], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((5, 5))  ##### MY CURRENT SOLUTION #####
        self.fc = nn.Linear(600, 128)

    def _make_layer(self, block, planes, blocks, stride):
        downsample = None
        if stride != 1 or self.inplanes != planes:
            downsample = nn.Sequential(nn.Conv2d(self.inplanes, planes, kernel_size=1, stride=stride),
                                       nn.BatchNorm2d(planes))
        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))
        return nn.Sequential(*layers)

    def forward(self, x):
        ## first conv
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.maxpool1(x)
        ## conv blocks
        x = self.layer0(x)
        x = self.layer1(x)
        ## FCN layer
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        output = self.fc(x)
        return output
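For what it's worth, adaptive average pooling does guarantee a fixed flatten size no matter how long the padded input is; a minimal sketch with made-up shapes:

import torch
import torch.nn as nn

# Two batches whose padded temporal dimension differs still pool to (5, 5),
# so torch.flatten always yields 24 * 5 * 5 = 600 features per sample.
pool = nn.AdaptiveAvgPool2d((5, 5))
a = torch.randn(4, 24, 17, 9)   # shorter padded batch
b = torch.randn(4, 24, 33, 9)   # longer padded batch
print(pool(a).shape)  # torch.Size([4, 24, 5, 5])
print(pool(b).shape)  # torch.Size([4, 24, 5, 5])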
Any other comments are also welcome (I am self-taught).

LSTM for time series forecasting

I have created an LSTM model in PyTorch which looks like this:
(image: LSTMNet)
Now I want to build another LSTM model (NewLSTMNet) on top of it (LSTMNet) by freezing the fc1 layer. I used:
model.fc1.weight.requires_grad = False
model.fc1.bias.requires_grad = False
and then replaced the fc2 layer with a linear layer with in_features = 40 and out_features = 40.
So far I did:
class NewLSTMNet(nn.Module):
    def __init__(self, model, input_size, hidden_size, num_layers):
        super(NewLSTMNet, self).__init__()
        self.model = model
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.model.fc2 = nn.Linear(40, 40)
        # self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc3 = nn.Sequential(
            # nn.ReLU(),
            nn.Linear(40, 128),
            nn.ReLU(),
            nn.Linear(128, 40),
            nn.ReLU(),
            nn.Linear(40, 1),
            nn.ReLU(),
        )

    def forward(self, x):
        # input = self.model(x)
        # h0 = Variable(torch.zeros(self.num_layers, input.size(0), self.hidden_size))
        # c0 = Variable(torch.zeros(self.num_layers, input.size(0), self.hidden_size))
        # _, (h_out, _) = self.lstm(input, (h0, c0))
        # h_out = h_out.view(-1, self.hidden_size)
        # print(h_out.shape)
        # out = self.fc3(out)
        out = self.model(x)
        out = self.fc3(out)
        return out
Now my new LSTM model looks like this:
(image: NewLSTMNet)
My training loop looks like this:
for epoch in range(EPOCHS):
    model.train()
    output = model(X_train)
    train_loss = criterion(output, y_train)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    with torch.no_grad():
        model.eval()
        output_val = model(X_valid)
        valid_loss = criterion(output_val, y_valid)

    if valid_loss <= valid_loss_min:
        torch.save(model.state_dict(), './state_dict_new.pt')
        print(f'Epoch {epoch + 0:01}: Validation loss decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model ...')
        valid_loss_min = valid_loss
        early_stopping_counter = 0  # reset counter if validation loss decreases
    else:
        print(f'Epoch {epoch + 0:01}: Validation loss did not decrease')
        early_stopping_counter += 1

    if early_stopping_counter > early_stopping_patience:
        print('Early stopped at epoch :', epoch)
        break

    print(f'\t Train_Loss: {train_loss:.4f} Val_Loss: {valid_loss:.4f} BEST VAL Loss: {valid_loss_min:.4f}\n')
Now the model is working fine, but I want to add an LSTM layer inside the NewLSTMNet model. I already tried to add one, but where I expected a vector in the output I am getting a matrix from the prediction, so there is a shape mismatch.
How should I modify my code? Any help is appreciated. Thanks in advance!
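For reference, a common way to get a vector per sample out of an LSTM is to keep only the last time step before the linear head; a minimal sketch with assumed sizes (batch 8, sequence length 15, 40 features), not the asker's exact model:

import torch
import torch.nn as nn

# An LSTM with batch_first=True returns a (batch, seq_len, hidden) matrix;
# slicing the last time step gives the (batch, hidden) vector a Linear expects.
lstm = nn.LSTM(input_size=40, hidden_size=40, num_layers=1, batch_first=True)
fc = nn.Linear(40, 1)

x = torch.randn(8, 15, 40)   # (batch, seq_len, features)
out, (h_n, c_n) = lstm(x)    # out: (8, 15, 40)
last = out[:, -1, :]         # (8, 40)
pred = fc(last)              # (8, 1)
print(pred.shape)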

issue with ArcFace (0 accuracy)

Hello guys, I've joined a university-level image recognition competition.
In the test, they will give two images (people's faces) and my model needs to detect whether the pair of images is the same person or not.
My model is ResNet-18 with IR blocks and SE blocks, and it uses ArcFace loss.
I can use only the MS1M dataset, with a total of 86876 classes.
The problem is that the loss is getting better, but the accuracy is 0 and not changing.
Here's the part of the code I'm working on.
Train
def train_model(model, net, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        for phase in ['train']:
            if phase == 'train':
                model.train()  # Set model to training mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in notebook.tqdm(dataloader):
                inputs = inputs.to(device)
                labels = labels.to(device).long()

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    features = model(inputs)
                    outputs = net(features, labels)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / len(dataloader)
            epoch_acc = running_corrects.double() / len(dataloader)
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'train' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save({'epoch': epoch,
                            'mode_state_dict': model.state_dict(),
                            'fc_state_dict': net.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'scheduler': scheduler.state_dict(),  # HERE IS THE CHANGE
                            }, f'/content/drive/MyDrive/inha_data/training_saver/training_stat{epoch}.pth')
                print(f'finished {epoch} and saved model_save_{epoch}.pt')
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best train Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    torch.save(model.state_dict(), 'model_save.pt')
    return model
Parameters
train_dataset = MS1MDataset('train')
dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # device setup

num_classes = 86876

# normal classifier
# net = nn.Sequential(nn.Linear(512, num_classes))

# Feature extractor backbone: input is a 112x112 image, output is a 512-dim feature vector
model_ft = resnet18(True)

# set metric
metric_fc = metrics.ArcMarginProduct(512, num_classes, s=30.0, m=0.50, easy_margin=False)
metric_fc.to(device)

# net = net.to(device)
model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = torch.optim.Adam([{'params': model_ft.parameters()}, {'params': metric_fc.parameters()}],
                                lr=0.1)

# Decay LR by a factor of 0.1 every 4 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=4, gamma=0.1)
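One thing worth flagging here: lr=0.1 is unusually high for Adam, whose default is 1e-3, and an overly large rate is one common cause of training that will not converge; a hedged variant of the same optimizer setup:

# Same parameter groups as above, only with Adam's conventional learning rate.
optimizer_ft = torch.optim.Adam(
    [{'params': model_ft.parameters()}, {'params': metric_fc.parameters()}],
    lr=1e-3)  # Adam's default; 0.1 frequently destabilizes training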
Arcface
from __future__ import print_function
from __future__ import division
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
import math

class ArcMarginProduct(nn.Module):
    r"""Implementation of large margin arc distance:
    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        s: norm of input feature
        m: margin
        cos(theta + m)
    """
    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # one_hot = torch.zeros(cosine.size(), requires_grad=True, device='cuda')
        one_hot = torch.zeros(cosine.size(), device='cuda')
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        # ------------- torch.where(out_i = {x_i if condition_i else y_i}) -------------
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)  # you can use torch.where if your torch.__version__ is 0.4
        output *= self.s
        # print(output)
        return output
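For clarity, a minimal usage sketch of this head (assuming a CUDA device, since one_hot above is created on 'cuda', and toy dimensions in place of the real 86876 classes): the margin-adjusted logits go straight into CrossEntropyLoss.

# Toy sizes for illustration; in the question, in_features=512 and out_features=86876.
head = ArcMarginProduct(512, 1000, s=30.0, m=0.50).cuda()
feats = torch.randn(4, 512).cuda()           # backbone embeddings
labels = torch.randint(0, 1000, (4,)).cuda()
logits = head(feats, labels)                 # (4, 1000), already scaled by s
loss = nn.CrossEntropyLoss()(logits, labels)
print(loss.item())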
dataset
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.125, contrast=0.125, saturation=0.125),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

# train_ms1_data = torchvision.datasets.ImageFolder('/content/drive/MyDrive/inha_data/train', transform=data_transforms)

class MS1MDataset(Dataset):
    def __init__(self, split):
        self.file_list = '/content/drive/MyDrive/inha_data/ID_List.txt'
        self.images = []
        self.labels = []
        self.transformer = data_transforms['train']

        with open(self.file_list) as f:
            files = f.read().splitlines()
        for i, fi in enumerate(files):
            fi = fi.split()
            image = "/content/" + fi[1]
            label = int(fi[0])
            self.images.append(image)
            self.labels.append(label)

    def __getitem__(self, index):
        img = Image.open(self.images[index])
        img = self.transformer(img)
        label = self.labels[index]
        return img, label

    def __len__(self):
        return len(self.images)
You can try using a smaller m in ArcFace, even a negative value.

why does loss jump up to NaN with the SWA method?

I use the SWA method to train a model in PyTorch.
SWA: https://pytorch.org/blog/stochastic-weight-averaging-in-pytorch/
My training code's loss jumps up to NaN immediately. The loss outputs are below:
1. loss: tensor(4.8463, device='cuda:0', grad_fn=<NllLossBackward>)
2. loss: tensor(118317.8516, device='cuda:0', grad_fn=<NllLossBackward>)
3. loss: tensor(5.7568e+22, device='cuda:0', grad_fn=<NllLossBackward>)
4. loss: tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
Without the SWA method, the loss doesn't jump up. Is there any problem in my training code with the SWA method?
I would appreciate any advice, thank you.
# batch_size
batch_size = 5

# DataLoaders
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# dict
dataloaders_dict = {"train": train_dataloader, "val": val_dataloader}

# example shapes
# train:  torch.Size([5, 25, 32, 32])
# target: torch.Size([5])

def train_model_withSWA(net, dataloaders_dict, criterion, optimizer, num_epochs):
    loss_list = []
    acc_list = []
    # validation lists
    val_loss_list = []
    val_acc_list = []

    for epoch in tqdm(range(num_epochs)):
        print("Epoch{}/{}".format(epoch + 1, num_epochs))
        print("--------------------------")
        for phase in ["train", "val"]:
            if phase == "train":
                net.train()
            else:
                net.eval()

            epoch_loss = 0.0
            epoch_corrects = 0

            # if (epoch == 0) and (phase == "train"):
            #     continue

            for inputs, labels in dataloaders_dict[phase]:
                # zero out the optimizer's gradients
                optimizer.zero_grad()
                with torch.set_grad_enabled(phase == "train"):
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                    outputs = net(inputs)
                    loss = criterion(outputs, labels)
                    print("loss:", loss)
                    _, preds = torch.max(outputs, 1)
                    if phase == "train":
                        loss.backward()
                        optimizer.step()
                    epoch_loss += loss.item() * inputs.size(0)
                    epoch_corrects += torch.sum(preds == labels.data)

            # for SWA
            optimizer.swap_swa_sgd()

            epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)
            epoch_acc = epoch_corrects.double() / len(dataloaders_dict[phase].dataset)
            print("{} Loss:{:.4f} Acc:{:.4f}".format(phase, epoch_loss, epoch_acc))

            # epoch_loss is already a Python float; epoch_acc lives on the GPU
            if phase == "train":
                loss_list.append(epoch_loss)
                acc_list.append(epoch_acc.cpu().numpy())
            else:
                val_loss_list.append(epoch_loss)
                val_acc_list.append(epoch_acc.cpu().numpy())
from torchcontrib.optim import SWA

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

# criterion
criterion = nn.CrossEntropyLoss()

net = net.to(device)
base_opt = torch.optim.SGD(net.parameters(), lr=0.1)
optimizer = SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)

### train with SWA
train_model_withSWA(net=net, dataloaders_dict=dataloaders_dict,
                    criterion=criterion,
                    optimizer=optimizer,
                    num_epochs=num_epochs)

# model's loss jumps up to nan....
# loss: tensor(4.8463, device='cuda:0', grad_fn=<NllLossBackward>)
# loss: tensor(118317.8516, device='cuda:0', grad_fn=<NllLossBackward>)
# loss: tensor(5.7568e+22, device='cuda:0', grad_fn=<NllLossBackward>)
# loss: tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
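For comparison, the usage pattern in the torchcontrib README calls swap_swa_sgd() exactly once, after the training loop finishes, to swap the averaged weights in; in train_model_withSWA above it runs once per phase inside the epoch loop. A condensed sketch of the documented pattern:

# Condensed from the torchcontrib README; the loop body mirrors the code above.
opt = SWA(torch.optim.SGD(net.parameters(), lr=0.1), swa_start=10, swa_freq=5, swa_lr=0.05)
for epoch in range(num_epochs):
    for inputs, labels in train_dataloader:
        opt.zero_grad()
        loss = criterion(net(inputs.to(device)), labels.to(device))
        loss.backward()
        opt.step()
opt.swap_swa_sgd()  # once, at the very end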
In addition, I use the following Self-Attention and Positional Encoder code.
class Self_Attention(nn.Module):
    """ Self-Attention Layer """
    def __init__(self, in_dim):
        super(Self_Attention, self).__init__()
        # pointwise convolutions
        self.query_conv = nn.Conv2d(
            in_channels=in_dim, out_channels=in_dim, kernel_size=1)
        self.key_conv = nn.Conv2d(
            in_channels=in_dim, out_channels=in_dim, kernel_size=1)
        self.value_conv = nn.Conv2d(
            in_channels=in_dim, out_channels=in_dim, kernel_size=1)
        # softmax
        self.softmax = nn.Softmax(dim=-2)
        # output = x + gamma * o, with gamma initialized to 0
        self.gamma = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        x = x.to(device)
        X = x
        # B,C',W,H -> B,C',N
        proj_query = self.query_conv(X).view(
            X.shape[0], -1, X.shape[2] * X.shape[3])  # size: B,C',N
        proj_query = proj_query.permute(0, 2, 1)  # transpose
        proj_key = self.key_conv(X).view(
            X.shape[0], -1, X.shape[2] * X.shape[3])  # size: B,C',N

        # batched matrix multiply
        S = torch.bmm(proj_query, proj_key)

        attention_map_T = self.softmax(S)
        attention_map = attention_map_T.permute(0, 2, 1)

        # Self-Attention map
        proj_value = self.value_conv(X).view(
            X.shape[0], -1, X.shape[2] * X.shape[3])  # size: B,C,N
        o = torch.bmm(proj_value, attention_map.permute(0, 2, 1))
        o = o.view(X.shape[0], X.shape[1], X.shape[2], X.shape[3])

        out = x + self.gamma * o
        # print("gamma:", self.gamma)
        return out, attention_map
class PositionalEncoder(nn.Module):
    def __init__(self, d_model=300, max_seq_len=256):
        super(PositionalEncoder, self).__init__()
        self.d_model = d_model
        pe = torch.zeros(max_seq_len, d_model)

        # GPU
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        pe = pe.to(device)

        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))

        self.pe = pe.unsqueeze(0)
        self.pe.requires_grad = False

    def forward(self, x):
        x = x.to(device)
        ret = math.sqrt(self.d_model) * x + self.pe
        return ret
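As an aside, a common alternative for the pe table is register_buffer, which moves it together with model.to(device) and keeps it out of gradients without touching requires_grad by hand; a sketch (the sin/cos fill is the same as above and elided):

import math
import torch
import torch.nn as nn

class PositionalEncoderBuffered(nn.Module):
    def __init__(self, d_model=300, max_seq_len=256):
        super().__init__()
        self.d_model = d_model
        pe = torch.zeros(1, max_seq_len, d_model)
        # ... fill pe[0] with the same sin/cos values as above ...
        self.register_buffer('pe', pe)  # follows .to(device), excluded from autograd

    def forward(self, x):
        return math.sqrt(self.d_model) * x + self.pe[:, :x.size(1)]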
Without normalization in the data preprocessing, the loss doesn't jump up to NaN.
This may be caused by an error in my preprocessing; I still don't understand the relation between normalization and the loss jumping up with the SWA method.
I'm sorry for the mistake.

how to concatenate an embedding layer in pytorch

I am trying to concatenate an embedding layer with other features. It doesn't give me any error, but it doesn't do any training either. Is anything wrong with this model definition, and how can I debug it?
Note: the last column (feature) in my X is the word index (word2ix) for a single word.
Note: the net works fine without the embedding feature/layer.
Originally posted on the PyTorch forum.
class Net(torch.nn.Module):
    def __init__(self, n_features, h_sizes, num_words, embed_dim, out_size, dropout=None):
        super().__init__()
        self.num_layers = len(h_sizes)  # hidden + input
        self.embedding = torch.nn.Embedding(num_words, embed_dim)
        self.hidden = torch.nn.ModuleList()
        self.bnorm = torch.nn.ModuleList()
        if dropout is not None:
            self.dropout = torch.nn.ModuleList()
        else:
            self.dropout = None
        for k in range(len(h_sizes)):
            if k == 0:
                self.hidden.append(torch.nn.Linear(n_features, h_sizes[0]))
                self.bnorm.append(torch.nn.BatchNorm1d(h_sizes[0]))
                if self.dropout is not None:
                    self.dropout.append(torch.nn.Dropout(p=dropout))
            else:
                if k == 1:
                    input_dim = h_sizes[0] + embed_dim
                else:
                    input_dim = h_sizes[k - 1]
                self.hidden.append(torch.nn.Linear(input_dim, h_sizes[k]))
                self.bnorm.append(torch.nn.BatchNorm1d(h_sizes[k]))
                if self.dropout is not None:
                    self.dropout.append(torch.nn.Dropout(p=dropout))

        # Output layer
        self.out = torch.nn.Linear(h_sizes[-1], out_size)

    def forward(self, inputs):
        # Feedforward
        for l in range(self.num_layers):
            if l == 0:
                x = self.hidden[l](inputs[:, :-1])
                x = self.bnorm[l](x)
                if self.dropout is not None:
                    x = self.dropout[l](x)
                embeds = self.embedding(inputs[:, -1])  # .view((1, -1))
                x = torch.cat((embeds, x), dim=1)
            else:
                x = self.hidden[l](x)
                x = self.bnorm[l](x)
                if self.dropout is not None:
                    x = self.dropout[l](x)
            x = F.relu(x)
        output = self.out(x)
        return output
There were a few issues. The key one was the data type: I had mixed float features and int indices.
Sample data and training before the fix:
NUM_TARGETS = 4
NUM_FEATURES = 3
NUM_TEXT_FEATURES = 1
x = np.random.rand(5, NUM_FEATURES)
y = np.random.rand(5, NUM_TARGETS)
word_ix = np.arange(5).reshape(-1,1).astype(int)
x_train = np.append(x, word_ix, axis=1)
x_train = torch.from_numpy(x).float().to(device)
y_train = torch.from_numpy(y).float().to(device)
h_sizes = [2,2]
net = Net(x_train.shape[1] , h_sizes=h_sizes, num_words=5, embed_dim=2, out_size=y_train.shape[1],dropout=.01) # define the network
print(net) # net architecture
net = net.float()
net.to(device)
optimizer = torch.optim.Adam(net.parameters(), lr=0.0001, weight_decay=.01)
loss_func = torch.nn.MSELoss() # this is for regression mean squared loss
# one training loop
prediction = net(x_train) # input x and predict based on x
loss = loss_func(prediction, y_train) # must be (1. nn output, 2. target)
optimizer.zero_grad() # clear gradients for next train
loss.backward() # backpropagation, compute gradients
optimizer.step() # apply gradients
# train_losses.append(loss.detach().to('cpu').numpy())
To resolve this, I separated the word index feature from x and also removed net.float().
I changed the dtype conversions to:
x_train = torch.from_numpy(x).float().to(device)
y_train = torch.from_numpy(y).float().to(device)
# NOTE: word index needs to be long
word_ix = torch.from_numpy(word_ix).to(torch.long).to(device)
and the forward method changed to:
def forward(self, inputs, word_ix):
    # Feedforward
    for l in range(self.num_layers):
        if l == 0:
            x = self.hidden[l](inputs)
            x = self.bnorm[l](x)
            if self.dropout is not None:
                x = self.dropout[l](x)
            embeds = self.embedding(word_ix)
            # NOTE:
            # embeds has a shape of (batch_size, 1, embed_dim);
            # in order to merge it with x, reshape it to
            # (batch_size, embed_dim)
            embeds = embeds.view(embeds.shape[0], embeds.shape[2])
            x = torch.cat((x, embeds.view(x.shape)), dim=1)
        else:
            x = self.hidden[l](x)
            x = self.bnorm[l](x)
            if self.dropout is not None:
                x = self.dropout[l](x)
        x = F.relu(x)
    output = self.out(x)
    return output
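A short usage sketch of the fixed version, with the same variable names as above: float features and long word indices are now passed as separate arguments, matching the new forward signature.

# One training step with the corrected dtypes and split inputs.
prediction = net(x_train, word_ix)   # float features + long indices
loss = loss_func(prediction, y_train)
optimizer.zero_grad()
loss.backward()
optimizer.step()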