RuntimeError: cudnn RNN backward can only be called in training mode - deep-learning

This is the first time I have seen this problem; I never encountered such an error in previous Python projects. Here is my training code:
def train(net, opt, criterion,ucf_train, batchsize,i):
opt.zero_grad()
total_loss = 0
net=net.eval()
net=net.train()
for vid in range(i*batchsize,i*batchsize+batchsize,1):
output=infer(net,ucf_train[vid])
m=get_label_no(ucf_train[vid])
m=m.cuda( )
loss = criterion(output,m)
loss.backward(retain_graph=True)
total_loss += loss
opt.step() #updates wghts and biases
return total_loss/n_points
code for infer(net,input)
def infer(net, name):
net.eval()
hidden_0 = net.init_hidden()
hidden_1 = net.init_hidden()
hidden_2 = net.init_hidden()
video_path = fetch_ucf_video(name)
cap = cv2.VideoCapture(video_path)
resize=(224,224)
T=FrameCapture(video_path)
print(T)
lim=T-(T%20)-2
i=0
while(1):
ret, frame2 = cap.read()
frame2= cv2.resize(frame2, resize)
# print(type(frame2))
if (i%20==0 and i<lim):
input=normalize(frame2)
input=input.cuda()
output,hidden_0,hidden_1, hidden_2 = net(input, hidden_0, hidden_1, hidden_2)
elif (i>=lim):
break
i=i+1
op=output
torch.cuda.empty_cache()
op=op.cuda()
return op
I am getting the following error, even though I tried calling model.train() (where net is my model):
RuntimeError Traceback (most recent call last)
<ipython-input-62-42238f3f6877> in <module>()
----> 1 train(net1,opt,criterion,ucf_train,1,0)
2 frames
/usr/local/lib/python3.6/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
125 Variable._execution_engine.run_backward(
126 tensors, grad_tensors, retain_graph, create_graph,
--> 127 allow_unreachable=True) # allow_unreachable flag
128
129
RuntimeError: cudnn RNN backward can only be called in training mode

You should remove the net.eval() call that comes right after the def infer(net, name):
It needs to be removed because you call this infer function inside your training code. Your model needs to be in train mode throughout the whole training.
You also never set your model back to train mode after calling eval, and that is the root cause of the exception you are getting. If you want to reuse this infer code in your test cases, you can cover that case with an if.
Also, the net.eval() that comes right after the total_loss=0 assignment is not useful, since you call net.train() immediately afterwards. You can remove that one too, since it is undone on the very next line.
The updated code
def train(net, opt, criterion,ucf_train, batchsize,i):
opt.zero_grad()
total_loss = 0
net=net.train()
for vid in range(i*batchsize,i*batchsize+batchsize,1):
output=infer(net,ucf_train[vid])
m=get_label_no(ucf_train[vid])
m=m.cuda( )
loss = criterion(output,m)
loss.backward(retain_graph=True)
total_loss += loss
opt.step() #updates wghts and biases
return total_loss/n_points
code for infer(net,input)
def infer(net, name, is_train=True):
    """Run ``net`` over every 20th frame of one video and return the last output.

    Args:
        net: Recurrent model, called as ``net(frame, h0, h1, h2)`` and expected
            to return ``(output, h0, h1, h2)``.
        name: Video identifier, resolved to a file path by ``fetch_ucf_video``.
        is_train: When False the model is switched to eval mode first. Leave it
            True when calling from the training loop, because cuDNN RNN
            backward can only be called in training mode.

    Returns:
        The model output for the last sampled frame, moved to the GPU.

    Raises:
        RuntimeError: If no frame could be sampled from the video.
    """
    if not is_train:
        net.eval()

    hidden_0 = net.init_hidden()
    hidden_1 = net.init_hidden()
    hidden_2 = net.init_hidden()

    video_path = fetch_ucf_video(name)
    frame_size = (224, 224)
    # NOTE(review): FrameCapture presumably returns the frame count -- confirm.
    total_frames = FrameCapture(video_path)
    print(total_frames)
    lim = total_frames - (total_frames % 20) - 2

    cap = cv2.VideoCapture(video_path)
    output = None
    try:
        i = 0
        while i < lim:
            ret, frame = cap.read()
            if not ret:
                # The decoder ran out of frames early; stop here instead of
                # handing an empty frame to cv2.resize.
                break
            if i % 20 == 0:
                frame_tensor = normalize(cv2.resize(frame, frame_size)).cuda()
                output, hidden_0, hidden_1, hidden_2 = net(
                    frame_tensor, hidden_0, hidden_1, hidden_2
                )
            i += 1
    finally:
        # Always free the capture handle, even if the model call raises.
        cap.release()

    if output is None:
        raise RuntimeError(f"no frames could be sampled from {video_path!r}")

    torch.cuda.empty_cache()
    return output.cuda()

Related

RuntimeError: shape '[128, -1]' is invalid for input of size 378 pytorch

I'm running a spiking neural network for data that has 21 features with a batch size of 128. I get the following error after many iterations of training (this error doesn't arise immediately!):
RuntimeError: shape '[128, -1]' is invalid for input of size 378 pytorch
When I went to go print out what the shapes of the tensors are before, I get the following:
Train
torch.Size([128, 21])
Test
torch.Size([128, 21])
This is my network:
class SpikingNeuralNetwork(nn.Module):
    """Two-layer fully connected spiking network built from leaky neurons.

    Args:
        number_inputs: Number of input features fed to the first linear layer.
        number_hidden: Number of units in the hidden layer (the output width of
            ``fc1``), not a count of layers.
        number_outputs: Number of output classes.
        beta: Decay rate passed to both ``snn.Leaky`` neurons.
    """

    def __init__(self, number_inputs, number_hidden, number_outputs, beta):
        super().__init__()
        self.number_inputs = number_inputs
        self.number_hidden = number_hidden
        self.number_outputs = number_outputs
        self.beta = beta

        # Initialize layers
        # Applies linear transformation to all input points
        self.fc1 = nn.Linear(self.number_inputs, self.number_hidden)
        # Integrates weighted input over time, emitting a spike if threshold condition is met
        self.lif1 = snn.Leaky(beta=self.beta)
        # Applies linear transformation to output spikes of lif1
        self.fc2 = nn.Linear(self.number_hidden, self.number_outputs)
        # Another spiking neuron, integrating the weighted spikes over time
        self.lif2 = snn.Leaky(beta=self.beta)

    def forward(self, x):
        """Simulate the network for 25 time steps on the same input ``x``.

        Args:
            x: Input passed into the network; it is fed to ``fc1`` unchanged at
                every step.

        Returns:
            A pair ``(spikes, membranes)`` of the output layer's recordings,
            each stacked along a new leading time dimension.
        """
        num_steps = 25

        # Initialize hidden states at t = 0
        mem1 = self.lif1.init_leaky()
        mem2 = self.lif2.init_leaky()

        # Record the final layer
        spk2_rec = []
        mem2_rec = []

        for _ in range(num_steps):
            cur1 = self.fc1(x)
            spk1, mem1 = self.lif1(cur1, mem1)
            cur2 = self.fc2(spk1)
            spk2, mem2 = self.lif2(cur2, mem2)
            spk2_rec.append(spk2)
            mem2_rec.append(mem2)

        return torch.stack(spk2_rec, dim=0), torch.stack(mem2_rec, dim=0)
This is my training loop:
def training_loop(net, train_loader, test_loader, dtype, device, optimizer):
num_epochs = 1
loss_history = []
test_loss_history = []
counter = 0
# Temporal dynamics
num_steps = 25
# Outer training loop
for epoch in range(num_epochs):
iter_counter = 0
train_batch = iter(train_loader)
# Minibatch training loop
for data, targets in train_batch:
data = data.to(device)
targets = targets.to(device)
# Forward pass
net.train()
print("Train")
print(data.size())
spk_rec, mem_rec = net(data.view(batch_size, -1))
# Initialize the loss and sum over time
loss_val = torch.zeros((1), dtype = dtype, device = device)
for step in range(num_steps):
loss_val += loss_function(mem_rec[step], targets.long().flatten().to(device))
# Gradient calculation and weight update
optimizer.zero_grad()
loss_val.backward()
optimizer.step()
# Store loss history for future plotting
loss_history.append(loss_val.item())
# Test set
with torch.no_grad():
net.eval()
test_data, test_targets = next(iter(test_loader))
test_data = test_data.to(device)
test_targets = test_targets.to(device)
# Test set forward pass
print("Test")
print(test_data.size())
test_spk, test_mem = net(test_data.view(batch_size, -1))
# Test set loss
test_loss = torch.zeros((1), dtype = dtype, device = device)
for step in range(num_steps):
test_loss += loss_function(test_mem[step], test_targets.long().flatten().to(device))
test_loss_history.append(test_loss.item())
# Print train/test loss and accuracy
if counter % 50 == 0:
train_printer(epoch, iter_counter, counter, loss_history, data, targets, test_data, test_targets)
counter = counter + 1
iter_counter = iter_counter + 1
return loss_history, test_loss_history
The error occurs on spk_rec, mem_rec = net(data.view(batch_size, -1)).
The code was adopted from https://snntorch.readthedocs.io/en/latest/tutorials/tutorial_5.html, where it was originally used for the MNIST dataset. However, I am not working with an image dataset. I am working with a dataset that has 21 features and predicts just one target (with 100 classes). I tried to change data.view(batch_size, -1) and test_data.view(batch_size, -1) to data.view(batch_size, 21) and test_data.view(batch_size, 21) based on some other forum answers that I saw, and my program is running for now through the training loop. Does anyone have any suggestions for how I can run through the training with no errors?
EDIT: I now get the error RuntimeError: shape '[128, 21]' is invalid for input of size 378 from spk_rec, mem_rec = net(data.view(batch_size, -1)).
Here are my DataLoaders:
train_loader = DataLoader(dataset = train, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(dataset = test, batch_size = batch_size, shuffle = True)
My batch size is 128.
I tried to run it myself to solve your problem, but I am also missing the net parameters and snn.Leaky:
import torch
from torch import nn
from torch.utils.data import DataLoader
class SpikingNeuralNetwork(nn.Module):
"""
Parameters in SpikingNeuralNetwork class:
1. number_inputs: Number of inputs to the SNN.
2. number_hidden: Number of hidden layers.
3. number_outputs: Number of output classes.
4. beta: Decay rate.
"""
def __init__(self, number_inputs, number_hidden, number_outputs, beta):
super().__init__()
self.number_inputs = number_inputs
self.number_hidden = number_hidden
self.number_outputs = number_outputs
self.beta = beta
# Initialize layers
self.fc1 = nn.Linear(self.number_inputs,
self.number_hidden) # Applies linear transformation to all input points
self.lif1 = snn.Leaky(
beta=self.beta) # Integrates weighted input over time, emitting a spike if threshold condition is met
self.fc2 = nn.Linear(self.number_hidden,
self.number_outputs) # Applies linear transformation to output spikes of lif1
self.lif2 = snn.Leaky(beta=self.beta) # Another spiking neuron, integrating the weighted spikes over time
"""
Forward propagation of SNN. The code below function will only be called once the input argument x
is explicitly passed into net.
#param x: input passed into the network
#return layer of output after applying final spiking neuron
"""
def forward(self, x):
num_steps = 25
# Initialize hidden states at t = 0
mem1 = self.lif1.init_leaky()
mem2 = self.lif2.init_leaky()
# Record the final layer
spk2_rec = []
mem2_rec = []
for step in range(num_steps):
cur1 = self.fc1(x)
spk1, mem1 = self.lif1(cur1, mem1)
cur2 = self.fc2(spk1)
spk2, mem2 = self.lif2(cur2, mem2)
spk2_rec.append(spk2)
mem2_rec.append(mem2)
return torch.stack(spk2_rec, dim=0), torch.stack(mem2_rec, dim=0)
batch_size = 2
train = torch.rand(128, 21)
test = torch.rand(128, 21)
train_loader = DataLoader(dataset=train, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test, batch_size=batch_size, shuffle=True)
net = SpikingNeuralNetwork(number_inputs=1)
loss_function = nn.CrossEntropyLoss()
optimizer = nn.optim.Adam(net.parameters(), lr=0.1)
def training_loop(net, train_loader, test_loader, dtype, device, optimizer):
num_epochs = 1
loss_history = []
test_loss_history = []
counter = 0
# Temporal dynamics
num_steps = 25
# Outer training loop
for epoch in range(num_epochs):
iter_counter = 0
train_batch = iter(train_loader)
# Minibatch training loop
for data, targets in train_batch:
data = data.to(device)
targets = targets.to(device)
# Forward pass
net.train()
print("Train")
print(data.size())
spk_rec, mem_rec = net(data.view(batch_size, -1))
# Initialize the loss and sum over time
loss_val = torch.zeros((1), dtype=dtype, device=device)
for step in range(num_steps):
loss_val += loss_function(mem_rec[step], targets.long().flatten().to(device))
# Gradient calculation and weight update
optimizer.zero_grad()
loss_val.backward()
optimizer.step()
# Store loss history for future plotting
loss_history.append(loss_val.item())
# Test set
with torch.no_grad():
net.eval()
test_data, test_targets = next(iter(test_loader))
test_data = test_data.to(device)
test_targets = test_targets.to(device)
# Test set forward pass
print("Test")
print(test_data.size())
test_spk, test_mem = net(test_data.view(batch_size, -1))
# Test set loss
test_loss = torch.zeros((1), dtype=dtype, device=device)
for step in range(num_steps):
test_loss += loss_function(test_mem[step], test_targets.long().flatten().to(device))
test_loss_history.append(test_loss.item())
# Print train/test loss and accuracy
if counter % 50 == 0:
train_printer(epoch, iter_counter, counter, loss_history, data, targets, test_data, test_targets)
counter = counter + 1
iter_counter = iter_counter + 1
return loss_history, test_loss_history
Your code works just fine on the MNIST dataset, so I think it might be a problem with how the DataLoader is being called. My guess is that the total dataset is not evenly divisible by your batch_size. If this is true, then you have two options:
Instead of spk_rec, mem_rec = net(data.view(batch_size, -1)), try spk_rec, mem_rec = net(data.flatten(1)) which preserves the first dimension of your data.
Alternatively, you may need to set drop_last=True in the DataLoader functions.

HuggingFace BertForMaskedLM: Expected input batch_size (3200) to match target batch_size (16)

I'm working on multiclass classification (Bengali-language sentiment analysis) with a pretrained Hugging Face (BertForMaskedLM) model.
When the error occurred, I knew I had to change the label (output) size to match the input, but I do not know how. I'm adding the code snippets below.
MAX_LEN = 200
BATCH_SIZE = 16
The pretrained models used:
from transformers import BertForMaskedLM, BertTokenizer, pipeline
model = BertForMaskedLM.from_pretrained("sagorsarker/bangla-bert-base")
tokenizer = BertTokenizer.from_pretrained("sagorsarker/bangla-bert-base")
Code to make the pytorch dataset:
class GPReviewDataset(Dataset):
    """Dataset pairing review texts with integer targets for a BERT tokenizer.

    Args:
        reviews: Indexable collection of review texts.
        targets: Indexable collection of labels, aligned with ``reviews``.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: Length every encoded review is padded or truncated to.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Encode the review at index ``item``.

        Returns:
            A dict with the raw ``review_text``, the flattened ``input_ids``
            and ``attention_mask`` tensors, and ``targets`` as a long tensor.
        """
        review = str(self.reviews[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            # encode_plus is asked for 'pt' tensors; flatten() turns each
            # returned tensor into a 1-D tensor per sample.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long),
        }
The input dimensions are:
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)
Which Outputs:
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
Training Class
def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
model = model.train() # tells your model that we are training
losses = []
correct_predictions = 0
for d in data_loader:
input_ids = d["input_ids"].to(device)
attention_mask = d["attention_mask"].to(device)
targets = d["targets"].to(device)
loss, logits = model(
input_ids=input_ids,
attention_mask=attention_mask,
labels = targets
)
#logits = classification scores befroe softmax
#loss = classification loss
logits = logits.view(-1, 28*28).detach().cpu().numpy()
label_ids = targets.to('cpu').numpy()
preds = np.argmax(logits, axis=1).flatten() #returns indices of maximum logit
targ = label_ids.flatten()
correct_predictions += np.sum(preds == targ)
losses.append(loss.item())
loss.backward() # performs backpropagation(computes derivates of loss w.r.t to parameters)
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) #clipping gradients so they dont explode
optimizer.step() #After gradients are computed by loss.backward() this makes the optimizer iterate over all parameters it is supposed to update and use internally #stored grad to update their values
scheduler.step() # this will make sure learning rate changes. If we dont provide this learning rate stays at initial value
optimizer.zero_grad() # clears old gradients from last step
return correct_predictions / n_examples, np.mean(losses)
Where the training Starts (Where the error triggers):
%%time
# standard block
# used accuracy as metric here
history = defaultdict(list)
best_acc = 0
for epoch in range(EPOCHS):
print(f'Epoch {epoch + 1}/{EPOCHS}')
print('-' * 10)
train_acc, train_loss = train_epoch(model, train_data_loader, optimizer, device, scheduler, len(df_train))
print(f'Train loss {train_loss} Accuracy {train_acc}')
val_acc, val_loss = eval_model(model, valid_data_loader, device, len(df_valid))
print(f'Val loss {val_loss} Accuracy {val_acc}')
print()
history['train_acc'].append(train_acc)
history['train_loss'].append(train_loss)
history['val_acc'].append(val_acc)
history['val_loss'].append(val_loss)
if val_acc > best_acc:
torch.save(model.state_dict(), 'best_model_state_a5.bin')
best_acc = val_acc
The error:
Epoch 1/5
----------
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-41-fb5a4d77ce37> in <module>()
----> 1 get_ipython().run_cell_magic('time', '', "# standard block\n# used accuracy as metric here\nhistory = defaultdict(list)\n\nbest_acc = 0\n\nfor epoch in range(EPOCHS):\n\n print(f'Epoch {epoch + 1}/{EPOCHS}')\n print('-' * 10)\n\n train_acc, train_loss = train_epoch(model, train_data_loader, optimizer, device, scheduler, len(df_train))\n\n print(f'Train loss {train_loss} Accuracy {train_acc}')\n\n val_acc, val_loss = eval_model(model, valid_data_loader, device, len(df_valid))\n\n print(f'Val loss {val_loss} Accuracy {val_acc}')\n print()\n\n history['train_acc'].append(train_acc)\n history['train_loss'].append(train_loss)\n history['val_acc'].append(val_acc)\n history['val_loss'].append(val_loss)\n\n if val_acc > best_acc:\n torch.save(model.state_dict(), 'best_model_state_a5.bin')\n best_acc = val_acc\n\n# We are storing state of best model indicated by highest validation accuracy")
8 frames
/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
2115 magic_arg_s = self.var_expand(line, stack_depth)
2116 with self.builtin_trap:
-> 2117 result = fn(magic_arg_s, cell)
2118 return result
2119
<decorator-gen-53> in time(self, line, cell, local_ns)
/usr/local/lib/python3.7/dist-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
186 # but it's overkill for just that one bit of state.
187 def magic_deco(arg):
--> 188 call = lambda f, *a, **k: f(*a, **k)
189
190 if callable(arg):
/usr/local/lib/python3.7/dist-packages/IPython/core/magics/execution.py in time(self, line, cell, local_ns)
1191 else:
1192 st = clock2()
-> 1193 exec(code, glob, local_ns)
1194 end = clock2()
1195 out = None
<timed exec> in <module>()
<ipython-input-39-948eefef2f8d> in train_epoch(model, data_loader, optimizer, device, scheduler, n_examples)
13 input_ids=input_ids,
14 attention_mask=attention_mask,
---> 15 labels = targets
16 )
17
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/transformers/models/bert/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, output_attentions, output_hidden_states, return_dict)
1327 if labels is not None:
1328 loss_fct = CrossEntropyLoss() # -100 index = padding token
-> 1329 masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1330
1331 if not return_dict:
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/loss.py in forward(self, input, target)
1119 def forward(self, input: Tensor, target: Tensor) -> Tensor:
1120 return F.cross_entropy(input, target, weight=self.weight,
-> 1121 ignore_index=self.ignore_index, reduction=self.reduction)
1122
1123
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction)
2822 if size_average is not None or reduce is not None:
2823 reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2824 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
2825
2826
ValueError: Expected input batch_size (3200) to match target batch_size (16).

PyTorch: Target 1 is out of bounds

I am new to Deep Learning and am wondering how to modify my model to fix this.
It says Target 1 is out of bounds, so which parameter should I change to make it work? When the output size is changed to 2, it works. However, the goal of the model is binary (2-class) classification. Also, when the output size is 2, the training loss becomes nan.
The data is a dataframe with shape (15958, 4) transformed into tensor format.
Sorry Split_NN is a class:
# SplitNN
# to protect privacy and split
class SplitNN:
    """Chain of model segments trained as one network across locations.

    Each segment's output is detached (and moved to the next segment's
    ``location`` when the two differ) before being fed forward, so gradients
    have to be handed back segment by segment in ``backward``.

    Args:
        models: Ordered list of model segments; each exposes ``.location``.
        optimizers: One optimizer per segment.
    """

    def __init__(self, models, optimizers):
        self.models = models
        self.optimizers = optimizers
        self.data = []
        self.remote_tensors = []

    def _detach_for(self, output, next_model):
        """Detach ``output`` and place it where ``next_model`` lives."""
        if output.location == next_model.location:
            return output.detach().requires_grad_()
        return output.detach().move(next_model.location).requires_grad_()

    def forward(self, x):
        """Run ``x`` through every segment and return the last segment's output."""
        data = []
        remote_tensors = []

        data.append(self.models[0](x))
        remote_tensors.append(self._detach_for(data[-1], self.models[1]))

        i = 1
        # Use self.models, not a module-level ``models`` global, so the class
        # works with whatever list it was constructed with.
        while i < (len(self.models) - 1):
            data.append(self.models[i](remote_tensors[-1]))
            remote_tensors.append(self._detach_for(data[-1], self.models[i + 1]))
            i += 1

        data.append(self.models[i](remote_tensors[-1]))

        self.data = data
        self.remote_tensors = remote_tensors
        return data[-1]

    def backward(self):
        """Pass gradients from each detached input back to the previous segment.

        Call this after ``loss.backward()`` has populated the gradient of the
        last detached tensor.
        """
        for i in range(len(self.models) - 2, -1, -1):
            # NOTE(review): .copy()/.move() are assumed to be the remote-tensor
            # API of the framework in use (plain torch tensors lack them).
            if self.remote_tensors[i].location == self.data[i].location:
                grads = self.remote_tensors[i].grad.copy()
            else:
                grads = self.remote_tensors[i].grad.copy().move(self.data[i].location)
            self.data[i].backward(grads)

    def zero_grads(self):
        """Reset the gradients held by every segment's optimizer."""
        for opt in self.optimizers:
            opt.zero_grad()

    def step(self):
        """Apply one optimizer step for every segment."""
        for opt in self.optimizers:
            opt.step()
Below are the codes:
Model setup: the model is a sequential deep learning model, in which I tried to use nn.Linear to generate a binary prediction.
torch.manual_seed(0)
# Define our model segments
input_size = 3
hidden_sizes = [128, 640]
output_size = 1
# original models
models = [
nn.Sequential(
nn.Linear(input_size, hidden_sizes[0]),
nn.ReLU(),
nn.Linear(hidden_sizes[0], hidden_sizes[1]),
nn.ReLU(),
),
nn.Sequential(nn.Linear(hidden_sizes[1], output_size), nn.LogSoftmax(dim=1)),
]
# Create optimisers for each segment and link to them
optimizers = [
optim.SGD(model.parameters(), lr=0.03,)
for model in models
]
Train model is here:
def train(x, target, splitNN):
#1) Zero our grads
splitNN.zero_grads()
#2) Make a prediction
pred = splitNN.forward(x)
#3) Figure out how much we missed by
criterion = nn.NLLLoss()
loss = criterion(pred, target)
#4) Backprop the loss on the end layer
loss.backward()
#5) Feed Gradients backward through the nework
splitNN.backward()
#6) Change the weights
splitNN.step()
return loss, pred
Finally the training part, also the part where problem happen:
The send function assigns the model to the nodes, because this setup simulates federated learning.
for i in range(epochs):
running_loss = 0
correct_preds = 0
total_preds = 0
for (data, ids1), (labels, ids2) in dataloader:
# Train a model
data = data.send(models[0].location)
data = data.view(data.shape[0], -1)
labels = labels.send(models[-1].location)
# Call model
loss, preds = train(data.float(), labels, splitNN)
# Collect statistics
running_loss += loss.get()
correct_preds += preds.max(1)[1].eq(labels).sum().get().item()
total_preds += preds.get().size(0)
print(f"Epoch {i} - Training loss: {running_loss/len(dataloader):.3f} - Accuracy: {100*correct_preds/total_preds:.3f}")
The error shows that the problem occurs at loss, preds = train(data.float(), labels, splitNN)
The actual error message:
During handling of the above exception, another exception occurred:
IndexError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
1836 .format(input.size(0), target.size(0)))
1837 if dim == 2:
-> 1838 ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
1839 elif dim == 4:
1840 ret = torch._C._nn.nll_loss2d(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
IndexError: Target 1 is out of bounds.
Please help me. Thank you

RuntimeError: one of the variables needed for gradient computation has been modified: is at version 2; expected version 1 instead

I'm trying the following Kaggle.
TL;DR: I want to classify a sequence (time-series) of measurements to 1 of K classes using LSTM.
I'm trying to overfit the model on 2 sequences:
My input is (B, N, M):
B : batch-size = 1
N : sequence-size = 128
M : num-of-feature = 14 (number of measurements in each timestamp)
My model is a very simple LSTM:
class LSTMClassifier(nn.Module):
    """Classify a whole sequence from the final hidden state of an LSTM.

    Args:
        in_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        out_dim: Number of classes.
        num_layers: Number of stacked LSTM layers. Defaults to 1 so the class
            can be built with three arguments, as the setup code does.
    """

    def __init__(self, in_dim, hidden_dim, out_dim, num_layers=1):
        super().__init__()
        self.in_dim = in_dim
        self.hidden_dim = hidden_dim
        self.out_dim = out_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(in_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, out_dim)

    def forward(self, x):
        """Return class scores of shape ``(batch, out_dim)``.

        Args:
            x: Batch-first input of shape ``(batch, seq_len, in_dim)``.
        """
        _, (final_hidden, _) = self.lstm(x)
        # final_hidden[-1] is the last layer's hidden state after the final step.
        last_layer_state = final_hidden[-1].reshape(-1, self.hidden_dim)
        return self.fc(last_layer_state)
And the train process is:
def train_lstm_model(model, data_loader, num_epochs, loss_cls, optimizer_cls, learning_rate):
start = time.time()
loss = loss_cls()
optimizer = optimizer_cls(model.parameters(), lr=learning_rate)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
for epoch in tqdm(range(num_epochs)):
hidden = (torch.zeros((1, data_loader.batch_size, model.hidden_dim), requires_grad=True).to(device),
torch.zeros(1, data_loader.batch_size, model.hidden_dim, requires_grad=True).to(device))
for i, (batch_x, batch_y) in enumerate(data_loader):
batch_x = batch_x.to(device).float()
batch_y = batch_y.to(device).long()
optimizer.zero_grad()
y_predicted, hidden = model(batch_x, hidden)
l = loss(y_predicted, batch_y)
l.backward()
optimizer.step()
# print(f'epoch {epoch+1}, batch {i+1}: loss = {l.item()} |',
# f'train accuracy: {eval_lstm_model(model, data_loader.dataset, hidden)}')
end = time.time()
print(f'Training took {end-start} seconds.')
And my setup code is:
loss_cls = nn.CrossEntropyLoss
optimizer_cls = torch.optim.SGD
hidden_dim = 100
model_lstm = LSTMClassifier(X_of.shape[-1], hidden_dim, len(np.unique(y_train)))
learning_rate = 0.01
num_epochs = 1000
train_lstm_model(model_lstm, overfit_loader, num_epochs, loss_cls, optimizer_cls, learning_rate)
The overfit_loader is a DataLoader which contains only 2 samples.
But the training process outputs the following error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-87-5f725d0ecc50> in <module>
27 learning_rate = 0.001
28 num_epochs = 100
---> 29 train_lstm_model(model_lstm, overfit_loader, num_epochs, loss_cls, optimizer_cls, learning_rate)
<ipython-input-86-ba60b3627f13> in train_lstm_model(model, data_loader, num_epochs, loss_cls, optimizer_cls, learning_rate, test_loader)
20 l = loss(y_predicted, batch_y)
21
---> 22 l.backward(retain_graph=True)
23 optimizer.step()
24
/usr/local/lib64/python3.6/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
219 retain_graph=retain_graph,
220 create_graph=create_graph)
--> 221 torch.autograd.backward(self, gradient, retain_graph, create_graph)
222
223 def register_hook(self, hook):
/usr/local/lib64/python3.6/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
130 Variable._execution_engine.run_backward(
131 tensors, grad_tensors_, retain_graph, create_graph,
--> 132 allow_unreachable=True) # allow_unreachable flag
133
134
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace
operation: [torch.cuda.FloatTensor [400, 14]] is at version 2; expected version 1 instead. Hint: the
backtrace further above shows the operation that failed to compute its gradient. The variable in question
was changed in there or anywhere later. Good luck!
EDIT: I've removed the loss printing and stopped re-using the hidden state, following #SzymonMaszke's comment, and the exception is gone, but there is still a problem: the loss does not converge below 0.7.
I'd like to get some help please,
Thanks!

'builtin_function_or_method' object has no attribute 'size'

optimizer = optim.SGD(model.parameters(), lr = lr)
criterion = nn.MSELoss()
valid_loss_min = np.Inf
def train(model, device, train_loader, optimizer, epoch):
model.train()
for batch_idx, (img_data32,img_data64, target) in enumerate(train_loader):
img_data32,img_data64, target = Variable(img_data32.to(device)),Variable(img_data64.to(device)), Variable(target.to(device))
optimizer.zero_grad() # Gradient Zeroing
output = model(img_data32,img_data64)
loss = criterion(output,target.view(1, -1).float)
loss.backward()
optimizer.step() # Update gradient
if(batch_idx+1)%150 == 0:
print("saving model ...")
torch.save(model.state_dict(),'{}/gdrive/My Drive/RA/HEVC-CU/model_MSEloss_Adam_0.001.pt'.format(LOAD_DIR))
if(batch_idx+1)%100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(img_data32), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
def validation(model, device, validation_loader,epoch):
global valid_loss_min,startTick
model.eval()
print("start validation...")
validation_loss = 0
correct = 0
label = []
for i in range(16):
label.append(str(i))
with torch.no_grad():
for img_data32,img_data64, target in validation_loader:
img_data32,img_data64, target = img_data32.to(device),img_data64.to(device), target.to(device)
output = model(img_data32,img_data64)
validation_loss = criterion(output,target.view(1,-1).float)
for i,single_pred in enumerate(output):
pred_0 = torch.argmax(single_pred[0:4])
pred_1 = torch.argmax(single_pred[4:8])
pred_2 = torch.argmax(single_pred[8:12])
pred_3 = torch.argmax(single_pred[12:16])
pred = str(int(pred_0)) + str(int(pred_1)) + str(int(pred_2)) + str(int(pred_3))
target_0 = int(target[i,0])
target_1 = int(target[i,1])
target_2 = int(target[i,2])
target_3 = int(target[i,3])
if str(pred[0]) == str(target_0):
correct += 1
if str(pred[1]) == str(target_1):
correct += 1
if str(pred[2]) == str(target_2):
correct += 1
if str(pred[3]) == str(target_3):
correct += 1
validation_loss = validation_loss*BATCH_SIZE/len(validation_loader.dataset)
timeSpan = time.clock() - startTick # Calculation takes time
print('EPOCH:{} Time used:{} Validation set: Average loss: {:.4f}'.format(epoch,str(timeSpan),validation_loss))
print('\nAccuracy: {}/{} ({:.2f}%)\n'.format(correct, len(validation_loader.dataset)*4, 100. * correct / len(validation_loader.dataset)/4))
if validation_loss < valid_loss_min:
valid_loss_min = validation_loss
print("saving model ...")
torch.save(model.state_dict(),'{}/gdrive/My Drive/RA/HEVC-CU/model_MSEloss_Adam_0.001.pt'.format(LOAD_DIR))
for epoch in range(1, EPOCHS + 1):
train(model, DEVICE, train_loader, optimizer, epoch)
validation(model, DEVICE, validation_loader,epoch)
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/loss.py:443: UserWarning: Using a target size (torch.Size([1, 16])) that is different to the input size (torch.Size([4, 16])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
return F.mse_loss(input, target, reduction=self.reduction)
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-147-c7589b926dae> in <module>()
1 for epoch in range(1, EPOCHS + 1):
----> 2 train(model, DEVICE, train_loader, optimizer, epoch)
3 validation(model, DEVICE, validation_loader,epoch)
3 frames
<ipython-input-144-a7cdc82e4f14> in train(model, device, train_loader, optimizer, epoch)
17 #loss = criterion(output[:,0:4], target[:,0].float)+criterion(output[:,4:8], target[:,1].float)+criterion(output[:,8:12], target[:,2].float)+criterion(output[:,12:16], target[:,3].float)
18 #loss = criterion(output,target.view(1,-1))
---> 19 loss = criterion(output,target.view(1, -1))
20 loss.backward()
21 optimizer.step() # Update gradient
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
491 result = self._slow_forward(*input, **kwargs)
492 else:
--> 493 result = self.forward(*input, **kwargs)
494 for hook in self._forward_hooks.values():
495 hook_result = hook(self, input, result)
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/loss.py in forward(self, input, target)
441 #weak_script_method
442 def forward(self, input, target):
--> 443 return F.mse_loss(input, target, reduction=self.reduction)
444
445
/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in mse_loss(input, target, size_average, reduce, reduction)
2255 else:
2256 expanded_input, expanded_target = torch.broadcast_tensors(input, target)
-> 2257 ret = torch._C._nn.mse_loss(expanded_input, expanded_target, _Reduction.get_enum(reduction))
2258 return ret
2259
RuntimeError: Expected object of scalar type Float but got scalar type Long for argument #2 'target'
I was using the cross-entropy loss function before, but now I want to use MSE, and I get this error:
"AttributeError: 'builtin_function_or_method' object has no attribute 'size'"
I tried these as well:
loss = criterion(output,target.view(1,-1))
validation_loss = criterion(output,target.view(1,-1))
but that gives me this error:
RuntimeError: Expected object of scalar type Float but got scalar type Long for argument #2 'target'
I would appreciate it if anyone could help me solve this.