DQN model (Game: Atari PongNoFrameskip) does not learn - deep-learning

I'm trying to implement a DQN model for the game of Pong. However, the agent still behaves essentially randomly even after about 1000 episodes; the CNN training does not seem to improve it.
Here is my main code:
I create a CNN with three convolution layers, each followed by pooling, and three fully connected layers. The number of input channels is the number of stacked pre-processed frames (the observation goes from (3, 210, 160) to (4, 84, 84), i.e. 4 channels):
class CNN(nn.Module):
    def __init__(self, s_channels, a_space):
        super(CNN, self).__init__()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=1)
        self.conv1 = nn.Conv2d(s_channels, out_channels=32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, 2)
        self.conv3 = nn.Conv2d(64, 64, 3, 1)
        self.fc1 = nn.Linear(64 * 4 * 4, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, a_space)

    def forward(self, input):
        output = self.pool(F.relu(self.conv1(input)))
        output = self.pool(F.relu(self.conv2(output)))
        output = self.pool(F.relu(self.conv3(output)))
        output = output.view(-1, 64 * 4 * 4)
        output = F.relu(self.fc1(output))
        output = F.relu(self.fc2(output))
        output = F.relu(self.fc3(output))
        return output
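To double-check the dimensions above, here is a quick sanity check with a dummy input (Pong's 6 actions used only as a placeholder for a_space); the view(-1, 64*4*4) in forward() would raise an error if the conv/pool stack did not end at 4×4:

import torch as th

net = CNN(s_channels=4, a_space=6)   # 6 = Pong's action space, used here only as a placeholder
dummy = th.zeros(1, 4, 84, 84)       # one stacked observation
print(net(dummy).shape)              # expected: torch.Size([1, 6])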
After that, I construct an agent class with an action-selection function and a CNN training function. In the training function I compute the loss from the whole batch at once instead of iterating over the batch sample by sample in a for loop. Before computing the loss and back-propagating, I convert the sampled transitions into batched tensors. Here is the agent class:
class Agent():
    def __init__(self, s_space, a_space, device) -> None:
        # set GPU device to cuda
        self.device = device
        # define parameters
        self.epsilon = 1.0
        self.min_epsilon = 0.01
        self.dr = 0.995
        self.lr = 0.001
        self.gamma = 0.9
        # define models
        self.evl_net = CNN(s_space, a_space).to(self.device)
        self.tgt_net = CNN(s_space, a_space).to(self.device)
        self.cert = nn.SmoothL1Loss()
        self.optimal = th.optim.Adam(self.evl_net.parameters(), lr=self.lr)
        # define memory store
        self.memory = deque(maxlen=2000)

    # pre-process the input image data
    def data_pre_process(self, batch_size):
        s_v = []
        a_v = []
        next_s_v = []
        r_v = []
        dones = []
        materials = random.sample(self.memory, batch_size)
        for t in materials:
            s_v.append(t[0])
            a_v.append(t[1])
            next_s_v.append(t[2])
            r_v.append(t[3])
            dones.append(t[4])
        s_v = th.Tensor(s_v).to(self.device)
        a_v = th.LongTensor(a_v).unsqueeze(1).to(self.device)
        r_v = th.FloatTensor(r_v).to(device)
        # print(r_v.shape)
        return s_v, a_v, next_s_v, r_v, dones

    # record the transformed images
    def record(self, tpl):
        self.memory.append(tpl)

    # select actions according to the states (input images with 4 channels)
    def select(self, state, a_space):
        actions = self.evl_net(state).data.tolist()
        if(random.random() <= self.epsilon):
            action = random.randint(0, a_space - 1)
        else:
            action = actions.index(max(actions))
        return action

    # save CNN model
    def save(self):
        th.save(self.evl_net.state_dict(), "./Pong.pth")

    # at the beginning load the saved CNN model
    def load(self, s_channels, a_space):
        self.evl_net = CNN(s_channels, a_space).to(self.device)
        self.evl_net.load_state_dict(th.load("./Pong.pth"))

    # DQN replay progression
    def train(self, state, batch_size):
        """
        s_v_size:      [batch_size,4,84,84] type: Tensor
        s_a_size:      [batch_size,1]       type: Tensor
        next_s_v_size: [batch_size,4,84,84] type: List
        r_v_size:      [1,batch_size]       type: Tensor
        dones_size:    [batch_size]         type: List
        """
        s_v, a_v, next_s_v, r_v, dones = self.data_pre_process(batch_size)
        self.tgt_net.load_state_dict(self.evl_net.state_dict())
        # create evl_Q_value tensor
        evl_Q_value = self.evl_net(s_v).gather(0, a_v)  # size: [batch_size,6].gather() -> [batch_size,1] Type: Tensor
        # correctly transform next_s_v into tensor:
        nonDone_index = th.LongTensor(tuple([i for i, x in enumerate(dones) if x != True])).to(self.device)
        tgt_Q_value = th.zeros(batch_size).to(device)
        true_next_s_v = list(filter((None).__ne__, next_s_v))  # pop the "None" elements
        true_next_s_v = th.FloatTensor(true_next_s_v).to(self.device)  # size: [notDone_batch_size,4,84,84]
        # print(true_next_s_v.shape)
        tgt = self.tgt_net(true_next_s_v).max(1)[0].detach()  # size [1,notDone_batch_size] Type: Tensor
        # print(tgt.shape)
        # update tgt_Q_value
        tgt_Q_value[nonDone_index] = tgt
        tgt_Q_value = r_v + self.gamma * tgt_Q_value
        tgt_Q_value = tgt_Q_value.reshape(batch_size, 1)  # size: [batch_size, 1] cannot be back propagated
        # print(tgt_Q_value)
        self.optimal.zero_grad()
        loss = self.cert(evl_Q_value, tgt_Q_value)
        loss.backward()
        # constrain the gradient from explosion
        for p in self.evl_net.parameters():
            p.grad.data.clamp_(-1, 1)
        self.optimal.step()
        # decay epsilon
        if(self.epsilon > self.min_epsilon):
            self.epsilon *= self.dr
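For reference, the [batch_size, 1] shape mentioned in the docstring is what gather produces when it is applied along dim=1, picking one chosen action per row; here is a standalone toy example (not part of my code) showing that behaviour:

import torch as th

q_values = th.tensor([[0.1, 0.5, 0.2],
                      [0.7, 0.3, 0.9]])   # [batch_size=2, n_actions=3]
actions = th.tensor([[1], [2]])           # [batch_size, 1], one action index per sample
print(q_values.gather(1, actions))        # tensor([[0.5000], [0.9000]])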
In the main training loop, I increase the batch size from 32 to 64 to speed things up. The CNN is updated every four episodes, and statistics are printed every ten episodes.
# set GPU device to cuda
device = th.device("cuda:0" if th.cuda.is_available() else "cpu")
# set episode step and batch_size
episodes = 5000
batch_size = 32
env = gym.make("PongNoFrameskip-v4")
env = gym.wrappers.AtariPreprocessing(env, noop_max=30, frame_skip=4, screen_size=84,
                                      terminal_on_life_loss=True, grayscale_obs=True,
                                      grayscale_newaxis=False, scale_obs=False)
# create frame stack for the input image data (size: (4,84,84))
env = gym.wrappers.FrameStack(env, 4)
channels = env.observation_space.shape[0]
a_space = env.action_space.n
agent = Agent(channels, a_space, device)
agent.load(channels, a_space)
# testing start:
for e in range(episodes):
    # step 1: reset the agent at the beginning
    s = np.array(env.reset())
    img = plt.imshow(env.render('rgb_array'))
    done = False
    score = 0
    while not done:
        # step 2: iterate actions
        a = agent.select(th.Tensor(s).unsqueeze(0).to(device), a_space)
        next_s, reward, done, _ = env.step(a)
        if(done == True):
            reward = -1.0
            next_s = None
        else:
            next_s = np.array(next_s)
            # print(next_s.shape)
        # step 3: record the data into buffer
        dataset = (s, a, next_s, reward, done)
        agent.record(dataset)
        # step 4: update state steps
        s = next_s
        score += reward
    # step 5: training and update CNN by each 4 episodes
    if(len(agent.memory) > batch_size and e % 4 == 0):
        agent.train(channels, batch_size)
        agent.save()
    # appendix 1: at the beginning increase batch_size from 32 to 64
    if(batch_size < 64):
        batch_size += 1
    # appendix 2: return score by each 10 episodes
    if(e % 10 == 0 and len(agent.memory) > batch_size):
        print("episodes:", e, "score:", score, "epsilon: {:.2}".format(agent.epsilon))
No errors are raised while running, but the agent does not perform as expected. After 1000 episodes it still returns negative scores, just as it did at the very start. The output looks like this:
episodes: 800 score: -20.0 epsilon: 0.37
episodes: 810 score: -21.0 epsilon: 0.36
episodes: 820 score: -21.0 epsilon: 0.36
episodes: 830 score: -21.0 epsilon: 0.35
episodes: 840 score: -21.0 epsilon: 0.35
episodes: 850 score: -21.0 epsilon: 0.34
episodes: 860 score: -21.0 epsilon: 0.34
episodes: 870 score: -21.0 epsilon: 0.34
episodes: 880 score: -20.0 epsilon: 0.33
episodes: 890 score: -21.0 epsilon: 0.33
episodes: 900 score: -20.0 epsilon: 0.32
episodes: 910 score: -21.0 epsilon: 0.32
episodes: 920 score: -21.0 epsilon: 0.31
episodes: 930 score: -21.0 epsilon: 0.31
episodes: 940 score: -21.0 epsilon: 0.31
episodes: 950 score: -21.0 epsilon: 0.3
episodes: 960 score: -21.0 epsilon: 0.3
episodes: 970 score: -21.0 epsilon: 0.3
episodes: 980 score: -21.0 epsilon: 0.29
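As a rough consistency check, the printed epsilon values imply roughly one decay, i.e. one train() call, per four episodes:

import math

# epsilon starts at 1.0 and is multiplied by 0.995 once per train() call
decays = math.log(0.29) / math.log(0.995)
print(decays)   # ~247 decays over ~980 episodes, i.e. about one every 4 episodes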
I have rechecked the structure of the model against the algorithm as it is described in theory, but found no difference. I would appreciate any advice or help on how to deal with this problem.

Related

Why are all weights in my GRU close to zero after training?

Problem description:
I want to train a GRU using a small dataset, which has 40 training samples. The input sequence length is 3 and the output sequence length is 1. Below is the configuration:
hidden units: 128
input size(input feature num): 11
layer: 2
dropout: 0.5
bidirectional: False
lr: 0.0001
optimizer: Adam
batch size: 8
Below is my model. Thanks to Azhar Khan for helping me format the code.
class GRU(nn.Module):
    def __init__(self, GRU_input_num, GRU_hidden_num, GRU_layer_num, dropout, bidirectional, seed):
        super(GRU, self).__init__()
        torch.manual_seed(seed)
        self.GRU_input_num = GRU_input_num
        self.GRU_hidden_num = GRU_hidden_num
        self.GRU_layer_num = GRU_layer_num
        self.dropout = dropout
        self.bidirectional = bidirectional
        self.direction = 2 if self.bidirectional else 1
        if GRU_layer_num > 1:
            self.gru = nn.GRU(GRU_input_num, GRU_hidden_num, GRU_layer_num, dropout=dropout, batch_first=True,
                              bidirectional=bidirectional)
        else:
            self.gru = nn.GRU(GRU_input_num, GRU_hidden_num, GRU_layer_num, batch_first=True,
                              bidirectional=bidirectional)
        self.predict = nn.Linear(in_features=self.GRU_hidden_num * self.direction * self.GRU_layer_num,
                                 out_features=constants.CATEGORY_NUM * constants.BACTERIA_PER_CATEGORY)

    def forward(self, x):
        """GRU forward. Input shape: [batch, input_step, features]"""
        # shape of hidden state: [layer * direction, batch, hidden]
        batch = x.shape[0]
        _, hidden_state = self.gru(x)
        hidden_state = hidden_state.permute(1, 0, 2)  # [batch, layer * direction, hidden]
        hidden_state = hidden_state.contiguous().view(batch, 1, self.GRU_hidden_num * self.direction * self.GRU_layer_num).squeeze(1)  # [batch, layer * direction * hidden_num]
        return self.predict(hidden_state), hidden_state.cpu().detach().numpy()  # [batch, features]
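For reference, here is a quick shape walk-through of the forward() above with dummy data matching my configuration (the final Linear layer is omitted, since its out_features depends on my constants):

import torch

batch, steps, features = 8, 3, 11
hidden, layers = 128, 2
gru = torch.nn.GRU(features, hidden, layers, dropout=0.5, batch_first=True)
x = torch.randn(batch, steps, features)
_, hidden_state = gru(x)
print(hidden_state.shape)                                  # (layers * direction, batch, hidden) = (2, 8, 128)
flat = hidden_state.permute(1, 0, 2).contiguous().view(batch, -1)
print(flat.shape)                                          # (8, 256): what self.predict receives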
I am curious about how the hidden state changes during training. Here are my observations.
My observations:
Heatmap of hidden state.
I printed the heatmap of the hidden states every 25 epochs. The x axis is the sample index within one batch, the y axis is the hidden state dimension, and the title is the sum of all hidden states. The figures are shown below.
after 25 epochs
after 75 epochs
after 150 epochs
after 300 epochs
after 550 epochs
The sum keeps shrinking as training progresses.
Weights of GRU.
I also had a glance at the GRU weights; below is a snapshot of part of them.
snapshot of GRU weights
Almost all the weights end with 'e-40' (or e-41, etc.), which means every single weight is close to zero.
My question:
Is this a normal phenomenon? If not, what could be the cause of this issue?
Thanks to anyone who can give me some comments on this, and do let me know what other information you need.

Deep learning ResNet problem: calculating the confusion matrix and other metrics

I am new to deep learning.
I am running code to train and test a model and to find its precision, recall, F1-score, support and confusion matrix.
Please see the code and tell me whether I am computing the F1 score and the other metrics correctly. My accuracy is 0.97.
I am not sure whether:
have I taken the right predictions
have I computed the right confusion matrix.
Please guide me on whether the confusion matrix is OK or not.
input_shape = (128, 128, 3)
batch_size = 64
epochs = 10
epoch_list = list(range(1, epochs + 1))

# Path to training & testing set.
train_dir = 'train'
test_dir = 'test'
train_dir_fake, test_dir_fake = os.path.join(train_dir, 'forged'), os.path.join(test_dir, 'forged')
train_dir_real, test_dir_real = os.path.join(train_dir, 'real'), os.path.join(test_dir, 'real')
train_fake_fnames, test_fake_fnames = os.listdir(train_dir_fake), os.listdir(test_dir_fake)
train_real_fnames, test_real_fnames = os.listdir(train_dir_real), os.listdir(test_dir_real)

# Training Data Generator.
train_datagen = ImageDataGenerator(rescale=1./255.)
# Testing Data Generator.
test_datagen = ImageDataGenerator(rescale=1./255.)

# Flow training images in batches of 64 using train_datagen generator
train_generator = train_datagen.flow_from_directory(train_dir,
                                                    target_size=(128, 128),
                                                    batch_size=batch_size,
                                                    shuffle='False',
                                                    class_mode='binary')
# Flow test images in batches of 64 using test_datagen generator
test_generator = test_datagen.flow_from_directory(test_dir,
                                                  target_size=(128, 128),
                                                  batch_size=batch_size,
                                                  shuffle='False',
                                                  class_mode='binary')

ResNet50V2_model = ResNet50V2(input_shape=input_shape, include_top=False, weights="imagenet", classes=2)
for i in range(50):
    l = ResNet50V2_model.get_layer(index=i)
    l.trainable = True

model = Sequential()
model.add(ResNet50V2_model)
model.add(GlobalAveragePooling2D())
model.add(Dense(units=1, activation='sigmoid'))

# Compiling the Model.
model.compile(loss='binary_crossentropy',
              optimizer=optimizers.Adam(learning_rate=1e-6, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0),
              metrics=['accuracy'])

reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, mode='auto')
early_stopping = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=5, verbose=0, mode='auto')

# Starting the Training.
history = model.fit(train_generator, epochs=epochs, validation_data=test_generator)

# Storing the model.
network_name = "ResNet50V2"
try:
    os.mkdir("./Reference_Data")
    os.mkdir("./Reference_Data/Graphs")
    os.mkdir("./Reference_Data/Summary")
    os.mkdir("./Reference_Data/Model")
except OSError:
    pass
try:
    os.mkdir(os.path.join("./Reference_Data/Graphs", network_name))
except OSError:
    pass
!dir

acc = np.linspace(min(epoch_list), max(epoch_list), 200)
val_acc = np.linspace(min(epoch_list), max(epoch_list), 200)
# define spline for training accuracy
spl1 = make_interp_spline(epoch_list, history.history['accuracy'], k=3)
y_smooth1 = spl1(acc)
# define spline for validation accuracy
spl2 = make_interp_spline(epoch_list, history.history['val_accuracy'], k=3)
y_smooth2 = spl2(val_acc)

with open("./Reference_Data/Summary/" + network_name + "summary.txt", 'w+') as f:
    model.summary(print_fn=lambda x: f.write(x + '\n'))

# Saving the Model for Inference Purpose.
model.save('./Reference_Data/Model/' + network_name + '/')
model.save('./Reference_Data/Model/' + network_name + '/' + network_name + '.h5')

test_generator.reset()
Y_pred = model.predict(test_generator)
classes = test_generator.classes[test_generator.index_array]
y_pred = np.argmax(Y_pred, axis=-1)
y_pred = y_pred.round()
sum(y_pred == classes) / 10000
pred = model.predict(test_generator, verbose=1)

def get_classification_report(
        model, data_dir, batch_size=64,
        steps=None, threshold=0.5, output_dict=False
):
    data = get_test_data_generator(data_dir, batch_size=batch_size)
    predictions = predict(model, data, steps, threshold)
    predictions = predictions.reshape((predictions.shape[0],))
    return classification_report(data.classes, predictions, output_dict=output_dict)

import sklearn.metrics as metrics
#y_pred = np.argmax(y_pred,axis=0)
#y_true=np.argmax(test_generator.classes,axis=0)
report = metrics.classification_report(true_classes, Y_pred.round(), target_names=class_labels, zero_division=0.0)
print(report)
              precision    recall  f1-score   support

      forged       0.40      0.40      0.40       773
        real       0.60      0.60      0.60      1172

    accuracy                           0.52      1945
   macro avg       0.50      0.50      0.50      1945
weighted avg       0.52      0.52      0.52      1945
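For reference, this is the kind of call I think I should be making for a single sigmoid output (thresholding instead of argmax, and assuming the generator really is not shuffled so that test_generator.classes lines up with the predictions), but I am not sure it is right:

from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

probs = model.predict(test_generator).ravel()   # probabilities in [0, 1], one per image
y_pred = (probs >= 0.5).astype(int)             # threshold the sigmoid output
y_true = test_generator.classes                 # assumes shuffle=False (boolean) in the generator
print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred, target_names=list(test_generator.class_indices)))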

Emotion Detection with FER2013 and MPI dataset using CNN and TFLearn

I am just a beginner in deep learning, so I try to capture all the details.
The perplex class is derived from the MPI dataset; the other emotions are derived from FER2013. To balance the dataset, every emotion was brought to a (training: 3171, validation: 816) split, due to the lack of perplex data.
Dataset size:

        perplex  happy  sad   neutral  angry
train   3171     3171   3171  3171     3171
test    816      816    816   816      815
FER2013 source: downsized version of
https://www.kaggle.com/msambare/fer2013
MPI source (only camera angles 2, 3 & 4, of all actors' emotions such as Clueless, Confusion and Thinking):
https://www.b-tu.de/en/graphic-systems/databases/the-small-mpi-facial-expression-database
https://www.b-tu.de/fg-graphische-systeme/datenbanken/die-grosse-mpi-gesichtsausdrueckedatenbank
Preprocessing steps:
First, all 3171 × 5 = 15855 training images and the 4079 validation images are converted to 48×48 grayscale.
Samples:
Sample dataset for Angry, Happy, Perplex and Sad
CNN architecture using TFLearn:
# Input Layer
convnet = input_data(name="input", shape=[None, 48, 48, 1])
# Enabling Filters
convnet = conv_2d(convnet, 32, 5, activation="relu")
convnet = max_pool_2d(convnet, 5)
convnet = conv_2d(convnet, 64, 5, activation="relu")
convnet = max_pool_2d(convnet, 5)
convnet = conv_2d(convnet, 128, 5, activation="relu")
convnet = max_pool_2d(convnet, 5)
convnet = conv_2d(convnet, 64, 5, activation="relu")
convnet = max_pool_2d(convnet, 5)
convnet = conv_2d(convnet, 32, 5, activation="relu")
convnet = max_pool_2d(convnet, 5)
convnet = fully_connected(convnet, 1024, activation="relu")
convnet = dropout(convnet, 0.5)
# Output Layer
convnet = fully_connected(convnet, 5, activation="softmax")
convnet = regression(convnet, optimizer="SGD", learning_rate=0.001, loss="categorical_crossentropy", name="targets")

model = tflearn.DNN(convnet,
                    best_checkpoint_path=best_cp_path + '/',
                    best_val_accuracy="BEST_VAL_ACCURACY",
                    tensorboard_dir=log_path,
                    tensorboard_verbose=3)

image_size = 48
channel = 1
# Split for training and testing
train_x = np.array([index[0] for index in train]).reshape(-1, image_size, image_size, channel)
train_y = np.array([index[1] for index in train])
test_x = np.array([index[0] for index in test]).reshape(-1, image_size, image_size, channel)
test_y = np.array([index[1] for index in test])

model.fit(
    train_x,
    train_y,
    validation_set=(test_x, test_y),
    n_epoch=500,
    snapshot_step=500,
    show_metric=True,
    run_id="ED_SGD-0.001",
    snapshot_epoch=True
)
I shuffle all the training and validation files before splitting them into train and test inputs.
Training starts overfitting after 150+ epochs out of 500. I am using raw SGD without momentum or learning-rate decay; I also tried Adam and hit the same overfitting issue. val_accuracy is 0.62 and train_accuracy is 0.8+.
Early saving method: I save the 0.62-accuracy model files in the model folder.
Odd things on my mind:
If you look closely, all perplex images have a black background, because they were taken in a lab environment. The other emotions come from FER2013, which was collected in the wild, with different shades of grey in the background and some black as well.
How do I overcome this overfitting issue? (One option I am considering is sketched after these questions.)
Which hyper-parameter values should I tune?
Should I upscale to 7000+ images per class, as in the FER2013 dataset?
Should I randomly apply different grey-shade backgrounds to the perplex images?
How can I increase the accuracy?
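Regarding the overfitting question above, one built-in lever I am considering (only a sketch; I have not verified that it helps on this data) is TFLearn's image augmentation attached to the same input_data layer:

import tflearn
from tflearn.layers.core import input_data

img_aug = tflearn.ImageAugmentation()
img_aug.add_random_flip_leftright()          # mirror faces horizontally
img_aug.add_random_rotation(max_angle=10.0)  # small random rotations
img_aug.add_random_blur(sigma_max=2.0)       # mild blur

convnet = input_data(name="input", shape=[None, 48, 48, 1], data_augmentation=img_aug)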
Loss curve:
Training & Validation Curve
Last epoch values (taken from the TensorBoard graph; some values I did not save from the terminal):
Training Step: 17200+ | total loss: 1.7+ | time:
| SGD | epoch: 500 | **loss: 0.5236** - acc: 0.8270 | val_loss: 1.30219 - val_acc: 0.6239 -- iter: 15855/15855
GitHub (you are warmly welcome):
https://github.com/tcsbmogarage/ED.git

HuggingFace BertForMaskedLM: Expected input batch_size (3200) to match target batch_size (16)

I'm working on multiclass classification (Bengali sentiment analysis) with a pretrained Hugging Face model (BertForMaskedLM).
When the error occurred I understood that I have to change the label (output) size to match the input, but I do not know how. I am adding the code snippets below.
MAX_LEN = 200
BATCH_SIZE = 16
The pretrained models used:
from transformers import BertForMaskedLM, BertTokenizer, pipeline
model = BertForMaskedLM.from_pretrained("sagorsarker/bangla-bert-base")
tokenizer = BertTokenizer.from_pretrained("sagorsarker/bangla-bert-base")
Code to make the pytorch dataset:
class GPReviewDataset(Dataset):
    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = str(self.reviews[item])
        target = self.targets[item]
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }
The input dimensions are:
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)
Which Outputs:
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
Training function:
def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
    model = model.train()  # tells your model that we are training
    losses = []
    correct_predictions = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)
        loss, logits = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=targets
        )
        # logits = classification scores before softmax
        # loss = classification loss
        logits = logits.view(-1, 28*28).detach().cpu().numpy()
        label_ids = targets.to('cpu').numpy()
        preds = np.argmax(logits, axis=1).flatten()  # returns indices of maximum logit
        targ = label_ids.flatten()
        correct_predictions += np.sum(preds == targ)
        losses.append(loss.item())
        loss.backward()  # performs backpropagation (computes derivatives of the loss w.r.t. the parameters)
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # clipping gradients so they don't explode
        optimizer.step()  # after loss.backward() has computed the gradients, the optimizer updates all parameters it is responsible for using their stored grad
        scheduler.step()  # makes sure the learning rate changes; without this it stays at its initial value
        optimizer.zero_grad()  # clears old gradients from the last step
    return correct_predictions / n_examples, np.mean(losses)
Where the training starts (where the error triggers):
%%time
# standard block
# used accuracy as metric here
history = defaultdict(list)
best_acc = 0
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
    train_acc, train_loss = train_epoch(model, train_data_loader, optimizer, device, scheduler, len(df_train))
    print(f'Train loss {train_loss} Accuracy {train_acc}')
    val_acc, val_loss = eval_model(model, valid_data_loader, device, len(df_valid))
    print(f'Val loss {val_loss} Accuracy {val_acc}')
    print()
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    if val_acc > best_acc:
        torch.save(model.state_dict(), 'best_model_state_a5.bin')
        best_acc = val_acc
The error:
Epoch 1/5
----------
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-41-fb5a4d77ce37> in <module>()
----> 1 get_ipython().run_cell_magic('time', '', "# standard block\n# used accuracy as metric here\nhistory = defaultdict(list)\n\nbest_acc = 0\n\nfor epoch in range(EPOCHS):\n\n print(f'Epoch {epoch + 1}/{EPOCHS}')\n print('-' * 10)\n\n train_acc, train_loss = train_epoch(model, train_data_loader, optimizer, device, scheduler, len(df_train))\n\n print(f'Train loss {train_loss} Accuracy {train_acc}')\n\n val_acc, val_loss = eval_model(model, valid_data_loader, device, len(df_valid))\n\n print(f'Val loss {val_loss} Accuracy {val_acc}')\n print()\n\n history['train_acc'].append(train_acc)\n history['train_loss'].append(train_loss)\n history['val_acc'].append(val_acc)\n history['val_loss'].append(val_loss)\n\n if val_acc > best_acc:\n torch.save(model.state_dict(), 'best_model_state_a5.bin')\n best_acc = val_acc\n\n# We are storing state of best model indicated by highest validation accuracy")
8 frames
/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
2115 magic_arg_s = self.var_expand(line, stack_depth)
2116 with self.builtin_trap:
-> 2117 result = fn(magic_arg_s, cell)
2118 return result
2119
<decorator-gen-53> in time(self, line, cell, local_ns)
/usr/local/lib/python3.7/dist-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
186 # but it's overkill for just that one bit of state.
187 def magic_deco(arg):
--> 188 call = lambda f, *a, **k: f(*a, **k)
189
190 if callable(arg):
/usr/local/lib/python3.7/dist-packages/IPython/core/magics/execution.py in time(self, line, cell, local_ns)
1191 else:
1192 st = clock2()
-> 1193 exec(code, glob, local_ns)
1194 end = clock2()
1195 out = None
<timed exec> in <module>()
<ipython-input-39-948eefef2f8d> in train_epoch(model, data_loader, optimizer, device, scheduler, n_examples)
13 input_ids=input_ids,
14 attention_mask=attention_mask,
---> 15 labels = targets
16 )
17
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/transformers/models/bert/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, output_attentions, output_hidden_states, return_dict)
1327 if labels is not None:
1328 loss_fct = CrossEntropyLoss() # -100 index = padding token
-> 1329 masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1330
1331 if not return_dict:
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/loss.py in forward(self, input, target)
1119 def forward(self, input: Tensor, target: Tensor) -> Tensor:
1120 return F.cross_entropy(input, target, weight=self.weight,
-> 1121 ignore_index=self.ignore_index, reduction=self.reduction)
1122
1123
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction)
2822 if size_average is not None or reduce is not None:
2823 reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2824 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
2825
2826
ValueError: Expected input batch_size (3200) to match target batch_size (16).
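To illustrate where the 3200 in the error comes from: BertForMaskedLM scores every token against the whole vocabulary, so flattening the scores gives batch_size × seq_len rows, while my labels tensor has only batch_size entries. A toy example (the vocabulary size here is just a placeholder):

import torch

batch_size, seq_len, vocab_size = 16, 200, 30000        # vocab_size is a placeholder
prediction_scores = torch.randn(batch_size, seq_len, vocab_size)
labels = torch.zeros(batch_size, dtype=torch.long)       # one class label per review

print(prediction_scores.view(-1, vocab_size).shape[0])   # 3200 = 16 * 200
print(labels.view(-1).shape[0])                          # 16 -> the mismatch in the error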

RuntimeError: one of the variables needed for gradient computation has been modified: is at version 2; expected version 1 instead

I'm trying the following Kaggle challenge.
TL;DR: I want to classify a sequence (time series) of measurements into 1 of K classes using an LSTM.
I'm trying to overfit the model on 2 sequences:
My input is (B, N, M):
B : batch-size = 1
N : sequence-size = 128
M : num-of-feature = 14 (number of measurements in each timestamp)
My model is a very simple LSTM:
class LSTMClassifier(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim, num_layers):
        super(LSTMClassifier, self).__init__()
        self.in_dim = in_dim
        self.hidden_dim = hidden_dim
        self.out_dim = out_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(in_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, out_dim)

    def forward(self, x):
        lstm_out, (ht, ct) = self.lstm(x)
        y = self.fc(ht[-1].reshape(-1, self.hidden_dim))
        return y
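For reference, a quick shape check of this model with dummy data matching B=1, N=128, M=14 (out_dim=5 is just a placeholder here, since K depends on the dataset):

import torch

m = LSTMClassifier(in_dim=14, hidden_dim=100, out_dim=5, num_layers=1)
x = torch.randn(1, 128, 14)   # (B, N, M), batch_first=True
lstm_out, (ht, ct) = m.lstm(x)
print(lstm_out.shape)         # torch.Size([1, 128, 100]): output for every timestep
print(ht.shape)               # torch.Size([1, 1, 100]):   (num_layers, B, hidden)
print(m(x).shape)             # torch.Size([1, 5]):        class scores from the last layer's ht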
And the train process is:
def train_lstm_model(model, data_loader, num_epochs, loss_cls, optimizer_cls, learning_rate):
    start = time.time()
    loss = loss_cls()
    optimizer = optimizer_cls(model.parameters(), lr=learning_rate)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    for epoch in tqdm(range(num_epochs)):
        hidden = (torch.zeros((1, data_loader.batch_size, model.hidden_dim), requires_grad=True).to(device),
                  torch.zeros(1, data_loader.batch_size, model.hidden_dim, requires_grad=True).to(device))
        for i, (batch_x, batch_y) in enumerate(data_loader):
            batch_x = batch_x.to(device).float()
            batch_y = batch_y.to(device).long()
            optimizer.zero_grad()
            y_predicted, hidden = model(batch_x, hidden)
            l = loss(y_predicted, batch_y)
            l.backward()
            optimizer.step()
            # print(f'epoch {epoch+1}, batch {i+1}: loss = {l.item()} |',
            #       f'train accuracy: {eval_lstm_model(model, data_loader.dataset, hidden)}')
    end = time.time()
    print(f'Training took {end-start} seconds.')
And my setup code is:
loss_cls = nn.CrossEntropyLoss
optimizer_cls = torch.optim.SGD
hidden_dim = 100
model_lstm = LSTMClassifier(X_of.shape[-1], hidden_dim, len(np.unique(y_train)))
learning_rate = 0.01
num_epochs = 1000
train_lstm_model(model_lstm, overfit_loader, num_epochs, loss_cls, optimizer_cls, learning_rate)
The overfit_loader is a DataLoader which contains only 2 samples.
But the training process outputs the following error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-87-5f725d0ecc50> in <module>
27 learning_rate = 0.001
28 num_epochs = 100
---> 29 train_lstm_model(model_lstm, overfit_loader, num_epochs, loss_cls, optimizer_cls, learning_rate)
<ipython-input-86-ba60b3627f13> in train_lstm_model(model, data_loader, num_epochs, loss_cls, optimizer_cls, learning_rate, test_loader)
20 l = loss(y_predicted, batch_y)
21
---> 22 l.backward(retain_graph=True)
23 optimizer.step()
24
/usr/local/lib64/python3.6/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
219 retain_graph=retain_graph,
220 create_graph=create_graph)
--> 221 torch.autograd.backward(self, gradient, retain_graph, create_graph)
222
223 def register_hook(self, hook):
/usr/local/lib64/python3.6/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
130 Variable._execution_engine.run_backward(
131 tensors, grad_tensors_, retain_graph, create_graph,
--> 132 allow_unreachable=True) # allow_unreachable flag
133
134
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace
operation: [torch.cuda.FloatTensor [400, 14]] is at version 2; expected version 1 instead. Hint: the
backtrace further above shows the operation that failed to compute its gradient. The variable in question
was changed in there or anywhere later. Good luck!
EDIT: I removed the loss printing and stopped re-using the hidden state, as @SzymonMaszke suggested in a comment, and the exception is gone, but there is still a problem: the loss does not converge below 0.7.
I'd like to get some help, please.
Thanks!