I have a model training and I got this plot. It is over audio (about 70K of around 5-10s) and no augmentation is being done. I have tried the following to avoid overfitting:
Reduce complexity of the model by reducing number of GRU cells and hidden dimensions.
Add dropout in each layer.
I have tried with higher dataset.
What I am not sure is if my calculation of training loss and validation loss is correct. It is something like this. I am using drop_last=True and I am using the CTC loss criterion.
train_data_len = len(train_loader.dataset)
valid_data_len = len(valid_loader.dataset)
epoch_train_loss = 0
epoch_val_loss = 0
train_losses = []
valid_losses = []
model.train()
for e in range(n_epochs):
t0 = time.time()
#batch loop
running_loss = 0.0
for batch_idx, _data in enumerate(train_loader, 1):
# Calculate output ...
# bla bla
loss = criterion(output, labels.float(), input_lengths, label_lengths)
loss.backward()
optimizer.step()
scheduler.step()
# loss stats
running_loss += loss.item() * specs.size(0)
t_t = time.time() - t0
######################
# validate the model #
######################
with torch.no_grad():
model.eval()
tv = time.time()
running_val_loss = 0.0
for batch_idx_v, _data in enumerate(valid_loader, 1):
#bla, bla
val_loss = criterion(output, labels.float(), input_lengths, label_lengths)
running_val_loss += val_loss.item() * specs.size(0)
print("Epoch {}: Training took {:.2f} [s]\tValidation took: {:.2f} [s]\n".format(e+1, t_t, time.time() - tv))
epoch_train_loss = running_loss / train_data_len
epoch_val_loss = running_val_loss / valid_data_len
train_losses.append(epoch_train_loss)
valid_losses.append(epoch_val_loss)
print('Epoch: {} Losses\tTraining Loss: {:.6f}\tValidation Loss: {:.6f}'.format(
e+1, epoch_train_loss, epoch_val_loss))
model.train()
Related
I've trained the GooogLeNet from scratch on the MNIST dataset. It achieved very good results (top-1 accuracy of 99% on test set).
Now I want to do transfer learning in order to adapt it to the FashionMNIST dataset. For that I'm doing the following:
# Loading trained model on MNIST
googlenet = torch.load('googlenet-mnist.pth')
# Freeze the network
def freeze(net):
for param in net.parameters():
param.requires_grad = False
return net
# Override all the Linear layers and initialize them
# (including the ones that produce auxiliarity logits)
def forget_FC(net):
net.aux1.fc1 = nn.Linear(in_features=net.aux1.fc1.in_features, out_features=net.aux1.fc1.out_features, bias=True)
net.aux1.fc2 = nn.Linear(in_features=net.aux1.fc2.in_features, out_features=net.aux1.fc2.out_features, bias=True)
net.aux2.fc1 = nn.Linear(in_features=net.aux2.fc1.in_features, out_features=net.aux2.fc1.out_features, bias=True)
net.aux2.fc2 = nn.Linear(in_features=net.aux2.fc2.in_features, out_features=net.aux2.fc2.out_features, bias=True)
# Override the classification layer
net.fc = nn.Sequential(
nn.Linear(num_in_features, num_in_features),
nn.Linear(num_in_features, num_in_features),
nn.Linear(num_in_features, 10))
# Initialize weights auxiliarity logits branches
torch.nn.init.trunc_normal_(net.aux1.fc1.weight, mean=0.0, std=0.01, a=-2, b=2)
torch.nn.init.trunc_normal_(net.aux1.fc2.weight, mean=0.0, std=0.01, a=-2, b=2)
torch.nn.init.trunc_normal_(net.aux2.fc1.weight, mean=0.0, std=0.01, a=-2, b=2)
torch.nn.init.trunc_normal_(net.aux2.fc2.weight, mean=0.0, std=0.01, a=-2, b=2)
# Initialize weights each Linear module in the classification layer
for module in net.fc.modules():
if isinstance(module, nn.Linear):
torch.nn.init.trunc_normal_(module.weight, mean=0.0, std=0.01, a=-2, b=2)
return net
# The training algorithm
def train(net, train_iter, test_iter, num_epochs, lr, device, plot_title, fine_tune=False):
"""Train a model with a GPU.
"""
# def init_weights(m):
# if type(m) == nn.Linear or type(m) == nn.Conv2d:
# nn.init.xavier_uniform_(m.weight)
# net.apply(init_weights)
print('training on', device)
progress = ""
net.to(device)
if fine_tune:
optimizer = torch.optim.SGD(net.fc.parameters(), lr=lr, momentum=.9)
optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=.9)
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=1, verbose=True)
loss = nn.CrossEntropyLoss()
animator = Animator(xlabel='epoch', xlim=[1, num_epochs], title=plot_title, ylim=[0, 1], figsize=(5,5),
legend=['train loss', 'train acc', 'val acc'])
timer, num_batches = d2l.Timer(), len(train_iter)
for epoch in range(num_epochs):
# Sum of training loss, sum of training accuracy, sum of top 5 training accuracy, no. of examples
metric = d2l.Accumulator(4)
net.train()
# Training
for i, (X, y) in enumerate(train_iter):
timer.start()
optimizer.zero_grad()
X, y = X.to(device), y.to(device)
# Mini-batch inference
y_hat = net(X)
# Take into account the auxiliarity logits (see link cell above)
if isinstance(y_hat, GoogLeNetOutputs):
aux_logit1, aux_logit2, y_hat = y_hat
l1 = loss(y_hat, y)
l2 = loss(aux_logit1, y)
l3 = loss(aux_logit2, y)
l = l1 + .3 * (l2 + l3)
else:
l = loss(y_hat, y)
l.backward()
optimizer.step()
# Training accuracies
with torch.no_grad():
acc_1, acc_5 = accuracy(y_hat, y)
metric.add(l * X.shape[0], acc_1, acc_5, X.shape[0])
timer.stop()
train_l = metric[0] / metric[3]
train_acc_1 = metric[1] / metric[3]
train_acc_5 = metric[2] / metric[3]
if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
animator.add(epoch + (i + 1) / num_batches,
(train_l, train_acc_1, None), plot_title)
# Validation, (validation loss computed when model in eval mode, is that correct?)
val_l, test_acc_1, test_acc_5 = evaluate_accuracy_gpu(net, test_iter)
scheduler.step(val_l)
animator.add(epoch + 1, (None, None, test_acc_1), plot_title)
# Un-comment to see memory consumption, modify batch size to see effects
# print(os.popen('nvidia-smi').read())
# break
progress += f"----\nEpoch {epoch}/{num_epochs}\n\ttrain loss={train_l}[{train_acc_1}]\tval loss={val_l} [{test_acc_1}]\n----"
print(progress)
print(f'loss={train_l:.3f}, train=[1-acc {train_acc_1:.3f}, 5-acc {train_acc_5:.3f}]'
f'test=[1-acc {test_acc_1:.3f}, 5-acc {test_acc_5:.3f}]')
print(f'{metric[3] * num_epochs / timer.sum():.1f} examples/sec '
f'on {str(device)}')
print(f'total training time: {timer.sum()} seconds')
With this approach 34% training accuracy is achieved. Honestly, I was expecting more close to results obtained in MNIST. What is wrong with my current approach?
I am a bit confused as to how to calculate Validation loss? Are validation loss to be computed at the end of an epoch OR should the loss be also monitored during iteration through the batches ?
Below I have computed using running_loss which is getting accumulated over batches - but I want to see if its the correct approach?
def validate(loader, model, criterion):
correct = 0
total = 0
running_loss = 0.0
model.eval()
with torch.no_grad():
for i, data in enumerate(loader):
inputs, labels = data
inputs = inputs.to(device)
labels = labels.to(device)
outputs = model(inputs)
loss = criterion(outputs, labels)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
running_loss = running_loss + loss.item()
mean_val_accuracy = (100 * correct / total)
mean_val_loss = ( running_loss )
#mean_val_accuracy = accuracy(outputs,labels)
print('Validation Accuracy: %d %%' % (mean_val_accuracy))
print('Validation Loss:' ,mean_val_loss )
Below is the training block I am using
def train(loader, model, criterion, optimizer, epoch):
correct = 0
running_loss = 0.0
i_max = 0
for i, data in enumerate(loader):
total_loss = 0.0
#print('batch=',i)
inputs, labels = data
inputs = inputs.to(device)
labels = labels.to(device)
optimizer.zero_grad()
outputs = model(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
running_loss += loss.item()
if i % 2000 == 1999:
print('[%d , %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2000))
running_loss = 0.0
print('finished training')
return mean_val_loss, mean_val_accuracy
You can evaluate your network on the validation when you want. It can be every epoch or if this is too costly because the dataset is huge it can be each N epoch.
What you did seems correct, you compute the loss of the whole validation set. You can optionally divide by its length in order to normalize the loss, so the scale will be the same if you increase the validation set one day.
I am fairly new to deep learning and neural networks. I recently built a facial emotions recognition classifier using the FER-2013 dataset. I am using the pretrained resnet-152 model for classification, but the accuracy of my model is very low, both training and validation accuracies. I am getting an accuracy of around 36%, which is not good. I suppose that using transfer learning, the accuracies should be high, why is it that im getting such a low accuracy. should I change the hyperparameters? here is my code.
model= models.resnet152(pretrained=True)
for param in model.parameters():
param.requires_grad= False
print(model)
from collections import OrderedDict
classifier= nn.Sequential(OrderedDict([
('fc1',nn.Linear(2048, 512)),
('relu', nn.ReLU()),
('dropout1', nn. Dropout(p=0.5)),
('fc2', nn.Linear(512, 7)),
('output', nn.LogSoftmax(dim=1))
]))
model.fc= classifier
print(classifier)
def train_model(model, criterion, optimizer, scheduler, num_epochs=10):
since= time.time()
best_model_wts= copy.deepcopy(model.state_dict())
best_acc= 0.0
for epoch in range(1, num_epochs + 1):
print('Epoch {}/{}'.format(epoch, num_epochs))
print('-' * 10)
for phase in ['train', 'validation']:
if phase == 'train':
scheduler.step()
model.train()
else:
model.eval()
running_loss= 0.0
running_corrects=0
for inputs, labels in dataloaders[phase]:
inputs, labels= inputs.to(device), labels.to(device)
optimizer.zero_grad()
with torch.set_grad_enabled(phase== 'train'):
outputs= model(inputs)
loss= criterion(outputs, labels)
_, preds= torch.max(outputs, 1)
if phase == 'train':
loss.backward()
optimizer.step()
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds== labels.data)
epoch_loss= running_loss / dataset_sizes[phase]
epoch_acc= running_corrects.double() / dataset_sizes[phase]
print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
if phase == 'validation' and epoch_acc > best_acc:
best_acc= epoch_acc
best_model_wts= copy.deepcopy(model.state_dict())
time_elapsed= time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
print('Best valid accuracy: {:4f}'.format(best_acc))
model.load_state_dict(best_model_wts)
return model
use_gpu= torch.cuda.is_available()
num_epochs= 10
if use_gpu:
print('Using GPU: '+ str(use_gpu))
model= model.cuda()
criterion= nn.NLLLoss()
optimizer= optim.SGD(model.fc.parameters(), lr = .0006, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
model_ft = train_model(model, criterion, optimizer, exp_lr_scheduler, num_epochs=10)
Can someone please guide me. I am a beginner at it, and I could really make use of some help in it.
preprocess the dataset.
Get more dataset as low accuracy could be a result of smaller dataset.
Try data-augmentation if you have less data.
Background: It's about loss jittering which generates at the beginning stage of every training epoch. When the dataloader loads the first batch data to feed into the network, the loss value always rises suddenly, then returns to normal from the second batch and continues to decline. The curve is so strange. I need your help!
for epoch in range(begin_epoch, end_epoch):
print('PROGRESS: %.2f%%' % (100.0 * epoch / end_epoch))
# set epoch as random seed of sampler while distributed training
if train_sampler is not None and hasattr(train_sampler, 'set_epoch'):
train_sampler.set_epoch(epoch)
# reset metrics
metrics.reset()
# set net to train mode
net.train()
# clear the paramter gradients
# optimizer.zero_grad()
# init end time
end_time = time.time()
if isinstance(lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
name, value = validation_monitor.metrics.get()
val = value[name.index(validation_monitor.host_metric_name)]
lr_scheduler.step(val, epoch)
# training
train_loader_iter = iter(train_loader)
for nbatch in range(total_size):
try:
batch = next(train_loader_iter)
except StopIteration:
print('reset loader .. ')
train_loader_iter = iter(train_loader)
batch = next(train_loader_iter)
global_steps = total_size * epoch + nbatch
os.environ['global_steps'] = str(global_steps)
# record time
data_in_time = time.time() - end_time
# transfer data to GPU
data_transfer_time = time.time()
batch = to_cuda(batch)
data_transfer_time = time.time() - data_transfer_time
# forward
forward_time = time.time()
outputs, loss = net(*batch)
loss = loss.mean()
if gradient_accumulate_steps > 1:
loss = loss / gradient_accumulate_steps
forward_time = time.time() - forward_time
# backward
backward_time = time.time()
if fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
backward_time = time.time() - backward_time
optimizer_time = time.time()
if (global_steps + 1) % gradient_accumulate_steps == 0:
# clip gradient
if clip_grad_norm > 0:
if fp16:
total_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
clip_grad_norm)
else:
total_norm = torch.nn.utils.clip_grad_norm_(net.parameters(),
clip_grad_norm)
if writer is not None:
writer.add_scalar(tag='grad-para/Total-Norm',
scalar_value=float(total_norm),
global_step=global_steps)
optimizer.step()
# step LR scheduler
if lr_scheduler is not None and not isinstance(lr_scheduler,
torch.optim.lr_scheduler.ReduceLROnPlateau):
lr_scheduler.step()
# clear the parameter gradients
optimizer.zero_grad()
optimizer_time = time.time() - optimizer_time
# update metric
metric_time = time.time()
metrics.update(outputs)
if writer is not None and nbatch % 50 == 0:
with torch.no_grad():
for group_i, param_group in enumerate(optimizer.param_groups):
writer.add_scalar(tag='Initial-LR/Group_{}'.format(group_i),
scalar_value=param_group['initial_lr'],
global_step=global_steps)
writer.add_scalar(tag='LR/Group_{}'.format(group_i),
scalar_value=param_group['lr'],
global_step=global_steps)
writer.add_scalar(tag='Train-Loss',
scalar_value=float(loss.item()),
global_step=global_steps)
name, value = metrics.get()
for n, v in zip(name, value):
if 'Logits' in n:
writer.add_scalar(tag='Train-Logits/' + n,
scalar_value=v,
global_step=global_steps)
else:
writer.add_scalar(tag='Train-' + n,
scalar_value=v,
global_step=global_steps)
for k, v in outputs.items():
if 'score' in k:
writer.add_histogram(tag=k,
values=v,
global_step=global_steps)
metric_time = time.time() - metric_time
You have a batch in your dataset that have high loss, that's it.
It is not that common that people store metrics for every batch, usually it is the average over epoch (or average over multiple batch steps) that is stored. You won't see such spikes if you will store averages.
You also could reduce these spikes by shuffling your data so that the problematic batch is spread out across the epoch. In general it is a good practice to do so at the beginning of each epoch.
Problem
I am trying to implement 2-layer neural network using different methods (TensorFlow, PyTorch and from scratch) and then compare their performance based on MNIST dataset.
I am not sure what mistakes I have made, but the accuracy in PyTorch is only about 10%, which is basically random guess. I think probably the weights does not get updated at all.
Note that I intentionally use the dataset provided by TensorFlow to keep the data I use through 3 different methods consistent for accurate comparison.
from tensorflow.examples.tutorials.mnist import input_data
import torch
class Net(torch.nn.Module):
def __init__(self):
super(Net, self).__init__()
self.fc1 = torch.nn.Linear(784, 100)
self.fc2 = torch.nn.Linear(100, 10)
def forward(self, x):
# x -> (batch_size, 784)
x = torch.relu(x)
# x -> (batch_size, 10)
x = torch.softmax(x, dim=1)
return x
net = Net()
net.zero_grad()
Loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.01)
for epoch in range(1000): # loop over the dataset multiple times
batch_xs, batch_ys = mnist_m.train.next_batch(100)
# convert to appropriate settins
# note the input to the linear layer should be (n_sample, n_features)
batch_xs = torch.tensor(batch_xs, requires_grad=True)
# batch_ys -> (batch_size,)
batch_ys = torch.tensor(batch_ys, dtype=torch.int64)
# forward
# output -> (batch_size, 10)
output = net(batch_xs)
# result -> (batch_size,)
result = torch.argmax(output, dim=1)
loss = Loss(output, batch_ys)
# backward
optimizer.zero_grad()
loss.backward()
optimizer.step()
The problem here is that you don't apply your fully connected layers fc1 and fc2.
Your forward() currently looks like:
def forward(self, x):
# x -> (batch_size, 784)
x = torch.relu(x)
# x -> (batch_size, 10)
x = torch.softmax(x, dim=1)
return x
So if you change it to:
def forward(self, x):
# x -> (batch_size, 784)
x = self.fc1(x) # added layer fc1
x = torch.relu(x)
# x -> (batch_size, 10)
x = self.fc2(x) # added layer fc2
x = torch.softmax(x, dim=1)
return x
It should work.
Regarding Umang Guptas answer: As I see it, calling zero_grad() before calling backward() as Mr.Robot did, is just fine. This shouldn't be a problem.
Edit:
So I did a short test - I set iterations from 1000 to 10000 to see the bigger picture if it is really decreasing. ( Of course I also loaded the data to mnist_m as this wasn't included in the code you've posted )
I added a print condition to the code:
if epoch % 1000 == 0:
print('Epoch', epoch, '- Loss:', round(loss.item(), 3))
Which prints out the loss every 1000 iterations:
Epoch 0 - Loss: 2.305
Epoch 1000 - Loss: 2.263
Epoch 2000 - Loss: 2.187
Epoch 3000 - Loss: 2.024
Epoch 4000 - Loss: 1.819
Epoch 5000 - Loss: 1.699
Epoch 6000 - Loss: 1.699
Epoch 7000 - Loss: 1.656
Epoch 8000 - Loss: 1.675
Epoch 9000 - Loss: 1.659
Tested with PyTorch version 0.4.1
So you can see that with the changed forward() the network is learning now, the rest of the code I left untouched.
Good luck further!