Finding the number of nodes and GPUs for DistributedDataParallel - deep-learning

I would like to know what number should I select for nodes and gpus.
I use Tesla V100-SXM2 (8 boards).
I tried:
nodes = 1, gpus=1 (only the first gpu works)
nodes=1, gpus =8 (It took very long time and cannot execute)
Did I choose the wrong values for nodes and gpus, or is my code wrong? I would appreciate it if you could help me out. The code below is a simplified sample of my DDP code.
def main():
    """Parse command-line options and spawn one training process per GPU.

    ``args.world_size`` is set to ``gpus * nodes``; ``mp.spawn`` then starts
    ``args.gpus`` processes, each of which calls ``train(index, args)``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('--epochs', default=200, type=int, metavar='N',
                        help='number of total epochs to run')
    args = parser.parse_args()
    args.world_size = args.gpus * args.nodes
    # NOTE(review): with init_method='env://' every process rendezvouses at
    # MASTER_ADDR:MASTER_PORT, so 'host1' must resolve to the machine that
    # runs rank 0. On a single machine this is usually 'localhost'; an
    # unresolvable address would presumably make init_process_group hang
    # -- confirm.
    os.environ['MASTER_ADDR'] = 'host1'
    os.environ['MASTER_PORT'] = '7777'
    mp.spawn(train, nprocs=args.gpus, args=(args,))
def train(gpu, args):
    """Run the training loop for one process of a DistributedDataParallel job.

    Args:
        gpu: Index of the GPU used by this process on the local node.
        args: Parsed options; reads ``nr``, ``gpus``, ``world_size`` and
            ``epochs``.
    """
    # Global rank = (node index * GPUs per node) + local GPU index.
    rank = args.nr * args.gpus + gpu
    dist.init_process_group(
        backend='nccl',
        init_method='env://',
        world_size=args.world_size,
        rank=rank
    )
    torch.manual_seed(0)
    model = ConvNet()
    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    batch_size = 100
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    # Wrapper around our model to handle parallel training
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    # Data loading code
    train_dataset = get_datasets()
    # Sampler that takes care of the distribution of the batches such that
    # the data is not repeated in the iteration and sampled accordingly
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=args.world_size,
        rank=rank
    )
    # We pass in the train_sampler which can be used by the DataLoader
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)
    total_step = len(train_loader)
    for epoch in range(args.epochs):
        # Without this call the sampler yields the same ordering every epoch.
        train_sampler.set_epoch(epoch)
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Only the process on local GPU 0 reports progress.
            if (i + 1) % 100 == 0 and gpu == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch + 1,
                    args.epochs,
                    i + 1,
                    total_step,
                    loss.item())
                )
    if gpu == 0:
        print("Training complete")

Related

Poor results from a fine-tuned GoogLeNet: how can I improve them?

I've trained GoogLeNet from scratch on the MNIST dataset. It achieved very good results (top-1 accuracy of 99% on the test set).
Now I want to do transfer learning in order to adapt it to the FashionMNIST dataset. For that I'm doing the following:
# Loading trained model on MNIST
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
googlenet = torch.load('googlenet-mnist.pth')
# Freeze the network
def freeze(net):
    """Disable gradient tracking for every parameter of ``net``.

    Args:
        net: Module whose parameters are frozen in place.

    Returns:
        The same ``net`` object, for call chaining.
    """
    for param in net.parameters():
        param.requires_grad = False
    return net
# Override all the Linear layers and initialize them
# (including the ones that produce auxiliary logits)
def forget_FC(net, num_classes=10):
    """Replace every fully connected layer of ``net`` with a fresh one.

    The four auxiliary-branch layers keep their input/output sizes. The main
    classifier ``net.fc`` is replaced by three stacked Linear layers that end
    in ``num_classes`` outputs. All new weights are drawn from a truncated
    normal distribution (std 0.01).

    Args:
        net: Model exposing ``aux1.fc1``, ``aux1.fc2``, ``aux2.fc1``,
            ``aux2.fc2`` and ``fc``.
        num_classes: Number of outputs of the new classification head.

    Returns:
        The same ``net`` object, modified in place.
    """
    for aux in (net.aux1, net.aux2):
        aux.fc1 = nn.Linear(in_features=aux.fc1.in_features,
                            out_features=aux.fc1.out_features, bias=True)
        aux.fc2 = nn.Linear(in_features=aux.fc2.in_features,
                            out_features=aux.fc2.out_features, bias=True)
    # Width of the features feeding the classifier, read from the first
    # Linear layer of the existing head (works for a bare Linear or a
    # Sequential of Linear layers).
    num_in_features = next(
        m.in_features for m in net.fc.modules() if isinstance(m, nn.Linear))
    # Override the classification layer
    net.fc = nn.Sequential(
        nn.Linear(num_in_features, num_in_features),
        nn.Linear(num_in_features, num_in_features),
        nn.Linear(num_in_features, num_classes))
    # Initialize weights of the auxiliary logits branches
    for aux in (net.aux1, net.aux2):
        torch.nn.init.trunc_normal_(aux.fc1.weight, mean=0.0, std=0.01, a=-2, b=2)
        torch.nn.init.trunc_normal_(aux.fc2.weight, mean=0.0, std=0.01, a=-2, b=2)
    # Initialize weights of each Linear module in the classification layer
    for module in net.fc.modules():
        if isinstance(module, nn.Linear):
            torch.nn.init.trunc_normal_(module.weight, mean=0.0, std=0.01, a=-2, b=2)
    return net
# The training algorithm
def train(net, train_iter, test_iter, num_epochs, lr, device, plot_title, fine_tune=False):
    """Train a model with a GPU.

    Args:
        net: Model to train; moved to ``device``.
        train_iter: Iterable of ``(X, y)`` training batches.
        test_iter: Validation data passed to ``evaluate_accuracy_gpu``.
        num_epochs: Number of passes over ``train_iter``.
        lr: Learning rate for SGD (momentum 0.9).
        device: Device the model and batches are moved to.
        plot_title: Title forwarded to the animator.
        fine_tune: When true, only ``net.fc`` parameters are optimized.
    """
    # def init_weights(m):
    #     if type(m) == nn.Linear or type(m) == nn.Conv2d:
    #         nn.init.xavier_uniform_(m.weight)
    # net.apply(init_weights)
    print('training on', device)
    progress = ""
    net.to(device)
    # Pick exactly one optimizer; without the else branch the fine-tune
    # optimizer was immediately replaced by the full-model one.
    if fine_tune:
        optimizer = torch.optim.SGD(net.fc.parameters(), lr=lr, momentum=.9)
    else:
        optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=.9)
    scheduler = ReduceLROnPlateau(optimizer, 'min', patience=1, verbose=True)
    loss = nn.CrossEntropyLoss()
    animator = Animator(xlabel='epoch', xlim=[1, num_epochs], title=plot_title, ylim=[0, 1], figsize=(5, 5),
                        legend=['train loss', 'train acc', 'val acc'])
    timer, num_batches = d2l.Timer(), len(train_iter)
    # Plot roughly five times per epoch; at least every batch when the
    # loader has fewer than five batches (avoids a modulo by zero).
    plot_every = max(1, num_batches // 5)
    for epoch in range(num_epochs):
        # Sum of training loss, sum of training accuracy, sum of top 5 training accuracy, no. of examples
        metric = d2l.Accumulator(4)
        net.train()
        # Training
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            # Mini-batch inference
            y_hat = net(X)
            # Take into account the auxiliary logits (see link cell above)
            if isinstance(y_hat, GoogLeNetOutputs):
                aux_logit1, aux_logit2, y_hat = y_hat
                l1 = loss(y_hat, y)
                l2 = loss(aux_logit1, y)
                l3 = loss(aux_logit2, y)
                l = l1 + .3 * (l2 + l3)
            else:
                l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            # Training accuracies
            with torch.no_grad():
                acc_1, acc_5 = accuracy(y_hat, y)
                metric.add(l * X.shape[0], acc_1, acc_5, X.shape[0])
            timer.stop()
            train_l = metric[0] / metric[3]
            train_acc_1 = metric[1] / metric[3]
            train_acc_5 = metric[2] / metric[3]
            if (i + 1) % plot_every == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (train_l, train_acc_1, None), plot_title)
        # Validation, (validation loss computed when model in eval mode, is that correct?)
        val_l, test_acc_1, test_acc_5 = evaluate_accuracy_gpu(net, test_iter)
        scheduler.step(val_l)
        animator.add(epoch + 1, (None, None, test_acc_1), plot_title)
        # Un-comment to see memory consumption, modify batch size to see effects
        # print(os.popen('nvidia-smi').read())
        # break
        progress += f"----\nEpoch {epoch}/{num_epochs}\n\ttrain loss={train_l}[{train_acc_1}]\tval loss={val_l} [{test_acc_1}]\n----"
        print(progress)
    print(f'loss={train_l:.3f}, train=[1-acc {train_acc_1:.3f}, 5-acc {train_acc_5:.3f}]'
          f'test=[1-acc {test_acc_1:.3f}, 5-acc {test_acc_5:.3f}]')
    print(f'{metric[3] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')
    print(f'total training time: {timer.sum()} seconds')
With this approach, only 34% training accuracy is achieved. Honestly, I was expecting something closer to the results obtained on MNIST. What is wrong with my current approach?

RuntimeError: shape '[128, -1]' is invalid for input of size 378 pytorch

I'm running a spiking neural network for data that has 21 features with a batch size of 128. I get the following error after many iterations of training (this error doesn't arise immediately!):
RuntimeError: shape '[128, -1]' is invalid for input of size 378 pytorch
When I went to go print out what the shapes of the tensors are before, I get the following:
Train
torch.Size([128, 21])
Test
torch.Size([128, 21])
This is my network:
class SpikingNeuralNetwork(nn.Module):
    """Two-layer spiking network: Linear -> Leaky -> Linear -> Leaky.

    Parameters in SpikingNeuralNetwork class:
    1. number_inputs: Number of inputs to the SNN.
    2. number_hidden: Number of units in the hidden layer.
    3. number_outputs: Number of output classes.
    4. beta: Decay rate.
    """

    def __init__(self, number_inputs, number_hidden, number_outputs, beta):
        super().__init__()
        self.number_inputs = number_inputs
        self.number_hidden = number_hidden
        self.number_outputs = number_outputs
        self.beta = beta
        # Initialize layers
        self.fc1 = nn.Linear(self.number_inputs, self.number_hidden)  # Applies linear transformation to all input points
        self.lif1 = snn.Leaky(beta=self.beta)  # Integrates weighted input over time, emitting a spike if threshold condition is met
        self.fc2 = nn.Linear(self.number_hidden, self.number_outputs)  # Applies linear transformation to output spikes of lif1
        self.lif2 = snn.Leaky(beta=self.beta)  # Another spiking neuron, integrating the weighted spikes over time

    def forward(self, x):
        """Run the same input through the network for 25 time steps.

        Args:
            x: Input passed into the network; fed unchanged at every step.

        Returns:
            A pair ``(spikes, membranes)`` of the output layer, each stacked
            along a new leading time dimension.
        """
        num_steps = 25
        # Initialize hidden states at t = 0
        mem1 = self.lif1.init_leaky()
        mem2 = self.lif2.init_leaky()
        # Record the final layer
        spk2_rec = []
        mem2_rec = []
        for step in range(num_steps):
            cur1 = self.fc1(x)
            spk1, mem1 = self.lif1(cur1, mem1)
            cur2 = self.fc2(spk1)
            spk2, mem2 = self.lif2(cur2, mem2)
            spk2_rec.append(spk2)
            mem2_rec.append(mem2)
        return torch.stack(spk2_rec, dim=0), torch.stack(mem2_rec, dim=0)
This is my training loop:
def training_loop(net, train_loader, test_loader, dtype, device, optimizer):
    """Train ``net`` for one epoch, evaluating one test batch per step.

    Relies on the module-level ``batch_size``, ``loss_function`` and
    ``train_printer``.

    Args:
        net: Network returning ``(spike_record, membrane_record)``.
        train_loader: Iterable of ``(data, targets)`` training batches.
        test_loader: Iterable of ``(data, targets)`` test batches.
        dtype: dtype of the loss accumulators.
        device: Device the batches are moved to.
        optimizer: Optimizer stepping the parameters of ``net``.

    Returns:
        ``(loss_history, test_loss_history)``: per-iteration summed losses.
    """
    num_epochs = 1
    loss_history = []
    test_loss_history = []
    counter = 0
    # Temporal dynamics
    num_steps = 25
    # Outer training loop
    for epoch in range(num_epochs):
        iter_counter = 0
        train_batch = iter(train_loader)
        # Minibatch training loop
        for data, targets in train_batch:
            data = data.to(device)
            targets = targets.to(device)
            # Forward pass
            net.train()
            print("Train")
            print(data.size())
            # NOTE(review): view(batch_size, -1) assumes every batch holds
            # exactly batch_size rows. The reported input size 378 = 18 * 21
            # suggests a final, smaller batch of 18 rows -- confirm against
            # the dataset length.
            spk_rec, mem_rec = net(data.view(batch_size, -1))
            # Initialize the loss and sum over time
            loss_val = torch.zeros((1), dtype=dtype, device=device)
            for step in range(num_steps):
                loss_val += loss_function(mem_rec[step], targets.long().flatten().to(device))
            # Gradient calculation and weight update
            optimizer.zero_grad()
            loss_val.backward()
            optimizer.step()
            # Store loss history for future plotting
            loss_history.append(loss_val.item())
            # Test set
            with torch.no_grad():
                net.eval()
                test_data, test_targets = next(iter(test_loader))
                test_data = test_data.to(device)
                test_targets = test_targets.to(device)
                # Test set forward pass
                print("Test")
                print(test_data.size())
                test_spk, test_mem = net(test_data.view(batch_size, -1))
                # Test set loss
                test_loss = torch.zeros((1), dtype=dtype, device=device)
                for step in range(num_steps):
                    test_loss += loss_function(test_mem[step], test_targets.long().flatten().to(device))
                test_loss_history.append(test_loss.item())
                # Print train/test loss and accuracy
                if counter % 50 == 0:
                    train_printer(epoch, iter_counter, counter, loss_history, data, targets, test_data, test_targets)
                counter = counter + 1
                iter_counter = iter_counter + 1
    return loss_history, test_loss_history
The error occurs on spk_rec, mem_rec = net(data.view(batch_size, -1)).
The code was adopted from https://snntorch.readthedocs.io/en/latest/tutorials/tutorial_5.html, where it was originally used for the MNIST dataset. However, I am not working with an image dataset. I am working with a dataset that has 21 features and predicts just one target (with 100 classes). I tried to change data.view(batch_size, -1) and test_data.view(batch_size, -1) to data.view(batch_size, 21) and test_data.view(batch_size, 21) based on some other forum answers that I saw, and my program is running for now through the training loop. Does anyone have any suggestions for how I can run through the training with no errors?
EDIT: I now get the error RuntimeError: shape '[128, 21]' is invalid for input of size 378 from spk_rec, mem_rec = net(data.view(batch_size, -1)).
Here are my DataLoaders:
train_loader = DataLoader(dataset = train, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(dataset = test, batch_size = batch_size, shuffle = True)
My batch size is 128.
Trying to run it myself to reproduce your problem, I am also missing the net parameters and snn.Leaky:
import torch
from torch import nn
from torch.utils.data import DataLoader
class SpikingNeuralNetwork(nn.Module):
    """Two-layer spiking network: Linear -> Leaky -> Linear -> Leaky.

    Parameters in SpikingNeuralNetwork class:
    1. number_inputs: Number of inputs to the SNN.
    2. number_hidden: Number of units in the hidden layer.
    3. number_outputs: Number of output classes.
    4. beta: Decay rate.
    """

    def __init__(self, number_inputs, number_hidden, number_outputs, beta):
        super().__init__()
        self.number_inputs = number_inputs
        self.number_hidden = number_hidden
        self.number_outputs = number_outputs
        self.beta = beta
        # Initialize layers
        # Applies linear transformation to all input points
        self.fc1 = nn.Linear(self.number_inputs, self.number_hidden)
        # Integrates weighted input over time, emitting a spike if threshold condition is met
        self.lif1 = snn.Leaky(beta=self.beta)
        # Applies linear transformation to output spikes of lif1
        self.fc2 = nn.Linear(self.number_hidden, self.number_outputs)
        # Another spiking neuron, integrating the weighted spikes over time
        self.lif2 = snn.Leaky(beta=self.beta)

    def forward(self, x):
        """Run the same input through the network for 25 time steps.

        Args:
            x: Input passed into the network; fed unchanged at every step.

        Returns:
            A pair ``(spikes, membranes)`` of the output layer, each stacked
            along a new leading time dimension.
        """
        num_steps = 25
        # Initialize hidden states at t = 0
        mem1 = self.lif1.init_leaky()
        mem2 = self.lif2.init_leaky()
        # Record the final layer
        spk2_rec = []
        mem2_rec = []
        for step in range(num_steps):
            cur1 = self.fc1(x)
            spk1, mem1 = self.lif1(cur1, mem1)
            cur2 = self.fc2(spk1)
            spk2, mem2 = self.lif2(cur2, mem2)
            spk2_rec.append(spk2)
            mem2_rec.append(mem2)
        return torch.stack(spk2_rec, dim=0), torch.stack(mem2_rec, dim=0)
# Minimal reproduction setup with random data (128 samples, 21 features).
batch_size = 2
train = torch.rand(128, 21)
test = torch.rand(128, 21)
train_loader = DataLoader(dataset=train, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test, batch_size=batch_size, shuffle=True)
# NOTE(review): the constructor also requires number_hidden, number_outputs
# and beta; their values were not given in the question, so this call is
# incomplete as written.
net = SpikingNeuralNetwork(number_inputs=1)
loss_function = nn.CrossEntropyLoss()
# Optimizers live in torch.optim; torch.nn has no `optim` attribute.
optimizer = torch.optim.Adam(net.parameters(), lr=0.1)
def training_loop(net, train_loader, test_loader, dtype, device, optimizer):
    """Train ``net`` for one epoch, evaluating one test batch per step.

    Relies on the module-level ``batch_size``, ``loss_function`` and
    ``train_printer``.

    Args:
        net: Network returning ``(spike_record, membrane_record)``.
        train_loader: Iterable of ``(data, targets)`` training batches.
        test_loader: Iterable of ``(data, targets)`` test batches.
        dtype: dtype of the loss accumulators.
        device: Device the batches are moved to.
        optimizer: Optimizer stepping the parameters of ``net``.

    Returns:
        ``(loss_history, test_loss_history)``: per-iteration summed losses.
    """
    num_epochs = 1
    loss_history = []
    test_loss_history = []
    counter = 0
    # Temporal dynamics
    num_steps = 25
    # Outer training loop
    for epoch in range(num_epochs):
        iter_counter = 0
        train_batch = iter(train_loader)
        # Minibatch training loop
        for data, targets in train_batch:
            data = data.to(device)
            targets = targets.to(device)
            # Forward pass
            net.train()
            print("Train")
            print(data.size())
            # NOTE(review): view(batch_size, -1) assumes every batch holds
            # exactly batch_size rows; a smaller final batch cannot be
            # reshaped this way.
            spk_rec, mem_rec = net(data.view(batch_size, -1))
            # Initialize the loss and sum over time
            loss_val = torch.zeros((1), dtype=dtype, device=device)
            for step in range(num_steps):
                loss_val += loss_function(mem_rec[step], targets.long().flatten().to(device))
            # Gradient calculation and weight update
            optimizer.zero_grad()
            loss_val.backward()
            optimizer.step()
            # Store loss history for future plotting
            loss_history.append(loss_val.item())
            # Test set
            with torch.no_grad():
                net.eval()
                test_data, test_targets = next(iter(test_loader))
                test_data = test_data.to(device)
                test_targets = test_targets.to(device)
                # Test set forward pass
                print("Test")
                print(test_data.size())
                test_spk, test_mem = net(test_data.view(batch_size, -1))
                # Test set loss
                test_loss = torch.zeros((1), dtype=dtype, device=device)
                for step in range(num_steps):
                    test_loss += loss_function(test_mem[step], test_targets.long().flatten().to(device))
                test_loss_history.append(test_loss.item())
                # Print train/test loss and accuracy
                if counter % 50 == 0:
                    train_printer(epoch, iter_counter, counter, loss_history, data, targets, test_data, test_targets)
                counter = counter + 1
                iter_counter = iter_counter + 1
    return loss_history, test_loss_history
Your code works just fine on the MNIST dataset, so I think it might be a problem with how the DataLoader is being called. My guess is that the total dataset is not evenly divisible by your batch_size. If this is true, then you have two options:
Instead of spk_rec, mem_rec = net(data.view(batch_size, -1)), try spk_rec, mem_rec = net(data.flatten(1)) which preserves the first dimension of your data.
Alternatively, you may need to set drop_last=True in the DataLoader functions.

PyTorch: Confusion Matrix for Transfer Learning

I've been trying to plot a confusion matrix for the below code - check def train_alexnet(). But I keep getting this error:
IndexError: only integers, slices (`:`), ellipsis (`...`), None and long or byte Variables are valid indices (got float)
So, I tried converting my tensors to an integer tensor but then got the error:
ValueError: only one element tensors can be converted to Python scalars
Can someone suggest me what can be done to convert the tensors 'all_preds' and 'source_value' to tensors containing integer values? I found the torch no grad option but I am unaware as to how to use it because I'm new to pytorch.
Here's the link of the github repo that I'm trying to work with: https://github.com/syorami/DDC-transfer-learning/blob/master/DDC.py
from __future__ import print_function
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import warnings
warnings.filterwarnings('ignore')
import math
import model
import torch
import dataloader
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix
from plotcm import plot_confusion_matrix
from torch import nn
from torch import optim
from torch.autograd import Variable
cuda = torch.cuda.is_available()
def step_decay(epoch, learning_rate):
    """Return the learning rate after step decay.

    The rate is multiplied by 0.8 once for every 10 completed epochs,
    counting ``epoch + 1`` as the number of epochs completed.

    Args:
        epoch: Current training epoch.
        learning_rate: Initial learning rate.

    Returns:
        The decayed learning rate.
    """
    initial_lrate = learning_rate
    drop = 0.8
    epochs_drop = 10.0
    lrate = initial_lrate * math.pow(drop, math.floor((1 + epoch) / epochs_drop))
    return lrate
def train_alexnet(epoch, model, learning_rate, source_loader):
    """Train AlexNet on the source data for one epoch and print statistics.

    Reads the module-level ``cuda``, ``MOMENTUM``, ``L2_DECAY``,
    ``BATCH_SIZE`` and ``SOURCE_NAME``.

    Args:
        epoch: Current training epoch.
        model: Defined AlexNet; exposes ``features``, ``classifier`` and
            ``final_classifier``.
        learning_rate: Initial learning rate.
        source_loader: Source data loader.
    """
    log_interval = 10
    LEARNING_RATE = step_decay(epoch, learning_rate)
    print(f'Learning Rate: {LEARNING_RATE}')
    # The final classifier trains at the full rate, the rest at a tenth of it.
    optimizer = optim.SGD([
        {'params': model.features.parameters()},
        {'params': model.classifier.parameters()},
        {'params': model.final_classifier.parameters(), 'lr': LEARNING_RATE}
    ], lr=LEARNING_RATE / 10, momentum=MOMENTUM, weight_decay=L2_DECAY)
    # enter training mode
    model.train()
    iter_source = iter(source_loader)
    num_iter = len(source_loader)
    correct = 0
    total_loss = 0
    clf_criterion = nn.CrossEntropyLoss()
    all_preds = torch.tensor([])
    source_value = torch.tensor([])
    # 1-based counter over every batch (the last batch used to be skipped).
    for i in range(1, num_iter + 1):
        source_data, source_label = next(iter_source)
        # print("source label: ", source_label)
        if cuda:
            source_data, source_label = source_data.cuda(), source_label.cuda()
        optimizer.zero_grad()
        ##
        source_preds = model(source_data)
        preds = source_preds.data.max(1, keepdim=True)[1]
        correct += preds.eq(source_label.data.view_as(preds)).sum()
        # NOTE(review): all_preds/source_value start as empty float tensors,
        # so the concatenated labels presumably end up as floats -- this is
        # the likely source of the "got float" IndexError below; confirm.
        # prediction label
        all_preds = torch.cat(
            (all_preds, preds),
            dim=0
        )
        # actual label
        source_value = torch.cat(
            (source_value, source_label),
            dim=0
        )
        loss = clf_criterion(source_preds, source_label)
        # Detach so the running total does not keep each batch's graph alive.
        total_loss += loss.detach()
        loss.backward()
        optimizer.step()
        if i % log_interval == 0:
            print('Train Epoch {}: [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, i * len(source_data), len(source_loader) * BATCH_SIZE,
                100. * i / len(source_loader), loss.item()))
    total_loss /= len(source_loader)
    acc_train = float(correct) * 100. / (len(source_loader) * BATCH_SIZE)
    # print('all preds= ',int(all_preds))
    # print("source value", int(source_value))
    # NOTE(review): all_preds already holds class indices with shape (N, 1),
    # so argmax(dim=1) presumably returns 0 for every row -- confirm.
    stacked = torch.stack(
        (
            source_value,
            (all_preds.argmax(dim=1))
        ),
        dim=1
    )
    print("stacked", stacked)
    cmt = torch.zeros(3, 3, dtype=torch.float64)
    with torch.no_grad():
        for p in stacked:
            tl, pl = p.tolist()
            cmt[tl, pl] = cmt[tl, pl] + 1
    print("cmt: ", cmt)
    print('{} set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'.format(
        SOURCE_NAME, total_loss.item(), correct, len(source_loader.dataset), acc_train))
def test_alexnet(model, target_loader):
    """Evaluate the fine-tuned AlexNet on the target data.

    Reads the module-level ``cuda`` and ``TARGET_NAME``.

    Args:
        model: AlexNet trained on the source data set.
        target_loader: Target data loader.

    Returns:
        Number of correctly classified samples.
    """
    clf_criterion = nn.CrossEntropyLoss()
    # enter evaluation mode
    model.eval()
    test_loss = 0
    correct = 0
    # Evaluation needs no gradients; this replaces the removed
    # Variable(..., volatile=True) idiom.
    with torch.no_grad():
        # Iterate the loader that was passed in, not a module-level global.
        for data, target in target_loader:
            if cuda:
                data, target = data.cuda(), target.cuda()
            target_preds = model(data)
            test_loss += clf_criterion(target_preds, target)  # sum up batch loss
            pred = target_preds.max(1)[1]  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).cpu().sum()
            stacked = torch.stack(
                (
                    target,
                    target_preds.argmax(dim=1)
                ),
                dim=1
            )
            print("stacked target", stacked)
    test_loss /= len(target_loader)
    print('{} set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        TARGET_NAME, test_loss.item(), correct, len(target_loader.dataset),
        100. * correct / len(target_loader.dataset)))
    return correct
def compute_confusion_matrix(preds, y):
    """Return the confusion matrix of sigmoid-rounded predictions against ``y``.

    Args:
        preds: Tensor of raw scores; passed through a sigmoid and rounded.
        y: True labels, in a form accepted by ``confusion_matrix``.
    """
    # round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    # Detach and move to CPU so the tensor can be converted to an array even
    # when it lives on a GPU or tracks gradients.
    return confusion_matrix(y, rounded_preds.detach().cpu())
if __name__ == '__main__':
    # Experiment configuration; the training and test functions read these
    # names as module-level globals.
    ROOT_PATH = './v1234_combined/pets'
    SOURCE_NAME = 'v123'
    TARGET_NAME = 'v4'
    BATCH_SIZE = 15
    TRAIN_EPOCHS = 1
    learning_rate = 1e-2
    L2_DECAY = 5e-4
    MOMENTUM = 0.9
    source_loader = dataloader.load_training(ROOT_PATH, SOURCE_NAME, BATCH_SIZE)
    #target_train_loader = dataloader.load_training(ROOT_PATH, TARGET_NAME, BATCH_SIZE)
    target_test_loader = dataloader.load_testing(ROOT_PATH, TARGET_NAME, BATCH_SIZE)
    print('Load data complete')
    alexnet = model.Alexnet_finetune(num_classes=3)
    print('Construct model complete')
    # load pretrained alexnet model
    alexnet = model.load_pretrained_alexnet(alexnet)
    print('Load pretrained alexnet parameters complete\n')
    if cuda:
        alexnet.cuda()
    for epoch in range(1, TRAIN_EPOCHS + 1):
        print(f'Train Epoch {epoch}:')
        train_alexnet(epoch, alexnet, learning_rate, source_loader)
        correct = test_alexnet(alexnet, target_test_loader)
    print(len(source_loader.dataset))
In order to convert all elements of a tensor from floats to ints, you need to use .to():
all_preds_int = all_preds.to(torch.int64)
Note that it appears as if your all_preds are the predicted class probabilities and not the actual labels. You might need to torch.argmax along the appropriate dimension. (BTW, the output of argmax is int - no need to convert).

Google Colab RAM issue with semi-supervised CNN model training

I'm trying to training a binary classifier by transfer learning on EfficientNet. Since I have lots of unlabeled data, I use semi-supervised method to generate multiple "pseudo labeled" data before the model go through each epoch.
Since Colab has its limits of RAM, I delete some large variables(like numpy arrays, dataset, dataloader...) in each loop, however the RAM still increase in every loop like the picture shown below.
Below is my Training loop which consists of 3 main structure: semi-supervised, training loop, validation loop.
I'm not sure which step cause the RAM to keep increase in each epoch.
(1) semi-supervised part
for epoch in range(n_epochs):
    print(f"[ Epoch | {epoch + 1:03d}/{n_epochs:03d} ]")
    if do_semi:
        # Pseudo-label the unlabeled images with the current model.
        model.eval()
        dataset_0 = []
        dataset_1 = []
        for img in pseudo_loader:
            with torch.no_grad():
                logits = model(img.to(device))
                probs = softmax(logits)
            # Filter the data and construct a new dataset.
            for i in range(len(probs)):
                p = probs[i].tolist()
                idx = p.index(max(p))
                if p[idx] >= threshold:
                    # NOTE(review): .numpy() returns a view on the batch
                    # tensor, so keeping one image presumably keeps the whole
                    # batch in memory; reshape (unlike a transpose) also does
                    # not move the channel axis -- confirm both are intended.
                    if idx == 0:
                        dataset_0.append(img[i].numpy().reshape(128, 128, 3))
                    else:
                        dataset_1.append(img[i].numpy().reshape(128, 128, 3))
        # stratified sampling with labels
        len_0, len_1 = len(dataset_0), len(dataset_1)
        print('label 0: ', len_0)
        print('label 1: ', len_1)
        # since there may be RAM memory error, restrict to 1000
        if len_0 > 1000:
            dataset_0 = random.sample(dataset_0, 1000)
        if len_1 > 1000:
            dataset_1 = random.sample(dataset_1, 1000)
        # Down-sample the class that had more confident predictions (compared
        # on the counts taken before the cap) to the size of the other one.
        if len_0 > len_1:
            dataset_0 = random.sample(dataset_0, len(dataset_1))
        elif len_1 > len_0:
            dataset_1 = random.sample(dataset_1, len(dataset_0))
        pseudo_x = np.array(dataset_0 + dataset_1)
        pseudo_y = ['0'] * len(dataset_0) + ['1'] * len(dataset_1)
        if len(pseudo_x) != 0:
            new_dataset = CustomTensorDataset(pseudo_x, np.array(pseudo_y), 'pseudo')
        else:
            new_dataset = []
        # print how many pseudo label data added
        print('Total number of pseudo labeled data are added: ', len(new_dataset))
        # release RAM
        del dataset_0, dataset_1, pseudo_x, pseudo_y
        gc.collect()
        # Turn off the eval mode.
        model.train()
        concat_dataset = ConcatDataset([train_set, new_dataset])
        train_loader = DataLoader(concat_dataset, batch_size=batch_size, shuffle=True)
I'm quite sure the problem happens in the semi-supervised part, since RAM usage does not increase when the semi-supervised part is not applied.
Thanks for your helps!!

Have you encountered the similar problem like loss jitter during training?

Background: It's about loss jittering which generates at the beginning stage of every training epoch. When the dataloader loads the first batch data to feed into the network, the loss value always rises suddenly, then returns to normal from the second batch and continues to decline. The curve is so strange. I need your help!
for epoch in range(begin_epoch, end_epoch):
    print('PROGRESS: %.2f%%' % (100.0 * epoch / end_epoch))
    # set epoch as random seed of sampler while distributed training
    if train_sampler is not None and hasattr(train_sampler, 'set_epoch'):
        train_sampler.set_epoch(epoch)
    # reset metrics
    metrics.reset()
    # set net to train mode
    net.train()
    # clear the parameter gradients
    # optimizer.zero_grad()
    # init end time
    end_time = time.time()
    # ReduceLROnPlateau is stepped once per epoch on the validation metric;
    # every other scheduler is stepped per optimizer update further below.
    if isinstance(lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
        name, value = validation_monitor.metrics.get()
        val = value[name.index(validation_monitor.host_metric_name)]
        lr_scheduler.step(val, epoch)
    # training
    train_loader_iter = iter(train_loader)
    for nbatch in range(total_size):
        # Restart the loader when it runs out before total_size batches.
        try:
            batch = next(train_loader_iter)
        except StopIteration:
            print('reset loader .. ')
            train_loader_iter = iter(train_loader)
            batch = next(train_loader_iter)
        global_steps = total_size * epoch + nbatch
        os.environ['global_steps'] = str(global_steps)
        # record time
        data_in_time = time.time() - end_time
        # transfer data to GPU
        data_transfer_time = time.time()
        batch = to_cuda(batch)
        data_transfer_time = time.time() - data_transfer_time
        # forward
        forward_time = time.time()
        outputs, loss = net(*batch)
        loss = loss.mean()
        if gradient_accumulate_steps > 1:
            loss = loss / gradient_accumulate_steps
        forward_time = time.time() - forward_time
        # backward
        backward_time = time.time()
        if fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        backward_time = time.time() - backward_time
        optimizer_time = time.time()
        # Update the weights only once per gradient_accumulate_steps batches.
        if (global_steps + 1) % gradient_accumulate_steps == 0:
            # clip gradient
            if clip_grad_norm > 0:
                if fp16:
                    total_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                                                clip_grad_norm)
                else:
                    total_norm = torch.nn.utils.clip_grad_norm_(net.parameters(),
                                                                clip_grad_norm)
                if writer is not None:
                    writer.add_scalar(tag='grad-para/Total-Norm',
                                      scalar_value=float(total_norm),
                                      global_step=global_steps)
            optimizer.step()
            # step LR scheduler
            if lr_scheduler is not None and not isinstance(lr_scheduler,
                                                           torch.optim.lr_scheduler.ReduceLROnPlateau):
                lr_scheduler.step()
            # clear the parameter gradients
            optimizer.zero_grad()
        optimizer_time = time.time() - optimizer_time
        # update metric
        metric_time = time.time()
        metrics.update(outputs)
        # Log learning rates, loss, metrics and score histograms every 50 batches.
        if writer is not None and nbatch % 50 == 0:
            with torch.no_grad():
                for group_i, param_group in enumerate(optimizer.param_groups):
                    writer.add_scalar(tag='Initial-LR/Group_{}'.format(group_i),
                                      scalar_value=param_group['initial_lr'],
                                      global_step=global_steps)
                    writer.add_scalar(tag='LR/Group_{}'.format(group_i),
                                      scalar_value=param_group['lr'],
                                      global_step=global_steps)
                writer.add_scalar(tag='Train-Loss',
                                  scalar_value=float(loss.item()),
                                  global_step=global_steps)
                name, value = metrics.get()
                for n, v in zip(name, value):
                    if 'Logits' in n:
                        writer.add_scalar(tag='Train-Logits/' + n,
                                          scalar_value=v,
                                          global_step=global_steps)
                    else:
                        writer.add_scalar(tag='Train-' + n,
                                          scalar_value=v,
                                          global_step=global_steps)
                for k, v in outputs.items():
                    if 'score' in k:
                        writer.add_histogram(tag=k,
                                             values=v,
                                             global_step=global_steps)
        metric_time = time.time() - metric_time
You have a batch in your dataset that has a high loss; that's it.
It is not that common that people store metrics for every batch, usually it is the average over epoch (or average over multiple batch steps) that is stored. You won't see such spikes if you will store averages.
You also could reduce these spikes by shuffling your data so that the problematic batch is spread out across the epoch. In general it is a good practice to do so at the beginning of each epoch.