I am a bit confused as to how to calculate validation loss. Should validation loss be computed at the end of an epoch, or should the loss also be monitored while iterating through the batches?
Below I have computed it using a running_loss that is accumulated over batches, but I want to check whether this is the correct approach.
def validate(loader, model, criterion):
    correct = 0
    total = 0
    running_loss = 0.0
    model.eval()
    with torch.no_grad():
        for i, data in enumerate(loader):
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            running_loss = running_loss + loss.item()
    mean_val_accuracy = 100 * correct / total
    mean_val_loss = running_loss
    #mean_val_accuracy = accuracy(outputs, labels)
    print('Validation Accuracy: %d %%' % (mean_val_accuracy))
    print('Validation Loss:', mean_val_loss)
Below is the training block I am using
def train(loader, model, criterion, optimizer, epoch):
    correct = 0
    running_loss = 0.0
    i_max = 0
    for i, data in enumerate(loader):
        total_loss = 0.0
        #print('batch=', i)
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 2000 == 1999:
            print('[%d , %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0
    print('finished training')
    return mean_val_loss, mean_val_accuracy
You can evaluate your network on the validation set whenever you want. It can be every epoch or, if that is too costly because the dataset is huge, every N epochs.
What you did looks correct: you compute the loss over the whole validation set. You can optionally divide it by the number of batches (or samples) to normalize the loss, so that the scale stays the same if you enlarge the validation set one day.
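For example, a minimal sketch of that normalization, reusing the names from the validate() function above:
    num_batches = len(loader)                    # number of validation batches
    mean_val_loss = running_loss / num_batches   # average loss per batch
    # or, for a per-sample average instead:
    # mean_val_loss = running_loss / total       # total = number of validation samples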
I've trained GoogLeNet from scratch on the MNIST dataset. It achieved very good results (top-1 accuracy of 99% on the test set).
Now I want to do transfer learning in order to adapt it to the FashionMNIST dataset. For that I'm doing the following:
# Loading trained model on MNIST
googlenet = torch.load('googlenet-mnist.pth')

# Freeze the network
def freeze(net):
    for param in net.parameters():
        param.requires_grad = False
    return net

# Override all the Linear layers and re-initialize them
# (including the ones that produce the auxiliary logits)
def forget_FC(net):
    net.aux1.fc1 = nn.Linear(in_features=net.aux1.fc1.in_features, out_features=net.aux1.fc1.out_features, bias=True)
    net.aux1.fc2 = nn.Linear(in_features=net.aux1.fc2.in_features, out_features=net.aux1.fc2.out_features, bias=True)
    net.aux2.fc1 = nn.Linear(in_features=net.aux2.fc1.in_features, out_features=net.aux2.fc1.out_features, bias=True)
    net.aux2.fc2 = nn.Linear(in_features=net.aux2.fc2.in_features, out_features=net.aux2.fc2.out_features, bias=True)
    # Override the classification layer
    # (num_in_features is taken from the existing classifier before replacing it)
    num_in_features = net.fc.in_features
    net.fc = nn.Sequential(
        nn.Linear(num_in_features, num_in_features),
        nn.Linear(num_in_features, num_in_features),
        nn.Linear(num_in_features, 10))
    # Initialize the weights of the auxiliary logits branches
    torch.nn.init.trunc_normal_(net.aux1.fc1.weight, mean=0.0, std=0.01, a=-2, b=2)
    torch.nn.init.trunc_normal_(net.aux1.fc2.weight, mean=0.0, std=0.01, a=-2, b=2)
    torch.nn.init.trunc_normal_(net.aux2.fc1.weight, mean=0.0, std=0.01, a=-2, b=2)
    torch.nn.init.trunc_normal_(net.aux2.fc2.weight, mean=0.0, std=0.01, a=-2, b=2)
    # Initialize the weights of each Linear module in the classification layer
    for module in net.fc.modules():
        if isinstance(module, nn.Linear):
            torch.nn.init.trunc_normal_(module.weight, mean=0.0, std=0.01, a=-2, b=2)
    return net
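As a sanity check (a small illustrative sketch, assuming freeze() and forget_FC() are applied in that order), the parameters that remain trainable can be listed like this:
googlenet = freeze(googlenet)
googlenet = forget_FC(googlenet)
# Only the re-created Linear layers (aux1/aux2 fc layers and the new fc block)
# should still require gradients.
trainable = [name for name, p in googlenet.named_parameters() if p.requires_grad]
print(trainable)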
# The training algorithm
def train(net, train_iter, test_iter, num_epochs, lr, device, plot_title, fine_tune=False):
    """Train a model with a GPU."""
    # def init_weights(m):
    #     if type(m) == nn.Linear or type(m) == nn.Conv2d:
    #         nn.init.xavier_uniform_(m.weight)
    # net.apply(init_weights)
    print('training on', device)
    progress = ""
    net.to(device)
    if fine_tune:
        optimizer = torch.optim.SGD(net.fc.parameters(), lr=lr, momentum=.9)
    else:
        optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=.9)
    scheduler = ReduceLROnPlateau(optimizer, 'min', patience=1, verbose=True)
    loss = nn.CrossEntropyLoss()
    animator = Animator(xlabel='epoch', xlim=[1, num_epochs], title=plot_title, ylim=[0, 1], figsize=(5, 5),
                        legend=['train loss', 'train acc', 'val acc'])
    timer, num_batches = d2l.Timer(), len(train_iter)
    for epoch in range(num_epochs):
        # Sum of training loss, sum of training accuracy, sum of top-5 training accuracy, no. of examples
        metric = d2l.Accumulator(4)
        net.train()
        # Training
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            # Mini-batch inference
            y_hat = net(X)
            # Take into account the auxiliary logits (see link in the cell above)
            if isinstance(y_hat, GoogLeNetOutputs):
                aux_logit1, aux_logit2, y_hat = y_hat
                l1 = loss(y_hat, y)
                l2 = loss(aux_logit1, y)
                l3 = loss(aux_logit2, y)
                l = l1 + .3 * (l2 + l3)
            else:
                l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            # Training accuracies
            with torch.no_grad():
                acc_1, acc_5 = accuracy(y_hat, y)
                metric.add(l * X.shape[0], acc_1, acc_5, X.shape[0])
            timer.stop()
            train_l = metric[0] / metric[3]
            train_acc_1 = metric[1] / metric[3]
            train_acc_5 = metric[2] / metric[3]
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (train_l, train_acc_1, None), plot_title)
        # Validation (validation loss computed with the model in eval mode -- is that correct?)
        val_l, test_acc_1, test_acc_5 = evaluate_accuracy_gpu(net, test_iter)
        scheduler.step(val_l)
        animator.add(epoch + 1, (None, None, test_acc_1), plot_title)
        # Un-comment to see memory consumption, modify batch size to see effects
        # print(os.popen('nvidia-smi').read())
        # break
        progress += f"----\nEpoch {epoch}/{num_epochs}\n\ttrain loss={train_l}[{train_acc_1}]\tval loss={val_l} [{test_acc_1}]\n----"
    print(progress)
    print(f'loss={train_l:.3f}, train=[1-acc {train_acc_1:.3f}, 5-acc {train_acc_5:.3f}] '
          f'test=[1-acc {test_acc_1:.3f}, 5-acc {test_acc_5:.3f}]')
    print(f'{metric[3] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')
    print(f'total training time: {timer.sum()} seconds')
With this approach only 34% training accuracy is achieved. Honestly, I was expecting results much closer to those obtained on MNIST. What is wrong with my current approach?
I would like to know what numbers I should select for nodes and gpus.
I use Tesla V100-SXM2 (8 boards).
I tried:
nodes=1, gpus=1 (only the first GPU works)
nodes=1, gpus=8 (it took a very long time and could not execute)
Did I get the nodes and gpus parameters wrong, or is my code wrong? I would appreciate it if you could help me out. The code below is a simplified sample of DDP.
import argparse
import os
from datetime import datetime

import torch
import torch.nn as nn
import torch.distributed as dist
import torch.multiprocessing as mp

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('--epochs', default=200, type=int, metavar='N',
                        help='number of total epochs to run')
    args = parser.parse_args()
    args.world_size = args.gpus * args.nodes
    os.environ['MASTER_ADDR'] = 'host1'
    os.environ['MASTER_PORT'] = '7777'
    mp.spawn(train, nprocs=args.gpus, args=(args,))

def train(gpu, args):
    rank = args.nr * args.gpus + gpu
    dist.init_process_group(
        backend='nccl',
        init_method='env://',
        world_size=args.world_size,
        rank=rank
    )
    torch.manual_seed(0)
    model = ConvNet()
    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    batch_size = 100
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    # Wrapper around our model to handle parallel training
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    # Data loading code
    train_dataset = get_datasets()
    # Sampler that takes care of the distribution of the batches such that
    # the data is not repeated in the iteration and sampled accordingly
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=args.world_size,
        rank=rank
    )
    # We pass in the train_sampler which can be used by the DataLoader
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)
    start = datetime.now()
    total_step = len(train_loader)
    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (i + 1) % 100 == 0 and gpu == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch + 1,
                    args.epochs,
                    i + 1,
                    total_step,
                    loss.item())
                )
    if gpu == 0:
        print("Training complete")
We are facing a very strange issue. We tested the exact same model in two different "execution" settings. In the first case, given a certain number of epochs, we train on mini-batches for one epoch and then test on the validation set following the same criteria. Then we go on to the next epoch. Naturally, before each training epoch we call model.train(), and before validation we switch to model.eval().
Then we take the exact same model (same init, same dataset, same epochs, etc.) and just train it, without validation after each epoch.
Looking only at performance on the training set, we observed that, even with all seeds fixed, the two training procedures evolve differently and produce quite different metrics (losses, accuracy, and so on). Specifically, the training-only procedure performs worse.
We also observe the following things:
It is not a reproducibility issue, because multiple executions of the same procedure produce exactly the same results (and this is intended);
Removing the dropout, the problem appears to vanish;
The BatchNorm1d layer, which also behaves differently between training and evaluation, seems to work properly;
The issue still happens if we move training from TPUs to CPUs.
We are working with, and have tried, PyTorch 1.6, PyTorch nightly, and XLA 1.6.
We lost about one full day trying to tackle this issue (and no, we cannot avoid using dropout). Does anyone have any idea how to solve this?
Thank you very much!
P.S. Here is the code employed for the training (on CPU).
def sigmoid(x):
    return 1 / (1 + torch.exp(-x))
def _run(model, EPOCHS, training_data_in, validation_data_in=None):
    def train_fn(train_dataloader, model, optimizer, criterion):
        running_loss = 0.
        running_accuracy = 0.
        running_tp = 0.
        running_tn = 0.
        running_fp = 0.
        running_fn = 0.
        model.train()
        for batch_idx, (ecg, spo2, labels) in enumerate(train_dataloader, 1):
            optimizer.zero_grad()
            outputs = model(ecg)
            loss = criterion(outputs, labels)
            loss.backward()  # calculate the gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()  # update the network weights
            running_loss += loss.item()
            predicted = torch.round(sigmoid(outputs.data))  # apply the sigmoid here, since it is not included in the model
            running_accuracy += (predicted == labels).sum().item() / labels.size(0)
            fp = ((predicted - labels) == 1.).sum().item()
            fn = ((predicted - labels) == -1.).sum().item()
            tp = ((predicted + labels) == 2.).sum().item()
            tn = ((predicted + labels) == 0.).sum().item()
            running_tp += tp
            running_fp += fp
            running_tn += tn
            running_fn += fn
        retval = {'loss': running_loss / batch_idx,
                  'accuracy': running_accuracy / batch_idx,
                  'tp': running_tp,
                  'tn': running_tn,
                  'fp': running_fp,
                  'fn': running_fn
                  }
        return retval
    def valid_fn(valid_dataloader, model, criterion):
        running_loss = 0.
        running_accuracy = 0.
        running_tp = 0.
        running_tn = 0.
        running_fp = 0.
        running_fn = 0.
        model.eval()
        for batch_idx, (ecg, spo2, labels) in enumerate(valid_dataloader, 1):
            outputs = model(ecg)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            predicted = torch.round(sigmoid(outputs.data))  # apply the sigmoid here, since it is not included in the model
            running_accuracy += (predicted == labels).sum().item() / labels.size(0)
            fp = ((predicted - labels) == 1.).sum().item()
            fn = ((predicted - labels) == -1.).sum().item()
            tp = ((predicted + labels) == 2.).sum().item()
            tn = ((predicted + labels) == 0.).sum().item()
            running_tp += tp
            running_fp += fp
            running_tn += tn
            running_fn += fn
        retval = {'loss': running_loss / batch_idx,
                  'accuracy': running_accuracy / batch_idx,
                  'tp': running_tp,
                  'tn': running_tn,
                  'fp': running_fp,
                  'fn': running_fn
                  }
        return retval
    # Defining data loaders
    train_dataloader = torch.utils.data.DataLoader(training_data_in, batch_size=BATCH_SIZE, shuffle=True, num_workers=1)
    if validation_data_in is not None:
        validation_dataloader = torch.utils.data.DataLoader(validation_data_in, batch_size=BATCH_SIZE, shuffle=False, num_workers=1)
    # Defining the loss function
    criterion = nn.BCEWithLogitsLoss()
    # Defining the optimizer
    import torch.optim as optim
    optimizer = optim.AdamW(model.parameters(), lr=3e-4, amsgrad=False, eps=1e-07)
    # Training code
    metrics_history = {"loss": [], "accuracy": [], "precision": [], "recall": [], "f1": [], "specificity": [], "accuracy_bis": [], "tp": [], "tn": [], "fp": [], "fn": [],
                       "val_loss": [], "val_accuracy": [], "val_precision": [], "val_recall": [], "val_f1": [], "val_specificity": [], "val_accuracy_bis": [], "val_tp": [], "val_tn": [], "val_fp": [], "val_fn": []}
    train_begin = time.time()
    for epoch in range(EPOCHS):
        start = time.time()
        print("EPOCH:", epoch + 1)
        train_metrics = train_fn(train_dataloader=train_dataloader,
                                 model=model,
                                 optimizer=optimizer,
                                 criterion=criterion)
        metrics_history["loss"].append(train_metrics["loss"])
        metrics_history["accuracy"].append(train_metrics["accuracy"])
        metrics_history["tp"].append(train_metrics["tp"])
        metrics_history["tn"].append(train_metrics["tn"])
        metrics_history["fp"].append(train_metrics["fp"])
        metrics_history["fn"].append(train_metrics["fn"])
        precision = train_metrics["tp"] / (train_metrics["tp"] + train_metrics["fp"]) if train_metrics["tp"] > 0 else 0
        recall = train_metrics["tp"] / (train_metrics["tp"] + train_metrics["fn"]) if train_metrics["tp"] > 0 else 0
        specificity = train_metrics["tn"] / (train_metrics["tn"] + train_metrics["fp"]) if train_metrics["tn"] > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision * recall > 0 else 0
        metrics_history["precision"].append(precision)
        metrics_history["recall"].append(recall)
        metrics_history["f1"].append(f1)
        metrics_history["specificity"].append(specificity)
        if validation_data_in is not None:
            # Calculate the metrics on the validation data, in the same way as done for training
            with torch.no_grad():  # don't keep track of the info necessary to calculate the gradients
                val_metrics = valid_fn(valid_dataloader=validation_dataloader,
                                       model=model,
                                       criterion=criterion)
            metrics_history["val_loss"].append(val_metrics["loss"])
            metrics_history["val_accuracy"].append(val_metrics["accuracy"])
            metrics_history["val_tp"].append(val_metrics["tp"])
            metrics_history["val_tn"].append(val_metrics["tn"])
            metrics_history["val_fp"].append(val_metrics["fp"])
            metrics_history["val_fn"].append(val_metrics["fn"])
            val_precision = val_metrics["tp"] / (val_metrics["tp"] + val_metrics["fp"]) if val_metrics["tp"] > 0 else 0
            val_recall = val_metrics["tp"] / (val_metrics["tp"] + val_metrics["fn"]) if val_metrics["tp"] > 0 else 0
            val_specificity = val_metrics["tn"] / (val_metrics["tn"] + val_metrics["fp"]) if val_metrics["tn"] > 0 else 0
            val_f1 = 2 * val_precision * val_recall / (val_precision + val_recall) if val_precision * val_recall > 0 else 0
            metrics_history["val_precision"].append(val_precision)
            metrics_history["val_recall"].append(val_recall)
            metrics_history["val_f1"].append(val_f1)
            metrics_history["val_specificity"].append(val_specificity)
            print(" > Training/validation loss:", round(train_metrics['loss'], 4), round(val_metrics['loss'], 4))
            print(" > Training/validation accuracy:", round(train_metrics['accuracy'], 4), round(val_metrics['accuracy'], 4))
            print(" > Training/validation precision:", round(precision, 4), round(val_precision, 4))
            print(" > Training/validation recall:", round(recall, 4), round(val_recall, 4))
            print(" > Training/validation f1:", round(f1, 4), round(val_f1, 4))
            print(" > Training/validation specificity:", round(specificity, 4), round(val_specificity, 4))
        else:
            print(" > Training loss:", round(train_metrics['loss'], 4))
            print(" > Training accuracy:", round(train_metrics['accuracy'], 4))
            print(" > Training precision:", round(precision, 4))
            print(" > Training recall:", round(recall, 4))
            print(" > Training f1:", round(f1, 4))
            print(" > Training specificity:", round(specificity, 4))
        print("Completed in:", round(time.time() - start, 1), "seconds \n")
    print("Training completed in:", round((time.time() - train_begin) / 60, 1), "minutes")
    # Save the model weights
    torch.save(model.state_dict(), './nnet_model.pt')
    # Save the metrics history
    torch.save(metrics_history, 'training_history')
And here is the function that initializes the model and sets the seeds, called before each execution of "_run":
def reinit_model():
    torch.manual_seed(42)
    np.random.seed(42)
    random.seed(42)
    net = Net()  # the model
    return net
OK, I found the issue.
The problem is that running the evaluation apparently changes some random-seed states, and this affects the training phase.
The solution is thus as follows:
at the beginning of "_run()", set all seed states to the desired value, e.g. 42, then save those states to disk;
at the beginning of "train_fn()", read the seed states from disk and set them;
at the end of "train_fn()", save the seed states to disk.
For instance, when running on TPU with XLA, the following instructions have to be used:
at the beginning of "_run()": xm.set_rng_state(42), then xm.save(xm.get_rng_state(), 'xm_seed')
at the beginning of "train_fn()": xm.set_rng_state(torch.load('xm_seed'), device=device) (you can also print the state here for verification purposes with xm.master_print(xm.get_rng_state()))
at the end of "train_fn()": xm.save(xm.get_rng_state(), 'xm_seed')
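The same idea can be written with plain PyTorch RNG-state calls when running on CPU/GPU; a minimal sketch, reusing valid_fn from the code above:
# Save the RNG state before validation and restore it afterwards, so that
# evaluating the model does not perturb the training-time random stream
# (dropout masks, data shuffling, etc.).
rng_state = torch.get_rng_state()            # CPU RNG state
# cuda_state = torch.cuda.get_rng_state()    # per-device state, if a GPU is used
with torch.no_grad():
    val_metrics = valid_fn(validation_dataloader, model, criterion)
torch.set_rng_state(rng_state)
# torch.cuda.set_rng_state(cuda_state)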
I have a model training and I got this plot. It is trained on audio (about 70K clips of around 5-10 s each) and no augmentation is being done. I have tried the following to avoid overfitting:
Reduce the complexity of the model by reducing the number of GRU cells and hidden dimensions.
Add dropout in each layer.
Try a larger dataset.
What I am not sure about is whether my calculation of training loss and validation loss is correct. It is something like this. I am using drop_last=True and the CTC loss criterion.
train_data_len = len(train_loader.dataset)
valid_data_len = len(valid_loader.dataset)
epoch_train_loss = 0
epoch_val_loss = 0
train_losses = []
valid_losses = []
model.train()
for e in range(n_epochs):
    t0 = time.time()
    # batch loop
    running_loss = 0.0
    for batch_idx, _data in enumerate(train_loader, 1):
        # Calculate output ...
        # bla bla
        loss = criterion(output, labels.float(), input_lengths, label_lengths)
        loss.backward()
        optimizer.step()
        scheduler.step()
        # loss stats
        running_loss += loss.item() * specs.size(0)
    t_t = time.time() - t0
    ######################
    # validate the model #
    ######################
    with torch.no_grad():
        model.eval()
        tv = time.time()
        running_val_loss = 0.0
        for batch_idx_v, _data in enumerate(valid_loader, 1):
            # bla, bla
            val_loss = criterion(output, labels.float(), input_lengths, label_lengths)
            running_val_loss += val_loss.item() * specs.size(0)
        print("Epoch {}: Training took {:.2f} [s]\tValidation took: {:.2f} [s]\n".format(e + 1, t_t, time.time() - tv))
    epoch_train_loss = running_loss / train_data_len
    epoch_val_loss = running_val_loss / valid_data_len
    train_losses.append(epoch_train_loss)
    valid_losses.append(epoch_val_loss)
    print('Epoch: {} Losses\tTraining Loss: {:.6f}\tValidation Loss: {:.6f}'.format(
        e + 1, epoch_train_loss, epoch_val_loss))
    model.train()
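For what it's worth, a small worked sketch of what that bookkeeping computes, assuming nn.CTCLoss with its default reduction='mean' (so loss.item() is already averaged over the batch); all numbers below are made up for illustration:
# Hypothetical numbers: 3 kept batches of 4 samples each (drop_last=True),
# with per-batch mean CTC losses of 2.0, 1.5 and 1.0.
batch_size = 4
batch_means = [2.0, 1.5, 1.0]
running_loss = sum(m * batch_size for m in batch_means)             # 18.0
epoch_train_loss = running_loss / (batch_size * len(batch_means))   # 1.5, per-sample average
# Note: dividing by len(train_loader.dataset) instead (as in the code above)
# also counts the samples discarded by drop_last=True, so the reported
# average comes out slightly lower than the true per-sample mean.
print(epoch_train_loss)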
Background: it's about loss jittering that appears at the beginning of every training epoch. When the dataloader feeds the first batch into the network, the loss value always rises suddenly, then returns to normal from the second batch onward and continues to decline. The curve looks strange. I need your help!
for epoch in range(begin_epoch, end_epoch):
    print('PROGRESS: %.2f%%' % (100.0 * epoch / end_epoch))
    # set epoch as random seed of sampler while distributed training
    if train_sampler is not None and hasattr(train_sampler, 'set_epoch'):
        train_sampler.set_epoch(epoch)
    # reset metrics
    metrics.reset()
    # set net to train mode
    net.train()
    # clear the parameter gradients
    # optimizer.zero_grad()
    # init end time
    end_time = time.time()
    if isinstance(lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
        name, value = validation_monitor.metrics.get()
        val = value[name.index(validation_monitor.host_metric_name)]
        lr_scheduler.step(val, epoch)
    # training
    train_loader_iter = iter(train_loader)
    for nbatch in range(total_size):
        try:
            batch = next(train_loader_iter)
        except StopIteration:
            print('reset loader .. ')
            train_loader_iter = iter(train_loader)
            batch = next(train_loader_iter)
        global_steps = total_size * epoch + nbatch
        os.environ['global_steps'] = str(global_steps)
        # record time
        data_in_time = time.time() - end_time
        # transfer data to GPU
        data_transfer_time = time.time()
        batch = to_cuda(batch)
        data_transfer_time = time.time() - data_transfer_time
        # forward
        forward_time = time.time()
        outputs, loss = net(*batch)
        loss = loss.mean()
        if gradient_accumulate_steps > 1:
            loss = loss / gradient_accumulate_steps
        forward_time = time.time() - forward_time
        # backward
        backward_time = time.time()
        if fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        backward_time = time.time() - backward_time
        optimizer_time = time.time()
        if (global_steps + 1) % gradient_accumulate_steps == 0:
            # clip gradient
            if clip_grad_norm > 0:
                if fp16:
                    total_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                                                clip_grad_norm)
                else:
                    total_norm = torch.nn.utils.clip_grad_norm_(net.parameters(),
                                                                clip_grad_norm)
                if writer is not None:
                    writer.add_scalar(tag='grad-para/Total-Norm',
                                      scalar_value=float(total_norm),
                                      global_step=global_steps)
            optimizer.step()
            # step LR scheduler
            if lr_scheduler is not None and not isinstance(lr_scheduler,
                                                           torch.optim.lr_scheduler.ReduceLROnPlateau):
                lr_scheduler.step()
            # clear the parameter gradients
            optimizer.zero_grad()
        optimizer_time = time.time() - optimizer_time
        # update metric
        metric_time = time.time()
        metrics.update(outputs)
        if writer is not None and nbatch % 50 == 0:
            with torch.no_grad():
                for group_i, param_group in enumerate(optimizer.param_groups):
                    writer.add_scalar(tag='Initial-LR/Group_{}'.format(group_i),
                                      scalar_value=param_group['initial_lr'],
                                      global_step=global_steps)
                    writer.add_scalar(tag='LR/Group_{}'.format(group_i),
                                      scalar_value=param_group['lr'],
                                      global_step=global_steps)
                writer.add_scalar(tag='Train-Loss',
                                  scalar_value=float(loss.item()),
                                  global_step=global_steps)
                name, value = metrics.get()
                for n, v in zip(name, value):
                    if 'Logits' in n:
                        writer.add_scalar(tag='Train-Logits/' + n,
                                          scalar_value=v,
                                          global_step=global_steps)
                    else:
                        writer.add_scalar(tag='Train-' + n,
                                          scalar_value=v,
                                          global_step=global_steps)
                for k, v in outputs.items():
                    if 'score' in k:
                        writer.add_histogram(tag=k,
                                             values=v,
                                             global_step=global_steps)
        metric_time = time.time() - metric_time
You have a batch in your dataset that has a high loss, that's it.
It is not that common for people to store metrics for every batch; usually it is the average over the epoch (or over multiple batch steps) that is stored. You won't see such spikes if you store averages.
You could also reduce these spikes by shuffling your data so that the problematic batch is spread out across the epoch. In general, it is good practice to shuffle at the beginning of each epoch.
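As a rough illustration of both points (epoch-level averaging plus per-epoch shuffling), assuming a map-style dataset, a SummaryWriter named writer, and a hypothetical training_step helper:
# Log the epoch-average loss instead of every per-batch value; a single
# high-loss batch then barely moves the curve.
loader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)  # reshuffled every epoch
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for batch in loader:
        loss = training_step(batch)   # hypothetical helper: forward + backward + step, returns the batch loss
        epoch_loss += loss.item()
    writer.add_scalar('Train-Loss/epoch-avg', epoch_loss / len(loader), epoch)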