I am building a multi-class Vision Transformer network. When passing my values through my loss function, it always returns zero. My output layer consists of 37 dense layers, each with a sigmoid unit on top of it (see the model code below). criterion is created with nn.CrossEntropyLoss(). The output of criterion is 0.0 for every iteration. I am using a Colab notebook. I printed out the output and label for one iteration:
for output, label in zip(iter(outputs_t), iter(labels_t)):
    loss += criterion(
        output,
        # reshape label from (Batch_Size) to (Batch_Size, 1)
        torch.reshape(label, (label.shape[0], 1))
    )
output: tensor([[0.1534],
[0.5797],
[0.6554],
[0.4066],
[0.2683],
[0.1773],
[0.7410],
[0.5136],
[0.5695],
[0.3970],
[0.4317],
[0.7216],
[0.8336],
[0.4517],
[0.4004],
[0.5963],
[0.3079],
[0.5956],
[0.3876],
[0.2327],
[0.7919],
[0.2722],
[0.3064],
[0.9779],
[0.8358],
[0.1851],
[0.2869],
[0.3128],
[0.4301],
[0.4740],
[0.6689],
[0.7588]], device='cuda:0', grad_fn=<UnbindBackward0>)
label: tensor([[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[0.],
[1.],
[0.]], device='cuda:0')
My Model:
class vit_large_patch16_224_multiTaskNet(nn.Module):
    def __init__(self, output_classes, frozen_feature_layers=False):
        super().__init__()
        vit_base_patch16_224 = timm.create_model('vit_large_patch16_224', pretrained=True)
        self.is_frozen = frozen_feature_layers
        # here we get all the modules (layers) before the fc layer at the end
        self.features = nn.ModuleList(vit_base_patch16_224.children())[:-1]
        self.features = nn.Sequential(*self.features)
        if frozen_feature_layers:
            self.freeze_feature_layers()

        # now lets add our new layers
        in_features = vit_base_patch16_224.head.in_features
        # it helps with performance. you can play with it
        # create more layers, play/experiment with them.
        self.fc0 = nn.Linear(in_features, 512)
        self.bn_pu = nn.BatchNorm1d(512, eps=1e-5)
        self.output_modules = nn.ModuleList()
        for i in range(output_classes):
            self.output_modules.append(nn.Linear(512, 1))

        # initialize all fc layers to xavier
        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.xavier_normal_(m.weight, gain=1)

    def forward(self, input_imgs):
        output = self.features(input_imgs)
        final_cs_token = output[:, 0]  # CLS token
        output = self.bn_pu(F.relu(self.fc0(final_cs_token)))
        output_list = list()
        for output_modul in self.output_modules:
            output_list.append(torch.sigmoid(output_modul(output)))
        # convert list to tensor of shape (output_classes, batch, 1)
        output_tensor = torch.stack(output_list)
        # swap to (batch, output_classes, 1)
        output_tensor = torch.swapaxes(output_tensor, 0, 1)
        return output_tensor

    def _set_freeze_(self, status):
        for n, p in self.features.named_parameters():
            p.requires_grad = status
        # for m in self.features.children():
        #     for p in m.parameters():
        #         p.requires_grad = status

    def freeze_feature_layers(self):
        self._set_freeze_(False)

    def unfreeze_feature_layers(self):
        self._set_freeze_(True)
You are in a multi-label classification scenario, which means you can consider your problem as c binary classifications done in parallel (where c is the total number of classes). This also explains why nn.CrossEntropyLoss currently returns 0: each criterion call receives a single logit per instance, and the softmax over a one-element class dimension is identically 1, so the log-probability is always 0 regardless of the target. Let output_t be the logit tensor produced by your model's last linear layers (i.e. with the torch.sigmoid calls removed, since the loss below applies the sigmoid itself) and target the ground-truth tensor containing the true class states for each instance in the batch. You can apply nn.BCEWithLogitsLoss, since it works with multi-dimensional tensors out of the box:
With dummy inputs:
>>> output_t = torch.rand(47, 32, 1)
>>> target = torch.randint(0, 2, (47, 32, 1)).float()
Then initializing and calling the loss function:
>>> loss = nn.BCEWithLogitsLoss()
>>> loss(output_t, target)
tensor(0.7246)
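To map this back onto the model above, a minimal sketch with dummy shapes matching the question's 37 heads and batch size 32 (assuming, as noted, that the torch.sigmoid calls are dropped from forward so each head returns a raw logit):

import torch
import torch.nn as nn

criterion = nn.BCEWithLogitsLoss()

# dummy stand-ins: raw logits of shape (batch, 37, 1) and float targets in {0., 1.}
outputs_t = torch.randn(32, 37, 1, requires_grad=True)
labels_t = torch.randint(0, 2, (32, 37, 1)).float()

loss = criterion(outputs_t, labels_t)  # one call replaces the per-head loop
loss.backward()

# why the original setup always returned 0: with a single logit per head,
# log_softmax over a one-element class dimension is identically 0
ce = nn.CrossEntropyLoss()
print(ce(torch.randn(32, 1), torch.zeros(32, dtype=torch.long)))  # tensor(0.)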
Each training example is a sequence of how 5 input variables vary across 5 timesteps,
i.e. the input is [ip0_0, ip1_0,...,ip4_0], [ip0_1, ip1_1,...,ip4_1], ..., [ip0_4, ip1_4,...,ip4_4]
Each training example has a label 0 or 1
I want to create a RNN that predicts the label from the inputs.
I see two ways of doing it
See RNNModelMultiforward below. High level idea is
Have a single torch.RNN()
Initialize hidden state to 0
Run the following 5 times
out, h = RNN([ip0_i,...,ip4_i], h), where i = 0,...,4
Run a feedforward layer that predicts the label from the final hidden state h
Is this the right way to do it, or should I use a torch.nn.RNN with num_layers = 5 and run it once to get the output, i.e. hn = RNN([[ip0_0,...,ip4_0],.....,[ip0_4,...,ip4_4]], h0)? (See RNNModelMultilayer below.)
RNNModel multiforward
# Create RNN Model
class RNNModelMultiforward(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, num_epochs, act_fn='relu'):
        super(RNNModelMultiforward, self).__init__()
        # Number of hidden dimensions
        self.hidden_dim = hidden_dim
        # Number of hidden layers
        self.layer_dim = layer_dim
        # Number of times the RNN will be run (the input should be num_epochs x input_dim in size)
        self.num_epochs = num_epochs
        # RNN
        self.rnn = nn.RNN(input_dim, hidden_dim, layer_dim, batch_first=True, nonlinearity=act_fn)
        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        h = Variable(torch.zeros(self.layer_dim, self.hidden_dim))
        for ts in range(0, self.num_epochs):
            out, h = self.rnn(x[ts], h)
        out = self.fc(h)
        return out
RNNModel multilayer
# Create RNN Model
class RNNModelMultilayer(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, act_fn='relu'):
        super(RNNModelMultilayer, self).__init__()
        # Number of hidden dimensions
        self.hidden_dim = hidden_dim
        # Number of hidden layers
        self.layer_dim = layer_dim
        # RNN
        self.rnn = nn.RNN(input_dim, hidden_dim, layer_dim, batch_first=True, nonlinearity=act_fn)
        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = Variable(torch.zeros(self.layer_dim, self.hidden_dim))
        out, hn = self.rnn(x, h0)
        out = self.fc(hn[4])
        return out
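For what it's worth, num_layers in nn.RNN stacks recurrent layers vertically; the recurrence over timesteps happens automatically when the whole sequence is passed in one call (note also that the initial hidden state for batched input should have shape (num_layers, batch, hidden)). A minimal sketch under assumed sizes (5 features, 5 timesteps, batch of 8, hidden size 16):

import torch
import torch.nn as nn

rnn = nn.RNN(input_size=5, hidden_size=16, num_layers=1, batch_first=True)
fc = nn.Linear(16, 1)

x = torch.randn(8, 5, 5)    # (batch, timesteps, features)
h0 = torch.zeros(1, 8, 16)  # (num_layers, batch, hidden)

out, hn = rnn(x, h0)        # the RNN unrolls over all 5 timesteps internally
logit = fc(hn[-1])          # predict the label from the final hidden state
print(logit.shape)          # torch.Size([8, 1])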
Hello guys, I've joined a university-level image recognition competition.
In the test, they will give two images (people's faces) and my model needs to detect whether the pair of images shows the same person or not.
My model is ResNet-18 with IR blocks and SE blocks, and it uses ArcFace loss.
I can only use the MS1M dataset, with a total of 86876 classes.
The problem is that the loss is getting better, but the accuracy is 0 and not changing.
Here's part of the code I'm working on.
Train
def train_model(model, net, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        for phase in ['train']:
            if phase == 'train':
                model.train()  # Set model to training mode
            running_loss = 0.0
            running_corrects = 0
            # Iterate over data.
            for inputs, labels in notebook.tqdm(dataloader):
                inputs = inputs.to(device)
                labels = labels.to(device).long()
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    features = model(inputs)
                    outputs = net(features, labels)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                scheduler.step()
            epoch_loss = running_loss / len(dataloader)
            epoch_acc = running_corrects.double() / len(dataloader)
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            # deep copy the model
            if phase == 'train' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save({'epoch': epoch,
                            'mode_state_dict': model.state_dict(),
                            'fc_state_dict': net.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'scheduler': scheduler.state_dict(),
                            }, f'/content/drive/MyDrive/inha_data/training_saver/training_stat{epoch}.pth')
                print(f'finished {epoch} and saved model_save_{epoch}.pt')
        print()
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best train Acc: {:4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)
    torch.save(model.state_dict(), 'model_save.pt')
    return model
Parameters
train_dataset = MS1MDataset('train')
dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # set device
num_classes = 86876
# normal classifier
# net = nn.Sequential(nn.Linear(512, num_classes))
# Feature extractor backbone, input is a 112x112 image, output is a 512-dim feature vector
model_ft = resnet18(True)
# set metric
metric_fc = metrics.ArcMarginProduct(512, num_classes, s=30.0, m=0.50, easy_margin=False)
metric_fc.to(device)
# net = net.to(device)
model_ft = model_ft.to(device)
criterion = nn.CrossEntropyLoss()
# Observe that all parameters are being optimized
optimizer_ft = torch.optim.Adam([{'params': model_ft.parameters()}, {'params': metric_fc.parameters()}],
                                lr=0.1)
# Decay LR by a factor of 0.1 every 4 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=4, gamma=0.1)
ArcFace
from __future__ import print_function
from __future__ import division
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
import math


class ArcMarginProduct(nn.Module):
    r"""Implementation of large margin arc distance:
    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        s: norm of input feature
        m: margin
        cos(theta + m)
    """
    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)
        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # one_hot = torch.zeros(cosine.size(), requires_grad=True, device='cuda')
        one_hot = torch.zeros(cosine.size(), device='cuda')
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        # ------------- torch.where(out_i = {x_i if condition_i else y_i}) -------------
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)  # you can use torch.where if your torch.__version__ is 0.4
        output *= self.s
        # print(output)
        return output
dataset
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.125, contrast=0.125, saturation=0.125),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

# train_ms1_data = torchvision.datasets.ImageFolder('/content/drive/MyDrive/inha_data/train', transform=data_transforms)


class MS1MDataset(Dataset):
    def __init__(self, split):
        self.file_list = '/content/drive/MyDrive/inha_data/ID_List.txt'
        self.images = []
        self.labels = []
        self.transformer = data_transforms['train']
        with open(self.file_list) as f:
            files = f.read().splitlines()
        for i, fi in enumerate(files):
            fi = fi.split()
            image = "/content/" + fi[1]
            label = int(fi[0])
            self.images.append(image)
            self.labels.append(label)

    def __getitem__(self, index):
        img = Image.open(self.images[index])
        img = self.transformer(img)
        label = self.labels[index]
        return img, label

    def __len__(self):
        return len(self.images)
You can try to use a smaller m in ArcFace, even a negative value. The margin m is added to the target-class angle (phi = cos(theta + m)), which pushes the target-class logit down, so with 86876 classes the argmax almost never lands on the true label early in training, and the reported accuracy stays at 0 even while the loss improves.
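For example, reusing the constructor call from the question with only m changed (0.2 is an arbitrary illustrative value; a negative value is also possible):

# smaller margin; everything else as in the original setup
metric_fc = metrics.ArcMarginProduct(512, num_classes, s=30.0, m=0.2, easy_margin=False)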
Here is some part of my PyTorch code:
test_loader = DataLoader(dataset=test_loader_hibiscus, batch_size=1, shuffle=False, num_workers=0)

test_losses = []
y_pred_list = []
feat_list = []

with torch.no_grad():
    model.eval()
    test_loss = 0.0
    if expe_temoin == False:
        for test_dwi, test_adc, test_tmax, test_cbf, test_cbv, test_label in test_loader:
            test_dwi = test_dwi.to(device)
            test_adc = test_adc.to(device)
            test_tmax = test_tmax.to(device)
            test_cbf = test_cbf.to(device)
            test_cbv = test_cbv.to(device)

            in_imgs = torch.cat((train_dwi, train_adc, train_tmax, train_cbf, train_cbv), dim=1)
            out_recon, my_feat = model(in_imgs)
            print("my_feat", my_feat[0].shape)
But it prints:
my_feat torch.Size([2, 512, 1, 24, 24])
Could someone please tell me why the leading dimension is 2 (the batch?). Thanks!
Hint: when I run with a test data size of 26 it's OK, but with a data size of 25 the batch gets messed up! Is there something about odd and even sizes?!
Here is the UNet3D model for 3D reconstruction and segmentation:
class Abstract3DUNet(nn.Module):
    def __init__(self, in_channels, out_channels, final_sigmoid, basic_module, f_maps=64, layer_order='gcr',
                 num_groups=8, num_levels=4, is_segmentation=True, testing=False,
                 conv_kernel_size=3, pool_kernel_size=2, conv_padding=1, **kwargs):
        super(Abstract3DUNet, self).__init__()
        self.testing = testing
        if isinstance(f_maps, int):
            f_maps = number_of_features_per_level(f_maps, num_levels=num_levels)
        assert isinstance(f_maps, list) or isinstance(f_maps, tuple)
        assert len(f_maps) > 1, "Required at least 2 levels in the U-Net"
        # create encoder path
        self.encoders = create_encoders(in_channels, f_maps, basic_module, conv_kernel_size, conv_padding,
                                        layer_order, num_groups, pool_kernel_size)
        # create decoder path
        self.decoders = create_decoders(f_maps, basic_module, conv_kernel_size, conv_padding, layer_order,
                                        num_groups, upsample=True)
        # in the last layer a 1×1 convolution reduces the number of output
        # channels to the number of labels
        self.final_conv = nn.Conv3d(f_maps[0], out_channels, 1)
        if is_segmentation:
            # semantic segmentation problem
            if final_sigmoid:
                self.final_activation = nn.Sigmoid()
            else:
                self.final_activation = nn.Softmax(dim=1)
        else:
            # regression problem
            self.final_activation = None

    def forward(self, x):
        # encoder part
        encoders_features = []
        my_feat = []
        for encoder in self.encoders:
            x = encoder(x)
            # reverse the encoder outputs to be aligned with the decoder
            encoders_features.insert(0, x)
        # remove the last encoder's output from the list
        # !!remember: it's the 1st in the list
        my_feat = encoders_features[0:]
        encoders_features = encoders_features[1:]
        # decoder part
        for decoder, encoder_features in zip(self.decoders, encoders_features):
            # pass the output from the corresponding encoder and the output
            # of the previous decoder
            x = decoder(encoder_features, x)
        x = self.final_conv(x)
        # apply final_activation (i.e. Sigmoid or Softmax) only during prediction.
        # During training the network outputs logits and it's up to the user to
        # normalize them before visualising with tensorboard or computing validation metrics
        if self.testing and self.final_activation is not None:
            x = self.final_activation(x)
        return x, my_feat


class UNet3D(Abstract3DUNet):
    def __init__(self, in_channels, out_channels, final_sigmoid=True, f_maps=64, layer_order='gcr',
                 num_groups=8, num_levels=4, is_segmentation=True, conv_padding=1, **kwargs):
        super(UNet3D, self).__init__(in_channels=in_channels,
                                     out_channels=out_channels,
                                     final_sigmoid=final_sigmoid,
                                     basic_module=DoubleConv,
                                     f_maps=f_maps,
                                     layer_order=layer_order,
                                     num_groups=num_groups,
                                     num_levels=num_levels,
                                     is_segmentation=is_segmentation,
                                     conv_padding=conv_padding,
                                     **kwargs)
My train batch size was 3! When I changed it to 2 or 4, the problem was solved!
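For reference, torch.cat along dim=1 only stacks channels and leaves the batch dimension untouched, so a leading 2 must already be present in the tensors being concatenated (note that the test loop above concatenates train_* tensors, not the test_* tensors it just moved to the device). A quick check with hypothetical shapes:

import torch

# hypothetical 3D volumes: (batch, channels, D, H, W)
dwi = torch.randn(2, 1, 1, 24, 24)
adc = torch.randn(2, 1, 1, 24, 24)

in_imgs = torch.cat((dwi, adc), dim=1)
print(in_imgs.shape)  # torch.Size([2, 2, 1, 24, 24]) -- batch dim is still 2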
from __future__ import print_function
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
from tensorflow.examples.tutorials.mnist import input_data
import torch.optim as optim
import tensorflow.python.util.deprecation as deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False
import matplotlib.pyplot as plt
%matplotlib inline
from plot import plot_loss_and_acc
mnist = input_data.read_data_sets("MNIST_data", one_hot=False)
batch_size = 250
epoch_num = 10
lr = 0.0001
disp_freq = 20
def next_batch(train=True):
    # Reads the next batch of MNIST images and labels and returns them
    if train:
        batch_img, batch_label = mnist.train.next_batch(batch_size)
    else:
        batch_img, batch_label = mnist.test.next_batch(batch_size)

    batch_label = torch.from_numpy(batch_label).long()  # convert the numpy array into torch tensor
    batch_label = Variable(batch_label)                 # create a torch variable

    batch_img = torch.from_numpy(batch_img).float()     # convert the numpy array into torch tensor
    batch_img = Variable(batch_img)                     # create a torch variable
    return batch_img, batch_label
class MLP(nn.Module):
    def __init__(self, n_features, n_classes):
        super(MLP, self).__init__()
        self.layer1 = nn.Linear(n_features, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_classes)

    def forward(self, x, training=True):
        # a neural network with 2 hidden layers
        # x -> FC -> relu -> dropout -> FC -> relu -> dropout -> FC -> output
        x = F.relu(self.layer1(x))
        x = F.dropout(x, 0.5, training=training)
        x = F.relu(self.layer2(x))
        x = F.dropout(x, 0.5, training=training)
        x = self.layer3(x)
        return x

    def predict(self, x):
        # a function to predict the labels of a batch of inputs
        x = F.softmax(self.forward(x, training=False), dim=1)
        return x

    def accuracy(self, x, y):
        # a function to calculate the accuracy of label prediction for a batch of inputs
        # x: a batch of inputs
        # y: the true labels associated with x
        prediction = self.predict(x)
        maxs, indices = torch.max(prediction, 1)
        acc = 100 * torch.sum(torch.eq(indices.float(), y.float()).float()) / y.size()[0]
        print(acc.data)
        return acc.data
# define the neural network (multilayer perceptron)
net = MLP(784, 10)

# calculate the number of batches per epoch
batch_per_ep = mnist.train.num_examples // batch_size

# define the loss (criterion) and create an optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=lr)

print(' ')
print("__________Training__________________")

xArray = []
yLoss = []
yAcc = []
for ep in range(epoch_num):  # epochs loop
    for batch_n in range(batch_per_ep):  # batches loop
        features, labels = next_batch()
        # Reset gradients
        optimizer.zero_grad()
        # Forward pass
        output = net(features)
        loss = criterion(output, labels)
        # Backward pass and updates
        loss.backward()   # calculate the gradients (backpropagation)
        optimizer.step()  # update the weights
        if batch_n % disp_freq == 0:
            print('epoch: {} - batch: {}/{} '.format(ep, batch_n, batch_per_ep))
            xArray.append(ep)
            yLoss.append(loss.data)
            # yAcc.append(acc.data)
            print('loss: ', loss.data)
    print('__________________________________')

# test the accuracy on a batch of test data
features, labels = next_batch(train=False)
print("Result")
print('Test accuracy: ', net.accuracy(features, labels))
print('loss: ', loss.data)
accuracy = net.accuracy(features, labels)

# Loss Plot
# plotting the points
plt.plot(xArray, yLoss)
# naming the x axis
plt.xlabel('epoch')
# naming the y axis
plt.ylabel('loss')
# giving a title to my graph
plt.title('Loss Plot')
# function to show the plot
plt.show()

# Accuracy Plot
# plotting the points
plt.plot(xArray, yAcc)
# naming the x axis
plt.xlabel('epoch')
# naming the y axis
plt.ylabel('accuracy')
# giving a title to my graph
plt.title('Accuracy Plot')
# function to show the plot
plt.show()
I want to display the accuracy of my training dataset. I have managed to display and plot the loss, but I didn't manage to do it for accuracy. I know I am missing 1 or 2 lines of code, and I don't know how to add them.
I mean, if I can display the accuracy alongside each epoch like the loss, I can do the plotting myself.
Hi, replace this line:
print('epoch: {} - batch: {}/{} '.format(ep, batch_n, batch_per_ep))
with:
print('epoch: {} - batch: {}/{} - accuracy: {}'.format(ep, batch_n, batch_per_ep, net.accuracy(features, labels)))
Hope this helps.
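If you also want the Accuracy Plot at the end of the script to have data, one option (a sketch built on the variables already in your code) is to record the batch accuracy alongside the loss:

if batch_n % disp_freq == 0:
    acc = net.accuracy(features, labels)  # accuracy on the current training batch
    print('epoch: {} - batch: {}/{} - accuracy: {}'.format(ep, batch_n, batch_per_ep, acc))
    xArray.append(ep)
    yLoss.append(loss.data)
    yAcc.append(float(acc))  # so plt.plot(xArray, yAcc) at the end has data to show
    print('loss: ', loss.data)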
I use the SWA method to train a model in PyTorch.
SWA: https://pytorch.org/blog/stochastic-weight-averaging-in-pytorch/
My train code's loss jumps up to nan immediately.
The loss outputs are below:
1. loss: tensor(4.8463, device='cuda:0', grad_fn=<NllLossBackward>)
2. loss: tensor(118317.8516, device='cuda:0', grad_fn=<NllLossBackward>)
3. loss: tensor(5.7568e+22, device='cuda:0', grad_fn=<NllLossBackward>)
4. loss: tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
Without the SWA method, the loss doesn't jump up.
Are there any problems in the training code with the SWA method?
I would appreciate any advice, thank you.
#batch_size
batch_size = 5

#DataLoader
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

#dict
dataloaders_dict = {"train": train_dataloader, "val": val_dataloader}

#example shapes
#train :  torch.Size([5, 25, 32, 32])
#target : torch.Size([5])

def train_model_withSWA(net, dataloaders_dict, criterion, optimizer, num_epochs):
    loss_list = []
    acc_list = []
    #validation lists
    val_loss_list = []
    val_acc_list = []
    for epoch in tqdm(range(num_epochs)):
        print("Epoch{}/{}".format(epoch + 1, num_epochs))
        print("--------------------------")
        for phase in ["train", "val"]:
            if phase == "train":
                net.train()
            else:
                net.eval()
            epoch_loss = 0.0
            epoch_corrects = 0
            # if (epoch == 0) and (phase == "train"):
            #     continue
            for inputs, labels in dataloaders_dict[phase]:
                # initialize (zero) the optimizer gradients
                optimizer.zero_grad()
                with torch.set_grad_enabled(phase == "train"):
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                    outputs = net(inputs)
                    loss = criterion(outputs, labels)
                    print("loss:", loss)
                    _, preds = torch.max(outputs, 1)
                    if phase == "train":
                        loss.backward()
                        optimizer.step()
                epoch_loss += loss.item() * inputs.size(0)
                epoch_corrects += torch.sum(preds == labels.data)
            #for swa
            optimizer.swap_swa_sgd()
            epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)
            epoch_acc = epoch_corrects.double() / len(dataloaders_dict[phase].dataset)
            print("{} Loss:{:.4f} Acc:{:.4f}".format(phase, epoch_loss, epoch_acc))
            if phase == "train":
                loss_list.append(epoch_loss)  # epoch_loss is already a Python float
                acc_list.append(epoch_acc.cpu().numpy())
            else:
                val_loss_list.append(epoch_loss)
                val_acc_list.append(epoch_acc.cpu().numpy())
from torchcontrib.optim import SWA

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

#criterion
criterion = nn.CrossEntropyLoss()

net = net.to(device)
base_opt = torch.optim.SGD(net.parameters(), lr=0.1)
optimizer = SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)

### train with SWA
train_model_withSWA(net=net, dataloaders_dict=dataloaders_dict,
                    criterion=criterion,
                    optimizer=optimizer,
                    num_epochs=num_epochs)
#model' loss jump up to nan....
#loss: tensor(4.8463, device='cuda:0', grad_fn=<NllLossBackward>)
#loss: tensor(118317.8516, device='cuda:0', grad_fn=<NllLossBackward>)
#loss: tensor(5.7568e+22, device='cuda:0', grad_fn=<NllLossBackward>)
#loss: tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
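As an aside, the usage pattern in the SWA blog post linked above calls swap_swa_sgd() once, after the entire training loop, rather than once per epoch as in the code above. This is not necessarily the cause of the nan (the author later traced it to preprocessing), but for reference, a minimal sketch of that pattern (model, loader, and loss_fn are placeholders):

import torch
from torchcontrib.optim import SWA

base_opt = torch.optim.SGD(model.parameters(), lr=0.1)
opt = SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)
for inputs, labels in loader:  # full training loop
    opt.zero_grad()
    loss_fn(model(inputs), labels).backward()
    opt.step()
opt.swap_swa_sgd()  # swap in the averaged weights once, at the end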
In addition, I use the following Self-Attention and Positional Encoder code:
class Self_Attention(nn.Module):
    """ Self-Attention Layer """

    def __init__(self, in_dim):
        super(Self_Attention, self).__init__()
        # pointwise convolutions
        self.query_conv = nn.Conv2d(in_channels=in_dim, out_channels=in_dim, kernel_size=1)
        self.key_conv = nn.Conv2d(in_channels=in_dim, out_channels=in_dim, kernel_size=1)
        self.value_conv = nn.Conv2d(in_channels=in_dim, out_channels=in_dim, kernel_size=1)
        # softmax
        self.softmax = nn.Softmax(dim=-2)
        # output = x + gamma*o, with gamma initialized to 0
        self.gamma = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        x = x.to(device)
        X = x
        # B,C',W,H -> B,C',N
        proj_query = self.query_conv(X).view(X.shape[0], -1, X.shape[2] * X.shape[3])  # size: B,C',N
        proj_query = proj_query.permute(0, 2, 1)  # transpose
        proj_key = self.key_conv(X).view(X.shape[0], -1, X.shape[2] * X.shape[3])  # size: B,C',N
        # batched matrix multiplication
        S = torch.bmm(proj_query, proj_key)
        attention_map_T = self.softmax(S)
        attention_map = attention_map_T.permute(0, 2, 1)
        # Self-Attention Map
        proj_value = self.value_conv(X).view(X.shape[0], -1, X.shape[2] * X.shape[3])  # size: B,C,N
        o = torch.bmm(proj_value, attention_map.permute(0, 2, 1))
        o = o.view(X.shape[0], X.shape[1], X.shape[2], X.shape[3])
        out = x + self.gamma * o
        # print("gamma:", self.gamma)
        return out, attention_map
class PositionalEncoder(nn.Module):
    def __init__(self, d_model=300, max_seq_len=256):
        super(PositionalEncoder, self).__init__()
        self.d_model = d_model
        pe = torch.zeros(max_seq_len, d_model)
        # GPU
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        pe = pe.to(device)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))
        self.pe = pe.unsqueeze(0)
        self.pe.requires_grad = False

    def forward(self, x):
        x = x.to(device)
        ret = math.sqrt(self.d_model) * x + self.pe
        return ret
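As a side note, an idiomatic alternative (not in the original code) is to register pe as a buffer, which keeps it out of gradient tracking and moves it together with the module on .to(device); a sketch, with the class name PositionalEncoderBuf being mine:

import math
import torch
import torch.nn as nn

class PositionalEncoderBuf(nn.Module):
    # same encoding as above, but pe is stored via register_buffer
    def __init__(self, d_model=300, max_seq_len=256):
        super().__init__()
        self.d_model = d_model
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))
        self.register_buffer('pe', pe.unsqueeze(0))  # no grad, follows .to(device)

    def forward(self, x):
        return math.sqrt(self.d_model) * x + self.pe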
Without normalization in the data preprocessing, the loss doesn't jump up to nan, so this was caused by my preprocessing error. Even now I don't understand the relation between normalization and the loss jumping up with the SWA method... I'm sorry for the mistake.