My hardware is a Ryzen 5000 series cpu with an nvidia rtx 3060 gpu. I'm currently working on a school assignment involving using a deep learning model (implemented in PyTorch) to predict COVID diagnosis from CT slice images. The dataset can be found at this url on GitHub: https://github.com/UCSD-AI4H/COVID-CT
I've written a custom dataset that takes the images from the dataset and resizes them to 224x224. I've also converted all rgba or grayscale images to rgb using skimage.color. Other transforms include random horizontal and vertical flipping, as well as ToTensor(). To evaluate the model I've used sklearn.metrics to compute the AUC, F1 score, and accuracy of the model.
My trouble is that I can't get the model to train. After 10 epochs the loss has not decreased. I've tried adjusting the learning rate of my optimizer but it hasn't helped. Any recommendations/thoughts would be greatly appreciated. Thanks!
class RONANet(nn.Module):
    """Four-stage convolutional network with a selectable classifier head.

    The feature extractor stacks four ``Conv2d -> ReLU -> MaxPool2d`` stages
    (the first three with batch normalisation) and finishes with a global
    average pool, so it emits a ``(batch, 256, 1, 1)`` tensor whatever the
    input resolution is.

    Args:
        classifier_type: Selects the head. A value containing ``'fc'`` builds
            a three-layer MLP that produces two logits per sample; a value
            containing ``'conv'`` builds a 1x1 convolution that produces one
            logit per sample.

    Raises:
        ValueError: If ``classifier_type`` is ``None`` or names neither head.
    """

    # Channel count left after the global average pool at the end of
    # ``conv_layers``; this is what the classifier head actually receives.
    FEATURE_CHANNELS = 256

    def __init__(self, classifier_type=None):
        super(RONANet, self).__init__()
        self.classifier_type = classifier_type
        # ReLU and MaxPool2d hold no parameters, so one instance of each is
        # shared by every stage and by the fully connected head.
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.classifier = self.compose_classifier()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            self.relu,
            self.maxpool,
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            self.relu,
            self.maxpool,
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            self.relu,
            self.maxpool,
            nn.Conv2d(128, self.FEATURE_CHANNELS, kernel_size=3, stride=1, padding=1),
            self.relu,
            self.maxpool,
            nn.AdaptiveAvgPool2d(output_size=(1, 1)),
        )

    def compose_classifier(self):
        """Build the head selected by ``self.classifier_type``.

        Returns:
            An ``nn.Sequential`` holding either the MLP head or the 1x1
            convolution head.

        Raises:
            ValueError: If the type is ``None`` or matches neither head.
        """
        kind = self.classifier_type
        if kind is None:
            raise ValueError("classifier_type must contain 'fc' or 'conv', got None")
        if 'fc' in kind:
            # The extractor ends in a 1x1 global average pool, so Flatten
            # yields FEATURE_CHANNELS values per sample. The earlier
            # 14**2*256 assumed an unpooled 14x14 map and could never match.
            return nn.Sequential(
                nn.Flatten(),
                nn.Linear(self.FEATURE_CHANNELS, 256),
                self.relu,
                nn.Linear(256, 128),
                self.relu,
                nn.Linear(128, 2),
            )
        if 'conv' in kind:
            return nn.Sequential(
                nn.Conv2d(self.FEATURE_CHANNELS, 1, kernel_size=1, stride=1),
            )
        raise ValueError(
            f"classifier_type must contain 'fc' or 'conv', got {kind!r}"
        )

    def forward(self, x):
        """Return logits for a batch of 3-channel images.

        Returns:
            ``(batch, 2)`` logits for the ``'fc'`` head, or a flat ``(batch,)``
            vector of logits for the ``'conv'`` head.
        """
        features = self.conv_layers(x)
        out = self.classifier(features)
        if 'conv' in self.classifier_type:
            # (batch, 1, 1, 1) -> (batch,)
            out = out.reshape([-1])
        return out
# Put both network variants on the GPU; only the 'conv' variant (v1) is
# optimised by the loop below.
RONANetv1 = RONANet(classifier_type='conv').cuda()
RONANetv2 = RONANet(classifier_type='fc').cuda()

criterion = nn.BCEWithLogitsLoss()
# NOTE(review): lr=0.1 is two orders of magnitude above Adam's default of
# 1e-3 and is a likely reason the reported loss never moves.
optimizer = torch.optim.Adam(RONANetv1.parameters(), lr=0.1)
num_epochs = 100
best_auc = 0.5  # a random classifier scores 0.5, so only save above that
scores = {}

for epoch in range(num_epochs):
    RONANetv1.train()
    print(f'Current Epoch: {epoch+1}')

    # Sum (not mean) of the per-batch losses over the epoch.
    epoch_loss = 0
    for images, labels in train_dataloader:
        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            images, labels = images.cuda(), labels.cuda()
            out = RONANetv1(images)
            loss = criterion(out, labels)
            batch_loss = loss.item()
            loss.backward()
            optimizer.step()
        epoch_loss += batch_loss
    print(f'Loss this epoch: {epoch_loss}\n')

    # Checkpoint only when the validation AUC beats the best seen so far.
    current_val_auc, current_val_f1, current_val_acc = get_scores(RONANetv1, val_dataloader)
    if current_val_auc > best_auc:
        best_auc = current_val_auc
        torch.save(RONANetv1.state_dict(), 'RONANetv1.pth')
        scores.update({
            'AUC': current_val_auc,
            'f1': current_val_f1,
            'Accuracy': current_val_acc,
        })
        print(scores)
.
Output:
Current Epoch: 1
Loss this epoch: 38.038745045661926
{'AUC': 0.6632183908045978, 'f1': 0.0, 'Accuracy': 0.4915254237288136}
Current Epoch: 2
Loss this epoch: 37.96312761306763
Current Epoch: 3
Loss this epoch: 37.93656861782074
Current Epoch: 4
Loss this epoch: 38.045261442661285
Current Epoch: 5
Loss this epoch: 38.01626980304718
Current Epoch: 6
Loss this epoch: 37.93017905950546
Current Epoch: 7
Loss this epoch: 37.913547694683075
Current Epoch: 8
Loss this epoch: 38.049841582775116
Current Epoch: 9
Loss this epoch: 37.95650988817215
can you try with this learning rate
optimizer = torch.optim.Adam(RONANetv1.parameters(), lr=0.001)
and probably wait for at least 25 epochs.
So the issue is you're only training the first part of the classifier and not the second
# this
optimizer = torch.optim.Adam(RONANetv1.parameters(), lr=0.1)
# needs to become this
from itertools import chain
optimizer = torch.optim.Adam(chain(RONANetv1.parameters(), RONANetv2.parameters()))
and you need to incorporate the other CNN in training too
intermediate_out = RONANetv1(images)
out = RONANetv2(intermediate_out)
loss = criterion(out, labels)
batch_loss += loss.item()
loss.backward()
optimizer.step()
Hope that helps best of luck!
Related
I have created a LSTM model in Pytorch which looks like this:
LSTMNet
Now I want to build another LSTM model (NewLSTMNet) on top of it (LSTMNet) by freezing the fc1 layer. I used:
model.fc1.weight.requires_grad = False
model.fc1.bias.requires_grad = False
and then I changed fc2 layer with a linear layer with input features = 40 and output features = 40.
So far I did:
class NewLSTMNet(nn.Module):
    """Wrap an existing model, replace its ``fc2`` layer and add a dense head.

    Args:
        model: Module to wrap. Its ``fc2`` attribute is overwritten in place
            with a fresh ``Linear(40, 40)``.
        input_size: Stored on the instance; not used by the current forward.
        hidden_size: Stored on the instance; not used by the current forward.
        num_layers: Stored on the instance; not used by the current forward.
    """

    def __init__(self, model, input_size, hidden_size, num_layers):
        super(NewLSTMNet, self).__init__()
        self.model = model
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Swap the wrapped model's second dense layer for a new 40 -> 40 one.
        self.model.fc2 = nn.Linear(40, 40)

        # Disabled LSTM attempt, kept for reference:
        # self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        # Dense head: 40 -> 128 -> 40 -> 1, with a ReLU after every layer
        # (including the last one).
        head_layers = [
            nn.Linear(40, 128),
            nn.ReLU(),
            nn.Linear(128, 40),
            nn.ReLU(),
            nn.Linear(40, 1),
            nn.ReLU(),
        ]
        self.fc3 = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run the wrapped model, then the dense head."""
        # Disabled LSTM attempt, kept for reference:
        # input = self.model(x)
        # h0 = Variable(torch.zeros(self.num_layers, input.size(0), self.hidden_size))
        # c0 = Variable(torch.zeros(self.num_layers, input.size(0), self.hidden_size))
        # _, (h_out,_) = self.lstm(input, (h0,c0))
        # h_out = h_out.view(-1, self.hidden_size)
        # out = self.fc3(out)
        return self.fc3(self.model(x))
Now my new LSTM model looks like this:
NewLSTMNet
My training loop looks like this:
for epoch in range(EPOCHS):
    # --- one full-batch optimisation step ---
    model.train()
    output = model(X_train)
    train_loss = criterion(output, y_train)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # --- validation, without building an autograd graph ---
    with torch.no_grad():
        model.eval()
        output_val = model(X_valid)
        valid_loss = criterion(output_val, y_valid)

    if valid_loss <= valid_loss_min:
        # New best: checkpoint the weights and restart the patience counter.
        torch.save(model.state_dict(), './state_dict_new.pt')
        print(
            f'Epoch {epoch + 0:01}: Validation loss decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model ...')
        valid_loss_min = valid_loss
        early_stopping_counter = 0  # reset counter if validation loss decreases
    else:
        print(f'Epoch {epoch + 0:01}: Validation loss did not decrease')
        early_stopping_counter += 1

    # Give up once validation has failed to improve too many times in a row.
    if early_stopping_counter > early_stopping_patience:
        print('Early stopped at epoch :', epoch)
        break

    print(f'\t Train_Loss: {train_loss:.4f} Val_Loss: {valid_loss:.4f} BEST VAL Loss: {valid_loss_min:.4f}\n')
Now the model is working fine, but I want to add an LSTM layer to the NewLSTMNet model. I already tried adding one; however, I expected a vector as the prediction output and got a matrix instead, so there is a shape mismatch!
How should I modify my code? Any help is appreciated. Thanks in advance!
I'm training a model that includes batch normalization layers, but I noticed that the accuracy can fluctuate widely (from 55% to 31% in just one epoch). Both the train accuracy and the test accuracy fluctuate, so I think it's not caused by overfitting.
This is my accuracy over epoch
This is joint graph
This is my model architecture
return nn.Sequential(
nn.Conv2d(3,64,kernel_size=7,stride=2,padding=3),
nn.BatchNorm2d(64,momentum=momentum),
nn.ReLU(),
nn.MaxPool2d(kernel_size=3,stride=2,padding=1),
Residual(64, 64),
Residual(64, 64),
Residual(64, 128, use_1x1=True, stride=2),
Residual(128, 128),
Residual(128, 256, use_1x1=True, stride=2),
Residual(256, 256),
Residual(256, 512, use_1x1=True, stride=2),
Residual(512, 512),
nn.AdaptiveAvgPool2d((1, 1)),
nn.Flatten(),
nn.Linear(512, 176)
).to(device)
class Residual(nn.Module):
    """Two-convolution residual block with an optional 1x1 projection shortcut.

    Args:
        input_channel: Channels of the incoming feature map.
        output_channel: Channels produced by the block.
        use_1x1: When true, the shortcut goes through a 1x1 convolution so it
            can match a changed channel count or stride.
        stride: Stride of the first convolution and of the 1x1 shortcut.

    The batch-norm momentum is read from the module-level ``momentum`` name.
    """

    def __init__(self, input_channel, output_channel, use_1x1=False, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(
            input_channel, output_channel, kernel_size=3, padding=1, stride=stride
        )
        self.conv2 = nn.Conv2d(
            output_channel, output_channel, kernel_size=3, padding=1
        )
        self.bn1 = nn.BatchNorm2d(output_channel, momentum=momentum)
        self.bn2 = nn.BatchNorm2d(output_channel, momentum=momentum)
        # Without the projection the input is added to the output unchanged.
        self.conv3 = (
            nn.Conv2d(input_channel, output_channel, kernel_size=1, stride=stride)
            if use_1x1
            else None
        )

    def forward(self, X):
        """Return ``relu(main_path(X) + shortcut(X))``."""
        shortcut = X if self.conv3 is None else self.conv3(X)
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        Y += shortcut
        return F.relu(Y)
But magically, if I don't call model.eval() in the accuracy evaluation function (which keeps running_mean and running_var updating), the accuracy doesn't fluctuate.
Furthermore, if i go through the training set after each epoch, as is shown in the code below
for epoch in range(epochs):
    net.train()

    # Pass 1: ordinary optimisation over the training set.
    for X, y in train_iter:
        optimizer.zero_grad()
        y_hat = net(X)
        l = loss(y_hat, y)
        l.backward()
        optimizer.step()

    # Pass 2: forward-only sweep, still in train mode and with no weight
    # update, before accuracy is measured.
    for X, y in train_iter:
        net(X)

    eval_accuracy()
the accuracy doesn't fluctuate, too
I've tried to change the momentum, but it doesn't work
Now i'm totally confused, i don't have any idea why the accuracy fluctuates and why the method above works
I am using neural network for a regression task.
My input is a gray image of size 100x70x1.
The gray area has a unique value 60.
The input will go through a preprocessing layer, which multiply 1./255 on every pixel value.
My output is just three double number: [0.87077969, 0.98989031, 0.98888382]
I used ResNet152 model as shown below:
class Bottleneck(tf.keras.Model):
    """Residual bottleneck block: 1x1 -> 3x3 -> 1x1 convolutions with ELU.

    The final convolution widens the output to ``out_channels * expansion``
    channels. The shortcut is a 1x1 convolution plus batch norm whenever the
    stride or channel count changes, and the identity otherwise.
    """

    expansion = 4

    def __init__(self, in_channels, out_channels, strides=1):
        super(Bottleneck, self).__init__()
        wide_channels = self.expansion * out_channels

        self.conv1 = tf.keras.layers.Conv2D(out_channels, 1, 1, use_bias=False)
        self.bn1 = tf.keras.layers.BatchNormalization()
        self.conv2 = tf.keras.layers.Conv2D(
            out_channels, 3, strides, padding="same", use_bias=False
        )
        self.bn2 = tf.keras.layers.BatchNormalization()
        self.conv3 = tf.keras.layers.Conv2D(wide_channels, 1, 1, use_bias=False)
        self.bn3 = tf.keras.layers.BatchNormalization()

        shape_changes = strides != 1 or in_channels != wide_channels
        if shape_changes:
            # Project the input so it can be added to the widened output.
            self.shortcut = tf.keras.Sequential([
                tf.keras.layers.Conv2D(
                    wide_channels, kernel_size=1, strides=strides, use_bias=False
                ),
                tf.keras.layers.BatchNormalization(),
            ])
        else:
            # Identity shortcut; the second argument mirrors the
            # ``(x, training)`` call made in ``call`` and is ignored.
            self.shortcut = lambda x, _: x

    def call(self, x, training=False):
        """Apply the three conv/BN stages, add the shortcut, then ELU."""
        hidden = tf.nn.elu(self.bn1(self.conv1(x), training))
        hidden = tf.nn.elu(self.bn2(self.conv2(hidden), training))
        out = self.bn3(self.conv3(hidden), training)
        out += self.shortcut(x, training)
        return tf.nn.elu(out)
class ResNet(tf.keras.Model):
    """ResNet feature extractor that returns flattened stage-4 feature maps.

    Args:
        block: Block class used in every stage; it must expose an
            ``expansion`` class attribute.
        num_blocks: Four integers giving the number of blocks per stage.
    """

    def __init__(self, block, num_blocks):
        super(ResNet, self).__init__()
        # Running channel count, advanced by ``_make_layer`` as stages are built.
        self.in_channels = 64

        # Stem. For the (100, 70, 1) input declared in ``model`` the stride-2
        # "same" convolution yields 50x35 and the stride-2 pool yields 25x18.
        self.conv1 = tf.keras.layers.Conv2D(64, 7, 2, padding="same", use_bias=False)
        self.bn1 = tf.keras.layers.BatchNormalization()
        self.pool1 = tf.keras.layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same')

        depth1, depth2, depth3, depth4 = (num_blocks[i] for i in range(4))
        self.layer1 = self._make_layer(block, 64, depth1, stride=1)
        self.layer2 = self._make_layer(block, 128, depth2, stride=2)
        self.layer3 = self._make_layer(block, 256, depth3, stride=2)
        self.layer4 = self._make_layer(block, 512, depth4, stride=2)

        # The pooling layer is created but ``call`` uses only ``flatten``.
        self.avg_pool2d = tf.keras.layers.GlobalAveragePooling2D()
        self.flatten = tf.keras.layers.Flatten()

    def _make_layer(self, block, out_channels, num_blocks, stride):
        """Build one stage: the first block uses ``stride``, the rest stride 1."""
        stage = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            stage.append(block(self.in_channels, out_channels, block_stride))
            self.in_channels = out_channels * block.expansion
        return tf.keras.Sequential(stage)

    def call(self, x, training=False):
        """Run the stem and the four stages, then flatten the feature maps."""
        out = self.pool1(tf.nn.elu(self.bn1(self.conv1(x), training)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out, training=training)
        # No pooling and no final dense layer: the raw maps are flattened.
        return self.flatten(out)

    def model(self):
        """Return a functional ``tf.keras.Model`` over a (100, 70, 1) input."""
        x = tf.keras.layers.Input(shape=(100, 70, 1))
        return tf.keras.Model(inputs=[x], outputs=self.call(x))
def ResNet152():
    """Build a ``ResNet`` of ``Bottleneck`` blocks in a 3/8/36/3 stage layout."""
    stage_depths = [3, 8, 36, 3]
    return ResNet(Bottleneck, stage_depths)
I used elu as activation function and changed the GlobalAveragePooling layer into flatten layer at the end of ResNet.
Before output I stack two Dense layer(2048 units and 3 units) on top of the ResNet model.
For training I used the Adam optimizer with an initial learning rate of 1e-4, which is decreased by a factor of 10 whenever val_loss has not decreased for 3 epochs.
The loss is just mse error.
After early stopping while learning rate is 1e-8, the mse loss is still very high:8.6225
The prediction is [2.92318237, 5.53124916, 3.00686643] which is far away from the ground truth: [0.87077969, 0.98989031, 0.98888382]
I don't know why such a deep network cannot overfit a single sample like this.
Is the reason that my input image contains too little information? Could someone help me?
Hello, below is the PyTorch model I am trying to run, but I am getting an error. I have posted the error trace as well. It was running very well until I added the convolution layers. I am still new to deep learning and PyTorch, so I apologize if this is a silly question. I am using Conv1d, so why does Conv1d expect a 3-dimensional input, and why is it getting a 2-dimensional input, which also seems odd?
class Net(nn.Module):
    """Classifier that alternates dense layers with pointwise ``Conv1d`` layers.

    Input is a ``(batch, CROP_SIZE*CROP_SIZE*3)`` tensor of flattened images.
    Output is ``(batch, n_classes)`` log-probabilities, where ``n_classes`` is
    the number of distinct ``frame['landmark_id']`` values.

    Two defects of the earlier version are fixed here:

    * ``nn.Conv1d`` needs ``(batch, channels, length)`` input but received the
      2-D output of ``nn.Linear``. A length-1 axis is now added before each
      convolution and removed afterwards.
    * ``self.fc4`` was assigned twice, so the 64 -> 256 layer was discarded
      and the surviving 256 -> 128 layer was fed 64 features. The first layer
      now lives on as ``fc4_expand``.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(CROP_SIZE * CROP_SIZE * 3, 512)
        self.conv1d1 = nn.Conv1d(in_channels=512, out_channels=64, kernel_size=1, stride=2)
        self.fc2 = nn.Linear(64, 128)
        self.conv1d2 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=1, stride=2)
        self.fc3 = nn.Linear(64, 256)
        self.conv1d3 = nn.Conv1d(in_channels=256, out_channels=64, kernel_size=1, stride=2)
        self.fc4_expand = nn.Linear(64, 256)
        self.fc4 = nn.Linear(256, 128)
        self.fc5 = nn.Linear(128, 64)
        self.fc6 = nn.Linear(64, 32)
        self.fc7 = nn.Linear(32, 64)
        self.fc8 = nn.Linear(64, frame['landmark_id'].nunique())

    @staticmethod
    def _pointwise_conv(conv, x):
        """Apply ``conv`` to 2-D ``(batch, channels)`` input, returning 2-D output.

        With ``kernel_size=1`` a length-1 sequence stays length 1 even at
        stride 2, so the temporary axis can always be squeezed away again.
        Only that axis is squeezed, so a batch of one keeps its batch axis.
        """
        return conv(x.unsqueeze(2)).squeeze(2)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of flattened images."""
        x = F.relu(self._pointwise_conv(self.conv1d1, self.fc1(x)))
        x = F.relu(self._pointwise_conv(self.conv1d2, self.fc2(x)))
        x = F.relu(self._pointwise_conv(self.conv1d3, self.fc3(x)))
        x = F.relu(self.fc4_expand(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = F.relu(self.fc6(x))
        x = F.relu(self.fc7(x))
        x = self.fc8(x)
        return F.log_softmax(x, dim=1)
net = Net()
import torch.optim as optim

# NOTE(review): loss_function is never used below; the loop calls F.nll_loss
# directly on the network output instead.
loss_function = nn.CrossEntropyLoss()
net.to(torch.device('cuda:0'))

# Build the optimizer once. Re-creating Adam at the start of every epoch
# throws away its running moment estimates and restarts its step count.
optimizer = optim.Adam(net.parameters(), lr=0.001)

for epoch in range(3):  # 3 full passes over the data
    for data in tqdm(train_loader):  # `data` is a batch of data
        X = data['image'].to(device)  # X is the batch of features
        y = data['landmarks'].to(device)  # y is the batch of targets.
        optimizer.zero_grad()  # clear gradients left over from the previous step
        output = net(X.view(-1, CROP_SIZE*CROP_SIZE*3))  # pass in the reshaped batch
        loss = F.nll_loss(output, y)  # calc and grab the loss value
        loss.backward()  # backpropagate through the network's parameters
        optimizer.step()  # update the weights from the gradients
    print(loss)  # loss of the last batch in this epoch; we hope it declines
Error trace
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-42-f5ed7999ce57> in <module>
5 y = data['landmarks'].to(device) # y is the batch of targets.
6 optimizer.zero_grad() # sets gradients to 0 before loss calc. You will do this likely every step.
----> 7 output = net(X.view(-1,CROP_SIZE*CROP_SIZE*3)) # pass in the reshaped batch
8 # print(np.argmax(output))
9 # print(y)
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
<ipython-input-37-6d3e34d425a0> in forward(self, x)
16
17 def forward(self, x):
---> 18 x = F.relu(self.conv1d1(self.fc1(x)))
19 x = F.relu(self.conv1d2(self.fc2(x)))
20 x = F.relu(self.conv1d3(self.fc3(x)))
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/conv.py in forward(self, input)
210 _single(0), self.dilation, self.groups)
211 return F.conv1d(input, self.weight, self.bias, self.stride,
--> 212 self.padding, self.dilation, self.groups)
213
214
RuntimeError: Expected 3-dimensional input for 3-dimensional weight [64, 512, 1], but got 2-dimensional input of size [4, 512] instead
You should learn how convolutions work (e.g. see this answer) and some neural network basics (this tutorial from PyTorch).
Basically, Conv1d expects inputs of shape [batch, channels, features] (where features can be some timesteps and can vary, see example).
nn.Linear expects shape [batch, features] as it is fully connected and each input feature is connected to each output feature.
You can verify those shapes by yourself, for torch.nn.Linear:
# Shape check for nn.Linear: it maps the last axis from in_features to out_features.
import torch
layer = torch.nn.Linear(20, 10)
data = torch.randn(64, 20) # [batch, in_features]
layer(data).shape # [64, 10], [batch, out_features]
For Conv1d:
# Shape check for nn.Conv1d: input and output are 3-D, [batch, channels, timesteps].
layer = torch.nn.Conv1d(in_channels=20, out_channels=10, kernel_size=3, padding=1)
data = torch.randn(64, 20, 15) # [batch, channels, timesteps]
layer(data).shape # [64, 10, 15], [batch, out_channels, timesteps]
layer(torch.randn(32, 20, 25)).shape # [32, 10, 25]; batch size and timesteps may vary between calls
BTW. As you are working with images, you should use torch.nn.Conv2d instead.
Most PyTorch functions work on batched data, i.e. they accept input of size (batch_size, shape). Szymon Maszke has already posted an answer related to that.
So in your case, you can use the unsqueeze and squeeze functions to add and remove extra dimensions.
Here's the sample code:
import torch
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
    """Minimal example of placing a ``Conv1d`` between two ``Linear`` layers.

    Input is ``(batch, 100)``; output is ``(batch, 128)``.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(100, 512)
        self.conv1d1 = nn.Conv1d(in_channels=512, out_channels=64, kernel_size=1, stride=2)
        self.fc2 = nn.Linear(64, 128)

    def forward(self, x):
        """Run fc1, the pointwise convolution and fc2 on a 2-D batch."""
        x = self.fc1(x)
        # Conv1d wants (batch, channels, length): add a length-1 axis.
        x = x.unsqueeze(dim=2)
        x = F.relu(self.conv1d1(x))
        # Remove only the length axis. A bare squeeze() would also drop the
        # batch axis whenever the batch holds a single sample.
        x = x.squeeze(dim=2)
        x = self.fc2(x)
        return x
# Smoke test: push one random batch through the network and print the output shape.
net = Net()
bsize = 4  # number of samples in the dummy batch
inp = torch.randn((bsize, 100))  # 100 features per sample, matching fc1's in_features
out = net(inp)
print(out.shape)
About the input (sorry for the bad formatting): in each pair of rows, the first row holds the keys and the second row holds the values. 18~20_ride is the label and is not included in the input. Below is one input sample; the training set consists of 400000 of these.
bus_route_id station_code latitude longitude 6~7_ride
0 4270000 344 33.48990 126.49373
7~8_ride 8~9_ride 9~10_ride 10~11_ride 11~12_ride 6~7_takeoff
0.0 1.0 2.0 5.0 2.0 6.0
7~8_takeoff 8~9_takeoff 9~10_takeoff 10~11_takeoff 11~12_takeoff
0.0 0.0 0.0 0.0 0.0
18~20_ride weekday dis_jejusi dis_seoquipo
0.0 6 2.954920 26.256744
Example weights: Captured at 4th epoch. After 20 epochs of training I got much smaller values (ex. -7e-44 or 1e-55)
2.3937e-11, -2.6920e-12, -1.0445e-11, ..., -1.0754e-11, 1.1128e-11, -1.4814e-11
The model's prediction and target
#Target
[2.],
[0.],
[0.]
#Prediction
[1.4187],
[1.4187],
[1.4187]
MyDataset.py
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import torch
import os
class MyDataset(Dataset):
    """Dataset over a CSV file whose ``18~20_ride`` column is the target.

    The first CSV column is read as the index. Every remaining column except
    the target becomes a feature.
    """

    def __init__(self, csv_filename):
        frame = pd.read_csv(csv_filename, index_col=0)
        # pop() removes the target column from the frame, so it cannot leak
        # into the feature matrix.
        targets = frame.pop("18~20_ride")
        self.dataset = frame.values
        # Column vector, one target per row.
        self.labels = targets.values.reshape(-1, 1)

    def __len__(self):
        return self.dataset.shape[0]

    def __getitem__(self, idx):
        features = self.dataset[idx]
        target = self.labels[idx]
        return features, target
Model
class Network(nn.Module):
    """Nine-layer fully connected regressor.

    Layers ``fc1`` to ``fc8`` are ``Linear -> BatchNorm1d -> GELU`` blocks of
    width 64; ``fc9`` maps the 64 features to a single output.

    Args:
        input_num: Number of input features per sample.

    NOTE(review): no ``forward`` method is defined in this excerpt; confirm
    that one exists elsewhere before calling the model.
    """

    def __init__(self, input_num):
        super(Network, self).__init__()
        # The earlier listing had stray ")" characters in fc6 and fc8, which
        # made the class a syntax error. All eight blocks are now built by one
        # helper so they cannot drift apart.
        self.fc1 = self._hidden_block(input_num, 64)
        self.fc2 = self._hidden_block(64, 64)
        self.fc3 = self._hidden_block(64, 64)
        self.fc4 = self._hidden_block(64, 64)
        self.fc5 = self._hidden_block(64, 64)
        self.fc6 = self._hidden_block(64, 64)
        self.fc7 = self._hidden_block(64, 64)
        self.fc8 = self._hidden_block(64, 64)
        self.fc9 = nn.Linear(64, 1)

    @staticmethod
    def _hidden_block(in_features, out_features):
        """Return one ``Linear -> BatchNorm1d -> GELU`` block."""
        return nn.Sequential(
            nn.Linear(in_features, out_features),
            nn.BatchNorm1d(out_features),
            GELU(),
        )
The training and validation
def train(model, device, train_loader, optimizer, loss_fn, log_interval, epoch):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Network to optimise; it is switched to training mode.
        device: Device each batch is moved to.
        train_loader: Iterable of ``(data, target)`` batches. Its ``dataset``
            attribute and length are read when a progress line is printed.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable applied as ``loss_fn(output, target)``.
        log_interval: A progress line is printed every this many batches.
        epoch: Epoch number shown in the progress line.
    """
    print("Training")
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.float().to(device)
        target = target.float().to(device)

        optimizer.zero_grad()
        loss = loss_fn(model(data), target)
        loss.backward()
        optimizer.step()

        if batch_idx % log_interval != 0:
            continue
        seen = (batch_idx + 1) * len(data)
        percent_done = 100. * batch_idx / len(train_loader)
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, seen, len(train_loader.dataset),
            percent_done, loss.item()))
def validate(model, device, loader, loss_fn):
    """Return the mean per-batch loss of ``model`` over ``loader``.

    The model is switched to evaluation mode and gradients are disabled.
    The mean is taken over the number of batches, not the number of samples.
    """
    print("\nValidating")
    model.eval()

    batch_losses = []
    with torch.no_grad():
        for data, target in loader:
            data = data.float().to(device)
            target = target.float().to(device)
            prediction = model(data)
            batch_losses.append(loss_fn(prediction, target).item())

    test_loss = sum(batch_losses) / len(loader)
    print('Validation average loss: {:.4f}\n'.format(
        test_loss))
    return test_loss
Entire process of training and validation
from MyDataset import MyDataset
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR
from datetime import datetime
# Locations of the CSV datasets and of the directories that receive weights.
train_dataset_path = "/content/drive/My Drive/root/bus/dataset/train.csv"
val_dataset_path = "/content/drive/My Drive/root/bus/dataset/val.csv"
model_base_path = "/content/drive/My Drive/root/bus/models/"
model_file = "/content/drive/My Drive/root/bus/models/checkpoints/1574427776.202017.pt"

# ---- Training config ----
epochs = 10
batch_size = 32
learning_rate = 0.5
check_interval = 4  # write a checkpoint every this many epochs
log_interval = int(40000/batch_size)
gamma = 0.1  # defined but not read anywhere in this script
load_model = False
save_model = True
make_checkpoint = True
# ---- End of config ----

# Build the training and validation datasets and their loaders.
train_set = MyDataset(train_dataset_path)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_set = MyDataset(val_dataset_path)
val_loader = DataLoader(val_set, batch_size=1)
print("Data READY")

device = torch.device("cuda")
net = Network(19).float().to(device)
if load_model:
    net.load_state_dict(torch.load(model_file))

loss_fn = torch.nn.MSELoss()
optimizer = optim.AdamW(net.parameters(), lr=learning_rate)
best_loss = float('inf')
isAbort = False

for epoch in range(1, epochs+1):
    train(net, device, train_loader, optimizer, loss_fn, log_interval, epoch)
    val_loss = validate(net, device, val_loader, loss_fn)

    # Periodic checkpoint, named after the current timestamp.
    if epoch % check_interval == 0 and make_checkpoint:
        print("Saving new checkpoint")
        torch.save(
            net.state_dict(),
            model_base_path+"checkpoints/"+str(datetime.today().timestamp())+".pt",
        )

    # Disabled early-abort logic, kept for reference:
    # if val_loss < best_loss and epoch%check_interval==0:
    #     best_loss = val_loss
    #     if make_checkpoint:
    #         print("Saving new checkpoint")
    #         torch.save(net.state_dict(), model_base_path+"checkpoints/"+str(datetime.today().timestamp())+".pt")
    # else:
    #     print("Model overfit detected. Aborting training")
    #     isAbort = True
    #     break

# Final weights are written unless training was aborted.
if save_model and not isAbort:
    torch.save(
        net.state_dict(),
        model_base_path+"finals/"+str(datetime.today().timestamp())+".pt",
    )
So I tried to train a fully connected model for a regression problem, with google colab. But it did not get trained well; The loss absolutely did not decrease. So I dug down and found out that the weights were really small. Any idea why this is happening and how I could avoid this? Thank you
I used MSE for the loss and the AdamW optimizer. Below are the things I have tried:
Tried other architectures (Changing number of layers sizes, Changed activation function ReLU, GELU)but the loss did not decrease
Tried changing the learning rate from 3e-1~1e-3, even tried 1
Tried other pre-processing(Used day/month/year instead of weekday) for the data
Given the label in the input data but loss did not decrease
Tried different batch_sizes(4, 10, 32, 64)
Removed batch_normalization
Other kinds of optimizer such as SGD, Adam
Training 20 epochs but loss did not decrease at all
The weights do change at loss.backward()
TL;DR: Invalid input data!! Check for NaN or NULL
Well, it has been some time since I asked the question. I tried almost everything and thought that maybe I had messed up the project setup, so I deleted the project and tried again: same result. I deleted it again and migrated to TF2: THE SAME RESULT! So I concluded that there wasn't any problem with the setup and looked elsewhere. In the end I did find the reason. The input columns had actually been modified by me (to remove some highly correlated features); they were not the original data. During the modification I messed up some float values, and the data ended up containing NaN values. So check whether your dataset contains invalid values.