I am trying to train a model based on MobileNet to do landmark detection for Dog Faces (output is a tensor with x/y coordinates for the position of the eyes and the nose of the dog).
In my training, I am seeing the following graph for val_loss:
Question: What is going on with the random spikes in val_loss?
My model looks like this:
model = applications.MobileNet(weights="imagenet",
                               include_top=False,
                               input_shape=(224, 224, 3))
for layer in model.layers:
    layer.trainable = True

x = model.output
p = 0.6
x = AveragePooling2D()(x)
x = BatchNormalization(axis=1, name="net_out")(x)
x = Dropout(p / 4)(x)
x = Flatten()(x)
x = Dense(512, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(p)(x)
x = Dense(512, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(p / 2)(x)

# PREDICT_SIZE is 6 - the number of landmarks I'm extracting
x = Dense(PREDICT_SIZE, activation="linear")(x)
points = Reshape((PREDICT_SIZE,), name="f")(x)

model_final = Model(inputs=model.input, outputs=points)
model_final.compile(loss="mse",
                    optimizer=optimizers.Adam(epsilon=1e-7),
                    metrics=["accuracy"])
Further questions: Should I set epsilon to be an even bigger number in my optimizer? Will my model suffer if I make epsilon larger?
Should I try a different loss function?
Is the model itself at fault?
Should I try different top layers for my model?
Update: Removing the "relu" activation not only got rid of the funky jumps in val_loss, but also produced a model with lower loss and even better precision! WOOHOO!
Here's the val_loss value after removing "relu" from the top layers (and no other changes).
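For reference, here is a minimal sketch of what the revised top layers would look like, assuming the only change is dropping the 'relu' activation from the two Dense(512) layers (everything else exactly as in the code above):

x = model.output
p = 0.6
x = AveragePooling2D()(x)
x = BatchNormalization(axis=1, name="net_out")(x)
x = Dropout(p / 4)(x)
x = Flatten()(x)
x = Dense(512)(x)               # linear, no "relu"
x = BatchNormalization()(x)
x = Dropout(p)(x)
x = Dense(512)(x)               # linear, no "relu"
x = BatchNormalization()(x)
x = Dropout(p / 2)(x)
x = Dense(PREDICT_SIZE, activation="linear")(x)
points = Reshape((PREDICT_SIZE,), name="f")(x)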
Problem description:
I want to train a GRU on a small dataset with 40 training samples. The input sequence length is 3 and the output sequence length is 1. Below is the configuration:
hidden units: 128
input size (input feature num): 11
layer: 2
dropout: 0.5
bidirectional: False
lr: 0.0001
optimizer: Adam
batch size: 8
Below is my model. Thanks to Azhar Khan for helping me format the code.
class GRU(nn.Module):
    def __init__(self, GRU_input_num, GRU_hidden_num, GRU_layer_num, dropout, bidirectional, seed):
        super(GRU, self).__init__()
        torch.manual_seed(seed)
        self.GRU_input_num = GRU_input_num
        self.GRU_hidden_num = GRU_hidden_num
        self.GRU_layer_num = GRU_layer_num
        self.dropout = dropout
        self.bidirectional = bidirectional
        self.direction = 2 if self.bidirectional else 1
        if GRU_layer_num > 1:
            self.gru = nn.GRU(GRU_input_num, GRU_hidden_num, GRU_layer_num, dropout=dropout, batch_first=True,
                              bidirectional=bidirectional)
        else:
            self.gru = nn.GRU(GRU_input_num, GRU_hidden_num, GRU_layer_num, batch_first=True,
                              bidirectional=bidirectional)
        self.predict = nn.Linear(in_features=self.GRU_hidden_num * self.direction * self.GRU_layer_num,
                                 out_features=constants.CATEGORY_NUM * constants.BACTERIA_PER_CATEGORY)

    def forward(self, x):
        """GRU forward. Input shape: [batch, input_step, features]"""
        # shape of hidden state: [layer * direction, batch, hidden]
        batch = x.shape[0]
        _, hidden_state = self.gru(x)
        hidden_state = hidden_state.permute(1, 0, 2)  # [batch, layer * direction, hidden]
        hidden_state = hidden_state.contiguous().view(batch, 1, self.GRU_hidden_num * self.direction * self.GRU_layer_num).squeeze(1)  # [batch, layer * direction * hidden]
        return self.predict(hidden_state), hidden_state.cpu().detach().numpy()  # [batch, features]
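For context, here is a minimal usage sketch with the configuration listed above, assuming the GRU class and the constants module are importable; the batch size of 8 and sequence length of 3 match the stated setup:

import torch

# Hypothetical smoke test: input size 11, hidden 128, 2 layers, dropout 0.5, unidirectional.
net = GRU(GRU_input_num=11, GRU_hidden_num=128, GRU_layer_num=2,
          dropout=0.5, bidirectional=False, seed=42)
x = torch.randn(8, 3, 11)          # [batch=8, input_step=3, features=11]
pred, hidden = net(x)
print(pred.shape)                  # [8, constants.CATEGORY_NUM * constants.BACTERIA_PER_CATEGORY]
print(hidden.shape)                # (8, 256) -- numpy array: 2 layers * 1 direction * 128 hidden units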
I am curious about how the hidden state will change during training. Here are my observations.
My observations:
Heatmap of hidden state.
I printed a heatmap of the hidden states every 25 epochs. The x axis is the sample index within one batch, the y axis is the hidden state dimension, and the title is the sum of all hidden states. The figures are shown below:
after 25 epochs
after 75 epochs
after 150 epochs
after 300 epochs
after 550 epochs
The summation keeps shrinking as training progresses.
Weights of GRU.
I also took a glance at the weights of the GRU; below is a snapshot of part of the weights.
snapshot of GRU weights
Almost all the weights end with 'e-40' (or e-41, etc.), which means every single weight is close to zero.
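For what it's worth, here is a small sketch (hypothetical, assuming the trained model instance is called net) of one way to quantify that observation by printing the mean and maximum magnitude of every GRU parameter tensor:

# Hypothetical check: summary statistics of each GRU parameter tensor.
for name, param in net.gru.named_parameters():
    print(f"{name}: mean |w| = {param.abs().mean().item():.3e}, max |w| = {param.abs().max().item():.3e}")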
My question:
Is this a normal phenomenon? If not, what could be the cause of this issue?
Thanks to anyone who can give me some comments on this, and do let me know what other information you need.
I have a dataset with 8 features and 4 timesteps. I am trying to implement an LSTM but need help understanding if I have set up my tensor correctly. The aim is to take the features output by the LSTM and pass them through a NN.
My tensor shape is currently #samples x #timesteps x #features, i.e. 4500x4x8. This works with the code below. I want to make sure that the model is indeed treating each timestep matrix as a separate timestep in the sequence (with matrix 4500x[0]x8 being the first timestep matrix and 4500x[3]x8 being the last). I then take the final timestep output (output[:,-1,:]) to feed through a NN.
Is the code doing what I think it is doing? I ask because performance is marginally worse than a simple RF that only uses the final timestep's data. This would be unexpected, as the data has strong time-series correlations (it tracks patients' vitals declining before going on ventilation).
I have the following code:
class LSTM1(nn.Module):
    def __init__(self, num_classes, input_size, hidden_size, num_layers):
        super(LSTM1, self).__init__()
        self.num_classes = num_classes  # number of classes
        self.num_layers = num_layers    # number of layers
        self.input_size = input_size    # input size
        self.hidden_size = hidden_size  # hidden state size
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)  # lstm
        self.fc_1 = nn.Linear(hidden_size, 32)  # fully connected 1
        self.fc_2 = nn.Linear(32, 12)           # fully connected 2
        self.fc_3 = nn.Linear(12, 1)            # fully connected 3
        self.fc = nn.Sigmoid()                  # sigmoid output activation
        self.relu = nn.ReLU()

    def forward(self, x):
        h_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size))  # hidden state
        c_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size))  # internal (cell) state
        # Propagate input through LSTM
        output, (hn, cn) = self.lstm(x, (h_0, c_0))  # lstm with input, hidden, and internal state
        out = output[:, -1, :]  # take the last timestep's output for the Dense layers
        out = self.relu(out)
        out = self.fc_1(out)  # first Dense
        out = self.relu(out)  # relu
        out = self.fc_2(out)  # 2nd Dense
        out = self.relu(out)  # relu
        out = self.fc_3(out)  # 3rd Dense
        out = self.relu(out)  # relu
        out = self.fc(out)    # final sigmoid output
        return out
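As a quick sanity check on the shape question, here is a hypothetical snippet (assuming the class above and its imports, including torch and Variable, are in scope). With batch_first=True the LSTM reads the input as [batch, timesteps, features], so output[:, -1, :] is the hidden output after the 4th (last) timestep:

import torch

# Hypothetical shape check: 4500 samples, 4 timesteps, 8 features.
net = LSTM1(num_classes=1, input_size=8, hidden_size=64, num_layers=1)
x = torch.randn(4500, 4, 8)        # [batch, timesteps, features]
out = net(x)
print(out.shape)                   # torch.Size([4500, 1]) -- one sigmoid output per sample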
Error
Your error stems from the last three lines.
Do not use ReLU activation at the end of your network
Use nn.Linear -> nn.Sigmoid with BCELoss or
nn.Linear with nn.BCEWithLogitsLoss (see here for what logits are).
What is going on
With ReLU you output values in the range [0, +inf).
Applying sigmoid on top of that "squashes" those values into [0.5, 1), since the sigmoid's threshold sits at 0 (a ReLU output of 0 becomes a 0.5 probability, hence 1 after thresholding at 0.5!).
In effect, you always predict 1 with this code, which is probably not what you want.
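A minimal sketch of the corrected tail of forward, following the advice above (the earlier layers stay unchanged):

out = self.fc_2(out)   # 2nd Dense
out = self.relu(out)
out = self.fc_3(out)   # 3rd Dense -> raw score, no ReLU afterwards
out = self.fc(out)     # sigmoid, paired with nn.BCELoss
return out
# alternatively: return self.fc_3(out) as logits and train with nn.BCEWithLogitsLoss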
I am trying to perform an object localization task with MNIST based on Andrew Ng's lecture here. I take the MNIST digits, randomly place them into a 90x90 image, and predict the digit and its center point. When I train, I get very poor results, and my question is about whether or not my loss function is set up correctly. I basically just take the CrossEntropy for the digit, the MSE for the coordinates, and then add them all up. Is this correct? I don't get any errors, but the performance is just horrendous.
My dataset is defined as follows (which returns the label and the x y coordinates of the center of the digit):
class CustomMnistDataset_OL(Dataset):
    def __init__(self, df, test=False):
        '''
        df is a pandas dataframe with 28x28 columns for each pixel value in MNIST
        '''
        self.df = df
        self.test = test

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if self.test:
            image = np.reshape(np.array(self.df.iloc[idx, :]), (28, 28)) / 255.
        else:
            image = np.reshape(np.array(self.df.iloc[idx, 1:]), (28, 28)) / 255.

        # create the new image
        new_img = np.zeros((90, 90))  # images will be 90x90

        # randomly select a corner to use for placing the digit
        x_min, y_min = randrange(90 - image.shape[0]), randrange(90 - image.shape[0])
        x_max, y_max = x_min + image.shape[0], y_min + image.shape[0]
        x_center = x_min + (x_max - x_min) / 2
        y_center = y_min + (y_max - y_min) / 2
        new_img[x_min:x_max, y_min:y_max] = image

        label = [int(self.df.iloc[idx, 0]), x_center, y_center]  # the label consists of the digit and the center of the number
        sample = {"image": new_img, "label": label}
        return sample['image'], sample['label']
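As a sanity check, here is a hypothetical snippet (assuming df is the MNIST dataframe described in the docstring) that wraps the dataset in a DataLoader and inspects one batch:

from torch.utils.data import DataLoader

# Hypothetical: inspect one batch from the dataset defined above.
train_ds = CustomMnistDataset_OL(df)
loader = DataLoader(train_ds, batch_size=64, shuffle=True)
images, labels = next(iter(loader))
print(images.shape)   # torch.Size([64, 90, 90]) -- float64; cast with .float() before the model
print(len(labels))    # 3 -> digit, x_center, y_center (each collated into a tensor of length 64)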
My training function is set up as follows:
loss_fn = nn.CrossEntropyLoss()
loss_mse = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def train(dataloader, model, loss_fn, loss_mse, optimizer):
    model.train()  # very important... This turns the model back to training mode
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        X, y0, y1, y2 = X.to(device), y[0].to(device), y[1].to(device), y[2].to(device)
        pred = model(X.float())

        # DEFINE LOSS HERE -------
        loss = loss_fn(pred[0], y0) + loss_mse(pred[1], y1.float()) + loss_mse(pred[2], y2.float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
I'm new to computer vision and deep learning. I'm trying to train this U-Net model https://github.com/kevinlu1211/pytorch-unet-resnet-50-encoder with ResNet50 as the encoder. I want to implement it so that I pass two RGB images, which are first processed by ResNet50, and then the layers are concatenated before being passed to the decoder. I tried doing this and changed n_classes in the code to 3 so that it outputs a 3-channel RGB image just like the inputs, but it gives me a distorted image like this, and I don't understand why. Please help me with this.
The part of the code that I modified to process two RGB inputs with ResNet50 is here:
for i, block in enumerate(self.down_blocks, 2):  # for all the down blocks
    x = block(x)
    if i == (UNetWithResnet50Encoder.DEPTH - 1):
        continue
    pre_pools[f"layer_{i}"] = x  # creating all the down-sampling layers

pre_pools_inp2 = dict()
pre_pools_inp2[f"layer_0"] = y
y = self.input_block(y)
pre_pools_inp2[f"layer_1"] = y
y = self.input_pool(y)

for i, block in enumerate(self.down_blocks, 2):  # for all the down blocks
    y = block(y)
    if i == (UNetWithResnet50Encoder.DEPTH - 1):
        continue
    pre_pools_inp2[f"layer_{i}"] = y  # creating all the down-sampling layers

x = torch.cat([x, y], 1)
x = self.bridge(x)  # this is now the bridge between down-sampling and up-sampling

for i, block in enumerate(self.up_blocks, 1):
    key = f"layer_{UNetWithResnet50Encoder.DEPTH - 1 - i}"  # now using that bridge for upsampling
    x = block(x, pre_pools[key])

output_feature_map = x
x = self.out(x)
del pre_pools
if with_output_feature_map:
    return x, output_feature_map
else:
    return x
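A small shape check (dummy tensors; the deepest ResNet50 feature map of a 224x224 input is assumed here to have 2048 channels at 7x7 resolution) illustrates that the concatenation doubles the channel dimension, so the bridge and the following layers must be sized for the doubled width:

import torch

x = torch.randn(1, 2048, 7, 7)   # assumed encoder output for the first RGB image
y = torch.randn(1, 2048, 7, 7)   # assumed encoder output for the second RGB image
cat = torch.cat([x, y], 1)
print(cat.shape)                 # torch.Size([1, 4096, 7, 7]) -- channels doubled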
I'm using neural nets for a regression problem where I have 3 features and I'm trying to predict one continuous value. I noticed that my neural net starts learning well, but after 10 epochs it gets stuck at a high loss value and cannot improve any more.
I tried using Adam and other adaptive optimizers instead of SGD, but that didn't work. I tried more complex architectures, adding layers, neurons, batch normalization, other activations, etc., and that also didn't work.
I tried to debug and find out whether something was wrong with the implementation, but when I use only 10 examples of the data my model learns fast, so there are no obvious errors. I started increasing the number of data examples and monitoring my model's results as I did so. When I reach 3000 data examples, my model starts to get stuck at a high loss value.
I tried increasing layers and neurons, and also tried other activations and batch normalization. My data are normalized between [-1, 1]; my target value is not normalized, since it is a regression problem and I'm predicting a continuous value. I also tried using Keras, but I got the same result.
My real dataset has 40000 data points. I don't know what I should try; I have tried almost everything I know for optimization, but none of it worked. I would appreciate it if someone could guide me on this. I'll post my code, though maybe it is too messy to follow; I'm sure there is no problem with my implementation. I'm using skorch/PyTorch and some sklearn functions:
# take all features as independent variables, except the bearing and distance
# here, when I start small, the model learns well, but from 3000 data points, as you can see, the model gets stuck
# at a high value. I mean, the starting loss is 15 and it starts learning well, but when it reaches 9 it gets stuck there;
# and if I try to use the whole dataset for training, then the loss starts at 47 and decreases until it reaches 36 and then gets stuck there too
X = dataset.iloc[:3000, 0:-2].reset_index(drop=True).to_numpy().astype(np.float32)

# take distance and bearing as the output values:
y = dataset.iloc[:3000, -2:].reset_index(drop=True).to_numpy().astype(np.float32)
y_bearing = y[:, 0].reshape(-1, 1)
y_distance = y[:, 1].reshape(-1, 1)

# normalize the input values
scaler = StandardScaler()
X_norm = scaler.fit_transform(X, y)

X_br_train, X_br_test, y_br_train, y_br_test = train_test_split(X_norm,
                                                                y_bearing,
                                                                test_size=0.1,
                                                                random_state=42,
                                                                shuffle=True)

X_dis_train, X_dis_test, y_dis_train, y_dis_test = train_test_split(X_norm,
                                                                    y_distance,
                                                                    test_size=0.1,
                                                                    random_state=42,
                                                                    shuffle=True)

bearing_trainset = Dataset(X_br_train, y_br_train)
bearing_testset = Dataset(X_br_test, y_br_test)

distance_trainset = Dataset(X_dis_train, y_dis_train)
distance_testset = Dataset(X_dis_test, y_dis_test)
def root_mse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))


class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        return torch.sqrt(self.mse(yhat, y))


class AED(nn.Module):
    """custom average euclidean distance loss"""
    def __init__(self):
        super().__init__()

    def forward(self, yhat, y):
        return torch.dist(yhat, y)
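# For illustration (hypothetical dummy batch): RMSELoss averages the squared error
# before taking the square root, while AED uses torch.dist, i.e. the L2 norm of the
# difference over the whole batch without averaging:
#   yhat = torch.tensor([[1.0], [2.0], [3.0], [4.0]])
#   y    = torch.tensor([[1.5], [2.5], [2.5], [3.0]])
#   RMSELoss()(yhat, y)  -> sqrt(0.4375) ~= 0.661
#   AED()(yhat, y)       -> sqrt(1.75)   ~= 1.323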
def train(on_target,
          hidden_units,
          batch_size,
          epochs,
          optimizer,
          lr,
          regularisation_factor,
          train_shuffle):
    network = None
    trainset = distance_trainset if on_target.lower() == 'distance' else bearing_trainset
    testset = distance_testset if on_target.lower() == 'distance' else bearing_testset
    print(f"shape of trainset.X = {trainset.X.shape}, shape of trainset.y = {trainset.y.shape}")
    print(f"shape of testset.X = {testset.X.shape}, shape of testset.y = {testset.y.shape}")

    mse = EpochScoring(scoring=mean_squared_error, lower_is_better=True, name='MSE')
    r2 = EpochScoring(scoring=r2_score, lower_is_better=False, name='R2')
    rmse = EpochScoring(scoring=make_scorer(root_mse), lower_is_better=True, name='RMSE')
    checkpoint = Checkpoint(dirname=f'results/{on_target}/checkpoints')
    train_end_checkpoint = TrainEndCheckpoint(dirname=f'results/{on_target}/checkpoints')

    if on_target.lower() == 'bearing':
        network = BearingNetwork(n_features=X_norm.shape[1],
                                 n_hidden=hidden_units,
                                 n_out=y_distance.shape[1])
    elif on_target.lower() == 'distance':
        network = DistanceNetwork(n_features=X_norm.shape[1],
                                  n_hidden=hidden_units,
                                  n_out=1)

    model = NeuralNetRegressor(
        module=network,
        criterion=RMSELoss,
        device='cpu',
        batch_size=batch_size,
        lr=lr,
        optimizer=optim.Adam if optimizer.lower() == 'adam' else optim.SGD,
        optimizer__weight_decay=regularisation_factor,
        max_epochs=epochs,
        iterator_train__shuffle=train_shuffle,
        train_split=predefined_split(testset),
        callbacks=[mse, r2, rmse, checkpoint, train_end_checkpoint]
    )

    print(f"{'*' * 10} start training the {on_target} model {'*' * 10}")
    history = model.fit(trainset, y=None)
    print(f"{'*' * 10} End Training the {on_target} Model {'*' * 10}")


if __name__ == '__main__':
    args = parser.parse_args()

    train(on_target=args.on_target,
          hidden_units=args.hidden_units,
          batch_size=args.batch_size,
          epochs=args.epochs,
          optimizer=args.optimizer,
          lr=args.learning_rate,
          regularisation_factor=args.regularisation_lambda,
          train_shuffle=args.shuffle)
and this is my network declaration:
class DistanceNetwork(nn.Module):
    """separate NN for predicting distance"""
    def __init__(self, n_features=5, n_hidden=16, n_out=1):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(n_features, n_hidden),
            nn.LeakyReLU(),
            nn.Linear(n_hidden, 5),
            nn.LeakyReLU(),
            nn.Linear(5, n_out)
        )

    def forward(self, x):
        return self.model(x)
Here is the log while training: