Training a saved deep learning model in PyTorch - deep-learning

I have a deep learning model in PyTorch (here I provide a simple overview of it). Since I have to run the model each day, I want to save the model trained on the previous day and then train the saved model for a small number of additional epochs (3-4). Here is the model:
class NET(nn.Module):
    """Skeleton of the network; the method bodies were elided in the post."""

    def forward(self, y, par):
        ...  # body omitted in the original post

    def test(self, y, num_samples):
        ...  # body omitted in the original post
def train_batch(model, optimizer, device):
    """Run one training step on a batch (sketch; the middle of the body is elided).

    NOTE(review): `batch` and `loss1` are not defined in the visible code;
    presumably they come from the elided section or an enclosing scope.
    Confirm before relying on this function.
    """
    model.train()
    optimizer.zero_grad()
    length = float(batch.size(0))
    ...
    return loss1
def trainv(model, device, epochs, train_iterator, optimizer, validate_iterator):
    """Train `model` for `epochs` epochs and validate after each epoch.

    Returns:
        A tuple ``(train_losses, validate_loss)``: the list of losses returned
        by `train_batch` for every batch of every epoch, and the validation
        loss of the last epoch (``None`` when `epochs` is 0).
    """
    # The original returned `train_losses` without ever defining it.
    train_losses = []
    # Defined up front so the return statement works even when epochs == 0.
    validate_loss = None
    for epoch in range(epochs):
        for local_batch, local_labels in train_iterator:
            # NOTE(review): the batch is not passed to train_batch; confirm
            # how train_batch gets hold of the current batch.
            train_losses.append(train_batch(model, optimizer, device))
        # NOTE(review): `n_samples` is not defined in the visible code and is
        # assumed to be a module-level value. Validation is assumed to run
        # once per epoch (the post's indentation was lost).
        validate_loss = runs_for_validate(validate_iterator, n_samples)  # check on validate data
    return train_losses, validate_loss
# Build the model and its optimizer, then run training.
model = NET(inputs).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# The training function in this snippet is named `trainv`; the original call
# to `train(...)` referenced a name that the snippet never defines.
train_losses, validate_loss = trainv(
    model, device, epochs, train_iterator, optimizer, validate_iterator
)
My problem is that I don't know how to save the model and then load the saved model again to train it for more epochs.
Previously I could save model.state_dict() to a '.pt' file and load it, but that does not seem to work here. I already saw this post, How can I save my training progress in PyTorch for a certain batch no.?; however, I don't know in which format I should save the model, or when I should save it. Could you please help me with this?

You can save the entire model, including the model architecture and its current state, by passing the model object to torch.save.
torch.save(model, "model.pt")
You can save the model's state dictionary, which contains the model's parameters, separately.
torch.save(model.state_dict(), "model_state.pt")
You can load the model by using the torch.load function, passing in the path of the saved model.
model = torch.load("model.pt")
You can also load the state dictionary separately and use it to load the model's parameters.
state_dict = torch.load("model_state.pt")
model.load_state_dict(state_dict)
You can try something like this:
class NET(nn.Module):
    """Placeholder network; fill in `forward` and `test` with the real logic."""

    def __init__(self, inputs):
        super().__init__()
        self.inputs = inputs

    def forward(self, y, par):
        # Your forward pass here
        return None

    def test(self, y, num_samples):
        # Your test function here
        return None
def train_batch(model, optimizer, device):
    """Perform a single optimisation step (sketch; part of the body is elided).

    NOTE(review): neither `batch` nor `loss1` is defined in the visible
    code; they are assumed to come from the elided section. Confirm.
    """
    model.train()
    optimizer.zero_grad()
    length = float(batch.size(0))
    ...
    return loss1
def trainv(model, device, epochs, train_iterator, optimizer, validate_iterator, save_path):
    """Train for `epochs` epochs and checkpoint the best model to `save_path`.

    After every epoch the validation loss is computed; whenever it improves
    on the best value seen so far, the model's state dict is written to
    `save_path`.

    Returns:
        A tuple ``(train_losses, validate_loss)``: the list of losses returned
        by `train_batch` for every batch, and the validation loss of the last
        epoch (``None`` when `epochs` is 0).
    """
    # The original returned `train_losses` without ever defining it.
    train_losses = []
    # Defined up front so the return statement works even when epochs == 0.
    validate_loss = None
    best_loss = float('inf')
    for epoch in range(epochs):
        for local_batch, local_labels in train_iterator:
            # NOTE(review): the batch is not passed to train_batch; confirm
            # how train_batch gets hold of the current batch.
            train_losses.append(train_batch(model, optimizer, device))
        # NOTE(review): `n_samples` is not defined in the visible code and is
        # assumed to be a module-level value. Validation is assumed to run
        # once per epoch (the post's indentation was lost).
        validate_loss = runs_for_validate(validate_iterator, n_samples)  # check on validate data
        # Save the model if the validation loss improves
        if validate_loss < best_loss:
            best_loss = validate_loss
            # Only the model weights are stored here. To resume with the
            # optimizer's internal state as well, optimizer.state_dict()
            # would have to be saved alongside.
            torch.save(model.state_dict(), save_path)
    return train_losses, validate_loss
def load_model(model, load_path, device):
    """Restore `model`'s parameters from the state dict stored at `load_path`.

    The stored tensors are mapped onto `device` while loading. The same
    `model` object is returned after its state has been replaced.
    """
    model.load_state_dict(torch.load(load_path, map_location=device))
    return model
# Build a fresh model on the target device.
model = NET(inputs).to(device)

# Warm-start from a previous checkpoint when a load path is given.
if load_path is not None:
    model = load_model(model, load_path, device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
train_losses, validate_loss = trainv(
    model,
    device,
    epochs,
    train_iterator,
    optimizer,
    validate_iterator,
    save_path,
)

Related

convert pytorch model with multiple networks to onnx

I am trying to convert pytorch model with multiple networks to ONNX, and encounter some problem.
The git repo: https://github.com/InterDigitalInc/HRFAE
The Trainer Class:
class Trainer(nn.Module):
    """Holds the configuration and the sub-networks (excerpt; the rest is elided)."""

    def __init__(self, config):
        super().__init__()

        # Load Hyperparameters
        self.config = config

        # Networks
        self.enc = Encoder()
        self.dec = Decoder()
        self.mlp_style = Mod_Net()
        self.dis = Dis_PatchGAN()
        ...
Here is how the trained model process image:
def gen_encode(self, x_a, age_a, age_b=0, training=False, target_age=0):
    """Reconstruct `x_a` at its own age and generate it at a modified age.

    When `target_age` is truthy it is stored on the instance and used for
    every element of `age_a`; otherwise the modified age comes from
    `self.random_age(age_a, diff_val=25)`. `age_b` and `training` are
    accepted but not used here.

    Returns:
        ``(x_a_recon, x_a_modif, age_modif)``.
    """
    # Decide which age the image is translated to.
    if not target_age:
        age_modif = self.random_age(age_a, diff_val=25)
    else:
        self.target_age = target_age
        age_modif = self.target_age*torch.ones(age_a.size()).type_as(age_a)

    # Generate modified image
    self.content_code_a, skip_1, skip_2 = self.enc(x_a)
    style_params_a, style_params_b = self.mlp_style(age_a), self.mlp_style(age_modif)
    decode = self.dec
    x_a_recon = decode(self.content_code_a, style_params_a, skip_1, skip_2)
    x_a_modif = decode(self.content_code_a, style_params_b, skip_1, skip_2)
    return x_a_recon, x_a_modif, age_modif
And the following is how I tried to convert it to ONNX:
# Instantiate the three sub-networks and chain them in a Sequential container.
enc, dec, mlp = Encoder(), Decoder(), Mod_Net()
layers = [enc, mlp, dec]
model = torch.nn.Sequential(*layers)
# here is my confusion: how do I specify the inputs of each layer??
# E.g. one of the outputs of 'enc' layer should be input of 'mlp' layer,
# or the outputs of 'enc' layer should be part of inputs of 'dec' layer...
params = torch.load('./logs/001/checkpoint')
# Restore each sub-network from its own entry in the checkpoint.
for index, key in enumerate(('enc_state_dict', 'mlp_style_state_dict', 'dec_state_dict')):
    model[index].load_state_dict(params[key])
dummy_input = torch.randn([1, 3, 1024, 1024])
torch.onnx.export(model, dummy_input, 'trained_hrfae.onnx', do_constant_folding=True)
Maybe the conversion code is done the wrong way?
Could anyone help, many thanks!
#20210629-11:52GMT Edit:
I found there's a constraint when using torch.nn.Sequential: the output of each layer in the Sequential must be consistent with the input of the next layer.
So my code can't work at all, because the output of the 'enc' layer is not consistent with the input of the 'mlp' layer.
Could anyone help with how to convert this type of PyTorch model to ONNX? Many thanks, again :)
After some research and experimentation, I found a method which may be the correct way:
Convert each net (Encoder, Mod_Net, Decoder) to its own ONNX model, and handle their inputs/outputs in later processing logic or any further procedure (e.g. converting to a tflite model).
I'm trying to port it to Android using this method.
#Edit 20210705-03:52GMT#
Another approach may be better: write a new net that combines the three nets. I've verified that its output is the same as that of the original PyTorch model.
class HRFAE(nn.Module):
    """Single module chaining the encoder, the style MLP and the decoder.

    Takes an image tensor and a target-age tensor and returns the decoded,
    age-modified image, so the whole pipeline can be exported at once.
    """

    def __init__(self):
        super().__init__()
        self.enc = Encoder()
        self.mlp_style = Mod_Net()
        self.dec = Decoder()

    def forward(self, x, age_modif):
        content_code_a, skip_1, skip_2 = self.enc(x)
        return self.dec(content_code_a, self.mlp_style(age_modif), skip_1, skip_2)
and then convert it using the following:
net = HRFAE()
# The net stays on the CPU for export, so map the checkpoint tensors to the
# CPU as well. Without map_location, a checkpoint written from a GPU cannot
# be loaded on a machine that has no CUDA device.
params = torch.load('./logs/002/checkpoint', map_location='cpu')
net.enc.load_state_dict(params['enc_state_dict'])
net.mlp_style.load_state_dict(params['mlp_style_state_dict'])
net.dec.load_state_dict(params['dec_state_dict'])
# Switch to inference mode before exporting.
net.eval()
# Example inputs used for export: one 3x512x512 image and one integer age.
dummy_image = torch.randn([1, 3, 512, 512])
dummy_age = torch.randn([1]).type(torch.long)
torch.onnx.export(net, (dummy_image, dummy_age), 'test_hrfae.onnx')
This should be the answer.

CNN + RNN architecture for video recognition

I am trying to replicate the ConvNet + LSTM approach presented in this paper using pytorch. But I am struggling to find the correct way to combine the CNN and the LSTM in my model. Here is my attempt :
class VideoRNN(nn.Module):
    """Frozen VGG16 frame encoder followed by a GRU and a linear classifier.

    NOTE(review): forward() applies the CNN directly to `input`, so it
    assumes a plain tensor of frames. A PackedSequence cannot be fed to the
    CNN as-is; its `.data` would have to be embedded and re-packed.
    """

    def __init__(self, hidden_size, n_classes):
        super(VideoRNN, self).__init__()
        self.hidden_size = hidden_size
        vgg = models.vgg16(pretrained=True)
        # Drop the last layer of the VGG classifier so it emits features.
        embed = nn.Sequential(*list(vgg.classifier.children())[:-1])
        vgg.classifier = embed
        # Freeze every VGG parameter.
        for param in vgg.parameters():
            param.requires_grad = False
        self.embedding = vgg
        # Named `gru` to match its use in forward(); the original stored it
        # as `self.GRU`, so forward() looked up an attribute that did not exist.
        self.gru = nn.GRU(4096, hidden_size)
        # forward() used `self.classifier`, which was never created, and
        # `n_classes` was otherwise unused.
        self.classifier = nn.Linear(hidden_size, n_classes)

    def forward(self, input, hidden=None):
        embedded = self.embedding(input)
        # The original passed `output` here before it was ever assigned.
        output, hidden = self.gru(embedded, hidden)
        # The GRU emits `hidden_size` features per step, not 4096.
        output = self.classifier(output.view(-1, self.hidden_size))
        return output, hidden
As my videos have variable length, I provide a PackedSequence as an input. It is created from a Tensor with shape (M,B,C,H,W) where M is the maximum sequence length and B the batch size. The C,H,W are the channels, height and width of each frame.
I want the pre-trained CNN to be part of the model as I may later unfreeze some layer to finetune the CNN for my task. That's why I didn't compute the embedding of the images separately.
My questions are then the following :
Is the shape of my input data correct in order to handle batches of videos in my context or should I use something else than a PackedSequence?
In my forward function, how can I handle the batch of sequences of images with my VGG and my GRU unit ? I cannot feed directly the PackedSequence as an input to my VGG so how can I proceed?
Does this approach seem to respect the "pytorch way of doing things", or is my approach flawed?
I finally found a solution that makes it work. Here is a simplified yet complete example of how I managed to create a VideoRNN able to use a PackedSequence as an input:
class VideoRNN(nn.Module):
    """Video classifier: frozen VGG16 features -> bidirectional LSTM -> MLP.

    forward() expects a PackedSequence of frames so that videos of different
    lengths can share one batch, and returns one row of class scores per video.
    """

    def __init__(self, n_classes, batch_size, device):
        super(VideoRNN, self).__init__()
        # Kept for backward compatibility; forward() reads the real batch
        # size from the packed input instead.
        self.batch = batch_size
        self.device = device
        # Loading a VGG16
        vgg = models.vgg16(pretrained=True)
        # Removing last layer of vgg 16
        embed = nn.Sequential(*list(vgg.classifier.children())[:-1])
        vgg.classifier = embed
        # Freezing all VGG parameters (the loop covers the whole network)
        for param in vgg.parameters():
            param.requires_grad = False
        self.embedding = vgg
        # The attribute keeps the name `gru` although the layer is an LSTM.
        self.gru = nn.LSTM(4096, 2048, bidirectional=True)
        # Classification layer (*2 because bidirectional)
        self.classifier = nn.Sequential(
            nn.Linear(2048 * 2, 256),
            nn.ReLU(),
            nn.Linear(256, n_classes),
        )

    def forward(self, input):
        # batch_sizes[0] is the number of sequences present at the first
        # time step, i.e. the number of videos in this packed batch.
        n_videos = int(input.batch_sizes[0])
        # One layer x two directions -> 2 state slices. The original sized
        # c_0 with `self.num_layer`, an attribute that is never defined.
        h_0 = torch.zeros(2, n_videos, 2048).to(self.device)
        c_0 = torch.zeros(2, n_videos, 2048).to(self.device)
        embedded = self.simple_elementwise_apply(self.embedding, input)
        output, (h_n, c_n) = self.gru(embedded, (h_0, c_0))
        # h_n is (2, batch, 2048). Concatenate the two directions per video;
        # a plain view(-1, 4096) would glue together states that belong to
        # different videos whenever the batch holds more than one.
        features = torch.cat((h_n[0], h_n[1]), dim=1)
        return self.classifier(features)

    def simple_elementwise_apply(self, fn, packed_sequence):
        """Apply `fn` to the packed data and re-wrap it as a PackedSequence."""
        # Carry the sort indices over so that sequences packed with
        # enforce_sorted=False keep their original batch order.
        return torch.nn.utils.rnn.PackedSequence(
            fn(packed_sequence.data),
            packed_sequence.batch_sizes,
            packed_sequence.sorted_indices,
            packed_sequence.unsorted_indices,
        )
The key is the simple_elementwise_apply method, which allows feeding the PackedSequence into the CNN and retrieving a new PackedSequence made of embeddings as output.
I hope you'll find it useful.

Speeding up the trainning - RNN with LSTM in PyTorch

I am trying to train an LSTM for energy demand forecasting, but it takes too long. I do not understand why, because the model looks “simple” and there is not much data. Might it be because I am not using a DataLoader? How could I use one with an RNN, since I have a sequence?
Complete code is in Colab: https://colab.research.google.com/drive/130rG8_j1Lf8RQoVRrfXCeo5h_CcC5NU6?usp=sharing
The interesting part to be improved may be this:
# One optimisation step per (sequence, target) pair.
for seq, y_train in train_data:
    optimizer.zero_grad()
    # Give the model a fresh pair of zero tensors as its hidden state.
    model.hidden = tuple(torch.zeros(1,1,model.hidden_size) for _ in range(2))
    y_pred = model(seq)
    loss = criterion(y_pred, y_train)
    loss.backward()
    optimizer.step()
Thanks in advance to anyone helping me.
If you want to speed up training, more data must be provided to the model per training step. In my case I was providing just one sample per step. The simplest way to solve this is to use a DataLoader.
Complete Colab with the solution can be found in this link: https://colab.research.google.com/drive/1QgtshCFETZ9oTvIYWy1Bdre-614kbwRX?usp=sharing
# This is to create the Dataset
from torch.utils.data import Dataset, DataLoader
class DemandDataset(Dataset):
    """Pairs each input sequence with its target so a DataLoader can batch them."""

    def __init__(self, X_train, y_train):
        self.X_train, self.y_train = X_train, y_train

    def __len__(self):
        # The number of targets defines the dataset size.
        return len(self.y_train)

    def __getitem__(self, idx):
        return self.X_train[idx], self.y_train[idx]
# This is to convert from typical RNN sequences
sq_0 = []
y_0 = []
for seq, y_train in train_data:
    sq_0.append(seq)
    y_0.append(y_train)
dataset = DemandDataset(sq_0, y_0)
dataloader = DataLoader(dataset, batch_size=20)

epochs = 30
t = 50  # number of future steps to forecast after training

for i in range(epochs):
    print("New epoch")
    for data, label in dataloader:
        optimizer.zero_grad()
        # NOTE(review): the hidden state is still created with a batch
        # dimension of 1 while the loader yields 20 samples; confirm this
        # matches how the model reshapes its input.
        model.hidden = (torch.zeros(1, 1, model.hidden_size),
                        torch.zeros(1, 1, model.hidden_size))
        # Feed the current mini-batch. The original called model(seq), which
        # reused the last `seq` left over from the conversion loop above, so
        # every step trained on the same sequence.
        y_pred = model(data)
        loss = criterion(y_pred, label)
        loss.backward()
        optimizer.step()
    print(f'Epoch: {i+1:2} Loss: {loss.item():10.8f}')

# Forecast `t` steps ahead, feeding each prediction back in as input.
# NOTE(review): the post's indentation was lost. The forecast is placed after
# the training loop here; it does not update the model, so running it once
# at the end leaves the same final `preds` and `loss`.
preds = train_set[-window_size:].tolist()
for _ in range(t):
    seq = torch.FloatTensor(preds[-window_size:])
    with torch.no_grad():
        model.hidden = (torch.zeros(1, 1, model.hidden_size),
                        torch.zeros(1, 1, model.hidden_size))
        preds.append(model(seq).item())
# Compare the `t` forecast values with the last `t` targets. The original
# sliced `preds` with `window_size`, which only lines up when
# window_size == t.
loss = criterion(torch.tensor(preds[-t:]), y[-t:])

what should I do if my regression model stuck at a high value loss?

I'm using neural nets for a regression problem where I have 3 features and I'm trying to predict one continuous value. I noticed that my neural net starts learning well, but after 10 epochs it gets stuck at a high loss value and cannot improve any further.
I tried to use Adam and other adaptive optimizers instead of SGD but that didn't work. I tried a complex architectures like adding layers, neurons, batch normalization and other activations etc.. and that also didn't work.
I tried to debug and try to find out if something is wrong with the implementation but when I use only 10 examples of the data my model learn fast so there are no errors. I start to increase the examples of the data and monitoring my model results as I increase the data examples. when I reach 3000 data examples my model start to get stuck on a high value loss.
I tried to increase layers, neurons and also to try other activations, batch normalization. My data are also normalized between [-1, 1], my target value is not normalized since it is regression and I'm predicting a continuous value. I also tried using keras but I've got the same result.
My real dataset have 40000 data, I don't know what should I try, I almost try all things that I know for optimization but none of them worked. I would appreciate it if someone can guide me on this. I'll post my Code but maybe it is too messy to try to understand, I'm sure there is no problem with my implementation, I'm using skorch/pytorch and some SKlearn functions:
# Take all features as independent variables except the bearing and distance.
# With few rows the model learns well, but from 3000 data points on it gets
# stuck at a high value: the loss starts at 15, decreases, and stalls at 9.
# With the whole dataset the loss starts at 47, decreases to 36 and stalls
# there too.
X = dataset.iloc[:3000, 0:-2].reset_index(drop=True).to_numpy().astype(np.float32)
# take distance and bearing as the output values:
y = dataset.iloc[:3000, -2:].reset_index(drop=True).to_numpy().astype(np.float32)
y_bearing = y[:, 0].reshape(-1, 1)
y_distance = y[:, 1].reshape(-1, 1)

# Split once for both targets. The original called train_test_split twice
# with the same random_state, which yields the same row partition.
(X_train_raw, X_test_raw,
 y_br_train, y_br_test,
 y_dis_train, y_dis_test) = train_test_split(X,
                                             y_bearing,
                                             y_distance,
                                             test_size=0.1,
                                             random_state=42,
                                             shuffle=True)

# Normalize the input values. The scaler is fitted on the training rows only
# so that test-set statistics do not leak into training. The original fitted
# it on all rows before splitting, and passed `y`, which the scaler ignores.
scaler = StandardScaler()
X_br_train = X_dis_train = scaler.fit_transform(X_train_raw)
X_br_test = X_dis_test = scaler.transform(X_test_raw)
# Kept because later code reads X_norm.shape[1] for the number of features.
X_norm = scaler.transform(X)

bearing_trainset = Dataset(X_br_train, y_br_train)
bearing_testset = Dataset(X_br_test, y_br_test)
distance_trainset = Dataset(X_dis_train, y_dis_train)
distance_testset = Dataset(X_dis_test, y_dis_test)
def root_mse(y_true, y_pred):
    """Return the square root of `mean_squared_error(y_true, y_pred)`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)
class RMSELoss(nn.Module):
    """Root-mean-squared-error loss.

    Args:
        eps: small constant added to the MSE before taking the square root.
            The derivative of sqrt is unbounded at 0, so without it a
            perfect fit (MSE exactly 0) produces NaN gradients.
    """

    def __init__(self, eps=1e-8):
        super().__init__()
        self.mse = nn.MSELoss()
        self.eps = eps

    def forward(self, yhat, y):
        return torch.sqrt(self.mse(yhat, y) + self.eps)
class AED(nn.Module):
    """custom average euclidean distance loss

    Computes the Euclidean distance between prediction and target along the
    last dimension and averages it over the remaining dimensions. The
    original used ``torch.dist``, which returns a single norm over the whole
    difference tensor and is therefore not an average.
    """

    def __init__(self):
        super().__init__()

    def forward(self, yhat, y):
        per_sample = (yhat - y).pow(2).sum(dim=-1).sqrt()
        return per_sample.mean()
def train(on_target,
          hidden_units,
          batch_size,
          epochs,
          optimizer,
          lr,
          regularisation_factor,
          train_shuffle):
    """Train the distance or the bearing regressor with skorch.

    Args:
        on_target: 'distance' or 'bearing' (case-insensitive); selects the
            dataset pair and the network class.
        hidden_units: hidden layer width passed to the network.
        batch_size: mini-batch size.
        epochs: maximum number of epochs.
        optimizer: 'adam' (case-insensitive) selects Adam; anything else SGD.
        lr: learning rate.
        regularisation_factor: weight decay passed to the optimizer.
        train_shuffle: whether the training iterator shuffles.

    Returns:
        The value returned by ``model.fit`` (the original discarded it).

    Raises:
        ValueError: if `on_target` is neither 'distance' nor 'bearing'.
    """
    target = on_target.lower()
    # The original fell through with network=None (and the bearing data) for
    # any other value; fail early with a clear message instead.
    if target not in ('distance', 'bearing'):
        raise ValueError(
            f"on_target must be 'distance' or 'bearing', got {on_target!r}")

    if target == 'distance':
        trainset, testset = distance_trainset, distance_testset
    else:
        trainset, testset = bearing_trainset, bearing_testset
    print(f"shape of trainset.X = {trainset.X.shape}, shape of trainset.y = {trainset.y.shape}")
    print(f"shape of testset.X = {testset.X.shape}, shape of testset.y = {testset.y.shape}")

    mse = EpochScoring(scoring=mean_squared_error, lower_is_better=True, name='MSE')
    r2 = EpochScoring(scoring=r2_score, lower_is_better=False, name='R2')
    rmse = EpochScoring(scoring=make_scorer(root_mse), lower_is_better=True, name='RMSE')
    checkpoint = Checkpoint(dirname=f'results/{on_target}/checkpoints')
    train_end_checkpoint = TrainEndCheckpoint(dirname=f'results/{on_target}/checkpoints')

    if target == 'bearing':
        # Output width comes from the bearing targets; the original read
        # y_distance.shape[1] in this branch.
        network = BearingNetwork(n_features=X_norm.shape[1],
                                 n_hidden=hidden_units,
                                 n_out=y_bearing.shape[1])
    else:
        network = DistanceNetwork(n_features=X_norm.shape[1],
                                  n_hidden=hidden_units,
                                  n_out=1)

    model = NeuralNetRegressor(
        module=network,
        criterion=RMSELoss,
        device='cpu',
        batch_size=batch_size,
        lr=lr,
        optimizer=optim.Adam if optimizer.lower() == 'adam' else optim.SGD,
        optimizer__weight_decay=regularisation_factor,
        max_epochs=epochs,
        iterator_train__shuffle=train_shuffle,
        train_split=predefined_split(testset),
        callbacks=[mse, r2, rmse, checkpoint, train_end_checkpoint]
    )
    print(f"{'*' * 10} start training the {on_target} model {'*' * 10}")
    history = model.fit(trainset, y=None)
    print(f"{'*' * 10} End Training the {on_target} Model {'*' * 10}")
    return history
if __name__ == '__main__':
    # Map the parsed command-line options onto train()'s keyword arguments.
    args = parser.parse_args()
    cli_options = {
        'on_target': args.on_target,
        'hidden_units': args.hidden_units,
        'batch_size': args.batch_size,
        'epochs': args.epochs,
        'optimizer': args.optimizer,
        'lr': args.learning_rate,
        'regularisation_factor': args.regularisation_lambda,
        'train_shuffle': args.shuffle,
    }
    train(**cli_options)
and this is my network declaration:
class DistanceNetwork(nn.Module):
    """separate NN for predicting distance

    A three-layer perceptron: n_features -> n_hidden -> 5 -> n_out, with
    LeakyReLU activations between the linear layers.
    """

    def __init__(self, n_features=5, n_hidden=16, n_out=1):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(n_features, n_hidden),
            nn.LeakyReLU(),
            nn.Linear(n_hidden, 5),
            nn.LeakyReLU(),
            nn.Linear(5, n_out)
        )

    def forward(self, x):
        """Run `x` through the sequential stack."""
        # The snippet showed no forward(); without one the module cannot be
        # called on any input.
        return self.model(x)
here is the log while training:

How to continue training for a saved and then loaded Keras model?

Following the official keras documentation , I was able to save and load a model. Keras is using tensorflow as the backend.
However, is it possible to run more training for such saved and loaded models.
Following is the code borrowed from Link. Then edited.
In the following code, the model is trained for 75 epochs and saved then loaded again.
However, when I tried to train it further for 75 more epochs, it seems the model was not trained, and I got the same result without any change.
# -*- coding: utf-8 -*-
from keras.models import Sequential
from keras.layers import Dense
from keras.models import model_from_json
import numpy
import os
# fix random seed for reproducibility
numpy.random.seed(7)

# load pima indians dataset
dataset = numpy.loadtxt("pima-indians-diabetes.txt", delimiter=",")

# split into input (X) and output (Y) variables
X = dataset[:, 0:8]
Y = dataset[:, 8]

# create model
model = Sequential()
model.add(Dense(12, input_dim=8, kernel_initializer='uniform', activation='relu'))
model.add(Dense(8, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model
model.fit(X, Y, epochs=75, batch_size=10, verbose=0)

# evaluate the model
scores = model.evaluate(X, Y, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)

# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

# later...

# load json and create model (the with-block closes the file even on error)
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

# evaluate loaded model on test data
# NOTE(review): the loaded model is compiled with 'rmsprop' while the
# original model used 'adam'; confirm the optimizer switch is intended.
loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
score = loaded_model.evaluate(X, Y, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

# Continue training the LOADED model. The original called model.fit(...)
# here, so the model evaluated below never received the extra epochs.
loaded_model.fit(X, Y, epochs=75, batch_size=10, verbose=0)
score = loaded_model.evaluate(X, Y, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))
It looks from that code like you're evaluating the loaded_model twice, but your extra training is done on just the model. Instead of copying and pasting different variable names, you could try something like this... I find it a little easier to keep track of. Also, add some white space to your code between comments; it will help keep things clear and organized.
# load_model was used without being imported in the original snippet.
from keras.models import load_model

# Save a model you have trained
model.save('trained_model.h5')

# Delete the model
del model

# Load the model
model = load_model('trained_model.h5')

# Train more on the loaded model.
# Keyword arguments are required here: fit() takes batch_size before epochs,
# so the positional call fit(data, labels, epochs, batch_size) swapped them.
model.fit(data, labels, epochs=epochs, batch_size=batch_size)