CNN + RNN architecture for video recognition - deep-learning

I am trying to replicate the ConvNet + LSTM approach presented in this paper using PyTorch, but I am struggling to find the correct way to combine the CNN and the LSTM in my model. Here is my attempt:
import torch.nn as nn
from torchvision import models

class VideoRNN(nn.Module):
    def __init__(self, hidden_size, n_classes):
        super(VideoRNN, self).__init__()
        self.hidden_size = hidden_size

        # Pre-trained VGG16 with its last classifier layer removed (4096-d output)
        vgg = models.vgg16(pretrained=True)
        embed = nn.Sequential(*list(vgg.classifier.children())[:-1])
        vgg.classifier = embed

        # Freeze the CNN for now
        for param in vgg.parameters():
            param.requires_grad = False

        self.embedding = vgg
        self.gru = nn.GRU(4096, hidden_size)
        self.classifier = nn.Linear(hidden_size, n_classes)

    def forward(self, input, hidden=None):
        embedded = self.embedding(input)
        output, hidden = self.gru(embedded, hidden)
        output = self.classifier(output.view(-1, self.hidden_size))
        return output, hidden
As my videos have variable length, I provide a PackedSequence as an input. It is created from a Tensor with shape (M,B,C,H,W) where M is the maximum sequence length and B the batch size. The C,H,W are the channels, height and width of each frame.
I want the pre-trained CNN to be part of the model as I may later unfreeze some layer to finetune the CNN for my task. That's why I didn't compute the embedding of the images separately.
My questions are then the following:
Is the shape of my input data correct for handling batches of videos in my context, or should I use something other than a PackedSequence?
In my forward function, how can I handle the batch of sequences of images with my VGG and my GRU unit? I cannot feed the PackedSequence directly into my VGG, so how can I proceed?
Does this approach respect the "PyTorch way of doing things", or is my approach flawed?

I finally found the solution to make it work. Here is a simplified yet complete example of how I managed to create a VideoRNN able to take a PackedSequence as input:
import torch
import torch.nn as nn
from torchvision import models

class VideoRNN(nn.Module):
    def __init__(self, n_classes, batch_size, device):
        super(VideoRNN, self).__init__()

        self.batch = batch_size
        self.device = device

        # Loading a VGG16
        vgg = models.vgg16(pretrained=True)

        # Removing the last layer of VGG16 so it outputs 4096-d embeddings
        embed = nn.Sequential(*list(vgg.classifier.children())[:-1])
        vgg.classifier = embed

        # Freezing the VGG parameters
        for param in vgg.parameters():
            param.requires_grad = False

        self.embedding = vgg
        self.lstm = nn.LSTM(4096, 2048, bidirectional=True)

        # Classification layer (*2 because bidirectional)
        self.classifier = nn.Sequential(
            nn.Linear(2048 * 2, 256),
            nn.ReLU(),
            nn.Linear(256, n_classes),
        )

    def forward(self, input):
        # Initial hidden and cell states: (num_layers * num_directions, batch, hidden_size)
        h_0 = torch.zeros(2, self.batch, 2048).to(self.device)
        c_0 = torch.zeros(2, self.batch, 2048).to(self.device)

        # Apply the CNN frame by frame on the packed data
        embedded = self.simple_elementwise_apply(self.embedding, input)

        output, (h_n, c_n) = self.lstm(embedded, (h_0, c_0))

        # Concatenate the final hidden state of both directions: (batch, 2048 * 2)
        hidden = torch.cat((h_n[0], h_n[1]), dim=1)
        output = self.classifier(hidden)

        return output

    def simple_elementwise_apply(self, fn, packed_sequence):
        """Apply fn to the data of a PackedSequence and rewrap the result."""
        return torch.nn.utils.rnn.PackedSequence(
            fn(packed_sequence.data), packed_sequence.batch_sizes
        )
The key is the simple_elementwise_apply method, which makes it possible to feed the PackedSequence frames through the CNN and to get back a new PackedSequence of embeddings as output.
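For completeness, here is a minimal sketch of how a batch of variable-length videos could be packed and fed to this model; the frame size, the sequence lengths, and the variable names are assumptions:

import torch
from torch.nn.utils.rnn import pack_padded_sequence

device = torch.device("cpu")
model = VideoRNN(n_classes=10, batch_size=3, device=device)

# Hypothetical batch: 3 videos of 10, 7 and 4 frames, each frame being 3x224x224
lengths = [10, 7, 4]                                            # sorted, longest first
frames = torch.randn(max(lengths), len(lengths), 3, 224, 224)   # (M, B, C, H, W)

# pack_padded_sequence keeps the trailing (C, H, W) dimensions, so packed.data has
# shape (total_frames, C, H, W) and can go straight through the VGG
packed = pack_padded_sequence(frames, lengths)

scores = model(packed)   # (B, n_classes)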
I hope you'll find it useful.

Related

CNN-LSTM performance identical to LSTM, is there a mistake in my code? (Pytorch)

I am trying to recreate the models from a study in which CNN-LSTM outperformed LSTM, but my CNN-LSTM produces nearly identical results to the LSTM. So it seems like the addition of the convolutional layers is not doing anything. The study describes the CNN-LSTM model like this:
The model is constructed by a single LSTM layer and two CNN layers. To form the CNN part, two 1D convolutional neural networks are stacked without any pooling layer. The second CNN layer is followed by a Rectified Linear Unit (ReLU) activation function. Each of the flattened output of the CNN’s ReLU layer and the LSTM layer is projected to the same dimension using a fully connected layer. Finally, a dropout layer is placed before the output layer.
Did I make a mistake in the implementation? The results of my CNN-LSTM are almost exactly the same as when I use the LSTM on its own. The LSTM on its own is the exact same code as below, just without the two Conv1d layers and without the ReLU activation function.
class CNN_LSTM(nn.Module):
    def __init__(self, input_size, seq_len, params, output_size):
        super(CNN_LSTM, self).__init__()
        self.n_hidden = params['lstm_hidden']  # neurons in each lstm layer
        self.seq_len = seq_len                 # length of the input sequence
        self.n_layers = 1                      # nr of recurrent layers in the lstm
        self.n_filters = params['n_filters']   # size of filter in cnn
        self.c1 = nn.Conv1d(in_channels=1, out_channels=params['n_filters'], kernel_size=1, stride=1)
        self.c2 = nn.Conv1d(in_channels=params['n_filters'], out_channels=1, kernel_size=1, stride=1)
        self.lstm = nn.LSTM(
            input_size=input_size,  # nr of input features
            hidden_size=params['lstm_hidden'],
            num_layers=1
        )
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(in_features=seq_len*params['lstm_hidden'], out_features=params['dense_hidden'])
        self.dropout = nn.Dropout(p=.4)
        self.fc2 = nn.Linear(in_features=params['dense_hidden'], out_features=output_size)  # output_size = nr of output features

    def reset_hidden_state(self):
        self.hidden = (
            torch.zeros(self.n_layers, self.seq_len, self.n_hidden).to(device=device),
            torch.zeros(self.n_layers, self.seq_len, self.n_hidden).to(device=device),
        )

    def forward(self, sequences):
        out = self.c1(sequences.view(len(sequences), 1, -1))
        out = self.c2(out.view(len(out), self.n_filters, -1))
        out = F.relu(out)
        out, self.hidden = self.lstm(
            out.view(len(out), self.seq_len, -1),
            self.hidden
        )
        out = self.flatten(out)
        out = self.fc1(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out
Source for the study I am using.
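For reference, here is a minimal sketch of one possible reading of the quoted description, with the CNN and the LSTM as parallel branches whose flattened outputs are each projected to the same dimension before dropout and the output layer. The kernel size, the projection dimension, and the way the two branches are merged are assumptions, since the quote does not specify them:

import torch
import torch.nn as nn
import torch.nn.functional as F

class ParallelCNNLSTM(nn.Module):
    """Hypothetical parallel-branch reading of the quoted description."""
    def __init__(self, input_size, seq_len, n_filters, lstm_hidden, proj_dim, output_size):
        super().__init__()
        # CNN branch: two stacked 1D convolutions, ReLU after the second one
        self.c1 = nn.Conv1d(input_size, n_filters, kernel_size=3, padding=1)
        self.c2 = nn.Conv1d(n_filters, n_filters, kernel_size=3, padding=1)
        # LSTM branch: a single LSTM layer over the raw sequence
        self.lstm = nn.LSTM(input_size, lstm_hidden, num_layers=1, batch_first=True)
        # Each flattened branch output is projected to the same dimension
        self.cnn_proj = nn.Linear(n_filters * seq_len, proj_dim)
        self.lstm_proj = nn.Linear(lstm_hidden * seq_len, proj_dim)
        self.dropout = nn.Dropout(p=0.4)
        self.out = nn.Linear(proj_dim, output_size)

    def forward(self, x):                       # x: (batch, seq_len, input_size)
        cnn_out = self.c1(x.transpose(1, 2))    # Conv1d expects (batch, channels, seq_len)
        cnn_out = F.relu(self.c2(cnn_out))
        cnn_out = self.cnn_proj(cnn_out.flatten(1))

        lstm_out, _ = self.lstm(x)
        lstm_out = self.lstm_proj(lstm_out.flatten(1))

        combined = cnn_out + lstm_out           # how the branches are merged is a guess
        return self.out(self.dropout(combined))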

apply LSTM on BERT embedding

I am using the following code to classify toxic tweets. I want to modify it to use an LSTM architecture, so that the BERT embeddings are fed into LSTM layers.
class BertClassifier(nn.Module):
    def __init__(self, freeze_bert=False):
        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 50, 2
        # Instantiate BERT model
        self.bert = BertModel.from_pretrained('aubmindlab/bert-base-arabertv02-twitter')
        # Instantiate a one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            #nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
When I use an LSTM, an error appears, so the forward function must be modified as well:
    def forward(self, input_ids, attention_mask):
        # return logits (torch.Tensor): an output tensor with shape (batch_size, num_labels)
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        last_hidden_state_cls = outputs[0][:, 0, :]
        logits = self.classifier(last_hidden_state_cls)
        return logits
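For what it's worth, here is a minimal sketch of one way to route the full sequence of BERT token embeddings through an LSTM before the classifier; the LSTM hidden size and the choice of the final hidden state as the sentence representation are assumptions:

import torch.nn as nn
from transformers import BertModel

class BertLSTMClassifier(nn.Module):
    """Hypothetical variant: BERT token embeddings -> LSTM -> classifier."""
    def __init__(self, freeze_bert=False):
        super().__init__()
        D_in, H, D_out = 768, 50, 2
        self.bert = BertModel.from_pretrained('aubmindlab/bert-base-arabertv02-twitter')
        self.lstm = nn.LSTM(D_in, H, batch_first=True)
        self.classifier = nn.Linear(H, D_out)
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # outputs[0]: hidden states for every token, shape (batch_size, seq_len, 768)
        lstm_out, (h_n, c_n) = self.lstm(outputs[0])
        # Use the LSTM's final hidden state as the sequence representation
        logits = self.classifier(h_n[-1])
        return logits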

Gradio - Pytorch MNIST Digit Recognizer

I watched the following video on YouTube https://www.youtube.com/watch?v=jx9iyQZhSwI which shows that it is possible to use Gradio with a model trained on the MNIST dataset in TensorFlow. I have read that it is possible to use PyTorch with Gradio as well, but I am having problems implementing it. Does anyone have an idea how to do this?
My PyTorch CNN code:
import torch.nn as nn

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=16,
                kernel_size=5,
                stride=1,
                padding=2,
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )
        # fully connected layer, output 10 classes
        self.out = nn.Linear(32 * 7 * 7, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        # flatten the output of conv2 to (batch_size, 32 * 7 * 7)
        x = x.view(x.size(0), -1)
        output = self.out(x)
        return output, x  # return x for visualization
From the video, I gather that I need to change the function that Gradio uses:
def predict_image(img):
    img_3d = img.reshape(-1, 28, 28)
    im_resize = img_3d / 255.0
    prediction = CNN(im_resize)
    pred = np.argmax(prediction)
    return pred
I'm sorry if I got your question wrong, but from what I understand, you are getting an error when trying to predict the digit using your predict_image function.
Here are two possible hints. Maybe you have implemented them already, but I can't tell from the small code snippet.
First of all, have you set your model to evaluation mode using
CNN.eval()
Do this after you have finished training your model and want to evaluate inputs without training it further.
Second, you may need to add a fourth dimension to your input tensor "im_resize". Normally your model expects a batch dimension, a channel dimension, and the height and width of your input.
In addition, I can't tell whether your input is of type torch.Tensor. If not, convert your array into a tensor first.
You can add a batch dimension to your input tensor by using
im_resize = im_resize.unsqueeze(0)
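Putting those hints together, a rough sketch of what the prediction function could look like, assuming the drawing arrives as a 28x28 grayscale numpy array in [0, 255] and that `model` is your trained instance of the CNN class above:

import torch

model = CNN()    # placeholder; in practice this should hold your trained weights
model.eval()     # switch to evaluation mode before inference

def predict_image(img):
    # reshape to (batch, channels, height, width) and scale to [0, 1]
    x = torch.tensor(img.reshape(1, 1, 28, 28), dtype=torch.float32) / 255.0
    with torch.no_grad():
        logits, _ = model(x)            # the CNN above returns (output, features)
    return int(logits.argmax(dim=1))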
I hope that I understand your question correctly and was able to help you.

Why is my Transformer implementation losing to a BiLSTM?

I am dealing with a sequence tagging problem, and I am using a single Transformer encoder to obtain logits for each element of the sequence. Having experimented with both the Transformer and a BiLSTM, it looks like the BiLSTM works better in my case, so I was wondering whether my Transformer implementation has some problem. Below is my implementation of the Transformer encoder and the related functions for creating the padding mask and the positional embeddings:
def create_mask(src, lengths):
    """Create a mask hiding the padding tokens.

    Parameters:
    src (tensor): the source tensor with shape [batch_size, number_of_steps, features_dimensions]
    lengths (list): a list of integers representing the length (i.e. number_of_steps) of each sample in the batch."""
    mask = []
    max_len = src.shape[1]
    for index, i in enumerate(src):
        # The mask consists of tensors that are False at the steps that don't need to be hidden and True otherwise
        mask.append([False if (i+1) > lengths[index] else True for i in range(max_len)])
    return torch.tensor(mask)
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000, device='cpu'):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.device = device
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :].to(self.device)
        return self.dropout(x)
class Transformer(nn.Module):
    """Class implementing a Transformer encoder, partially based on
    https://pytorch.org/tutorials/beginner/transformer_tutorial.html"""
    def __init__(self, in_dim, h_dim, n_heads, n_layers, dropout=0.2, drop_out=0.0, batch_first=True, device='cpu', positional_encoding=True):
        super(Transformer, self).__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(in_dim, dropout, device=device)
        encoder_layers = nn.TransformerEncoderLayer(in_dim, n_heads, h_dim, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, n_layers, norm=nn.LayerNorm(in_dim))
        self.in_dim = in_dim
        self.drop_out = drop_out
        self.positional_encoding = positional_encoding

    def forward(self, src, mask=None, line_len=None):
        src = src * math.sqrt(self.in_dim)
        if self.positional_encoding:
            src = self.pos_encoder(src)
        if line_len is not None and mask is None:
            mask = create_mask(src, line_len)
        else:
            mask = None
        output = self.transformer_encoder(src, src_key_padding_mask=mask)
        if self.drop_out:
            output = F.dropout(output, p=self.drop_out)
        return src, output
As can be seen, the network above outputs the hidden states, which I then pass into an additional linear layer and train with a cross-entropy loss over two classes using the Adam optimizer. I have tried multiple combinations of hyperparameters, but the BiLSTM still performs better. Can anyone spot anything wrong in my Transformer, or suggest why I am getting such a counterintuitive result?
This may be surprising, but Transformers don't always beat LSTMs. For example, Language Models with Transformers states:
Transformer architectures are suboptimal for language model itself. Neither self-attention nor the positional encoding in the Transformer is able to efficiently incorporate the word-level sequential context crucial to language modeling.
If you run the Transformer tutorial code itself (on which your code is based), you'll also see the LSTM do better there. See this thread on stats.SE for more discussion on this topic (disclaimer: both the question and the answer there are mine).

What should I do if my regression model is stuck at a high loss value?

I'm using neural nets for a regression problem where I have 3 features and I'm trying to predict one continuous value. I noticed that my neural net starts learning well, but after 10 epochs it gets stuck at a high loss value and cannot improve anymore.
I tried to use Adam and other adaptive optimizers instead of SGD, but that didn't work. I tried more complex architectures, adding layers, neurons, batch normalization, other activations, etc., and that didn't work either.
I tried to debug and find out whether something was wrong with the implementation, but when I use only 10 examples of the data my model learns fast, so there are no obvious errors. I then started increasing the number of examples and monitoring the results as I did so. When I reach 3000 data examples, my model starts to get stuck at a high loss value.
I tried increasing layers and neurons, and also other activations and batch normalization. My data are normalized to [-1, 1]; my target value is not normalized, since this is regression and I'm predicting a continuous value. I also tried using Keras, but I got the same result.
My real dataset has 40000 samples. I don't know what else to try; I have tried almost everything I know for optimization, but none of it worked. I would appreciate it if someone could guide me on this. I'll post my code, but maybe it is too messy to understand. I'm sure there is no problem with my implementation; I'm using skorch/PyTorch and some sklearn functions:
# take all features as independent variables except the bearing and distance
# when I start small, the model learns well, but from 3000 data points on it gets stuck at a high value. I mean the starting loss is 15 and it starts to learn well, but when it reaches 9 it gets stuck there
# and if I try to use the whole dataset for training, then the loss starts at 47, decreases until it reaches 36, and then gets stuck there too
X = dataset.iloc[:3000, 0:-2].reset_index(drop=True).to_numpy().astype(np.float32)

# take distance and bearing as the output values:
y = dataset.iloc[:3000, -2:].reset_index(drop=True).to_numpy().astype(np.float32)
y_bearing = y[:, 0].reshape(-1, 1)
y_distance = y[:, 1].reshape(-1, 1)

# normalize the input values
scaler = StandardScaler()
X_norm = scaler.fit_transform(X, y)

X_br_train, X_br_test, y_br_train, y_br_test = train_test_split(X_norm,
                                                                y_bearing,
                                                                test_size=0.1,
                                                                random_state=42,
                                                                shuffle=True)

X_dis_train, X_dis_test, y_dis_train, y_dis_test = train_test_split(X_norm,
                                                                    y_distance,
                                                                    test_size=0.1,
                                                                    random_state=42,
                                                                    shuffle=True)

bearing_trainset = Dataset(X_br_train, y_br_train)
bearing_testset = Dataset(X_br_test, y_br_test)

distance_trainset = Dataset(X_dis_train, y_dis_train)
distance_testset = Dataset(X_dis_test, y_dis_test)


def root_mse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))


class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        return torch.sqrt(self.mse(yhat, y))


class AED(nn.Module):
    """custom average euclidean distance loss"""
    def __init__(self):
        super().__init__()

    def forward(self, yhat, y):
        return torch.dist(yhat, y)
def train(on_target,
          hidden_units,
          batch_size,
          epochs,
          optimizer,
          lr,
          regularisation_factor,
          train_shuffle):

    network = None
    trainset = distance_trainset if on_target.lower() == 'distance' else bearing_trainset
    testset = distance_testset if on_target.lower() == 'distance' else bearing_testset

    print(f"shape of trainset.X = {trainset.X.shape}, shape of trainset.y = {trainset.y.shape}")
    print(f"shape of testset.X = {testset.X.shape}, shape of testset.y = {testset.y.shape}")

    mse = EpochScoring(scoring=mean_squared_error, lower_is_better=True, name='MSE')
    r2 = EpochScoring(scoring=r2_score, lower_is_better=False, name='R2')
    rmse = EpochScoring(scoring=make_scorer(root_mse), lower_is_better=True, name='RMSE')

    checkpoint = Checkpoint(dirname=f'results/{on_target}/checkpoints')
    train_end_checkpoint = TrainEndCheckpoint(dirname=f'results/{on_target}/checkpoints')

    if on_target.lower() == 'bearing':
        network = BearingNetwork(n_features=X_norm.shape[1],
                                 n_hidden=hidden_units,
                                 n_out=y_distance.shape[1])
    elif on_target.lower() == 'distance':
        network = DistanceNetwork(n_features=X_norm.shape[1],
                                  n_hidden=hidden_units,
                                  n_out=1)

    model = NeuralNetRegressor(
        module=network,
        criterion=RMSELoss,
        device='cpu',
        batch_size=batch_size,
        lr=lr,
        optimizer=optim.Adam if optimizer.lower() == 'adam' else optim.SGD,
        optimizer__weight_decay=regularisation_factor,
        max_epochs=epochs,
        iterator_train__shuffle=train_shuffle,
        train_split=predefined_split(testset),
        callbacks=[mse, r2, rmse, checkpoint, train_end_checkpoint]
    )

    print(f"{'*' * 10} start training the {on_target} model {'*' * 10}")
    history = model.fit(trainset, y=None)
    print(f"{'*' * 10} End Training the {on_target} Model {'*' * 10}")


if __name__ == '__main__':
    args = parser.parse_args()

    train(on_target=args.on_target,
          hidden_units=args.hidden_units,
          batch_size=args.batch_size,
          epochs=args.epochs,
          optimizer=args.optimizer,
          lr=args.learning_rate,
          regularisation_factor=args.regularisation_lambda,
          train_shuffle=args.shuffle)
and this is my network declaration:
class DistanceNetwork(nn.Module):
    """separate NN for predicting distance"""
    def __init__(self, n_features=5, n_hidden=16, n_out=1):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(n_features, n_hidden),
            nn.LeakyReLU(),
            nn.Linear(n_hidden, 5),
            nn.LeakyReLU(),
            nn.Linear(5, n_out)
        )
here is the log while training: