MXNet CNN+LSTM save/serialize to JSON

I'm having a hard time figuring out how to correctly define an MXNet net so that I can serialize/convert the model to a JSON file.
The pipeline is composed of a CNN + biLSTM + CTC.
I know I must use HybridBlock and hybridize(), but I can't seem to make it work, and I'm not sure whether it's even possible or whether there is another way around it.
I'm sure it's a lack of knowledge on my part and wonder if anyone can help.
Here is the net definition in Python:
NUM_HIDDEN = 200
NUM_CLASSES = 13550
NUM_LSTM_LAYER = 1
p_dropout = 0.5
SEQ_LEN = 32

def get_featurizer():
    featurizer = gluon.nn.HybridSequential()
    # conv layer
    featurizer.add(gluon.nn.Conv2D(kernel_size=(3, 3), padding=(1, 1), channels=32, activation="relu"))
    featurizer.add(gluon.nn.BatchNorm())
    ....
    featurizer.hybridize()
    return featurizer

class EncoderLayer(gluon.Block):
    def __init__(self, **kwargs):
        super(EncoderLayer, self).__init__(**kwargs)
        with self.name_scope():
            self.lstm = mx.gluon.rnn.LSTM(NUM_HIDDEN, NUM_LSTM_LAYER, bidirectional=True)

    def forward(self, x):
        x = x.transpose((0, 3, 1, 2))
        x = x.flatten()
        x = x.split(num_outputs=SEQ_LEN, axis=1)  # (SEQ_LEN, N, CHANNELS)
        x = nd.concat(*[elem.expand_dims(axis=0) for elem in x], dim=0)
        x = self.lstm(x)
        x = x.transpose((1, 0, 2))  # (N, SEQ_LEN, HIDDEN_UNITS)
        return x

def get_encoder():
    encoder = gluon.nn.Sequential()
    encoder.add(EncoderLayer())
    encoder.add(gluon.nn.Dropout(p_dropout))
    return encoder

def get_decoder():
    decoder = mx.gluon.nn.Dense(units=ALPHABET_SIZE, flatten=False)
    decoder.hybridize()
    return decoder

def get_net():
    net = gluon.nn.Sequential()
    with net.name_scope():
        net.add(get_featurizer())
        net.add(get_encoder())
        net.add(get_decoder())
    return net
Any help would be highly appreciated.
Thank you very much.

There are a few requirements for a model in Gluon to be exportable to JSON:
It needs to be hybridizable, meaning that each child block should be hybridizable as well and the model should work in both modes.
All parameters should be initialized. Since Gluon uses deferred parameter initialization, you need to do a forward pass at least once before you can save the model.
I made some fixes to your code, introducing new constants where needed. The most significant changes are:
Don't use split if you can avoid it, because it returns a list of NDArrays. Use reshape, which works seamlessly with Symbol as well (see the short contrast sketch after this list).
Starting from MXNet 1.3.0, LSTM is also hybridizable, so you can wrap it in a HybridBlock instead of a plain Block.
Use HybridSequential.
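For illustration, here is a minimal contrast sketch of split vs. reshape, with toy shapes rather than the model's real dimensions:

import mxnet as mx
from mxnet import nd

x = nd.zeros((2, 3200))                  # toy (N, SEQ_LEN * CHANNELS) input
parts = x.split(num_outputs=32, axis=1)  # returns a Python list of 32 NDArrays
print(type(parts), len(parts))           # <class 'list'> 32

y = x.reshape((32, -1, 100))             # a single NDArray; the same call works on Symbol
print(y.shape)                           # (32, 2, 100)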
Here is the adjusted code, with an example at the bottom of how to save the model and how to load it back. You can find more information in this tutorial.
import mxnet as mx
from mxnet import gluon
from mxnet import nd

BATCH_SIZE = 1
CHANNELS = 100
ALPHABET_SIZE = 1000
NUM_HIDDEN = 200
NUM_CLASSES = 13550
NUM_LSTM_LAYER = 1
p_dropout = 0.5
SEQ_LEN = 32
HEIGHT = 100
WIDTH = 100

def get_featurizer():
    featurizer = gluon.nn.HybridSequential()
    featurizer.add(
        gluon.nn.Conv2D(kernel_size=(3, 3), padding=(1, 1), channels=32, activation="relu"))
    featurizer.add(gluon.nn.BatchNorm())
    return featurizer

class EncoderLayer(gluon.HybridBlock):
    def __init__(self, **kwargs):
        super(EncoderLayer, self).__init__(**kwargs)
        with self.name_scope():
            self.lstm = mx.gluon.rnn.LSTM(NUM_HIDDEN, NUM_LSTM_LAYER, bidirectional=True)

    def hybrid_forward(self, F, x):
        x = x.transpose((0, 3, 1, 2))
        x = x.flatten()
        x = x.reshape(shape=(SEQ_LEN, -1, CHANNELS))  # instead of split: (SEQ_LEN, N, CHANNELS)
        x = self.lstm(x)
        x = x.transpose((1, 0, 2))  # (N, SEQ_LEN, HIDDEN_UNITS)
        return x

def get_encoder():
    encoder = gluon.nn.HybridSequential()
    encoder.add(EncoderLayer())
    encoder.add(gluon.nn.Dropout(p_dropout))
    return encoder

def get_decoder():
    decoder = mx.gluon.nn.Dense(units=ALPHABET_SIZE, flatten=False)
    return decoder

def get_net():
    net = gluon.nn.HybridSequential()
    with net.name_scope():
        net.add(get_featurizer())
        net.add(get_encoder())
        net.add(get_decoder())
    return net

if __name__ == '__main__':
    net = get_net()
    net.initialize()
    net.hybridize()

    fake_data = mx.random.uniform(shape=(BATCH_SIZE, HEIGHT, WIDTH, CHANNELS))
    out = net(fake_data)

    net.export("mymodel")

    deserialized_net = gluon.nn.SymbolBlock.imports("mymodel-symbol.json", ['data'],
                                                    "mymodel-0000.params", ctx=mx.cpu())
    out2 = deserialized_net(fake_data)

    # just to check that we get the same results
    assert (out - out2).sum().asscalar() == 0

Related

Self-attention module occupying too much VRAM with large input

I was trying to add a self-attention module to Progressive GAN (ProGAN), placing it in the last layer before toRGB. After running a simple test file I found that when the model grows to 256x256 output, the process is killed.
Then I tried the following test code:
import torch
import torch.nn as nn

class SelfAttention(nn.Module):
    def __init__(self, channels):
        super(SelfAttention, self).__init__()
        self.channels = channels
        num_heads = 4
        self.mha = nn.MultiheadAttention(channels, num_heads, batch_first=True)
        self.ln = nn.LayerNorm([channels])
        self.ff_self = nn.Sequential(
            nn.LayerNorm([channels]),
            nn.Linear(channels, channels),
            nn.GELU(),
            nn.Linear(channels, channels)
        )

    def forward(self, x):
        size = x.shape[3]
        print("SIZE", size)
        print("CHANNELS", self.channels)
        x = x.view(-1, self.channels, size * size).swapaxes(1, 2)
        x_ln = self.ln(x)
        attention_value, _ = self.mha(x_ln, x_ln, x_ln)
        attention_value = attention_value + x
        attention_value = self.ff_self(attention_value) + attention_value
        return attention_value.swapaxes(2, 1).view(-1, self.channels, size, size)

class SelfAttention2(nn.Module):
    def __init__(self, channels):
        super(SelfAttention2, self).__init__()
        self.query = nn.Conv2d(channels, channels // 8, kernel_size=1)
        self.key = nn.Conv2d(channels, channels // 8, kernel_size=1)
        self.value = nn.Conv2d(channels, channels, kernel_size=1)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        N, C, H, W = x.size()
        query = self.query(x).view(N, -1, W * H).permute(0, 2, 1)  # (N, H*W, C//8)
        key = self.key(x).view(N, -1, W * H)                       # (N, C//8, H*W)
        energy = torch.bmm(query, key)                             # (N, H*W, H*W)
        attention = self.softmax(energy)
        value = self.value(x).view(N, -1, W * H)                   # (N, C, H*W)
        out = torch.bmm(value, attention.permute(0, 2, 1))
        out = out.view(N, C, H, W)
        return out

if __name__ == '__main__':
    x = torch.randn((1, 64, 256, 256))
    sa1 = SelfAttention(64)
    sa1(x)
    sa2 = SelfAttention2(64)
    sa2(x)
Neither module works: each tries to allocate 16 GB of VRAM. (With this one module taking up 16 GB, I can't even run the whole model on a 3090.)
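A back-of-the-envelope check suggests the (H*W) x (H*W) energy matrix alone accounts for the failed allocation:

H = W = 256
tokens = H * W                       # 65536 spatial positions
energy_bytes = tokens * tokens * 4   # one float32 (H*W, H*W) attention matrix
print(energy_bytes / 1024 ** 3)      # 16.0 (GiB) -- exactly the allocation that fails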
I have been told explicitly that the method itself, i.e. "add attention to ProGAN or StyleGAN", works and has been done.
So, is my understanding of the idea wrong, or does the implementation have a flaw?
Also, I have trained the model up to 32x32 and it worked fine.
Suggestions on my understanding or my coding are welcome.

How to make multi-inputs and multi-outputs neural network model

I converted the following code from Keras to PyTorch. The main challenge for me is making a multi-input, multi-output model similar to keras.models.Model, i.e. how to implement the following code in PyTorch so that it accepts multiple inputs and outputs.
from tensorflow import keras as k
import tensorflow as tf

class NetworkKeys:
    NUM_UNITS = "num_units"
    ACTIVATION = "activation"
    L2_REG_FACT = "l2_reg_fact"
    DROP_PROB = "drop_prob"
    BATCH_NORM = "batch_norm"

def build_dense_network(input_dim, output_dim,
                        output_activation, params, with_output_layer=True):
    model = k.models.Sequential()
    activation = params.get(NetworkKeys.ACTIVATION, "relu")
    l2_reg_fact = params.get(NetworkKeys.L2_REG_FACT, 0.0)
    regularizer = k.regularizers.l2(l2_reg_fact) if l2_reg_fact > 0 else None
    drop_prob = params.get(NetworkKeys.DROP_PROB, 0.0)
    batch_norm = params.get(NetworkKeys.BATCH_NORM, False)
    last_dim = input_dim
    for i in range(len(params[NetworkKeys.NUM_UNITS])):
        model.add(k.layers.Dense(units=params[NetworkKeys.NUM_UNITS][i],
                                 kernel_regularizer=regularizer,
                                 input_dim=last_dim))
        if batch_norm:
            model.add(k.layers.BatchNormalization())
        model.add(k.layers.Activation(activation))
        last_dim = params[NetworkKeys.NUM_UNITS][i]
        if drop_prob > 0.0:
            model.add(k.layers.Dropout(rate=drop_prob))
    if with_output_layer:
        model.add(k.layers.Dense(units=output_dim, activation=output_activation))
    return model

ldre_net = build_dense_network(input_dim=input_dim, output_dim=1,
                               output_activation=k.activations.linear,
                               params=hidden_params)

p_samples = k.layers.Input(shape=(input_dim,))
q_samples = k.layers.Input(shape=(input_dim,))

train_model = k.models.Model(inputs=[p_samples, q_samples],
                             outputs=[ldre_net(p_samples), ldre_net(q_samples)])
Here is my attempt to convert the above code to PyTorch:
def l2_penalty(model, l2_lambda=0.001):
    """Returns the L2 penalty of the params."""
    l2_norm = sum(p.pow(2).sum() for p in model.parameters())
    return l2_lambda * l2_norm

def build_dense_network(input_dim, output_dim,
                        output_activation, params, with_output_layer=True):
    activation = params.get(NetworkKeys.ACTIVATION, "relu")
    l2_reg_fact = params.get(NetworkKeys.L2_REG_FACT, 0.0)
    drop_prob = params.get(NetworkKeys.DROP_PROB, 0.0)
    batch_norm = params.get(NetworkKeys.BATCH_NORM, False)
    layers = []
    last_dim = input_dim
    for i in range(len(params[NetworkKeys.NUM_UNITS])):
        layers.append(nn.Linear(last_dim, params[NetworkKeys.NUM_UNITS][i]))
        if batch_norm:
            layers.append(torch.nn.BatchNorm1d(params[NetworkKeys.NUM_UNITS][i]))
        if activation == "relu":
            layers.append(nn.ReLU())
        elif activation == "LeakyRelu":
            layers.append(nn.LeakyReLU(0.1, inplace=True))
        else:
            pass
        last_dim = params[NetworkKeys.NUM_UNITS][i]
        if drop_prob > 0.0:
            layers.append(torch.nn.Dropout(p=drop_prob))
    if with_output_layer:
        layers.append(nn.Linear(params[NetworkKeys.NUM_UNITS][-1], output_dim))
    model = nn.Sequential(*layers)
    regularizer = l2_penalty(model, l2_lambda=0.001) if l2_reg_fact > 0 else None
    return model, regularizer

class Split(torch.nn.Module):
    def __init__(self, module, n_parts: int, dim=1):
        super().__init__()
        self._n_parts = n_parts
        self._dim = dim
        self._module = module

    def forward(self, inputs):
        output = self._module(inputs)
        chunk_size = output.shape[self._dim] // self._n_parts
        return torch.split(output, chunk_size, dim=self._dim)

class Net(nn.Module):
    def __init__(self, hidden_params, input_dim):
        self._ldre_net, ldre_regularizer = build_dense_network(
            input_dim=input_dim, output_dim=1,
            output_activation="linear", params=hidden_params)
        self._p_samples = nn.Linear(input_dim, input_dim)
        self._q_samples = nn.Linear(input_dim, input_dim)
        self._split_layers = Split(self._ldre_net, n_parts=2, dim=0)

    def forward(self, x, inTrain=True):
        if inTrain:
            p = self._p_samples(x)
            q = self._q_samples(x)
            p = x[:, 0, :]
            q = x[:, 1, :]
            combined = torch.cat((p.view(p.size(0), -1),
                                  q.view(q.size(0), -1)), dim=0)
            p_output, q_output = self._split_layers(combined)
            return p_output, q_output
        else:
            return self._ldre_net(x)
I am wondering whether my implementation of the Net class is correct.
TL;DR: In PyTorch you control the number of inputs and outputs yourself, in the form of tensors (or any number of variables). The missing super initialization and the order of operations in forward should be fixed. I also don't particularly like the way arguments are passed; I recommend using *args and **kwargs.
Explanation
There were a few things I had to do to make it run. The NetworkKeys constants are used to access the dictionary that is passed through, which seems like an overly complicated way to do things: you tried to provide default values, but in the end it throws an exception if a key is missing (namely num_units). I recommend just using *args and **kwargs and passing the dictionary as a parameter. I tried it with the following example:
values = {NetworkKeys.BATCH_NORM: False,
          NetworkKeys.L2_REG_FACT: 0.0,
          NetworkKeys.DROP_PROB: 0.0,
          NetworkKeys.ACTIVATION: "relu",
          NetworkKeys.NUM_UNITS: [10, 10]}
print(values)
Net(values, 10)
There were a few things to fix in the Net class:
Needs initialization of super (e.g. super(Net, self).__init__())
The order of operations in the forward pass didn't make sense: you were overwriting the output of the linear layers. Note that we now do self._p_samples(p), where p is one slice of the input, p = x[:, 0, :].
class Net(nn.Module):
    def __init__(self, hidden_params, input_dim):
        super(Net, self).__init__()
        self._ldre_net, ldre_regularizer = build_dense_network(
            input_dim=input_dim, output_dim=1,
            output_activation="linear", params=hidden_params)
        self._p_samples = nn.Linear(input_dim, input_dim)
        self._q_samples = nn.Linear(input_dim, input_dim)
        self._split_layers = Split(self._ldre_net, n_parts=2, dim=0)

    def forward(self, x, inTrain=True):
        if inTrain:
            p = x[:, 0, :]
            q = x[:, 1, :]
            p = self._p_samples(p)
            q = self._q_samples(q)
            combined = torch.cat((p.view(p.size(0), -1),
                                  q.view(q.size(0), -1)), dim=0)
            p_output, q_output = self._split_layers(combined)
            return p_output, q_output
        else:
            return self._ldre_net(x)
Displaying the network after a successful forward pass with an input of torch.randn((1, 2, 10)) gives:
==========================================================================================
Layer (type:depth-idx) Output Shape Param #
==========================================================================================
Net -- --
├─Linear: 1-1 [1, 10] 110
├─Linear: 1-2 [1, 10] 110
├─Split: 1-3 [1, 1] --
├─Sequential: 1-4 [2, 1] --
│ └─Linear: 2-1 [2, 10] 110
│ └─ReLU: 2-2 [2, 10] --
│ └─Linear: 2-3 [2, 10] 110
│ └─ReLU: 2-4 [2, 10] --
│ └─Linear: 2-5 [2, 1] 11
==========================================================================================
Total params: 451
Trainable params: 451
Non-trainable params: 0
Total mult-adds (M): 0.00
==========================================================================================
Example output will be of the form:
(tensor([[-0.0699]], grad_fn=<SplitBackward0>),
tensor([[0.0394]], grad_fn=<SplitBackward0>))
Note: I didn't try to overfit this model (which you should do) to validate that it indeed can learn what you want.
As a side note, if you really want multiple outputs, for example auxiliary values that aren't part of the main tensor and have to be computed separately, you can just do return x, y in the forward pass.
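A minimal sketch of that pattern (the module and names here are illustrative, not from the code above):

import torch
import torch.nn as nn

class TwoHeaded(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.backbone = nn.Linear(dim, dim)
        self.head_a = nn.Linear(dim, 1)  # main output
        self.head_b = nn.Linear(dim, 1)  # auxiliary output

    def forward(self, x):
        h = self.backbone(x)
        return self.head_a(h), self.head_b(h)  # simply return a tuple

main_out, aux_out = TwoHeaded(10)(torch.randn(4, 10))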

PyTorch: Batch size=1 but model gives 2 outputs

Here is some part of my PyTorch code:
test_loader = DataLoader(dataset=test_loader_hibiscus, batch_size=1, shuffle=False, num_workers=0)
test_losses = []
y_pred_list = []
feat_list = []
with torch.no_grad():
    model.eval()
    test_loss = 0.0
    if expe_temoin == False:
        for test_dwi, test_adc, test_tmax, test_cbf, test_cbv, test_label in test_loader:
            test_dwi = test_dwi.to(device)
            test_adc = test_adc.to(device)
            test_tmax = test_tmax.to(device)
            test_cbf = test_cbf.to(device)
            test_cbv = test_cbv.to(device)
            in_imgs = torch.cat((train_dwi, train_adc, train_tmax, train_cbf, train_cbv), dim=1)
            out_recon, my_feat = model(in_imgs)
            print("my_feat", my_feat[0].shape)
But it prints:
my_feat torch.Size([2, 512, 1, 24, 24])
Could someone please tell me why the first dimension is 2 (the batch?)? Thanks!
Hint: when I run with a test data size of 26 it's OK, but with a data size of 25 the batch gets messed up! Is there something about odd and even sizes?!
Here is the 3D U-Net model for 3D reconstruction and segmentation:
class Abstract3DUNet(nn.Module):
    def __init__(self, in_channels, out_channels, final_sigmoid, basic_module, f_maps=64, layer_order='gcr',
                 num_groups=8, num_levels=4, is_segmentation=True, testing=False,
                 conv_kernel_size=3, pool_kernel_size=2, conv_padding=1, **kwargs):
        super(Abstract3DUNet, self).__init__()
        self.testing = testing
        if isinstance(f_maps, int):
            f_maps = number_of_features_per_level(f_maps, num_levels=num_levels)
        assert isinstance(f_maps, list) or isinstance(f_maps, tuple)
        assert len(f_maps) > 1, "Required at least 2 levels in the U-Net"
        # create encoder path
        self.encoders = create_encoders(in_channels, f_maps, basic_module, conv_kernel_size,
                                        conv_padding, layer_order, num_groups, pool_kernel_size)
        # create decoder path
        self.decoders = create_decoders(f_maps, basic_module, conv_kernel_size, conv_padding,
                                        layer_order, num_groups, upsample=True)
        # in the last layer a 1x1 convolution reduces the number of output
        # channels to the number of labels
        self.final_conv = nn.Conv3d(f_maps[0], out_channels, 1)
        if is_segmentation:
            # semantic segmentation problem
            if final_sigmoid:
                self.final_activation = nn.Sigmoid()
            else:
                self.final_activation = nn.Softmax(dim=1)
        else:
            # regression problem
            self.final_activation = None

    def forward(self, x):
        # encoder part
        encoders_features = []
        my_feat = []
        for encoder in self.encoders:
            x = encoder(x)
            # reverse the encoder outputs to be aligned with the decoder
            encoders_features.insert(0, x)
        # remove the last encoder's output from the list
        # !!remember: it's the 1st in the list
        my_feat = encoders_features[0:]
        encoders_features = encoders_features[1:]
        # decoder part
        for decoder, encoder_features in zip(self.decoders, encoders_features):
            # pass the output from the corresponding encoder and the output
            # of the previous decoder
            x = decoder(encoder_features, x)
        x = self.final_conv(x)
        # apply final_activation (i.e. Sigmoid or Softmax) only during prediction;
        # during training the network outputs logits and it's up to the user to
        # normalize them before visualizing in TensorBoard or computing validation metrics
        if self.testing and self.final_activation is not None:
            x = self.final_activation(x)
        return x, my_feat

class UNet3D(Abstract3DUNet):
    def __init__(self, in_channels, out_channels, final_sigmoid=True, f_maps=64, layer_order='gcr',
                 num_groups=8, num_levels=4, is_segmentation=True, conv_padding=1, **kwargs):
        super(UNet3D, self).__init__(in_channels=in_channels,
                                     out_channels=out_channels,
                                     final_sigmoid=final_sigmoid,
                                     basic_module=DoubleConv,
                                     f_maps=f_maps,
                                     layer_order=layer_order,
                                     num_groups=num_groups,
                                     num_levels=num_levels,
                                     is_segmentation=is_segmentation,
                                     conv_padding=conv_padding,
                                     **kwargs)
My train batch size was 3! When I changed it to 2 or 4, the problem was solved!
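If the stray batch really comes from an uneven final batch, as the 25-vs-26 hint suggests, one way to guard against it is drop_last (a sketch; train_dataset is a placeholder for your own dataset):

from torch.utils.data import DataLoader

# drop_last=True discards the final, smaller batch when the dataset size
# is not divisible by batch_size, so every batch has exactly 3 samples here
train_loader = DataLoader(train_dataset, batch_size=3, shuffle=True, drop_last=True)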

What does Lambda do in this code (Python Keras)?

def AdaIN(x):
    # Normalize x[0] (image representation)
    mean = K.mean(x[0], axis=[1, 2], keepdims=True)
    std = K.std(x[0], axis=[1, 2], keepdims=True) + 1e-7
    y = (x[0] - mean) / std

    # Reshape scale and bias parameters
    pool_shape = [-1, 1, 1, y.shape[-1]]
    scale = K.reshape(x[1], pool_shape)
    bias = K.reshape(x[2], pool_shape)

    # Multiply by x[1] (GAMMA) and add x[2] (BETA)
    return y * scale + bias

def g_block(input_tensor, latent_vector, filters):
    gamma = Dense(filters, bias_initializer='ones')(latent_vector)
    beta = Dense(filters)(latent_vector)

    out = UpSampling2D()(input_tensor)
    out = Conv2D(filters, 3, padding='same')(out)
    out = Lambda(AdaIN)([out, gamma, beta])
    out = Activation('relu')(out)
    return out
Please see the code above. I am currently studying StyleGAN. I am trying to convert this code to PyTorch, but I can't seem to understand what Lambda does in g_block. Based on its declaration, AdaIN needs only one input, but somehow gamma and beta are also used as inputs? Please tell me what Lambda does in this code.
Thank you very much.
Lambda layers in Keras are used to call custom functions inside the model. In g_block, Lambda calls the AdaIN function and passes out, gamma, beta as arguments inside a list. The AdaIN function receives these 3 tensors encapsulated within a single list as x, and those tensors are accessed inside AdaIN by indexing the list x (x[0], x[1], x[2]).
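A stripped-down illustration of that calling convention (toy tensors and a toy function, not the original model):

from tensorflow import keras
from tensorflow.keras.layers import Input, Lambda

def add_pair(x):         # x is the list passed to the Lambda layer
    return x[0] + x[1]   # elements are accessed by index, just like in AdaIN

a = Input(shape=(4,))
b = Input(shape=(4,))
summed = Lambda(add_pair)([a, b])  # multiple tensors go in as one list
model = keras.Model(inputs=[a, b], outputs=summed)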
Here's the PyTorch equivalent:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AdaIN(nn.Module):
    def forward(self, out, gamma, beta):
        bs, ch = out.size()[:2]
        mean = out.reshape(bs, ch, -1).mean(dim=2).reshape(bs, ch, 1, 1)
        std = out.reshape(bs, ch, -1).std(dim=2).reshape(bs, ch, 1, 1) + 1e-7
        y = (out - mean) / std
        bias = beta.unsqueeze(-1).unsqueeze(-1).expand_as(out)
        scale = gamma.unsqueeze(-1).unsqueeze(-1).expand_as(out)
        return y * scale + bias

class g_block(nn.Module):
    def __init__(self, filters, latent_vector_shape, input_tensor_channels):
        super().__init__()
        self.gamma = nn.Linear(in_features=latent_vector_shape, out_features=filters)
        # Initializes all bias to 1
        self.gamma.bias.data = torch.ones(filters)
        self.beta = nn.Linear(in_features=latent_vector_shape, out_features=filters)
        # calculate appropriate padding
        self.conv = nn.Conv2d(input_tensor_channels, filters, 3, 1, padding=1)
        self.adain = AdaIN()

    def forward(self, input_tensor, latent_vector):
        gamma = self.gamma(latent_vector)
        beta = self.beta(latent_vector)
        # check the default interpolation mode in Keras and replace the mode below if different
        out = F.interpolate(input_tensor, scale_factor=2, mode='nearest')
        out = self.conv(out)
        out = self.adain(out, gamma, beta)
        out = torch.relu(out)
        return out

# Sample:
input_tensor = torch.randn((1, 3, 10, 10))
latent_vector = torch.randn((1, 5))
g = g_block(3, latent_vector.shape[1], input_tensor.shape[1])
out = g(input_tensor, latent_vector)
print(out)
Note: you need to pass latent_vector and input_tensor shapes while creating g_block.

Concatenate encoder hidden states/cells/outputs from different sources for attention calculation - issues?

I am using PyTorch for an LSTM encoder-decoder sequence-to-sequence prediction problem. As a first step, I would like to forecast 2D trajectories (trajectory x, trajectory y) from multivariate input, 2D or more (trajectory x, trajectory y, speed, rotation, etc.).
I am following the notebook below (link):
seq2seq with Attention
Here are excerpts (encoder, decoder, attention):
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1, dropout=0.1):
        super(EncoderRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=self.dropout, bidirectional=True)

    def forward(self, input_seqs, input_lengths, hidden=None):
        # Note: we run this all at once (over multiple batches of multiple sequences)
        embedded = self.embedding(input_seqs)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        outputs, hidden = self.gru(packed, hidden)
        outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs)  # unpack (back to padded)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]  # Sum bidirectional outputs
        return outputs, hidden
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()
        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        # Define layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        # Choose attention model
        if attn_model != 'none':
            self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_seq, last_hidden, encoder_outputs):
        # Note: we run this one step at a time
        # Get the embedding of the current input word (last output word)
        batch_size = input_seq.size(0)
        embedded = self.embedding(input_seq)
        embedded = self.embedding_dropout(embedded)
        embedded = embedded.view(1, batch_size, self.hidden_size)  # S=1 x B x N
        # Get current hidden state from input word and last hidden state
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Calculate attention from current RNN state and all encoder outputs;
        # apply to encoder outputs to get weighted average
        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))  # B x S=1 x N
        # Attentional vector using the RNN hidden state and context vector
        # concatenated together (Luong eq. 5)
        rnn_output = rnn_output.squeeze(0)  # S=1 x B x N -> B x N
        context = context.squeeze(1)        # B x S=1 x N -> B x N
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = F.tanh(self.concat(concat_input))
        # Finally predict next token (Luong eq. 6, without softmax)
        output = self.out(concat_output)
        # Return final output, hidden state, and attention weights (for visualization)
        return output, hidden, attn_weights
For calculating attention in the decoder stage, the encoder hidden state and encoder outputs are taken as input and used as below:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(1, hidden_size))

    def forward(self, hidden, encoder_outputs):
        max_len = encoder_outputs.size(0)
        this_batch_size = encoder_outputs.size(1)
        # Create variable to store attention energies
        attn_energies = Variable(torch.zeros(this_batch_size, max_len))  # B x S
        if USE_CUDA:
            attn_energies = attn_energies.cuda()
        # For each batch of encoder outputs
        for b in range(this_batch_size):
            # Calculate energy for each encoder output
            for i in range(max_len):
                attn_energies[b, i] = self.score(hidden[:, b], encoder_outputs[i, b].unsqueeze(0))
        # Normalize energies to weights in range 0 to 1, resize to 1 x B x S
        return F.softmax(attn_energies).unsqueeze(1)

    def score(self, hidden, encoder_output):
        if self.method == 'dot':
            energy = hidden.dot(encoder_output)
            return energy
        elif self.method == 'general':
            energy = self.attn(encoder_output)
            energy = hidden.dot(energy)
            return energy
        elif self.method == 'concat':
            energy = self.attn(torch.cat((hidden, encoder_output), 1))
            energy = self.v.dot(energy)
            return energy
My actual goal is to extend the method by adding further information to be fed into the decoder, such as image data at each input time step. Technically, I want to use two (or more) encoders: one for the trajectories as in the link above, and a separate one for image data (a convolutional encoder).
I do this by concatenating embeddings produced by the trajectory encoder and the convolutional encoder (as well as the cell states etc.) and feeding the concatenated tensors to the decoder.
For example, an image embedding (a 256-length tensor) concatenated with a trajectory data embedding (a 256-length tensor) yields a 512-length embedding.
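Concretely, the concatenation I mean looks like this (a minimal sketch with a made-up batch size):

import torch

# one hidden state per encoder: (num_layers, batch, hidden) = (1, 8, 256)
traj_hidden = torch.randn(1, 8, 256)  # from the trajectory encoder
img_hidden = torch.randn(1, 8, 256)   # from the convolutional image encoder

# concatenate along the feature dimension -> (1, 8, 512),
# which is what I would hand to the decoder as its initial hidden state
decoder_hidden = torch.cat((traj_hidden, img_hidden), dim=2)
print(decoder_hidden.shape)           # torch.Size([1, 8, 512])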
My question is: is it a problem for the attention calculation if I use a concatenated encoder hidden state, concatenated encoder cell state, and concatenated encoder output coming from those different sources, rather than hidden states, cell states, and outputs coming from a single source?
What are the caveats, and what pre-processing should happen, to make this work?
Thank you very much in advance.