I am trying to concatenate embedding layer with other features. It doesn’t give me any error, but doesn’t do any training either. Is anything wrong with this model definition, how to debug this?
Note: The last column (feature) in my X is feature with word2ix (single word).
Note: The net works fine without the embedding feature/layer
originally posted on pytorch forum
class Net(torch.nn.Module):
def __init__(self, n_features, h_sizes, num_words, embed_dim, out_size, dropout=None):
super().__init__()
self.num_layers = len(h_sizes) # hidden + input
self.embedding = torch.nn.Embedding(num_words, embed_dim)
self.hidden = torch.nn.ModuleList()
self.bnorm = torch.nn.ModuleList()
if dropout is not None:
self.dropout = torch.nn.ModuleList()
else:
self.dropout = None
for k in range(len(h_sizes)):
if k == 0:
self.hidden.append(torch.nn.Linear(n_features, h_sizes[0]))
self.bnorm.append(torch.nn.BatchNorm1d(h_sizes[0]))
if self.dropout is not None:
self.dropout.append(torch.nn.Dropout(p=dropout))
else:
if k == 1:
input_dim = h_sizes[0] + embed_dim
else:
input_dim = h_sizes[k-1]
self.hidden.append(torch.nn.Linear(input_dim, h_sizes[k]))
self.bnorm.append(torch.nn.BatchNorm1d(h_sizes[k]))
if self.dropout is not None:
self.dropout.append(torch.nn.Dropout(p=dropout))
# Output layer
self.out = torch.nn.Linear(h_sizes[-1], out_size)
def forward(self, inputs):
# Feedforward
for l in range(self.num_layers):
if l == 0:
x = self.hidden[l](inputs[:, :-1])
x = self.bnorm[l](x)
if self.dropout is not None:
x= self.dropout[l](x)
embeds = self.embedding(inputs[:,-1])#.view((1, -1)
x = torch.cat((embeds, x),dim=1)
else:
x = self.hidden[l](x)
x = self.bnorm[l](x)
if self.dropout is not None:
x = self.dropout[l](x)
x = F.relu(x)
output= self.out(x)
return output
There were a few issues. The key one was data type. I mixed float features and int indices.
sample data and training before fix:
NUM_TARGETS = 4
NUM_FEATURES = 3
NUM_TEXT_FEATURES = 1
x = np.random.rand(5, NUM_FEATURES)
y = np.random.rand(5, NUM_TARGETS)
word_ix = np.arange(5).reshape(-1,1).astype(int)
x_train = np.append(x, word_ix, axis=1)
x_train = torch.from_numpy(x).float().to(device)
y_train = torch.from_numpy(y).float().to(device)
h_sizes = [2,2]
net = Net(x_train.shape[1] , h_sizes=h_sizes, num_words=5, embed_dim=2, out_size=y_train.shape[1],dropout=.01) # define the network
print(net) # net architecture
net = net.float()
net.to(device)
optimizer = torch.optim.Adam(net.parameters(), lr=0.0001, weight_decay=.01)
loss_func = torch.nn.MSELoss() # this is for regression mean squared loss
# one training loop
prediction = net(x_train) # input x and predict based on x
loss = loss_func(prediction, y_train) # must be (1. nn output, 2. target)
optimizer.zero_grad() # clear gradients for next train
loss.backward() # backpropagation, compute gradients
optimizer.step() # apply gradients
# train_losses.append(loss.detach().to('cpu').numpy())
To resolve this, I separated word index feature from x, and also removed net.float().
changed the dtypes conversion to:
x_train = torch.from_numpy(x).float().to(device)
y_train = torch.from_numpy(y).float().to(device)
# NOTE: word index needs to be long
word_ix = torch.from_numpy(word_ix).to(torch.long).to(device)
and forward method changed to :
def forward(self, inputs, word_ix):
# Feedforward
for l in range(self.num_layers):
if l == 0:
x = self.hidden[l](inputs)
x = self.bnorm[l](x)
if self.dropout is not None:
x = self.dropout[l](x)
embeds = self.embedding(word_ix)
# NOTE:
# embeds has a shape of (batch_size, 1, embed_dim)
# inorder to merge this change this with x, reshape this to
# (batch_size, embed_dim)
embeds = embeds.view(embeds.shape[0], embeds.shape[2])
x = torch.cat((x, embeds.view(x.shape)),dim=1)
else:
x = self.hidden[l](x)
x = self.bnorm[l](x)
if self.dropout is not None:
x = self.dropout[l](x)
x = F.relu(x)
output= self.out(x)
return output
Related
Hello guys I've joined a university-level image recognition competition.
In the test, they will give two images (people face) and my model need to detect pair of the image is the same person or not
My model is resnet18 with IR block and SE block. and it will use Arcface loss.
I can use only the MS1M dataset with a total of 86876 classes
The problem is that loss is getting better, but accuracy is 0 and not changing.
Here's part of code I'm working on.
Train
def train_model(model, net, criterion, optimizer, scheduler, num_epochs=25):
since = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
for epoch in range(num_epochs):
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 10)
for phase in ['train']:
if phase == 'train':
model.train() # Set model to training mode
running_loss = 0.0
running_corrects = 0
# Iterate over data.
for inputs, labels in notebook.tqdm(dataloader):
inputs = inputs.to(device)
labels = labels.to(device).long()
# zero the parameter gradients
optimizer.zero_grad()
# forward
# track history if only in train
with torch.set_grad_enabled(phase == 'train'):
features = model(inputs)
outputs = net(features, labels)
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels)
# backward + optimize only if in training phase
if phase == 'train':
loss.backward()
optimizer.step()
# statistics
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)
if phase == 'train':
scheduler.step()
epoch_loss = running_loss / len(dataloader)
epoch_acc = running_corrects.double() / len(dataloader)
print('{} Loss: {:.4f} Acc: {:.4f}'.format(
phase, epoch_loss, epoch_acc))
# deep copy the model
if phase == 'train' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model_wts = copy.deepcopy(model.state_dict())
torch.save({'epoch': epoch,
'mode_state_dict': model.state_dict(),
'fc_state_dict': net.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'scheduler': scheduler.state_dict(), # HERE IS THE CHANGE
}, f'/content/drive/MyDrive/inha_data/training_saver/training_stat{epoch}.pth')
print(f'finished {epoch} and saved model_save_{epoch}.pt')
print()
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
print('Best train Acc: {:4f}'.format(best_acc))
# load best model weights
model.load_state_dict(best_model_wts)
torch.save(model.state_dict(), 'model_save.pt')
return model
Parameters
train_dataset = MS1MDataset('train')
dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True,num_workers=4)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # 디바이스 설정
num_classes = 86876
# normal classifier
# net = nn.Sequential(nn.Linear(512, num_classes))
# Feature extractor backbone, input is 112x112 image output is 512 feature vector
model_ft = resnet18(True)
#set metric
metric_fc = metrics.ArcMarginProduct(512, num_classes, s = 30.0, m = 0.50, easy_margin = False)
metric_fc.to(device)
# net = net.to(device)
model_ft = model_ft.to(device)
criterion = nn.CrossEntropyLoss()
# Observe that all parameters are being optimized
optimizer_ft = torch.optim.Adam([{'params': model_ft.parameters()}, {'params': metric_fc.parameters()}],
lr=0.1)
# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=4, gamma=0.1)
Arcface
from __future__ import print_function
from __future__ import division
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
import math
class ArcMarginProduct(nn.Module):
r"""Implement of large margin arc distance: :
Args:
in_features: size of each input sample
out_features: size of each output sample
s: norm of input feature
m: margin
cos(theta + m)
"""
def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
super(ArcMarginProduct, self).__init__()
self.in_features = in_features
self.out_features = out_features
self.s = s
self.m = m
self.weight = Parameter(torch.FloatTensor(out_features, in_features))
nn.init.xavier_uniform_(self.weight)
self.easy_margin = easy_margin
self.cos_m = math.cos(m)
self.sin_m = math.sin(m)
self.th = math.cos(math.pi - m)
self.mm = math.sin(math.pi - m) * m
def forward(self, input, label):
# --------------------------- cos(theta) & phi(theta) ---------------------------
cosine = F.linear(F.normalize(input), F.normalize(self.weight))
sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
phi = cosine * self.cos_m - sine * self.sin_m
if self.easy_margin:
phi = torch.where(cosine > 0, phi, cosine)
else:
phi = torch.where(cosine > self.th, phi, cosine - self.mm)
# --------------------------- convert label to one-hot ---------------------------
# one_hot = torch.zeros(cosine.size(), requires_grad=True, device='cuda')
one_hot = torch.zeros(cosine.size(), device='cuda')
one_hot.scatter_(1, label.view(-1, 1).long(), 1)
# -------------torch.where(out_i = {x_i if condition_i else y_i) -------------
output = (one_hot * phi) + ((1.0 - one_hot) * cosine) # you can use torch.where if your torch.__version__ is 0.4
output *= self.s
# print(output)
return output
dataset
data_transforms = {
'train': transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.ColorJitter(brightness=0.125, contrast=0.125, saturation=0.125),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
]),
}
#train_ms1_data = torchvision.datasets.ImageFolder('/content/drive/MyDrive/inha_data/train', transform = data_transforms)
class MS1MDataset(Dataset):
def __init__(self,split):
self.file_list = '/content/drive/MyDrive/inha_data/ID_List.txt'
self.images = []
self.labels = []
self.transformer = data_transforms['train']
with open(self.file_list) as f:
files = f.read().splitlines()
for i, fi in enumerate(files):
fi = fi.split()
image = "/content/" + fi[1]
label = int(fi[0])
self.images.append(image)
self.labels.append(label)
def __getitem__(self, index):
img = Image.open(self.images[index])
img = self.transformer(img)
label = self.labels[index]
return img, label
def __len__(self):
return len(self.images)
You can try to use a smaller m in ArcFace, even a minus value.
Here is some part of my PyTorch code:
test_loader = DataLoader(dataset = test_loader_hibiscus, batch_size = 1, shuffle=False, num_workers=0)
test_losses = []
y_pred_list = []
feat_list = []
with torch.no_grad():
model.eval()
test_loss = 0.0
if expe_temoin == False :
for test_dwi,test_adc,test_tmax,test_cbf,test_cbv,test_label in test_loader :
test_dwi = test_dwi.to(device)
test_adc = test_adc.to(device)
test_tmax = test_tmax.to(device)
test_cbf = test_cbf.to(device)
test_cbv = test_cbv.to(device)
in_imgs = torch.cat((train_dwi,train_adc,train_tmax,train_cbf,train_cbv), dim=1)
out_recon, my_feat = model(in_imgs)
print("my_feat", my_feat[0].shape)
But it prints:
my_feat torch.Size([2, 512, 1, 24, 24])
Could someone please tell me why 2 (batch?) Thanks!
Hint: When I run with test data size: 26, its OK, when run with data size: 25, mess up the batch! Is there something about being odd and even?!
Here is the Unet 3d model for 3D reconstruction and segmentation:
class Abstract3DUNet(nn.Module):
def __init__(self, in_channels, out_channels, final_sigmoid, basic_module, f_maps=64, layer_order='gcr',
num_groups=8, num_levels=4, is_segmentation=True, testing=False,
conv_kernel_size=3, pool_kernel_size=2, conv_padding=1, **kwargs):
super(Abstract3DUNet, self).__init__()
self.testing = testing
if isinstance(f_maps, int):
f_maps = number_of_features_per_level(f_maps, num_levels=num_levels)
assert isinstance(f_maps, list) or isinstance(f_maps, tuple)
assert len(f_maps) > 1, "Required at least 2 levels in the U-Net"
# create encoder path
self.encoders = create_encoders(in_channels, f_maps, basic_module, conv_kernel_size, conv_padding, layer_order,
num_groups, pool_kernel_size)
# create decoder path
self.decoders = create_decoders(f_maps, basic_module, conv_kernel_size, conv_padding, layer_order, num_groups,
upsample=True)
# in the last layer a 1×1 convolution reduces the number of output
# channels to the number of labels
self.final_conv = nn.Conv3d(f_maps[0], out_channels, 1)
if is_segmentation:
# semantic segmentation problem
if final_sigmoid:
self.final_activation = nn.Sigmoid()
else:
self.final_activation = nn.Softmax(dim=1)
else:
# regression problem
self.final_activation = None
def forward(self, x):
# encoder part
encoders_features = []
my_feat =[]
for encoder in self.encoders:
x = encoder(x)
# reverse the encoder outputs to be aligned with the decoder
encoders_features.insert(0, x)
# remove the last encoder's output from the list
# !!remember: it's the 1st in the list
my_feat = encoders_features[0:]
encoders_features = encoders_features[1:]
# decoder part
for decoder, encoder_features in zip(self.decoders, encoders_features):
# pass the output from the corresponding encoder and the output
# of the previous decoder
x = decoder(encoder_features, x)
x = self.final_conv(x)
# apply final_activation (i.e. Sigmoid or Softmax) only during prediction. During training the network outputs
# logits and it's up to the user to normalize it before visualising with tensorboard or computing validation metric
if self.testing and self.final_activation is not None:
x = self.final_activation(x)
return x, my_feat
class UNet3D(Abstract3DUNet):
def __init__(self, in_channels, out_channels, final_sigmoid=True, f_maps=64, layer_order='gcr',
num_groups=8, num_levels=4, is_segmentation=True, conv_padding=1, **kwargs):
super(UNet3D, self).__init__(in_channels=in_channels,
out_channels=out_channels,
final_sigmoid=final_sigmoid,
basic_module=DoubleConv,
f_maps=f_maps,
layer_order=layer_order,
num_groups=num_groups,
num_levels=num_levels,
is_segmentation=is_segmentation,
conv_padding=conv_padding,
**kwargs)
My train batch size was 3! When I changed it into 2 or 4 the problem solved!
TRAINING CODE
if os.path.isfile(PATH):
print("checkpoint training '{}' ...".format(PATH))
checkpoint = torch.load(PATH)
start_epoch = checkpoint['epoch']
start_i = checkpoint['i']
net.load_state_dict(checkpoint['state_dict'])
print("=> loaded checkpoint '{}' (trained for {} epochs, {} i)".format(PATH, checkpoint['epoch'],
checkpoint['i']))
else:
print('new training')
for epoch in range(num_epochs): # loop over the dataset multiple times
running_loss = 0.0
for i in range(len(train_folder_list2)):
# get the inputs; data is a list of [inputs, labels]
# net.train()
inputs, labels = train_input[i], train_list[i]
inputs = torch.as_tensor(inputs).cuda()
inputs = inputs.transpose(1, 3)
labels = torch.as_tensor(labels).cuda()
# forward + backward + optimize
outputs = net(inputs)
loss = criterion(outputs, labels)
# zero the parameter gradients
optimizer.zero_grad()
loss.backward()
optimizer.step()
running_loss += loss.item()
if i % 100 == 1:
save_checkpoint({
'epoch': start_epoch + epoch + 1,
'i': start_i + i + 1,
'state_dict': net.state_dict(),
})
TEST CODE
PATH = './checkpoint.pth'
model = Net().cuda()
if os.path.isfile(PATH):
print('checkpoint check!')
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['state_dict'])
model.eval()
for k in range(len(train_folder_list2)):
inputs = train_input[k]
inputs = torch.as_tensor(inputs).cuda()
inputs = inputs.transpose(1, 3)
outputs = model(inputs)
result = outputs.cpu().detach().numpy()
This is the code to find the edges of the image.
If I run the training code, train it, and test it with the test code, it doesn't seem to find any edges in the image. The edges are on the same side, whatever image i put.
**ADD
CNN CODE
In addition, we added cnn code to give you information. Data input was put in the list separately from the image and label.
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(3, 6, 5)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.fc1 = nn.Linear(293904, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 18)
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = x.view(-1, 293904)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
x = x.view(18)
return x
I'm finding a hardtime figuring out how to correctly define a mxnet net so that i can serialize/convert this model to a json file.
The pipeline is composed of a CNN + biLSTM + CTC.
I now i must use HybridBlock and hybridize() but i can't seem to make it work or if its even possible or if there is any other way around.
I'm sure its lack of knowledge on my part and wonder is anyone can help.
Here is the net definition in python:
NUM_HIDDEN = 200
NUM_CLASSES = 13550
NUM_LSTM_LAYER = 1
p_dropout = 0.5
SEQ_LEN = 32
def get_featurizer():
featurizer = gluon.nn.HybridSequential()
# conv layer
featurizer.add(gluon.nn.Conv2D(kernel_size=(3,3), padding=(1,1), channels=32, activation="relu"))
featurizer.add(gluon.nn.BatchNorm())
....
featurizer.hybridize()
return featurizer
class EncoderLayer(gluon.Block):
def __init__(self, **kwargs):
super(EncoderLayer, self).__init__(**kwargs)
with self.name_scope():
self.lstm = mx.gluon.rnn.LSTM(NUM_HIDDEN, NUM_LSTM_LAYER, bidirectional=True)
def forward(self, x):
x = x.transpose((0,3,1,2))
x = x.flatten()
x = x.split(num_outputs=SEQ_LEN, axis = 1) # (SEQ_LEN, N, CHANNELS)
x = nd.concat(*[elem.expand_dims(axis=0) for elem in x], dim=0)
x = self.lstm(x)
x = x.transpose((1, 0, 2)) # (N, SEQ_LEN, HIDDEN_UNITS)
return x
def get_encoder():
encoder = gluon.nn.Sequential()
encoder.add(EncoderLayer())
encoder.add(gluon.nn.Dropout(p_dropout))
return encoder
def get_decoder():
decoder = mx.gluon.nn.Dense(units=ALPHABET_SIZE, flatten=False)
decoder.hybridize()
return decoder
def get_net():
net = gluon.nn.Sequential()
with net.name_scope():
net.add(get_featurizer())
net.add(get_encoder())
net.add(get_decoder())
return net
Any help would be highly appreciated.
Thank you very much.
There are few requirements for a model in Gluon to be exportable to json:
It needs to be hybridizable, meaning that each children block should be hybridizable as well and the model works in both modes
All parameters should be initialized. Since Gluon uses deferred parameter initialization, that means that you should do forward pass at least once before you can save the model.
I did some fixes for your code also introducing new constants when I needed. The most significant changes are:
Don't use split if you can avoid it, because it returns list of NDArrays. Use reshape, which works seemlessly with Symbol as well.
Starting from 1.3.0 version of MXNet, LSTM is also hybridizable, so you can wrap it in a HybridBlock instead of just a Block.
Use HybridSequential.
Here is the adjusted code with an example at the bottom how to save the model and how to load it back. You can find more information in this tutorial.
import mxnet as mx
from mxnet import gluon
from mxnet import nd
BATCH_SIZE = 1
CHANNELS = 100
ALPHABET_SIZE = 1000
NUM_HIDDEN = 200
NUM_CLASSES = 13550
NUM_LSTM_LAYER = 1
p_dropout = 0.5
SEQ_LEN = 32
HEIGHT = 100
WIDTH = 100
def get_featurizer():
featurizer = gluon.nn.HybridSequential()
featurizer.add(
gluon.nn.Conv2D(kernel_size=(3, 3), padding=(1, 1), channels=32, activation="relu"))
featurizer.add(gluon.nn.BatchNorm())
return featurizer
class EncoderLayer(gluon.HybridBlock):
def __init__(self, **kwargs):
super(EncoderLayer, self).__init__(**kwargs)
with self.name_scope():
self.lstm = mx.gluon.rnn.LSTM(NUM_HIDDEN, NUM_LSTM_LAYER, bidirectional=True)
def hybrid_forward(self, F, x):
x = x.transpose((0, 3, 1, 2))
x = x.flatten()
x = x.reshape(shape=(SEQ_LEN, -1, CHANNELS)) #x.split(num_outputs=SEQ_LEN, axis=1) # (SEQ_LEN, N, CHANNELS)
x = self.lstm(x)
x = x.transpose((1, 0, 2)) # (N, SEQ_LEN, HIDDEN_UNITS)
return x
def get_encoder():
encoder = gluon.nn.HybridSequential()
encoder.add(EncoderLayer())
encoder.add(gluon.nn.Dropout(p_dropout))
return encoder
def get_decoder():
decoder = mx.gluon.nn.Dense(units=ALPHABET_SIZE, flatten=False)
return decoder
def get_net():
net = gluon.nn.HybridSequential()
with net.name_scope():
net.add(get_featurizer())
net.add(get_encoder())
net.add(get_decoder())
return net
if __name__ == '__main__':
net = get_net()
net.initialize()
net.hybridize()
fake_data = mx.random.uniform(shape=(BATCH_SIZE, HEIGHT, WIDTH, CHANNELS))
out = net(fake_data)
net.export("mymodel")
deserialized_net = gluon.nn.SymbolBlock.imports("mymodel-symbol.json", ['data'],
"mymodel-0000.params", ctx=mx.cpu())
out2 = deserialized_net(fake_data)
# just to check that we get the same results
assert (out - out2).sum().asscalar() == 0
I am using Pytorch for an LSTM encoder-decoder sequence-to-sequence prediction problem. As a first step, I would like to forecast 2D trajectories (trajectory x, trajectory y) from multivariate input - 2-D or more (trajectory x, trajectory y, speed, rotation, etc.)
I am following the below notebook (link):
seq2seq with Attention
Here excerpts (encoder, decoder, attention):
class EncoderRNN(nn.Module):
def __init__(self, input_size, hidden_size, n_layers=1, dropout=0.1):
super(EncoderRNN, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.n_layers = n_layers
self.dropout = dropout
self.embedding = nn.Embedding(input_size, hidden_size)
self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=self.dropout, bidirectional=True)
def forward(self, input_seqs, input_lengths, hidden=None):
# Note: we run this all at once (over multiple batches of multiple sequences)
embedded = self.embedding(input_seqs)
packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
outputs, hidden = self.gru(packed, hidden)
outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs) # unpack (back to padded)
outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:] # Sum bidirectional outputs
return outputs, hidden
class LuongAttnDecoderRNN(nn.Module):
def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout=0.1):
super(LuongAttnDecoderRNN, self).__init__()
# Keep for reference
self.attn_model = attn_model
self.hidden_size = hidden_size
self.output_size = output_size
self.n_layers = n_layers
self.dropout = dropout
# Define layers
self.embedding = nn.Embedding(output_size, hidden_size)
self.embedding_dropout = nn.Dropout(dropout)
self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=dropout)
self.concat = nn.Linear(hidden_size * 2, hidden_size)
self.out = nn.Linear(hidden_size, output_size)
# Choose attention model
if attn_model != 'none':
self.attn = Attn(attn_model, hidden_size)
def forward(self, input_seq, last_hidden, encoder_outputs):
# Note: we run this one step at a time
# Get the embedding of the current input word (last output word)
batch_size = input_seq.size(0)
embedded = self.embedding(input_seq)
embedded = self.embedding_dropout(embedded)
embedded = embedded.view(1, batch_size, self.hidden_size) # S=1 x B x N
# Get current hidden state from input word and last hidden state
rnn_output, hidden = self.gru(embedded, last_hidden)
# Calculate attention from current RNN state and all encoder outputs;
# apply to encoder outputs to get weighted average
attn_weights = self.attn(rnn_output, encoder_outputs)
context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x S=1 x N
# Attentional vector using the RNN hidden state and context vector
# concatenated together (Luong eq. 5)
rnn_output = rnn_output.squeeze(0) # S=1 x B x N -> B x N
context = context.squeeze(1) # B x S=1 x N -> B x N
concat_input = torch.cat((rnn_output, context), 1)
concat_output = F.tanh(self.concat(concat_input))
# Finally predict next token (Luong eq. 6, without softmax)
output = self.out(concat_output)
# Return final output, hidden state, and attention weights (for visualization)
return output, hidden, attn_weights
For calculating attention in the decoder stage, the encoder hidden state and encoder outputs are input and used as below:
class Attn(nn.Module):
def __init__(self, method, hidden_size):
super(Attn, self).__init__()
self.method = method
self.hidden_size = hidden_size
if self.method == 'general':
self.attn = nn.Linear(self.hidden_size, hidden_size)
elif self.method == 'concat':
self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
self.v = nn.Parameter(torch.FloatTensor(1, hidden_size))
def forward(self, hidden, encoder_outputs):
max_len = encoder_outputs.size(0)
this_batch_size = encoder_outputs.size(1)
# Create variable to store attention energies
attn_energies = Variable(torch.zeros(this_batch_size, max_len)) # B x S
if USE_CUDA:
attn_energies = attn_energies.cuda()
# For each batch of encoder outputs
for b in range(this_batch_size):
# Calculate energy for each encoder output
for i in range(max_len):
attn_energies[b, i] = self.score(hidden[:, b], encoder_outputs[i, b].unsqueeze(0))
# Normalize energies to weights in range 0 to 1, resize to 1 x B x S
return F.softmax(attn_energies).unsqueeze(1)
def score(self, hidden, encoder_output):
if self.method == 'dot':
energy = hidden.dot(encoder_output)
return energy
elif self.method == 'general':
energy = self.attn(encoder_output)
energy = hidden.dot(energy)
return energy
elif self.method == 'concat':
energy = self.attn(torch.cat((hidden, encoder_output), 1))
energy = self.v.dot(energy)
return energy
My actual goal is to extend the method by adding further information to be fed into the decoder, such as image data at each input time step. Technically, I want to use two (or more) encoders, one for the trajectories as in the link above, and one separate one for image data (convolutional encoder).
I do this by concatenating embeddings produced by the trajectory encoder and the convolutional encoder (as well as the cell states etc.) and feeding the concatenated tensors to the decoder.
For example, image embedding (256-length tensor) concatenated with trajectory data embedding (256-length tensor) yields a 512-length embedding.
My question is: is it a problem for the attention calculation if I use a concatenated encoder hidden state, concatenated encoder cell state, and concatenated encoder output coming from those different sources rather than hidden states, cells, outputs coming from a single source?
What are the caveats or pre-processing that should happen to make this work?
Thank you very much in advance.