Multihead Attention with a for loop

According to the Attention Is All You Need paper:
"Instead of performing a single attention function with d_model-dimensional keys, values and queries, we found it beneficial to linearly project the queries, keys and values h times with different, learned linear projections to d_k, d_k and d_v dimensions, respectively."
What I understand is that there should be n_heads different linear layers, so that each head learns a different projection. Thus, I implemented the logic as follows:
import math
import torch
import torch.nn as nn

class Attention(nn.Module):
    def __init__(self, embed_size=512, out_feat=64) -> None:
        super().__init__()
        self.embed_size = embed_size
        self.out_feat = out_feat
        self.value_fc = nn.Linear(embed_size, out_feat)
        self.query_fc = nn.Linear(embed_size, out_feat)
        self.key_fc = nn.Linear(embed_size, out_feat)

    def forward(self, value, key, query, mask=None) -> torch.Tensor:
        """
        value: torch.Tensor of shape (N, seq_len, embed_size)
        key: torch.Tensor of shape (N, seq_len, embed_size)
        query: torch.Tensor of shape (N, seq_len, embed_size)
        mask: torch.Tensor of shape (N, seq_len, seq_len)
        returns torch.Tensor of shape (N, seq_len, out_feat)
        """
        value = self.value_fc(value)  # N, seq_len, out_feat
        key = self.key_fc(key)
        query = self.query_fc(query)
        weights = torch.bmm(query, torch.transpose(key, 1, 2))
        weights /= math.sqrt(self.out_feat)  # N, query_len, key_len
        if mask is not None:
            weights += mask
        weights = torch.softmax(weights, dim=2)
        return torch.bmm(weights, value)
class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size=512, n_heads=8) -> None:
        super().__init__()
        assert embed_size % n_heads == 0, "Input feat. dim must be div. by n_heads"
        self.embed_size = embed_size
        self.n_heads = n_heads
        self.out_feat = embed_size // n_heads
        self.attention_layers = nn.ModuleList(
            [Attention(self.embed_size, self.out_feat) for _ in range(self.n_heads)]
        )
        self.fc = nn.Linear(embed_size, embed_size)

    def forward(self, value, key, query, mask=None):
        values = [
            attention(value, key, query, mask) for attention in self.attention_layers
        ]
        return self.fc(torch.cat(values, dim=2))
However, all the (official) implementations I have seen use a single Attention layer but reshape the keys, queries and values into (batch_size, len, n_heads, d_model // n_heads) (and then apply a transpose before computing the attention weights). Although this is more efficient to compute, there is only one linear layer for each of the keys, queries and values rather than n_heads linear layers. I think this contradicts what the paper says. I know this increases the number of learnable parameters a lot, but isn't this the correct way according to the paper?
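For reference, the reshape-based formulation described above looks roughly like this (a minimal sketch, not taken from any particular official implementation; batch-first tensors are assumed):

import math
import torch
import torch.nn as nn

class FusedMultiHeadAttention(nn.Module):
    def __init__(self, embed_size=512, n_heads=8) -> None:
        super().__init__()
        assert embed_size % n_heads == 0
        self.n_heads = n_heads
        self.head_dim = embed_size // n_heads
        # one projection each for Q/K/V; every head later reads its own slice of the output
        self.query_fc = nn.Linear(embed_size, embed_size)
        self.key_fc = nn.Linear(embed_size, embed_size)
        self.value_fc = nn.Linear(embed_size, embed_size)
        self.fc = nn.Linear(embed_size, embed_size)

    def forward(self, value, key, query, mask=None):
        N, seq_len, _ = query.shape
        # (N, seq_len, embed_size) -> (N, n_heads, seq_len, head_dim)
        q = self.query_fc(query).view(N, -1, self.n_heads, self.head_dim).transpose(1, 2)
        k = self.key_fc(key).view(N, -1, self.n_heads, self.head_dim).transpose(1, 2)
        v = self.value_fc(value).view(N, -1, self.n_heads, self.head_dim).transpose(1, 2)
        weights = q @ k.transpose(-2, -1) / math.sqrt(self.head_dim)  # (N, n_heads, q_len, k_len)
        if mask is not None:
            weights = weights + mask.unsqueeze(1)  # broadcast the (N, seq_len, seq_len) mask over heads
        weights = torch.softmax(weights, dim=-1)
        out = (weights @ v).transpose(1, 2).reshape(N, seq_len, -1)  # concatenate the heads
        return self.fc(out)

Note that the fused nn.Linear(embed_size, embed_size) has exactly the same number of parameters as n_heads separate nn.Linear(embed_size, embed_size // n_heads) layers; each head just ends up owning one slice of the fused weight matrix.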

Related

Why does a model with one GRU layer return zero gradients?

I'm trying to compare two models in order to learn about the behaviour of the gradients.
import torch
import torch.nn as nn
import torchinfo

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.Identity = nn.Identity()
        self.GRU = nn.GRU(input_size=3, hidden_size=32, num_layers=2, batch_first=True)
        self.fc = nn.Linear(32, 5)

    def forward(self, input_series):
        self.Identity(input_series)
        output, h = self.GRU(input_series)
        output = output[:, -1, :]       # get last state
        output = self.fc(output)
        output = output.view(-1, 5, 1)  # reorganize output
        return output
class SecondModel(nn.Module):
    def __init__(self):
        super(SecondModel, self).__init__()
        self.GRU = nn.GRU(input_size=3, hidden_size=32, num_layers=2, batch_first=True)

    def forward(self, input_series):
        output, h = self.GRU(input_series)
        return output
Checking the gradient of the first model gives True (zero gradients):
model = MyModel()
x = torch.rand([2, 10, 3])
y = model(x)
y.retain_grad()
y[:, -1].sum().backward()
print(torch.allclose(y.grad[:, :-1], torch.tensor(0.))) # gradients w.r.t previous outputs are zeroes
Checking the gradient of the second model also gives True (zero gradients):
model = SecondModel()
x = torch.rand([2, 10, 3])
y = model(x)
y.retain_grad()
y[:, -1].sum().backward()
print(torch.allclose(y.grad[:, :-1], torch.tensor(0.))) # gradients w.r.t previous outputs are zeroes
According to the answer here:
Do linear layer after GRU saved the sequence output order?
the second model (with just a GRU layer) should give non-zero gradients.
What am I missing?
When do we get zero or non-zero gradients?
The values of y.grad[:, :-1] theoretically shouldn't be zero, but here they are because y[:, :-1] doesn't refer to the same tensor objects that were used to compute y[:, -1] inside the GRU implementation. As an illustration, a simple 1-layer GRU implementation looks like this:
import torch
import torch.nn as nn

class GRU(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.lin_r = nn.Linear(input_size + hidden_size, hidden_size)
        self.lin_z = nn.Linear(input_size + hidden_size, hidden_size)
        self.lin_in = nn.Linear(input_size, hidden_size)
        self.lin_hn = nn.Linear(hidden_size, hidden_size)
        self.hidden_size = hidden_size

    def forward(self, x):
        bsz, len_, in_ = x.shape
        h = torch.zeros([bsz, self.hidden_size])
        hs = []
        for i in range(len_):
            r = self.lin_r(torch.cat([x[:, i], h], dim=-1)).sigmoid()
            z = self.lin_z(torch.cat([x[:, i], h], dim=-1)).sigmoid()
            n = (self.lin_in(x[:, i]) + r * self.lin_hn(h)).tanh()
            h = (1. - z) * n + z * h
            hs.append(h)
        # Return the output both as a single tensor and as the list of
        # tensors actually used in computing the hidden vectors
        return torch.stack(hs, dim=1), hs
Then, we have
model = GRU(input_size=3, hidden_size=32)
x = torch.rand([2, 10, 3])
y, hs = model(x)
y.retain_grad()
for h in hs:
h.retain_grad()
y[:, -1].sum().backward()
print(torch.allclose(y.grad[:, -1], torch.tensor(0.))) # False, as expected (sanity check)
print(torch.allclose(y.grad[:, :-1], torch.tensor(0.))) # True, unexpected
print(any(torch.allclose(h.grad, torch.tensor(0.)) for h in hs)) # False, as expected
It appears PyTorch computes the gradients w.r.t. all tensors in hs as expected, but not those w.r.t. y.
So, to answer your question:
I don't think you're missing anything. The linked answer is just not quite right, as it incorrectly assumes PyTorch would compute y.grad as expected.
The theory given as a comment in the linked answer is still right, but not quite complete: the gradient is zero iff the input doesn't matter.
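One way to confirm that the earlier timesteps do matter, independently of how the stacked output tensor is built, is to look at gradients w.r.t. the input instead (a small check, reusing SecondModel from the question):

model = SecondModel()
x = torch.rand([2, 10, 3], requires_grad=True)
y = model(x)
y[:, -1].sum().backward()
# earlier inputs influence the last hidden state through the recurrence,
# so their gradients are non-zero even though y.grad[:, :-1] was zero above
print(torch.allclose(x.grad[:, :-1], torch.tensor(0.)))  # False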

Tensor shape for multivariable LSTM on Pytorch

I have a dataset with 8 features and 4 timesteps. I am trying to implement an LSTM but need help understanding whether I have set up my tensor correctly. The aim is to take the output features from the LSTM and pass them through a NN.
My tensor shape is currently #samples x #timesteps x #features, i.e. 4500 x 4 x 8. This works with the code below. I want to make sure that the model is indeed treating each timestep matrix as a step in the sequence (with the 4500 x [0] x 8 slice being the first timestep matrix and 4500 x [3] x 8 being the last timestep). I then take the final timestep output (output[:, -1, :]) to feed through a NN.
Is the code doing what I think it is doing? I ask because performance is marginally worse than a simple RF that only uses the final timestep data. This would be unexpected, as the data has strong time-series correlations (it tracks patients' vitals declining before going on ventilation).
I have the following code:
import torch
import torch.nn as nn
from torch.autograd import Variable

class LSTM1(nn.Module):
    def __init__(self, num_classes, input_size, hidden_size, num_layers):
        super(LSTM1, self).__init__()
        self.num_classes = num_classes  # number of classes
        self.num_layers = num_layers    # number of layers
        self.input_size = input_size    # input size
        self.hidden_size = hidden_size  # hidden state
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)  # lstm
        self.fc_1 = nn.Linear(hidden_size, 32)  # fully connected 1
        self.fc_2 = nn.Linear(32, 12)           # fully connected 2
        self.fc_3 = nn.Linear(12, 1)
        self.fc = nn.Sigmoid()                  # sigmoid output layer
        self.relu = nn.ReLU()

    def forward(self, x):
        h_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size))  # hidden state
        c_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size))  # internal state
        # Propagate input through LSTM
        output, (hn, cn) = self.lstm(x, (h_0, c_0))  # lstm with input, hidden, and internal state
        out = output[:, -1, :]  # take the last timestep for the Dense layers
        out = self.relu(out)
        out = self.fc_1(out)    # first Dense
        out = self.relu(out)    # relu
        out = self.fc_2(out)    # 2nd Dense
        out = self.relu(out)    # relu
        out = self.fc_3(out)    # 3rd Dense
        out = self.relu(out)    # relu
        out = self.fc(out)      # final output (sigmoid)
        return out
Error
Your error stems from the last three lines.
Do not use a ReLU activation at the end of your network.
Use nn.Linear -> nn.Sigmoid with BCELoss, or
nn.Linear with nn.BCEWithLogitsLoss (see here for what logits are).
What is going on
With ReLU you output values in the range [0, +inf).
Applying a sigmoid on top of that squashes the values into [0.5, 1), since sigmoid(0) = 0.5 (e.g. a raw 0 becomes probability 0.5, hence class 1 after thresholding at 0.5!).
In effect, you always predict 1 with this code, which is probably not what you want.
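Concretely, the first option only changes the tail of the forward pass shown above, dropping the ReLU that currently sits between fc_3 and the sigmoid (a minimal sketch):

out = self.fc_2(out)  # 2nd Dense
out = self.relu(out)
out = self.fc_3(out)  # 3rd Dense, produces the raw score
out = self.fc(out)    # sigmoid -> probability in (0, 1), to be used with nn.BCELoss
return out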

Normalization of the dataset, Error: all elements of input should be between 0 and 1

I have a problem with data normalization in PyTorch when I try to execute the training. First thing you need to know is that the dataset is composed of 3024 signal windows (so 1 channel), each one with a length of 5000 samples, so the dimension of the CSV file is 5000x3024. Each signal has 1 label that needs to be predicted.
Here is the code for how I load and normalize the data:
import numpy as np
import torch
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

class CSVDataset(Dataset):
    # load the dataset
    def __init__(self, path, normalize=False):
        # load the csv file as a dataframe
        df = read_csv(path)
        df = df.transpose()
        # store the inputs and outputs
        self.X = df.values[:, :-1]
        self.y = df.values[:, -1]
        print("Dataset length: ", self.X.shape[0])
        # ensure input data is floats
        self.X = self.X.astype(float)
        self.y = self.y.astype(float)
        if normalize:
            self.X = self.X.reshape(self.X.shape[1], self.X.shape[0])
            min_X = np.min(self.X, 0)  # returns an array of minima, one per signal window
            max_X = np.max(self.X, 0)
            self.X = (self.X - min_X) / (max_X - min_X)
            min_y = np.min(self.y)
            max_y = np.max(self.y)
            self.y = (self.y - min_y) / (max_y - min_y)
        # reshape input data
        self.X = self.X.reshape(self.X.shape[0], 1, self.X.shape[1])
        self.y = self.y.reshape(self.y.shape[0], 1)
        # label encode target and ensure the values are floats
        self.y = LabelEncoder().fit_transform(self.y)
        self.y = self.y.astype(float)

# prepare the dataset
def prepare_data(path):
    # load the dataset
    dataset = CSVDataset(path, normalize=True)
    # calculate split
    train, test = dataset.get_splits()
    # prepare data loaders
    train_dl = DataLoader(train, batch_size=32, shuffle=True)
    test_dl = DataLoader(test, batch_size=1024, shuffle=False)
    return train_dl, test_dl
While the train method is:
from torch.nn import BCELoss
from torch.optim import SGD

def train_model(train_dl, model):
    # define the optimization
    criterion = BCELoss()
    optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
    model = model.float()
    # enumerate epochs
    for epoch in range(100):
        # enumerate mini batches
        for i, (inputs, targets) in enumerate(iter(train_dl)):
            targets = torch.reshape(targets, (32, 1))
            # clear the gradients
            optimizer.zero_grad()
            # compute the model output
            yhat = model(inputs.float())
            # calculate loss
            loss = criterion(yhat, targets.float())
            # credit assignment
            loss.backward()
            # update model weights
            optimizer.step()
The error that I get is in the line loss = criterion(yhat, targets.float()) and it says:
RuntimeError: all elements of input should be between 0 and 1
I have tried inspecting X in the variable explorer and it doesn't seem that there are any values outside of [0, 1]. I don't know what I could have done wrong in the normalization. Can you help me?
Built-in loss functions use input and target to designate the prediction and the label respectively. The error message should be understood as "input of the criterion", i.e. yhat, and not as "input of the model".
It seems yhat does not lie in [0, 1], while BCELoss expects a probability, not a logit. You can either
add a sigmoid layer as the last layer of your model, or
use nn.BCEWithLogitsLoss instead, which combines a sigmoid and the BCE loss.
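With the second option only the criterion changes and the model keeps emitting raw scores (a sketch, assuming the model's last layer is a plain nn.Linear with no sigmoid):

from torch.nn import BCEWithLogitsLoss

criterion = BCEWithLogitsLoss()          # applies the sigmoid internally and is more numerically stable
yhat = model(inputs.float())             # raw logits, any real value
loss = criterion(yhat, targets.float())  # targets are still expected to be in [0, 1]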

Network values go to 0 through linear layers

I designed a Graph Attention Network.
However, during the operations inside the layer, the feature values become equal.
import torch
import torch.nn as nn
import torch.nn.functional as F

class GraphAttentionLayer(nn.Module):
    ## in_features = out_features = 1024
    def __init__(self, in_features, out_features, dropout):
        super(GraphAttentionLayer, self).__init__()
        self.dropout = dropout
        self.in_features = in_features
        self.out_features = out_features
        self.W = nn.Parameter(torch.zeros(size=(in_features, out_features)))
        self.a1 = nn.Parameter(torch.zeros(size=(out_features, 1)))
        self.a2 = nn.Parameter(torch.zeros(size=(out_features, 1)))
        nn.init.xavier_normal_(self.W.data, gain=1.414)
        nn.init.xavier_normal_(self.a1.data, gain=1.414)
        nn.init.xavier_normal_(self.a2.data, gain=1.414)
        self.leakyrelu = nn.LeakyReLU()

    def forward(self, input, adj):
        h = torch.mm(input, self.W)
        a_input1 = torch.mm(h, self.a1)
        a_input2 = torch.mm(h, self.a2)
        a_input = torch.mm(a_input1, a_input2.transpose(1, 0))
        e = self.leakyrelu(a_input)
        zero_vec = torch.zeros_like(e)
        attention = torch.where(adj > 0, e, zero_vec)  # most of the values are close to 0
        attention = F.softmax(attention, dim=1)        # all values become 0.0014, i.e. 1/707 (attention is 707 x 707)
        attention = F.dropout(attention, self.dropout)
        return attention
The dimension of 'attention' is (707 x 707), and I observed that the values of attention are near 0 before the softmax.
After the softmax, all values are 0.0014, which is 1/707.
I wonder how to keep the values normalized and prevent this situation.
Thanks
Since you say this happens during training, I would assume it is at the start. With random initialization you often get near-identical values at the end of the network early in the training process.
When all values are more or less equal, the output of the softmax will be 1/num_elements for every element, so they sum up to 1 over the dimension you chose. In your case you get 1/707 for all the values, which to me just suggests that your weights are freshly initialized and the outputs are mostly random at this stage.
I would let it train for a while and observe whether this changes.
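A quick standalone check of that softmax behaviour (a sketch with made-up energies):

import torch
import torch.nn.functional as F

e = -1e-4 * torch.rand(707)  # 707 energies that are all very close to zero
p = F.softmax(e, dim=0)
print(p.min().item(), p.max().item())  # both ~1/707 ≈ 0.0014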

Concatenate encoder hidden states/cells/outputs from different sources for attention calculation - issues?

I am using PyTorch for an LSTM encoder-decoder sequence-to-sequence prediction problem. As a first step, I would like to forecast 2D trajectories (trajectory x, trajectory y) from multivariate input - 2D or more (trajectory x, trajectory y, speed, rotation, etc.).
I am following the notebook below (link):
seq2seq with Attention
Here are excerpts (encoder, decoder, attention):
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1, dropout=0.1):
        super(EncoderRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=self.dropout, bidirectional=True)

    def forward(self, input_seqs, input_lengths, hidden=None):
        # Note: we run this all at once (over multiple batches of multiple sequences)
        embedded = self.embedding(input_seqs)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        outputs, hidden = self.gru(packed, hidden)
        outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs)  # unpack (back to padded)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]  # sum bidirectional outputs
        return outputs, hidden
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()
        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        # Define layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        # Choose attention model
        if attn_model != 'none':
            self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_seq, last_hidden, encoder_outputs):
        # Note: we run this one step at a time
        # Get the embedding of the current input word (last output word)
        batch_size = input_seq.size(0)
        embedded = self.embedding(input_seq)
        embedded = self.embedding_dropout(embedded)
        embedded = embedded.view(1, batch_size, self.hidden_size)  # S=1 x B x N
        # Get current hidden state from input word and last hidden state
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Calculate attention from current RNN state and all encoder outputs;
        # apply to encoder outputs to get weighted average
        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))  # B x S=1 x N
        # Attentional vector using the RNN hidden state and context vector
        # concatenated together (Luong eq. 5)
        rnn_output = rnn_output.squeeze(0)  # S=1 x B x N -> B x N
        context = context.squeeze(1)        # B x S=1 x N -> B x N
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = F.tanh(self.concat(concat_input))
        # Finally predict next token (Luong eq. 6, without softmax)
        output = self.out(concat_output)
        # Return final output, hidden state, and attention weights (for visualization)
        return output, hidden, attn_weights
For calculating attention in the decoder stage, the decoder's current hidden state and the encoder outputs are passed in and used as below:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(1, hidden_size))

    def forward(self, hidden, encoder_outputs):
        max_len = encoder_outputs.size(0)
        this_batch_size = encoder_outputs.size(1)
        # Create variable to store attention energies
        attn_energies = Variable(torch.zeros(this_batch_size, max_len))  # B x S
        if USE_CUDA:
            attn_energies = attn_energies.cuda()
        # For each batch of encoder outputs
        for b in range(this_batch_size):
            # Calculate energy for each encoder output
            for i in range(max_len):
                attn_energies[b, i] = self.score(hidden[:, b], encoder_outputs[i, b].unsqueeze(0))
        # Normalize energies to weights in range 0 to 1, resize to 1 x B x S
        return F.softmax(attn_energies).unsqueeze(1)

    def score(self, hidden, encoder_output):
        if self.method == 'dot':
            energy = hidden.dot(encoder_output)
            return energy
        elif self.method == 'general':
            energy = self.attn(encoder_output)
            energy = hidden.dot(energy)
            return energy
        elif self.method == 'concat':
            energy = self.attn(torch.cat((hidden, encoder_output), 1))
            energy = self.v.dot(energy)
            return energy
My actual goal is to extend the method by adding further information to be fed into the decoder, such as image data at each input time step. Technically, I want to use two (or more) encoders, one for the trajectories as in the link above, and one separate one for image data (convolutional encoder).
I do this by concatenating embeddings produced by the trajectory encoder and the convolutional encoder (as well as the cell states etc.) and feeding the concatenated tensors to the decoder.
For example, image embedding (256-length tensor) concatenated with trajectory data embedding (256-length tensor) yields a 512-length embedding.
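As a minimal sketch of that concatenation step (the tensors and sizes here are hypothetical placeholders, not from the notebook):

import torch

n_layers, batch_size = 1, 4

# hypothetical states produced by the two encoders for one batch
traj_hidden = torch.randn(n_layers, batch_size, 256)  # e.g. from the trajectory GRU encoder
img_hidden = torch.randn(n_layers, batch_size, 256)   # e.g. from the convolutional image encoder

# concatenate along the feature dimension -> 512-dim state handed to the decoder
decoder_hidden = torch.cat([traj_hidden, img_hidden], dim=-1)
print(decoder_hidden.shape)  # torch.Size([1, 4, 512])

Note that the decoder (and the Attn module) would then need to be built with hidden_size = 512, since the attention score functions compare decoder hidden states against encoder outputs of the same width.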
My question is: is it a problem for the attention calculation if I use a concatenated encoder hidden state, concatenated encoder cell state, and concatenated encoder output coming from those different sources rather than hidden states, cells, outputs coming from a single source?
What are the caveats or pre-processing that should happen to make this work?
Thank you very much in advance.