Passing word2vec embedding to a custom LSTM pytorch model - deep-learning

I have a set of input sentences. I am using the pretrained word2vec model from gensim to get the embedding of the input sentences. I want to pass these embeddings as input to a custom pytorch LSTM model
hidden_size = 32
num_layers = 1
num_classes = 2
class customModel(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(customModel, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.bilstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=False, bidirectional=True)
self.fcl = nn.Linear(hidden_size*2, num_classes)
def forward(self, x):
# Set initial hidden and cell states
h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
# Forward propagate LSTM
out, hidden = self.bilstm(x, (h0, c0))
fw_bilstm = out[-1, :, :self.hidden_size]
bk_bilstm = out[0, :, :self.hidden_size]
concat_fw_bw = torch.cat((fw_bilstm, bk_bilstm), dim = 1)
fc = self.fcl(concat_fw_bw)
x = F.softmax(F.relu(fc))
return x
Now I initialize the model object.
model = customModel(300, hidden_size, num_layers, num_classes)
Get embedding for the input sentences
sentences = [['my', 'name', 'is', 'nad'], ['i', 'love', 'nlp', 'proc']]
embedding = create_embedding(sentences)
embedding_torch = torch.FloatTensor(embedding)
Now I want to pass these embeddings to the model to get the prediction
for item in embedding_torch:
item = item.view((1, item.size()[0], item.size()[1]))
for epoch in range(1):
tag_scores = model(item)
print (tag_scores)
Which throws me runtime error
RuntimeError: Expected hidden[0] size (2, 4, 32), got (2, 1, 32)
I am not sure why this is happening. My understanding is h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device) line is calculating the hidden dimension properly.
What am I missing? Please suggest.

The backbone of your model is nn.LSTM which expects inputs with size [sequence_length, batch_size, embedding_size]. On the other hand, the inputs you are providing the model have size [1, sequence_lenth, embedding_size]. What I would do is create the nn.LSTM as:
# With batch_first=True
self.bilstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
That way, the model would expect the inputs to be of size [batch_size, sequence_length, embedding_size]. Then, instead of going through each element in the batch separately, do:
tag_scores = model(embedding_torch)

Related

Why model with one GRU layer return zero gradients?

I'm trying to compare between 2 models in order to learn about the behaviour of the gradients.
import torch
import torch.nn as nn
import torchinfo
class MyModel(nn.Module):
def __init__(self):
super(MyModel, self).__init__()
self.Identity = nn.Identity ()
self.GRU = nn.GRU(input_size=3, hidden_size=32, num_layers=2, batch_first=True)
self.fc = nn.Linear(32, 5)
def forward(self, input_series):
self.Identity(input_series)
output, h = self.GRU(input_series)
output = output[:, -1, :] # get last state
output = self.fc(output)
output = output.view(-1, 5, 1) # reorginize output
return output
class SecondModel(nn.Module):
def __init__(self):
super(SecondModel, self).__init__()
self.GRU = nn.GRU(input_size=3, hidden_size=32, num_layers=2, batch_first=True)
def forward(self, input_series):
output, h = self.GRU(input_series)
return output
Checking the gradient of the first model gives True (zero gradients):
model = MyModel()
x = torch.rand([2, 10, 3])
y = model(x)
y.retain_grad()
y[:, -1].sum().backward()
print(torch.allclose(y.grad[:, :-1], torch.tensor(0.))) # gradients w.r.t previous outputs are zeroes
Checking the gradient of the second model also gives True (zero gradients):
model = SecondModel()
x = torch.rand([2, 10, 3])
y = model(x)
y.retain_grad()
y[:, -1].sum().backward()
print(torch.allclose(y.grad[:, :-1], torch.tensor(0.))) # gradients w.r.t previous outputs are zeroes
According to the answer here:
Do linear layer after GRU saved the sequence output order?
the second model (with just GRU layer) need to give non zero gradients.
What am I missing ?
When will we get zero or non-zero gradients ?
The value of y.grad[:, :-1] theoretically shouldn't be zeroes, but here they are because y[:, :-1] doesn't seem to refer to the same tensor objects used to compute y[:, -1] in the GRU implementation. As an illustration, a simple 1-layer GRU implementation looks like
import torch
import torch.nn as nn
class GRU(nn.Module):
def __init__(self, input_size, hidden_size):
super().__init__()
self.lin_r = nn.Linear(input_size + hidden_size, hidden_size)
self.lin_z = nn.Linear(input_size + hidden_size, hidden_size)
self.lin_in = nn.Linear(input_size, hidden_size)
self.lin_hn = nn.Linear(hidden_size, hidden_size)
self.hidden_size = hidden_size
def forward(self, x):
bsz, len_, in_ = x.shape
h = torch.zeros([bsz, self.hidden_size])
hs = []
for i in range(len_):
r = self.lin_r(torch.cat([x[:, i], h], dim=-1)).sigmoid()
z = self.lin_z(torch.cat([x[:, i], h], dim=-1)).sigmoid()
n = (self.lin_in(x[:, i]) + r * self.lin_hn(h)).tanh()
h = (1.-z)*n + z*h
hs.append(h)
# Return the output both as a single tensor and as a list of
# tensors actually used in computing the hidden vectors
return torch.stack(hs, dim=1), hs
Then, we have
model = GRU(input_size=3, hidden_size=32)
x = torch.rand([2, 10, 3])
y, hs = model(x)
y.retain_grad()
for h in hs:
h.retain_grad()
y[:, -1].sum().backward()
print(torch.allclose(y.grad[:, -1], torch.tensor(0.))) # False, as expected (sanity check)
print(torch.allclose(y.grad[:, :-1], torch.tensor(0.))) # True, unexpected
print(any(torch.allclose(h.grad, torch.tensor(0.)) for h in hs)) # False, as expected
It appears PyTorch computes the gradients w.r.t all tensors in hs as expected but not those w.r.t y.
So, to answer your question:
I don't think you miss anything. The linked answer is just not quite right as it incorrectly assumes PyTorch would compute y.grad as expected.
The theory given as a comment in the linked answer is still right, but not quite complete: gradient is always zero iff the input doesn't matter.

LSTM for time series forecasting

I have created a LSTM model in Pytorch which looks like this:
LSTMNet
Now I want to build another LSTM model (NewLSTMNet) on top of it (LSTMNet) by freezing the fc1 layer. I used:
model.fc1.weight.requires_grad = False
model.fc1.bias.requires_grad = False
and then I changed fc2 layer with a linear layer with input features = 40 and output features = 40.
So far I did:
class NewLSTMNet(nn.Module):
def __init__(self, model, input_size, hidden_size, num_layers):
super(NewLSTMNet, self).__init__()
self.model = model
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.model.fc2 = nn.Linear(40, 40)
# self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
self.fc3 = nn.Sequential(
# nn.ReLU(),
nn.Linear (40 , 128),
nn.ReLU(),
nn.Linear(128, 40),
nn.ReLU(),
nn.Linear(40,1),
nn.ReLU(),
)
def forward(self,x):
# input = self.model(x)
# h0 = Variable(torch.zeros(self.num_layers, input.size(0), self.hidden_size))
# c0 = Variable(torch.zeros(self.num_layers, input.size(0), self.hidden_size))
# _, (h_out,_) = self.lstm(input, (h0,c0))
# h_out = h_out.view(-1, self.hidden_size)
# print(h_out.shape)
# out = self.fc3(out)
out = self.model(x)
out = self.fc3(out)
return out
Now my new LSTM model looks like this:
NewLSTMNet
My training loop looks like this:
for epoch in range(EPOCHS):
model.train()
output = model(X_train)
train_loss = criterion(output, y_train)
optimizer.zero_grad()
train_loss.backward()
optimizer.step()
with torch.no_grad():
model.eval()
output_val = model(X_valid)
valid_loss = criterion(output_val, y_valid)
if valid_loss <= valid_loss_min:
torch.save(model.state_dict(), './state_dict_new.pt')
print(
f'Epoch {epoch + 0:01}: Validation loss decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model ...')
valid_loss_min = valid_loss
early_stopping_counter = 0 # reset counter if validation loss decreases
else:
print(f'Epoch {epoch + 0:01}: Validation loss did not decrease')
early_stopping_counter += 1
if early_stopping_counter > early_stopping_patience:
print('Early stopped at epoch :', epoch)
break
print(f'\t Train_Loss: {train_loss:.4f} Val_Loss: {valid_loss:.4f} BEST VAL Loss: {valid_loss_min:.4f}\n')
Now the model is working fine. But I want to create a LSTM layer in the NewLSTMNet model. I already tried to add a LSTM layer but I was expecting a vector in the output but I am getting a matrix in the output of the prediction. So there is shape mismatch!
How should I modify my code? Any help is appreciated. Thanks in advance!

input.size(-1) must be equal to input_size. Expected 763, got 1

I am tring to train my model with the batch size of 50. However I am getting error:
input.size(-1) must be equal to input_size. Expected 763, got 1
My code is:
for epoch in range(1, n_epochs + 1):
for i, (x_batch, y_batch) in enumerate(trn_dl):
#model.to(device)
#model.train()
x_batch = x_batch.to(device)
y_batch = y_batch.to(device)
#sched.step()
print('shape of the input batch')
print(x_batch.shape)
opt.zero_grad()
x_batch=torch.unsqueeze(x_batch,2)
print(x_batch.shape)
print(x_batch)
out = model(x_batch) # here I am getting error
y_batch=torch.unsqueeze(y_batch,0)
print('NOW')
print(y_batch.dtype)
y_batch = y_batch.to(torch.float32)
out = out.to(torch.float32)
out=torch.transpose(out,1,0)
loss = loss_function(out, torch.max(y_batch, 1)[1])
#(out, y_batch)
#targets = targets.to(torch.float32)
loss.backward()
opt.step()
My model is:
class LSTM(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super().__init__()
self.hidden_size = hidden_size
self.lstm = nn.LSTM(input_size, hidden_size)
self.linear =nn.Linear(hidden_size, output_size)
self.hidden_cell = (torch.zeros(1,1,self.hidden_size),
torch.zeros(1,1,self.hidden_size))
def forward(self, input_seq):
h0 = torch.zeros(1, input_seq.size(0), self.hidden_size).to(device)
c0 = torch.zeros(1, input_seq.size(0), self.hidden_size).to(device)
lstm_out, _ = self.lstm(input_seq, (h0,c0))
lstm_out = self.fc(lstm_out[:, -1, :])
predictions = self.Linear(lstm_out.view(len(input_seq), -1))
print("predictions",predictions)
return predictions[-1]
Could anyone please look into it and help me.
By the looks of it, you are trying to pick the last step of the LSTM's output: lstm_out[:, -1, :]. However, by default with nn.RNNs the batch axis is second, not first: (sequence_length, batch_size, features). So you end up picking the last batch element, not the last sequence step. You might want to use batch_first=True when initializing your nn.LSTM:
Something like:
self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)

GAN, generate regression output by the real image, not from the random noise

Is this concept possible to be implemented with the GAN algorithm?
I want the GAN to generate a regression-output(G-Value) of the shape(4,) by the real-image, not from the random noise, and discriminate G-Value with real regression-value(R-Value) of the same shape(4, ). R-Value is of the "y-train" dataset.
It means that if an image has a pattern like circular, it generally has the 4 features of position x, y, z, and alpha. I call it Real-Value(R-Value) and I want the GAN to generate fake value (G-Value) fooling the discriminator.
I have tried to implement it as below.
class UTModel:
def __init__(self):
optimizer__ = Adam(2e-4)
self.__dropout = .3
self.optimizerGenerator = Adam(1e-4)
self.optimizerDiscriminator = Adam(1e-4)
self.generator, self.discriminator = self.build()
def build(self):
# build the generator
g = Sequential()
g.add(Conv2D(512, kernel_size=3, strides=2, input_shape=(128, 128, 1), padding='same'))
g.add(BatchNormalization(momentum=0.8))
g.add(LeakyReLU(alpha=0.2))
g.add(Dropout(self.__dropout))
g.add(Conv2D(256, kernel_size=3, strides=2, padding='same'))
g.add(BatchNormalization(momentum=0.8))
g.add(LeakyReLU(alpha=0.2))
g.add(Dropout(self.__dropout))
g.add(Conv2D(128, kernel_size=3, strides=2, padding='same'))
g.add(BatchNormalization(momentum=0.8))
g.add(LeakyReLU(alpha=0.2))
g.add(Dropout(self.__dropout))
g.add(Conv2D(64, kernel_size=3, strides=1, padding='same'))
g.add(BatchNormalization(momentum=0.8))
g.add(LeakyReLU(alpha=0.2))
g.add(Dropout(self.__dropout))
g.add(Flatten())
g.add(Dense(4, activation='linear'))
# build the discriminator
d = Sequential()
d.add(Dense(128, input_shape=(4,)))
d.add(LeakyReLU(alpha=0.2))
d.add(Dropout(self.__dropout))
d.add(Dense(64))
d.add(LeakyReLU(alpha=0.2))
d.add(Dropout(self.__dropout))
d.add(Dense(64))
d.add(LeakyReLU(alpha=0.2))
d.add(Dropout(self.__dropout))
d.add(Dense(32))
d.add(LeakyReLU(alpha=0.2))
d.add(Dropout(self.__dropout))
d.add(Dense(1, activation='sigmoid'))
return g, d
def computeLosses(self, rValid, fValid):
bce = BinaryCrossentropy(from_logits=True)
# Discriminator loss
rLoss = bce(tf.ones_like(rValid), rValid)
fLoss = bce(tf.zeros_like(fValid), fValid)
dLoss = rLoss + fLoss
# Generator loss
gLoss = bce(tf.zeros_like(fValid), fValid)
return dLoss, gLoss
def train(self, images, rValues):
with tf.GradientTape() as gTape, tf.GradientTape() as dTape:
gValues = self.generator(images, training=True)
rValid = self.discriminator(rValues, training=True)
fValid = self.discriminator(gValues, training=True)
dLoss, gLoss = self.computeLosses(rValid, fValid)
dGradients = dTape.gradient(dLoss, self.discriminator.trainable_variables)
gGradients = gTape.gradient(gLoss, self.generator.trainable_variables)
self.optimizerDiscriminator.apply_gradients(zip(dGradients, self.discriminator.trainable_variables))
self.optimizerGenerator.apply_gradients(zip(gGradients, self.generator.trainable_variables))
print (dLoss, gLoss)
class UTTrainer:
def __init__(self):
self.env = 3DPatterns()
self.model = UTModel()
def start(self):
if not self.env.available:
return
batch = 32
for epoch in range(1):
# set new episod
while self.env.setEpisod():
for i in range(0, self.env.episodelen, batch):
self.model.train(self.env.episode[i:i+batch], self.env.y[i:i+batch])
But the G-Values have not generated as valid values. It converges the 1 or -1 always. The proper value should be like [-0.192798, 0.212887, -0.034519, -0.015000]. Please help me to find the right way.
Thank you.

Output dimension of a custom LSTM model in Pytorch

I have a custom LSTM model in PyTorch like below:
hidden_size = 32
num_layers = 1
num_classes = 2
class customModel(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(customModel, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.bilstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
self.fcl = nn.Linear(hidden_size*2, num_classes)
def forward(self, x):
# Set initial hidden and cell states
h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
# Forward propagate LSTM
out, hidden = self.bilstm(x, (h0, c0))
fw_bilstm = out[-1, :, :self.hidden_size]
bk_bilstm = out[0, :, :self.hidden_size]
concat_fw_bw = torch.cat((fw_bilstm, bk_bilstm), dim = 1)
fc = self.fcl(concat_fw_bw)
x = F.softmax(F.relu(fc))
return x
I can pass an input of type torch.Tensor to this model. The input is of length 67349, each is a 300 dimension vector.
After model initiation and prediction, I get an output vector of length 1.
model = customModel(300, hidden_size, num_layers, num_classes)
output = model(input_torch)
The output shows tensor([[0.5020, 0.4980]], grad_fn=<SoftmaxBackward>) when I print it out.
Why is this output of length 1? It seems like I should NOT have barch_first=True in my model but changing that requires other input dimension changes which I am not sure how to do.
Please suggest how can I get a vector output of length 67349 (input length) instead of 1?
Explanation
I see #gorjan suggested some modifications in the forward method of the network. So I wanted to clarify more what I am trying to build
Feed the embedding to a BiLSTM (done)
Get the hidden states of the last step in each direction and concatenate
Fed the concatenated output (from step 2) to a fully connected layer
with ReLUs
Fed the output from step 3 to a softmax layer
I have commented the def forward(...) method your module, have a look:
def forward(self, x):
# Set initial hidden and cell states
h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
# Forward propagate LSTM
out, hidden = self.bilstm(x, (h0, c0)) # out is of size [batch_size, sequence_length, hidden_size * num_directions]
fw_bilstm = out[-1, :, :self.hidden_size] # This is wrong: You are taking only last batch element
bk_bilstm = out[0, :, :self.hidden_size] # This is wrong: You are taking only the first batch element
concat_fw_bw = torch.cat((fw_bilstm, bk_bilstm), dim = 1) # This is not needed: If you want to obtain the hidden states for all elements in the sequence
fc = self.fcl(concat_fw_bw) # Because of the above mentioned issues, this is wrong as well.
x = F.softmax(F.relu(fc)) # This is wrong: Never stack activation on top of activation.
return x
Now, according to what you asked:
Please suggest how can I get a vector output of length 67349 (input length) instead of 1?
I suppose that you want to obtain the hidden states for each of you elements in the batch. Here is how you should structure your forward pass:
def forward(self, x):
# Set initial hidden and cell states
h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
# Forward propagate LSTM
out, hidden = self.bilstm(x, (h0, c0)) # out is of size [batch_size, sequence_length, hidden_size * num_directions]
fc = self.fcl(out) # fc is of size [batch_size, sequence_length, num_classes]
x = F.softmax(fc) # Just softmax so that you can get the probabilities for each of your classes
return x
If we test out the updated model, these are the results:
# Assuming 32 elements in the batch, each elements has 177 elements in the sequence, and each sequence element has size 300
inputs = torch.rand(32, 177, 300)
# Obtaining the outputs from the model
outputs = model(inputs)
# The size is as expected: torch.Size([32, 177, 2])
print(outputs.shape)
Another thing to keep in mind, you say:
The input is of length 67349, each is a 300 dimension vector.
This is an extremely long sequence. You model will drastically underperform and I guess that your training would last forever. However, that is a completely different issues and should be discusses in a separate thread.