DQN not converge for cartpole using Gazebo model - reinforcement-learning

I am using DQN to train the cartpole stay up task, but the loss function cannot converge, as u can see the loss function, I update the target net every 1000 episodes (followed one suggestion from stack overflow), but still, the loss will gradually grow, and the reward seems not learn, any one has some insights ? , parameters are
gamma = 0.99
buffer size 10000
mini batch size 200
net work structure is:
class DQN(torch.nn.Module):
def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3, device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')):
super(DQN, self).__init__()
self.device = device
self.input_shape = state_dim #
# hidden layer
self.linear1 = nn.Linear(self.input_shape, hidden_dim*4)
self.linear2 = nn.Linear(hidden_dim*4, hidden_dim*2)
self.linear3 = nn.Linear(hidden_dim*2, hidden_dim)
self.out = nn.Linear(hidden_dim, action_dim)
self.linear1.weight.data.uniform_(-init_w, init_w)
self.linear1.bias.data.uniform_(-init_w, init_w)
self.linear2.weight.data.uniform_(-init_w, init_w)
self.linear2.bias.data.uniform_(-init_w, init_w)
self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w)
self.out.weight.data.uniform_(-init_w, init_w)
self.out.bias.data.uniform_(-init_w, init_w)
def forward(self, x):
x = torch.from_numpy(x).float()
x = self.linear1(x)
x = nn.functional.relu(x)
x = self.linear2(x)
x = nn.functional.relu(x)
x = self.linear3(x) #
x = nn.functional.relu(x)
# need return the FF result
x = self.out(x)
return x #
The training code
def learn_from_batch2(self, experiences):
batch_xp = Transition(*zip(*experiences)) # stack the experience based on the catogery
obs_batch = np.squeeze(np.array(batch_xp.state)) # n_batch x n_state
action_batch = np.squeeze(np.array(batch_xp.action))# n_batch x n_action
reward_batch = np.squeeze(np.array(batch_xp.reward)) # n_batch x 1 (scalar)
next_obs_batch = np.squeeze(np.array(batch_xp.next_state)) # n_batch x n_state
done_batch = np.squeeze(np.array(batch_xp.done)) # bool
# set command to use target net or not
rospy.loginfo("learn batch size is:%d", len(done_batch))
if self.use_target_network:
# update the target net or not
if self._eps % self.updateRate== 0:
rospy.logerr("update target net...")
self.Q_tagetNet.load_state_dict(self.Q_net.state_dict())
# compute the target Q: detached value, no need on the compute graph.
td_target = reward_batch + ~done_batch * \
np.tile(self.gamma, len(next_obs_batch)) * \
self.Q_tagetNet(next_obs_batch).max(1)[0].detach().numpy()
# print it out
# print(self.Q_tagetNet(next_obs_batch).max(1))
else:
td_target = reward_batch + ~done_batch *\
np.tile(self.gamma, len(next_obs_batch)) *\
self.Q_net(next_obs_batch).max(1)[0].detach().numpy()
# td_target is purely a numpy array now.
# convert the numpy to tensor for later computation
td_target = torch.from_numpy(td_target)
action_idx = torch.from_numpy(action_batch) # because action is numbered same with the index .
# print((self.device))
# build loss function
#
#
loss = torch.nn.functional.mse_loss(self.Q_net(obs_batch).gather(1, action_idx.view(-1,1)), td_target.float().unsqueeze(1))
# now update the network.
self.optimizer.zero_grad() # zero out the accumulate gradient in all the leafs of the network's parameter.
loss.mean().backward() # update the leaf tensors' grad.
self.optimizer.step() # update

Related

RuntimeError: shape '[128, -1]' is invalid for input of size 378 pytorch

I'm running a spiking neural network for data that has 21 features with a batch size of 128. I get the following error after many iterations of training (this error doesn't arise immediately!):
RuntimeError: shape '[128, -1]' is invalid for input of size 378 pytorch
When I went to go print out what the shapes of the tensors are before, I get the following:
Train
torch.Size([128, 21])
Test
torch.Size([128, 21])
This is my network:
class SpikingNeuralNetwork(nn.Module):
"""
Parameters in SpikingNeuralNetwork class:
1. number_inputs: Number of inputs to the SNN.
2. number_hidden: Number of hidden layers.
3. number_outputs: Number of output classes.
4. beta: Decay rate.
"""
def __init__(self, number_inputs, number_hidden, number_outputs, beta):
super().__init__()
self.number_inputs = number_inputs
self.number_hidden = number_hidden
self.number_outputs = number_outputs
self.beta = beta
# Initialize layers
self.fc1 = nn.Linear(self.number_inputs, self.number_hidden) # Applies linear transformation to all input points
self.lif1 = snn.Leaky(beta = self.beta) # Integrates weighted input over time, emitting a spike if threshold condition is met
self.fc2 = nn.Linear(self.number_hidden, self.number_outputs) # Applies linear transformation to output spikes of lif1
self.lif2 = snn.Leaky(beta = self.beta) # Another spiking neuron, integrating the weighted spikes over time
"""
Forward propagation of SNN. The code below function will only be called once the input argument x
is explicitly passed into net.
#param x: input passed into the network
#return layer of output after applying final spiking neuron
"""
def forward(self, x):
num_steps = 25
# Initialize hidden states at t = 0
mem1 = self.lif1.init_leaky()
mem2 = self.lif2.init_leaky()
# Record the final layer
spk2_rec = []
mem2_rec = []
for step in range(num_steps):
cur1 = self.fc1(x)
spk1, mem1 = self.lif1(cur1, mem1)
cur2 = self.fc2(spk1)
spk2, mem2 = self.lif2(cur2, mem2)
spk2_rec.append(spk2)
mem2_rec.append(mem2)
return torch.stack(spk2_rec, dim = 0), torch.stack(mem2_rec, dim = 0)
This is my training loop:
def training_loop(net, train_loader, test_loader, dtype, device, optimizer):
num_epochs = 1
loss_history = []
test_loss_history = []
counter = 0
# Temporal dynamics
num_steps = 25
# Outer training loop
for epoch in range(num_epochs):
iter_counter = 0
train_batch = iter(train_loader)
# Minibatch training loop
for data, targets in train_batch:
data = data.to(device)
targets = targets.to(device)
# Forward pass
net.train()
print("Train")
print(data.size())
spk_rec, mem_rec = net(data.view(batch_size, -1))
# Initialize the loss and sum over time
loss_val = torch.zeros((1), dtype = dtype, device = device)
for step in range(num_steps):
loss_val += loss_function(mem_rec[step], targets.long().flatten().to(device))
# Gradient calculation and weight update
optimizer.zero_grad()
loss_val.backward()
optimizer.step()
# Store loss history for future plotting
loss_history.append(loss_val.item())
# Test set
with torch.no_grad():
net.eval()
test_data, test_targets = next(iter(test_loader))
test_data = test_data.to(device)
test_targets = test_targets.to(device)
# Test set forward pass
print("Test")
print(test_data.size())
test_spk, test_mem = net(test_data.view(batch_size, -1))
# Test set loss
test_loss = torch.zeros((1), dtype = dtype, device = device)
for step in range(num_steps):
test_loss += loss_function(test_mem[step], test_targets.long().flatten().to(device))
test_loss_history.append(test_loss.item())
# Print train/test loss and accuracy
if counter % 50 == 0:
train_printer(epoch, iter_counter, counter, loss_history, data, targets, test_data, test_targets)
counter = counter + 1
iter_counter = iter_counter + 1
return loss_history, test_loss_history
The error occurs on spk_rec, mem_rec = net(data.view(batch_size, -1)).
The code was adopted from https://snntorch.readthedocs.io/en/latest/tutorials/tutorial_5.html, where it was originally used for the MNIST dataset. However, I am not working with an image dataset. I am working with a dataset that has 21 features and predicts just one target (with 100 classes). I tried to change data.view(batch_size, -1) and test_data.view(batch_size, -1) to data.view(batch_size, 21) and test_data.view(batch_size, 21) based on some other forum answers that I saw, and my program is running for now through the training loop. Does anyone have any suggestions for how I can run through the training with no errors?
EDIT: I now get the error RuntimeError: shape '[128, 21]' is invalid for input of size 378 from spk_rec, mem_rec = net(data.view(batch_size, -1)).
Here are my DataLoaders:
train_loader = DataLoader(dataset = train, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(dataset = test, batch_size = batch_size, shuffle = True)
My batch size is 128.
Tryng to run it by myself to try to solve your problem I luck also: net params and snn.snn.Leaky
import torch
from torch import nn
from torch.utils.data import DataLoader
class SpikingNeuralNetwork(nn.Module):
"""
Parameters in SpikingNeuralNetwork class:
1. number_inputs: Number of inputs to the SNN.
2. number_hidden: Number of hidden layers.
3. number_outputs: Number of output classes.
4. beta: Decay rate.
"""
def __init__(self, number_inputs, number_hidden, number_outputs, beta):
super().__init__()
self.number_inputs = number_inputs
self.number_hidden = number_hidden
self.number_outputs = number_outputs
self.beta = beta
# Initialize layers
self.fc1 = nn.Linear(self.number_inputs,
self.number_hidden) # Applies linear transformation to all input points
self.lif1 = snn.Leaky(
beta=self.beta) # Integrates weighted input over time, emitting a spike if threshold condition is met
self.fc2 = nn.Linear(self.number_hidden,
self.number_outputs) # Applies linear transformation to output spikes of lif1
self.lif2 = snn.Leaky(beta=self.beta) # Another spiking neuron, integrating the weighted spikes over time
"""
Forward propagation of SNN. The code below function will only be called once the input argument x
is explicitly passed into net.
#param x: input passed into the network
#return layer of output after applying final spiking neuron
"""
def forward(self, x):
num_steps = 25
# Initialize hidden states at t = 0
mem1 = self.lif1.init_leaky()
mem2 = self.lif2.init_leaky()
# Record the final layer
spk2_rec = []
mem2_rec = []
for step in range(num_steps):
cur1 = self.fc1(x)
spk1, mem1 = self.lif1(cur1, mem1)
cur2 = self.fc2(spk1)
spk2, mem2 = self.lif2(cur2, mem2)
spk2_rec.append(spk2)
mem2_rec.append(mem2)
return torch.stack(spk2_rec, dim=0), torch.stack(mem2_rec, dim=0)
batch_size = 2
train = torch.rand(128, 21)
test = torch.rand(128, 21)
train_loader = DataLoader(dataset=train, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test, batch_size=batch_size, shuffle=True)
net = SpikingNeuralNetwork(number_inputs=1)
loss_function = nn.CrossEntropyLoss()
optimizer = nn.optim.Adam(net.parameters(), lr=0.1)
def training_loop(net, train_loader, test_loader, dtype, device, optimizer):
num_epochs = 1
loss_history = []
test_loss_history = []
counter = 0
# Temporal dynamics
num_steps = 25
# Outer training loop
for epoch in range(num_epochs):
iter_counter = 0
train_batch = iter(train_loader)
# Minibatch training loop
for data, targets in train_batch:
data = data.to(device)
targets = targets.to(device)
# Forward pass
net.train()
print("Train")
print(data.size())
spk_rec, mem_rec = net(data.view(batch_size, -1))
# Initialize the loss and sum over time
loss_val = torch.zeros((1), dtype=dtype, device=device)
for step in range(num_steps):
loss_val += loss_function(mem_rec[step], targets.long().flatten().to(device))
# Gradient calculation and weight update
optimizer.zero_grad()
loss_val.backward()
optimizer.step()
# Store loss history for future plotting
loss_history.append(loss_val.item())
# Test set
with torch.no_grad():
net.eval()
test_data, test_targets = next(iter(test_loader))
test_data = test_data.to(device)
test_targets = test_targets.to(device)
# Test set forward pass
print("Test")
print(test_data.size())
test_spk, test_mem = net(test_data.view(batch_size, -1))
# Test set loss
test_loss = torch.zeros((1), dtype=dtype, device=device)
for step in range(num_steps):
test_loss += loss_function(test_mem[step], test_targets.long().flatten().to(device))
test_loss_history.append(test_loss.item())
# Print train/test loss and accuracy
if counter % 50 == 0:
train_printer(epoch, iter_counter, counter, loss_history, data, targets, test_data, test_targets)
counter = counter + 1
iter_counter = iter_counter + 1
return loss_history, test_loss_history
Your code works just fine on the MNIST dataset, so I think it might be a problem with how the DataLoader is being called. My guess is that the total dataset is not evenly divisible by your batch_size. If this is true, then you have two options:
Instead of spk_rec, mem_rec = net(data.view(batch_size, -1)), try spk_rec, mem_rec = net(data.flatten(1)) which preserves the first dimension of your data.
Alternatively, you may need to set drop_last=True in the DataLoader functions.

Finding the number of of nodes and gpus of DistributedDataParallel

I would like to know what number should I select for nodes and gpus.
I use Tesla V100-SXM2 (8 boards).
I tried:
nodes = 1, gpus=1 (only the first gpu works)
nodes=1, gpus =8 (It took very long time and cannot execute)
Did I got wrong parameter for the nodes and gpus? or Is my code wrong ? I would appreciate if you could help me out. The code below is simplified sample code of DPP.
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N')
parser.add_argument('-g', '--gpus', default=1, type=int,
help='number of gpus per node')
parser.add_argument('-nr', '--nr', default=0, type=int,
help='ranking within the nodes')
parser.add_argument('--epochs', default=200, type=int, metavar='N',
help='number of total epochs to run')
args = parser.parse_args()
args.world_size = args.gpus * args.nodes
os.environ['MASTER_ADDR'] = 'host1'
os.environ['MASTER_PORT'] = '7777'
mp.spawn(train, nprocs=args.gpus, args=(args,))
def train(gpu, args):
rank = args.nr * args.gpus + gpu
dist.init_process_group(
backend='nccl',
init_method='env://',
world_size=args.world_size,
rank=rank
)
torch.manual_seed(0)
model = ConvNet()
torch.cuda.set_device(gpu)
model.cuda(gpu)
batch_size = 100
# define loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss().cuda(gpu)
optimizer = torch.optim.SGD(model.parameters(), 1e-4)
# Wrapper around our model to handle parallel training
model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
# Data loading code
train_dataset = get_datasets()
# Sampler that takes care of the distribution of the batches such that
# the data is not repeated in the iteration and sampled accordingly
train_sampler = torch.utils.data.distributed.DistributedSampler(
train_dataset,
num_replicas=args.world_size,
rank=rank
)
# We pass in the train_sampler which can be used by the DataLoader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
batch_size=batch_size,
shuffle=False,
num_workers=0,
pin_memory=True,
sampler=train_sampler)
start = datetime.now()
total_step = len(train_loader)
for epoch in range(args.epochs):
for i, (images, labels) in enumerate(train_loader):
images = images.cuda(non_blocking=True)
labels = labels.cuda(non_blocking=True)
# Forward pass
outputs = model(images)
loss = criterion(outputs, labels)
# Backward and optimize
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (i + 1) % 100 == 0 and gpu == 0:
print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
epoch + 1,
args.epochs,
i + 1,
total_step,
loss.item())
)
if gpu == 0:
print("Training complete)

why loss jump up to nan with swa method?

I use SWA methods to train model in pytorch.
SWA: https://pytorch.org/blog/stochastic-weight-averaging-in-pytorch/
my train code's loss jump up to nan immidiately.
model' loss jump up to nan
loss outputs is below.
1.loss: tensor(4.8463, device='cuda:0', grad_fn=)
2.loss: tensor(118317.8516, device='cuda:0', grad_fn=)
3.loss: tensor(5.7568e+22, device='cuda:0', grad_fn=)
4.loss: tensor(nan, device='cuda:0', grad_fn=)
without SWA methods,loss don't jump up.
is there any problems in train model's code with SWA method?
I would appreciate any advice,thank you.
#batch_size
batch_size=5
#DataLoader
train_dataloader=torch.utils.data.DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
val_dataloader=torch.utils.data.DataLoader(val_dataset,batch_size=batch_size,shuffle=False)
#dict
dataloaders_dict={"train":train_dataloader,"val":val_dataloader}
#example outputs
#train :torch.Size([5, 25, 32, 32])
#target : torch.Size([5])
def train_model_withSWA(net,dataloaders_dict,criterion,optimizer,num_epochs):
loss_list=[]
acc_list=[]
#validation list
val_loss_list=[]
val_acc_list=[]
for epoch in tqdm(range(num_epochs)):
print("Epoch{}/{}".format(epoch+1,num_epochs))
print("--------------------------")
for phase in ["train","val"]:
if phase=="train":
net.train()
else:
net.eval()
epoch_loss=0.0
epoch_corrects=0
#if (epoch==0) and (phase=="train"):
continue
for inputs,labels in dataloaders_dict[phase]:
#optimizerを初期化:
optimizer.zero_grad()
with torch.set_grad_enabled(phase=="train"):
inputs=inputs.to(device)
labels=labels.to(device)
outputs=net(inputs)
loss=criterion(outputs,labels)
print("loss:",loss)
_,preds=torch.max(outputs,1)
if phase == "train":
loss.backward()
optimizer.step()
epoch_loss += loss.item()*inputs.size(0)
epoch_corrects +=torch.sum(preds==labels.data)
#for swa
optimizer.swap_swa_sgd()
epoch_loss=epoch_loss/len(dataloaders_dict[phase].dataset)
epoch_acc=epoch_corrects.double()/len(dataloaders_dict[phase].dataset)
print("{} Loss:{:.4f} Acc:{:.4f}".format(phase,epoch_loss,epoch_acc))
if phase=="train":
loss_list.append(epoch_loss.detach().numpy())
acc_list.append(epoch_acc.detach().numpy())
else:
val_loss_list.append(epoch_loss.detach().numpy())
val_acc_list.append(epoch_acc.detach().numpy())
from torchcontrib.optim import SWA
# ignore warning
import warnings
warnings.filterwarnings('ignore') # set to ignore
#criterion
criterion = nn.CrossEntropyLoss()
net=net.to(device)
base_opt = torch.optim.SGD(net.parameters(), lr=0.1)
optimizer = SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)
### train with SWA
train_model_withSWA(net=net,dataloaders_dict=dataloaders_dict,
criterion=criterion,
optimizer=optimizer,
num_epochs=num_epochs)
#model' loss jump up to nan....
#loss: tensor(4.8463, device='cuda:0', grad_fn=<NllLossBackward>)
#loss: tensor(118317.8516, device='cuda:0', grad_fn=<NllLossBackward>)
#loss: tensor(5.7568e+22, device='cuda:0', grad_fn=<NllLossBackward>)
#loss: tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
# In additional.
I use Self Attention and Positional Encoder code
class Self_Attention(nn.Module):
""" Self-Attention Layer"""
def __init__(self, in_dim):
super(Self_Attention, self).__init__()
#pointwise convolution
self.query_conv = nn.Conv2d(
in_channels=in_dim, out_channels=in_dim, kernel_size=1)
self.key_conv = nn.Conv2d(
in_channels=in_dim, out_channels=in_dim, kernel_size=1)
self.value_conv = nn.Conv2d(
in_channels=in_dim, out_channels=in_dim, kernel_size=1)
# softmax
self.softmax = nn.Softmax(dim=-2)
#output = x +gamma*o
# first:gamma=0
self.gamma = nn.Parameter(torch.zeros(1))
def forward(self, x):
x=x.to(device)
X = x
#B,C',W,H→B,C',N
proj_query = self.query_conv(X).view(
X.shape[0], -1, X.shape[2]*X.shape[3]) # size:B,C',N
proj_query = proj_query.permute(0, 2, 1) # transpose
proj_key = self.key_conv(X).view(
X.shape[0], -1, X.shape[2]*X.shape[3]) # size:B,C',N
# bmm
S = torch.bmm(proj_query, proj_key)
#
attention_map_T = self.softmax(S)
attention_map = attention_map_T.permute(0, 2, 1)
# Self-Attention Map
proj_value = self.value_conv(X).view(
X.shape[0], -1, X.shape[2]*X.shape[3]) # size:B,C,N
o = torch.bmm(proj_value, attention_map.permute(
0, 2, 1))
# Self-Attention Map
o = o.view(X.shape[0], X.shape[1], X.shape[2], X.shape[3])
out = x+self.gamma*o
#print("gamma:",self.gamma)
return out, attention_map
class PositionalEncoder(nn.Module):
def __init__(self, d_model=300, max_seq_len=256):
super(PositionalEncoder,self).__init__()
self.d_model = d_model
pe = torch.zeros(max_seq_len, d_model)
# GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
pe = pe.to(device)
for pos in range(max_seq_len):
for i in range(0, d_model, 2):
pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/d_model)))
pe[pos, i + 1] = math.cos(pos /
(10000 ** ((2 * (i + 1))/d_model)))
#
self.pe = pe.unsqueeze(0)
#
self.pe.requires_grad = False
def forward(self, x):
x=x.to(device)
ret = math.sqrt(self.d_model)*x + self.pe
return ret
Without normalization in data_preprocessing,loss don't jump up to nan.
this can be caused by my preprocessing error. I don't understand reration with normalization and loss's jump up in SWA method even now...
Im sorry for the mistake.

how to concatenate embedding layer in pytorch

I am trying to concatenate embedding layer with other features. It doesn’t give me any error, but doesn’t do any training either. Is anything wrong with this model definition, how to debug this?
Note: The last column (feature) in my X is feature with word2ix (single word).
Note: The net works fine without the embedding feature/layer
originally posted on pytorch forum
class Net(torch.nn.Module):
def __init__(self, n_features, h_sizes, num_words, embed_dim, out_size, dropout=None):
super().__init__()
self.num_layers = len(h_sizes) # hidden + input
self.embedding = torch.nn.Embedding(num_words, embed_dim)
self.hidden = torch.nn.ModuleList()
self.bnorm = torch.nn.ModuleList()
if dropout is not None:
self.dropout = torch.nn.ModuleList()
else:
self.dropout = None
for k in range(len(h_sizes)):
if k == 0:
self.hidden.append(torch.nn.Linear(n_features, h_sizes[0]))
self.bnorm.append(torch.nn.BatchNorm1d(h_sizes[0]))
if self.dropout is not None:
self.dropout.append(torch.nn.Dropout(p=dropout))
else:
if k == 1:
input_dim = h_sizes[0] + embed_dim
else:
input_dim = h_sizes[k-1]
self.hidden.append(torch.nn.Linear(input_dim, h_sizes[k]))
self.bnorm.append(torch.nn.BatchNorm1d(h_sizes[k]))
if self.dropout is not None:
self.dropout.append(torch.nn.Dropout(p=dropout))
# Output layer
self.out = torch.nn.Linear(h_sizes[-1], out_size)
def forward(self, inputs):
# Feedforward
for l in range(self.num_layers):
if l == 0:
x = self.hidden[l](inputs[:, :-1])
x = self.bnorm[l](x)
if self.dropout is not None:
x= self.dropout[l](x)
embeds = self.embedding(inputs[:,-1])#.view((1, -1)
x = torch.cat((embeds, x),dim=1)
else:
x = self.hidden[l](x)
x = self.bnorm[l](x)
if self.dropout is not None:
x = self.dropout[l](x)
x = F.relu(x)
output= self.out(x)
return output
There were a few issues. The key one was data type. I mixed float features and int indices.
sample data and training before fix:
NUM_TARGETS = 4
NUM_FEATURES = 3
NUM_TEXT_FEATURES = 1
x = np.random.rand(5, NUM_FEATURES)
y = np.random.rand(5, NUM_TARGETS)
word_ix = np.arange(5).reshape(-1,1).astype(int)
x_train = np.append(x, word_ix, axis=1)
x_train = torch.from_numpy(x).float().to(device)
y_train = torch.from_numpy(y).float().to(device)
h_sizes = [2,2]
net = Net(x_train.shape[1] , h_sizes=h_sizes, num_words=5, embed_dim=2, out_size=y_train.shape[1],dropout=.01) # define the network
print(net) # net architecture
net = net.float()
net.to(device)
optimizer = torch.optim.Adam(net.parameters(), lr=0.0001, weight_decay=.01)
loss_func = torch.nn.MSELoss() # this is for regression mean squared loss
# one training loop
prediction = net(x_train) # input x and predict based on x
loss = loss_func(prediction, y_train) # must be (1. nn output, 2. target)
optimizer.zero_grad() # clear gradients for next train
loss.backward() # backpropagation, compute gradients
optimizer.step() # apply gradients
# train_losses.append(loss.detach().to('cpu').numpy())
To resolve this, I separated word index feature from x, and also removed net.float().
changed the dtypes conversion to:
x_train = torch.from_numpy(x).float().to(device)
y_train = torch.from_numpy(y).float().to(device)
# NOTE: word index needs to be long
word_ix = torch.from_numpy(word_ix).to(torch.long).to(device)
and forward method changed to :
def forward(self, inputs, word_ix):
# Feedforward
for l in range(self.num_layers):
if l == 0:
x = self.hidden[l](inputs)
x = self.bnorm[l](x)
if self.dropout is not None:
x = self.dropout[l](x)
embeds = self.embedding(word_ix)
# NOTE:
# embeds has a shape of (batch_size, 1, embed_dim)
# inorder to merge this change this with x, reshape this to
# (batch_size, embed_dim)
embeds = embeds.view(embeds.shape[0], embeds.shape[2])
x = torch.cat((x, embeds.view(x.shape)),dim=1)
else:
x = self.hidden[l](x)
x = self.bnorm[l](x)
if self.dropout is not None:
x = self.dropout[l](x)
x = F.relu(x)
output= self.out(x)
return output

Concatenate encoder hidden states/cells/outputs from different sources for attention calculation - issues?

I am using Pytorch for an LSTM encoder-decoder sequence-to-sequence prediction problem. As a first step, I would like to forecast 2D trajectories (trajectory x, trajectory y) from multivariate input - 2-D or more (trajectory x, trajectory y, speed, rotation, etc.)
I am following the below notebook (link):
seq2seq with Attention
Here excerpts (encoder, decoder, attention):
class EncoderRNN(nn.Module):
def __init__(self, input_size, hidden_size, n_layers=1, dropout=0.1):
super(EncoderRNN, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.n_layers = n_layers
self.dropout = dropout
self.embedding = nn.Embedding(input_size, hidden_size)
self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=self.dropout, bidirectional=True)
def forward(self, input_seqs, input_lengths, hidden=None):
# Note: we run this all at once (over multiple batches of multiple sequences)
embedded = self.embedding(input_seqs)
packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
outputs, hidden = self.gru(packed, hidden)
outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs) # unpack (back to padded)
outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:] # Sum bidirectional outputs
return outputs, hidden
class LuongAttnDecoderRNN(nn.Module):
def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout=0.1):
super(LuongAttnDecoderRNN, self).__init__()
# Keep for reference
self.attn_model = attn_model
self.hidden_size = hidden_size
self.output_size = output_size
self.n_layers = n_layers
self.dropout = dropout
# Define layers
self.embedding = nn.Embedding(output_size, hidden_size)
self.embedding_dropout = nn.Dropout(dropout)
self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=dropout)
self.concat = nn.Linear(hidden_size * 2, hidden_size)
self.out = nn.Linear(hidden_size, output_size)
# Choose attention model
if attn_model != 'none':
self.attn = Attn(attn_model, hidden_size)
def forward(self, input_seq, last_hidden, encoder_outputs):
# Note: we run this one step at a time
# Get the embedding of the current input word (last output word)
batch_size = input_seq.size(0)
embedded = self.embedding(input_seq)
embedded = self.embedding_dropout(embedded)
embedded = embedded.view(1, batch_size, self.hidden_size) # S=1 x B x N
# Get current hidden state from input word and last hidden state
rnn_output, hidden = self.gru(embedded, last_hidden)
# Calculate attention from current RNN state and all encoder outputs;
# apply to encoder outputs to get weighted average
attn_weights = self.attn(rnn_output, encoder_outputs)
context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x S=1 x N
# Attentional vector using the RNN hidden state and context vector
# concatenated together (Luong eq. 5)
rnn_output = rnn_output.squeeze(0) # S=1 x B x N -> B x N
context = context.squeeze(1) # B x S=1 x N -> B x N
concat_input = torch.cat((rnn_output, context), 1)
concat_output = F.tanh(self.concat(concat_input))
# Finally predict next token (Luong eq. 6, without softmax)
output = self.out(concat_output)
# Return final output, hidden state, and attention weights (for visualization)
return output, hidden, attn_weights
For calculating attention in the decoder stage, the encoder hidden state and encoder outputs are input and used as below:
class Attn(nn.Module):
def __init__(self, method, hidden_size):
super(Attn, self).__init__()
self.method = method
self.hidden_size = hidden_size
if self.method == 'general':
self.attn = nn.Linear(self.hidden_size, hidden_size)
elif self.method == 'concat':
self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
self.v = nn.Parameter(torch.FloatTensor(1, hidden_size))
def forward(self, hidden, encoder_outputs):
max_len = encoder_outputs.size(0)
this_batch_size = encoder_outputs.size(1)
# Create variable to store attention energies
attn_energies = Variable(torch.zeros(this_batch_size, max_len)) # B x S
if USE_CUDA:
attn_energies = attn_energies.cuda()
# For each batch of encoder outputs
for b in range(this_batch_size):
# Calculate energy for each encoder output
for i in range(max_len):
attn_energies[b, i] = self.score(hidden[:, b], encoder_outputs[i, b].unsqueeze(0))
# Normalize energies to weights in range 0 to 1, resize to 1 x B x S
return F.softmax(attn_energies).unsqueeze(1)
def score(self, hidden, encoder_output):
if self.method == 'dot':
energy = hidden.dot(encoder_output)
return energy
elif self.method == 'general':
energy = self.attn(encoder_output)
energy = hidden.dot(energy)
return energy
elif self.method == 'concat':
energy = self.attn(torch.cat((hidden, encoder_output), 1))
energy = self.v.dot(energy)
return energy
My actual goal is to extend the method by adding further information to be fed into the decoder, such as image data at each input time step. Technically, I want to use two (or more) encoders, one for the trajectories as in the link above, and one separate one for image data (convolutional encoder).
I do this by concatenating embeddings produced by the trajectory encoder and the convolutional encoder (as well as the cell states etc.) and feeding the concatenated tensors to the decoder.
For example, image embedding (256-length tensor) concatenated with trajectory data embedding (256-length tensor) yields a 512-length embedding.
My question is: is it a problem for the attention calculation if I use a concatenated encoder hidden state, concatenated encoder cell state, and concatenated encoder output coming from those different sources rather than hidden states, cells, outputs coming from a single source?
What are the caveats or pre-processing that should happen to make this work?
Thank you very much in advance.