network values goes to 0 by linear layers - deep-learning

I designed the Graph Attention Network.
However, during the operations inside the layer, the values of features becoming equal.
class GraphAttentionLayer(nn.Module):
## in_features = out_features = 1024
def __init__(self, in_features, out_features, dropout):
super(GraphAttentionLayer, self).__init__()
self.dropout = dropout
self.in_features = in_features
self.out_features = out_features
self.W = nn.Parameter(torch.zeros(size=(in_features, out_features)))
self.a1 = nn.Parameter(torch.zeros(size=(out_features, 1)))
self.a2 = nn.Parameter(torch.zeros(size=(out_features, 1)))
nn.init.xavier_normal_(self.W.data, gain=1.414)
nn.init.xavier_normal_(self.a1.data, gain=1.414)
nn.init.xavier_normal_(self.a2.data, gain=1.414)
self.leakyrelu = nn.LeakyReLU()
def forward(self, input, adj):
h = torch.mm(input, self.W)
a_input1 = torch.mm(h, self.a1)
a_input2 = torch.mm(h, self.a2)
a_input = torch.mm(a_input1, a_input2.transpose(1, 0))
e = self.leakyrelu(a_input)
zero_vec = torch.zeros_like(e)
attention = torch.where(adj > 0, e, zero_vec) # most of values is close to 0
attention = F.softmax(attention, dim=1) # all values are 0.0014 which is 1/707 (707^2 is the dimension of attention)
attention = F.dropout(attention, self.dropout)
return attention
The dimension of 'attention' is (707 x 707) and I observed the value of attention is near 0 before the softmax.
After the softmax, all values are 0.0014 which is 1/707.
I wonder how to keep the values normalized and prevent this situation.
Thanks

Since you say this happens during training I would assume it is at the start. With random initialization you often get near identical values at the end of the network during the start of the training process.
When all values are more or less equal the output of the softmax will be 1/num_elements for every element, so they sum up to 1 over the dimension you chose. So in your case you get 1/707 as all the values, which just sounds to me your weights are freshly initialized and the outputs are mostly random at this stage.
I would let it train for a while and observe if this changes.

Related

Tensor shape for multivariable LSTM on Pytorch

I have a dataset with 8 features and 4 timesteps. I am trying to implement an LSTM but need help understanding if i have set my tensor correctly. The aim is to take the outputted features from the LSTM and pass them through a NN.
My tensor shape is currently #samples x #timesteps x #features i.e. 4500x4x8. This works with the code below. I want to make sure that the model is indeed taking each timestep matrix as a new sequence (with matrix 4500x[0]x8 being the first timestep matrix and 4500x[3]x8 being the last timestep). I then take the final timestep output (output[:,-1,:] to feed through a NN.
Is the code doing what i think it is doing? I ask as performance is marginally less than a simple RF that only uses the final timestep data. This would be unexpected as the data has strong time-series correlations (it tracks patients vitals declining before going on ventilation).
I have the following code:
class LSTM1(nn.Module):
def __init__(self, num_classes, input_size, hidden_size, num_layers):
super(LSTM1, self).__init__()
self.num_classes = num_classes #number of classes
self.num_layers = num_layers #number of layers
self.input_size = input_size #input size
self.hidden_size = hidden_size #hidden state
self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
num_layers=num_layers, batch_first=True) #lstm
self.fc_1 = nn.Linear(hidden_size, 32) #fully connected 1
self.fc_2 = nn.Linear(32, 12) #fully connected 1
self.fc_3 = nn.Linear(12, 1)
self.fc = nn.Sigmoid() #fully connected last layer
self.relu = nn.ReLU()
def forward(self,x):
h_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) #hidden state
c_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) #internal state
# Propagate input through LSTM
output, (hn, cn) = self.lstm(x, (h_0, c_0)) #lstm with input, hidden, and internal state
out = output[:,-1,:] #reshaping the data for Dense layer next
out = self.relu(out)
out = self.fc_1(out) #first Dense
out = self.relu(out) #relu
out = self.fc_2(out) #2nd dense
out = self.relu(out) #relu
out = self.fc_3(out) #3rd dense
out = self.relu(out) #relu
out = self.fc(out) #Final Output
return out
Error
Your error stems from the last three lines.
Do not use ReLU activation at the end of your network
Use nn.Linear -> nn.Sigmoid with BCELoss or
nn.Linear with nn.BCEWithLogitsLoss (see here for what logits are).
What is going on
With ReLu you output values in the range [0, +inf)
Applying sigmoid on top of it “squashes” values to (0, 1) with threshold being 0 (e.g. 0 becomes 0.5 probability, hence 1 after threaholding at 0.5!)
In effect, you always predict 1 with this code, which is not what you want probably

Policy Network returning different outputs for batched states and individual states

I am implementing REINFORCE applied to the CartPole-V0 openAI gym environment. I am trying 2 different implementations of the same, and the issue I am not able to resolve is the following:
Upon passing a single state to the Policy Network, I get an output Tensor of size 2, containing the action probabilities of the 2 actions. However, when I pass a `batch of states' to the Policy Network to compute the output action probabilities of all of them, the values that I obtain are very different from when each state is individually passed to the network.
Can someone help me understand the issue?
My code for the same is below: (Note: this is NOT the complete REINFORCE algorithm -- I am aware that I need to compute the loss from the probabilities. But I am trying to understand the difference in the computation of the two probabilities, which I think should be the same, before proceeding.)
# architecture of the Policy Network
class PolicyNetwork(nn.Module):
def __init__(self, state_dim, n_actions):
super().__init__()
self.n_actions = n_actions
self.model = nn.Sequential(
nn.Linear(state_dim, 64),
nn.ReLU(),
nn.Linear(64, n_actions),
nn.Softmax(dim=0)
).float()
def forward(self, X):
return self.model(X)
def train_reinforce_agent(env, episode_length, max_episodes, gamma, visualize_step, learning_rate=0.003):
# define the parametric model for the Policy: this is an instantiation of the PolicyNetwork class
model = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)
# define the optimizer for updating the weights of the Policy Network
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# hyperparameters of the reinforce agent
EPISODE_LENGTH = episode_length
MAX_EPISODES = max_episodes
GAMMA = gamma
VISUALIZE_STEP = max(1, visualize_step)
score = []
for episode in range(MAX_EPISODES):
# reset the environment
curr_state = env.reset()
done = False
episode_t = []
# rollout an entire episode from the Policy Network
pred_vals = []
for t in range(EPISODE_LENGTH):
act_prob = model(torch.from_numpy(curr_state).float())
pred_vals.append(act_prob)
action = np.random.choice(np.array(list(range(env.action_space.n))), p=act_prob.data.numpy())
prev_state = curr_state
curr_state, _, done, info = env.step(action)
episode_t.append((prev_state, action, t+1))
if done:
break
score.append(len(episode_t))
# reward_batch = torch.Tensor([r for (s,a,r) in episode_t]).flip(dims=(0,))
reward_batch = torch.Tensor([r for (s, a, r) in episode_t])
# compute the return for every state-action pair from the rewards at every time-step
batch_Gvals = []
for i in range(len(episode_t)):
new_Gval = 0
power = 0
for j in range(i, len(episode_t)):
new_Gval = new_Gval + ((GAMMA ** power) * reward_batch[j]).numpy()
power += 1
batch_Gvals.append(new_Gval)
# normalize the returns for the batch
expected_returns_batch = torch.FloatTensor(batch_Gvals)
if torch.is_nonzero(expected_returns_batch.max()):
expected_returns_batch /= expected_returns_batch.max()
# batch the states, actions, prob after the episode
state_batch = torch.Tensor([s for (s,a,r) in episode_t])
print("State batch:", state_batch)
all_states = [s for (s,a,r) in episode_t]
print("All states:", all_states)
action_batch = torch.Tensor([a for (s,a,r) in episode_t])
pred_batch_v1 = model(state_batch)
pred_batch_v2 = torch.stack(pred_vals)
print("Batched state pred_vals:", pred_batch_v1)
print("Individual state pred_vals:", pred_batch_v2) ### Why is this different from the above predicted values??
My main function where I pass the environment is:
def main():
env = gym.make('CartPole-v0')
# train a REINFORCE-agent to learn the optimal policy
episode_length = 500
n_episodes = 500
gamma = 0.99
vis_steps = 50
train_reinforce_agent(env, episode_length, n_episodes, gamma, vis_steps)
In your policy, you have Softmax over dim 0. This normalizes the probability of each action across your batch. You want to do it across actions by dim=1.

Difference between WGAN and WGAN-GP (Gradient Penalty)

I just find that in the code here:
https://github.com/NUS-Tim/Pytorch-WGAN/tree/master/models
The "generator" loss, G, between WGAN and WGAN-GP is different, for WGAN:
g_loss = self.D(fake_images)
g_loss = g_loss.mean().mean(0).view(1)
g_loss.backward(one) # !!!
g_cost = -g_loss
But for WGAN-GP:
g_loss = self.D(fake_images)
g_loss = g_loss.mean()
g_loss.backward(mone) # !!!
g_cost = -g_loss
Why one is one=1 and another is mone=-1?
You might have misread the source code, the first sample you gave is not averaging the resut of D to compute its loss but instead uses the binary cross-entropy.
To be more precise:
The first method ("GAN") uses the BCE loss to compute the loss terms for D and G. The standard GAN optimization objective for D is to minimize E_x[log(D(x))] + E_z[log(1-D(G(z)))]. Source code:
outputs = self.D(images)
d_loss_real = self.loss(outputs.flatten(), real_labels) # <- bce loss
real_score = outputs
# Compute BCELoss using fake images
fake_images = self.G(z)
outputs = self.D(fake_images)
d_loss_fake = self.loss(outputs.flatten(), fake_labels) # <- bce loss
fake_score = outputs
# Optimizie discriminator
d_loss = d_loss_real + d_loss_fake
self.D.zero_grad()
d_loss.backward()
self.d_optimizer.step()
For d_loss_real you optimize towards 1s (output is considered real), while d_loss_fake optimizes towards 0s (output is considered fake).
While the second ("WCGAN") uses the Wasserstein loss (ref) whereby we maximise for D the loss: E_x[D(x)] - E_z[D(G(z))]. Source code:
# Train discriminator
# WGAN - Training discriminator more iterations than generator
# Train with real images
d_loss_real = self.D(images)
d_loss_real = d_loss_real.mean()
d_loss_real.backward(mone)
# Train with fake images
z = self.get_torch_variable(torch.randn(self.batch_size, 100, 1, 1))
fake_images = self.G(z)
d_loss_fake = self.D(fake_images)
d_loss_fake = d_loss_fake.mean()
d_loss_fake.backward(one)
# [...]
Wasserstein_D = d_loss_real - d_loss_fake
By doing d_loss_real.backward(mone) you backpropage with a gradient of opposite sign, i.e. its's a gradient ascend, and you end up maximizing d_loss_real.
In order to Update D network:
lossD = Expectation of D(fake data) - Expectation of D(real data) + gradient penalty
lossD ↓,D(real data) ↑
so you need to add minus one to the gradient process

Deep Q Learning - Cartpole Environment

I have a concern in understanding the Cartpole code as an example for Deep Q Learning. The DQL Agent part of the code as follow:
class DQLAgent:
def __init__(self, env):
# parameter / hyperparameter
self.state_size = env.observation_space.shape[0]
self.action_size = env.action_space.n
self.gamma = 0.95
self.learning_rate = 0.001
self.epsilon = 1 # explore
self.epsilon_decay = 0.995
self.epsilon_min = 0.01
self.memory = deque(maxlen = 1000)
self.model = self.build_model()
def build_model(self):
# neural network for deep q learning
model = Sequential()
model.add(Dense(48, input_dim = self.state_size, activation = "tanh"))
model.add(Dense(self.action_size,activation = "linear"))
model.compile(loss = "mse", optimizer = Adam(lr = self.learning_rate))
return model
def remember(self, state, action, reward, next_state, done):
# storage
self.memory.append((state, action, reward, next_state, done))
def act(self, state):
# acting: explore or exploit
if random.uniform(0,1) <= self.epsilon:
return env.action_space.sample()
else:
act_values = self.model.predict(state)
return np.argmax(act_values[0])
def replay(self, batch_size):
# training
if len(self.memory) < batch_size:
return
minibatch = random.sample(self.memory,batch_size)
for state, action, reward, next_state, done in minibatch:
if done:
target = reward
else:
target = reward + self.gamma*np.amax(self.model.predict(next_state)[0])
train_target = self.model.predict(state)
train_target[0][action] = target
self.model.fit(state,train_target, verbose = 0)
def adaptiveEGreedy(self):
if self.epsilon > self.epsilon_min:
self.epsilon *= self.epsilon_decay
In the training section, we found our target and train_target. So why did we set train_target[0][action] = target here?
Every predict made while learning is not correct, but thanks to error calculation and backpropagation, the predict made at the end of the network will get closer and closer, but when we make train_target[0][action] = target here the error becomes 0, and in this case, how will the learning be?
self.model.predict(state) will return a tensor of shape of (1, 2) containing the estimated Q values for each action (in cartpole the action space is {0,1}).
As you know the Q value is a measure of the expected reward.
By setting self.model.predict(state)[0][action] = target (where target is the expected sum of rewards) it is creating a target Q value on which to train the model. By then calling model.fit(state, train_target) it is using the target Q value to train said model to approximate better Q values for each state.
I don't understand why you are saying that the loss becomes 0: the target is set to the discounted sum of rewards plus the current reward
target = reward + self.gamma*np.amax(self.model.predict(next_state)[0])
while the network prediction for the highest Q value is
np.amax(self.model.predict(next_state)[0])
The loss between the target and the predicted values is what is used to train the model.
Edit - more detailed explaination
(you can ignore the [0] to the predicted values, as it is just to access the right column and unimportant in the understanding)
The target variable is set to the sum between the current reward and the estimated sum of future rewards, or the Q value. Note that this variable is called target but it is not the target of the network, but the target Q value for the chosen action.
The train_target variable is used as what you call the "dataset". It represents the target of the network.
train_target = self.model.predict(state)
train_target[0][action] = target
You can clearly see that:
train_target[<taken action>] = reward + self.gamma*np.amax(self.model.predict(next_state)[0])
train_target[<any other action>] = <prediction from the model>
the loss (mean squared error):
prediction = self.model.predict(state)
loss = (train_target - prediction)^2
For any line of the that is not the the loss is 0. For the one line that has been set the loss is
(target - prediction[action])^2
or
((reward + self.gamma*np.amax(self.model.predict(next_state)[0])) - self.model.predict(state)[0][action])^2
which is clearly different from 0.
Note that this agent is not ideal. I would strongly recommend the use of a target model instead of creating target Q values that way. Check out this answer as for why.

what should I do if my regression model stuck at a high value loss?

I'm using neural nets for a regression problem where I have 3 features and I'm trying to predict one continuous value. I noticed that my neural net start learning good but after 10 epochs it get stuck on a high loss value and could not improve anymore.
I tried to use Adam and other adaptive optimizers instead of SGD but that didn't work. I tried a complex architectures like adding layers, neurons, batch normalization and other activations etc.. and that also didn't work.
I tried to debug and try to find out if something is wrong with the implementation but when I use only 10 examples of the data my model learn fast so there are no errors. I start to increase the examples of the data and monitoring my model results as I increase the data examples. when I reach 3000 data examples my model start to get stuck on a high value loss.
I tried to increase layers, neurons and also to try other activations, batch normalization. My data are also normalized between [-1, 1], my target value is not normalized since it is regression and I'm predicting a continuous value. I also tried using keras but I've got the same result.
My real dataset have 40000 data, I don't know what should I try, I almost try all things that I know for optimization but none of them worked. I would appreciate it if someone can guide me on this. I'll post my Code but maybe it is too messy to try to understand, I'm sure there is no problem with my implementation, I'm using skorch/pytorch and some SKlearn functions:
# take all features as an Independant variable except the bearing and distance
# here when I start small the model learn good but from 3000 data points as you can see the model stuck on a high value. I mean the start loss is 15 and it start to learn good but when it reach 9 it stucks there
# and if I try to use the whole dataset for training then the loss start at 47 and start decreasing until it reach 36 and then stucks there too
X = dataset.iloc[:3000, 0:-2].reset_index(drop=True).to_numpy().astype(np.float32)
# take distance and bearing as the output values:
y = dataset.iloc[:3000, -2:].reset_index(drop=True).to_numpy().astype(np.float32)
y_bearing = y[:, 0].reshape(-1, 1)
y_distance = y[:, 1].reshape(-1, 1)
# normalize the input values
scaler = StandardScaler()
X_norm = scaler.fit_transform(X, y)
X_br_train, X_br_test, y_br_train, y_br_test = train_test_split(X_norm,
y_bearing,
test_size=0.1,
random_state=42,
shuffle=True)
X_dis_train, X_dis_test, y_dis_train, y_dis_test = train_test_split(X_norm,
y_distance,
test_size=0.1,
random_state=42,
shuffle=True)
bearing_trainset = Dataset(X_br_train, y_br_train)
bearing_testset = Dataset(X_br_test, y_br_test)
distance_trainset = Dataset(X_dis_train, y_dis_train)
distance_testset = Dataset(X_dis_test, y_dis_test)
def root_mse(y_true, y_pred):
return np.sqrt(mean_squared_error(y_true, y_pred))
class RMSELoss(nn.Module):
def __init__(self):
super().__init__()
self.mse = nn.MSELoss()
def forward(self, yhat, y):
return torch.sqrt(self.mse(yhat, y))
class AED(nn.Module):
"""custom average euclidean distance loss"""
def __init__(self):
super().__init__()
def forward(self, yhat, y):
return torch.dist(yhat, y)
def train(on_target,
hidden_units,
batch_size,
epochs,
optimizer,
lr,
regularisation_factor,
train_shuffle):
network = None
trainset = distance_trainset if on_target.lower() == 'distance' else bearing_trainset
testset = distance_testset if on_target.lower() == 'distance' else bearing_testset
print(f"shape of trainset.X = {trainset.X.shape}, shape of trainset.y = {trainset.y.shape}")
print(f"shape of testset.X = {testset.X.shape}, shape of testset.y = {testset.y.shape}")
mse = EpochScoring(scoring=mean_squared_error, lower_is_better=True, name='MSE')
r2 = EpochScoring(scoring=r2_score, lower_is_better=False, name='R2')
rmse = EpochScoring(scoring=make_scorer(root_mse), lower_is_better=True, name='RMSE')
checkpoint = Checkpoint(dirname=f'results/{on_target}/checkpoints')
train_end_checkpoint = TrainEndCheckpoint(dirname=f'results/{on_target}/checkpoints')
if on_target.lower() == 'bearing':
network = BearingNetwork(n_features=X_norm.shape[1],
n_hidden=hidden_units,
n_out=y_distance.shape[1])
elif on_target.lower() == 'distance':
network = DistanceNetwork(n_features=X_norm.shape[1],
n_hidden=hidden_units,
n_out=1)
model = NeuralNetRegressor(
module=network,
criterion=RMSELoss,
device='cpu',
batch_size=batch_size,
lr=lr,
optimizer=optim.Adam if optimizer.lower() == 'adam' else optim.SGD,
optimizer__weight_decay=regularisation_factor,
max_epochs=epochs,
iterator_train__shuffle=train_shuffle,
train_split=predefined_split(testset),
callbacks=[mse, r2, rmse, checkpoint, train_end_checkpoint]
)
print(f"{'*' * 10} start training the {on_target} model {'*' * 10}")
history = model.fit(trainset, y=None)
print(f"{'*' * 10} End Training the {on_target} Model {'*' * 10}")
if __name__ == '__main__':
args = parser.parse_args()
train(on_target=args.on_target,
hidden_units=args.hidden_units,
batch_size=args.batch_size,
epochs=args.epochs,
optimizer=args.optimizer,
lr=args.learning_rate,
regularisation_factor=args.regularisation_lambda,
train_shuffle=args.shuffle)
and this is my network declaration:
class DistanceNetwork(nn.Module):
"""separate NN for predicting distance"""
def __init__(self, n_features=5, n_hidden=16, n_out=1):
super().__init__()
self.model = nn.Sequential(
nn.Linear(n_features, n_hidden),
nn.LeakyReLU(),
nn.Linear(n_hidden, 5),
nn.LeakyReLU(),
nn.Linear(5, n_out)
)
here is the log while training: