How do I compute bootstrapped cross entropy loss in PyTorch? - deep-learning

I have read some papers that use something called "Bootstrapped Cross Entropy Loss" to train their segmentation network. The idea is to focus only on the hardest k% (say 15%) of the pixels into account to improve learning performance, especially when easy pixels dominate.
Currently, I am using the standard cross entropy:
loss = F.binary_cross_entropy(mask, gt)
How do I convert this to the bootstrapped version efficiently in PyTorch?

Often we would also add a "warm-up" period to the loss such that the network can learn to adapt to the easy regions first and transit to the harder regions.
This implementation starts from k=100 and continues for 20000 iterations, then linearly decay it to k=15 for another 50000 iterations.
class BootstrappedCE(nn.Module):
def __init__(self, start_warm=20000, end_warm=70000, top_p=0.15):
super().__init__()
self.start_warm = start_warm
self.end_warm = end_warm
self.top_p = top_p
def forward(self, input, target, it):
if it < self.start_warm:
return F.cross_entropy(input, target), 1.0
raw_loss = F.cross_entropy(input, target, reduction='none').view(-1)
num_pixels = raw_loss.numel()
if it > self.end_warm:
this_p = self.top_p
else:
this_p = self.top_p + (1-self.top_p)*((self.end_warm-it)/(self.end_warm-self.start_warm))
loss, _ = torch.topk(raw_loss, int(num_pixels * this_p), sorted=False)
return loss.mean(), this_p

Addition to self answer by #hkchengrex (for future self and API parity with PyTorch);
one could implement functional version first (with some additional arguments provided in original torch.nn.functional.cross_entropy) like this (also I prefer reduction to be callable instead of predefined strings):
import typing
import torch
def bootstrapped_cross_entropy(
inputs,
targets,
iteration,
p: float,
warmup: typing.Union[typing.Callable[[float, int], float], int] = -1,
weight=None,
ignore_index=-100,
reduction: typing.Callable[[torch.Tensor], torch.Tensor] = torch.mean,
):
if not 0 < p < 1:
raise ValueError("p should be in [0, 1] range, got: {}".format(p))
if isinstance(warmup, int):
this_p = 1.0 if iteration < warmup else p
elif callable(warmup):
this_p = warmup(p, iteration)
else:
raise ValueError(
"warmup should be int or callable, got {}".format(type(warmup))
)
# Shortcut
if this_p == 1.0:
return torch.nn.functional.cross_entropy(
inputs, targets, weight, ignore_index=ignore_index, reduction=reduction
)
raw_loss = torch.nn.functional.cross_entropy(
inputs, targets, weight=weight, ignore_index=ignore_index, reduction="none"
).view(-1)
num_pixels = raw_loss.numel()
loss, _ = torch.topk(raw_loss, int(num_pixels * this_p), sorted=False)
return reduction(loss)
Also warmup can be specified as callable (taking p and current iteration) or int which allows for flexible or easy scheduling.
And making a class basing of _WeightedLoss and iteration incremented automatically during each call (so only inputs and targets have to be passed):
class BoostrappedCrossEntropy(torch.nn.modules.loss._WeightedLoss):
def __init__(
self,
p: float,
warmup: typing.Union[typing.Callable[[float, int], float], int] = -1,
weight=None,
ignore_index=-100,
reduction: typing.Callable[[torch.Tensor], torch.Tensor] = torch.mean,
):
self.p = p
self.warmup = warmup
self.ignore_index = ignore_index
self._current_iteration = -1
super().__init__(weight, size_average=None, reduce=None, reduction=reduction)
def forward(self, inputs, targets):
self._current_iteration += 1
return bootstrapped_cross_entropy(
inputs,
targets,
self._current_iteration,
self.p,
self.warmup,
self.weight,
self.ignore_index,
self.reduction,
)

Related

Deep Q Learning - Cartpole Environment

I have a concern in understanding the Cartpole code as an example for Deep Q Learning. The DQL Agent part of the code as follow:
class DQLAgent:
def __init__(self, env):
# parameter / hyperparameter
self.state_size = env.observation_space.shape[0]
self.action_size = env.action_space.n
self.gamma = 0.95
self.learning_rate = 0.001
self.epsilon = 1 # explore
self.epsilon_decay = 0.995
self.epsilon_min = 0.01
self.memory = deque(maxlen = 1000)
self.model = self.build_model()
def build_model(self):
# neural network for deep q learning
model = Sequential()
model.add(Dense(48, input_dim = self.state_size, activation = "tanh"))
model.add(Dense(self.action_size,activation = "linear"))
model.compile(loss = "mse", optimizer = Adam(lr = self.learning_rate))
return model
def remember(self, state, action, reward, next_state, done):
# storage
self.memory.append((state, action, reward, next_state, done))
def act(self, state):
# acting: explore or exploit
if random.uniform(0,1) <= self.epsilon:
return env.action_space.sample()
else:
act_values = self.model.predict(state)
return np.argmax(act_values[0])
def replay(self, batch_size):
# training
if len(self.memory) < batch_size:
return
minibatch = random.sample(self.memory,batch_size)
for state, action, reward, next_state, done in minibatch:
if done:
target = reward
else:
target = reward + self.gamma*np.amax(self.model.predict(next_state)[0])
train_target = self.model.predict(state)
train_target[0][action] = target
self.model.fit(state,train_target, verbose = 0)
def adaptiveEGreedy(self):
if self.epsilon > self.epsilon_min:
self.epsilon *= self.epsilon_decay
In the training section, we found our target and train_target. So why did we set train_target[0][action] = target here?
Every predict made while learning is not correct, but thanks to error calculation and backpropagation, the predict made at the end of the network will get closer and closer, but when we make train_target[0][action] = target here the error becomes 0, and in this case, how will the learning be?
self.model.predict(state) will return a tensor of shape of (1, 2) containing the estimated Q values for each action (in cartpole the action space is {0,1}).
As you know the Q value is a measure of the expected reward.
By setting self.model.predict(state)[0][action] = target (where target is the expected sum of rewards) it is creating a target Q value on which to train the model. By then calling model.fit(state, train_target) it is using the target Q value to train said model to approximate better Q values for each state.
I don't understand why you are saying that the loss becomes 0: the target is set to the discounted sum of rewards plus the current reward
target = reward + self.gamma*np.amax(self.model.predict(next_state)[0])
while the network prediction for the highest Q value is
np.amax(self.model.predict(next_state)[0])
The loss between the target and the predicted values is what is used to train the model.
Edit - more detailed explaination
(you can ignore the [0] to the predicted values, as it is just to access the right column and unimportant in the understanding)
The target variable is set to the sum between the current reward and the estimated sum of future rewards, or the Q value. Note that this variable is called target but it is not the target of the network, but the target Q value for the chosen action.
The train_target variable is used as what you call the "dataset". It represents the target of the network.
train_target = self.model.predict(state)
train_target[0][action] = target
You can clearly see that:
train_target[<taken action>] = reward + self.gamma*np.amax(self.model.predict(next_state)[0])
train_target[<any other action>] = <prediction from the model>
the loss (mean squared error):
prediction = self.model.predict(state)
loss = (train_target - prediction)^2
For any line of the that is not the the loss is 0. For the one line that has been set the loss is
(target - prediction[action])^2
or
((reward + self.gamma*np.amax(self.model.predict(next_state)[0])) - self.model.predict(state)[0][action])^2
which is clearly different from 0.
Note that this agent is not ideal. I would strongly recommend the use of a target model instead of creating target Q values that way. Check out this answer as for why.

network values goes to 0 by linear layers

I designed the Graph Attention Network.
However, during the operations inside the layer, the values of features becoming equal.
class GraphAttentionLayer(nn.Module):
## in_features = out_features = 1024
def __init__(self, in_features, out_features, dropout):
super(GraphAttentionLayer, self).__init__()
self.dropout = dropout
self.in_features = in_features
self.out_features = out_features
self.W = nn.Parameter(torch.zeros(size=(in_features, out_features)))
self.a1 = nn.Parameter(torch.zeros(size=(out_features, 1)))
self.a2 = nn.Parameter(torch.zeros(size=(out_features, 1)))
nn.init.xavier_normal_(self.W.data, gain=1.414)
nn.init.xavier_normal_(self.a1.data, gain=1.414)
nn.init.xavier_normal_(self.a2.data, gain=1.414)
self.leakyrelu = nn.LeakyReLU()
def forward(self, input, adj):
h = torch.mm(input, self.W)
a_input1 = torch.mm(h, self.a1)
a_input2 = torch.mm(h, self.a2)
a_input = torch.mm(a_input1, a_input2.transpose(1, 0))
e = self.leakyrelu(a_input)
zero_vec = torch.zeros_like(e)
attention = torch.where(adj > 0, e, zero_vec) # most of values is close to 0
attention = F.softmax(attention, dim=1) # all values are 0.0014 which is 1/707 (707^2 is the dimension of attention)
attention = F.dropout(attention, self.dropout)
return attention
The dimension of 'attention' is (707 x 707) and I observed the value of attention is near 0 before the softmax.
After the softmax, all values are 0.0014 which is 1/707.
I wonder how to keep the values normalized and prevent this situation.
Thanks
Since you say this happens during training I would assume it is at the start. With random initialization you often get near identical values at the end of the network during the start of the training process.
When all values are more or less equal the output of the softmax will be 1/num_elements for every element, so they sum up to 1 over the dimension you chose. So in your case you get 1/707 as all the values, which just sounds to me your weights are freshly initialized and the outputs are mostly random at this stage.
I would let it train for a while and observe if this changes.

How to estimate the parameters of a mixture model in OpenTURNS?

I would like to estimate the parameters of a mixture model of normal distributions in OpenTURNS (that is, the distribution of a weighted sum of Gaussian random variables). OpenTURNS can create such a mixture, but it cannot estimate its parameters. Moreover, I need to create the mixture as an OpenTURNS distribution in order to propagate uncertainty through a function.
For example, I know how to create a mixture of two normal distributions:
import openturns as ot
mu1 = 1.0
sigma1 = 0.5
mu2 = 3.0
sigma2 = 2.0
weights = [0.3, 0.7]
n1 = ot.Normal(mu1, sigma1)
n2 = ot.Normal(mu2, sigma2)
m = ot.Mixture([n1, n2], weights)
In this example, I would like to estimate mu1, sigma1, mu2, sigma2 on a given sample. In order to create a working example, it is easy to generate a sample by simulation.
s = m.getSample(100)
You can rely on scikit-learn's GaussianMixture to estimate the parameters and then use them to define a Mixture model in OpenTURNS.
The script hereafter contains a Python class MixtureFactory that estimates the parameters of a scikitlearn GaussianMixture and outputs an OpenTURNS Mixture distribution:
from sklearn.mixture import GaussianMixture
from sklearn.utils.validation import check_is_fitted
import openturns as ot
import numpy as np
class MixtureFactory(GaussianMixture):
"""
Representation of a Gaussian mixture model probability distribution.
This class allows to estimate the parameters of a Gaussian mixture
distribution using scikit algorithms & provides openturns Mixture object.
Read more in scikit learn user guide & openturns theory.
Parameters:
-----------
n_components : int, defaults to 1.
The number of mixture components.
covariance_type : {'full' (default), 'tied', 'diag', 'spherical'}
String describing the type of covariance parameters to use.
Must be one of:
'full'
each component has its own general covariance matrix
'tied'
all components share the same general covariance matrix
'diag'
each component has its own diagonal covariance matrix
'spherical'
each component has its own single variance
tol : float, defaults to 1e-3.
The convergence threshold. EM iterations will stop when the
lower bound average gain is below this threshold.
reg_covar : float, defaults to 1e-6.
Non-negative regularization added to the diagonal of covariance.
Allows to assure that the covariance matrices are all positive.
max_iter : int, defaults to 100.
The number of EM iterations to perform.
n_init : int, defaults to 1.
The number of initializations to perform. The best results are kept.
init_params : {'kmeans', 'random'}, defaults to 'kmeans'.
The method used to initialize the weights, the means and the
precisions.
Must be one of::
'kmeans' : responsibilities are initialized using kmeans.
'random' : responsibilities are initialized randomly.
weights_init : array-like, shape (n_components, ), optional
The user-provided initial weights, defaults to None.
If it None, weights are initialized using the `init_params` method.
means_init : array-like, shape (n_components, n_features), optional
The user-provided initial means, defaults to None,
If it None, means are initialized using the `init_params` method.
precisions_init : array-like, optional.
The user-provided initial precisions (inverse of the covariance
matrices), defaults to None.
If it None, precisions are initialized using the 'init_params' method.
The shape depends on 'covariance_type'::
(n_components,) if 'spherical',
(n_features, n_features) if 'tied',
(n_components, n_features) if 'diag',
(n_components, n_features, n_features) if 'full'
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
warm_start : bool, default to False.
If 'warm_start' is True, the solution of the last fitting is used as
initialization for the next call of fit(). This can speed up
convergence when fit is called several times on similar problems.
In that case, 'n_init' is ignored and only a single initialization
occurs upon the first call.
See :term:`the Glossary <warm_start>`.
verbose : int, default to 0.
Enable verbose output. If 1 then it prints the current
initialization and each iteration step. If greater than 1 then
it prints also the log probability and the time needed
for each step.
verbose_interval : int, default to 10.
Number of iteration done before the next print.
"""
def __init__(self, n_components=2, covariance_type='full', tol=1e-6,
reg_covar=1e-6, max_iter=1000, n_init=1, init_params='kmeans',
weights_init=None, means_init=None, precisions_init=None,
random_state=41, warm_start=False,
verbose=0, verbose_interval=10):
super().__init__(n_components, covariance_type, tol, reg_covar,
max_iter, n_init, init_params, weights_init, means_init,
precisions_init, random_state, warm_start, verbose, verbose_interval)
def fit(self, X):
"""
Fit the mixture model parameters.
EM algorithm is applied here to estimate the model parameters and build a
Mixture distribution (see openturns mixture).
The method fits the model ``n_init`` times and sets the parameters with
which the model has the largest likelihood or lower bound. Within each
trial, the method iterates between E-step and M-step for ``max_iter``
times until the change of likelihood or lower bound is less than
``tol``, otherwise, a ``ConvergenceWarning`` is raised.
If ``warm_start`` is ``True``, then ``n_init`` is ignored and a single
initialization is performed upon the first call. Upon consecutive
calls, training starts where it left off.
Parameters
----------
X : array-like, shape (n_samples, n_features)
List of n_features-dimensional data points. Each row
corresponds to a single data point.
Returns
-------
"""
data = np.array(X)
# Evaluate the model parameters.
super().fit(data)
# openturns mixture
# n_components ==> weight of size n_components
weights = self.weights_
n_components = len(weights)
# Create ot distribution
collection = n_components * [0]
# Covariance matrices
cov = self.covariances_
mu = self.means_
# means : n_components x n_features
n_components, n_features = mu.shape
# Following the type of covariance, we define the collection of gaussians
# Spherical : C_k = Identity * sigma_k
if self.covariance_type is 'spherical':
c = ot.CorrelationMatrix(n_features)
for l in range(n_components):
sigma = np.sqrt(cov[l])
collection[l] = ot.Normal(list(mu[l]), [ sigma ] * n_features , c)
elif self.covariance_type is 'diag' :
for l in range(n_components):
c = ot.CovarianceMatrix(n_features)
for i in range(n_features):
c[i,i] = cov[l, i]
collection[l] = ot.Normal(list(mu[l]), c)
elif self.covariance_type == 'tied':
# Same covariance for all clusters
c = ot.CovarianceMatrix(n_features)
for i in range(n_features):
for j in range(0, i+1):
c[i,j] = cov[i,j]
# Define the collection with the same covariance
for l in range(n_components):
collection[l] = ot.Normal(list(mu[l]), c)
else:
n_features = cov.shape[1]
for l in range(n_components):
c = ot.CovarianceMatrix(n_features)
for i in range(n_features):
for j in range(0, i+1):
c[i,j] = cov[l][i,j]
collection[l] = ot.Normal(list(mu[l]), c)
self._mixture = ot.Mixture(collection, weights)
return self
def get_mixture(self):
"""
Returns the Mixture object
"""
check_is_fitted(self)
return self._mixture
if __name__ == "__main__":
mu1 = 1.0
sigma1 = 0.5
mu2 = 3.0
sigma2 = 2.0
weights = [0.3, 0.7]
n1 = ot.Normal(mu1, sigma1)
n2 = ot.Normal(mu2, sigma2)
m = ot.Mixture([n1, n2], weights)
x = m.getSample(1000)
est_dist = MixtureFactory(random_state=1)
est_dist.fit(x)
print(est_dist.get_mixture())
I have actually tried this method and it works perfectly. On top of that the fit of the model through the SciKit GMM and the ulterior adjustment thanks to OpenTurns are very fast. I recommend future users to test several numbers of components and covariance matrix structures, as it will not take a lot of time and can substantially improve the goodness of fit to the data.
Thanks for the answer.
Here is a pure OpenTURNS solution. It is probably slower than the scikit-learn-based method, but it is more generic: you could use it to estimate the parameters of any mixture model, not necessarily a mixture of normal distributions.
The idea is to retrieve the log-likelihood function from the Mixture object and minimize it.
In the following, let us assume that s is the sample we want to fit the mixture on.
First, we need to build the mixture we want to estimate the parameters of. We can specify any valid set of parameters, it does not matter. In your example, you want a mixture of 2 normal distributions.
mixture = ot.Mixture([ot.Normal()]*2, [0.5]*2)
There is a small hurdle. All weights sum to 1, thus one of them is determined by the others: the solver must not be allowed to freely set it. The order of the parameters of an OpenTURNS Mixture is as follows:
weight of the first distribution;
parameters of the first distribution;
weight of the second distribution;
parameters of the second distribution:
...
You can view all parameters with mixture.getParameter() and their names with mixture.getParameterDescription(). The following is a helper function that:
takes as input the list containing of all mixture parameters except the weight of its first distribution;
outputs a Point containing all parameters including the weight of the first distribution.
def full(params):
"""
Point of all mixture parameters from a list that omits the first weight.
"""
params = ot.Point(params)
aux_mixture = ot.Mixture(mixture)
dist_number = aux_mixture.getDistributionCollection().getSize()
index = aux_mixture.getDistributionCollection()[0].getParameter().getSize()
list_weights = []
for num in range(1, dist_number):
list_weights.append(params[index])
index += 1 + aux_mixture.getDistributionCollection()[num].getParameter().getSize()
complementary_weight = ot.Point([abs(1.0 - sum(list_weights))])
complementary_weight.add(params)
return complementary_weight
The next function computes the opposite of the log-likelihood of a given list of parameters (except the first weight).
For the sake of numerical stability, it divides this value by the number of observations.
We will minimize this function in order to find the Maximum Likelihood Estimate.
def minus_log_pdf(params):
"""
- log-likelihood of a list of parameters excepting the first weight
divided by the number of observations
"""
aux_mixture = ot.Mixture(mixture)
full_params = full(params)
try:
aux_mixture.setParameter(full_params)
except TypeError:
# case where the proposed parameters are invalid:
# return a huge value
return [ot.SpecFunc.LogMaxScalar]
res = - aux_mixture.computeLogPDF(s).computeMean()
return res
To use OpenTURNS optimization facilities, we need to turn this function into a PythonFunction object.
OT_minus_log_pdf = ot.PythonFunction(mixture.getParameter().getSize()-1, 1, minus_log_pdf)
Cobyla is usually good at likelihood optimization.
problem = ot.OptimizationProblem(OT_minus_log_pdf)
algo = ot.Cobyla(problem)
In order to decrease chances of Cobyla being stuck on a local minimum, we are going to use MultiStart. We pick a starting set of parameters and randomly change the weights. The following helper function makes it easy:
def random_weights(params, nb):
"""
List of nb Points representing mixture parameters with randomly varying weights.
"""
aux_mixture = ot.Mixture(mixture)
full_params = full(params)
aux_mixture.setParameter(full_params)
list_params = []
for num in range(nb):
dirichlet = ot.Dirichlet([1.0] * aux_mixture.getDistributionCollection().getSize()).getRealization()
dirichlet.add(1.0 - sum(dirichlet))
aux_mixture.setWeights(dirichlet)
list_params.append(aux_mixture.getParameter()[1:])
return list_params
We pick 10 starting points and increase the number of maximum evaluations of the log-likelihood from 100 (by default) to 10000.
init = mixture.getParameter()[1:]
starting_points = random_weights(init, 10)
algo_multistart = ot.MultiStart(algo, starting_points)
algo_multistart.setMaximumEvaluationNumber(10000)
Let's run the solver and retrieve the result.
algo_multistart.run()
result = algo_multistart.getResult()
All that remains is to set the mixture's parameters to the optimal value.
We must not forget to add the first weight back!
optimal_parameters = result.getOptimalPoint()
mixture.setParameter(full(optimal_parameters))
Below is an alternative.
The first step creates a new GaussianMixture class, derived from PythonDistribution. The key point is to implement the computeLogPDF method and the set/getParameters methods. Notice that this parametrization of a mixture only has one single weight w.
class GaussianMixture(ot.PythonDistribution):
def __init__(self, mu1 = -5.0, sigma1 = 1.0, \
mu2 = 5.0, sigma2 = 1.0, \
w = 0.5):
super(GaussianMixture, self).__init__(1)
if w < 0.0 or w > 1.0:
raise ValueError('The weight is not in [0, 1]. w=%s.' % (w))
self.mu1 = mu2
self.sigma1 = sigma1
self.mu2 = mu2
self.sigma2 = sigma2
self.w = w
collDist = [ot.Normal(mu1, sigma1), ot.Normal(mu2, sigma2)]
weight = [w, 1.0 - w]
self.distribution = ot.Mixture(collDist, weight)
def computeCDF(self, x):
p = self.distribution.computeCDF(x)
return p
def computePDF(self, x):
p = self.distribution.computePDF(x)
return p
def computeQuantile(self, prob, tail = False):
quantile = self.distribution.computeQuantile(prob, tail)
return quantile
def getSample(self, size):
X = self.distribution.getSample(size)
return X
def getParameter(self):
parameter = ot.Point([self.mu1, self.sigma1, \
self.mu2, self.sigma2, \
self.w])
return parameter
def setParameter(self, parameter):
[mu1, sigma1, mu2, sigma2, w] = parameter
self.__init__(mu1, sigma1, mu2, sigma2, w)
return parameter
def computeLogPDF(self, sample):
logpdf = self.distribution.computeLogPDF(sample)
return logpdf
In order to create the distribution, we use the Distribution class:
gm = ot.Distribution(GaussianMixture())
Estimating the parameters of this distribution is straightforward with MaximumLikelihoodFactory. However, we must set the bounds, because sigma cannot be negative and that w is in (0, 1).
factory = ot.MaximumLikelihoodFactory(gm)
lowerBound = [0.0, 1.e-6, 0.0, 1.e-6, 0.01]
upperBound = [0.0, 0.0, 0.0, 0.0, 0.99]
finiteLowerBound = [False, True, False, True, True]
finiteUpperBound = [False, False, False, False, True]
bounds = ot.Interval(lowerBound, upperBound, finiteLowerBound, finiteUpperBound)
factory.setOptimizationBounds(bounds)
Then we configure the optimization solver.
solver = factory.getOptimizationAlgorithm()
startingPoint = [-4.0, 1.0, 7.0, 1.5, 0.3]
solver.setStartingPoint(startingPoint)
factory.setOptimizationAlgorithm(solver)
Estimating the parameters is based on the build method.
distribution = factory.build(sample)
There are two limitations with this implementation.
First, it is not as fast as it should be, because of a limitation of the PythonDistribution (see https://github.com/openturns/openturns/issues/1391).
Estimating the parameters may be difficult, because the problem may have local optimas that cannot be retrieved with the default algorithm in MaximumLikelihoodFactory. This kind of task is generally done with the EM algorithm.

what should I do if my regression model stuck at a high value loss?

I'm using neural nets for a regression problem where I have 3 features and I'm trying to predict one continuous value. I noticed that my neural net start learning good but after 10 epochs it get stuck on a high loss value and could not improve anymore.
I tried to use Adam and other adaptive optimizers instead of SGD but that didn't work. I tried a complex architectures like adding layers, neurons, batch normalization and other activations etc.. and that also didn't work.
I tried to debug and try to find out if something is wrong with the implementation but when I use only 10 examples of the data my model learn fast so there are no errors. I start to increase the examples of the data and monitoring my model results as I increase the data examples. when I reach 3000 data examples my model start to get stuck on a high value loss.
I tried to increase layers, neurons and also to try other activations, batch normalization. My data are also normalized between [-1, 1], my target value is not normalized since it is regression and I'm predicting a continuous value. I also tried using keras but I've got the same result.
My real dataset have 40000 data, I don't know what should I try, I almost try all things that I know for optimization but none of them worked. I would appreciate it if someone can guide me on this. I'll post my Code but maybe it is too messy to try to understand, I'm sure there is no problem with my implementation, I'm using skorch/pytorch and some SKlearn functions:
# take all features as an Independant variable except the bearing and distance
# here when I start small the model learn good but from 3000 data points as you can see the model stuck on a high value. I mean the start loss is 15 and it start to learn good but when it reach 9 it stucks there
# and if I try to use the whole dataset for training then the loss start at 47 and start decreasing until it reach 36 and then stucks there too
X = dataset.iloc[:3000, 0:-2].reset_index(drop=True).to_numpy().astype(np.float32)
# take distance and bearing as the output values:
y = dataset.iloc[:3000, -2:].reset_index(drop=True).to_numpy().astype(np.float32)
y_bearing = y[:, 0].reshape(-1, 1)
y_distance = y[:, 1].reshape(-1, 1)
# normalize the input values
scaler = StandardScaler()
X_norm = scaler.fit_transform(X, y)
X_br_train, X_br_test, y_br_train, y_br_test = train_test_split(X_norm,
y_bearing,
test_size=0.1,
random_state=42,
shuffle=True)
X_dis_train, X_dis_test, y_dis_train, y_dis_test = train_test_split(X_norm,
y_distance,
test_size=0.1,
random_state=42,
shuffle=True)
bearing_trainset = Dataset(X_br_train, y_br_train)
bearing_testset = Dataset(X_br_test, y_br_test)
distance_trainset = Dataset(X_dis_train, y_dis_train)
distance_testset = Dataset(X_dis_test, y_dis_test)
def root_mse(y_true, y_pred):
return np.sqrt(mean_squared_error(y_true, y_pred))
class RMSELoss(nn.Module):
def __init__(self):
super().__init__()
self.mse = nn.MSELoss()
def forward(self, yhat, y):
return torch.sqrt(self.mse(yhat, y))
class AED(nn.Module):
"""custom average euclidean distance loss"""
def __init__(self):
super().__init__()
def forward(self, yhat, y):
return torch.dist(yhat, y)
def train(on_target,
hidden_units,
batch_size,
epochs,
optimizer,
lr,
regularisation_factor,
train_shuffle):
network = None
trainset = distance_trainset if on_target.lower() == 'distance' else bearing_trainset
testset = distance_testset if on_target.lower() == 'distance' else bearing_testset
print(f"shape of trainset.X = {trainset.X.shape}, shape of trainset.y = {trainset.y.shape}")
print(f"shape of testset.X = {testset.X.shape}, shape of testset.y = {testset.y.shape}")
mse = EpochScoring(scoring=mean_squared_error, lower_is_better=True, name='MSE')
r2 = EpochScoring(scoring=r2_score, lower_is_better=False, name='R2')
rmse = EpochScoring(scoring=make_scorer(root_mse), lower_is_better=True, name='RMSE')
checkpoint = Checkpoint(dirname=f'results/{on_target}/checkpoints')
train_end_checkpoint = TrainEndCheckpoint(dirname=f'results/{on_target}/checkpoints')
if on_target.lower() == 'bearing':
network = BearingNetwork(n_features=X_norm.shape[1],
n_hidden=hidden_units,
n_out=y_distance.shape[1])
elif on_target.lower() == 'distance':
network = DistanceNetwork(n_features=X_norm.shape[1],
n_hidden=hidden_units,
n_out=1)
model = NeuralNetRegressor(
module=network,
criterion=RMSELoss,
device='cpu',
batch_size=batch_size,
lr=lr,
optimizer=optim.Adam if optimizer.lower() == 'adam' else optim.SGD,
optimizer__weight_decay=regularisation_factor,
max_epochs=epochs,
iterator_train__shuffle=train_shuffle,
train_split=predefined_split(testset),
callbacks=[mse, r2, rmse, checkpoint, train_end_checkpoint]
)
print(f"{'*' * 10} start training the {on_target} model {'*' * 10}")
history = model.fit(trainset, y=None)
print(f"{'*' * 10} End Training the {on_target} Model {'*' * 10}")
if __name__ == '__main__':
args = parser.parse_args()
train(on_target=args.on_target,
hidden_units=args.hidden_units,
batch_size=args.batch_size,
epochs=args.epochs,
optimizer=args.optimizer,
lr=args.learning_rate,
regularisation_factor=args.regularisation_lambda,
train_shuffle=args.shuffle)
and this is my network declaration:
class DistanceNetwork(nn.Module):
"""separate NN for predicting distance"""
def __init__(self, n_features=5, n_hidden=16, n_out=1):
super().__init__()
self.model = nn.Sequential(
nn.Linear(n_features, n_hidden),
nn.LeakyReLU(),
nn.Linear(n_hidden, 5),
nn.LeakyReLU(),
nn.Linear(5, n_out)
)
here is the log while training:

prioritized experience replay in deep Q-learning

i was implementing DQN in mountain car problem of openai gym. this problem is special as the positive reward is very sparse. so i thought of implementing prioritized experience replay as proposed in this paper by google deep mind.
there are certain things that are confusing me:
how do we store the replay memory. i get that pi is the priority of transition and there are two ways but what is this P(i)?
if we follow the rules given won't P(i) change every time a sample is added.
what does it mean when it says "we sample according to this probability distribution". what is the distribution.
finally how do we sample from it. i get that if we store it in a priority queue we can sample directly but we are actually storing it in a sum tree.
thanks in advance
According to the paper, there are two ways for calculating Pi and base on your choice, your implementation differs. I assume you selected Proportional Prioriziation then you should use "sum-tree" data structure for storing a pair of transition and P(i). P(i) is just the normalized version of Pi and it shows how important that transition is or in other words how effective that transition is for improving your network. When P(i) is high, it means it's so surprising for the network so it can really help the network to tune itself.
You should add each new transition with infinity priority to make sure it will be played at least once and there is no need to update all the experience replay memory for each new coming transition. During the experience replay process, you select a mini-batch and update the probability of those experiences in the mini-batch.
Each experience has a probability so all of the experiences together make a distribution and we select our next mini-batch according to this distribution.
You can sample via this policy from your sum-tree:
def retrieve(n, s):
if n is leaf_node: return n
if n.left.val >= s: return retrieve(n.left, s)
else: return retrieve(n.right, s - n.left.val)
I have taken the code from here.
You can reuse the code in OpenAI Baseline or using SumTree
import numpy as np
import random
from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree
class ReplayBuffer(object):
def __init__(self, size):
"""Create Replay buffer.
Parameters
----------
size: int
Max number of transitions to store in the buffer. When the buffer
overflows the old memories are dropped.
"""
self._storage = []
self._maxsize = size
self._next_idx = 0
def __len__(self):
return len(self._storage)
def add(self, obs_t, action, reward, obs_tp1, done):
data = (obs_t, action, reward, obs_tp1, done)
if self._next_idx >= len(self._storage):
self._storage.append(data)
else:
self._storage[self._next_idx] = data
self._next_idx = (self._next_idx + 1) % self._maxsize
def _encode_sample(self, idxes):
obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []
for i in idxes:
data = self._storage[i]
obs_t, action, reward, obs_tp1, done = data
obses_t.append(np.array(obs_t, copy=False))
actions.append(np.array(action, copy=False))
rewards.append(reward)
obses_tp1.append(np.array(obs_tp1, copy=False))
dones.append(done)
return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones)
def sample(self, batch_size):
"""Sample a batch of experiences.
Parameters
----------
batch_size: int
How many transitions to sample.
Returns
-------
obs_batch: np.array
batch of observations
act_batch: np.array
batch of actions executed given obs_batch
rew_batch: np.array
rewards received as results of executing act_batch
next_obs_batch: np.array
next set of observations seen after executing act_batch
done_mask: np.array
done_mask[i] = 1 if executing act_batch[i] resulted in
the end of an episode and 0 otherwise.
"""
idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]
return self._encode_sample(idxes)
class PrioritizedReplayBuffer(ReplayBuffer):
def __init__(self, size, alpha):
"""Create Prioritized Replay buffer.
Parameters
----------
size: int
Max number of transitions to store in the buffer. When the buffer
overflows the old memories are dropped.
alpha: float
how much prioritization is used
(0 - no prioritization, 1 - full prioritization)
See Also
--------
ReplayBuffer.__init__
"""
super(PrioritizedReplayBuffer, self).__init__(size)
assert alpha >= 0
self._alpha = alpha
it_capacity = 1
while it_capacity < size:
it_capacity *= 2
self._it_sum = SumSegmentTree(it_capacity)
self._it_min = MinSegmentTree(it_capacity)
self._max_priority = 1.0
def add(self, *args, **kwargs):
"""See ReplayBuffer.store_effect"""
idx = self._next_idx
super().add(*args, **kwargs)
self._it_sum[idx] = self._max_priority ** self._alpha
self._it_min[idx] = self._max_priority ** self._alpha
def _sample_proportional(self, batch_size):
res = []
p_total = self._it_sum.sum(0, len(self._storage) - 1)
every_range_len = p_total / batch_size
for i in range(batch_size):
mass = random.random() * every_range_len + i * every_range_len
idx = self._it_sum.find_prefixsum_idx(mass)
res.append(idx)
return res
def sample(self, batch_size, beta):
"""Sample a batch of experiences.
compared to ReplayBuffer.sample
it also returns importance weights and idxes
of sampled experiences.
Parameters
----------
batch_size: int
How many transitions to sample.
beta: float
To what degree to use importance weights
(0 - no corrections, 1 - full correction)
Returns
-------
obs_batch: np.array
batch of observations
act_batch: np.array
batch of actions executed given obs_batch
rew_batch: np.array
rewards received as results of executing act_batch
next_obs_batch: np.array
next set of observations seen after executing act_batch
done_mask: np.array
done_mask[i] = 1 if executing act_batch[i] resulted in
the end of an episode and 0 otherwise.
weights: np.array
Array of shape (batch_size,) and dtype np.float32
denoting importance weight of each sampled transition
idxes: np.array
Array of shape (batch_size,) and dtype np.int32
idexes in buffer of sampled experiences
"""
assert beta > 0
idxes = self._sample_proportional(batch_size)
weights = []
p_min = self._it_min.min() / self._it_sum.sum()
max_weight = (p_min * len(self._storage)) ** (-beta)
for idx in idxes:
p_sample = self._it_sum[idx] / self._it_sum.sum()
weight = (p_sample * len(self._storage)) ** (-beta)
weights.append(weight / max_weight)
weights = np.array(weights)
encoded_sample = self._encode_sample(idxes)
return tuple(list(encoded_sample) + [weights, idxes])
def update_priorities(self, idxes, priorities):
"""Update priorities of sampled transitions.
sets priority of transition at index idxes[i] in buffer
to priorities[i].
Parameters
----------
idxes: [int]
List of idxes of sampled transitions
priorities: [float]
List of updated priorities corresponding to
transitions at the sampled idxes denoted by
variable `idxes`.
"""
assert len(idxes) == len(priorities)
for idx, priority in zip(idxes, priorities):
assert priority > 0
assert 0 <= idx < len(self._storage)
self._it_sum[idx] = priority ** self._alpha
self._it_min[idx] = priority ** self._alpha
self._max_priority = max(self._max_priority, priority)