From the UFLDL softmax regression tutorial, the gradient of the cost function is

$$\nabla_{\theta_j} J(\theta) = -\frac{1}{m} \sum_{i=1}^{m} \left[ x^{(i)} \left( 1\{y^{(i)} = j\} - P(y^{(i)} = j \mid x^{(i)}; \theta) \right) \right]$$
I tried to implement it in Python, but my loss barely changed:
def update_theta(x, y, theta, learning_rate):
    # 4 classes, 3 features
    theta_gradients = np.zeros((4, 3)).astype(np.float)
    for j in range(4):
        for i in range(len(x)):
            # p: softmax P(y = j|x, theta)
            p = softmax(sm_input(x[i], theta))[y[i]]
            # target function {y = j}
            p -= 1 if y[i] == j else 0
            x[i] = p * x[i]
            # sum gradients
            theta_gradients[j] += x[i]
        theta_gradients[j] = theta_gradients[j] / len(x)
    theta = theta.T - learning_rate * theta_gradients
    return theta.T
Loss and accuracy for my first 10 epochs:
1.3863767797767788
train acc cnt 3
1.386293406734411
train acc cnt 255
1.3862943723056675
train acc cnt 3
1.3862943609888068
train acc cnt 255
1.386294361121427
train acc cnt 3
1.3862943611198806
train acc cnt 254
1.386294361119894
train acc cnt 4
1.3862943611198937
train acc cnt 125
1.3862943611198937
train acc cnt 125
1.3862943611198937
train acc cnt 125
I don't know if I have misunderstood the equation; any suggestions would be appreciated!
Could it be that you are always re-initializing theta_gradients to zero inside your update_theta function? Normally each gradient step should learn from the previous theta.
Just as an example:
def step_gradient(theta_current, X, y, learning_rate):
    preds = predict_abs(theta_current, X)
    theta_gradient = -(2 / len(y)) * np.dot(X.T, (y - preds))
    theta = theta_current - learning_rate * theta_gradient
    return theta
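For reference, here is a minimal vectorized sketch of the UFLDL gradient itself; this is not the original poster's code, and softmax_rows and update_theta_vectorized are illustrative names. Two things worth checking against the loop above: the formula indexes the softmax output with j (not y[i]), and x[i] should not be overwritten in place while accumulating gradients.

import numpy as np

def softmax_rows(z):
    # numerically stable softmax applied independently to each row
    z = z - z.max(axis=1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

def update_theta_vectorized(x, y, theta, learning_rate):
    # x: (m, n) inputs, y: (m,) integer labels, theta: (k, n) weights
    m = len(x)
    probs = softmax_rows(x @ theta.T)      # (m, k): P(y = j | x, theta)
    indicator = np.eye(theta.shape[0])[y]  # (m, k): 1{y(i) = j}
    # gradient: -(1/m) * sum_i x(i) * (1{y(i) = j} - P(y(i) = j | x(i); theta))
    grad = -(indicator - probs).T @ x / m  # (k, n)
    return theta - learning_rate * grad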
I'm getting weird results from a PyTorch Softmax layer. To figure out what's going on, I boiled it down to a minimal test case: a neural network that just learns to decode binary numbers into one-hot form.
Just Softmax() gets a warning:
UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
Okay, so what to supply for X? I had been guessing 0 would be a sensible argument. Just to make sure, I tried Softmax(dim=1):
IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)
Okay, so that seems clear about allowed values. -1 apparently means the last dimension, so in this case, where the output is just a one-dimensional vector, that should mean the same thing as 0. Trying it with Softmax(dim=-1) works fine; in a few thousand epochs, the network reliably learns to decode the numbers with 100% accuracy.
Just to make sure it gives the same results, I tried it again with Softmax(dim=0) (as shown below)...
And it does not give the same result at all. The accuracy oscillates, but levels off somewhere around 20-30%.
What's going on? Why is 0 not the same as -1 in this context, and what exactly is 0 doing?
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

bits = 5

class Dataset1(Dataset):
    def __init__(self):
        s = []
        for i in range(1 << bits):
            x = []
            for c in format(i, "b").zfill(bits):
                x.append(float(c == "1"))
            y = []
            for j in range(1 << bits):
                y.append(float(i == j))
            x = torch.as_tensor(x)
            y = torch.as_tensor(y)
            s.append((x, y))
        self.s = s

    def __len__(self):
        return len(self.s)

    def __getitem__(self, i):
        return self.s[i]

trainDs = Dataset1()

batchSize = 16
trainDl = DataLoader(trainDs, batch_size=batchSize)
for x, y in trainDl:
    print(x.shape)
    print(y.shape)
    break

hiddenSize = 100

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(bits, hiddenSize),
            nn.ReLU(),
            nn.Linear(hiddenSize, hiddenSize),
            nn.Tanh(),
            nn.Linear(hiddenSize, hiddenSize),
            nn.ReLU(),
            nn.Linear(hiddenSize, 1 << bits),
            nn.Softmax(dim=0),
        )

    def forward(self, x):
        return self.layers(x)

device = torch.device("cpu")
model = Net().to(device)

def accuracy(model, ds):
    n = 0
    for x, y in ds:
        with torch.no_grad():
            z = model(x)
        if torch.argmax(y) == torch.argmax(z):
            n += 1
    return n / len(ds)

criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

epochs = 10000
interval = epochs // 10
for epoch in range(epochs + 1):
    for bi, (x, y) in enumerate(trainDl):
        x = x.to(device)
        y = y.to(device)
        loss = criterion(model(x), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if epoch % interval == 0 and not bi:
            print(f"{epoch}\t{loss}\t{accuracy(model, trainDs)}")
In the accuracy function, you forgot to create a new dimension for the batch (batch size = 1), which explains why you get that error when you use dim=1. Regarding the dimension of the softmax, you can check this post.
Below is the modified code.
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

bits = 5

class Dataset1(Dataset):
    def __init__(self):
        s = []
        for i in range(1 << bits):
            x = []
            for c in format(i, "b").zfill(bits):
                x.append(float(c == "1"))
            y = []
            for j in range(1 << bits):
                y.append(float(i == j))
            x = torch.as_tensor(x)
            y = torch.as_tensor(y)
            s.append((x, y))
        self.s = s

    def __len__(self):
        return len(self.s)

    def __getitem__(self, i):
        return self.s[i]

trainDs = Dataset1()

batchSize = 16
trainDl = DataLoader(trainDs, batch_size=batchSize, drop_last=True)
for x, y in trainDl:
    print(x.shape)
    print(y.shape)
    break

hiddenSize = 100

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layers = nn.ModuleList([
            nn.Linear(bits, hiddenSize),
            nn.ReLU(),
            nn.Linear(hiddenSize, hiddenSize),
            nn.Tanh(),
            nn.Linear(hiddenSize, hiddenSize),
            nn.ReLU(),
            nn.Linear(hiddenSize, 1 << bits),
            nn.Softmax(dim=1),  # normalize over the class dimension, not the batch dimension
        ])

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

device = torch.device("cpu")
model = Net().to(device)

def accuracy(model, ds):
    n = 0
    for x, y in ds:
        x = x.unsqueeze(0)  # create a batch of size 1
        y = y.unsqueeze(0)  # create a batch of size 1
        with torch.no_grad():
            z = model(x)
        if torch.argmax(y) == torch.argmax(z):
            n += 1
    return n / len(ds)

criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

epochs = 10000
interval = epochs // 10
for epoch in range(epochs + 1):
    for bi, (x, y) in enumerate(trainDl):
        x = x.to(device)
        y = y.to(device)
        loss = criterion(model(x), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if epoch % interval == 0 and not bi:
            print(f"{epoch}\t{loss}\t{accuracy(model, trainDs)}")
I coded a function picircle() that estimates pi.
Now I would like to plot this function for N values.
function Plotpi()
    p = 100  # precision of π
    N = 5
    for i in 1:N
        picircle(p)
    end
end
3.2238805970149254
3.044776119402985
3.1641791044776117
3.1243781094527363
3.084577114427861
Now I am not sure how to plot the function; I tried plot(PP()) but it didn't work.
Here is how I defined picircle:
function picircle(n)
    L = 2n + 1
    x = range(-1, 1, length=L)
    y = rand(L)
    center = (0, 0)
    radius = 1
    n_in_circle = 0
    for i in 1:L
        if norm((x[i], y[i]) .- center) < radius
            n_in_circle += 1
        end
    end
    println(4 * n_in_circle / L)
end
Your problem is that your functions don't actually return anything:
julia> x = Plotpi()
3.263681592039801
3.0646766169154227
2.845771144278607
3.18407960199005
3.044776119402985
julia> x
julia> typeof(x)
Nothing
The numbers you see are just printed to the REPL, and print doesn't return any value:
julia> x = print(5)
5
julia> typeof(x)
Nothing
So you probably just want to change your function so that it returns what you want to plot:
julia> function picircle(n)
           L = 2n + 1
           x = range(-1, 1, length=L)
           y = rand(L)
           center = (0, 0)
           radius = 1
           n_in_circle = 0
           for i in 1:L
               if norm((x[i], y[i]) .- center) < radius
                   n_in_circle += 1
               end
           end
           4 * n_in_circle / L
       end
Then:
julia> x = picircle(100)
3.263681592039801
julia> x
3.263681592039801
So now the value of the function is actually returned (rather than just printed to the console). You don't really need a separate function if you just want to do this multiple times and plot the results; a comprehension will do. Here's an example comparing the variability of the estimate with 100 draws vs 20 draws:
julia> using Plots
julia> histogram([picircle(100) for _ ∈ 1:1_000], label = "100 draws", alpha = 0.5)
julia> histogram!([picircle(20) for _ ∈ 1:1_000], label = "20 draws", alpha = 0.5)
Newb question
I am writing an OpenAI Gym Pong player with TensorFlow, and so far I have been able to create the network with a random initialization so that it randomly chooses to move the player paddle up or down.
After an epoch is over (21 games played, won by the computer), I collect a set of observations, moves, and scores. The final observation of a game receives a score, and each preceding observation can be scored using the Bellman equation.
Here is what I do not understand yet:
How do I calculate the cost function so that it can be propagated as the starting gradient for backpropagation? I totally get it with supervised learning, but here we do not have any labels to score against.
How would I start optimizing the network?
Maybe a pointer to existing code or some literature would help.
Here's where I compute the rewards:
def compute_observation_rewards(self, gamma, up_score_probabilities):
    """
    Applies Bellman equation and determines reward for each stored observation
    :param gamma: Learning decay
    :param up_score_probabilities: Probabilities for up score
    :returns: List of scores for each move
    """
    score_sum = 0
    discounted_rewards = []
    # go backwards through all observations
    for i, p in enumerate(reversed(self._states_score_action)):
        o = p[0]
        s = p[1]
        if s != 0:
            score_sum = 0
        score_sum = score_sum * gamma + s
        discounted_rewards.append(score_sum)
    # normalize scores
    discounted_rewards = np.array(discounted_rewards)
    discounted_rewards -= np.mean(discounted_rewards)
    discounted_rewards /= np.std(discounted_rewards)
    return discounted_rewards
Below is my network:
with tf.variable_scope('NN_Model', reuse=tf.AUTO_REUSE):
    layer1 = tf.layers.conv2d(inputs,
                              3,
                              3,
                              strides=(1, 1),
                              padding='valid',
                              data_format='channels_last',
                              dilation_rate=(1, 1),
                              activation=tf.nn.relu,
                              use_bias=True,
                              bias_initializer=tf.zeros_initializer(),
                              trainable=True,
                              name='layer1')
    # (N - F + 1) x (N - F + 1)
    # => layer1 should be
    # (80 - 3 + 1) * (80 - 3 + 1) = 78 x 78
    pool1 = tf.layers.max_pooling2d(layer1,
                                    pool_size=5,
                                    strides=2,
                                    name='pool1')
    # int((N - f) / s + 1)
    # (78 - 5) / 2 + 1 = 73/2 + 1 = 37
    layer2 = tf.layers.conv2d(pool1,
                              5,
                              5,
                              strides=(2, 2),
                              padding='valid',
                              data_format='channels_last',
                              dilation_rate=(1, 1),
                              activation=tf.nn.relu,
                              use_bias=True,
                              kernel_initializer=tf.random_normal_initializer(),
                              bias_initializer=tf.zeros_initializer(),
                              trainable=True,
                              name='layer2',
                              reuse=None)
    # ((N + 2*padding - F) / stride + 1) x ((N + 2*padding - F) / stride + 1)
    # => layer2 should be
    # int((37 + 0 - 5) / 2) + 1
    # 16 + 1 = 17
    pool2 = tf.layers.max_pooling2d(layer2,
                                    pool_size=3,
                                    strides=2,
                                    name='pool2')
    # int((N - f) / s + 1)
    # (17 - 3) / 2 + 1 = 7 + 1 = 8
    flat1 = tf.layers.flatten(pool2, 'flat1')
    # K x 64
    full1 = tf.contrib.layers.fully_connected(flat1,
                                              num_outputs=1,
                                              activation_fn=tf.nn.sigmoid,
                                              weights_initializer=tf.contrib.layers.xavier_initializer(),
                                              biases_initializer=tf.zeros_initializer(),
                                              trainable=True,
                                              scope=None)
The algorithm you're looking for is called REINFORCE.
I would suggest reading chapter 13 of Sutton and Barto's RL book.
The pseudocode for REINFORCE is given in that chapter; theta there is the set of weights of your neural net. If you're unfamiliar with some of the rest of the notation, I'd suggest reading Chapter 3 of the above-mentioned book, which covers the basic problem formulation.
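As a minimal sketch of how that turns into a trainable loss for the network above, assuming TensorFlow 1.x as in the question (the placeholder names actions and rewards are illustrative, and full1 is the sigmoid output P(action = up | state) from the question's network):

import tensorflow as tf

actions = tf.placeholder(tf.float32, shape=[None, 1])  # 1 = up, 0 = down
rewards = tf.placeholder(tf.float32, shape=[None, 1])  # discounted, normalized returns
# log-probability of the action that was actually taken
log_prob = actions * tf.log(full1 + 1e-8) + (1 - actions) * tf.log(1 - full1 + 1e-8)
# REINFORCE: raise the log-probability of actions in proportion to their return
loss = -tf.reduce_mean(log_prob * rewards)
train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)

Feeding the output of compute_observation_rewards in for rewards and running train_op then performs the policy-gradient update.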
I'm trying to train a convolutional autoencoder to encode and decode a piano roll representation of monophonic MIDI clips. I reduce the note range to 3 octaves, divide songs into 100-time-step pieces (where 1 time step = 1/100th of a second), and train the net in batches of 3 pieces.
I'm using Adagrad as my optimizer, and MSE as my loss function. The loss is huge, and I see no decrease in average loss even after hundreds of training examples are fed in.
Here's my code:
"""
Most absolutely simple assumptions:
- not changing the key of any of the files
- not changing the tempo of any of the files
- take blocks of 36 by 100
- divide up all songs by this amount, cutting off any excess from the
end, train
"""
from __future__ import print_function
import cPickle as pickle
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from reverse_pianoroll import piano_roll_to_pretty_midi as pr2pm
N = 1000
# load a NxMxC dataset
# N: Number of clips
# M: Piano roll size, the number of midi notes that could possibly be 'on'
# C: Clip length, in 100ths of a second
dataset = pickle.load(open('mh-midi-data.pickle', 'rb'))
######## take a subset of the data for training ######
# based on the mean and standard deviation of non zero entries in the data, I've
# found that the most populous, and thus best range of notes to take is from
# 48 to 84 (C2 - C5); this is 3 octaves, which is much less than the original
# 10 and a half. Additionally, we're going to take a subsample of 1000 because
# i'm training on my macbook and the network is pretty simple
######################################################
dataset = dataset[:, :, 48:84, :]
dataset = dataset[:N]
######################################################
midi_dim, clip_len = dataset.shape[2:]
class Autoencoder(nn.Module):
def __init__(self, **kwargs):
super(Autoencoder, self).__init__(**kwargs)
# input is 3 x 1 x 36 x 100
self.conv1 = nn.Conv2d(in_channels=1, out_channels=14, kernel_size=(midi_dim, 2))
# now transformed to 3 x 14 x 1 x 99
self.conv2 = nn.Conv2d(in_channels=14, out_channels=77, kernel_size=(1, 4))
# now transformed to 3 x 77 x 1 x 96
input_size = 3*77*1*96
self.fc1 = nn.Linear(input_size, input_size/2)
self.fc2 = nn.Linear(input_size/2, input_size/4)
self.fc3 = nn.Linear(input_size/4, input_size/2)
self.fc4 = nn.Linear(input_size/2, input_size)
self.tconv2 = nn.ConvTranspose2d(in_channels=77, out_channels=14, kernel_size=(1, 4))
self.tconv1 = nn.ConvTranspose2d(in_channels=14, out_channels=1, kernel_size=(midi_dim, 2))
self.sigmoid = nn.Sigmoid()
return
def forward(self, x):
# print("1: {}".format(x.size()))
x = F.relu(self.conv1(x))
# print("2: {}".format(x.size()))
x = F.relu(self.conv2(x))
# print("3: {}".format(x.size()))
x = x.view(-1, np.prod(x.size()[:]))
# print("4: {}".format(x.size()))
x = F.relu(self.fc1(x))
# print("5: {}".format(x.size()))
h = F.relu(self.fc2(x))
# print("6: {}".format(h.size()))
d = F.relu(self.fc3(h))
# print("7: {}".format(d.size()))
d = F.relu(self.fc4(d))
# print("8: {}".format(d.size()))
d = d.view(3, 77, 1, 96)
# print("9: {}".format(d.size()))
d = F.relu(self.tconv2(d))
# print("10: {}".format(d.size()))
d = self.tconv1(d)
d = self.sigmoid(d)
# print("11: {}".format(d.size()))
return d
net = Autoencoder()
loss_fn = nn.MSELoss()
# optimizer = optim.SGD(net.parameters(), lr=1e-3, momentum=0.9)
optimizer = optim.Adagrad(net.parameters(), lr=1e-3)
batch_count = 0
avg_loss = 0.0
print_every = 3
print("Beginning Training")
for epoch in xrange(2):
# for i, clip in enumerate(dataset):
for i in xrange(len(dataset)/3):
batch = dataset[(3*i):(3*i + 3), :, :]
# get the input, wrap it in a Variable
inpt = Variable(torch.from_numpy(batch).type(torch.FloatTensor))
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outpt = net(inpt)
loss = loss_fn(outpt, inpt)
loss.backward()
optimizer.step()
# print stats out
avg_loss += loss.data[0]
if batch_count % print_every == print_every - 1:
print('epoch: %d, batch_count: %d, loss: %.3f'%(
epoch + 1, batch_count + 1, avg_loss / print_every))
avg_loss = 0.0
batch_count += 1
print('Finished Training')
I'm really a beginner with this stuff, so any advice would be greatly appreciated.
Double-check that you normalize your inpt to be in the range 0 to 1. For instance, if you were working with images, you could just divide the inpt variable by 255.
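As a minimal sketch inside the training loop above, assuming the piano roll stores MIDI velocities (e.g. 0-127) rather than 0/1 flags:

batch = batch.astype('float32')
if batch.max() > 1.0:
    batch /= batch.max()  # scale velocities into [0, 1] to match the sigmoid output
inpt = Variable(torch.from_numpy(batch))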
I wrote the following code for gradientDescent in Octave, in a .m file:
function [theta, J_history] = gradientDescent(X, y, theta, alpha, num_iters)
  % Test values:
  X = [1 5; 1 2; 1 4; 1 5];
  y = [1 6 4 2]';
  theta = [0 0]';
  alpha = 0.01;
  num_iters = 1000;

  % Initialize some useful values:
  m = length(y);                  % number of training examples
  J_history = zeros(num_iters, 1);

  for iter = 1:num_iters
    x = X(:,2);
    h = theta(1) + (theta(2)*x);
    theta_zero = theta(1) - alpha * (1/m) * sum(h - y);
    theta_one = theta(2) - alpha * (1/m) * sum((h - y) .* x);
    theta = [theta_zero; theta_one];

    % ============================================================
    % Save the cost J in every iteration
    J_history(iter) = computeCost(X, y, theta);   % History of J
  end
  disp(min(J_history));
end
% Code for the computeCost function is as follows:
function J = computeCost(X, y, theta)
  data =
     6.1101  17.5920
     5.5277   9.1302
     8.5186  13.6620
     7.0032  11.8540
     5.8598   6.8233
     8.3829  11.8860
     7.4764   4.3483
     8.5781  12.0000
     6.4862   6.5987

  m = length(y);
  J = 0;
  X = data(:, 1);
  y = data(:, 2);
  predictions = X*theta';                % predictions of hypothesis on examples
  sqrErrors = (predictions - y).^2;      % squared errors
  J = 1/(2*m) * sum(sqrErrors);
end
When I run this from the Octave workspace, I get the following error:
Error: A(I) = X: X must have the same size as I
error: called from
gradientDescent at line 55 column 21
I have tried many things without success, and the mentors never replied properly.
Could you please tell me where I might be making a mistake?
Thanks in advance.
Bharat.