Loss is not decreasing for convolutional autoencoder - deep-learning

I'm trying to train a convolutional autoencoder to encode and decode a piano roll representation of monophonic midi clips. I reduced the note range to 3 octaves, divide songs into 100 time step pieces (where 1 time step = 1/100th of a second), and train the net in batches of 3 pieces.
I'm using Adagrad as my optimizer, and MSE as my loss function. The loss is huge, and I see no decrease in average loss even after hundreds of training examples are fed in.
Here's my code:
"""
Most absolutely simple assumptions:
- not changing the key of any of the files
- not changing the tempo of any of the files
- take blocks of 36 by 100
- divide up all songs by this amount, cutting off any excess from the
end, train
"""
from __future__ import print_function
import cPickle as pickle
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from reverse_pianoroll import piano_roll_to_pretty_midi as pr2pm
N = 1000
# load a NxMxC dataset
# N: Number of clips
# M: Piano roll size, the number of midi notes that could possibly be 'on'
# C: Clip length, in 100ths of a second
dataset = pickle.load(open('mh-midi-data.pickle', 'rb'))
######## take a subset of the data for training ######
# based on the mean and standard deviation of non zero entries in the data, I've
# found that the most populous, and thus best range of notes to take is from
# 48 to 84 (C2 - C5); this is 3 octaves, which is much less than the original
# 10 and a half. Additionally, we're going to take a subsample of 1000 because
# i'm training on my macbook and the network is pretty simple
######################################################
dataset = dataset[:, :, 48:84, :]
dataset = dataset[:N]
######################################################
midi_dim, clip_len = dataset.shape[2:]
class Autoencoder(nn.Module):
def __init__(self, **kwargs):
super(Autoencoder, self).__init__(**kwargs)
# input is 3 x 1 x 36 x 100
self.conv1 = nn.Conv2d(in_channels=1, out_channels=14, kernel_size=(midi_dim, 2))
# now transformed to 3 x 14 x 1 x 99
self.conv2 = nn.Conv2d(in_channels=14, out_channels=77, kernel_size=(1, 4))
# now transformed to 3 x 77 x 1 x 96
input_size = 3*77*1*96
self.fc1 = nn.Linear(input_size, input_size/2)
self.fc2 = nn.Linear(input_size/2, input_size/4)
self.fc3 = nn.Linear(input_size/4, input_size/2)
self.fc4 = nn.Linear(input_size/2, input_size)
self.tconv2 = nn.ConvTranspose2d(in_channels=77, out_channels=14, kernel_size=(1, 4))
self.tconv1 = nn.ConvTranspose2d(in_channels=14, out_channels=1, kernel_size=(midi_dim, 2))
self.sigmoid = nn.Sigmoid()
return
def forward(self, x):
# print("1: {}".format(x.size()))
x = F.relu(self.conv1(x))
# print("2: {}".format(x.size()))
x = F.relu(self.conv2(x))
# print("3: {}".format(x.size()))
x = x.view(-1, np.prod(x.size()[:]))
# print("4: {}".format(x.size()))
x = F.relu(self.fc1(x))
# print("5: {}".format(x.size()))
h = F.relu(self.fc2(x))
# print("6: {}".format(h.size()))
d = F.relu(self.fc3(h))
# print("7: {}".format(d.size()))
d = F.relu(self.fc4(d))
# print("8: {}".format(d.size()))
d = d.view(3, 77, 1, 96)
# print("9: {}".format(d.size()))
d = F.relu(self.tconv2(d))
# print("10: {}".format(d.size()))
d = self.tconv1(d)
d = self.sigmoid(d)
# print("11: {}".format(d.size()))
return d
net = Autoencoder()
loss_fn = nn.MSELoss()
# optimizer = optim.SGD(net.parameters(), lr=1e-3, momentum=0.9)
optimizer = optim.Adagrad(net.parameters(), lr=1e-3)
batch_count = 0
avg_loss = 0.0
print_every = 3
print("Beginning Training")
for epoch in xrange(2):
# for i, clip in enumerate(dataset):
for i in xrange(len(dataset)/3):
batch = dataset[(3*i):(3*i + 3), :, :]
# get the input, wrap it in a Variable
inpt = Variable(torch.from_numpy(batch).type(torch.FloatTensor))
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outpt = net(inpt)
loss = loss_fn(outpt, inpt)
loss.backward()
optimizer.step()
# print stats out
avg_loss += loss.data[0]
if batch_count % print_every == print_every - 1:
print('epoch: %d, batch_count: %d, loss: %.3f'%(
epoch + 1, batch_count + 1, avg_loss / print_every))
avg_loss = 0.0
batch_count += 1
print('Finished Training')
I'm really a beginner with this stuff, so any advice would be greatly appreciated.

Double check that you normalize your inpt to be in the range of 0 to 1. For instance, if you are working with images you could just divide inpt variable by 255.

Related

Why network is not learning with this loss?

I've been playing around a bit with Pytorch and have created a convolutional network with a total of 3 layers. I created a loss function that takes the results from the first layer and tries to minimize the norm.
So that view2 displays the data after the first layer in a matrix.
During learning, the error did not change at all, and the city was equal to 1 the whole time.
I know that this code doesn't make sense, but I am very intersting to her very this code is not working.
data = sio.loadmat('ORL_32x32.mat')
x, y = data['fea'], data['gnd']
x, y = data['fea'].reshape((-1, 1, 32, 32)), data['gnd']
y = np.squeeze(y - 1) # y in [0, 1, ..., K-1]
class ConvAutoencoder(nn.Module):
def __init__(self):
super(ConvAutoencoder, self).__init__()
## encoder layers ##
# conv layer (depth from 3 --> 16), 3x3 kernels
self.conv1 = nn.Conv2d(1, 3, 3)
self.conv2 = nn.Conv2d(3 ,3, 3)
self.conv3 = nn.Conv2d(3, 3, 3)
self.conv4 = nn.Conv2d(3, 3, 3)
def forward(self, x):
x = F.relu(self.conv1(x))
x = F.relu(self.conv2(x))
x = F.relu(self.conv3(x))
x = F.relu(self.conv4(x))
return x
def test1(self, x):
x = F.relu(self.conv1(x))
x = F.relu(self.conv2(x))
return x
def test2(self, x):
x = F.relu(self.conv1(x))
x = F.relu(self.conv2(x))
x = F.relu(self.conv3(x))
x = F.relu(self.conv4(x))
return x
def my_loss(novi2):
return torch.tensor(LA.norm(novi2)).to(device)
model = ConvAutoencoder().to(device)
epochs = 950;
lossList = []
view2 = np.zeros((576,400))
view3 = np.zeros((576,400))
losses = torch.tensor(0.).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
if not isinstance(x, torch.Tensor):
x = torch.tensor(x, dtype=torch.float32, device=device)
x = x.to(device)
if isinstance(y, torch.Tensor):
y = y.to('cuda').numpy()
K = len(np.unique(y))
for epoch in range(epochs):
view2 = np.zeros((576,400))
view3 = np.zeros((576,400))
output = model.test2(x.to(device)).cpu().detach().numpy()
output1 = model.test1(x.to(device)).cpu().detach().numpy()
for i in range(numclass):
lovro = output[i]
lovro =lovro[[0]]
lovro = lovro.squeeze(axis = 0)
lovro = lovro.flatten()
for j in range(576):
view2[j][i] = lovro[j]
for i in range(numclass):
lovro = output[i]
loss = my_loss(view2)
optimizer.zero_grad()
loss.backward()
optimizer.step()
print('Epoch %02d' %
(epoch))
The way you implemented your loss does not really look "differentiable". I am putting it in quotation marks because what you are observing is a difference between mathematical diffentiation and backpropagation. There is no functional dependency in the underlying graph of computation between your variables and your loss. The reason for that is because you used an array, where you copied values into. So while your loss depends on values of "view2" it does not depend on values of outputs of your model. You have to avoid any value assignments when defining your computation.
x = np.array([0])
x[0] = output_of_network
loss = LA.norm(x) # wrong
loss = LA.norm(output_of_network) # correct

Why does Softmax(dim=0) produce poor results?

I'm getting weird results from a PyTorch Softmax layer, trying to figure out what's going on, so I boiled it down to a minimal test case, a neural network that just learns to decode binary numbers into one-hot form.
Just Softmax() gets a warning:
UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
Okay, so what to supply for X? I had been guessing 0 would be a sensible argument. Just to make sure, I tried Softmax(dim=1):
IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)
Okay, so that seems clear about allowed values. -1 apparently means the last dimension, so in this case, where the output is just a one-dimensional vector, that should mean the same thing as 0. Trying it with Softmax(dim=-1) works fine; in a few thousand epochs, the network reliably learns to decode the numbers with 100% accuracy.
Just to make sure it gives the same results, I tried it again with Softmax(dim=0) (as shown below)...
And it does not give the same result at all. The accuracy oscillates, but levels off somewhere around 20-30%.
What's going on? Why is 0 not the same as -1 in this context, and what exactly is 0 doing?
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
bits = 5
class Dataset1(Dataset):
def __init__(self):
s = []
for i in range(1 << bits):
x = []
for c in format(i, "b").zfill(bits):
x.append(float(c == "1"))
y = []
for j in range(1 << bits):
y.append(float(i == j))
x = torch.as_tensor(x)
y = torch.as_tensor(y)
s.append((x, y))
self.s = s
def __len__(self):
return len(self.s)
def __getitem__(self, i):
return self.s[i]
trainDs = Dataset1()
batchSize = 16
trainDl = DataLoader(trainDs, batch_size=batchSize)
for x, y in trainDl:
print(x.shape)
print(y.shape)
break
hiddenSize = 100
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.layers = nn.Sequential(
nn.Linear(bits, hiddenSize),
nn.ReLU(),
nn.Linear(hiddenSize, hiddenSize),
nn.Tanh(),
nn.Linear(hiddenSize, hiddenSize),
nn.ReLU(),
nn.Linear(hiddenSize, 1 << bits),
nn.Softmax(dim=0),
)
def forward(self, x):
return self.layers(x)
device = torch.device("cpu")
model = Net().to(device)
def accuracy(model, ds):
n = 0
for x, y in ds:
with torch.no_grad():
z = model(x)
if torch.argmax(y) == torch.argmax(z):
n += 1
return n / len(ds)
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
epochs = 10000
interval = epochs // 10
for epoch in range(epochs + 1):
for bi, (x, y) in enumerate(trainDl):
x = x.to(device)
y = y.to(device)
loss = criterion(model(x), y)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if epoch % interval == 0 and not bi:
print(f"{epoch}\t{loss}\t{accuracy(model, trainDs)}")
In the accuracy function, you forgot to create a new dimension for the batch (batchsize=1), which explains why it gives that error when you use dim=1. Regarding the dimension of the softmax, you can check this post.
Below is the modified code.
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
bits = 5
class Dataset1(Dataset):
def __init__(self):
s = []
for i in range(1 << bits):
x = []
for c in format(i, "b").zfill(bits):
x.append(float(c == "1"))
y = []
for j in range(1 << bits):
y.append(float(i == j))
x = torch.as_tensor(x)
y = torch.as_tensor(y)
s.append((x, y))
self.s = s
def __len__(self):
return len(self.s)
def __getitem__(self, i):
return self.s[i]
trainDs = Dataset1()
batchSize = 16
trainDl = DataLoader(trainDs, batch_size=batchSize, drop_last=True)
for x, y in trainDl:
print(x.shape)
print(y.shape)
break
hiddenSize = 100
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.layers = nn.ModuleList(
[nn.Linear(bits, hiddenSize),
nn.ReLU(),
nn.Linear(hiddenSize, hiddenSize),
nn.Tanh(),
nn.Linear(hiddenSize, hiddenSize),
nn.ReLU(),
nn.Linear(hiddenSize, 1 << bits),
nn.Softmax(dim=1)]
)
def forward(self, x):
for i,layer in enumerate(self.layers):
x = layer(x)
if i == 6:
pass
#print('softmax input shape',x.shape)
#print('softmax output shape',torch.nn.functional.softmax(x,dim=1).shape)
#print('linear',x.shape)
#print('output',x.shape)
return x
device = torch.device("cpu")
model = Net().to(device)
def accuracy(model, ds):
n = 0
for x, y in ds:
x = x.unsqueeze(0) # create a batch of size 1
y = y.unsqueeze(0) # create a batch of size 1
with torch.no_grad():
z = model(x)
print(z.shape)
break
if torch.argmax(y) == torch.argmax(z):
n += 1
return n / len(ds)
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
epochs = 10000
interval = epochs // 10
for epoch in range(epochs + 1):
for bi, (x, y) in enumerate(trainDl):
x = x.to(device)
y = y.to(device)
loss = criterion(model(x), y)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if epoch % interval == 0 and not bi:
print(f"{epoch}\t{loss}\t{accuracy(model, trainDs)}")

Why Deep Adaptive Input Normalization (DAIN) normalizes time series data accross rows?

The DAIN paper describes how a network learns to normalize time series data by itself, here is how the authors implemented it. The code leads me to think that normalization is happening across rows, not columns. Can anyone explain why it is implemented that way? Because I always thought that one normalizes time series only across columns to keep each feature's true information.
Here is the piece the does normalization:
```python
class DAIN_Layer(nn.Module):
def __init__(self, mode='adaptive_avg', mean_lr=0.00001, gate_lr=0.001, scale_lr=0.00001, input_dim=144):
super(DAIN_Layer, self).__init__()
print("Mode = ", mode)
self.mode = mode
self.mean_lr = mean_lr
self.gate_lr = gate_lr
self.scale_lr = scale_lr
# Parameters for adaptive average
self.mean_layer = nn.Linear(input_dim, input_dim, bias=False)
self.mean_layer.weight.data = torch.FloatTensor(data=np.eye(input_dim, input_dim))
# Parameters for adaptive std
self.scaling_layer = nn.Linear(input_dim, input_dim, bias=False)
self.scaling_layer.weight.data = torch.FloatTensor(data=np.eye(input_dim, input_dim))
# Parameters for adaptive scaling
self.gating_layer = nn.Linear(input_dim, input_dim)
self.eps = 1e-8
def forward(self, x):
# Expecting (n_samples, dim, n_feature_vectors)
# Nothing to normalize
if self.mode == None:
pass
# Do simple average normalization
elif self.mode == 'avg':
avg = torch.mean(x, 2)
avg = avg.resize(avg.size(0), avg.size(1), 1)
x = x - avg
# Perform only the first step (adaptive averaging)
elif self.mode == 'adaptive_avg':
avg = torch.mean(x, 2)
adaptive_avg = self.mean_layer(avg)
adaptive_avg = adaptive_avg.resize(adaptive_avg.size(0), adaptive_avg.size(1), 1)
x = x - adaptive_avg
# Perform the first + second step (adaptive averaging + adaptive scaling )
elif self.mode == 'adaptive_scale':
# Step 1:
avg = torch.mean(x, 2)
adaptive_avg = self.mean_layer(avg)
adaptive_avg = adaptive_avg.resize(adaptive_avg.size(0), adaptive_avg.size(1), 1)
x = x - adaptive_avg
# Step 2:
std = torch.mean(x ** 2, 2)
std = torch.sqrt(std + self.eps)
adaptive_std = self.scaling_layer(std)
adaptive_std[adaptive_std <= self.eps] = 1
adaptive_std = adaptive_std.resize(adaptive_std.size(0), adaptive_std.size(1), 1)
x = x / (adaptive_std)
elif self.mode == 'full':
# Step 1:
avg = torch.mean(x, 2)
adaptive_avg = self.mean_layer(avg)
adaptive_avg = adaptive_avg.resize(adaptive_avg.size(0), adaptive_avg.size(1), 1)
x = x - adaptive_avg
# # Step 2:
std = torch.mean(x ** 2, 2)
std = torch.sqrt(std + self.eps)
adaptive_std = self.scaling_layer(std)
adaptive_std[adaptive_std <= self.eps] = 1
adaptive_std = adaptive_std.resize(adaptive_std.size(0), adaptive_std.size(1), 1)
x = x / adaptive_std
# Step 3:
avg = torch.mean(x, 2)
gate = F.sigmoid(self.gating_layer(avg))
gate = gate.resize(gate.size(0), gate.size(1), 1)
x = x * gate
else:
assert False
return x
```
I am not sure either but they do transpose in forward function : x = x.transpose(1, 2) of the MLP class. Thus, it seemed to me that they normalise over time for each feature.

Using libreOffice calc to fit a plane through a set of 3D points minimizing the total distance

Consider a set of 3D points:
| y/z | -1 | 0 | 1 |
|:---:|:------:|:------:|:------:|
| 5 | 19.898 | 19.905 | 19.913 |
| 0 | 19.898 | 19.92 | 19.935 |
| -3 | 19.883 | 19.883 | 19.92 |
| -4 | 19.86 | 19.898 | 19.898 |
where the rows are yis, columns are zis and the content are xis.
I want to fit a plane of
Ax + By + Cz + D = 0
into these points in a way the total distance of:
E = ∑ (|Axi + Byi + Czi + D| / √(A^2 + B^2 + C^2))
to be minimized. Consider that I want to have the absolute deviation |...| not the variance as used in conventional regression methods. Also please consider that the dimension of the actual data frame is much bigger, so it will be great if the solution is computationally efficient too.
I would appreciate if you could help me with this issue. Thanks in advance.
Reference: equations from here.
This stand-alone Python program should do what you want for the fitting. I do not know how to call it from Calc, or pass data and results back and forth between Calc and Python.
Ax + By + Cz + D = 0
rearranges to
Ax + By + D = -Cz
which rearranges to
(Ax + By + D) / -C = z
That is a 3D surface equation of the form "z = f(x,y)", easily fit with scipy's curve_fit as shown here:
import numpy, scipy, scipy.optimize
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm # to colormap 3D surfaces from blue to red
import matplotlib.pyplot as plt
graphWidth = 800 # units are pixels
graphHeight = 600 # units are pixels
# 3D contour plot lines
numberOfContourLines = 16
def SurfacePlot(func, data, fittedParameters):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = func(numpy.array([X, Y]), *fittedParameters)
axes.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=1, antialiased=True)
axes.scatter(x_data, y_data, z_data) # show data along with plotted surface
axes.set_title('Surface Plot (click-drag with mouse)') # add a title for surface plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
axes.set_zlabel('Z Data') # Z axis data label
plt.show()
plt.close('all') # clean up after using pyplot or else thaere can be memory and process problems
def ContourPlot(func, data, fittedParameters):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
x_data = data[0]
y_data = data[1]
z_data = data[2]
xModel = numpy.linspace(min(x_data), max(x_data), 20)
yModel = numpy.linspace(min(y_data), max(y_data), 20)
X, Y = numpy.meshgrid(xModel, yModel)
Z = func(numpy.array([X, Y]), *fittedParameters)
axes.plot(x_data, y_data, 'o')
axes.set_title('Contour Plot') # add a title for contour plot
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
CS = matplotlib.pyplot.contour(X, Y, Z, numberOfContourLines, colors='k')
matplotlib.pyplot.clabel(CS, inline=1, fontsize=10) # labels for contours
plt.show()
plt.close('all') # clean up after using pyplot or else thaere can be memory and process problems
def ScatterPlot(data):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
matplotlib.pyplot.grid(True)
axes = Axes3D(f)
x_data = data[0]
y_data = data[1]
z_data = data[2]
axes.scatter(x_data, y_data, z_data)
axes.set_title('Scatter Plot (click-drag with mouse)')
axes.set_xlabel('X Data')
axes.set_ylabel('Y Data')
axes.set_zlabel('Z Data')
plt.show()
plt.close('all') # clean up after using pyplot or else thaere can be memory and process problems
def func(data, A, B, C, D):
x = data[0]
y = data[1]
return (A*x + B*y + D) / -C
if __name__ == "__main__":
xData = numpy.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0])
yData = numpy.array([11.0, 12.1, 13.0, 14.1, 15.0, 16.1, 17.0, 18.1, 90.0])
zData = numpy.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.0, 9.9])
data = [xData, yData, zData]
initialParameters = [1.0, 1.0, 1.0, 1.0] # these are the same as scipy default values in this example
# here a non-linear surface fit is made with scipy's curve_fit()
fittedParameters, pcov = scipy.optimize.curve_fit(func, [xData, yData], zData, p0 = initialParameters)
ScatterPlot(data)
SurfacePlot(func, data, fittedParameters)
ContourPlot(func, data, fittedParameters)
print('fitted prameters', fittedParameters)
modelPredictions = func(data, *fittedParameters)
absError = modelPredictions - zData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(zData))
print('RMSE:', RMSE)
print('R-squared:', Rsquared)

Issues with Q-learning and neural networks

I'm just starting out learning Q-learning, and I've been okay with using the tabular method to get some decent results. One game I found quite fun to use Q-learning was with Blackjack, which seemed like a perfect MDP type problem.
I've been wanting to extend this to using a neural network as a function approximator, but I'm not having any luck at all. The approach is to calculate the expected value for every action in a given state and then pick the best one with a small chance of picking something random (epsilon greedy). Nothing converges, it learns silly Q-values, and it can't even figure out how to play when the only card in the deck is 5.
I am genuinely stuck, after spending hours on this and tuning hyper parameters and everything else I can think of. I feel like I must have made a fundamental error with Q-learning that I can't see. My code is below:
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
import random
import pandas as pd
import sklearn
import math
import itertools
import tensorflow as tf
from matplotlib import pyplot as plt
############################ START BLACKJACK CLASS ############################
class Blackjack(gym.Env):
"""Simple Blackjack environment"""
def __init__(self, natural=False):
self.action_space = spaces.Discrete(2)
self._seed()
# Start the first game
self.prevState = self.reset()
def _seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return seed
# Returns a tuple of the form (str, int) where str is "H" or "S" depending on if its a
# Soft or Hard hand and int is the sum total of the cards in hand
# Example output: ("H", 15)
def getTotal(cards):
running_total = 0
softs = 0
for c in cards:
running_total += c
if c == 11:
softs += 1
if running_total > 21 and softs > 0:
softs -= 1
running_total -= 10
return "H" if softs == 0 else "S", running_total
def drawCard():
# Draw a random card from the deck with replacement. 11 is ACE
# I've set it to always draw a 5. In theory this should be very easy to learn and
# The only possible states, and their correct Q values should be:
# Q[10_5, stand] = -1 Q[10_5, hit] = 0
# Q[15_5, stand] = -1 Q[15_5, hit] = 0
# Q[20_5, stand] = 0 Q[20_5, hit] = -1
# The network can't even learn this!
return 5
return random.choice([5,6])
return random.choice([2,3,4,5,6,7,8,9,10,10,10,10,11])
def isBlackjack(cards):
return sum(cards) == 21 and len(cards) == 2
def getState(self):
# Defines the state of the current game
pstate, ptotal = Blackjack.getTotal(self.player)
dstate, dtotal = Blackjack.getTotal(self.dealer)
return "{}_{}".format("BJ" if Blackjack.isBlackjack(self.player) else pstate+str(ptotal), dtotal)
def reset(self):
# Resets the game - Dealer is dealt 1 card, player is dealt 2 cards
# The player and dealer are represented by an array of numbers, which are the cards they were
# dealt in order
self.soft = "H"
self.dealer = [Blackjack.drawCard()]
self.player = [Blackjack.drawCard() for _ in range(2)]
pstate, ptotal = Blackjack.getTotal(self.player)
dstate, dtotal = Blackjack.getTotal(self.dealer)
# Returns the current state of the game
return self.getState()
def step(self, action):
assert self.action_space.contains(action)
# Action should be 0 or 1.
# If standing, the dealer will draw all cards until they are >= 17. This will end the episode
# If hitting, a new card will be added to the player, if over 21, reward is -1 and episode ends
# Stand
if action == 0:
pstate, ptotal = Blackjack.getTotal(self.player)
dstate, dtotal = Blackjack.getTotal(self.dealer)
while dtotal < 17:
self.dealer.append(Blackjack.drawCard())
dstate, dtotal = Blackjack.getTotal(self.dealer)
# if player won with blackjack
if Blackjack.isBlackjack(self.player) and not Blackjack.isBlackjack(self.dealer):
rw = 1.5
# if dealer bust or if the player has a higher number than dealer
elif dtotal > 21 or (dtotal <= 21 and ptotal > dtotal and ptotal <= 21):
rw = 1
# if theres a draw
elif dtotal == ptotal:
rw = 0
# player loses in all other situations
else:
rw = -1
state = self.getState()
# Returns (current_state, reward, boolean_true_if_episode_ended, empty_dict)
return state, rw, True, {}
# Hit
else:
# Player draws another card
self.player.append(Blackjack.drawCard())
# Calc new total for player
pstate, ptotal = Blackjack.getTotal(self.player)
state = self.getState()
# Player went bust and episode is over
if ptotal > 21:
return state, -1, True, {}
# Player is still in the game, but no observed reward yet
else:
return state, 0, False, {}
############################ END BLACKJACK CLASS ############################
# Converts a player or dealers hand into an array of 10 cards
# that keep track of how many of each card are held. The card is identified
# through its index:
# Index: 0 1 2 3 4 5 6 7 9 10
# Card: 2 3 4 5 6 7 8 9 T A
def cardsToX(cards):
ans = [0] * 12
for c in cards:
ans[c] += 1
ans = ans[2:12]
return ans
# Easy way to convert Q values into weighted decision probabilities via softmax.
# This is useful if we probablistically choose actions based on their values rather
# than always choosing the max.
# eg Q[s,0] = -1
# Q[s,1] = -2
# softmax([-1,-2]) = [0.731, 0.269] --> 73% chance of standing, 27% chance of hitting
def softmax(x):
"""Compute softmax values for each sets of scores in x."""
e_x = np.exp(x - np.max(x))
return e_x / e_x.sum()
plt.ion()
# Define number of Neurons per layer
K = 20 # Layer 1
L = 10 # Layer 2
M = 5 # Layer 2
N_IN = 20 # 10 unique cards for player, and 10 for dealer = 20 total inputs
N_OUT = 2
SDEV = 0.000001
# Input / Output place holders
X = tf.placeholder(tf.float32, [None, N_IN])
X = tf.reshape(X, [-1, N_IN])
# This will be the observed reward + decay_factor * max(Q[s+1, 0], Q[s+1, 1]).
# This should be an estimate of the 'correct' Q-value with the ony caveat being that
# the Q-value of the next state is a biased estimate of the true value.
Q_TARGET = tf.placeholder(tf.float32, [None, N_OUT])
# LAYER 1
W1 = tf.Variable(tf.random_normal([N_IN, K], stddev = SDEV))
B1 = tf.Variable(tf.random_normal([K], stddev = SDEV))
# LAYER 2
W2 = tf.Variable(tf.random_normal([K, L], stddev = SDEV))
B2 = tf.Variable(tf.random_normal([L], stddev = SDEV))
# LAYER 3
W3 = tf.Variable(tf.random_normal([L, M], stddev = SDEV))
B3 = tf.Variable(tf.random_normal([M], stddev = SDEV))
# LAYER 4
W4 = tf.Variable(tf.random_normal([M, N_OUT], stddev = SDEV))
B4 = tf.Variable(tf.random_normal([N_OUT], stddev = SDEV))
H1 = tf.nn.relu(tf.matmul(X, W1) + B1)
H2 = tf.nn.relu(tf.matmul(H1, W2) + B2)
H3 = tf.nn.relu(tf.matmul(H2, W3) + B3)
# The predicted Q value, as determined by our network (function approximator)
# outputs expected reward for standing and hitting in the form [stand, hit] given the
# current game state
Q_PREDICT = (tf.matmul(H3, W4) + B4)
# Is this correct? The Q_TARGET should be a combination of the real reward and the discounted
# future rewards of the future state as predicted by the network. Q_TARGET - Q_PREDICT should be
# the error in prediction, which we want to minimise. Does this loss function work to help the network
# converge to the true Q values with sufficient training?
loss_func = tf.reduce_sum(tf.square(Q_TARGET - Q_PREDICT))
# This are some placeholder values to enable manually set decayed learning rates. For now, use
# the same learning rate all the time.
LR_START = 0.001
#LR_END = 0.000002
#LR_DECAY = 0.999
# Optimizer
LEARNING_RATE = tf.Variable(LR_START, trainable=False)
optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE)#(LEARNING_RATE)
train_step = optimizer.minimize(loss_func)
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
# Initialise the game environment
game = Blackjack()
# Number of episodes (games) to play
num_eps = 10000000
# probability of picking a random action. This decays over time
epsilon = 0.1
# discount factor. For blackjack, future rewards are equally important as immediate rewards.
discount = 1.0
all_rewards = [] # Holds all observed rewards. The rolling mean of rewards should improve as the network learns
all_Qs = [] # Holds all predicted Q values. Useful as a sanity check once the network is trained
all_losses = [] # Holds all the (Q_TARGET - Q_PREDICTED) values. The rolling mean of this should decrease
hands = [] # Holds a summary of all hands played. (game_state, Q[stand], Q[hit], action_taken)
# boolean switch to use the highest action value instead of a stochastic decision via softmax on Q-values
use_argmax = True
# Begin generating episodes
for ep in range(num_eps):
game.reset()
# Keep looping until the episode is not over
while True:
# x is the array of 20 numbers. The player cards, and the dealer cards.
x = cardsToX(game.player) + cardsToX(game.dealer)
# Q1 refers to the predicted Q-values before any action was taken
Q1 = sess.run(Q_PREDICT, feed_dict = {X : np.reshape( np.array(x), (-1, N_IN) )})
all_Qs.append(Q1)
if use_argmax:
# action is selected to be the one with the highest Q-value
act = np.argmax(Q1)
else:
# action is a weighted selection based on predicted Q_values
act = np.random.choice(range(N_OUT), p = softmax(Q1)[0])
if random.random() < epsilon:
# action is selected randomly
act = random.randint(0, N_OUT-1)
# Get game state before action is taken
game_state = game.getState()
# Take action! Observe new state, reward, and if the game is over
game_state_new, reward, done, _ = game.step(act)
hands.append( (game_state, Q1[0][0], Q1[0][1], act, reward) )
# Store the new state vector to feed into our network.
# x2 corresponds to the x vector observed in state s+1
x2 = cardsToX(game.player) + cardsToX(game.dealer)
# Q2 refers to the predicted Q-values in the new s+1 state. This is used for the 'SARSA' update.
Q2 = sess.run(Q_PREDICT,feed_dict = {X : np.reshape( np.array(x2), (-1, N_IN) )})
# Store the maximum Q-value in this new state. This should be the expected reward from this new state
maxQ2 = np.max(Q2)
# targetQ is the same as our predicted one initially. The index of the action we took will be
# updated to be [observed reward] + [discount_factor] * max(Q[s+1])
targetQ = np.copy(Q1)
# If the game is done, then there is no future state
if done:
targetQ[0,act] = reward
all_rewards.append(reward)
else:
targetQ[0,act] = reward + discount * maxQ2
# Perform one gradient descent update, filling the placeholder value for Q_TARGET with targetQ.
# The returned loss is the difference between the predicted Q-values and the targetQ we just calculated
loss, _, _ = sess.run([loss_func, Q_PREDICT, train_step],
feed_dict = {X : np.reshape( np.array(x), (-1, N_IN) ),
Q_TARGET : targetQ}
)
all_losses.append(loss)
# Every 1000 episodes, show how the q-values moved after the gradient descent update
if ep % 1000 == 0 and ep > 0:
Q_NEW = sess.run(Q_PREDICT, feed_dict = {X : np.reshape( np.array(x), (-1, N_IN) ),
Q_TARGET : targetQ})
#print(game_state, targetQ[0], Q1[0], (Q_NEW-Q1)[0], loss, ep, epsilon, act)
rolling_window = 1000
rolling_mean = np.mean( all_rewards[-rolling_window:] )
rolling_loss = np.mean( all_losses[-rolling_window:] )
print("Rolling mean reward: {:<10.4f}, Rolling loss: {:<10.4f}".format(rolling_mean, rolling_loss))
if done:
# Reduce chance of random action as we train the model.
epsilon = 2/((ep/500) + 10)
epsilon = max(0.02, epsilon)
# rolling mean of rewards should increase over time!
if ep % 1000 == 0 and ep > 0:
pass# Show the rolling mean of all losses. This should decrease over time!
#plt.plot(pd.rolling_mean(pd.Series(all_losses), 5000))
#plt.pause(0.02)
#plt.show()
break
print(cardsToX(game.player))
print(game.dealer)
Any ideas? I'm stuck :(