Why does the reward of my model not pass a threshold? - reinforcement-learning

The problem is that at the beginning the reward rises to about -400; as training continues it improves to about -100, but from there it stops increasing towards zero and stays stagnant:
10000/10000 [==============================] - 111s 11ms/step - reward: -100.0000
Interval 2 (10000 steps performed)
10000/10000 [==============================] - 126s 13ms/step - reward: -100.0000
Interval 3 (20000 steps performed)
10000/10000 [==============================] - 120s 12ms/step - reward: -100.0000
Interval 4 (30000 steps performed)
10000/10000 [==============================] - 126s 13ms/step - reward: -100.0000
Interval 5 (40000 steps performed)
9049/10000 [==========================>...] - ETA: 11s - reward: -100.0000
The model:
def step(self, action):
    reward = 0
    paqueteSale = False
    listaPuertos, cargaPuertos, nodoSrc, nodoDst, tamPaquete = self.traducirEstado()
    # Here are the different actions
    # Compute the reward
    if paqueteSale:
        if calcularMLUGlobal() < umbralMLU:
            reward = 1
        else:
            reward = umbralMLU - calcularMLUGlobal()
    else:
        reward = -100
    self.rellenarEstado(listaPuertos, cargaPuertos)
    # Return the information
    return self.estado, reward, False, {}
Does anyone have any idea why this is happening? I am using DQNAgent from rl.agents for training.

Related

How to solve the Clock Angle Problem with Python?

Clock Angle Problem: Given time in hh:mm format in 24-hour notation, calculate the shorter angle between the hour and minute hand in an analog clock.
Input: 5:30
Output: 15°
Input: 21:00
Output: 90°
Input: 12:00
Output: 0°
def clock_angle(time):
    # Extract the hours and minutes from the time
    hours, minutes = map(int, time.split(':'))
    # Calculate the angles of the hour and minute hands
    # (reduce 24-hour input to the 12-hour dial, so e.g. 21:00 behaves like 9:00)
    hour_angle = 30 * (hours % 12) + 0.5 * minutes
    minute_angle = 6 * minutes
    # Calculate the shorter angle between the hour and minute hands
    angle = abs(hour_angle - minute_angle)
    if angle > 180:
        angle = 360 - angle
    return angle
# Test the function
print(clock_angle("12:00")) # should print 0
print(clock_angle("3:00")) # should print 90
print(clock_angle("6:00")) # should print 180
print(clock_angle("9:00")) # should print 90
print(clock_angle("12:30")) # should print 165

The DQN model cannot reach the expected scores

I am working on a DQN training model for the game "CartPole-v1". The program did not report any errors in the terminal; however, the evaluation results got worse over time. This is the output data:
episode: 85 score: 18 avarage score: 20.21 epsilon: 0.66
episode: 86 score: 10 avarage score: 20.09 epsilon: 0.66
episode: 87 score: 9 avarage score: 19.97 epsilon: 0.66
episode: 88 score: 14 avarage score: 19.90 epsilon: 0.65
episode: 89 score: 9 avarage score: 19.78 epsilon: 0.65
episode: 90 score: 10 avarage score: 19.67 epsilon: 0.65
episode: 91 score: 14 avarage score: 19.60 epsilon: 0.64
episode: 92 score: 13 avarage score: 19.53 epsilon: 0.64
episode: 93 score: 17 avarage score: 19.51 epsilon: 0.64
episode: 94 score: 10 avarage score: 19.40 epsilon: 0.63
episode: 95 score: 16 avarage score: 19.37 epsilon: 0.63
episode: 96 score: 16 avarage score: 19.33 epsilon: 0.63
episode: 97 score: 10 avarage score: 19.24 epsilon: 0.62
episode: 98 score: 13 avarage score: 19.17 epsilon: 0.62
episode: 99 score: 12 avarage score: 19.10 epsilon: 0.62
episode: 100 score: 11 avarage score: 19.02 epsilon: 0.61
episode: 101 score: 17 avarage score: 19.00 epsilon: 0.61
episode: 102 score: 11 avarage score: 18.92 epsilon: 0.61
episode: 103 score: 9 avarage score: 18.83 epsilon: 0.61
I'll show my code here. First, I constructed a neural network:
import random
from torch.autograd import Variable
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import gym
from collections import deque

# construct a neural network (prepare for step1, step3.2 and 3.3)
class DQN(nn.Module):
    def __init__(self, s_space, a_space) -> None:
        # initialise the parent nn.Module class
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(s_space, 360)
        self.fc2 = nn.Linear(360, 360)
        self.fc3 = nn.Linear(360, a_space)

    # forward pass through the network
    def forward(self, input):
        out = self.fc1(input)
        out = F.relu(out)
        out = self.fc2(out)
        out = F.relu(out)
        out = self.fc3(out)
        return out
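As a quick sanity check of this network definition (not part of the original post, and assuming the CartPole-v1 sizes of 4 state dimensions and 2 actions), a single forward pass on a dummy state should produce one Q-value per action:

# assumes the DQN class defined above is in scope
net = DQN(4, 2)
dummy_state = th.zeros(1, 4)    # a batch containing one all-zero state
print(net(dummy_state).shape)   # torch.Size([1, 2]): one Q-value per action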
Instead of creating an agent class, I directly defined the select function, which chooses an action according to epsilon, and the backpropagation function, both at module level:
# define the action selection according to epsilon using the neural network (prepare for step3.2)
def select(net, epsilon, env, state):
    # randomly select an action if not greedy
    if np.random.rand() <= epsilon:
        action = env.action_space.sample()
        return action
    # otherwise be greedy: select the action with the highest predicted value for the given state
    else:
        actions = net(Variable(th.Tensor(state))).detach().numpy()
        action = np.argmax(actions[0])
        return action
This is the backpropagation function and the epsilon decay:
# use the loss function to improve the neural network (prepare for step3.3)
def backprbgt(net, store, batch_size, gamma, learning_rate):
    # step1: create loss function and Adam optimizer
    loss_F = nn.MSELoss()
    opt = th.optim.Adam(net.parameters(), lr=learning_rate)
    # step2: extract the samples from memory
    materials = random.sample(store, batch_size)
    # step3: calculate the arguments of the loss function:
    for t in materials:
        Q_value = net(Variable(th.Tensor(t[0])))
        # step3.1: calculate tgt_Q_value in terms of greedy:
        reward = t[3]
        if t[4] == True:
            tgt = reward
        else:
            tgt = reward + gamma * np.amax(net(Variable(th.Tensor(t[2]))).detach().numpy()[0])
        # print(tgt)
        # tgt_Q_value = Variable(th.Tensor([[float(tgt)]]), requires_grad=True)
        # print("Q_value:", Q_value)
        Q_value[0][t[1]] = tgt
        tgt_Q_value = Variable(th.Tensor(Q_value))
        # print("tgt:", tgt_Q_value)
        # step3.2: calculate evlt_Q_value
        # index = th.tensor([[t[1]]])
        # evlt_Q_value = Q_value.gather(1, index)  # gather tgt into the corresponding action
        evlt_Q_value = net(Variable(th.Tensor(t[0])))
        # print("evlt:", evlt_Q_value)
        # step4: backward and optimization
        loss = loss_F(evlt_Q_value, tgt_Q_value)
        # print(loss)
        opt.zero_grad()
        loss.backward()
        opt.step()

# step5: decrease epsilon for exploitation
def decrease(epsilon, min_epsilon, decrease_rate):
    if epsilon > min_epsilon:
        epsilon *= decrease_rate
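One small point worth flagging here (an observation, not the original author's note): decrease() only modifies its local copy of epsilon, so calling it has no effect on the caller's variable. A returning variant, assuming that was the intent, would look like:

# hypothetical variant that returns the decayed value so the caller can reassign it
def decrease(epsilon, min_epsilon, decrease_rate):
    return max(min_epsilon, epsilon * decrease_rate)

# usage inside the training loop: epsilon = decrease(epsilon, min_epsilon, dr)

The training loop below in fact decays epsilon inline instead of calling this helper.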
After that, the parameters and the training loop look like this:
# training process
# step 1: set parameters and NN
episode = 1500
epsilon = 1.0
min_epsilon = 0.01
dr = 0.995
gamma = 0.9
lr = 0.001
batch_size = 40
memory_store = deque(maxlen=1500)

# step 2: define game category and associated states and actions
env = gym.make("CartPole-v1")
s_space = env.observation_space.shape[0]
a_space = env.action_space.n
net = DQN(s_space, a_space)
score = 0

# step 3: training
for e in range(0, episode):
    # step3.1: at the start of each episode, the current result should be refreshed
    # set initial state matrix
    s = env.reset().reshape(-1, s_space)
    # step3.2: iterate the state and action
    for run in range(500):
        # select action and get the next state according to current state "s"
        a = select(net, epsilon, env, s)
        obs, reward, done, info = env.step(a)
        next_s = obs.reshape(-1, s_space)
        s = next_s
        score += 1
        if done == True:
            reward = -10.0
            memory_store.append((s, a, next_s, reward, done))
            avs = score / (e+1)
            print("episode:", e+1, "score:", run+1, "avarage score: {:.2f}".format(avs), "epsilon: {:.2}".format(epsilon))
            break
        # save sample data
        memory_store.append((s, a, next_s, reward, done))
        if run == 499:
            print("episode:", e+1, "score:", run+1, "avarage score:", avs)
    # step3.3: whenever the episode reaches an integer multiple of the batch size,
    # we should backpropagate to improve the NN
    if len(memory_store) > batch_size:
        backprbgt(net, memory_store, batch_size, gamma, lr)  # here we need the backprbgt function to do the backward pass
    if epsilon > min_epsilon:
        epsilon = epsilon * dr
Throughout training there were no errors or exceptions. However, instead of the score increasing, the model produced lower scores in the later episodes. I think the theory behind this model is correct, but I cannot find where the error is, although I have tried many ways of improving my code, including rechecking the input arguments of the network and modifying the data structure of the two arguments of the loss function. I am pasting my code here and hoping to get some help on how to fix it. Thanks!
Check out the code. For the most part it is the same as the snippet above, but there are some changes:
For each step in the replay buffer (called memory_store in the code) a namedtuple is used, so in the update it is much easier to read t.reward than to work out what each index of step t means.
The DQN class has an update method; it is better to keep the optimizer as an attribute of the class than to create it every time backprbgt is called.
The usage of torch.autograd.Variable is unnecessary here, so it was removed.
The update in backprbgt is done per batch.
The hidden layer size is decreased from 360 to 32, while the batch size is increased from 40 to 128.
The network is updated once every 10 episodes, but on 10 batches from the replay buffer.
The average score is printed every 50 episodes, based on the last 10 episodes.
Seeds are added.
Also, RL takes a long time to learn anything, so hoping that it will get close to even 100 points after 100 episodes is somewhat optimistic. For the linked code, averaging over 5 runs gives the following dynamics:
X axis -- number of episodes (yes, 70K, but it is about 20 minutes of real time)
Y axis -- number of steps per episode
As can be seen, after 70K episodes the algorithm achieves a reward comparable to the highest possible in this environment (500). By tweaking hyperparameters a faster rate can be achieved, but also remember that this is DQN without any modifications.
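The linked code is not reproduced here, but a rough sketch of the refactor described above (namedtuple transitions, the optimizer kept as a class attribute inside an update method, and per-batch updates) might look like the following. The Transition field names and the update signature are assumptions for illustration, not the exact linked code:

import random
from collections import namedtuple
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

# hypothetical transition record; field names chosen for readability,
# with each state assumed to be stored as a flat array of length s_space
Transition = namedtuple("Transition", ["state", "action", "next_state", "reward", "done"])

class DQN(nn.Module):
    def __init__(self, s_space, a_space, hidden=32, lr=1e-3):
        super().__init__()
        self.fc1 = nn.Linear(s_space, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fc3 = nn.Linear(hidden, a_space)
        # keep the optimizer as an attribute instead of recreating it on every update
        self.opt = th.optim.Adam(self.parameters(), lr=lr)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

    def update(self, batch, gamma):
        # one gradient step on a whole batch of Transition tuples
        states = th.tensor(np.array([t.state for t in batch]), dtype=th.float32)
        actions = th.tensor([t.action for t in batch], dtype=th.int64).unsqueeze(1)
        rewards = th.tensor([t.reward for t in batch], dtype=th.float32)
        next_states = th.tensor(np.array([t.next_state for t in batch]), dtype=th.float32)
        dones = th.tensor([float(t.done) for t in batch], dtype=th.float32)

        q_pred = self(states).gather(1, actions).squeeze(1)
        with th.no_grad():
            q_next = self(next_states).max(dim=1).values
        q_target = rewards + gamma * q_next * (1.0 - dones)

        loss = F.mse_loss(q_pred, q_target)
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        return loss.item()

With a replay buffer of Transition objects, the per-batch update would then be called as net.update(random.sample(memory_store, 128), gamma), for example on 10 sampled batches once every 10 episodes, matching the changes listed above.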

How to tune the batch size for LSTM using hyperband?

I am trying to tune the batch size for an LSTM using Hyperband optimisation with the following code, but it did not work: the number of training items did not change under different batch size settings. Do you know how to improve the code? Thank you in advance.
import keras_tuner as kt
import tensorflow as tf
from tensorflow import keras
import numpy as np

x_train = np.random.rand(63, 92)
y_train = np.random.randint(0, 6, (63))
x_val = np.random.rand(63, 92)
y_val = np.random.randint(0, 7, (63))

train_set = tf.keras.preprocessing.timeseries_dataset_from_array(
    x_train, y_train, sequence_length=10)
val_set = tf.keras.preprocessing.timeseries_dataset_from_array(
    x_val, y_val, sequence_length=10)

train_set = tf.keras.preprocessing.timeseries_dataset_from_array(
    x_train, y_train, sequence_length=10, batch_size=1)
val_set = tf.keras.preprocessing.timeseries_dataset_from_array(
    x_val, y_val, sequence_length=10, batch_size=1)

def model_builder(hp):
    lr = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    hp_units = hp.Int('units', min_value=32, max_value=256, step=32)
    hp_units1 = hp.Int('units1', min_value=32, max_value=256, step=32)
    lstm_model = tf.keras.models.Sequential([
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hp_units, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hp_units1)),
        tf.keras.layers.Dense(units=1)
    ])
    lstm_model.compile(loss='mse',
                       optimizer=tf.optimizers.Adam(learning_rate=lr),
                       metrics=['mse'])
    return lstm_model

class MyTuner(kt.tuners.Hyperband):
    def run_trial(self, trial, *args, **kwargs):
        kwargs['batch_size'] = trial.hyperparameters.Int('batch_size', 2, 6, step=2)
        return super(MyTuner, self).run_trial(trial, *args, **kwargs)

tuner = MyTuner(model_builder,
                objective='val_loss',
                max_epochs=4,
                factor=3,
                directory='KT',
                project_name='intro_to_kt0207',
                overwrite=True)

early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0, patience=20, verbose=0,
    mode='min', baseline=None, restore_best_weights=True
)

tuner.search(train_set, epochs=1000,
             validation_data=val_set)
This is the output:
Search: Running Trial #2
Hyperparameter |Value |Best Value So Far
learning_rate |0.0001 |0.01
units |192 |224
units1 |192 |64
batch_size |4 |2
tuner/epochs |2 |2
tuner/initial_e...|0 |0
tuner/bracket |1 |1
tuner/round |0 |0
Epoch 1/2
54/54 [==============================] - 7s 37ms/step - loss: 4.4262 - mse: 4.4262 - val_loss: 3.4221 - val_mse: 3.4221
Epoch 2/2
54/54 [==============================] - 1s 12ms/step - loss: 3.1213 - mse: 3.1213 - val_loss: 3.4463 - val_mse: 3.4463
Trial 2 Complete [00h 00m 08s]
val_loss: 3.42207670211792
Best val_loss So Far: 3.1889588832855225
Total elapsed time: 00h 00m 18s
Search: Running Trial #3
Hyperparameter |Value |Best Value So Far
learning_rate |0.01 |0.01
units |192 |224
units1 |96 |64
batch_size |2 |2
tuner/epochs |2 |2
tuner/initial_e...|0 |0
tuner/bracket |1 |1
tuner/round |0 |0
Epoch 1/2
54/54 [==============================] - 7s 34ms/step - loss: 3.6241 - mse: 3.6241 - val_loss: 3.1699 - val_mse: 3.1699
Epoch 2/2
54/54 [==============================] - 1s 12ms/step - loss: 3.1807 - mse: 3.1807 - val_loss: 3.2480 - val_mse: 3.2480
Trial 3 Complete [00h 00m 08s]
val_loss: 3.1699421405792236
Best val_loss So Far: 3.1699421405792236
Total elapsed time: 00h 00m 26s
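One background observation (not part of the question itself): a dataset built with timeseries_dataset_from_array is already batched, so the batch_size sampled in run_trial never changes how the data is batched; the batching was fixed at dataset creation with batch_size=1, which is why every trial shows 54/54 steps (63 rows with sequence_length=10 give 54 windows, one per batch). A rough, untested sketch of one possible workaround is to rebuild the datasets inside run_trial with the sampled batch size, reusing the module-level x_train, y_train, x_val and y_val arrays from the snippet above:

import tensorflow as tf
import keras_tuner as kt

# Untested sketch: rebuild the windowed datasets per trial so the sampled
# batch_size actually determines the batching. x_train, y_train, x_val and
# y_val are assumed to be the module-level arrays defined earlier.
class MyTuner(kt.tuners.Hyperband):
    def run_trial(self, trial, *args, **kwargs):
        batch_size = trial.hyperparameters.Int('batch_size', 2, 6, step=2)
        train_ds = tf.keras.preprocessing.timeseries_dataset_from_array(
            x_train, y_train, sequence_length=10, batch_size=batch_size)
        val_ds = tf.keras.preprocessing.timeseries_dataset_from_array(
            x_val, y_val, sequence_length=10, batch_size=batch_size)
        # replace the pre-batched dataset and validation_data that search() was called with
        kwargs['validation_data'] = val_ds
        return super().run_trial(trial, train_ds, **kwargs)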

Although I'm using StratifiedKFold, accuracy is always 0.5

I'm using a pre-trained ResNet50 model to classify a malaria dataset. I added two dense layers after it, with 1024 and 2048 units respectively, and one classification layer using the softmax function (results are worse with sigmoid). I used StratifiedKFold to validate this model, but the accuracy is always 0.5 after the first fold.
After the first fold, every epoch looks the same, like this:
22047/22047 [==============================] - 37s 3ms/step - loss: 8.0596 - acc: 0.5000
This is my model:
height = 100  # dimensions of image
width = 100
channel = 3  # RGB
classes = 2
batch_size = 64  # vary depending on the GPU
epochs = 10
folds = 5
optimizer = "Adam"
metrics = ["accuracy"]
loss = 'categorical_crossentropy'
random_state = 1377
chanDim = -1

model = ResNet50(include_top=False, weights="imagenet", input_shape=(height, width, channel))
# Get the ResNet50 layers up to res5c_branch2c
model = Model(input=model.input, output=model.get_layer('res5c_branch2c').output)
for layer in model.layers:
    layer.trainable = False

Flatten1 = Flatten()(model.output)
F1 = Dense(1024, activation='relu')(Flatten1)
D1 = Dropout(0.5)(F1)
F2 = Dense(2048, activation='relu')(D1)
D2 = Dropout(0.2)(F2)
F3 = Dense(classes, activation='softmax')(D2)
model = Model(inputs=model.input, outputs=F3)

# Compile the model
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
This is the validation part:
# Create a model compatible with sklearn
model = KerasClassifier(build_fn=customResnetBuild, epochs=epochs, batch_size=batch_size)
kfold = StratifiedKFold(n_splits=folds, shuffle=False, random_state=random_state)

# Make a custom score for the classification report method to get results for the mean of all folds
def classification_report_with_accuracy_score(y_true, y_pred):
    originalclass.extend(y_true)
    predictedclass.extend(y_pred)
    return accuracy_score(y_true, y_pred)  # return accuracy score

scores = cross_val_score(model, data, labels, cv=kfold, error_score="raise",
                         scoring=make_scorer(classification_report_with_accuracy_score))
print(classification_report(originalclass, predictedclass))
Result:
Mean of results: 0.6404469896025613

              precision    recall  f1-score   support

           0       0.86      0.34      0.48     13781
           1       0.59      0.94      0.72     13779

   micro avg       0.64      0.64      0.64     27560
   macro avg       0.72      0.64      0.60     27560
weighted avg       0.72      0.64      0.60     27560
This is the answer. To recap: the problem is that the number of parameters is far larger than the size of the dataset, and the usage of trainable=False is wrong.
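As a rough illustration of that recap (an assumption about one possible fix, not the linked answer's code), a much smaller head on top of a base that is frozen in one place could look like this, using pooling='avg' instead of Flatten to keep the trainable parameter count well below the dataset size:

from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Model

# Frozen ResNet50 base with global average pooling instead of Flatten,
# so the trainable head stays small relative to the dataset.
base = ResNet50(include_top=False, weights="imagenet",
                input_shape=(100, 100, 3), pooling="avg")
base.trainable = False  # freeze the entire base in one place

head = Dense(128, activation="relu")(base.output)
head = Dropout(0.5)(head)
out = Dense(2, activation="softmax")(head)

model = Model(inputs=base.input, outputs=out)
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])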

Issues with Q-learning and neural networks

I'm just starting out with Q-learning, and I've been okay using the tabular method to get some decent results. One game I found quite fun to apply Q-learning to was Blackjack, which seemed like a perfect MDP-type problem.
I've been wanting to extend this to using a neural network as a function approximator, but I'm not having any luck at all. The approach is to calculate the expected value of every action in a given state and then pick the best one, with a small chance of picking something random (epsilon-greedy). Nothing converges, it learns silly Q-values, and it can't even figure out how to play when the only card in the deck is a 5.
I am genuinely stuck, after spending hours on this and tuning hyperparameters and everything else I can think of. I feel like I must have made a fundamental error with Q-learning that I can't see. My code is below:
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
import random
import pandas as pd
import sklearn
import math
import itertools
import tensorflow as tf
from matplotlib import pyplot as plt
############################ START BLACKJACK CLASS ############################
class Blackjack(gym.Env):
    """Simple Blackjack environment"""

    def __init__(self, natural=False):
        self.action_space = spaces.Discrete(2)
        self._seed()
        # Start the first game
        self.prevState = self.reset()

    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return seed

    # Returns a tuple of the form (str, int) where str is "H" or "S" depending on if its a
    # Soft or Hard hand and int is the sum total of the cards in hand
    # Example output: ("H", 15)
    def getTotal(cards):
        running_total = 0
        softs = 0
        for c in cards:
            running_total += c
            if c == 11:
                softs += 1
            if running_total > 21 and softs > 0:
                softs -= 1
                running_total -= 10
        return "H" if softs == 0 else "S", running_total

    def drawCard():
        # Draw a random card from the deck with replacement. 11 is ACE
        # I've set it to always draw a 5. In theory this should be very easy to learn and
        # The only possible states, and their correct Q values should be:
        # Q[10_5, stand] = -1    Q[10_5, hit] = 0
        # Q[15_5, stand] = -1    Q[15_5, hit] = 0
        # Q[20_5, stand] = 0     Q[20_5, hit] = -1
        # The network can't even learn this!
        return 5
        return random.choice([5,6])
        return random.choice([2,3,4,5,6,7,8,9,10,10,10,10,11])

    def isBlackjack(cards):
        return sum(cards) == 21 and len(cards) == 2

    def getState(self):
        # Defines the state of the current game
        pstate, ptotal = Blackjack.getTotal(self.player)
        dstate, dtotal = Blackjack.getTotal(self.dealer)
        return "{}_{}".format("BJ" if Blackjack.isBlackjack(self.player) else pstate+str(ptotal), dtotal)

    def reset(self):
        # Resets the game - Dealer is dealt 1 card, player is dealt 2 cards
        # The player and dealer are represented by an array of numbers, which are the cards they were
        # dealt in order
        self.soft = "H"
        self.dealer = [Blackjack.drawCard()]
        self.player = [Blackjack.drawCard() for _ in range(2)]
        pstate, ptotal = Blackjack.getTotal(self.player)
        dstate, dtotal = Blackjack.getTotal(self.dealer)
        # Returns the current state of the game
        return self.getState()

    def step(self, action):
        assert self.action_space.contains(action)
        # Action should be 0 or 1.
        # If standing, the dealer will draw all cards until they are >= 17. This will end the episode
        # If hitting, a new card will be added to the player, if over 21, reward is -1 and episode ends
        # Stand
        if action == 0:
            pstate, ptotal = Blackjack.getTotal(self.player)
            dstate, dtotal = Blackjack.getTotal(self.dealer)
            while dtotal < 17:
                self.dealer.append(Blackjack.drawCard())
                dstate, dtotal = Blackjack.getTotal(self.dealer)
            # if player won with blackjack
            if Blackjack.isBlackjack(self.player) and not Blackjack.isBlackjack(self.dealer):
                rw = 1.5
            # if dealer bust or if the player has a higher number than dealer
            elif dtotal > 21 or (dtotal <= 21 and ptotal > dtotal and ptotal <= 21):
                rw = 1
            # if theres a draw
            elif dtotal == ptotal:
                rw = 0
            # player loses in all other situations
            else:
                rw = -1
            state = self.getState()
            # Returns (current_state, reward, boolean_true_if_episode_ended, empty_dict)
            return state, rw, True, {}
        # Hit
        else:
            # Player draws another card
            self.player.append(Blackjack.drawCard())
            # Calc new total for player
            pstate, ptotal = Blackjack.getTotal(self.player)
            state = self.getState()
            # Player went bust and episode is over
            if ptotal > 21:
                return state, -1, True, {}
            # Player is still in the game, but no observed reward yet
            else:
                return state, 0, False, {}
############################ END BLACKJACK CLASS ############################
# Converts a player or dealers hand into an array of 10 cards
# that keep track of how many of each card are held. The card is identified
# through its index:
# Index: 0 1 2 3 4 5 6 7 8 9
# Card:  2 3 4 5 6 7 8 9 T A
def cardsToX(cards):
    ans = [0] * 12
    for c in cards:
        ans[c] += 1
    ans = ans[2:12]
    return ans

# Easy way to convert Q values into weighted decision probabilities via softmax.
# This is useful if we probablistically choose actions based on their values rather
# than always choosing the max.
# eg Q[s,0] = -1
#    Q[s,1] = -2
#    softmax([-1,-2]) = [0.731, 0.269] --> 73% chance of standing, 27% chance of hitting
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

plt.ion()

# Define number of Neurons per layer
K = 20  # Layer 1
L = 10  # Layer 2
M = 5   # Layer 3
N_IN = 20  # 10 unique cards for player, and 10 for dealer = 20 total inputs
N_OUT = 2
SDEV = 0.000001
# Input / Output place holders
X = tf.placeholder(tf.float32, [None, N_IN])
X = tf.reshape(X, [-1, N_IN])
# This will be the observed reward + decay_factor * max(Q[s+1, 0], Q[s+1, 1]).
# This should be an estimate of the 'correct' Q-value with the only caveat being that
# the Q-value of the next state is a biased estimate of the true value.
Q_TARGET = tf.placeholder(tf.float32, [None, N_OUT])
# LAYER 1
W1 = tf.Variable(tf.random_normal([N_IN, K], stddev = SDEV))
B1 = tf.Variable(tf.random_normal([K], stddev = SDEV))
# LAYER 2
W2 = tf.Variable(tf.random_normal([K, L], stddev = SDEV))
B2 = tf.Variable(tf.random_normal([L], stddev = SDEV))
# LAYER 3
W3 = tf.Variable(tf.random_normal([L, M], stddev = SDEV))
B3 = tf.Variable(tf.random_normal([M], stddev = SDEV))
# LAYER 4
W4 = tf.Variable(tf.random_normal([M, N_OUT], stddev = SDEV))
B4 = tf.Variable(tf.random_normal([N_OUT], stddev = SDEV))
H1 = tf.nn.relu(tf.matmul(X, W1) + B1)
H2 = tf.nn.relu(tf.matmul(H1, W2) + B2)
H3 = tf.nn.relu(tf.matmul(H2, W3) + B3)
# The predicted Q value, as determined by our network (function approximator)
# outputs expected reward for standing and hitting in the form [stand, hit] given the
# current game state
Q_PREDICT = (tf.matmul(H3, W4) + B4)
# Is this correct? The Q_TARGET should be a combination of the real reward and the discounted
# future rewards of the future state as predicted by the network. Q_TARGET - Q_PREDICT should be
# the error in prediction, which we want to minimise. Does this loss function work to help the network
# converge to the true Q values with sufficient training?
loss_func = tf.reduce_sum(tf.square(Q_TARGET - Q_PREDICT))
# These are placeholder values to enable manually setting a decayed learning rate. For now, use
# the same learning rate all the time.
LR_START = 0.001
#LR_END = 0.000002
#LR_DECAY = 0.999
# Optimizer
LEARNING_RATE = tf.Variable(LR_START, trainable=False)
optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE)#(LEARNING_RATE)
train_step = optimizer.minimize(loss_func)
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
# Initialise the game environment
game = Blackjack()
# Number of episodes (games) to play
num_eps = 10000000
# probability of picking a random action. This decays over time
epsilon = 0.1
# discount factor. For blackjack, future rewards are just as important as immediate rewards.
discount = 1.0
all_rewards = [] # Holds all observed rewards. The rolling mean of rewards should improve as the network learns
all_Qs = [] # Holds all predicted Q values. Useful as a sanity check once the network is trained
all_losses = [] # Holds all the (Q_TARGET - Q_PREDICTED) values. The rolling mean of this should decrease
hands = [] # Holds a summary of all hands played. (game_state, Q[stand], Q[hit], action_taken)
# boolean switch to use the highest action value instead of a stochastic decision via softmax on Q-values
use_argmax = True
# Begin generating episodes
for ep in range(num_eps):
    game.reset()
    # Keep looping until the episode is over
    while True:
        # x is the array of 20 numbers. The player cards, and the dealer cards.
        x = cardsToX(game.player) + cardsToX(game.dealer)
        # Q1 refers to the predicted Q-values before any action was taken
        Q1 = sess.run(Q_PREDICT, feed_dict = {X : np.reshape( np.array(x), (-1, N_IN) )})
        all_Qs.append(Q1)
        if use_argmax:
            # action is selected to be the one with the highest Q-value
            act = np.argmax(Q1)
        else:
            # action is a weighted selection based on predicted Q_values
            act = np.random.choice(range(N_OUT), p = softmax(Q1)[0])
        if random.random() < epsilon:
            # action is selected randomly
            act = random.randint(0, N_OUT-1)
        # Get game state before action is taken
        game_state = game.getState()
        # Take action! Observe new state, reward, and if the game is over
        game_state_new, reward, done, _ = game.step(act)
        hands.append( (game_state, Q1[0][0], Q1[0][1], act, reward) )
        # Store the new state vector to feed into our network.
        # x2 corresponds to the x vector observed in state s+1
        x2 = cardsToX(game.player) + cardsToX(game.dealer)
        # Q2 refers to the predicted Q-values in the new s+1 state. This is used for the 'SARSA' update.
        Q2 = sess.run(Q_PREDICT, feed_dict = {X : np.reshape( np.array(x2), (-1, N_IN) )})
        # Store the maximum Q-value in this new state. This should be the expected reward from this new state
        maxQ2 = np.max(Q2)
        # targetQ is the same as our predicted one initially. The index of the action we took will be
        # updated to be [observed reward] + [discount_factor] * max(Q[s+1])
        targetQ = np.copy(Q1)
        # If the game is done, then there is no future state
        if done:
            targetQ[0, act] = reward
            all_rewards.append(reward)
        else:
            targetQ[0, act] = reward + discount * maxQ2
        # Perform one gradient descent update, filling the placeholder value for Q_TARGET with targetQ.
        # The returned loss is the difference between the predicted Q-values and the targetQ we just calculated
        loss, _, _ = sess.run([loss_func, Q_PREDICT, train_step],
                              feed_dict = {X : np.reshape( np.array(x), (-1, N_IN) ),
                                           Q_TARGET : targetQ}
                              )
        all_losses.append(loss)
        # Every 1000 episodes, show how the q-values moved after the gradient descent update
        if ep % 1000 == 0 and ep > 0:
            Q_NEW = sess.run(Q_PREDICT, feed_dict = {X : np.reshape( np.array(x), (-1, N_IN) ),
                                                     Q_TARGET : targetQ})
            #print(game_state, targetQ[0], Q1[0], (Q_NEW-Q1)[0], loss, ep, epsilon, act)
            rolling_window = 1000
            rolling_mean = np.mean( all_rewards[-rolling_window:] )
            rolling_loss = np.mean( all_losses[-rolling_window:] )
            print("Rolling mean reward: {:<10.4f}, Rolling loss: {:<10.4f}".format(rolling_mean, rolling_loss))
        if done:
            # Reduce chance of random action as we train the model.
            epsilon = 2/((ep/500) + 10)
            epsilon = max(0.02, epsilon)
            # rolling mean of rewards should increase over time!
            if ep % 1000 == 0 and ep > 0:
                pass  # Show the rolling mean of all losses. This should decrease over time!
                #plt.plot(pd.rolling_mean(pd.Series(all_losses), 5000))
                #plt.pause(0.02)
                #plt.show()
            break

print(cardsToX(game.player))
print(game.dealer)
Any ideas? I'm stuck :(
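As a side note (not part of the original post), the Q-values listed in the drawCard comment can be verified independently of any network, because with drawCard() fixed to 5 every trajectory is deterministic. A minimal brute-force check, assuming the Blackjack class above is in scope:

# Hypothetical sanity check: exhaustively replay action sequences against the
# deterministic all-5s environment to recover the true Q-values by hand.
def true_q(actions_so_far=()):
    """Q-values {0: stand, 1: hit} for the position reached after actions_so_far."""
    values = {}
    for act in (0, 1):                      # 0 = stand, 1 = hit
        g = Blackjack()                     # fresh deterministic game: player [5, 5], dealer [5]
        reward, done = 0, False
        for a in list(actions_so_far) + [act]:
            _, reward, done, _ = g.step(a)
            if done:
                break
        # terminal reward if the game ended, otherwise the best continuation value
        values[act] = reward if done else max(true_q(tuple(actions_so_far) + (act,)).values())
    return values

print(true_q())        # {0: -1, 1: 0}  -> matches Q[10_5, stand] and Q[10_5, hit]
print(true_q((1,)))    # {0: -1, 1: 0}  -> matches Q[15_5, *]
print(true_q((1, 1)))  # {0: 0, 1: -1}  -> matches Q[20_5, *]

Having these reference values makes it easy to tell whether the network's predictions for H10_5, H15_5 and H20_5 are actually drifting toward the true Q-values during training.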