predicting simple autoregressive model with fully connected - deep-learning

The question is at the end, so you can jump straight to it; I just wanted to share my process in case someone wants to give me general advice.
I started learning how to use LSTM layers and tried to build a simple predictor to the following AR model:
class AR_model:
    """Iterator producing a third-order autoregressive (AR(3)) series.

    Each step yields
    ``a1 * X[t-1] + a2 * X[t-2] + a3 * X[t-3] + U`` where ``U`` is drawn
    uniformly from ``[noise_low, noise_high]``. The three previous values
    all start at 0.

    Parameters
    ----------
    length : int
        Number of values to yield; zero or negative yields nothing.
    a1, a2, a3 : float
        Coefficients applied to the last, second-to-last and third-to-last
        values.
    noise_low, noise_high : float
        Bounds of the uniform noise term.
    seed : int or None
        When given, a private ``random.Random(seed)`` drives the noise so
        the series is reproducible; when ``None`` the module-level
        ``random`` generator is used.
    """

    def __init__(self, length=100, a1=0.6, a2=-0.5, a3=-0.2,
                 noise_low=0.0, noise_high=0.1, seed=None):
        self.time = 0
        self.first_value = 0
        self.a1 = a1
        self.a2 = a2
        self.a3 = a3
        self.noise_low = noise_low
        self.noise_high = noise_high
        self.Xt = self.first_value
        self.Xt_minus_1 = 0
        self.Xt_minus_2 = 0
        self.length = length
        # Fall back to the shared module generator so callers that seed
        # `random` globally keep getting the same sequence as before.
        self._rng = random if seed is None else random.Random(seed)

    def __iter__(self):
        return self

    def __next__(self):
        # `>=` (not `==`) so a negative length stops immediately instead of
        # iterating forever.
        if self.time >= self.length:
            raise StopIteration
        new_value = self.a1 * self.Xt + \
            self.a2 * self.Xt_minus_1 + \
            self.a3 * self.Xt_minus_2 + \
            self._rng.uniform(self.noise_low, self.noise_high)
        # Shift the three-value history window by one step.
        self.Xt_minus_2 = self.Xt_minus_1
        self.Xt_minus_1 = self.Xt
        self.Xt = new_value
        self.time += 1
        return new_value
which basically means the following series:
$X_t = a_1 X_{t-1} + a_2 X_{t-2} + a_3 X_{t-3} + U_t$
where $a_1 = 0.6$, $a_2 = -0.5$, $a_3 = -0.2$ and $U_t \overset{\text{i.i.d.}}{\sim} \mathrm{Uniform}(0, 0.1)$
using the following forward method:
def forward(self, input):
    """Run the recurrent layer and map its final time step through the linear head.

    `input` is documented by the original author as
    [Batch x seq_length x input_size]; the result is [Batch x 1].
    """
    # Only the per-step outputs are needed; the recurrent state is discarded.
    sequence_out, _ = self.lstm(input)
    # Keep the last time step only: [Batch x hidden_state].
    last_step = sequence_out[:, -1, :]
    return self.linear(last_step)
the best result seems ok:
picture of results, 91 steps
with the following hyper-parameters:
# Hyper-parameters reported for the best LSTM run above.
signal_count = 50  # presumably the number of generated training signals — TODO confirm
signal_length = 200  # presumably the number of samples per signal — TODO confirm
hidden_state = 200  # presumably the LSTM hidden size — TODO confirm
learning_rate = 0.1
also tried it on sin and tri waves:
sin wave 20 steps
tri wave 75 steps
The tri wave might have worked with a deeper network, but I didn't bother to try.
Question 1
It makes sense that for a simple AR model, such as:
$X_t = a_1 X_{t-1} + a_2 X_{t-2} + a_3 X_{t-3} + U_t$
where $a_1 = 0.6$, $a_2 = -0.5$, $a_3 = -0.2$ and $U_t \overset{\text{i.i.d.}}{\sim} \mathrm{Uniform}(0, 0.1)$
it would be possible to get a good prediction with a simple three-input, one-layer fully connected network, where the inputs are the last three values of the AR series.
But I just get terrible results. Even when I remove the noise from the AR model I still get bad results. Am I wrong to think this?
I didn't post the code because I think its a concept problem. If someone asks, I will post.
Question 2
For the above AR model, what simple predictor would you recommend, not necessarily based on deep learning?
asking friends I got recommended kalman filter, and Markovian based.
haven't really checked them out yet.
Thank you for reading

Related

Actor Critic model returns NaN for action probabilities

I am new to RL and walking through the Keras implementation of Actor Critic.
As a variant of it, I am trying to learn the strategy for WORDLE. However, after a few runs, my action spaces all go down to nan.
actions = [nan nan nan ... nan nan nan]
Not sure what's happening. Could someone have any insights or pointers?
Attaching my code for reference.
Thanks
import pandas as pd
import numpy as np
import random
import string
import random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# Configuration parameters for the whole setup
gamma = 0.9  # Discount factor for past rewards
max_runs = 10000
eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0
# Read the word list inside a context manager so the handle is always closed.
with open("<wordle set of words data path>", "r") as my_file:
    content = my_file.read()
# One word per line. Blank lines (e.g. from a trailing newline) are dropped so
# the empty string never becomes a selectable action or secret word.
content = [word.strip() for word in content.splitlines() if word.strip()]
# The 26 lowercase letters, in order; a letter's index is its slot in the state.
lower_alphabet = list(string.ascii_lowercase)
def get_secret_word():
    """Pick the hidden target word at random from `content`."""
    secret = random.choice(content)
    return secret
def reset_available_action_space():
    """Return a fresh availability list: one flag per word in `content`, all set to 1."""
    return [1] * len(content)
def reset_guessed_alphabet_state():
    """Return a zeroed list with one slot per letter in `lower_alphabet`."""
    return [0] * len(lower_alphabet)
# One slot per letter, marking which letters are known to occur in the secret word.
def reset_contains_alphabet_state():
    """Return a zeroed list with one slot per letter in `lower_alphabet`."""
    return [0] * len(lower_alphabet)
# Flat list of 26*5 slots: five consecutive groups of 26, one group per
# position in the word. Group k marks which letter was guessed correctly at
# position k.
def reset_correct_alphabet_pos_state():
    """Return a zeroed list of length 5 * len(lower_alphabet)."""
    return [0] * (len(lower_alphabet) * 5)
def select_and_update_AVAILABLE_ACTION_SPACE(actions):
    """Return the first candidate action that is still available and mark it used.

    Parameters
    ----------
    actions : sequence of int
        Candidate word indices, in order of preference.

    Returns
    -------
    The first element of `actions` whose flag in the module-level
    `AVAILABLE_ACTION_SPACE` is set; that flag is cleared to 0.

    Raises
    ------
    IndexError
        If every candidate has already been used.
    """
    for candidate in actions:
        # Flags are 1 (available) / 0 (used), so a plain truth test suffices.
        if AVAILABLE_ACTION_SPACE[candidate]:
            AVAILABLE_ACTION_SPACE[candidate] = 0
            return candidate
    raise IndexError("no available action among the sampled candidates")
def env_reset():
    """Start a new episode.

    Returns a tuple `(state, secret_word, available_action_space)`: the
    zeroed state vector (guessed flags + contained flags + positional flags),
    a freshly drawn secret word, and an all-available action mask.
    """
    available = reset_available_action_space()
    state = (
        reset_guessed_alphabet_state()
        + reset_contains_alphabet_state()
        + reset_correct_alphabet_pos_state()
    )
    secret = get_secret_word()
    return state, secret, available
def env_step(action, state):
    """Apply one guess and return `(next_state, reward, done)`.

    `action` indexes into `content`; `state` is the flat list laid out as
    26 guessed flags, 26 contained flags, then the positional flags. The
    reward is 10 when the guess equals the module-level `SECRET_WORD`,
    otherwise -10.
    """
    guess = content[action]
    # Slicing copies, so the caller's `state` list is left untouched.
    guessed = state[:26]
    contained = state[26:52]
    positioned = state[52:]

    solved = SECRET_WORD == guess
    reward = 10 if solved else -10

    for pos, letter in enumerate(guess):
        letter_idx = lower_alphabet.index(letter)
        guessed[letter_idx] = 1
        if letter in SECRET_WORD:
            contained[letter_idx] = 1
        if SECRET_WORD[pos] == letter:
            # Positional flags are grouped in blocks of 26, one block per slot.
            positioned[26 * pos + letter_idx] = 1

    return guessed + contained + positioned, reward, solved
# State vector size: 26 guessed flags + 26 contained flags + 26*5 positional flags.
num_inputs = 182
num_actions = len(content)  # one action per word in the word list
num_hidden_1 = 256
num_hidden_2 = 128
# Two shared hidden layers feed both output heads.
inputs = layers.Input(shape=(num_inputs,))
common = layers.Dense(num_hidden_1, activation="relu")(inputs)
common = layers.Dense(num_hidden_2, activation="relu")(common)
# Actor head: softmax distribution over all words.
action = layers.Dense(num_actions, activation="softmax")(common)
# Critic head: a single scalar value estimate.
critic = layers.Dense(1)(common)
model = keras.Model(inputs=inputs, outputs=[action, critic])
optimizer = keras.optimizers.Adam(learning_rate=0.001)
huber_loss = keras.losses.Huber()
# Per-episode buffers: appended to during an episode, cleared after each update.
action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0
# Actor-critic training loop: one episode (at most six guesses) per iteration,
# followed by a single gradient update on the summed actor and critic losses.
for runs in range(max_runs):
    max_steps_per_episode = 6
    state, SECRET_WORD, AVAILABLE_ACTION_SPACE = env_reset()
    episode_reward = 0
    with tf.GradientTape() as tape:
        for timestep in range(max_steps_per_episode):
            # Convert once, then add the batch dimension to that tensor
            # (the original discarded the converted tensor).
            state_tensor = tf.convert_to_tensor(state, dtype=tf.float32)
            state_tensor = tf.expand_dims(state_tensor, 0)
            action_probs, critic_value = model(state_tensor)
            critic_value_history.append(critic_value[0, 0])
            # Draw max_steps_per_episode + 1 distinct candidates: at most
            # max_steps_per_episode - 1 words were guessed earlier in the
            # episode, so at least one candidate is still available.
            actions = np.random.choice(num_actions, size=max_steps_per_episode+1, replace = False, p=np.squeeze(action_probs))
            action = select_and_update_AVAILABLE_ACTION_SPACE(actions)
            # Clip before the log so a vanishing probability cannot yield -inf.
            action_probs_history.append(
                tf.math.log(tf.clip_by_value(action_probs[0, action], eps, 1.0))
            )
            state, reward, done = env_step(action, state)
            rewards_history.append(reward)
            episode_reward += reward
            if done:
                break
        # Update running reward to check condition for solving
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward
        # Calculate expected value from rewards
        # - At each timestep what was the total reward received after that timestep
        # - Rewards in the past are discounted by multiplying them with gamma
        # - These are the labels for our critic
        returns = []
        discounted_sum = 0
        for r in rewards_history[::-1]:
            discounted_sum = r + gamma * discounted_sum
            returns.insert(0, discounted_sum)
        # Normalize
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()
        # Calculating loss values to update our network
        history = zip(action_probs_history, critic_value_history, returns)
        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in history:
            # At this point in history, the critic estimated that we would get a
            # total reward = `value` in the future. We took an action with log
            # probability of `log_prob` and ended up receiving a total reward = `ret`.
            # The actor must be updated so that it predicts an action that leads to
            # high rewards (compared to critic's estimate) with high probability.
            #
            # The advantage is treated as a constant for the actor term. Without
            # stop_gradient, d(actor_loss)/d(value) = log_prob, which is negative
            # and, with thousands of actions, well above 1 in magnitude. That
            # outweighs the Huber gradient (bounded once the error exceeds the
            # loss's delta), so every update pushes the critic value upward
            # without limit until the weights become NaN.
            diff = ret - tf.stop_gradient(value)
            actor_losses.append(-log_prob * diff)  # actor loss
            # The critic must be updated so that it predicts a better estimate of
            # the future rewards.
            critic_losses.append(
                huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
            )
        # Backpropagation
        loss_value = sum(actor_losses) + sum(critic_losses)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        # Clear the loss and reward history
        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()
    # Log details
    episode_count += 1
    if episode_count % 10 == 0:
        template = "running reward: {:.2f} at episode {}"
        print(template.format(running_reward, episode_count))
wordle set of words data path => https://gist.github.com/cfreshman/a03ef2cba789d8cf00c08f767e0fad7b
My State Space is [guessed alphabets, alphabets contained in the secret word, alphabet in the correct position]
guessed alphabets => Array of 26 size (a-z)
alphabets contained in the secret word => Array of 26 size (a-z)
alphabet in the correct position => Array of 26 * 5 [(a-z), (a-z), (a-z), (a-z), (a-z)] (as each word is 5 letters)
The Available action spaces get updated after every action. The previously taken actions are no longer available for future actions.
I have tried both relu and tanh for activation
Observation: Critic Value keeps increasing to an extremely large values

PyTorch: Multi-class segmentation loss value != 0 when using target image as the prediction

I was performing semantic segmentation using PyTorch. There are a total of 103 different classes in the dataset and the targets are RGB images with only the Red channel containing the labels. I was using nn.CrossEntropyLoss as my loss function. For sanity, I wanted to check if using nn.CrossEntropyLoss is correct for this problem and whether it has the expected behaviour
I pick a random mask from my dataset and create a categorical version of it using this custom transform
class ToCategorical:
    """Turn a channel-first label image into a per-class binary tensor.

    The input is a (channels, H, W) tensor whose pixel values are class ids.
    The output has shape (n_classes, H, W) with a 1 in channel `c` wherever
    any input channel equals `c`. Class 0 is deliberately left unmarked, so
    channel 0 is always all zeros.
    """

    def __init__(self, n_classes: int) -> None:
        self.n_classes = n_classes

    def __call__(self, sample: torch.Tensor):
        mask = sample.permute(1, 2, 0)
        # Exclude class 0 by value. Slicing off the first unique value (as
        # before) silently dropped the smallest real label whenever the mask
        # contained no zeros.
        categories = [c for c in torch.unique(mask).tolist() if c != 0]
        # build a tensor with `n_classes` channels
        one_hot_image = torch.zeros(self.n_classes, *mask.shape[:-1])
        for category in categories:
            # get spatial locations where the category is present
            rows, cols, _ = torch.where(mask == category)
            # at the same spatial location, fill 1 in the category's channel
            one_hot_image[category, rows, cols] = 1
        return one_hot_image
And then I send this image as the output (prediction) and use the ground truth mask as the target to the loss function.
import torch.nn as nn
# Sanity check: feed the encoded ground truth back in as the "prediction".
mask = T.PILToTensor()(Image.open("path_to_image").convert("RGB"))
# Per-class encoding of the mask with a leading batch dimension.
categorical_mask = ToCategorical(103)(mask).unsqueeze(0)
mask = mask[0].unsqueeze(0) # get only the red channel, add fake batch_dim
loss_fn = nn.CrossEntropyLoss()
target = mask
# NOTE(review): nn.CrossEntropyLoss interprets `output` as unnormalised logits,
# so these 0/1 values are passed through log-softmax, not used as probabilities.
output = categorical_mask
print(output.shape, target.shape)
print(loss_fn(output, target.to(torch.long)))
I expected the loss to be zero but to my surprise, the output is as follows
torch.Size([1, 103, 600, 800]) torch.Size([1, 600, 800])
tensor(4.2836)
I verified with other samples in the dataset and I obtained similar values for other masks as well. Am I doing something wrong? I expect the loss to be = 0 when the output is the same as the target.
PS. I also know that nn.CrossEntropyLoss is the same as using log_softmax followed by nn.NLLLoss() but even I obtained the same value by using nllloss as well
For Reference
Dataset used: UECFoodPixComplete
I would like to adress this:
I expect the loss to be = 0 when the output is the same as the target.
If the prediction matches the target, i.e. the prediction corresponds to a one-hot encoding of the labels contained in the dense target tensor, the loss reaches its minimum for such an input, but that minimum is not zero. In fact, it can never be equal to zero, because nn.CrossEntropyLoss applies a softmax to the prediction and is therefore strictly positive for any finite input.
Let us take a minimal example with #C classes, a target y_true and a prediction y_pred consisting of perfect predictions:
As a quick reminder:
The softmax is applied on the logits (q_i) as p_i = exp(q_i) / sum_j(exp(q_j)):
>>> p = F.softmax(y_pred, 1)
Similarly if you are using the log-softmax, defined as logp_i = log(p_i):
>>> logp = F.log_softmax(y_pred, 1)
Then comes the negative likelihood function computed between x the input and y the target: -y*x. In association with the softmax, it comes down to -y*p, or -y*logp respectively. In any case, whether you apply the log or not, only the predictions corresponding to the true classes will remain since the others ones are zeroed-out.
That being said, applying the NLLLoss on y_pred would indeed result with a 0 as you expected in your question. However, here we apply it on the probability distribution or log-probability: p, or logp respectively!
In our specific case, the one-hot prediction is fed in as the logits: p_i = 1 for the true class and p_i = 0 for all other classes (there are #C - 1 of those). This means the softmax of the logit associated with the true class will equal exp(1)/sum_i(exp(p_i)). And since sum_i(exp(p_i)) = (#C-1)*exp(0) + exp(1), we therefore have:
softmax(p) = e / (#C - 1 + e)
Similarly for log-softmax:
log-softmax(p) = log(e / (#C-1 + e)) = 1 - log(#C - 1 + e)
If we proceed by applying the negative likelihood function we simply get cross-entropy(y_pred, y_true) = (nllloss o log-softmax)(y_pred, y_true). This results in:
loss = - (1 - log(#C - 1 + e)) = log(#C - 1 + e) - 1
This effectively corresponds to the minimum of the nn.CrossEntropyLoss function.
Regarding your specific case where #C = 103, you may have an issue in your code... since the average loss should equal to log(102 + e) - 1 i.e. around 3.65.
>>> y_true = torch.randint(0,103,(1,1,2,5))
>>> y_pred = torch.zeros(1,103,2,5).scatter(1, y_true, value=1)
You can see for yourself with one of the provided methods:
the builtin function nn.functional.cross_entropy:
>>> F.cross_entropy(y_pred, y_true[:,0])
tensor(3.6513)
manually computing the quantity:
>>> logp = F.log_softmax(y_pred, 1)
>>> -logp.gather(1, y_true).mean()
tensor(3.6513)
analytical result:
>>> log(102 + e) - 1
3.6513

Difference between WGAN and WGAN-GP (Gradient Penalty)

I just find that in the code here:
https://github.com/NUS-Tim/Pytorch-WGAN/tree/master/models
The "generator" loss, G, between WGAN and WGAN-GP is different, for WGAN:
# WGAN generator step, as quoted from the linked repository.
g_loss = self.D(fake_images)  # critic output for the generated batch
g_loss = g_loss.mean().mean(0).view(1)  # averaged, then reshaped to shape (1,)
g_loss.backward(one) # !!! `one` is passed as the upstream gradient (the question below says one = 1)
g_cost = -g_loss
But for WGAN-GP:
# WGAN-GP generator step, as quoted from the linked repository.
g_loss = self.D(fake_images)  # critic output for the generated batch
g_loss = g_loss.mean()  # averaged to a scalar
g_loss.backward(mone) # !!! `mone` is passed as the upstream gradient (the question below says mone = -1)
g_cost = -g_loss
Why one is one=1 and another is mone=-1?
You might have misread the source code: the first sample you gave does not average the result of D to compute its loss, but instead uses the binary cross-entropy.
To be more precise:
The first method ("GAN") uses the BCE loss to compute the loss terms for D and G. The standard GAN optimization objective for D is to minimize E_x[log(D(x))] + E_z[log(1-D(G(z)))]. Source code:
# Discriminator update of the BCE-based GAN, as quoted from the repository.
# Compute BCELoss using real images
outputs = self.D(images)
d_loss_real = self.loss(outputs.flatten(), real_labels) # <- bce loss
real_score = outputs
# Compute BCELoss using fake images
fake_images = self.G(z)
outputs = self.D(fake_images)
d_loss_fake = self.loss(outputs.flatten(), fake_labels) # <- bce loss
fake_score = outputs
# Optimize discriminator: both loss terms are summed and minimised together
d_loss = d_loss_real + d_loss_fake
self.D.zero_grad()
d_loss.backward()
self.d_optimizer.step()
For d_loss_real you optimize towards 1s (output is considered real), while d_loss_fake optimizes towards 0s (output is considered fake).
While the second ("WCGAN") uses the Wasserstein loss (ref) whereby we maximise for D the loss: E_x[D(x)] - E_z[D(G(z))]. Source code:
# Train discriminator
# WGAN - Training discriminator more iterations than generator
# Train with real images
# Critic output on real images, averaged to a scalar.
d_loss_real = self.D(images)
d_loss_real = d_loss_real.mean()
# `mone` as upstream gradient flips the sign (per the text below: ascent on d_loss_real).
d_loss_real.backward(mone)
# Train with fake images
z = self.get_torch_variable(torch.randn(self.batch_size, 100, 1, 1))
fake_images = self.G(z)
d_loss_fake = self.D(fake_images)
d_loss_fake = d_loss_fake.mean()
# `one` as upstream gradient keeps the sign, so d_loss_fake is minimised.
d_loss_fake.backward(one)
# [...]
# Difference between the mean critic outputs on real and on fake images.
Wasserstein_D = d_loss_real - d_loss_fake
By doing d_loss_real.backward(mone) you backpropagate with a gradient of opposite sign, i.e. it's a gradient ascent, and you end up maximizing d_loss_real.
In order to Update D network:
lossD = Expectation of D(fake data) - Expectation of D(real data) + gradient penalty
lossD ↓,D(real data) ↑
so you need to add minus one to the gradient process

Pytorch:Apply cross entropy loss with custom weight map

I am solving multi-class segmentation problem using u-net architecture in pytorch.
As specified in U-NET paper, I am trying to implement custom weight maps to counter class imbalances.
Below is the operation which I want to apply -
Also, I reduced the batch_size=1 so that I can remove that dimension while passing it to precompute_to_masks function.
I tried the below approach-
def precompute_for_image(masks):
    """Split a label tensor into one binary mask per distinct value.

    The input is moved to the CPU. The result stacks, along a new leading
    axis, one integer 0/1 mask per unique value in `masks`, in the order
    returned by `Tensor.unique()`.
    """
    cpu_masks = masks.cpu()
    per_class = []
    for class_value in cpu_masks.unique():
        # Boolean comparison cast to int64 gives 1 where the label matches, else 0.
        per_class.append((cpu_masks == class_value).long())
    return torch.stack(per_class)
def train(n_epochs, loaders, model, optimizer, criterion, use_cuda, save_path):
    """Train `model` with a per-pixel weight map applied to the loss.

    Parameters
    ----------
    n_epochs : int
        Number of passes over `loaders`.
    loaders : iterable
        Yields `(data, target)` batches. The batch size must be 1 because
        the weight map is built from a single label image.
    model, optimizer, criterion
        Network, optimizer and loss. NOTE(review): for the weighting to act
        per pixel, `criterion` must return an unreduced loss (e.g.
        reduction='none'); with a scalar loss the weights only rescale it.
    use_cuda : bool
        Move each batch to the GPU when true.
    save_path
        Unused here; kept for interface compatibility.

    Returns
    -------
    The trained model.

    Raises
    ------
    ValueError
        If a batch holds more than one target image.
    """
    for _ in range(n_epochs):
        # Running mean of the batch losses; it was read before assignment.
        train_loss = 0.0
        ###################
        # train the model #
        ###################
        model.train()
        # Iterate the loader that was passed in, not a module-level global.
        for batch_idx, (data, target) in enumerate(loaders):
            # move to GPU
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            if target.shape[0] != 1:
                raise ValueError("train() expects batch_size == 1 to build the weight map")
            optimizer.zero_grad()
            output = model(data)
            # Drop the batch dimension so the per-class masks come out as
            # (n_classes, H, W), the layout make_weight_map documents.
            temp_target = precompute_for_image(target[0])
            # make_weight_map works on numpy arrays; bring the result back
            # as a tensor with the dtype and device of `output`.
            w = output.new_tensor(make_weight_map(temp_target.numpy()))
            loss = criterion(output, target)
            # Reduce to a scalar so backward() is valid for a weighted map.
            loss = (w * loss).mean()
            loss.backward()
            optimizer.step()
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
    return model
where weight_map is the function to calculate weight mask which I got from here
The issue I am facing is that I get a memory error when I apply the following method. I am using 61 GB of RAM and a Tesla V100 GPU.
I really think I am applying it in incorrect way.
How to do it?
I am omitting the non-essential details from the training loop.
Below is my weight_map function:
from skimage.segmentation import find_boundaries
w0 = 10  # amplitude of the border term: w0 * exp(-d**2 / (2 * sigma**2))
sigma = 5  # controls how quickly the border term decays with distance d
def make_weight_map(masks):
    """
    Generate the weight maps as specified in the UNet paper
    for a set of binary masks.

    Distances to each mask's boundary come from a Euclidean distance
    transform, so memory use is proportional to the image size. The previous
    implementation built a (boundary pixels x all pixels) matrix, which
    exhausts memory on realistic images.

    Parameters
    ----------
    masks: array-like
        A 3D array of shape (n_masks, image_height, image_width),
        where each slice of the matrix along the 0th axis represents one binary mask.

    Returns
    -------
    array-like
        A 2D array of shape (image_height, image_width)
    """
    # Local import: scipy is only needed by this function.
    from scipy.ndimage import distance_transform_edt

    masks = (np.asarray(masks) > 0).astype(int)
    nrows, ncols = masks.shape[1:]

    # dist[i, r, c] = Euclidean distance from pixel (r, c) to the nearest
    # inner-boundary pixel of mask i.
    dist = np.empty(masks.shape, dtype=float)
    for i, mask in enumerate(masks):
        bounds = find_boundaries(mask, mode='inner')
        if bounds.any():
            # The transform measures distance to the nearest zero, so the
            # boundary pixels are made the zeros.
            dist[i] = distance_transform_edt(~bounds)
        else:
            # A mask without boundary contributes no border weight.
            dist[i] = np.inf

    if masks.shape[0] == 1:
        d = dist[0]
    else:
        # Sum of the two smallest boundary distances at each pixel (d1 + d2).
        d = np.partition(dist, 1, axis=0)[:2].sum(axis=0)
    border_loss_map = w0 * np.exp((-1 * d ** 2) / (2 * (sigma ** 2)))

    # class weight map
    loss = np.zeros((nrows, ncols))
    w_1 = 1 - masks.sum() / loss.size
    w_0 = 1 - w_1
    coverage = masks.sum(0)
    loss[coverage == 1] = w_1
    loss[coverage == 0] = w_0
    return border_loss_map + loss
Traceback of the error-
MemoryError Traceback (most recent call last)
<ipython-input-30-f0a595b8de7e> in <module>
1 # train the model
2 model_scratch = train(20, final_train_loader, unet, optimizer,
----> 3 criterion, train_on_gpu, 'model_scratch.pt')
<ipython-input-29-b481b4f3120e> in train(n_epochs, loaders, model, optimizer, criterion, use_cuda, save_path)
24 loss = criterion(output,target)
25 target.requires_grad = False
---> 26 w = make_weight_map(target)
27 loss = W*loss
28 loss.backward()
<ipython-input-5-e75a6281476f> in make_weight_map(masks)
33 X2, Y2 = np.nonzero(bounds)
34 xSum = (X2.reshape(-1, 1) - X1.reshape(1, -1)) ** 2
---> 35 ySum = (Y2.reshape(-1, 1) - Y1.reshape(1, -1)) ** 2
36 distMap[:, i] = np.sqrt(xSum + ySum).min(axis=0)
37 ix = np.arange(distMap.shape[0])
MemoryError:
Your final_train_loader provides you with an input image data and the expected pixel-wise labeling target. I assume (following pytorch's conventions) that data is of shape B-3-H-W and of dtype=torch.float.
More importantly, target is of shape B-H-W and of dtype=torch.long.
On the other hand make_weight_map expects its input to be C-H-W (with C = number of classes, NOT batch size), of type numpy array.
Try providing make_weight_map the input mask as it expects it and see if you get similar errors.
I also recommend that you visualize the resulting weight map - to make sure your function does what you expect it to do.

Simple Multilayer Perceptron model does not converge in TensorFlow

I am new to TensorFlow. Today I tried to implement my first model in TF but it returned strange results. I know that I am missing something here but I was not able to figure it out. Here is the story.
Model
I have a simple Multilayer Perceptron model with only a single hidden layer, applied to the MNIST database. Layers are defined as [input(784), hidden_layer(470), output_layer(10)] with tanh as the non-linearity for the hidden layer and softmax as the loss for the output layer. The optimizer I am using is the Gradient Descent Algorithm with a learning rate of 0.01. My mini-batch size is 1 (I am training the model with samples one by one).
My implementations :
First I implemented my model in C++ and got around 96% accuracy.Here is the repository : https://github.com/amin2ros/Artificog
I implemented the exact model in TensorFlow but surprisingly the model didn't converge at all. Here is the code.
Code:
import sys
import input_data
import matplotlib.pyplot as plt
from pylab import *
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
import tensorflow as tf
# Parameters
# NOTE(review): `learning_rate` is not referenced in the visible script; the
# optimizer below is built with a literal 0.01.
learning_rate = 0.1
training_epochs = 1
batch_size = 1
display_step = 1  # not referenced in the visible script
# Network Parameters
n_hidden_1 = 470 # 1st layer num features
n_input = 784 # MNIST data input (img shape: 28*28)
n_classes = 10 # MNIST total classes (0-9 digits)
# tf Graph input
# Leading dimension None: the batch size is left unspecified.
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])
# Create model
def multilayer_perceptron(_X, _weights, _biases):
    """Single-hidden-layer perceptron: tanh hidden layer, linear output.

    Returns the output layer's pre-activation values (no softmax applied).
    """
    hidden_pre = tf.add(tf.matmul(_X, _weights['h1']), _biases['b1'])
    hidden = tf.tanh(hidden_pre)
    output_pre = tf.matmul(hidden, _weights['out'])
    return output_pre + _biases['out']
# Store layers weight & bias
# Every parameter is a trainable variable initialised from tf.random_normal.
# 'h1'/'b1' belong to the hidden layer, 'out' to the output layer.
weights = {
'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
'out': tf.Variable(tf.random_normal([n_hidden_1, n_classes]))
}
biases = {
'b1': tf.Variable(tf.random_normal([n_hidden_1])),
'out': tf.Variable(tf.random_normal([n_classes]))
}
# Construct model
pred = multilayer_perceptron(x, weights, biases)
# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax(pred)) # Softmax loss
optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(cost) #
# Initializing the variables
init = tf.initialize_all_variables()
# Launch the graph
with tf.Session() as sess:
sess.run(init)
# Training cycle
for epoch in range(training_epochs):
avg_cost = 0.
m= 0
total_batch = int(mnist.train.num_examples/batch_size)
counter=0
#print 'count = ' , total_batch
#sys.stdin.read(1)
# Loop over all batches
for i in range(total_batch):
batch_xs, batch_ys = mnist.train.next_batch(batch_size)
label = tf.argmax(batch_ys,1).eval()[0]
counter+=1
sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})
wrong_prediction = tf.not_equal(tf.argmax(pred, 1), tf.argmax(y, 1))
missed=tf.cast(wrong_prediction, "float")
m += missed.eval({x: batch_xs, y: batch_ys})[0]
print "Sample #", counter , " - Label : " , label , " - Prediction :" , tf.argmax(pred, 1).eval({x: batch_xs, y: batch_ys})[0] ,\
"- Missed = " , m , " - Error Rate = " , 100 * float(m)/counter
print "Optimization Finished!"
I am very curious why this happens. Any help is appreciated.
Edit:
As pointed out in the comments, the definition of the cost function was incorrect; it should be:
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred,y))
Now model converges :)