Simple Multilayer Perceptron model does not converge in TensorFlow - deep-learning

I am new to TensorFlow. Today I tried to implement my first model in TF, but it returned strange results. I know that I am missing something here, but I was not able to figure it out. Here is the story.
Model
I have a simple Multilayer Perceptron model with only a single hidden layer, applied to the MNIST database. The layers are [input(784), hidden_layer(470), output_layer(10)], with tanh as the non-linearity for the hidden layer and softmax as the loss for the output layer. The optimizer I am using is gradient descent with a learning rate of 0.01. My mini-batch size is 1 (I train the model on samples one by one).
My implementations:
First I implemented my model in C++ and got around 96% accuracy. Here is the repository: https://github.com/amin2ros/Artificog
I implemented the exact same model in TensorFlow, but surprisingly the model didn't converge at all. Here is the code.
Code:
import sys
import input_data
import matplotlib.pyplot as plt
from pylab import *
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
import tensorflow as tf

# Parameters
learning_rate = 0.1
training_epochs = 1
batch_size = 1
display_step = 1

# Network Parameters
n_hidden_1 = 470  # 1st layer num features
n_input = 784     # MNIST data input (img shape: 28*28)
n_classes = 10    # MNIST total classes (0-9 digits)

# tf Graph input
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])

# Create model
def multilayer_perceptron(_X, _weights, _biases):
    layer_1 = tf.tanh(tf.add(tf.matmul(_X, _weights['h1']), _biases['b1']))
    return tf.matmul(layer_1, _weights['out']) + _biases['out']

# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'out': tf.Variable(tf.random_normal([n_hidden_1, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

# Construct model
pred = multilayer_perceptron(x, weights, biases)

# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax(pred))  # Softmax loss
optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(cost)

# Initializing the variables
init = tf.initialize_all_variables()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        m = 0
        total_batch = int(mnist.train.num_examples / batch_size)
        counter = 0
        #print 'count = ' , total_batch
        #sys.stdin.read(1)
        # Loop over all batches
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            label = tf.argmax(batch_ys, 1).eval()[0]
            counter += 1
            sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})
            wrong_prediction = tf.not_equal(tf.argmax(pred, 1), tf.argmax(y, 1))
            missed = tf.cast(wrong_prediction, "float")
            m += missed.eval({x: batch_xs, y: batch_ys})[0]
            print "Sample #", counter, " - Label : ", label, " - Prediction :", tf.argmax(pred, 1).eval({x: batch_xs, y: batch_ys})[0], \
                "- Missed = ", m, " - Error Rate = ", 100 * float(m) / counter
    print "Optimization Finished!"
I am very curious why this happens. Any help is appreciated.
Edit:
As noted in the comments, the definition of the cost function was incorrect; it should be
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred,y))
Now the model converges :)
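For context: tf.nn.softmax(pred) only turns the logits into class probabilities, and since every row of a softmax sums to 1, tf.reduce_mean over the (batch, 10) output is always exactly 1/10 regardless of the weights, so the original cost has zero gradient and nothing is learned. softmax_cross_entropy_with_logits applies the softmax and the cross-entropy together, which is also the numerically stable way to do it. A minimal sketch of the corrected loss and optimizer, assuming a TF 1.x-style graph API (later 1.x releases expect the keyword form labels=/logits=):
# corrected cost: cross-entropy of the softmax over the logits, averaged over the batch
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=pred))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(cost)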

Related

System exit 1 error after obtaining NaN losses from finetuning Mask R-CNN in Pytorch

I am following this tutorial from PyTorch for finetuning a pre-trained model on my own dataset. I have my annotations in COCO format in a JSON file, so I first implemented the dataloader as follows:
import torch
import json
from torch.utils.data import Dataset
from pycocotools.coco import COCO
from PIL import Image
import os
import numpy as np
from torchvision import transforms
import Config
import transforms as T
from torchvision.transforms import functional as F

class CustomDataset(Dataset):
    def __init__(self, root, json_file, transform=None):
        self.root = root
        with open(json_file) as f:
            self.data = json.load(f)
        self.transform = transform
        self.image_ids = [img["id"] for img in self.data["images"]]
        self.imgs = list(sorted(os.listdir(os.path.join(root, "Images"))))
        self.masks = list(sorted(os.listdir(os.path.join(root, "Masks"))))

    def __getitem__(self, idx):
        # Get image ID
        img_id = self.image_ids[idx]
        img = next(image for image in self.data["images"] if image["id"] == img_id)
        img_path = os.path.join(self.root, "Images")
        mask_path = os.path.join(self.root, "Masks")
        # Load image
        image = Image.open(os.path.join(img_path, img['file_name'])).convert("RGB")
        # extract annotations from the json file
        annotations = [ann for ann in self.data["annotations"] if ann["image_id"] == img_id]
        # extract labels from annotations
        labels = [ann["label"] for ann in annotations]
        # convert labels to integers
        labels = [label for label in labels]
        labels = torch.as_tensor(labels, dtype=torch.int64)
        # extract boxes and convert them to format [x1, y1, x2, y2]
        boxes = [ann["bbox"] for ann in annotations]
        boxes = [[bbox[0], bbox[1], bbox[2], bbox[3]] for bbox in boxes]
        num_objects = len(boxes)
        # read the mask and include the number of objects in the first dimension
        mask = np.array(Image.open(os.path.join(mask_path, img['file_name'])).convert("L"))
        # Check if mask is empty
        if mask.size == 0:
            mask = np.zeros((num_objects, 1, 1), dtype=np.uint8)
        else:
            mask = np.expand_dims(mask, axis=0)
            mask = np.repeat(mask, num_objects, axis=0)
        # convert the binary mask array to a torch tensor
        mask = torch.as_tensor(mask, dtype=torch.uint8)
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objects,), dtype=torch.int64)
        # convert bboxes to tensors
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        # calculate the area of the bounding box
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # convert id to tensor
        image_id = torch.tensor([idx])
        # create target dictionary
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = mask
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd
        # apply the transform if any
        if self.transform is not None:
            image, target = self.transform(image, target)
        return image, target

    def __len__(self):
        return len(self.imgs)
and I am using this code for training:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from engine import train_one_epoch
import utils
import transforms as T
from dataloader import CustomDataset
import Config
import torch
import utils
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
from torchvision.transforms import functional as F

def get_instance_segmentation_model(num_classes):
    # load an instance segmentation model pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    # get the number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    # now get the number of input features for the mask classifier
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # and replace the mask predictor with a new one
    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                       hidden_layer,
                                                       num_classes)
    return model

def get_transform(train):
    transforms = []
    # converts the image, a PIL image, into a PyTorch Tensor
    transforms.append(T.PILToTensor())
    if train:
        # during training, randomly flip the training images
        # and ground-truth for data augmentation
        transforms.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(transforms)

json_path = 'annotations.json'
# use our dataset and defined transformations
dataset = CustomDataset(root=Config.Dataset_dir, json_file=json_path, transform=get_transform(train=True))
# for image, target in dataset:
#     print(image.shape)
# split the dataset in train and test set
torch.manual_seed(1)
indices = torch.randperm(len(dataset)).tolist()
dataset = torch.utils.data.Subset(dataset, indices[:-500])
dataset_test = torch.utils.data.Subset(dataset, indices[-500:])
# define training and validation data loaders
data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=1, shuffle=True, num_workers=4,
    collate_fn=utils.collate_fn)
data_loader_test = torch.utils.data.DataLoader(
    dataset_test, batch_size=1, shuffle=False, num_workers=4,
    collate_fn=utils.collate_fn)
device = Config.DEVICE
# our dataset has two classes only - background and person
num_classes = 2
# get the model using our helper function
model = get_instance_segmentation_model(num_classes)
# move model to the right device
model.to(device)
# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.1,
                            momentum=0.9, weight_decay=0.0005)
# and a learning rate scheduler which decreases the learning rate by
# 10x every 3 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=3,
                                               gamma=0.1)
# let's train it for 10 epochs
num_epochs = 10
for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, data_loader_test, device=device)
This training code, as stated in the tutorial, uses some helper functions which can be accessed from here. I have run the training code, and the training works for the first 10 samples in the data, but then it gives the following error:
Epoch: [0] [ 0/2759] eta: 13:29:50 lr: 0.000200 loss: -136.8811 (-136.8811) loss_classifier: 0.9397 (0.9397) loss_box_reg: 0.0017 (0.0017) loss_mask: -137.9142 (-137.9142) loss_objectness: 0.0859 (0.0859) loss_rpn_box_reg: 0.0057 (0.0057) time: 17.6117 data: 10.0775
Loss is nan, stopping training
{'loss_classifier': tensor(nan, grad_fn=<NllLossBackward0>), 'loss_box_reg': tensor(nan, grad_fn=<DivBackward0>), 'loss_mask': tensor(nan, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), 'loss_objectness': tensor(nan, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), 'loss_rpn_box_reg': tensor(nan, grad_fn=<DivBackward0>)}
An exception has occurred, use %tb to see the full traceback.
SystemExit: 1
This error is raised from the train_one_epoch function in engine.py, specifically from this part of the function:
with torch.cuda.amp.autocast(enabled=scaler is not None):
    loss_dict = model(images, targets)
    losses = sum(loss for loss in loss_dict.values())

# reduce losses over all GPUs for logging purposes
loss_dict_reduced = utils.reduce_dict(loss_dict)
losses_reduced = sum(loss for loss in loss_dict_reduced.values())
loss_value = losses_reduced.item()

if not math.isfinite(loss_value):
    print(f"Loss is {loss_value}, stopping training")
    print(loss_dict_reduced)
    sys.exit(1)
This indicates that the losses returned after the first loop are NaN. What could be wrong here? I am running out of ideas and don't know what's going wrong anymore.
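One thing that can be checked cheaply before calling train_one_epoch is whether the targets produced by CustomDataset are well formed. The sketch below is a hypothetical sanity check (not from the tutorial): it assumes the dataset object built in the training script above and verifies that every box has positive width and height and that the mask values are strictly 0/1. The reason for the second check is that loss_mask is a binary-cross-entropy-with-logits term, and mask targets outside [0, 1] (e.g. 0..255 grayscale values) can drive it negative, as in the -137.91 value in the log, or to NaN; whether that is the actual cause here is an assumption.
# hypothetical sanity-check helper, assuming the CustomDataset / dataset objects above
import torch

def check_targets(dataset, n=20):
    for idx in range(min(n, len(dataset))):
        image, target = dataset[idx]
        boxes = target["boxes"]
        masks = target["masks"]
        # boxes must be [x1, y1, x2, y2] with positive width and height,
        # otherwise the box-regression losses can degenerate
        widths = boxes[:, 2] - boxes[:, 0]
        heights = boxes[:, 3] - boxes[:, 1]
        assert (widths > 0).all() and (heights > 0).all(), f"degenerate box at index {idx}"
        # mask targets must be binary (0/1) for the BCE-with-logits mask loss
        assert set(torch.unique(masks).tolist()) <= {0, 1}, f"non-binary mask at index {idx}"

check_targets(dataset)
Also note that if the JSON stores COCO-style [x, y, width, height] boxes, they would need converting to [x1, y1, x2, y2] first; the dataloader above copies them unchanged.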

PyTorch: Confusion Matrix for Transfer Learning

I've been trying to plot a confusion matrix for the below code - check def train_alexnet(). But I keep getting this error:
IndexError: only integers, slices (`:`), ellipsis (`...`), None and long or byte Variables are valid indices (got float)
So, I tried converting my tensors to an integer tensor but then got the error:
ValueError: only one element tensors can be converted to Python scalars
Can someone suggest what can be done to convert the tensors 'all_preds' and 'source_value' to tensors containing integer values? I found the torch.no_grad option, but I am unsure how to use it because I'm new to PyTorch.
Here's the link of the github repo that I'm trying to work with: https://github.com/syorami/DDC-transfer-learning/blob/master/DDC.py
from __future__ import print_function
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import warnings
warnings.filterwarnings('ignore')
import math
import model
import torch
import dataloader
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix
from plotcm import plot_confusion_matrix
from torch import nn
from torch import optim
from torch.autograd import Variable

cuda = torch.cuda.is_available()

def step_decay(epoch, learning_rate):
    # learning rate step decay
    # :param epoch: current training epoch
    # :param learning_rate: initial learning rate
    # :return: learning rate after step decay
    initial_lrate = learning_rate
    drop = 0.8
    epochs_drop = 10.0
    lrate = initial_lrate * math.pow(drop, math.floor((1 + epoch) / epochs_drop))
    return lrate

def train_alexnet(epoch, model, learning_rate, source_loader):
    # train source on alexnet
    # :param epoch: current training epoch
    # :param model: defined alexnet
    # :param learning_rate: initial learning rate
    # :param source_loader: source loader
    # :return:
    log_interval = 10
    LEARNING_RATE = step_decay(epoch, learning_rate)
    print(f'Learning Rate: {LEARNING_RATE}')
    optimizer = optim.SGD([
        {'params': model.features.parameters()},
        {'params': model.classifier.parameters()},
        {'params': model.final_classifier.parameters(), 'lr': LEARNING_RATE}
    ], lr=LEARNING_RATE / 10, momentum=MOMENTUM, weight_decay=L2_DECAY)
    # enter training mode
    model.train()
    iter_source = iter(source_loader)
    num_iter = len(source_loader)
    correct = 0
    total_loss = 0
    clf_criterion = nn.CrossEntropyLoss()
    all_preds = torch.tensor([])
    source_value = torch.tensor([])
    for i in range(1, num_iter):
        source_data, source_label = iter_source.next()
        # print("source label: ", source_label)
        if cuda:
            source_data, source_label = source_data.cuda(), source_label.cuda()
        source_data, source_label = Variable(source_data), Variable(source_label)
        optimizer.zero_grad()
        source_preds = model(source_data)
        preds = source_preds.data.max(1, keepdim=True)[1]
        correct += preds.eq(source_label.data.view_as(preds)).sum()
        # prediction label
        all_preds = torch.cat(
            (all_preds, preds)
            , dim=0
        )
        # actual label
        source_value = torch.cat(
            (source_value, source_label)
            , dim=0
        )
        loss = clf_criterion(source_preds, source_label)
        total_loss += loss
        loss.backward()
        optimizer.step()
        if i % log_interval == 0:
            print('Train Epoch {}: [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, i * len(source_data), len(source_loader) * BATCH_SIZE,
                100. * i / len(source_loader), loss.item()))
    total_loss /= len(source_loader)
    acc_train = float(correct) * 100. / (len(source_loader) * BATCH_SIZE)
    # print('all preds= ', int(all_preds))
    # print("source value", int(source_value))
    stacked = torch.stack(
        (
            source_value
            , (all_preds.argmax(dim=1))
        )
        , dim=1
    )
    print("stacked", stacked)
    cmt = torch.zeros(3, 3, dtype=torch.float64)
    with torch.no_grad():
        for p in stacked:
            tl, pl = p.tolist()
            cmt[tl, pl] = cmt[tl, pl] + 1
    print("cmt: ", cmt)
    print('{} set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'.format(
        SOURCE_NAME, total_loss.item(), correct, len(source_loader.dataset), acc_train))

def test_alexnet(model, target_loader):
    # test target data on fine-tuned alexnet
    # :param model: trained alexnet on source data set
    # :param target_loader: target dataloader
    # :return: correct num
    # enter evaluation mode
    clf_criterion = nn.CrossEntropyLoss()
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in target_test_loader:
        if cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data, volatile=True), Variable(target)
        target_preds = model(data)
        test_loss += clf_criterion(target_preds, target)  # sum up batch loss
        pred = target_preds.data.max(1)[1]  # get the index of the max log-probability
        correct += pred.eq(target.data.view_as(pred)).cpu().sum()
        stacked = torch.stack(
            (
                target
                , target_preds.argmax(dim=1)
            )
            , dim=1
        )
        print("stacked target", stacked)
    test_loss /= len(target_loader)
    print('{} set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        TARGET_NAME, test_loss.item(), correct, len(target_loader.dataset),
        100. * correct / len(target_loader.dataset)))
    return correct

def compute_confusion_matrix(preds, y):
    # round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    return confusion_matrix(y, rounded_preds)

if __name__ == '__main__':
    ROOT_PATH = './v1234_combined/pets'
    SOURCE_NAME = 'v123'
    TARGET_NAME = 'v4'
    BATCH_SIZE = 15
    TRAIN_EPOCHS = 1
    learning_rate = 1e-2
    L2_DECAY = 5e-4
    MOMENTUM = 0.9
    source_loader = dataloader.load_training(ROOT_PATH, SOURCE_NAME, BATCH_SIZE)
    # target_train_loader = dataloader.load_training(ROOT_PATH, TARGET_NAME, BATCH_SIZE)
    target_test_loader = dataloader.load_testing(ROOT_PATH, TARGET_NAME, BATCH_SIZE)
    print('Load data complete')
    alexnet = model.Alexnet_finetune(num_classes=3)
    print('Construct model complete')
    # load pretrained alexnet model
    alexnet = model.load_pretrained_alexnet(alexnet)
    print('Load pretrained alexnet parameters complete\n')
    if cuda: alexnet.cuda()
    for epoch in range(1, TRAIN_EPOCHS + 1):
        print(f'Train Epoch {epoch}:')
        train_alexnet(epoch, alexnet, learning_rate, source_loader)
        correct = test_alexnet(alexnet, target_test_loader)
    print(len(source_loader.dataset))
In order to convert all elements of a tensor from floats to ints, you need to use .to():
all_preds_int = all_preds.to(torch.int64)
Note that it appears as if your all_preds are the predicted class probabilities and not the actual labels. You might need to torch.argmax along the appropriate dimension. (BTW, the output of argmax is int - no need to convert).
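A minimal sketch of building the confusion matrix once both tensors hold integer class indices (the values below are hypothetical stand-ins for the tensors accumulated in train_alexnet; if all_preds holds per-class scores instead, apply torch.argmax(dim=1) first, as noted above):
import torch
from sklearn.metrics import confusion_matrix

# hypothetical stand-ins for the accumulated tensors from train_alexnet
all_preds = torch.tensor([[0.], [2.], [1.], [2.]])   # shape (N, 1), as produced by max(1, keepdim=True)[1]
source_value = torch.tensor([0., 2., 2., 1.])        # shape (N,), ground-truth labels

preds_int = all_preds.squeeze(1).to(torch.int64)     # (N,) integer class indices
labels_int = source_value.to(torch.int64)

# scikit-learn expects plain integer arrays; .cpu() covers the CUDA case
cm = confusion_matrix(labels_int.cpu().numpy(), preds_int.cpu().numpy(), labels=[0, 1, 2])
print(cm)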

How to use keras to fine-tune inception v3 to do multi-class classification?

I want to use Keras to do a two-class image classification, using the Cat vs. Dog dataset from Kaggle.com.
But I have a problem with the param "class_mode", as in the code below.
If I use "binary" mode, accuracy is about 95%, but if I use "categorical" the accuracy is abnormally low, only around 50%.
binary mode means only one output in the last layer, with sigmoid activation to classify; each sample's label is a single integer.
categorical means two outputs in the last layer, with softmax activation to classify; each sample's label is in one-hot format, e.g. (1,0), (0,1).
I think these two setups should give similar results. Does anyone know the reason for the difference? Thanks very much!
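For reference, here is a minimal standalone sketch (hypothetical toy input shape, not part of the script below) of the two head/loss pairings described above; for a two-class problem they are essentially equivalent, so a large accuracy gap usually comes from something else in the training setup:
# standalone illustration of the "binary" vs "categorical" heads
from keras.models import Model
from keras.layers import Input, Dense, GlobalAveragePooling2D

inp = Input(shape=(8, 8, 32))          # stand-in for the InceptionV3 feature map
feat = GlobalAveragePooling2D()(inp)

# "binary": one sigmoid unit, integer labels 0/1, binary_crossentropy
binary_head = Dense(1, activation='sigmoid')(feat)
binary_model = Model(inputs=inp, outputs=binary_head)
binary_model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# "categorical": two softmax units, one-hot labels (1,0)/(0,1), categorical_crossentropy
cat_head = Dense(2, activation='softmax')(feat)
cat_model = Model(inputs=inp, outputs=cat_head)
cat_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])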
import os
import sys
import glob
import argparse
import matplotlib.pyplot as plt
from keras import __version__
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import SGD
set some params here
IM_WIDTH, IM_HEIGHT = 299, 299 #fixed size for InceptionV3
NB_EPOCHS = 1
BAT_SIZE = 32
FC_SIZE = 1024
NB_IV3_LAYERS_TO_FREEZE = 172
loss_mode = "binary_crossentropy"
def get_nb_files(directory):
    """Get number of files by searching directory recursively"""
    if not os.path.exists(directory):
        return 0
    cnt = 0
    for r, dirs, files in os.walk(directory):
        for dr in dirs:
            cnt += len(glob.glob(os.path.join(r, dr + "/*")))
    return cnt
Transfer learning: keep the weights in Inception V3.
def setup_to_transfer_learn(model, base_model):
    """Freeze all layers and compile the model"""
    for layer in base_model.layers:
        layer.trainable = False
    model.compile(optimizer='rmsprop', loss=loss_mode, metrics=['accuracy'])
Add the last layer to do two-class classification.
def add_new_last_layer(base_model, nb_classes):
    """Add last layer to the convnet
    Args:
        base_model: keras model excluding top
        nb_classes: # of classes
    Returns:
        new keras model with last layer
    """
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(FC_SIZE, activation='relu')(x)  # new FC layer, random init
    if args.class_mode == "binary":
        predictions = Dense(1, activation='sigmoid')(x)  # new sigmoid layer
    else:
        predictions = Dense(nb_classes, activation='softmax')(x)  # new softmax layer
    model = Model(inputs=base_model.input, outputs=predictions)
    return model
Freeze the bottom NB_IV3_LAYERS and retrain the remaining top layers, and fine tune weights.
def setup_to_finetune(model):
    """Freeze the bottom NB_IV3_LAYERS and retrain the remaining top layers.
    note: NB_IV3_LAYERS corresponds to the top 2 inception blocks in the inceptionv3 arch
    Args:
        model: keras model
    """
    for layer in model.layers[:NB_IV3_LAYERS_TO_FREEZE]:
        layer.trainable = False
    for layer in model.layers[NB_IV3_LAYERS_TO_FREEZE:]:
        layer.trainable = True
    model.compile(optimizer="rmsprop", loss=loss_mode, metrics=['accuracy'])
    #model.compile(optimizer=SGD(lr=0.0001, momentum=0.9), loss='categorical_crossentropy', metrics=['accuracy'])

def train(args):
    """Use transfer learning and fine-tuning to train a network on a new dataset"""
    nb_train_samples = get_nb_files(args.train_dir)
    nb_classes = len(glob.glob(args.train_dir + "/*"))
    nb_val_samples = get_nb_files(args.val_dir)
    nb_epoch = int(args.nb_epoch)
    batch_size = int(args.batch_size)
    print("nb_classes:{}".format(nb_classes))
Data preparation
    train_datagen = ImageDataGenerator(
        preprocessing_function=preprocess_input,
        rotation_range=30,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True
    )
    test_datagen = ImageDataGenerator(
        preprocessing_function=preprocess_input,
        rotation_range=30,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True
    )
    train_generator = train_datagen.flow_from_directory(
        args.train_dir,
        target_size=(IM_WIDTH, IM_HEIGHT),
        batch_size=batch_size,
        #class_mode='binary'
        class_mode=args.class_mode
    )
    validation_generator = test_datagen.flow_from_directory(
        args.val_dir,
        target_size=(IM_WIDTH, IM_HEIGHT),
        batch_size=batch_size,
        #class_mode='binary'
        class_mode=args.class_mode
    )
setup model
    base_model = InceptionV3(weights='imagenet', include_top=False)  # include_top=False excludes final FC layer
    model = add_new_last_layer(base_model, nb_classes)
transfer learning
    setup_to_transfer_learn(model, base_model)
    #model.summary()
    history_tl = model.fit_generator(
        train_generator,
        epochs=nb_epoch,
        steps_per_epoch=nb_train_samples//BAT_SIZE,
        validation_data=validation_generator,
        validation_steps=nb_val_samples//BAT_SIZE)
fine-tuning
    setup_to_finetune(model)
    history_ft = model.fit_generator(
        train_generator,
        steps_per_epoch=nb_train_samples//BAT_SIZE,
        epochs=nb_epoch,
        validation_data=validation_generator,
        validation_steps=nb_val_samples//BAT_SIZE)
    model.save(args.output_model_file)
    if args.plot:
        plot_training(history_ft)
def plot_training(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs = range(len(acc))
    plt.plot(epochs, acc, 'r.')
    plt.plot(epochs, val_acc, 'r')
    plt.title('Training and validation accuracy')
    plt.figure()
    plt.plot(epochs, loss, 'r.')
    plt.plot(epochs, val_loss, 'r-')
    plt.title('Training and validation loss')
    plt.show()
main func
if __name__ == "__main__":
    a = argparse.ArgumentParser()
    a.add_argument("--train_dir", default="train2")
    a.add_argument("--val_dir", default="test2")
    a.add_argument("--nb_epoch", default=NB_EPOCHS)
    a.add_argument("--batch_size", default=BAT_SIZE)
    a.add_argument("--output_model_file", default="inceptionv3-ft.model")
    a.add_argument("--plot", action="store_true")
    a.add_argument("--class_mode", default="binary")
    args = a.parse_args()
    if args.train_dir is None or args.val_dir is None:
        a.print_help()
        sys.exit(1)
    if args.class_mode != "binary" and args.class_mode != "categorical":
        print("set class_mode as 'binary' or 'categorical'")
    if args.class_mode == "categorical":
        loss_mode = "categorical_crossentropy"
    #set class_mode
    print("class_mode:{}, loss_mode:{}".format(args.class_mode, loss_mode))
    if (not os.path.exists(args.train_dir)) or (not os.path.exists(args.val_dir)):
        print("directories do not exist")
        sys.exit(1)
    train(args)
I had this problem on several tasks when the learning rate was too high. Try something like 0.0001 or even less.
According to the Keras documentation, the default rate is 0.001:
keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
See https://keras.io/optimizers/#rmsprop
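For example, a sketch of how the question's setup_to_finetune could pass a smaller rate explicitly (the 1e-4 value follows the suggestion above; NB_IV3_LAYERS_TO_FREEZE and loss_mode are the globals from the question's script):
from keras.optimizers import RMSprop

def setup_to_finetune(model):
    """Variant of setup_to_finetune with an explicit, smaller RMSprop learning rate."""
    for layer in model.layers[:NB_IV3_LAYERS_TO_FREEZE]:
        layer.trainable = False
    for layer in model.layers[NB_IV3_LAYERS_TO_FREEZE:]:
        layer.trainable = True
    # an order of magnitude (or two) below the 0.001 default for the fine-tuning stage
    model.compile(optimizer=RMSprop(lr=1e-4), loss=loss_mode, metrics=['accuracy'])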
I found that if I use the SGD or Adam optimizer, the accuracy goes up normally. So is there something wrong with using the RMSprop optimizer with the default learning rate of 0.001?

Getting dimensions wrong when creating a feed-forward auto-encoder in Theano/Lasagne

I want to create a simple autoencoder with 3000 input, 2 hidden and 3000 output neurons:
def build_autoencoder(input_var=None):
    l_in = InputLayer(shape=(None, 3000), input_var=input_var)
    l_hid = DenseLayer(
        l_in, num_units=2,
        nonlinearity=rectify,
        W=lasagne.init.GlorotUniform())
    l_out = DenseLayer(
        l_hid, num_units=3000,
        nonlinearity=softmax)
    return l_out
The shape of the training data is as follows:
train.shape = (3000,3)
This is the input, target and loss function definition:
import sys
import os
import time
import numpy as np
import theano
import theano.tensor as T
import lasagne
from lasagne.updates import rmsprop
from lasagne.layers import DenseLayer, DropoutLayer, InputLayer
from lasagne.nonlinearities import rectify, softmax
from lasagne.objectives import categorical_crossentropy
# Creating the Theano variables
input_var = T.dmatrix('inputs')
target_var = T.dmatrix('targets')
# Building the Theano expressions on these variables
network = build_autoencoder(input_var)
prediction = lasagne.layers.get_output(network)
loss = categorical_crossentropy(prediction, target_var)
loss = loss.mean()
test_prediction = lasagne.layers.get_output(network, deterministic=True)
test_loss = categorical_crossentropy(test_prediction, target_var)
test_loss = test_loss.mean()
test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                  dtype=theano.config.floatX)
I'm just running one epoch but get an error:
params = lasagne.layers.get_all_params(network, trainable=True)
updates = rmsprop(loss, params, learning_rate=0.001)
# Compiling the graph by declaring the Theano functions
train_fn = theano.function([input_var, target_var],
                           loss, updates=updates)
val_fn = theano.function([input_var, target_var],
                         [test_loss, test_acc])
# For loop that goes each time through the whole training
# and validation data
print("Starting training...")
for epoch in range(1):
    # Going over the training data
    train_err = 0
    train_batches = 0
    start_time = time.time()
    print 'test1'
    train_err += train_fn(train, train)
    train_batches += 1
    # Going over the validation data
    val_err = 0
    val_acc = 0
    val_batches = 0
    err, acc = val_fn(train, train)
    val_err += err
    val_acc += acc
    val_batches += 1
    # Then we print the results for this epoch:
    print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs, time.time() - start_time))
    print("training loss:\t\t{:.6f}".format(train_err / train_batches))
    print("validation loss:\t\t{:.6f}".format(val_err / val_batches))
    print("validation accuracy:\t\t{:.2f} %".format(val_acc / val_batches * 100))
This is the error:
ValueError: ('shapes (3000,3) and (3000,2) not aligned: 3 (dim 1) !=
3000 (dim 0)', (3000, 3), (3000, 2)) Apply node that caused the error:
Dot22(inputs, W) Toposort index: 3 Inputs types: [TensorType(float64,
matrix), TensorType(float64, matrix)] Inputs shapes: [(3000, 3),
(3000, 2)] Inputs strides: [(24, 8), (16, 8)] Inputs values: ['not
shown', 'not shown'] Outputs clients:
[[Elemwise{add,no_inplace}(Dot22.0, InplaceDimShuffle{x,0}.0),
Elemwise{Composite{(i0 * (Abs(i1) + i2 + i3))}}[(0,
2)](TensorConstant{(1, 1) of 0.5}, Elemwise{add,no_inplace}.0,
Dot22.0, InplaceDimShuffle{x,0}.0)]]
To me it seems that the bottleneck of the autoencoder is the problem. Any ideas?
Just got some help from my IBM colleague (Erwan). I've posted the working solution to this GIST; the relevant sections are these:
First, get the shape of the training data correct:
train.shape = (3, 3000)
Then use the same shape on the InputLayer:
def build_autoencoder(input_var=None):
    l_in = InputLayer(shape=(3, 3000), input_var=input_var)
    l_hid = DenseLayer(
        l_in, num_units=2,
        nonlinearity=rectify,
        W=lasagne.init.GlorotUniform())
    l_out = DenseLayer(
        l_hid, num_units=3000,
        nonlinearity=softmax)
    return l_out
So this is solved; the next problem is getting a decreasing cost during training, but that is another topic :)
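As a side note, here is a small sketch (assuming Lasagne's standard layer helpers; the random data is a hypothetical stand-in) of how the expected layer shapes can be printed and checked against the data before compiling, so a mismatch like the (3000, 3) x (3000, 2) dot product above shows up immediately:
import numpy as np
import lasagne

# hypothetical data: 3 samples of 3000 features, matching the corrected shape above
train = np.random.rand(3, 3000)

network = build_autoencoder()
# print the output shape of every layer in the stack
for layer in lasagne.layers.get_all_layers(network):
    print("{}: {}".format(type(layer).__name__, lasagne.layers.get_output_shape(layer)))
# the feature axis of the InputLayer must match the feature axis of the data
in_layer = lasagne.layers.get_all_layers(network)[0]
assert lasagne.layers.get_output_shape(in_layer)[1] == train.shape[1]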

Is Caffe's reported accuracy reliable?

I recently tried to get the confusion matrix for one of my trained models, to see how precise it is. I downloaded this script and fed it my model. To my astonishment, the accuracy calculated by the script is very different from the one Caffe reports.
I have used this script to calculate the confusion matrix; it reports the accuracy as well. The problem is that the accuracy reported by this script is way different from the one reported by Caffe! For example, Caffe reports the accuracy for, say, CIFAR10 as 92.34%, while when the model is fed to the script to compute the confusion matrix and its accuracy, the result is something like 86.5%!
Which of these accuracies is the correct one that can be reported in papers or compared with the results of other papers such as those here?
I also saw something weird: I trained two identical models, with only one difference, namely one used Xavier and the other used msra for initialization. The first one reports an accuracy of 94.25% and the other 94.26% in Caffe. When these models are fed to the script I linked above for the confusion matrix computation, their accuracies were 89.2% and 87.4% respectively!
Is this normal? What is the cause for this? msra?
Are the accuracies reported by Caffe true and reliable?
PS: The accuracy in the script is calculated as (complete script):
for i, image, label in reader:
    image_caffe = image.reshape(1, *image.shape)
    out = net.forward_all(data=np.asarray([ image_caffe ]))
    plabel = int(out['prob'][0].argmax(axis=0))
    count += 1
    iscorrect = label == plabel
    correct += (1 if iscorrect else 0)
    matrix[(label, plabel)] += 1
    labels_set.update([label, plabel])
    if not iscorrect:
        print("\rError: i=%s, expected %i but predicted %i" \
              % (i, label, plabel))
    sys.stdout.write("\rAccuracy: %.1f%%" % (100.*correct/count))
    sys.stdout.flush()
print(", %i/%i corrects" % (correct, count))
Which, imho, is OK and correct: the number of correct predictions divided by the total number of instances in the dataset.
I found the reason.
The reason for the mismatch between the accuracy generated by Caffe and the accuracy generated by the script in question was solely the mean subtraction, which was done in Caffe but not in the script.
This is the modified version of the script which takes this into account, and hopefully everything is just fine.
# Author: Axel Angel, copyright 2015, license GPLv3.
# added mean subtraction so that the accuracy can be reported accurately, just like Caffe, when doing mean subtraction
# Seyyed Hossein Hasan Pour
# Coderx7#Gmail.com
# 7/3/2016
import sys
import caffe
import numpy as np
import lmdb
import argparse
from collections import defaultdict

def flat_shape(x):
    "Returns x without singleton dimension, eg: (1,28,28) -> (28,28)"
    return x.reshape(filter(lambda s: s > 1, x.shape))

def lmdb_reader(fpath):
    import lmdb
    lmdb_env = lmdb.open(fpath)
    lmdb_txn = lmdb_env.begin()
    lmdb_cursor = lmdb_txn.cursor()
    for key, value in lmdb_cursor:
        datum = caffe.proto.caffe_pb2.Datum()
        datum.ParseFromString(value)
        label = int(datum.label)
        image = caffe.io.datum_to_array(datum).astype(np.uint8)
        yield (key, flat_shape(image), label)

def leveldb_reader(fpath):
    import leveldb
    db = leveldb.LevelDB(fpath)
    for key, value in db.RangeIter():
        datum = caffe.proto.caffe_pb2.Datum()
        datum.ParseFromString(value)
        label = int(datum.label)
        image = caffe.io.datum_to_array(datum).astype(np.uint8)
        yield (key, flat_shape(image), label)

def npz_reader(fpath):
    npz = np.load(fpath)
    xs = npz['arr_0']
    ls = npz['arr_1']
    for i, (x, l) in enumerate(np.array([ xs, ls ]).T):
        yield (i, x, l)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--proto', type=str, required=True)
    parser.add_argument('--model', type=str, required=True)
    parser.add_argument('--mean', type=str, required=True)
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--lmdb', type=str, default=None)
    group.add_argument('--leveldb', type=str, default=None)
    group.add_argument('--npz', type=str, default=None)
    args = parser.parse_args()

    # Extract mean from the mean image file
    mean_blobproto_new = caffe.proto.caffe_pb2.BlobProto()
    f = open(args.mean, 'rb')
    mean_blobproto_new.ParseFromString(f.read())
    mean_image = caffe.io.blobproto_to_array(mean_blobproto_new)
    f.close()

    count = 0
    correct = 0
    matrix = defaultdict(int)  # (real,pred) -> int
    labels_set = set()

    # CNN reconstruction and loading the trained weights
    net = caffe.Net(args.proto, args.model, caffe.TEST)
    caffe.set_mode_cpu()
    print "args", vars(args)

    if args.lmdb != None:
        reader = lmdb_reader(args.lmdb)
    if args.leveldb != None:
        reader = leveldb_reader(args.leveldb)
    if args.npz != None:
        reader = npz_reader(args.npz)

    for i, image, label in reader:
        image_caffe = image.reshape(1, *image.shape)
        out = net.forward_all(data=np.asarray([ image_caffe ]) - mean_image)
        plabel = int(out['prob'][0].argmax(axis=0))
        count += 1
        iscorrect = label == plabel
        correct += (1 if iscorrect else 0)
        matrix[(label, plabel)] += 1
        labels_set.update([label, plabel])
        if not iscorrect:
            print("\rError: i=%s, expected %i but predicted %i" \
                  % (i, label, plabel))
        sys.stdout.write("\rAccuracy: %.1f%%" % (100.*correct/count))
        sys.stdout.flush()
    print(", %i/%i corrects" % (correct, count))

    print ""
    print "Confusion matrix:"
    print "(r , p) | count"
    for l in labels_set:
        for pl in labels_set:
            print "(%i , %i) | %i" % (l, pl, matrix[(l,pl)])