Google Colab RAM issue with semi-supervised CNN model training - deep-learning

I'm trying to train a binary classifier by transfer learning on EfficientNet. Since I have lots of unlabeled data, I use a semi-supervised method to generate "pseudo-labeled" data before the model goes through each epoch.
Since Colab has limited RAM, I delete some large variables (numpy arrays, datasets, dataloaders, ...) in each loop; however, RAM usage still increases every loop, as in the screenshot below.
Below is my training loop, which consists of three main parts: the semi-supervised part, the training loop, and the validation loop.
I'm not sure which step causes the RAM to keep increasing each epoch.
(1) semi-supervised part
for epoch in range(n_epochs):
    print(f"[ Epoch | {epoch + 1:03d}/{n_epochs:03d} ]")
    if do_semi:
        model.eval()
        dataset_0 = []
        dataset_1 = []
        for img in pseudo_loader:
            with torch.no_grad():
                logits = model(img.to(device))
                probs = softmax(logits)
            # Filter the data and construct a new dataset.
            for i in range(len(probs)):
                p = probs[i].tolist()
                idx = p.index(max(p))
                if p[idx] >= threshold:
                    if idx == 0:
                        dataset_0.append(img[i].numpy().reshape(128, 128, 3))
                    else:
                        dataset_1.append(img[i].numpy().reshape(128, 128, 3))
        # stratified sampling with labels
        len_0, len_1 = len(dataset_0), len(dataset_1)
        print('label 0: ', len_0)
        print('label 1: ', len_1)
        # since there may be RAM memory error, restrict to 1000
        if len_0 > 1000:
            dataset_0 = random.sample(dataset_0, 1000)
        if len_1 > 1000:
            dataset_1 = random.sample(dataset_1, 1000)
        if len_0 == len_1:
            pseudo_x = np.array(dataset_0 + dataset_1)
            pseudo_y = ['0' for _ in range(len(dataset_0))] + ['1' for _ in range(len(dataset_1))]
        elif len_0 > len_1:
            dataset_0 = random.sample(dataset_0, len(dataset_1))
            pseudo_x = np.array(dataset_0 + dataset_1)
            pseudo_y = ['0' for _ in range(len(dataset_0))] + ['1' for _ in range(len(dataset_1))]
        else:
            dataset_1 = random.sample(dataset_1, len(dataset_0))
            pseudo_x = np.array(dataset_0 + dataset_1)
            pseudo_y = ['0' for _ in range(len(dataset_0))] + ['1' for _ in range(len(dataset_1))]
        if len(pseudo_x) != 0:
            new_dataset = CustomTensorDataset(pseudo_x, np.array(pseudo_y), 'pseudo')
        else:
            new_dataset = []
        # print how many pseudo label data added
        print('Total number of pseudo labeled data added: ', len(new_dataset))
        # release RAM
        dataset_0 = None
        dataset_1 = None
        pseudo_x = None
        pseudo_y = None
        del dataset_0, dataset_1, pseudo_x, pseudo_y
        gc.collect()
        # Turn off the eval mode.
        model.train()
        concat_dataset = ConcatDataset([train_set, new_dataset])
        train_loader = DataLoader(concat_dataset, batch_size=batch_size, shuffle=True)
I'm quite sure the problem happens in the semi-supervised part, since RAM usage does not increase when I don't apply it.
Thanks for your help!!
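As a rough sanity check on the sizes involved, here is a small self-contained sketch of how much RAM the pseudo-labeled lists themselves occupy, plus one possible mitigation (storing uint8 instead of float32). The uint8 conversion assumes CustomTensorDataset can cast back to float in __getitem__, which is not shown in the post, so treat it as a hypothesis rather than a fix.

import numpy as np

# Rough memory accounting for the pseudo-labeled images kept in Python lists.
# Each stored image is a float32 array of shape (128, 128, 3), as in the loop above.
bytes_per_image = 128 * 128 * 3 * 4                    # float32 -> 4 bytes per element
print(bytes_per_image / 1024, "KiB per image")         # 192 KiB

# With up to 1000 images per class, the pseudo dataset alone holds roughly:
print(2 * 1000 * bytes_per_image / 1024 ** 2, "MiB")   # ~375 MiB per epoch

# One possible mitigation (an assumption on my part, not something from the
# original post): keep the filtered images as uint8 and convert back to float
# only inside the Dataset, which cuts this footprint by 4x.
img_f32 = np.random.rand(128, 128, 3).astype(np.float32)
img_u8 = (img_f32 * 255).astype(np.uint8)              # 4x smaller in RAM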

Related

RuntimeError: shape '[128, -1]' is invalid for input of size 378 pytorch

I'm running a spiking neural network for data that has 21 features with a batch size of 128. I get the following error after many iterations of training (this error doesn't arise immediately!):
RuntimeError: shape '[128, -1]' is invalid for input of size 378 pytorch
When I went to go print out what the shapes of the tensors are before, I get the following:
Train
torch.Size([128, 21])
Test
torch.Size([128, 21])
This is my network:
class SpikingNeuralNetwork(nn.Module):
    """
    Parameters in SpikingNeuralNetwork class:
    1. number_inputs: Number of inputs to the SNN.
    2. number_hidden: Number of hidden layers.
    3. number_outputs: Number of output classes.
    4. beta: Decay rate.
    """
    def __init__(self, number_inputs, number_hidden, number_outputs, beta):
        super().__init__()
        self.number_inputs = number_inputs
        self.number_hidden = number_hidden
        self.number_outputs = number_outputs
        self.beta = beta

        # Initialize layers
        self.fc1 = nn.Linear(self.number_inputs, self.number_hidden)  # Applies linear transformation to all input points
        self.lif1 = snn.Leaky(beta = self.beta)  # Integrates weighted input over time, emitting a spike if threshold condition is met
        self.fc2 = nn.Linear(self.number_hidden, self.number_outputs)  # Applies linear transformation to output spikes of lif1
        self.lif2 = snn.Leaky(beta = self.beta)  # Another spiking neuron, integrating the weighted spikes over time

    """
    Forward propagation of the SNN. The forward function below will only be called once the
    input argument x is explicitly passed into net.
    #param x: input passed into the network
    #return layer of output after applying final spiking neuron
    """
    def forward(self, x):
        num_steps = 25

        # Initialize hidden states at t = 0
        mem1 = self.lif1.init_leaky()
        mem2 = self.lif2.init_leaky()

        # Record the final layer
        spk2_rec = []
        mem2_rec = []

        for step in range(num_steps):
            cur1 = self.fc1(x)
            spk1, mem1 = self.lif1(cur1, mem1)
            cur2 = self.fc2(spk1)
            spk2, mem2 = self.lif2(cur2, mem2)
            spk2_rec.append(spk2)
            mem2_rec.append(mem2)

        return torch.stack(spk2_rec, dim = 0), torch.stack(mem2_rec, dim = 0)
This is my training loop:
def training_loop(net, train_loader, test_loader, dtype, device, optimizer):
    num_epochs = 1
    loss_history = []
    test_loss_history = []
    counter = 0

    # Temporal dynamics
    num_steps = 25

    # Outer training loop
    for epoch in range(num_epochs):
        iter_counter = 0
        train_batch = iter(train_loader)

        # Minibatch training loop
        for data, targets in train_batch:
            data = data.to(device)
            targets = targets.to(device)

            # Forward pass
            net.train()
            print("Train")
            print(data.size())
            spk_rec, mem_rec = net(data.view(batch_size, -1))

            # Initialize the loss and sum over time
            loss_val = torch.zeros((1), dtype = dtype, device = device)
            for step in range(num_steps):
                loss_val += loss_function(mem_rec[step], targets.long().flatten().to(device))

            # Gradient calculation and weight update
            optimizer.zero_grad()
            loss_val.backward()
            optimizer.step()

            # Store loss history for future plotting
            loss_history.append(loss_val.item())

            # Test set
            with torch.no_grad():
                net.eval()
                test_data, test_targets = next(iter(test_loader))
                test_data = test_data.to(device)
                test_targets = test_targets.to(device)

                # Test set forward pass
                print("Test")
                print(test_data.size())
                test_spk, test_mem = net(test_data.view(batch_size, -1))

                # Test set loss
                test_loss = torch.zeros((1), dtype = dtype, device = device)
                for step in range(num_steps):
                    test_loss += loss_function(test_mem[step], test_targets.long().flatten().to(device))
                test_loss_history.append(test_loss.item())

                # Print train/test loss and accuracy
                if counter % 50 == 0:
                    train_printer(epoch, iter_counter, counter, loss_history, data, targets, test_data, test_targets)
                counter = counter + 1
                iter_counter = iter_counter + 1

    return loss_history, test_loss_history
The error occurs on spk_rec, mem_rec = net(data.view(batch_size, -1)).
The code was adapted from https://snntorch.readthedocs.io/en/latest/tutorials/tutorial_5.html, where it was originally used for the MNIST dataset. However, I am not working with an image dataset; I am working with a dataset that has 21 features and predicts just one target (with 100 classes). I tried to change data.view(batch_size, -1) and test_data.view(batch_size, -1) to data.view(batch_size, 21) and test_data.view(batch_size, 21) based on some other forum answers I saw, and my program is running through the training loop for now. Does anyone have any suggestions for how I can run through the training with no errors?
EDIT: I now get the error RuntimeError: shape '[128, 21]' is invalid for input of size 378 from spk_rec, mem_rec = net(data.view(batch_size, -1)).
Here are my DataLoaders:
train_loader = DataLoader(dataset = train, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(dataset = test, batch_size = batch_size, shuffle = True)
My batch size is 128.
Trying to run this myself to look into your problem, I'm also missing a few pieces: the net params and snn.Leaky.
import torch
from torch import nn
from torch.utils.data import DataLoader


class SpikingNeuralNetwork(nn.Module):
    """
    Parameters in SpikingNeuralNetwork class:
    1. number_inputs: Number of inputs to the SNN.
    2. number_hidden: Number of hidden layers.
    3. number_outputs: Number of output classes.
    4. beta: Decay rate.
    """
    def __init__(self, number_inputs, number_hidden, number_outputs, beta):
        super().__init__()
        self.number_inputs = number_inputs
        self.number_hidden = number_hidden
        self.number_outputs = number_outputs
        self.beta = beta

        # Initialize layers
        self.fc1 = nn.Linear(self.number_inputs,
                             self.number_hidden)  # Applies linear transformation to all input points
        self.lif1 = snn.Leaky(
            beta=self.beta)  # Integrates weighted input over time, emitting a spike if threshold condition is met
        self.fc2 = nn.Linear(self.number_hidden,
                             self.number_outputs)  # Applies linear transformation to output spikes of lif1
        self.lif2 = snn.Leaky(beta=self.beta)  # Another spiking neuron, integrating the weighted spikes over time

    """
    Forward propagation of the SNN. The forward function below will only be called once the
    input argument x is explicitly passed into net.
    #param x: input passed into the network
    #return layer of output after applying final spiking neuron
    """
    def forward(self, x):
        num_steps = 25

        # Initialize hidden states at t = 0
        mem1 = self.lif1.init_leaky()
        mem2 = self.lif2.init_leaky()

        # Record the final layer
        spk2_rec = []
        mem2_rec = []

        for step in range(num_steps):
            cur1 = self.fc1(x)
            spk1, mem1 = self.lif1(cur1, mem1)
            cur2 = self.fc2(spk1)
            spk2, mem2 = self.lif2(cur2, mem2)
            spk2_rec.append(spk2)
            mem2_rec.append(mem2)

        return torch.stack(spk2_rec, dim=0), torch.stack(mem2_rec, dim=0)


batch_size = 2
train = torch.rand(128, 21)
test = torch.rand(128, 21)

train_loader = DataLoader(dataset=train, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test, batch_size=batch_size, shuffle=True)

net = SpikingNeuralNetwork(number_inputs=1)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.1)
def training_loop(net, train_loader, test_loader, dtype, device, optimizer):
    num_epochs = 1
    loss_history = []
    test_loss_history = []
    counter = 0

    # Temporal dynamics
    num_steps = 25

    # Outer training loop
    for epoch in range(num_epochs):
        iter_counter = 0
        train_batch = iter(train_loader)

        # Minibatch training loop
        for data, targets in train_batch:
            data = data.to(device)
            targets = targets.to(device)

            # Forward pass
            net.train()
            print("Train")
            print(data.size())
            spk_rec, mem_rec = net(data.view(batch_size, -1))

            # Initialize the loss and sum over time
            loss_val = torch.zeros((1), dtype=dtype, device=device)
            for step in range(num_steps):
                loss_val += loss_function(mem_rec[step], targets.long().flatten().to(device))

            # Gradient calculation and weight update
            optimizer.zero_grad()
            loss_val.backward()
            optimizer.step()

            # Store loss history for future plotting
            loss_history.append(loss_val.item())

            # Test set
            with torch.no_grad():
                net.eval()
                test_data, test_targets = next(iter(test_loader))
                test_data = test_data.to(device)
                test_targets = test_targets.to(device)

                # Test set forward pass
                print("Test")
                print(test_data.size())
                test_spk, test_mem = net(test_data.view(batch_size, -1))

                # Test set loss
                test_loss = torch.zeros((1), dtype=dtype, device=device)
                for step in range(num_steps):
                    test_loss += loss_function(test_mem[step], test_targets.long().flatten().to(device))
                test_loss_history.append(test_loss.item())

                # Print train/test loss and accuracy
                if counter % 50 == 0:
                    train_printer(epoch, iter_counter, counter, loss_history, data, targets, test_data, test_targets)
                counter = counter + 1
                iter_counter = iter_counter + 1

    return loss_history, test_loss_history
Your code works just fine on the MNIST dataset, so I think it might be a problem with how the DataLoader is being called. My guess is that the total dataset is not evenly divisible by your batch_size, so the final batch is smaller than 128 (note that 378 = 18 × 21, i.e. a last batch of 18 samples with 21 features each). If this is true, then you have two options, sketched after this list:
Instead of spk_rec, mem_rec = net(data.view(batch_size, -1)), try spk_rec, mem_rec = net(data.flatten(1)), which preserves the first dimension of your data.
Alternatively, set drop_last=True in the DataLoader calls.
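For concreteness, here is a minimal runnable sketch of both options on dummy data shaped like yours (21 features, a dataset size that the batch size does not divide); the tensor and variable names are placeholders, not taken from your code.

import torch
from torch.utils.data import DataLoader, TensorDataset

batch_size = 100                            # deliberately does not divide 128
features = torch.rand(128, 21)
labels = torch.randint(0, 100, (128,))
dataset = TensorDataset(features, labels)

# Option 1: flatten(1) keeps whatever the actual batch dimension is,
# so the smaller final batch no longer breaks the reshape.
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
for data, targets in loader:
    print(data.flatten(1).shape)            # (100, 21), then (28, 21)

# Option 2: drop_last=True discards the incomplete final batch entirely,
# so data.view(batch_size, -1) always sees exactly batch_size samples.
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)
for data, targets in loader:
    print(data.view(batch_size, -1).shape)  # always (100, 21)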

Finding the number of nodes and gpus for DistributedDataParallel

I would like to know what numbers I should select for nodes and gpus.
I use Tesla V100-SXM2 (8 boards).
I tried:
nodes=1, gpus=1 (only the first gpu works)
nodes=1, gpus=8 (it took a very long time and could not execute)
Did I get the wrong parameters for nodes and gpus, or is my code wrong? I would appreciate it if you could help me out. The code below is a simplified sample of DDP.
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('--epochs', default=200, type=int, metavar='N',
                        help='number of total epochs to run')
    args = parser.parse_args()
    args.world_size = args.gpus * args.nodes
    os.environ['MASTER_ADDR'] = 'host1'
    os.environ['MASTER_PORT'] = '7777'
    mp.spawn(train, nprocs=args.gpus, args=(args,))


def train(gpu, args):
    rank = args.nr * args.gpus + gpu
    dist.init_process_group(
        backend='nccl',
        init_method='env://',
        world_size=args.world_size,
        rank=rank
    )
    torch.manual_seed(0)
    model = ConvNet()
    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    batch_size = 100

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)

    # Wrapper around our model to handle parallel training
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])

    # Data loading code
    train_dataset = get_datasets()

    # Sampler that takes care of the distribution of the batches such that
    # the data is not repeated in the iteration and sampled accordingly
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset,
        num_replicas=args.world_size,
        rank=rank
    )

    # We pass in the train_sampler which can be used by the DataLoader
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)

    start = datetime.now()
    total_step = len(train_loader)
    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (i + 1) % 100 == 0 and gpu == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch + 1,
                    args.epochs,
                    i + 1,
                    total_step,
                    loss.item())
                )
    if gpu == 0:
        print("Training complete")

PyTorch: Dropout (?) causes different model convergence for training+validation vs. training-only

We are facing a very strange issue. We tested the exact same model in two different "execution" settings. In the first case, given a certain number of epochs, we train using mini-batches for one epoch, and thereafter we test on the validation set following the same criteria. Then, we go on to the next epoch. Clearly, before each training epoch we use model.train(), and before validation we turn on model.eval().
Then we take the exact same model (same init, same dataset, same epochs, etc.) and we just train it, without validation after each epoch.
Looking only at performance on the training set, we observed that, even though we fixed all seeds, the two training procedures evolve differently and produce quite different metric results (losses, accuracy, and so on). Specifically, the training-only procedure performs worse.
We also observe the following things:
It is not a reproducibility issue, because multiple executions of the same procedure produce exactly the same results (and this is intended);
Removing the dropout, the problem appears to vanish;
The Batchnorm1d layer, which also behaves differently between training and evaluation, seems to work properly;
The issue still happens if we move training from TPUs to CPUs.
We are working with, and have tried, PyTorch 1.6, PyTorch nightly, and XLA 1.6.
We have lost a full day trying to tackle this issue (and no, we cannot avoid using dropout). Does anyone have any idea about how to solve this?
Thank you very much!
P.S. Here is the code employed for the training (on CPU).
def sigmoid(x):
    return 1 / (1 + torch.exp(-x))


def _run(model, EPOCHS, training_data_in, validation_data_in=None):

    def train_fn(train_dataloader, model, optimizer, criterion):
        running_loss = 0.
        running_accuracy = 0.
        running_tp = 0.
        running_tn = 0.
        running_fp = 0.
        running_fn = 0.

        model.train()

        for batch_idx, (ecg, spo2, labels) in enumerate(train_dataloader, 1):
            optimizer.zero_grad()
            outputs = model(ecg)
            loss = criterion(outputs, labels)
            loss.backward()  # calculate the gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()  # update the network weights
            running_loss += loss.item()
            predicted = torch.round(sigmoid(outputs.data))  # here determining the sigmoid, not included in the model
            running_accuracy += (predicted == labels).sum().item() / labels.size(0)
            fp = ((predicted - labels) == 1.).sum().item()
            fn = ((predicted - labels) == -1.).sum().item()
            tp = ((predicted + labels) == 2.).sum().item()
            tn = ((predicted + labels) == 0.).sum().item()
            running_tp += tp
            running_fp += fp
            running_tn += tn
            running_fn += fn

        retval = {'loss': running_loss / batch_idx,
                  'accuracy': running_accuracy / batch_idx,
                  'tp': running_tp,
                  'tn': running_tn,
                  'fp': running_fp,
                  'fn': running_fn
                  }
        return retval

    def valid_fn(valid_dataloader, model, criterion):
        running_loss = 0.
        running_accuracy = 0.
        running_tp = 0.
        running_tn = 0.
        running_fp = 0.
        running_fn = 0.

        model.eval()

        for batch_idx, (ecg, spo2, labels) in enumerate(valid_dataloader, 1):
            outputs = model(ecg)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            predicted = torch.round(sigmoid(outputs.data))  # here determining the sigmoid, not included in the model
            running_accuracy += (predicted == labels).sum().item() / labels.size(0)
            fp = ((predicted - labels) == 1.).sum().item()
            fn = ((predicted - labels) == -1.).sum().item()
            tp = ((predicted + labels) == 2.).sum().item()
            tn = ((predicted + labels) == 0.).sum().item()
            running_tp += tp
            running_fp += fp
            running_tn += tn
            running_fn += fn

        retval = {'loss': running_loss / batch_idx,
                  'accuracy': running_accuracy / batch_idx,
                  'tp': running_tp,
                  'tn': running_tn,
                  'fp': running_fp,
                  'fn': running_fn
                  }
        return retval

    # Defining data loaders
    train_dataloader = torch.utils.data.DataLoader(training_data_in, batch_size=BATCH_SIZE, shuffle=True, num_workers=1)
    if validation_data_in != None:
        validation_dataloader = torch.utils.data.DataLoader(validation_data_in, batch_size=BATCH_SIZE, shuffle=False, num_workers=1)

    # Defining the loss function
    criterion = nn.BCEWithLogitsLoss()

    # Defining the optimizer
    import torch.optim as optim
    optimizer = optim.AdamW(model.parameters(), lr=3e-4, amsgrad=False, eps=1e-07)

    # Training code
    metrics_history = {"loss": [], "accuracy": [], "precision": [], "recall": [], "f1": [], "specificity": [], "accuracy_bis": [], "tp": [], "tn": [], "fp": [], "fn": [],
                       "val_loss": [], "val_accuracy": [], "val_precision": [], "val_recall": [], "val_f1": [], "val_specificity": [], "val_accuracy_bis": [], "val_tp": [], "val_tn": [], "val_fp": [], "val_fn": []}

    train_begin = time.time()
    for epoch in range(EPOCHS):
        start = time.time()
        print("EPOCH:", epoch + 1)

        train_metrics = train_fn(train_dataloader=train_dataloader,
                                 model=model,
                                 optimizer=optimizer,
                                 criterion=criterion)

        metrics_history["loss"].append(train_metrics["loss"])
        metrics_history["accuracy"].append(train_metrics["accuracy"])
        metrics_history["tp"].append(train_metrics["tp"])
        metrics_history["tn"].append(train_metrics["tn"])
        metrics_history["fp"].append(train_metrics["fp"])
        metrics_history["fn"].append(train_metrics["fn"])

        precision = train_metrics["tp"] / (train_metrics["tp"] + train_metrics["fp"]) if train_metrics["tp"] > 0 else 0
        recall = train_metrics["tp"] / (train_metrics["tp"] + train_metrics["fn"]) if train_metrics["tp"] > 0 else 0
        specificity = train_metrics["tn"] / (train_metrics["tn"] + train_metrics["fp"]) if train_metrics["tn"] > 0 else 0
        f1 = 2*precision*recall / (precision + recall) if precision*recall > 0 else 0

        metrics_history["precision"].append(precision)
        metrics_history["recall"].append(recall)
        metrics_history["f1"].append(f1)
        metrics_history["specificity"].append(specificity)

        if validation_data_in != None:
            # Calculate the metrics on the validation data, in the same way as done for training
            with torch.no_grad():  # don't keep track of the info necessary to calculate the gradients
                val_metrics = valid_fn(valid_dataloader=validation_dataloader,
                                       model=model,
                                       criterion=criterion)

            metrics_history["val_loss"].append(val_metrics["loss"])
            metrics_history["val_accuracy"].append(val_metrics["accuracy"])
            metrics_history["val_tp"].append(val_metrics["tp"])
            metrics_history["val_tn"].append(val_metrics["tn"])
            metrics_history["val_fp"].append(val_metrics["fp"])
            metrics_history["val_fn"].append(val_metrics["fn"])

            val_precision = val_metrics["tp"] / (val_metrics["tp"] + val_metrics["fp"]) if val_metrics["tp"] > 0 else 0
            val_recall = val_metrics["tp"] / (val_metrics["tp"] + val_metrics["fn"]) if val_metrics["tp"] > 0 else 0
            val_specificity = val_metrics["tn"] / (val_metrics["tn"] + val_metrics["fp"]) if val_metrics["tn"] > 0 else 0
            val_f1 = 2*val_precision*val_recall / (val_precision + val_recall) if val_precision*val_recall > 0 else 0

            metrics_history["val_precision"].append(val_precision)
            metrics_history["val_recall"].append(val_recall)
            metrics_history["val_f1"].append(val_f1)
            metrics_history["val_specificity"].append(val_specificity)

            print(" > Training/validation loss:", round(train_metrics['loss'], 4), round(val_metrics['loss'], 4))
            print(" > Training/validation accuracy:", round(train_metrics['accuracy'], 4), round(val_metrics['accuracy'], 4))
            print(" > Training/validation precision:", round(precision, 4), round(val_precision, 4))
            print(" > Training/validation recall:", round(recall, 4), round(val_recall, 4))
            print(" > Training/validation f1:", round(f1, 4), round(val_f1, 4))
            print(" > Training/validation specificity:", round(specificity, 4), round(val_specificity, 4))
        else:
            print(" > Training loss:", round(train_metrics['loss'], 4))
            print(" > Training accuracy:", round(train_metrics['accuracy'], 4))
            print(" > Training precision:", round(precision, 4))
            print(" > Training recall:", round(recall, 4))
            print(" > Training f1:", round(f1, 4))
            print(" > Training specificity:", round(specificity, 4))

        print("Completed in:", round(time.time() - start, 1), "seconds \n")

    print("Training completed in:", round((time.time() - train_begin)/60, 1), "minutes")

    # Save the model weights
    torch.save(model.state_dict(), './nnet_model.pt')

    # Save the metrics history
    torch.save(metrics_history, 'training_history')
And here is the function that initializes the model and sets the seeds, called before each execution of the "_run" code:
def reinit_model():
    torch.manual_seed(42)
    np.random.seed(42)
    random.seed(42)
    net = Net()  # the model
    return net
Ok, I found the issue.
The problem is that, apparently, running the evaluation changes some random-number-generator states, and this affects the training phase.
The solution is thus as follows:
at the beginning of the function "_run()", set all seed states to the desired value, e.g., 42, then save those states to disk;
at the beginning of the function "train_fn()", read the seed states from disk and set them;
at the end of the function "train_fn()", save the seed states to disk.
For instance, running on TPU with XLA, the following instructions have to be used:
at the beginning of "_run()": xm.set_rng_state(42), then xm.save(xm.get_rng_state(), 'xm_seed')
at the beginning of "train_fn()": xm.set_rng_state(torch.load('xm_seed'), device=device) (you can also print the seed here for verification purposes with xm.master_print(xm.get_rng_state()))
at the end of "train_fn()": xm.save(xm.get_rng_state(), 'xm_seed')
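For a plain CPU/GPU run without XLA, a minimal sketch of the same idea (my adaptation, not something from the original answer) would snapshot and restore torch's RNG state around the validation pass; torch.cuda has analogous get_rng_state/set_rng_state calls for the GPU generator. The file name below is a placeholder.

import torch

SEED_FILE = 'torch_rng_state.pt'                 # hypothetical file name

# at the beginning of _run(): fix the seed and persist the CPU RNG state
torch.manual_seed(42)
torch.save(torch.get_rng_state(), SEED_FILE)

# at the beginning of train_fn(): restore the state saved at the end of the
# previous training pass, so whatever the validation pass consumed in between
# no longer shifts the dropout masks
torch.set_rng_state(torch.load(SEED_FILE))

# at the end of train_fn(): save the state again for the next epoch
torch.save(torch.get_rng_state(), SEED_FILE)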

Is Caffe's reported accuracy reliable?

I recently tried to get the confusion matrix for one of my trained models, to see how precise it is. I downloaded this script and fed it my model. To my astonishment, the accuracy calculated by the script is very different from the one Caffe reports.
I used this script to calculate the confusion matrix; it reports the accuracy as well. The problem is that the accuracy reported by this script is way different from the one reported by Caffe! For example, Caffe reports the accuracy on, let's say, CIFAR10 as 92.34%, while, when the model is fed to the script to compute the confusion matrix and its accuracy, the result is something like 86.5%!
Which of these accuracies is the correct one that can be reported in papers or compared with the results of other papers, such as those here?
I also saw something else weird: I trained two identical models, the only difference being that one used Xavier and the other used msra for initialization. The first one reports an accuracy of 94.25% and the other 94.26% in Caffe. When these models are fed to the script I linked above for confusion-matrix computation, their accuracies were 89.2% and 87.4%, respectively!
Is this normal? What is the cause of this? msra?
Are the accuracies reported by Caffe true and reliable?
PS: The accuracy in the script is calculated as (complete script):
for i, image, label in reader:
    image_caffe = image.reshape(1, *image.shape)
    out = net.forward_all(data=np.asarray([ image_caffe ]))
    plabel = int(out['prob'][0].argmax(axis=0))
    count += 1
    iscorrect = label == plabel
    correct += (1 if iscorrect else 0)
    matrix[(label, plabel)] += 1
    labels_set.update([label, plabel])
    if not iscorrect:
        print("\rError: i=%s, expected %i but predicted %i" \
              % (i, label, plabel))
    sys.stdout.write("\rAccuracy: %.1f%%" % (100.*correct/count))
    sys.stdout.flush()
print(", %i/%i corrects" % (correct, count))
Which IMHO is OK and correct: the number of correct predictions divided by the total number of instances in the dataset.
I found the reason.
The reason for the mismatch between the Caffe-generated accuracy and the accuracy generated by the script in question was solely mean subtraction, which was done in Caffe but not in the script.
This is the modified version of the script which takes this into account; hopefully everything is now just fine.
# Author: Axel Angel, copyright 2015, license GPLv3.
# added mean subtraction so that the accuracy can be reported accurately, just like caffe, when doing a mean subtraction
# Seyyed Hossein Hasan Pour
# Coderx7#Gmail.com
# 7/3/2016
import sys
import caffe
import numpy as np
import lmdb
import argparse
from collections import defaultdict

def flat_shape(x):
    "Returns x without singleton dimension, eg: (1,28,28) -> (28,28)"
    return x.reshape(filter(lambda s: s > 1, x.shape))

def lmdb_reader(fpath):
    import lmdb
    lmdb_env = lmdb.open(fpath)
    lmdb_txn = lmdb_env.begin()
    lmdb_cursor = lmdb_txn.cursor()

    for key, value in lmdb_cursor:
        datum = caffe.proto.caffe_pb2.Datum()
        datum.ParseFromString(value)
        label = int(datum.label)
        image = caffe.io.datum_to_array(datum).astype(np.uint8)
        yield (key, flat_shape(image), label)

def leveldb_reader(fpath):
    import leveldb
    db = leveldb.LevelDB(fpath)

    for key, value in db.RangeIter():
        datum = caffe.proto.caffe_pb2.Datum()
        datum.ParseFromString(value)
        label = int(datum.label)
        image = caffe.io.datum_to_array(datum).astype(np.uint8)
        yield (key, flat_shape(image), label)

def npz_reader(fpath):
    npz = np.load(fpath)
    xs = npz['arr_0']
    ls = npz['arr_1']

    for i, (x, l) in enumerate(np.array([ xs, ls ]).T):
        yield (i, x, l)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--proto', type=str, required=True)
    parser.add_argument('--model', type=str, required=True)
    parser.add_argument('--mean', type=str, required=True)
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--lmdb', type=str, default=None)
    group.add_argument('--leveldb', type=str, default=None)
    group.add_argument('--npz', type=str, default=None)
    args = parser.parse_args()

    # Extract mean from the mean image file
    mean_blobproto_new = caffe.proto.caffe_pb2.BlobProto()
    f = open(args.mean, 'rb')
    mean_blobproto_new.ParseFromString(f.read())
    mean_image = caffe.io.blobproto_to_array(mean_blobproto_new)
    f.close()

    count = 0
    correct = 0
    matrix = defaultdict(int)  # (real,pred) -> int
    labels_set = set()

    # CNN reconstruction and loading the trained weights
    net = caffe.Net(args.proto, args.model, caffe.TEST)
    caffe.set_mode_cpu()
    print "args", vars(args)

    if args.lmdb != None:
        reader = lmdb_reader(args.lmdb)
    if args.leveldb != None:
        reader = leveldb_reader(args.leveldb)
    if args.npz != None:
        reader = npz_reader(args.npz)

    for i, image, label in reader:
        image_caffe = image.reshape(1, *image.shape)
        out = net.forward_all(data=np.asarray([ image_caffe ]) - mean_image)
        plabel = int(out['prob'][0].argmax(axis=0))
        count += 1
        iscorrect = label == plabel
        correct += (1 if iscorrect else 0)
        matrix[(label, plabel)] += 1
        labels_set.update([label, plabel])
        if not iscorrect:
            print("\rError: i=%s, expected %i but predicted %i" \
                  % (i, label, plabel))
        sys.stdout.write("\rAccuracy: %.1f%%" % (100.*correct/count))
        sys.stdout.flush()

    print(", %i/%i corrects" % (correct, count))

    print ""
    print "Confusion matrix:"
    print "(r , p) | count"
    for l in labels_set:
        for pl in labels_set:
            print "(%i , %i) | %i" % (l, pl, matrix[(l,pl)])

Simple Multilayer Perceptron model does not converge in TensorFlow

I am new to TensorFlow. Today I tried to implement my first model in TF but it returned strange results. I know that I am missing something here but I was not able to figure it out. Here is the story.
Model
I have a simple multilayer perceptron model with only a single hidden layer, applied to the MNIST database. The layers are defined as [input(784), hidden_layer(470), output_layer(10)], with tanh as the non-linearity for the hidden layer and softmax as the loss for the output layer. The optimizer I am using is gradient descent with a learning rate of 0.01. My mini-batch size is 1 (I am training the model sample by sample).
My implementations:
First I implemented my model in C++ and got around 96% accuracy. Here is the repository: https://github.com/amin2ros/Artificog
I implemented the exact same model in TensorFlow, but surprisingly the model didn't converge at all. Here is the code.
Code:
import sys
import input_data
import matplotlib.pyplot as plt
from pylab import *
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
import tensorflow as tf

# Parameters
learning_rate = 0.1
training_epochs = 1
batch_size = 1
display_step = 1

# Network Parameters
n_hidden_1 = 470 # 1st layer num features
n_input = 784    # MNIST data input (img shape: 28*28)
n_classes = 10   # MNIST total classes (0-9 digits)

# tf Graph input
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])

# Create model
def multilayer_perceptron(_X, _weights, _biases):
    layer_1 = tf.tanh(tf.add(tf.matmul(_X, _weights['h1']), _biases['b1']))
    return tf.matmul(layer_1, _weights['out']) + _biases['out']

# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'out': tf.Variable(tf.random_normal([n_hidden_1, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

# Construct model
pred = multilayer_perceptron(x, weights, biases)

# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax(pred)) # Softmax loss
optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(cost)

# Initializing the variables
init = tf.initialize_all_variables()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        m = 0
        total_batch = int(mnist.train.num_examples/batch_size)
        counter = 0
        #print 'count = ' , total_batch
        #sys.stdin.read(1)

        # Loop over all batches
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            label = tf.argmax(batch_ys, 1).eval()[0]
            counter += 1
            sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})
            wrong_prediction = tf.not_equal(tf.argmax(pred, 1), tf.argmax(y, 1))
            missed = tf.cast(wrong_prediction, "float")
            m += missed.eval({x: batch_xs, y: batch_ys})[0]
            print "Sample #", counter, " - Label : ", label, " - Prediction :", tf.argmax(pred, 1).eval({x: batch_xs, y: batch_ys})[0], \
                  "- Missed = ", m, " - Error Rate = ", 100 * float(m)/counter
    print "Optimization Finished!"
I am very curious why this happens. Any help is appreciated.
Edit:
As commented below, the definition of the cost function was incorrect: tf.reduce_mean(tf.nn.softmax(pred)) never uses the labels y, so it is not a valid classification loss. It should be:
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred,y))
Now the model converges :)
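For clarity, here are the corrected lines in the context of the snippet above, keeping the old TF 0.x-style positional API used in the question (this just spells out the fix from the edit next to the optimizer, nothing more):

# Cross-entropy compares the logits `pred` against the one-hot labels `y`;
# the original cost ignored `y` entirely, so there was nothing to learn.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(cost)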