Highway networks in Keras and Lasagne - significant performance difference - deep-learning

I implemented highway networks with Keras and with Lasagne, and the Keras version consistently underperforms the Lasagne version. I am using the same dataset and hyperparameters in both. Here is the Keras version's code:
X_train, y_train, X_test, y_test, X_all = hacking_script.load_all_data()
data_dim = 144
layer_count = 32
dropout = 0.04
hidden_units = 32
nb_epoch = 10
model = Sequential()
model.add(Dense(hidden_units, input_dim=data_dim))
model.add(Dropout(dropout))
for index in range(layer_count):
    model.add(Highway(activation='relu'))
    model.add(Dropout(dropout))
model.add(Dropout(dropout))
model.add(Dense(2, activation='softmax'))
print 'compiling...'
model.compile(loss='binary_crossentropy', optimizer='adagrad')
model.fit(X_train, y_train, batch_size=100, nb_epoch=nb_epoch,
          show_accuracy=True, validation_data=(X_test, y_test), shuffle=True, verbose=0)
predictions = model.predict_proba(X_test)
And here is the Lasagne version's code:
class MultiplicativeGatingLayer(MergeLayer):
    def __init__(self, gate, input1, input2, **kwargs):
        incomings = [gate, input1, input2]
        super(MultiplicativeGatingLayer, self).__init__(incomings, **kwargs)
        assert gate.output_shape == input1.output_shape == input2.output_shape
    def get_output_shape_for(self, input_shapes):
        return input_shapes[0]
    def get_output_for(self, inputs, **kwargs):
        return inputs[0] * inputs[1] + (1 - inputs[0]) * inputs[2]
def highway_dense(incoming, Wh=Orthogonal(), bh=Constant(0.0),
                  Wt=Orthogonal(), bt=Constant(-4.0),
                  nonlinearity=rectify, **kwargs):
    num_inputs = int(np.prod(incoming.output_shape[1:]))
    l_h = DenseLayer(incoming, num_units=num_inputs, W=Wh, b=bh, nonlinearity=nonlinearity)
    l_t = DenseLayer(incoming, num_units=num_inputs, W=Wt, b=bt, nonlinearity=sigmoid)
    return MultiplicativeGatingLayer(gate=l_t, input1=l_h, input2=incoming)
# ==== Parameters ====
num_features = X_train.shape[1]
epochs = 10
hidden_layers = 32
hidden_units = 32
dropout_p = 0.04
# ==== Defining the neural network shape ====
l_in = InputLayer(shape=(None, num_features))
l_hidden1 = DenseLayer(l_in, num_units=hidden_units)
l_hidden2 = DropoutLayer(l_hidden1, p=dropout_p)
l_current = l_hidden2
for k in range(hidden_layers - 1):
    l_current = highway_dense(l_current)
    l_current = DropoutLayer(l_current, p=dropout_p)
l_dropout = DropoutLayer(l_current, p=dropout_p)
l_out = DenseLayer(l_dropout, num_units=2, nonlinearity=softmax)
# ==== Neural network definition ====
net1 = NeuralNet(layers=l_out,
                 update=adadelta, update_rho=0.95, update_learning_rate=1.0,
                 objective_loss_function=categorical_crossentropy,
                 train_split=TrainSplit(eval_size=0), verbose=0, max_epochs=1)
net1.fit(X_train, y_train)
predictions = net1.predict_proba(X_test)[:, 1]
Now the Keras version barely outperforms logistic regression, while the Lasagne version is the best-scoring algorithm so far. Any ideas as to why?

Here are some suggestions (I'm not sure if they will actually close the performance gap you are observing):
According to the Keras documentation, the Highway layer is initialized with Glorot uniform weights, while your Lasagne code uses Orthogonal weight initialization. Unless you set the weight initialization to Orthogonal for the Keras Highway layer elsewhere in your code, this could be one source of the performance gap.
It also looks like you are using Adagrad for your Keras model but Adadelta for your Lasagne model.
I am not 100% sure about this, but you may also want to verify that your transform-gate bias terms are initialized the same way (your Lasagne code uses bt=Constant(-4.0)).
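For example, if your Keras version's Highway layer exposes init and transform_bias keyword arguments (this is an assumption; check the layer signature in your installed Keras), the loop could be rewritten to mirror the Lasagne defaults and the optimizer switched to Adadelta:
model = Sequential()
model.add(Dense(hidden_units, input_dim=data_dim))
model.add(Dropout(dropout))
for index in range(layer_count):
    # 'orthogonal' mirrors Lasagne's Orthogonal(); transform_bias=-4 mirrors bt=Constant(-4.0)
    model.add(Highway(activation='relu', init='orthogonal', transform_bias=-4))
    model.add(Dropout(dropout))
model.add(Dropout(dropout))
model.add(Dense(2, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adadelta')  # Adadelta, as in the Lasagne model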

Related

Building a neural network using k-fold cross-validation

I am new to deep learning and am trying to implement a neural network with 4-fold cross-validation for training, testing, and validation. The task is to classify vehicles using an existing dataset.
The accuracy result is 0.7 (see the attached training-accuracy plot and example epoch output).
I also don't know whether the code is correct or what I can do to increase the accuracy.
Here is the code:
!pip install category_encoders
import tensorflow as tf
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
import category_encoders as ce
from category_encoders import OrdinalEncoder
car_data = pd.read_csv('car_data.csv')
car_data.columns = ['Purchasing', 'Maintenance', 'No_Doors','Capacity','BootSize','Safety','Evaluation']
# Extract the features and labels from the dataset
X = car_data.drop(['Evaluation'], axis=1)
Y = car_data['Evaluation']
encoder = ce.OrdinalEncoder(cols=['Purchasing', 'Maintenance', 'No_Doors','Capacity','BootSize','Safety'])
X = encoder.fit_transform(X)
X = X.to_numpy()
Y_df = pd.DataFrame(Y, columns=['Evaluation'])
encoder = OrdinalEncoder(cols=['Evaluation'])
Y_encoded = encoder.fit_transform(Y_df)
Y = Y_encoded.to_numpy()
input_layer = tf.keras.layers.Input(shape=(X.shape[1],))
# Define the hidden layers
hidden_layer_1 = tf.keras.layers.Dense(units=64, activation='relu', kernel_initializer='glorot_uniform')(input_layer)
hidden_layer_2 = tf.keras.layers.Dense(units=32, activation='relu', kernel_initializer='glorot_uniform')(hidden_layer_1)
# Define the output layer
output_layer = tf.keras.layers.Dense(units=1, activation='sigmoid', kernel_initializer='glorot_uniform')(hidden_layer_2)
# Create the model
model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
# Initialize the 4-fold cross-validation
kfold = KFold(n_splits=4, shuffle=True, random_state=42)
# Initialize a list to store the scores
scores = []
quality_weights= []
# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'],
              sample_weight_mode='temporal')
for train_index, test_index in kfold.split(X, Y):
    # Split the data into train and test sets
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]
    # Fit the model on the training data
    history = model.fit(X_train, Y_train, epochs=300, batch_size=64, sample_weight=quality_weights)
    # Evaluate the model on the test data
    score = model.evaluate(X_test, Y_test)
    # Append the score to the scores list
    scores.append(score[1])
    plt.plot(history.history['accuracy'])
    plt.title('Model Training Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train'], loc='upper left')
    plt.show()
# Print the mean and standard deviation of the scores
print(f'Mean accuracy: {np.mean(scores):.3f} +/- {np.std(scores):.3f}')
The first thing that caught my attention was here:
model.fit(X_train, Y_train, epochs=300, batch_size=64, sample_weight=quality_weights)
Your quality_weights should be a numpy array of size of the input.
Refer here: https://keras.io/api/models/model_training_apis/#fit-method
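For example, a minimal sketch of that fix, assuming the model and fold variables from the question (uniform weights are only a placeholder for whatever per-sample weights you actually intend):
import numpy as np

# One weight per training example; uniform weights shown as a placeholder.
# Note: sample_weight_mode='temporal' in compile() expects per-timestep weights,
# so it should be removed when passing 1-D per-sample weights like these.
quality_weights = np.ones(len(X_train))
history = model.fit(X_train, Y_train, epochs=300, batch_size=64,
                    sample_weight=quality_weights)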
If changing that doesn't seem to help, then maybe your network isn't learning from the data. A few possible reasons could be:
The network is a bit too shallow. Try adding just one more hidden layer to see if that improves anything.
From the code I can't see the size of your input data. Does it have enough data points for 4-fold cross-validation? Can you somehow augment the data?

Resolve overfitting in a convolutional network

I've written a snippet to classify Omniglot images. I calculate the training and validation losses in each epoch, where the latter is computed using images that were not seen by the network before. The two plots are as below:
Since the training loss decreases while the validation loss increases, I have concluded that my model overfits. I've tried several suggestions (e.g. here) to overcome this, including:
Increasing the size of the training set.
Shuffling the data.
Adding dropout layers (up to p=0.9).
Using a smaller model.
Altering the architecture.
Changing the learning rate.
Reducing the batch size.
Adding weight decay (illustrated in the sketch after this list).
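As an illustration of the dropout and weight-decay items, they can be wired into the model shown later in the question along these lines (the values are examples only, not the exact settings used):
import torch.nn as nn
import torch.optim as optim

model = MyModel()  # the MyModel class defined in the snippet below
# L2 regularization via the optimizer's weight_decay argument (value illustrative)
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
# Dropout module, e.g. applied to the fc1/fc2 activations inside forward()
dropout = nn.Dropout(p=0.5)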
However, the validation loss still increases. I wonder if there are any other suggestions to improve this behavior or if this is not overfitting, but the problem is something else. Below is the snippet used in this question.
import torch
import torchvision
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        dim_out = 964
        # -- embedding params
        self.cn1 = nn.Conv2d(1, 16, 7)
        self.cn2 = nn.Conv2d(16, 32, 4)
        self.cn3 = nn.Conv2d(32, 64, 3)
        self.pool = nn.MaxPool2d(2)
        self.bn1 = nn.BatchNorm2d(16)
        self.bn2 = nn.BatchNorm2d(32)
        self.bn3 = nn.BatchNorm2d(64)
        # -- prediction params
        self.fc1 = nn.Linear(256, 170)
        self.fc2 = nn.Linear(170, 50)
        self.fc3 = nn.Linear(50, dim_out)
        # -- non-linearity
        self.relu = nn.ReLU()
        self.Beta = 10
        self.sopl = nn.Softplus(beta=self.Beta)

    def forward(self, x):
        y1 = self.pool(self.bn1(self.relu(self.cn1(x))))
        y2 = self.pool(self.bn2(self.relu(self.cn2(y1))))
        y3 = self.relu(self.bn3(self.cn3(y2)))
        y3 = y3.view(y3.size(0), -1)
        y5 = self.sopl(self.fc1(y3))
        y6 = self.sopl(self.fc2(y5))
        return self.fc3(y6)

class Train:
    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # -- data
        dim = 28
        batch_size = 400
        my_transforms = transforms.Compose([transforms.Resize((dim, dim)), transforms.ToTensor()])
        trainset = torchvision.datasets.Omniglot(root="./data/omniglot_train/", download=False, transform=my_transforms)
        validset = torchvision.datasets.Omniglot(root="./data/omniglot_train/", background=False, download=False,
                                                 transform=my_transforms)
        self.TrainDataset = DataLoader(dataset=trainset, batch_size=batch_size, shuffle=True)
        self.ValidDataset = DataLoader(dataset=validset, batch_size=len(validset), shuffle=False)
        self.N_train = len(trainset)
        self.N_valid = len(validset)
        # -- model
        self.model = MyModel().to(self.device)
        # -- train
        self.epochs = 3000
        self.loss = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)

    def train_epoch(self):
        self.model.train()
        train_loss = 0
        for batch_idx, data_batch in enumerate(self.TrainDataset):
            # -- predict
            predict = self.model(data_batch[0].to(self.device))
            # -- loss
            loss = self.loss(predict, data_batch[1].to(self.device))
            # -- optimize
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
        return train_loss / (batch_idx + 1)

    def valid_epoch(self):
        with torch.no_grad():
            self.model.eval()
            for data_batch in self.ValidDataset:
                # -- predict
                predict = self.model(data_batch[0].to(self.device))
                # -- loss
                loss = self.loss(predict, data_batch[1].to(self.device))
            return loss.item()

    def __call__(self):
        for epoch in range(self.epochs):
            train_loss = self.train_epoch()
            valid_loss = self.valid_epoch()
            print('Epoch {}: Training loss = {:.5f}, Validation loss = {:.5f}.'.format(epoch, train_loss, valid_loss))
        torch.save(self.model.state_dict(), './model_stat.pth')

if __name__ == '__main__':
    my_train = Train()
    my_train()
If your training accuracy is good but your test accuracy (on data not used in training) is bad, then you have an overfitting problem. I had the same problem with a CNN model. You can use two methods to overcome overfitting: the first is early stopping for your training, and the second is regularization. Check the example below:
# L2 regularizers for layers
model = keras.Sequential([
    keras.layers.InputLayer(input_shape=(32, 32)),
    keras.layers.Reshape(target_shape=(32, 32, 1)),
    keras.layers.Conv2D(filters=12, kernel_size=(3, 3), activation=tf.nn.relu, use_bias=True,
                        kernel_regularizer=tf.keras.regularizers.l2(l=0.01)),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    keras.layers.Flatten(),
    keras.layers.Dense(10, activation='softmax', use_bias=True)
])
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])

# Early stopping
history = model.fit(X_train, Y_train,
                    validation_data=(X_dev, Y_dev),
                    epochs=4000,
                    callbacks=[EarlyStopping(monitor='val_loss')])
Do not forget the import for early stopping:
from tensorflow.keras.callbacks import EarlyStopping
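Since the model in the question is written in PyTorch rather than Keras, a rough early-stopping equivalent built around the question's Train class might look like this sketch (the patience value and checkpoint path are illustrative):
# Sketch only: stop once validation loss has not improved for `patience` epochs,
# keeping the best weights seen so far.
my_train = Train()
best_valid, patience, wait = float('inf'), 20, 0
for epoch in range(my_train.epochs):
    train_loss = my_train.train_epoch()
    valid_loss = my_train.valid_epoch()
    print('Epoch {}: train = {:.5f}, valid = {:.5f}'.format(epoch, train_loss, valid_loss))
    if valid_loss < best_valid:
        best_valid, wait = valid_loss, 0
        torch.save(my_train.model.state_dict(), './best_model.pth')
    else:
        wait += 1
        if wait >= patience:
            break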

What should I do if my regression model is stuck at a high loss value?

I'm using neural nets for a regression problem where I have 3 features and I'm trying to predict one continuous value. I noticed that my neural net starts learning well, but after 10 epochs it gets stuck at a high loss value and cannot improve any more.
I tried using Adam and other adaptive optimizers instead of SGD, but that didn't work. I tried more complex architectures, adding layers, neurons, batch normalization, and other activations, and that also didn't work.
I tried to debug and find out whether something is wrong with the implementation, but when I use only 10 examples of the data my model learns fast, so there seem to be no errors. I started to increase the number of data examples and monitor my model's results as I increased them. When I reach 3000 data examples, my model starts to get stuck at a high loss value.
I tried increasing layers and neurons and also tried other activations and batch normalization. My data are normalized to [-1, 1]; my target value is not normalized, since this is regression and I'm predicting a continuous value. I also tried using Keras, but I got the same result.
My real dataset has 40000 samples. I don't know what I should try; I have tried almost everything I know for optimization, but none of it worked. I would appreciate it if someone could guide me on this. I'll post my code, although it may be too messy to easily follow; I'm fairly sure there is no problem with my implementation. I'm using skorch/PyTorch and some scikit-learn functions:
# take all features as independent variables, except the bearing and distance
# when I start small, the model learns well, but from 3000 data points on it gets stuck at a high value:
# the starting loss is 15, it learns well at first, but once it reaches 9 it gets stuck there,
# and if I use the whole dataset for training, the loss starts at 47, decreases to 36, and then gets stuck there too
X = dataset.iloc[:3000, 0:-2].reset_index(drop=True).to_numpy().astype(np.float32)
# take distance and bearing as the output values:
y = dataset.iloc[:3000, -2:].reset_index(drop=True).to_numpy().astype(np.float32)
y_bearing = y[:, 0].reshape(-1, 1)
y_distance = y[:, 1].reshape(-1, 1)
# normalize the input values
scaler = StandardScaler()
X_norm = scaler.fit_transform(X, y)
X_br_train, X_br_test, y_br_train, y_br_test = train_test_split(X_norm,
                                                                 y_bearing,
                                                                 test_size=0.1,
                                                                 random_state=42,
                                                                 shuffle=True)
X_dis_train, X_dis_test, y_dis_train, y_dis_test = train_test_split(X_norm,
                                                                     y_distance,
                                                                     test_size=0.1,
                                                                     random_state=42,
                                                                     shuffle=True)
bearing_trainset = Dataset(X_br_train, y_br_train)
bearing_testset = Dataset(X_br_test, y_br_test)
distance_trainset = Dataset(X_dis_train, y_dis_train)
distance_testset = Dataset(X_dis_test, y_dis_test)
def root_mse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
    def forward(self, yhat, y):
        return torch.sqrt(self.mse(yhat, y))

class AED(nn.Module):
    """custom average euclidean distance loss"""
    def __init__(self):
        super().__init__()
    def forward(self, yhat, y):
        return torch.dist(yhat, y)
def train(on_target,
          hidden_units,
          batch_size,
          epochs,
          optimizer,
          lr,
          regularisation_factor,
          train_shuffle):
    network = None
    trainset = distance_trainset if on_target.lower() == 'distance' else bearing_trainset
    testset = distance_testset if on_target.lower() == 'distance' else bearing_testset
    print(f"shape of trainset.X = {trainset.X.shape}, shape of trainset.y = {trainset.y.shape}")
    print(f"shape of testset.X = {testset.X.shape}, shape of testset.y = {testset.y.shape}")
    mse = EpochScoring(scoring=mean_squared_error, lower_is_better=True, name='MSE')
    r2 = EpochScoring(scoring=r2_score, lower_is_better=False, name='R2')
    rmse = EpochScoring(scoring=make_scorer(root_mse), lower_is_better=True, name='RMSE')
    checkpoint = Checkpoint(dirname=f'results/{on_target}/checkpoints')
    train_end_checkpoint = TrainEndCheckpoint(dirname=f'results/{on_target}/checkpoints')
    if on_target.lower() == 'bearing':
        network = BearingNetwork(n_features=X_norm.shape[1],
                                 n_hidden=hidden_units,
                                 n_out=y_distance.shape[1])
    elif on_target.lower() == 'distance':
        network = DistanceNetwork(n_features=X_norm.shape[1],
                                  n_hidden=hidden_units,
                                  n_out=1)
    model = NeuralNetRegressor(
        module=network,
        criterion=RMSELoss,
        device='cpu',
        batch_size=batch_size,
        lr=lr,
        optimizer=optim.Adam if optimizer.lower() == 'adam' else optim.SGD,
        optimizer__weight_decay=regularisation_factor,
        max_epochs=epochs,
        iterator_train__shuffle=train_shuffle,
        train_split=predefined_split(testset),
        callbacks=[mse, r2, rmse, checkpoint, train_end_checkpoint]
    )
    print(f"{'*' * 10} start training the {on_target} model {'*' * 10}")
    history = model.fit(trainset, y=None)
    print(f"{'*' * 10} End Training the {on_target} Model {'*' * 10}")

if __name__ == '__main__':
    args = parser.parse_args()
    train(on_target=args.on_target,
          hidden_units=args.hidden_units,
          batch_size=args.batch_size,
          epochs=args.epochs,
          optimizer=args.optimizer,
          lr=args.learning_rate,
          regularisation_factor=args.regularisation_lambda,
          train_shuffle=args.shuffle)
And this is my network declaration:
class DistanceNetwork(nn.Module):
    """separate NN for predicting distance"""
    def __init__(self, n_features=5, n_hidden=16, n_out=1):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(n_features, n_hidden),
            nn.LeakyReLU(),
            nn.Linear(n_hidden, 5),
            nn.LeakyReLU(),
            nn.Linear(5, n_out)
        )

    def forward(self, x):
        # forward pass through the sequential stack (needed by skorch)
        return self.model(x)
Here is the log while training:

LSTM for speech accent classification

I am trying an LSTM model on this dataset: https://www.kaggle.com/rtatman/speech-accent-archive
This is the model that I am working on:
def train_lstm_model(X_train, y_train, X_validation, y_validation, EPOCHS, batch_size=128):
    # Get row, column, and class sizes
    rows = X_train[0].shape[0]
    cols = X_train[0].shape[1]
    val_rows = X_validation[0].shape[0]
    val_cols = X_validation[0].shape[1]
    num_classes = len(y_train[0])
    input_shape = (rows, cols)
    X_train = X_train.reshape(X_train.shape[0], rows, cols)
    X_validation = X_validation.reshape(X_validation.shape[0], val_rows, val_cols)
    lstm = Sequential()
    lstm.add(LSTM(64, return_sequences=True, stateful=False, input_shape=input_shape, activation='tanh'))
    lstm.add(LSTM(64, return_sequences=True, stateful=False, activation='tanh'))
    lstm.add(LSTM(64, stateful=False, activation='tanh'))
    # add dropout to control for overfitting
    lstm.add(Dropout(.25))
    # squash output onto number of classes in probability space
    lstm.add(Dense(num_classes, activation='softmax'))
    # adam = optimizers.adam(lr=0.0001)
    rmsprop = optimizers.adam(lr=0.002)
    lstm.compile(loss='categorical_crossentropy', optimizer=rmsprop, metrics=["accuracy"])
    es = EarlyStopping(monitor='acc', min_delta=.005, patience=10, verbose=1, mode='auto')
    # Creates log file for graphical interpretation using TensorBoard
    tb = TensorBoard(log_dir=LOG_DIR, histogram_freq=0, batch_size=32, write_graph=True, write_grads=True,
                     write_images=True, embeddings_freq=0, embeddings_layer_names=None,
                     embeddings_metadata=None)
    lstm.fit(X_train, y_train, batch_size=batch_size,
             epochs=EPOCHS, validation_data=(X_validation, y_validation),
             callbacks=[es, tb])
    return lstm
And when I run it for 15 epochs, I get this loss curve for the validation data. https://imgur.com/a/hB4uK
And this is the accuracy on validation data.
https://imgur.com/a/9knGD
This is the training accuracy: https://imgur.com/a/HBfgF
And this is the training loss: https://imgur.com/a/JRdQ9
I've only used three classes from the dataset.
Any suggestions on what I can improve in the model?
These are the steps I followed (a rough sketch of them follows the list):
1. Read the wav file [only reading 90 samples per class].
2. Calculate the mel-spectrogram.
3. Split the mel-spectrogram into segments [this gives around 11k samples].
4. Normalize the mel-spectrogram segments.
5. Feed them into the network.
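For reference, a minimal sketch of steps 1-4 using librosa (the file path, segment length, and normalization scheme are illustrative, not necessarily the ones used):
import numpy as np
import librosa

y, sr = librosa.load('recording.wav', sr=None)      # 1. read the wav file
mel = librosa.feature.melspectrogram(y=y, sr=sr)    # 2. mel-spectrogram
mel_db = librosa.power_to_db(mel)
seg_len = 128                                       # 3. split into fixed-length segments
segments = [mel_db[:, i:i + seg_len]
            for i in range(0, mel_db.shape[1] - seg_len + 1, seg_len)]
X = np.stack(segments)
X = (X - X.mean()) / X.std()                        # 4. normalize
# X can then be fed to train_lstm_model() together with one-hot labels (step 5).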

How to use keras to fine-tune inception v3 to do multi-class classification?

I want to use Keras to do two-class image classification using the Cat vs. Dog dataset from Kaggle.com.
But I have a problem with the "class_mode" parameter in the code below.
If I use "binary" mode, accuracy is about 95%, but if I use "categorical", accuracy is abnormally low, only slightly above 50%.
Binary mode means one output in the last layer with a sigmoid activation; each sample's label is a single integer.
Categorical means two outputs in the last layer with a softmax activation; each sample's label is in one-hot format, e.g. (1,0), (0,1).
I think these two setups should give similar results. Does anyone know the reason for the difference? Thanks very much!
import os
import sys
import glob
import argparse
import matplotlib.pyplot as plt
from keras import __version__
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import SGD
# set some params here
IM_WIDTH, IM_HEIGHT = 299, 299  # fixed size for InceptionV3
NB_EPOCHS = 1
BAT_SIZE = 32
FC_SIZE = 1024
NB_IV3_LAYERS_TO_FREEZE = 172
loss_mode = "binary_crossentropy"
def get_nb_files(directory):
    """Get number of files by searching directory recursively"""
    if not os.path.exists(directory):
        return 0
    cnt = 0
    for r, dirs, files in os.walk(directory):
        for dr in dirs:
            cnt += len(glob.glob(os.path.join(r, dr + "/*")))
    return cnt
# transfer_learn, keep the weights in Inception V3
def setup_to_transfer_learn(model, base_model):
    """Freeze all layers and compile the model"""
    for layer in base_model.layers:
        layer.trainable = False
    model.compile(optimizer='rmsprop', loss=loss_mode, metrics=['accuracy'])
# Add a last layer to do two-class classification.
def add_new_last_layer(base_model, nb_classes):
    """Add last layer to the convnet
    Args:
        base_model: keras model excluding top
        nb_classes: # of classes
    Returns:
        new keras model with last layer
    """
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(FC_SIZE, activation='relu')(x)  # new FC layer, random init
    if args.class_mode == "binary":
        predictions = Dense(1, activation='sigmoid')(x)  # new sigmoid layer
    else:
        predictions = Dense(nb_classes, activation='softmax')(x)  # new softmax layer
    model = Model(inputs=base_model.input, outputs=predictions)
    return model
# Freeze the bottom NB_IV3_LAYERS and retrain the remaining top layers,
# and fine-tune weights.
def setup_to_finetune(model):
    """Freeze the bottom NB_IV3_LAYERS and retrain the remaining top layers.
    note: NB_IV3_LAYERS corresponds to the top 2 inception blocks in the inceptionv3 arch
    Args:
        model: keras model
    """
    for layer in model.layers[:NB_IV3_LAYERS_TO_FREEZE]:
        layer.trainable = False
    for layer in model.layers[NB_IV3_LAYERS_TO_FREEZE:]:
        layer.trainable = True
    model.compile(optimizer="rmsprop", loss=loss_mode, metrics=['accuracy'])
    #model.compile(optimizer=SGD(lr=0.0001, momentum=0.9), loss='categorical_crossentropy', metrics=['accuracy'])
def train(args):
    """Use transfer learning and fine-tuning to train a network on a new dataset"""
    nb_train_samples = get_nb_files(args.train_dir)
    nb_classes = len(glob.glob(args.train_dir + "/*"))
    nb_val_samples = get_nb_files(args.val_dir)
    nb_epoch = int(args.nb_epoch)
    batch_size = int(args.batch_size)
    print("nb_classes:{}".format(nb_classes))
    # data prepare
    train_datagen = ImageDataGenerator(
        preprocessing_function=preprocess_input,
        rotation_range=30,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True
    )
    test_datagen = ImageDataGenerator(
        preprocessing_function=preprocess_input,
        rotation_range=30,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True
    )
    train_generator = train_datagen.flow_from_directory(
        args.train_dir,
        target_size=(IM_WIDTH, IM_HEIGHT),
        batch_size=batch_size,
        #class_mode='binary'
        class_mode=args.class_mode
    )
    validation_generator = test_datagen.flow_from_directory(
        args.val_dir,
        target_size=(IM_WIDTH, IM_HEIGHT),
        batch_size=batch_size,
        #class_mode='binary'
        class_mode=args.class_mode
    )
    # setup model
    base_model = InceptionV3(weights='imagenet', include_top=False)  # include_top=False excludes final FC layer
    model = add_new_last_layer(base_model, nb_classes)
    # transfer learning
    setup_to_transfer_learn(model, base_model)
    #model.summary()
    history_tl = model.fit_generator(
        train_generator,
        epochs=nb_epoch,
        steps_per_epoch=nb_train_samples//BAT_SIZE,
        validation_data=validation_generator,
        validation_steps=nb_val_samples//BAT_SIZE)
    # fine-tuning
    setup_to_finetune(model)
    history_ft = model.fit_generator(
        train_generator,
        steps_per_epoch=nb_train_samples//BAT_SIZE,
        epochs=nb_epoch,
        validation_data=validation_generator,
        validation_steps=nb_val_samples//BAT_SIZE)
    model.save(args.output_model_file)
    if args.plot:
        plot_training(history_ft)
def plot_training(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs = range(len(acc))
    plt.plot(epochs, acc, 'r.')
    plt.plot(epochs, val_acc, 'r')
    plt.title('Training and validation accuracy')
    plt.figure()
    plt.plot(epochs, loss, 'r.')
    plt.plot(epochs, val_loss, 'r-')
    plt.title('Training and validation loss')
    plt.show()
# main func
if __name__ == "__main__":
    a = argparse.ArgumentParser()
    a.add_argument("--train_dir", default="train2")
    a.add_argument("--val_dir", default="test2")
    a.add_argument("--nb_epoch", default=NB_EPOCHS)
    a.add_argument("--batch_size", default=BAT_SIZE)
    a.add_argument("--output_model_file", default="inceptionv3-ft.model")
    a.add_argument("--plot", action="store_true")
    a.add_argument("--class_mode", default="binary")
    args = a.parse_args()
    if args.train_dir is None or args.val_dir is None:
        a.print_help()
        sys.exit(1)
    if args.class_mode != "binary" and args.class_mode != "categorical":
        print("set class_mode as 'binary' or 'categorical'")
    if args.class_mode == "categorical":
        loss_mode = "categorical_crossentropy"
    # set class_mode
    print("class_mode:{}, loss_mode:{}".format(args.class_mode, loss_mode))
    if (not os.path.exists(args.train_dir)) or (not os.path.exists(args.val_dir)):
        print("directories do not exist")
        sys.exit(1)
    train(args)
I had this problem on several tasks when the learning rate was too high. Try something like 0.0001 or even less.
According to the Keras documentation, the default rate is 0.001:
keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
See https://keras.io/optimizers/#rmsprop
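For instance, keeping the rest of the script unchanged, the fine-tuning compile call could be switched to an explicit, smaller learning rate (0.0001 is a starting point, not a tuned value):
from keras.optimizers import RMSprop

model.compile(optimizer=RMSprop(lr=0.0001), loss=loss_mode, metrics=['accuracy'])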
I found that if I use the SGD or Adam optimizer, the accuracy goes up normally. So is there something wrong with using the RMSprop optimizer with the default learning rate of 0.001?