train yolov3 by mmdetection with VOCstyle-dataset - deep-learning

When I try to train yolo3 with command
python mmdetection/tools/train.py config/yolo-darknet53.py
and everything goes ok until first epoch begin and error happens that
it says "ValueError: need at least one array to concatenate" I am searching for a long time on net. But no use. Please help or try to give some ideas how to achieve this.
here is the yolo3 configuration file
#!user/bin/env python3
# -*- coding: utf-8 -*-
_base_ = '../mmdetection/configs/_base_/default_runtime.py'
classes = ('hand',)
# model settings
model = dict(
type='YOLOV3',
pretrained='D:/workplace/srtp/handdetection/checkpoints/darknet53-a628ea1b.pth',
backbone=dict(type='Darknet', depth=53, out_indices=(3, 4, 5)),
neck=dict(
type='YOLOV3Neck',
num_scales=3,
in_channels=[1024, 512, 256],
out_channels=[512, 256, 128]),
bbox_head=dict(
type='YOLOV3Head',
num_classes=1,
in_channels=[512, 256, 128],
out_channels=[1024, 512, 256],
anchor_generator=dict(
type='YOLOAnchorGenerator',
base_sizes=[[(116, 90), (156, 198), (373, 326)],
[(30, 61), (62, 45), (59, 119)],
[(10, 13), (16, 30), (33, 23)]],
strides=[32, 16, 8]),
bbox_coder=dict(type='YOLOBBoxCoder'),
featmap_strides=[32, 16, 8],
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
loss_weight=1.0,
reduction='sum'),
loss_conf=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
loss_weight=1.0,
reduction='sum'),
loss_xy=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
loss_weight=2.0,
reduction='sum'),
loss_wh=dict(type='MSELoss', loss_weight=2.0, reduction='sum')))
# training and testing settings
train_cfg = dict(
assigner=dict(
type='GridAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0))
test_cfg = dict(
nms_pre=1000,
min_bbox_size=0,
score_thr=0.05,
conf_thr=0.005,
nms=dict(type='nms', iou_threshold=0.45),
max_per_img=100)
# dataset settings
dataset_type = 'VOCDataset'
dataset_root = 'D:/workplace/srtp/handdetection/VOC_HandDataSetVOC2007'
img_norm_cfg = dict(mean=[0, 0, 0], std=[255., 255., 255.], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile', to_float32=True),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='PhotoMetricDistortion'),
dict(
type='Expand',
mean=img_norm_cfg['mean'],
to_rgb=img_norm_cfg['to_rgb'],
ratio_range=(1, 2)),
dict(
type='MinIoURandomCrop',
min_ious=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
min_crop_size=0.3),
dict(type='Resize', img_scale=[(320, 320), (608, 608)], keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(608, 608),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img'])
])
]
data = dict(
samples_per_gpu=1,
workers_per_gpu=1,
train=dict(
type=dataset_type,
ann_file=f'{dataset_root}/ImageSets/Main/hand_train.txt',
img_prefix=f'{dataset_root}',
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=f'{dataset_root}/ImageSets/Main/hand_val.txt',
img_prefix=f'{dataset_root}',
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=f'{dataset_root}/ImageSets/Main/hand_test.txt',
img_prefix=f'{dataset_root}',
pipeline=test_pipeline))
# optimizer
optimizer = dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=2000, # same as burn-in in darknet
warmup_ratio=0.1,
step=[218, 246])
# runtime settings
total_epochs = 273
evaluation = dict(interval=1, metric=['bbox'])

Related

Why VGG16 model cannot be trained with its FC Layers

I am trying to train the VGG16 model code, but the loss is not optimized and seems that model's parameters are not updated.
here is the model :
import torch
import torch.nn as nn
import math
import torch.nn.functional as F
from utils import AvgPoolConv
cfg = {
'VGG11': [16, 'M', 32, 'M', 64, 64, 'M', 128, 128, 'M', 128, 128, 'M'],
'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],}
class VGG(nn.Module):
def __init__(self, vgg_name, use_bn, num_class=100):
super(VGG, self).__init__()
self.features = self._make_layers(cfg[vgg_name], use_bn)
self.classifier = nn.Sequential(
nn.Flatten(),
nn.Linear(512,4096),
nn.ReLU(inplace=True),
nn.Dropout(p=0.5),
nn.Linear(4096,4096),
nn.ReLU(inplace=True),
nn.Dropout(p=0.5),
nn.Linear(4096, num_class)
)
#self.classifier = nn.Linear(512, num_class)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
elif isinstance(m, nn.Linear):
n = m.weight.size(1)
m.weight.data.normal_(0, 1.0/float(n))
m.bias.data.zero_()
def forward(self, x):
out = self.features(x)
out = self.classifier(out)
return out
def _make_layers(self, cfg, use_bn=True):
layers = []
in_channels = 3
for x in cfg:
if x == 'M':
layers += [nn.AvgPool2d(2)]
#layers += [AvgPoolConv(kernel_size=2, stride=2, input_channel=in_channels)]
else:
layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1),
nn.BatchNorm2d(x) if use_bn else nn.Dropout(0.25),
nn.ReLU(inplace=True)]
in_channels = x
#layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
return nn.Sequential(*layers)
but if I delete the first 2 FC layers from the classifier as shown below, the model is trained and loss can be optimized ??
self.features = self._make_layers(cfg[vgg_name], use_bn)
self.classifier = nn.Linear(512, num_class)
Why this happens?
First, it would be good to verify if the parameters are really not updated or just that the change is small.
Different architectures might require different tuning (learning rate, weight decay if you use it etc.). A good thing to try when debugging is a test "can I overfit it"; use a single batch (or a single sample even) and check if you can get it to 0; you might need to tweak optimization parameters mentioned before.
Assuming everything is correct and the gradient flows, I'd say - tune the learning rate and try adding batch normalization between your linear and relu layers (should make the training much faster).

PyTorch: How to calculate output size of the CNN?

I went through this PyTorch CNN implementation available here: https://machinelearningknowledge.ai/pytorch-conv2d-explained-with-examples/
I am unable to understand how they replace the '?' with some value. What is the formula for calculating the CNN layer output?
This is essential to be calculated in PyTorch; not so in Tensorflow - Keras. If there is any other blog that explains this well, please drop it in the comments.
# Implementation of CNN/ConvNet Model
class CNN(torch.nn.Module):
def __init__(self):
super(CNN, self).__init__()
# L1 ImgIn shape=(?, 28, 28, 1)
# Conv -> (?, 28, 28, 32)
# Pool -> (?, 14, 14, 32)
self.layer1 = torch.nn.Sequential(
torch.nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
torch.nn.ReLU(),
torch.nn.MaxPool2d(kernel_size=2, stride=2),
torch.nn.Dropout(p=1 - keep_prob))
# L2 ImgIn shape=(?, 14, 14, 32)
# Conv ->(?, 14, 14, 64)
# Pool ->(?, 7, 7, 64)
self.layer2 = torch.nn.Sequential(
torch.nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
torch.nn.ReLU(),
torch.nn.MaxPool2d(kernel_size=2, stride=2),
torch.nn.Dropout(p=1 - keep_prob))
# L3 ImgIn shape=(?, 7, 7, 64)
# Conv ->(?, 7, 7, 128)
# Pool ->(?, 4, 4, 128)
self.layer3 = torch.nn.Sequential(
torch.nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
torch.nn.ReLU(),
torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=1),
torch.nn.Dropout(p=1 - keep_prob))
# L4 FC 4x4x128 inputs -> 625 outputs
self.fc1 = torch.nn.Linear(4 * 4 * 128, 625, bias=True)
torch.nn.init.xavier_uniform(self.fc1.weight)
self.layer4 = torch.nn.Sequential(
self.fc1,
torch.nn.ReLU(),
torch.nn.Dropout(p=1 - keep_prob))
# L5 Final FC 625 inputs -> 10 outputs
self.fc2 = torch.nn.Linear(625, 10, bias=True)
torch.nn.init.xavier_uniform_(self.fc2.weight) # initialize parameters
def forward(self, x):
out = self.layer1(x)
out = self.layer2(out)
out = self.layer3(out)
out = out.view(out.size(0), -1) # Flatten them for FC
out = self.fc1(out)
out = self.fc2(out)
return out
#instantiate CNN model
model = CNN()
model
Thanks!
I assume you calculation is wrong because:
Pytorch support images in format C * H * W (e.g. 3x32x32 not 32x32x3)
First dimension always batch dimension and must be omitted in calculation because, all nn.Modules handle it by default
So if you want calculate input size for first Linear layer, you can use this trick:
conv = nn.Sequential(self.layer1,self.layer2, self.layer3, nn.Flatten())
out = conv(torch.randn(1,im_height,im_width).unsqueeze(0))
# fc_layer_in_channels = out.shape[1]
self.fc1 = torch.nn.Linear(out.shape[1], 625, bias=True)
but only if you know im_height,im_width
The best practice is use torch.nn.AdaptiveAvgPool2d.
With this layer you always can get output of fixed spatial size.

Shall we lower case input data for (pre) training a BERT uncased model using huggingface?

Shall we lower case input data for (pre) training a BERT uncased model using huggingface? I looked into this response from Thomas Wolf (https://github.com/huggingface/transformers/issues/92#issuecomment-444677920) but not entirely sure if he meant that.
What happens if we lowercase the text ?
Tokenizer will take care of that.
A simple example:
import torch
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', max_length = 10, padding_side = 'right')
input_ids = torch.tensor(tokenizer.encode('this is a cat', add_special_tokens=True, max_length = 10, pad_to_max_length = True)).unsqueeze(0)
print(input_ids)
input_ids = torch.tensor(tokenizer.encode('This is a Cat', add_special_tokens=True, max_length = 10, pad_to_max_length = True)).unsqueeze(0)
print(input_ids)
Out:
tensor([[ 101, 2023, 2003, 1037, 4937, 102, 0, 0, 0, 0]])
tensor([[ 101, 2023, 2003, 1037, 4937, 102, 0, 0, 0, 0]])
But in case of cased,
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', max_length = 10, padding_side = 'right')
input_ids = torch.tensor(tokenizer.encode('this is a cat', add_special_tokens=True, max_length = 10, pad_to_max_length = True)).unsqueeze(0)
print(input_ids)
input_ids = torch.tensor(tokenizer.encode('This is a Cat', add_special_tokens=True, max_length = 10, pad_to_max_length = True)).unsqueeze(0)
print(input_ids)
tensor([[ 101, 1142, 1110, 170, 5855, 102, 0, 0, 0, 0]])
tensor([[ 101, 1188, 1110, 170, 8572, 102, 0, 0, 0, 0]])
I think the bert-base-uncased model will lower case the text irrespective of what you pass to the model. You can also try playing with a toy dataset and print the tokens using the BERT tokenizer so as to just confirm.

Combining Two CNN's

I Want to Combine Two CNN Into Just One In Keras, What I Mean Is that I Want The Neural Network To Take Two Images And Process Each One in Separate CNN, and Then Concatenate Them Together Into The Flattening Layer and Use Fully Connected Layer to Do The Last Work, Here What I Did:
# Start With First Branch ############################################################
branch_one = Sequential()
# Adding The Convolution
branch_one.add(Conv2D(32, (3,3),input_shape = (64,64,3) , activation = 'relu'))
branch_one.add(Conv2D(32, (3, 3), activation='relu'))
# Doing The Pooling Phase
branch_one.add(MaxPooling2D(pool_size=(2, 2)))
branch_one.add(Dropout(0.25))
branch_one.add(Flatten())
# Start With Second Branch ############################################################
branch_two = Sequential()
# Adding The Convolution
branch_two.add(Conv2D(32, (3,3),input_shape = (64,64,3) , activation = 'relu'))
branch_two.add(Conv2D(32, (3, 3), activation='relu'))
# Doing The Pooling Phase
branch_two.add(MaxPooling2D(pool_size=(2, 2)))
branch_two.add(Dropout(0.25))
branch_two.add(Flatten())
# Making The Combinition ##########################################################
final = Sequential()
final.add(Concatenate([branch_one, branch_two]))
final.add(Dense(units = 128, activation = "relu"))
final.add(Dense(units = 1, activation = "sigmoid"))
# Doing The Compilation
final.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
# Adding and Pushing The Images to CNN
# use ImageDataGenerator to preprocess the data
from keras.preprocessing.image import ImageDataGenerator
# augment the data that we have
train_datagen = ImageDataGenerator(rescale = 1./255,
shear_range = 0.2,
zoom_range = 0.2,
horizontal_flip = True)
test_datagen = ImageDataGenerator(rescale = 1./255)
# prepare training data
X1 = train_datagen.flow_from_directory('./ddsm1000_resized/images/train',
target_size = (64, 64),
batch_size = 32,
class_mode = 'binary')
X2 = train_datagen.flow_from_directory('./ddsm1000_resized_canny/images/train',
target_size = (64, 64),
batch_size = 32,
class_mode = 'binary')
# prepare test data
Y1 = test_datagen.flow_from_directory('./ddsm1000_resized/images/test',
target_size = (64, 64),
batch_size = 32,
class_mode = 'binary')
Y2 = test_datagen.flow_from_directory('./ddsm1000_resized_canny/images/test',
target_size = (64, 64),
batch_size = 32,
class_mode = 'binary')
final.fit_generator([X1, X2], steps_per_epoch = (8000 / 32), epochs = 1, validation_data = [Y1,Y2], validation_steps = 2000)
Keras Telling Me
RuntimeError: You must compile your model before using it.
I Think That is The CNN Does not the shapes of input data, so what Can I Do Here ?? Thanks
Make the change as pointed below:
from keras.layers import Merge
...
...
# Making The Combinition ##########################################################
final = Sequential()
final.add(Merge([branch_one, branch_two], mode = 'concat'))
...
...

Reshaping image data in Keras to match CNN requirements

I've created a CNN designed to recognize objects.
from keras.preprocessing.image import img_to_array, load_img
img = load_img('newimage.jpg')
x = img_to_array(img)
x = x.reshape( (1,) + x.shape )
scores = model.predict(x, verbose=1)
print(scores)
However I'm getting:
expected convolution2d_input_1 to have shape (None, 3, 108, 192) but got array with shape (1, 3, 192, 108)
My model:
def create_model():
model = Sequential()
model.add(Convolution2D(32, 3, 3, input_shape=(3, img_width, img_height)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Convolution2D(32, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Convolution2D(64, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy',
optimizer='rmsprop',
metrics=['accuracy'])
return model
I've looked at related answers and the documentation, but at a loss as to how to reshape the array to match what's expected?
I guess the problem is with setting up the image width and height. As the error says:
expected convolution2d_input_1 to have shape (None, 3, 108, 192) # expected width = 108 and height = 192
but got array with shape (1, 3, 192, 108) # width = 192, height = 108
Update: I tested your code with a small change and it worked!
I am giving just changed lines:
img_width, img_height = 960, 717
model.add(Convolution2D(32, 3, 3, input_shape=(img_height, img_width, 3)))
This is the main change - input_shape=(img_height, img_width, 3)
The image i used to run this code was of width = 960 and height = 717. I have updated my previous answer as some part of the answer was wrong! Sorry for that.