RuntimeError: only batches of spatial targets supported (3D tensors) but got targets of dimension: 4

I am having a hard time understanding image segmentation. I have implemented a UNet model for image segmentation, I am using the PASCAL VOC dataset, and I am trying to train my model. However, I got stuck when calculating the loss: I am unsure of what the expected shapes of the output and target should be. Can someone please educate me on what I am doing wrong? My only guess is that I am missing something when it comes to the ground-truth images, since I don't know how the model will learn which class is which. Thanks!
Here is my UNet class:
import torch
import torch.nn as nn
from torchvision import transforms


def x2conv(in_channels, out_channels):
    double_conv = nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=0),
        nn.ReLU(inplace=True),
        nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=0),
        nn.ReLU(inplace=True))
    return double_conv


class Encoder(nn.Module):
    def __init__(self, chs):
        super().__init__()
        self.enc_blocks = nn.ModuleList(
            [x2conv(chs[i], chs[i+1]) for i in range(len(chs)-1)])
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        ftrs = []
        for block in self.enc_blocks:
            x = block(x)
            ftrs.append(x)
            x = self.pool(x)
        return ftrs


class Decoder(nn.Module):
    def __init__(self, chs):
        super().__init__()
        self.chs = chs
        self.upconvs = nn.ModuleList(
            [nn.ConvTranspose2d(chs[i], chs[i+1], kernel_size=2, stride=2) for i in range(len(chs)-1)])
        self.dec_blocks = nn.ModuleList(
            [x2conv(chs[i], chs[i+1]) for i in range(len(chs)-1)])

    def forward(self, x, encoder_features):
        for i in range(len(self.chs)-1):
            x = self.upconvs[i](x)
            enc_ftrs = self.crop(encoder_features[i], x)
            x = torch.cat([x, enc_ftrs], dim=1)
            x = self.dec_blocks[i](x)
        return x

    def crop(self, enc_ftrs, x):
        _, _, H, W = x.shape
        enc_ftrs = transforms.CenterCrop([H, W])(enc_ftrs)
        return enc_ftrs


class UNet(nn.Module):
    def __init__(self, enc_chs, dec_chs, num_class):
        super(UNet, self).__init__()
        self.encoder = Encoder(enc_chs)
        self.decoder = Decoder(dec_chs)
        self.softmax = nn.Conv2d(dec_chs[-1], num_class, kernel_size=1)

    def forward(self, x):
        enc_ftrs = self.encoder(x)
        out = self.decoder(enc_ftrs[::-1][0], enc_ftrs[::-1][1:])
        out = self.softmax(out)
        return out
And here is my dataset class:
from PIL import Image
import torchvision
import torchvision.transforms as T  # needed for the T.Compose used in __getitem__
VOC_CLASSES = [ # How to use?
"background",
"aeroplane",
"bicycle",
"bird",
"boat",
"bottle",
"bus",
"car",
"cat",
"chair",
"cow",
"diningtable",
"dog",
"horse",
"motorbike",
"person",
"pottedplant",
"sheep",
"sofa",
"train",
"tvmonitor",
]
VOC_COLORMAP = [ # How to use?
[0, 0, 0], # Background
[128, 0, 0], # Aeroplane
[0, 128, 0], # Bicycle
[128, 128, 0], # Bird
[0, 0, 128], # Boat
[128, 0, 128], # Bottle
[0, 128, 128], # Bus
[128, 128, 128], # Car
[64, 0, 0], # Cat
[192, 0, 0], # Chair
[64, 128, 0], # Cow
[192, 128, 0], # Diningtable
[64, 0, 128], # Dog
[192, 0, 128], # Horse
[64, 128, 128], # Motorbike
[192, 128, 128], # Person
[0, 64, 0], # Pottedplant
[128, 64, 0], # Sheep
[0, 192, 0], # Sofa
[128, 192, 0], # Train
[0, 64, 128], # tvmonitor
]
class VocDataset(torchvision.datasets.VOCSegmentation):
    def __init__(self, image_set, transform, root="../data/VOCtrainval_11-May-2012/", download=False, year="2012"):
        self.transform = transform
        self.year = year
        super().__init__(root=root, image_set=image_set,
                         download=download, transform=transform, year=year)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        # open images and do transformation; img = jpg, mask = png
        img = Image.open(self.images[index]).convert("RGB")
        target = Image.open(self.masks[index]).convert("RGB")
        if self.transform:
            img = self.transform(img)
        trfm = T.Compose([T.ToTensor(), T.Resize((388, 388))])
        target = trfm(target)
        return img, target
And lastly, here is my train function:
import torch
import torch.nn as nn
import torch.optim as optim
from unet import UNet
from torch.utils.data import DataLoader
from dataset import VocDataset
import torchvision.transforms as T
import torch.nn.functional as F

# Hyperparameters etc.
STD = [0.2686, 0.2652, 0.2812]  # Std for dataset
MEAN = [0.4568, 0.4431, 0.4083]  # Mean for dataset
MOMENTUM = 0.9
LEARNING_RATE = 1e-4
BATCH_SIZE = 32
NUM_EPOCHS = 1
NUM_WORKERS = 2
NUM_CLASSES = 20
TRAIN_SET = "train"
VAL_SET = "val"
SIZE = (388, 388)  # input resize; matches the mask resize in the dataset
ENC_CHANNELS = (3, 64, 128, 256, 512, 1024)  # Encoder channels
DEC_CHANNELS = (1024, 512, 256, 128, 64)  # Decoder channels

TRANSFORM = T.Compose(
    [T.ToTensor(), T.Resize(SIZE), T.Normalize(MEAN, STD)]
)


def main():
    training_data = VocDataset(TRAIN_SET, TRANSFORM)
    train_dataloader = DataLoader(
        training_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, drop_last=True)

    # Create instance of unet
    unet = UNet(ENC_CHANNELS, DEC_CHANNELS, NUM_CLASSES)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(
        unet.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

    for epoch in range(NUM_EPOCHS):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(train_dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data  # Shape for labels and inputs are: [32, 3, 388, 388]
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = unet(inputs)  # output shape is [32, 32, 388, 388]
            loss = criterion(outputs, labels)  # Error here
            loss.backward()
            optimizer.step()
    # print('Finished Training')


if __name__ == "__main__":
    main()

For starters, your labels and outputs have different dimensions (32 vs. 3 channels). CrossEntropyLoss expects them either to have the same number of channels, or the target to drop the channel dimension entirely and hold integer values indicating the relevant class.
Let's work with the latter case. Here, we need to reduce the target to a single channel, i.e. [32 x 388 x 388] for your input and batch size. (Secondarily, the UNet should ideally have one output channel per class; VOC_CLASSES lists 21 classes, so you should change the final output layer of the UNet decoder to have 21 outputs.)
To convert the label of size [32 x 3 x 388 x 388] to [32 x 388 x 388], you need to use the colormap for the conversion. That is, create a new tensor target of size [32 x 388 x 388], and for each position target[i, j, k], assign the index into VOC_COLORMAP whose color matches the pixel values at label[i, :, j, k].
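Here is a minimal sketch of that conversion (assuming labels is the float RGB mask batch produced by the dataset above, with values in [0, 1] after ToTensor; rgb_mask_to_class_index is just a helper name chosen here):
import torch

def rgb_mask_to_class_index(label, colormap=VOC_COLORMAP):
    # label: [B, 3, H, W] float RGB in [0, 1] -> target: [B, H, W] long class ids
    rgb = (label * 255).round().long()
    target = torch.zeros(rgb.size(0), rgb.size(2), rgb.size(3), dtype=torch.long)
    for class_idx, color in enumerate(colormap):
        color = torch.tensor(color, dtype=torch.long).view(1, 3, 1, 1)
        target[(rgb == color).all(dim=1)] = class_idx  # pixels matching this color
    return target

loss = criterion(outputs, rgb_mask_to_class_index(labels))  # outputs: [32, 21, 388, 388]
One caveat: resizing the RGB mask with the default bilinear interpolation blends colors at object boundaries, producing pixels that match no colormap entry (they stay 0/background here). Resizing the mask with nearest-neighbor interpolation instead (T.Resize(..., interpolation=T.InterpolationMode.NEAREST) in recent torchvision) avoids that.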

Related

ValueError: Expected input batch_size (59) to match target batch_size (1)

I'm trying to build a semantic segmentation model with PyTorch. However, I am encountering this error and do not know how to fix it.
This is the model:
class SegmentationNN(pl.LightningModule):
    def __init__(self, num_classes=23, hparams=None):
        super().__init__()
        self.hparams = hparams
        self.model = models.alexnet(pretrained=True).features
        self.conv = nn.Conv2d(256, 3, kernel_size=1)
        self.upsample = nn.Upsample(size=(240, 240))

    def forward(self, x):
        print('Input:', x.shape)
        x = self.model(x)
        print('After Alexnet convs:', x.shape)
        x = self.conv(x)
        print('After 1-conv:', x.shape)
        x = self.upsample(x)
        print('After upsampling:', x.shape)
        return x

    def training_step(self, batch, batch_idx):
        images, targets = batch
        # targets = targets.view(targets.size(0), -1)
        out = self.forward(images)
        loss_func = nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
        loss = loss_func(out, targets.unsqueeze(0))
        tensorboard_logs = {'loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        images, targets = batch
        # targets = targets.view(targets.size(0), -1)
        out = self.forward(images)
        loss_func = nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
        loss = loss_func(out, targets.unsqueeze(0))
        tensorboard_logs = {'loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def configure_optimizers(self):
        optim = torch.optim.Adam(self.parameters(), lr=self.hparams['learning_rate'])
        return optim
And this is the training and fitting code:
train_dataloader = DataLoader(train_data, batch_size=hparams['batch_size'])
val_dataloader = DataLoader(val_data, batch_size=hparams['batch_size'])

trainer = pl.Trainer(
    max_epochs=50,
    gpus=1 if torch.cuda.is_available() else None
)

trainer.fit(model, train_dataloader, val_dataloader)
These are the sizes of the tensor after each layer:
Input: torch.Size([59, 3, 240, 240])
After Alexnet convs: torch.Size([59, 256, 6, 6])
After 1-conv: torch.Size([59, 3, 6, 6])
After upsampling: torch.Size([59, 3, 240, 240])
I am pretty much a beginner with PyTorch and PyTorch Lightning, so any advice would be appreciated!
You can delete the unsqueeze(0) part here: loss = loss_func(out, targets.unsqueeze(0)).
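To see why this helps, here is a quick shape check (a sketch, assuming targets is an integer class map of shape [59, 240, 240]). CrossEntropyLoss wants the input as [N, C, H, W] and the target as [N, H, W]; unsqueezing the target turns it into [1, 59, 240, 240], so the batch sizes (59 vs. 1) no longer match:
import torch
import torch.nn as nn

loss_func = nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
out = torch.randn(59, 3, 240, 240)             # [batch, classes, H, W]
targets = torch.randint(0, 3, (59, 240, 240))  # [batch, H, W], integer class ids

loss = loss_func(out, targets)                 # works: batch sizes match
# loss_func(out, targets.unsqueeze(0))         # raises the ValueError above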

Implement a Network in Network CNN model using pytorch-lightning

I am trying to implement a NiN (Network in Network) model, basically replicating the code from d2l. Here is my code.
import pandas as pd
import torch
from torch import nn
import torchmetrics
from torchvision import transforms
from torch.utils.data import DataLoader, random_split
import pytorch_lightning as pl
from torchvision.datasets import FashionMNIST
import wandb
from pytorch_lightning.loggers import WandbLogger

wandb.login()


## class definition
class Lightning_nin(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.accuracy = torchmetrics.Accuracy(top_k=1)
        self.model = nn.Sequential(
            self.nin_block(1, 96, kernel_size=11, strides=4, padding=0),
            nn.MaxPool2d(3, stride=2),
            self.nin_block(96, 256, kernel_size=5, strides=1, padding=2),
            nn.MaxPool2d(3, stride=2),
            self.nin_block(256, 384, kernel_size=3, strides=1, padding=1),
            nn.MaxPool2d(3, stride=2), nn.Dropout(0.5),
            # There are 10 label classes
            self.nin_block(384, 10, kernel_size=3, strides=1, padding=1),
            nn.AdaptiveAvgPool2d((1, 1)),
            # Transform the four-dimensional output into two-dimensional output
            # with a shape of (batch size, 10)
            nn.Flatten())
        for layer in self.model:
            if type(layer) == nn.Linear or type(layer) == nn.Conv2d:
                nn.init.xavier_uniform_(layer.weight)

    def nin_block(self, in_channels, out_channels, kernel_size, strides, padding):
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size, strides, padding),
            nn.ReLU(), nn.Conv2d(out_channels, out_channels, kernel_size=1),
            nn.ReLU(), nn.Conv2d(out_channels, out_channels, kernel_size=1),
            nn.ReLU())

    def forward(self, x):
        x = self.model(x)
        return x

    def loss_fn(self, logits, y):
        loss = nn.CrossEntropyLoss()
        return loss(logits, y)

    def training_step(self, train_batch, batch_idx):
        X, y = train_batch
        logits = self.forward(X)
        loss = self.loss_fn(logits, y)
        self.log('train_loss', loss)
        m = nn.Softmax(dim=1)
        output = m(logits)
        self.log('train_acc', self.accuracy(output, y))
        return loss

    def validation_step(self, val_batch, batch_idx):
        X, y = val_batch
        logits = self.forward(X)
        loss = self.loss_fn(logits, y)
        self.log('test_loss', loss)
        m = nn.Softmax(dim=1)
        output = m(logits)
        self.log('test_acc', self.accuracy(output, y))

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1)
        return optimizer


class Light_DataModule(pl.LightningDataModule):
    def __init__(self, resize=None):
        super().__init__()
        self.resize = resize  # store even when None so setup() can test it

    def setup(self, stage):
        # transforms for images
        trans = [transforms.ToTensor()]
        if self.resize:
            trans.insert(0, transforms.Resize(self.resize))
        trans = transforms.Compose(trans)
        # prepare transforms standard to MNIST
        self.mnist_train = FashionMNIST(root="../data", train=True, download=True, transform=trans)
        self.mnist_test = FashionMNIST(root="../data", train=False, download=True, transform=trans)

    def train_dataloader(self):
        return DataLoader(self.mnist_train, batch_size=128, shuffle=True, num_workers=4)

    def val_dataloader(self):
        return DataLoader(self.mnist_test, batch_size=128, num_workers=4)


## Train model
data_module = Light_DataModule(resize=224)
wandb_logger = WandbLogger(project="d2l", name='NIN')
model = Lightning_nin()
trainer = pl.Trainer(logger=wandb_logger, max_epochs=4, gpus=1, progress_bar_refresh_rate=1)
trainer.fit(model, data_module)
wandb.finish()
After running the code I am only getting an accuracy of 0.1, and I am not sure where I am going wrong. I have been able to implement other CNNs (like VGG) using the same template. The accuracy should be close to 0.9 after 10 epochs.
The kernel_size and strides are very big for an image size of 224; they drastically reduce the information that is passed on to subsequent layers. Try reducing them. Also, keep in mind that VGG was a very carefully designed architecture.
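To see how aggressively the first block shrinks the input, here is a quick shape trace (a sketch using the sizes from the question):
import torch
from torch import nn

x = torch.randn(1, 1, 224, 224)
first_block = nn.Sequential(
    nn.Conv2d(1, 96, kernel_size=11, stride=4, padding=0),  # -> [1, 96, 54, 54]
    nn.MaxPool2d(3, stride=2),                              # -> [1, 96, 26, 26]
)
print(first_block(x).shape)  # torch.Size([1, 96, 26, 26])
A single layer already discards most of the 224x224 spatial resolution; smaller kernels and strides keep more detail for the later blocks.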

Neural Network cannot overfit even one sample

I am using a neural network for a regression task.
My input is a gray image of size 100x70x1.
The gray area has a unique value of 60.
The input goes through a preprocessing layer, which multiplies every pixel value by 1./255.
My output is just three doubles: [0.87077969, 0.98989031, 0.98888382]
I used a ResNet152 model, as shown below:
import tensorflow as tf


class Bottleneck(tf.keras.Model):
    expansion = 4

    def __init__(self, in_channels, out_channels, strides=1):
        super(Bottleneck, self).__init__()
        self.conv1 = tf.keras.layers.Conv2D(out_channels, 1, 1, use_bias=False)
        self.bn1 = tf.keras.layers.BatchNormalization()
        self.conv2 = tf.keras.layers.Conv2D(out_channels, 3, strides, padding="same", use_bias=False)
        self.bn2 = tf.keras.layers.BatchNormalization()
        self.conv3 = tf.keras.layers.Conv2D(out_channels*self.expansion, 1, 1, use_bias=False)
        self.bn3 = tf.keras.layers.BatchNormalization()
        if strides != 1 or in_channels != self.expansion * out_channels:
            self.shortcut = tf.keras.Sequential([
                tf.keras.layers.Conv2D(self.expansion*out_channels, kernel_size=1,
                                       strides=strides, use_bias=False),
                tf.keras.layers.BatchNormalization()]
            )
        else:
            self.shortcut = lambda x, _: x

    def call(self, x, training=False):
        out = tf.nn.elu(self.bn1(self.conv1(x), training))
        out = tf.nn.elu(self.bn2(self.conv2(out), training))
        out = self.bn3(self.conv3(out), training)
        out += self.shortcut(x, training)
        return tf.nn.elu(out)


class ResNet(tf.keras.Model):
    def __init__(self, block, num_blocks):
        super(ResNet, self).__init__()
        self.in_channels = 64
        self.conv1 = tf.keras.layers.Conv2D(64, 7, 2, padding="same", use_bias=False)  # 60x60
        self.bn1 = tf.keras.layers.BatchNormalization()
        self.pool1 = tf.keras.layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same')  # 30x30
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.avg_pool2d = tf.keras.layers.GlobalAveragePooling2D()
        self.flatten = tf.keras.layers.Flatten()

    def _make_layer(self, block, out_channels, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, out_channels, stride))
            self.in_channels = out_channels * block.expansion
        return tf.keras.Sequential(layers)

    def call(self, x, training=False):
        out = self.pool1(tf.nn.elu(self.bn1(self.conv1(x), training)))
        out = self.layer1(out, training=training)
        out = self.layer2(out, training=training)
        out = self.layer3(out, training=training)
        out = self.layer4(out, training=training)
        # For classification
        out = self.flatten(out)
        # out = tf.keras.layers.Reshape((out.shape[-1],))(out)
        # out = self.linear(out)
        return out

    def model(self):
        x = tf.keras.layers.Input(shape=(100, 70, 1))
        return tf.keras.Model(inputs=[x], outputs=self.call(x))


def ResNet152():
    return ResNet(Bottleneck, [3, 8, 36, 3])
I used elu as the activation function and changed the GlobalAveragePooling layer into a flatten layer at the end of the ResNet.
Before the output I stacked two Dense layers (2048 units and 3 units) on top of the ResNet model.
For training I used the adam optimizer with an initial learning rate of 1e-4, which decays by a factor of 10 when the val_loss has not decreased for 3 epochs.
The loss is just the MSE error.
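For reference, the setup described above corresponds to something like this sketch (model, x, y stand for the question's model and single sample; the patience values and epoch count are assumptions):
import tensorflow as tf

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4), loss='mse')
callbacks = [
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                                         patience=3, min_lr=1e-8),
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10),
]
model.fit(x, y, validation_data=(x, y), epochs=1000, callbacks=callbacks)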
After early stopping, with the learning rate down to 1e-8, the MSE loss is still very high: 8.6225.
The prediction is [2.92318237, 5.53124916, 3.00686643], which is far from the ground truth: [0.87077969, 0.98989031, 0.98888382].
I don't know why such a deep network cannot overfit a single sample.
Is the reason that my input image carries too little information? Could someone help me?

RuntimeError: Expected 3-dimensional input for 3-dimensional weight [64, 512, 1], but got 2-dimensional input of size [4, 512] instead

Hello, below is the PyTorch model I am trying to run, with the error trace posted as well. It was running very well until I added the convolution layers. I am still new to deep learning and PyTorch, so I apologize if this is a silly question. I am using Conv1d, so why does Conv1d expect a 3-dimensional input? And the fact that it is getting a 2D input is also odd to me.
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(CROP_SIZE*CROP_SIZE*3, 512)
        self.conv1d1 = nn.Conv1d(in_channels=512, out_channels=64, kernel_size=1, stride=2)
        self.fc2 = nn.Linear(64, 128)
        self.conv1d2 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=1, stride=2)
        self.fc3 = nn.Linear(64, 256)
        self.conv1d3 = nn.Conv1d(in_channels=256, out_channels=64, kernel_size=1, stride=2)
        self.fc4 = nn.Linear(64, 256)
        self.fc4 = nn.Linear(256, 128)
        self.fc5 = nn.Linear(128, 64)
        self.fc6 = nn.Linear(64, 32)
        self.fc7 = nn.Linear(32, 64)
        self.fc8 = nn.Linear(64, frame['landmark_id'].nunique())

    def forward(self, x):
        x = F.relu(self.conv1d1(self.fc1(x)))
        x = F.relu(self.conv1d2(self.fc2(x)))
        x = F.relu(self.conv1d3(self.fc3(x)))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = F.relu(self.fc6(x))
        x = F.relu(self.fc7(x))
        x = self.fc8(x)
        return F.log_softmax(x, dim=1)


net = Net()

import torch.optim as optim

loss_function = nn.CrossEntropyLoss()
net.to(torch.device('cuda:0'))

for epoch in range(3):  # 3 full passes over the data
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    for data in tqdm(train_loader):  # `data` is a batch of data
        X = data['image'].to(device)  # X is the batch of features
        y = data['landmarks'].to(device)  # y is the batch of targets.
        optimizer.zero_grad()  # sets gradients to 0 before loss calc. You will do this likely every step.
        output = net(X.view(-1, CROP_SIZE*CROP_SIZE*3))  # pass in the reshaped batch
        # print(np.argmax(output))
        # print(y)
        loss = F.nll_loss(output, y)  # calc and grab the loss value
        loss.backward()  # apply this loss backwards thru the network's parameters
        optimizer.step()  # attempt to optimize weights to account for loss/gradients
    print(loss)  # print loss. We hope loss (a measure of wrong-ness) declines!
Error trace
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-42-f5ed7999ce57> in <module>
5 y = data['landmarks'].to(device) # y is the batch of targets.
6 optimizer.zero_grad() # sets gradients to 0 before loss calc. You will do this likely every step.
----> 7 output = net(X.view(-1,CROP_SIZE*CROP_SIZE*3)) # pass in the reshaped batch
8 # print(np.argmax(output))
9 # print(y)
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
<ipython-input-37-6d3e34d425a0> in forward(self, x)
16
17 def forward(self, x):
---> 18 x = F.relu(self.conv1d1(self.fc1(x)))
19 x = F.relu(self.conv1d2(self.fc2(x)))
20 x = F.relu(self.conv1d3(self.fc3(x)))
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/conv.py in forward(self, input)
210 _single(0), self.dilation, self.groups)
211 return F.conv1d(input, self.weight, self.bias, self.stride,
--> 212 self.padding, self.dilation, self.groups)
213
214
RuntimeError: Expected 3-dimensional input for 3-dimensional weight [64, 512, 1], but got 2-dimensional input of size [4, 512] instead
You should learn how convolutions work (e.g. see this answer) and some neural network basics (this tutorial from PyTorch).
Basically, Conv1d expects inputs of shape [batch, channels, features] (where features can be some timesteps and can vary, see example).
nn.Linear expects shape [batch, features] as it is fully connected and each input feature is connected to each output feature.
You can verify those shapes by yourself, for torch.nn.Linear:
import torch
layer = torch.nn.Linear(20, 10)
data = torch.randn(64, 20) # [batch, in_features]
layer(data).shape # [64, 10], [batch, out_features]
For Conv1d:
layer = torch.nn.Conv1d(in_channels=20, out_channels=10, kernel_size=3, padding=1)
data = torch.randn(64, 20, 15) # [batch, channels, timesteps]
layer(data).shape # [64, 10, 15], [batch, out_channels, timesteps]
layer(torch.randn(32, 20, 25)).shape # [32, 10, 25]
BTW. As you are working with images, you should use torch.nn.Conv2d instead.
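The 2D version follows the same convention, just with two spatial axes (a sketch mirroring the examples above):
layer = torch.nn.Conv2d(in_channels=3, out_channels=10, kernel_size=3, padding=1)
data = torch.randn(64, 3, 28, 28)  # [batch, channels, height, width]
layer(data).shape  # [64, 10, 28, 28], [batch, out_channels, height, width]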
Most of the PyTorch functions work on batched data, i.e. they accept input of size (batch_size, shape). @Szymon Maszke already posted an answer related to that.
So in your case, you can use the unsqueeze and squeeze functions to add and remove extra dimensions.
Here's the sample code:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(100, 512)
        self.conv1d1 = nn.Conv1d(in_channels=512, out_channels=64, kernel_size=1, stride=2)
        self.fc2 = nn.Linear(64, 128)

    def forward(self, x):
        x = self.fc1(x)          # [batch, 512]
        x = x.unsqueeze(dim=2)   # [batch, 512, 1]: add a length-1 feature axis for Conv1d
        x = F.relu(self.conv1d1(x))
        x = x.squeeze()          # back to [batch, 64]
        x = self.fc2(x)
        return x


net = Net()
bsize = 4
inp = torch.randn((bsize, 100))
out = net(inp)
print(out.shape)  # torch.Size([4, 128])

Why is a bottleneck structure slower and more memory-costly than standard convolution when training a network?

I'm using a V-Net to train a model. I want to train the model faster with less memory, so I replaced the standard 3x3 convolution with a combination of [1x1, 3x3, 1x1] convolutions, where the first 1x1 conv reduces the channels to 1/N to cut the memory cost. The code is below.
The first two classes are the bottleneck structure and the standard convolution. When I replace the standard convolution with the bottleneck structure, although the model size and FLOPs decrease, the actual GPU memory cost and training time increase.
For example, I got:
Using standard convolution..........
Total parameters : 10,052,609 float, model size : 39,268.00390625M
191.78 GFLOPs
end : 10.62517523765564s
Max memory allocated : 3818.25341796875M
Using bottleneck...........
Total parameters : 1,145,061 float, model size : 4,472.89453125M
16.05 GFLOPs
end : 16.890745162963867s
Max memory allocated : 4408.35107421875 M
However, in the inference stage, the bottleneck structure does accelerate the network to some extent.
Does anyone know why this happens, and how to accelerate the network in both the training and inference stages?
Code :
import torch
import torch.nn as nn
import torch.nn.functional as F


def groupNorm(channel, num_groups=16):
    return nn.GroupNorm(num_groups=num_groups, num_channels=channel)


Norm = nn.BatchNorm3d
BottleNeck_Ratio = 4


class BottleNeck(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, drop, stride=1, padding=1, N=BottleNeck_Ratio):
        super(BottleNeck, self).__init__()
        self.conv_1 = nn.Conv3d(in_channels=in_channels, out_channels=out_channels // N, kernel_size=1, stride=1)
        self.conv_2 = nn.Conv3d(in_channels=out_channels // N, out_channels=out_channels // N, kernel_size=kernel_size,
                                stride=stride, padding=padding)
        self.conv_3 = nn.Conv3d(in_channels=out_channels // N, out_channels=out_channels, kernel_size=1, stride=1)
        self.norm = Norm(out_channels)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout3d(drop)

    def forward(self, input):
        x = self.conv_1(input)
        x = self.conv_2(x)
        x = self.conv_3(x)
        return self.drop(self.relu(self.norm(x)))


class CBR(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, drop, stride=1, padding=1):
        super(CBR, self).__init__()
        self.conv = nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size,
                              stride=stride, padding=padding)
        self.norm = Norm(out_channels)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout3d(drop)

    def forward(self, input):
        return self.drop(self.relu(self.norm(self.conv(input))))


ConvBnReluDrop = BottleNeck


class ResidualDown(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, drop, conv_nums, down=True):
        super(ResidualDown, self).__init__()
        if down:
            self.down = ConvBnReluDrop(in_channels, out_channels, kernel_size=2, stride=2, padding=0, drop=drop)
        else:
            self.down = ConvBnReluDrop(in_channels, out_channels, kernel_size=3, stride=1, padding=1, drop=drop)
        self.convs = nn.ModuleList()
        for i in range(conv_nums):
            self.convs.append(ConvBnReluDrop(out_channels, out_channels, kernel_size, drop))
        self.has_down = down

    def forward(self, x):
        # downsample
        res = self.down(x)
        # convolution
        out = res
        for conv in self.convs:
            out = conv(out)
        # residual
        return out + res


class ResidualUp(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, drop, conv_nums, up=True):
        super(ResidualUp, self).__init__()
        if up:
            self.deconv = nn.ConvTranspose3d(in_channels, out_channels, kernel_size=2, stride=2)
        else:
            self.deconv = ConvBnReluDrop(in_channels, out_channels, kernel_size=3, stride=1, padding=1, drop=drop)
        self.convs = nn.ModuleList()
        self.convs.append(ConvBnReluDrop(2 * out_channels, out_channels, kernel_size, drop))
        for i in range(conv_nums - 1):
            self.convs.append(ConvBnReluDrop(out_channels, out_channels, kernel_size, drop))

    def forward(self, big, small):
        x = self.deconv(small)
        # interpolate to prevent size mismatch
        x = F.interpolate(x, big.size()[-3:], mode='trilinear', align_corners=False)
        # save x as residual, [out_ch]
        res = x
        # skip connection, concat and conv to small's channel
        # [2*out_ch] => [out_ch]
        x = torch.cat([big, x], 1)
        for conv in self.convs:
            x = conv(x)
        return x + res


class VBNet(nn.Module):
    def __init__(self, in_ch=1, nclass=1, drop=0.01, level=5, bn='batch', bottleneck=False):
        super(VBNet, self).__init__()
        # levels
        self.level = level
        # Normalization layer
        global Norm
        if bn == 'batch':
            Norm = nn.BatchNorm3d
        elif bn == 'group':
            Norm = groupNorm
        # elif bn == 'syncbn':
        #     Norm = SyncBN3d
        else:
            raise Exception("Error for bn")
        global ConvBnReluDrop
        if bottleneck:
            ConvBnReluDrop = BottleNeck
        else:
            ConvBnReluDrop = CBR
        # down 2
        self.downs = nn.ModuleList()
        self.downs.append(ResidualDown(in_ch, 16, 3, drop, 1, False))
        self.downs.append(ResidualDown(16, 32, 3, drop, 2))
        # down layers
        channels = 32
        for i in range(level - 2):
            self.downs.append(ResidualDown(channels, channels * 2, 3, drop, 3))
            channels *= 2
        # up layers
        self.ups = nn.ModuleList()
        for i in range(level - 3):
            self.ups.append(ResidualUp(channels, channels // 2, 3, drop, 3))
            channels = channels // 2
        # up 2
        self.ups.append(ResidualUp(channels, channels // 2, 3, drop, 2))
        channels = channels // 2
        self.ups.append(ResidualUp(channels, channels // 2, 3, drop, 1, False))
        channels = channels // 2
        # classifier
        self.classifier = nn.Conv3d(channels, nclass, kernel_size=1)

    def forward(self, x):  # 4,472.89453125M
        outs = []
        for layer in self.downs:
            x = layer(x)
            outs.append(x)
        small = outs[-1]
        for i in range(len(self.ups)):
            layer = self.ups[i]
            big = outs[self.level - i - 2]
            small = layer(big, small)
        out = self.classifier(small)
        return out


def get_net_size(net):
    params = list(net.parameters())
    k = 0
    for i in params:
        l = 1
        for j in i.size():
            l *= j
        k = k + l
    s = "Total parameters : {:,} float, model size : {:,}M".format(k, k * 4 / 1024)
    return s


if __name__ == '__main__':
    # count_ops is taken from : https://github.com/1adrianb/pytorch-estimate-flops/blob/master/pthflops/ops.py
    from count_ops import count_ops
    import os
    import time

    os.environ['CUDA_VISIBLE_DEVICES'] = '1'

    # 4003728896
    print("Using standard convolution..........")
    a = torch.randn(6, 1, 32, 128, 128)
    net = VBNet(bn='batch', bottleneck=False)
    print(get_net_size(net))
    print(count_ops(net, a))
    net = net.cuda()
    start = time.time()
    for i in range(10):
        a = torch.randn(6, 1, 32, 128, 128).cuda()
        b = net(a)
        b.sum().backward()
    print('end : {}s'.format(time.time() - start))
    print("Max memory allocated : {}M".format(torch.cuda.max_memory_allocated(0) / (1024.**2)))

    # 4543840768 4622491136
    print("\nUsing bottleneck...........")
    # torch.cuda.reset_max_memory_allocated(0)
    a = torch.randn(6, 1, 32, 128, 128)
    net = VBNet(bn='batch', bottleneck=True)
    print(get_net_size(net))
    print(count_ops(net, a))
    net = net.cuda()
    start = time.time()
    for i in range(10):
        a = torch.randn(6, 1, 32, 128, 128).cuda()
        b = net(a)
        b.sum().backward()
    print('end : {}s'.format(time.time() - start))
    print("Max memory allocated : {} M".format(torch.cuda.max_memory_allocated(0) / (1024.**2)))
I compared three convolutions (standard convolution, bottleneck structure, and separable convolution) and got the following performance results:
For standard Convolution :
Total parameters : 13920 float, model size : 54.3750M
2.75 GFLOPs, for input size : (6, 16, 32, 32, 32)
-------------------Train analyze----------------
total train time : 8.0517 s
Total iteration : 250
mean forward time : 0.0003 s
mean backward time : 0.0007 s
Max memory allocated : 120.1846 M
-------------------Test analyze----------------
total test time : 7.6900 s
Total iteration : 250
mean data time : 0.0305 s
mean forward time : 0.0003 s
Max memory allocated : 72.1826 M
For bottleneck :
Total parameters : 7872 float, model size : 30.7500M
1.56 GFLOPs, for input size : (6, 16, 32, 32, 32)
-------------------Train analyze----------------
total train time : 8.7080 s
Total iteration : 250
mean forward time : 0.0009 s
mean backward time : 0.0016 s
Max memory allocated : 168.0767 M
-------------------Test analyze----------------
total test time : 8.8901 s
Total iteration : 250
mean data time : 0.0348 s
mean forward time : 0.0008 s
Max memory allocated : 72.0728 M
For Separable Convolution :
Total parameters : 1088 float, model size : 4.2500M
0.23 GFLOPs, for input size : (6, 16, 32, 32, 32)
-------------------Train analyze----------------
total train time : 8.3567 s
Total iteration : 250
mean forward time : 0.0009 s
mean backward time : 0.0014 s
Max memory allocated : 144.2021 M
-------------------Test analyze----------------
total test time : 7.9258 s
Total iteration : 250
mean data time : 0.0309 s
mean forward time : 0.0008 s
Max memory allocated : 72.1992 M
We can see that the standard convolution is roughly twice as fast as the bottleneck structure and the separable convolution, and its memory cost is no larger than that of the other two methods.
I guess the reason is that during the forward and backward passes of training, the bottleneck and separable structures, which contain more convolution modules, use more memory to save inputs for backpropagation, and they also perform more convolution operations than the standard convolution. So neither the memory cost nor the speed of these two structures can surpass the standard convolution.
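To make that concrete, here is a back-of-the-envelope count of the activations saved for backward in one block, for an input of [6, 16, 32, 32, 32] and 32 output channels with N=2, charging one saved tensor per conv/norm/ReLU (real autograd bookkeeping differs in detail):
def elems(channels):
    # elements of one activation tensor [6, channels, 32, 32, 32]
    return 6 * channels * 32 * 32 * 32

standard = 3 * elems(32)                    # conv -> norm -> ReLU, all at 32 channels
bottleneck = 6 * elems(16) + 3 * elems(32)  # two triples at 16 channels, one at 32
print(bottleneck / standard)                # 2.0 -> roughly twice the activations kept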
Another reason why the separable convolution is slower could be that the cuDNN library doesn't directly support depthwise separable convolutions.
But these two structures do reduce the model size dramatically compared to the standard convolution, which is very useful for mobile devices.
The code is below.
The three different convolutions:
import torch
import torch.nn as nn
from analyze_network_performance import analyze_network_performance
import functools

Norm = nn.BatchNorm3d


class CBRSeq(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=1, N=2):
        super(CBRSeq, self).__init__()
        self.seq = nn.Sequential(
            nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding),
            Norm(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, input):
        return self.seq(input)


class BottleNeckSeq(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=1, N=2):
        super(BottleNeckSeq, self).__init__()
        self.seq = nn.Sequential(
            nn.Conv3d(in_channels=in_channels, out_channels=out_channels//N, kernel_size=1, stride=1),
            Norm(out_channels//N),
            nn.ReLU(inplace=True),
            nn.Conv3d(in_channels=out_channels//N, out_channels=out_channels//N, kernel_size=kernel_size, stride=stride, padding=padding),
            Norm(out_channels//N),
            nn.ReLU(inplace=True),
            nn.Conv3d(in_channels=out_channels//N, out_channels=out_channels, kernel_size=1),
            Norm(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, input):
        return self.seq(input)


class GroupSeq(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=1, N=2):
        super(GroupSeq, self).__init__()
        self.seq = nn.Sequential(
            nn.Conv3d(in_channels=in_channels, out_channels=in_channels, groups=in_channels,
                      kernel_size=kernel_size, stride=stride, padding=padding),
            Norm(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=1),
            Norm(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, input):
        return self.seq(input)


def test_bottleneck():
    data_gen = functools.partial(torch.randn, 6, 16, 32, 32, 32)
    a = BottleNeckSeq(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
    b = CBRSeq(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
    c = GroupSeq(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
    print('BottleNeck Structure ....')
    analyze_network_performance(a, data_gen, train_time=250, test_time=250)
    print('\nStandard Convolution ....')
    analyze_network_performance(b, data_gen, train_time=250, test_time=250)
    print('\nSeparable Convolution ...')
    analyze_network_performance(c, data_gen, train_time=250, test_time=250)


if __name__ == '__main__':
    test_bottleneck()
The analyze_network_performance code:
import time
# count_ops is taken from : https://github.com/1adrianb/pytorch-estimate-flops/blob/master/pthflops/ops.py
from ops import count_ops
import torch
import numpy as np


def get_net_size(net):
    params = list(net.parameters())
    k = 0
    for i in params:
        l = 1
        for j in i.size():
            l *= j
        k = k + l
    s = "Total parameters : {:} float, model size : {:.4f}M".format(k, k * 4 / 1024)
    return s


class Timer(object):
    def __init__(self, verbose=False):
        self.start_time = time.time()
        self.verbose = verbose
        self.duration = 0

    def restart(self):
        self.duration = self.start_time = time.time()
        return self.duration

    def stop(self):
        return time.time() - self.start_time

    def get_last_duration(self):
        return self.duration

    def __enter__(self):
        self.restart()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.duration = self.stop()
        if self.verbose:
            print('{:^.4f} s'.format(self.stop()))


def to_cuda(data, device):
    if device < 0:
        return data
    else:
        return data.cuda(device)


def network_train_analyze(net, data_generate_func, cuda=0, train_time=10, forward_verbose=False):
    t1 = Timer(verbose=True)
    t2 = Timer(forward_verbose)
    t3 = Timer(verbose=False)
    if cuda >= 0:
        torch.cuda.reset_max_memory_allocated(cuda)
    forward_times = []
    backward_times = []
    with t1:
        for i in range(train_time):
            a = to_cuda(data_generate_func(), cuda)
            with t3:
                b = net(a)
            if forward_verbose:
                print('forward : ', end='')
            forward_times.append(t3.get_last_duration())
            with t2:
                b.sum().backward()
            if forward_verbose:
                print('backward : ', end='')
            backward_times.append(t2.get_last_duration())
        print('total train time : ', end='')
    print("Total iteration : {}".format(train_time))
    print('mean forward time : {:^.4f} s'.format(np.mean(forward_times[1:])))
    print('mean backward time : {:^.4f} s'.format(np.mean(backward_times[1:])))
    if cuda >= 0:
        print("Max memory allocated : {:^.4f} M".format(torch.cuda.max_memory_allocated(cuda) / (1024.**2)))


def network_test_analyze(net, data_generate_func, cuda=0, test_time=50, forward_verbose=False):
    t1 = Timer(verbose=True)
    t2 = Timer(verbose=forward_verbose)
    t3 = Timer(verbose=False)
    if cuda >= 0:
        torch.cuda.reset_max_memory_allocated(cuda)
    forward_times = []
    data_times = []
    with t1:
        with torch.no_grad():
            for i in range(test_time):
                with t3:
                    a = to_cuda(data_generate_func(), cuda)
                data_times.append(t3.get_last_duration())
                with t2:
                    net(a)
                if forward_verbose:
                    print('forward : ', end='')
                forward_times.append(t2.get_last_duration())
        print('total test time : ', end='')
    print("Total iteration : {}".format(test_time))
    print('mean data time : {:^.4f} s'.format(np.mean(data_times[1:])))
    print('mean forward time : {:^.4f} s'.format(np.mean(forward_times[1:])))
    if cuda >= 0:
        print("Max memory allocated : {:^.4f} M".format(torch.cuda.max_memory_allocated(cuda) / (1024.**2)))


def analyze_network_performance(net, data_generate_func, cuda=0, train_time=10, test_time=20, forward_verbose=False):
    print('============ Analyzing network performance ==============')
    print(get_net_size(net))
    net = to_cuda(net, cuda)
    a = data_generate_func()
    a = to_cuda(a, cuda)
    print(count_ops(net, a))
    print('-------------------Train analyze----------------')
    network_train_analyze(net, data_generate_func, cuda, train_time, forward_verbose)
    print('-------------------Test analyze----------------')
    network_test_analyze(net, data_generate_func, cuda, test_time, forward_verbose)