I am using a CNN to extract features from temporal data of different lengths. I use pad_sequence to pad the data in a batch, but because the maximum length changes from batch to batch, the padded sequence length differs per batch. This creates errors when I flatten the data for the FCN layer (as the dimension of the flattened vector changes). I am currently handling this by placing an adaptive average pooling layer before the FCN layers; since it always pools to a fixed output size, it fixes the input dimension for the FCN. However, I am not sure whether this is the correct thing to do.
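To illustrate what I mean, here is a minimal standalone check (toy shapes; the 24 channels and 5x5 output match the 600 inputs of the fc layer in the full code below): nn.AdaptiveAvgPool2d always pools to the requested output size, so the flattened dimension stays the same no matter how long the padded sequence is.
import torch
import torch.nn as nn

pool = nn.AdaptiveAvgPool2d((5, 5))      # always outputs a 5x5 map per channel
for length in (80, 130, 200):            # padded lengths that differ per batch
    x = torch.randn(4, 24, length, 10)   # (batch, channels, padded time, features)
    flat = torch.flatten(pool(x), 1)
    print(length, flat.shape)            # torch.Size([4, 600]) every time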
My full code is:
##pad tensors
def pad_collate(batch):
sequences = [item[0] for item in batch]
lengths = [len(seq) for seq in sequences]
padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)
return padded_sequences, lengths
## Create dataloader
trainData = Sequence(root = path)
trainDataLoader = DataLoader(trainData, batch_size = BATCH_SIZE, collate_fn= pad_collate)
## CNN model
class FeatureExtractor(nn.Module):
def __init__(self, block, layers):
super(FeatureExtractor, self).__init__()
self.inplanes = 6
## 1st CONV layers
self.conv1 = nn.Conv2d(in_channels = 1, out_channels = 6, kernel_size = 3, stride = 2, padding = 4)
self.bn1 = nn.BatchNorm2d(6)
self.relu1 = nn.ReLU()
self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride = 2, padding = 1)
## residual blocks
self.layer0 = self._make_layer(block, 12, layers[0], stride = 1)
self.layer1 = self._make_layer(block, 24, layers[1], stride = 2)
self.avgpool = nn.AdaptiveAvgPool2d((5,5)) ##### MY CURRENT SOLUTION #####
self.fc = nn.Linear(600, 128)
def _make_layer(self, block, planes, blocks, stride):
downsample = None
if stride != 1 or self.inplanes != planes:
downsample = nn.Sequential(nn.Conv2d(self.inplanes, planes, kernel_size=1, stride=stride),
nn.BatchNorm2d(planes))
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
## first conv
x = self.conv1(x)
x = self.bn1(x)
x = self.relu1(x)
x = self.maxpool1(x)
## conv blocks
x = self.layer0(x)
x = self.layer1(x)
##FCN layer
x = self.avgpool(x)
x = torch.flatten(x, 1)
output = self.fc(x)
return output
Any other comments are also welcome (I am self-taught).
I'm trying to build a semantic segmentation model with PyTorch. However, I encounter an error that I do not know how to fix.
This is the model:
class SegmentationNN(pl.LightningModule):
def __init__(self, num_classes=23, hparams=None):
super().__init__()
self.hparams = hparams
self.model=models.alexnet(pretrained=True).features
self.conv=nn.Conv2d(256, 3, kernel_size=1)
self.upsample = nn.Upsample(size=(240,240))
def forward(self, x):
print('Input:', x.shape)
x = self.model(x)
print('After Alexnet convs:', x.shape)
x = self.conv(x)
print('After 1-conv:', x.shape)
x = self.upsample(x)
print('After upsampling:', x.shape)
return x
def training_step(self, batch, batch_idx):
images, targets = batch
# targets = targets.view(targets.size(0), -1)
out = self.forward(images)
loss_func = nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
loss = loss_func(out, targets.unsqueeze(0))
tensorboard_logs = {'loss': loss}
return {'loss': loss, 'log':tensorboard_logs}
def validation_step(self, batch, batch_idx):
images, targets = batch
# targets = targets.view(targets.size(0), -1)
out = self.forward(images)
loss_func = nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
loss = loss_func(out, targets.unsqueeze(0))
tensorboard_logs = {'loss': loss}
return {'loss': loss, 'log':tensorboard_logs}
def configure_optimizers(self):
optim = torch.optim.Adam(self.parameters(), lr=self.hparams['learning_rate'])
return optim
And this is the training and fit:
train_dataloader = DataLoader(train_data, batch_size=hparams['batch_size'])
val_dataloader = DataLoader(val_data, batch_size=hparams['batch_size'])
trainer = pl.Trainer(
max_epochs=50,
gpus=1 if torch.cuda.is_available() else None
)
pass
trainer.fit(model, train_dataloader, val_dataloader)
These are the sizes of the tensor after each layer:
Input: torch.Size([59, 3, 240, 240])
After Alexnet convs: torch.Size([59, 256, 6, 6])
After 1-conv: torch.Size([59, 3, 6, 6])
After upsampling: torch.Size([59, 3, 240, 240])
I am pretty much a beginner with PyTorch and PyTorch Lightning, so any advice would be appreciated!
Can you delete the unsqueeze(0) part here: loss = loss_func(out, targets.unsqueeze(0))?
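For segmentation, nn.CrossEntropyLoss expects the prediction as (N, C, H, W) and the target as (N, H, W) of integer class indices; your forward already returns (59, 3, 240, 240), so the target should stay (59, 240, 240), and unsqueeze(0) adds an extra leading dimension that breaks this. A minimal sketch, with shapes assumed from your printout:
import torch
import torch.nn as nn

loss_func = nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
out = torch.randn(59, 3, 240, 240)             # (N, C, H, W) logits from the network
targets = torch.randint(0, 3, (59, 240, 240))  # (N, H, W) integer class labels
loss = loss_func(out, targets)                 # no unsqueeze needed
print(loss)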
Is it possible to implement this concept with the GAN algorithm?
I want the GAN to generate a regression output (G-Value) of shape (4,) from the real image, not from random noise, and to have the discriminator compare that G-Value with the real regression value (R-Value) of the same shape (4,). The R-Value comes from the "y-train" dataset.
The idea is that if an image has, for example, a circular pattern, it generally has four features: position x, y, z, and alpha. I call this the Real Value (R-Value), and I want the GAN to generate a fake value (G-Value) that fools the discriminator.
I have tried to implement it as below.
class UTModel:
def __init__(self):
optimizer__ = Adam(2e-4)
self.__dropout = .3
self.optimizerGenerator = Adam(1e-4)
self.optimizerDiscriminator = Adam(1e-4)
self.generator, self.discriminator = self.build()
def build(self):
# build the generator
g = Sequential()
g.add(Conv2D(512, kernel_size=3, strides=2, input_shape=(128, 128, 1), padding='same'))
g.add(BatchNormalization(momentum=0.8))
g.add(LeakyReLU(alpha=0.2))
g.add(Dropout(self.__dropout))
g.add(Conv2D(256, kernel_size=3, strides=2, padding='same'))
g.add(BatchNormalization(momentum=0.8))
g.add(LeakyReLU(alpha=0.2))
g.add(Dropout(self.__dropout))
g.add(Conv2D(128, kernel_size=3, strides=2, padding='same'))
g.add(BatchNormalization(momentum=0.8))
g.add(LeakyReLU(alpha=0.2))
g.add(Dropout(self.__dropout))
g.add(Conv2D(64, kernel_size=3, strides=1, padding='same'))
g.add(BatchNormalization(momentum=0.8))
g.add(LeakyReLU(alpha=0.2))
g.add(Dropout(self.__dropout))
g.add(Flatten())
g.add(Dense(4, activation='linear'))
# build the discriminator
d = Sequential()
d.add(Dense(128, input_shape=(4,)))
d.add(LeakyReLU(alpha=0.2))
d.add(Dropout(self.__dropout))
d.add(Dense(64))
d.add(LeakyReLU(alpha=0.2))
d.add(Dropout(self.__dropout))
d.add(Dense(64))
d.add(LeakyReLU(alpha=0.2))
d.add(Dropout(self.__dropout))
d.add(Dense(32))
d.add(LeakyReLU(alpha=0.2))
d.add(Dropout(self.__dropout))
d.add(Dense(1, activation='sigmoid'))
return g, d
def computeLosses(self, rValid, fValid):
bce = BinaryCrossentropy(from_logits=True)
# Discriminator loss
rLoss = bce(tf.ones_like(rValid), rValid)
fLoss = bce(tf.zeros_like(fValid), fValid)
dLoss = rLoss + fLoss
# Generator loss
gLoss = bce(tf.zeros_like(fValid), fValid)
return dLoss, gLoss
def train(self, images, rValues):
with tf.GradientTape() as gTape, tf.GradientTape() as dTape:
gValues = self.generator(images, training=True)
rValid = self.discriminator(rValues, training=True)
fValid = self.discriminator(gValues, training=True)
dLoss, gLoss = self.computeLosses(rValid, fValid)
dGradients = dTape.gradient(dLoss, self.discriminator.trainable_variables)
gGradients = gTape.gradient(gLoss, self.generator.trainable_variables)
self.optimizerDiscriminator.apply_gradients(zip(dGradients, self.discriminator.trainable_variables))
self.optimizerGenerator.apply_gradients(zip(gGradients, self.generator.trainable_variables))
print (dLoss, gLoss)
class UTTrainer:
def __init__(self):
self.env = 3DPatterns()
self.model = UTModel()
def start(self):
if not self.env.available:
return
batch = 32
for epoch in range(1):
# set new episod
while self.env.setEpisod():
for i in range(0, self.env.episodelen, batch):
self.model.train(self.env.episode[i:i+batch], self.env.y[i:i+batch])
But the G-Values are not generated as valid values; they always converge to 1 or -1. A proper value should look like [-0.192798, 0.212887, -0.034519, -0.015000]. Please help me find the right way.
Thank you.
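For reference, in the standard (non-saturating) GAN formulation the generator loss is computed against ones_like on the fake predictions, i.e. the generator is rewarded when the discriminator calls its output real. A minimal sketch of that convention (my assumption of the usual setup, with from_logits matching a sigmoid-ended discriminator):
import tensorflow as tf

bce = tf.keras.losses.BinaryCrossentropy()   # from_logits=False because the discriminator ends in a sigmoid

def gan_losses(rValid, fValid):
    # discriminator: push real predictions toward 1 and fake predictions toward 0
    dLoss = bce(tf.ones_like(rValid), rValid) + bce(tf.zeros_like(fValid), fValid)
    # generator: push the discriminator's prediction on fakes toward 1
    gLoss = bce(tf.ones_like(fValid), fValid)
    return dLoss, gLoss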
I am working with TensorFlow 2 and I am trying to implement max unpooling with indices in order to implement SegNet.
When I run it I run into a problem. I define MaxUnpooling2D as a function and then call it in the model. I suppose the problem is caused by the fact that updates and mask have shape (None, H, W, ch).
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Conv2D, BatchNormalization, LeakyReLU, Input
from tensorflow.keras.models import Model
def MaxUnpooling2D(updates, mask):
size = 2
mask = tf.cast(mask, 'int32')
input_shape = tf.shape(updates, out_type='int32')
# calculation new shape
output_shape = (
input_shape[0],
input_shape[1]*size,
input_shape[2]*size,
input_shape[3])
# calculation indices for batch, height, width and feature maps
one_like_mask = tf.ones_like(mask, dtype='int32')
batch_shape = tf.concat(
[[input_shape[0]], [1], [1], [1]],
axis=0)
batch_range = tf.reshape(
tf.range(output_shape[0], dtype='int32'),
shape=batch_shape)
b = one_like_mask * batch_range
y = mask // (output_shape[2] * output_shape[3])
x = (mask // output_shape[3]) % output_shape[2]
feature_range = tf.range(output_shape[3], dtype='int32')
f = one_like_mask * feature_range
updates_size = tf.size(updates)
indices = K.transpose(K.reshape(
tf.stack([b, y, x, f]),
[4, updates_size]))
values = tf.reshape(updates, [updates_size])
return tf.scatter_nd(indices, values, output_shape)
def segnet_conv(
inputs,
kernel_size=3,
kernel_initializer='glorot_uniform',
batch_norm = False,
**kwargs):
conv1 = Conv2D(
filters=64,
kernel_size=kernel_size,
padding='same',
activation=None,
kernel_initializer=kernel_initializer,
name='conv_1'
)(inputs)
if batch_norm:
conv1 = BatchNormalization(name='bn_1')(conv1)
conv1 = LeakyReLU(alpha=0.3, name='activation_1')(conv1)
conv1 = Conv2D(
filters=64,
kernel_size=kernel_size,
padding='same',
activation=None,
kernel_initializer=kernel_initializer,
name='conv_2'
)(conv1)
if batch_norm:
conv1 = BatchNormalization(name='bn_2')(conv1)
conv1 = LeakyReLU(alpha=0.3, name='activation_2')(conv1)
pool1, mask1 = tf.nn.max_pool_with_argmax(
input=conv1,
ksize=2,
strides=2,
padding='SAME'
)
def segnet_deconv(
pool1,
mask1,
kernel_size=3,
kernel_initializer='glorot_uniform',
batch_norm = False,
**kwargs
):
dec = MaxUnpooling2D(pool5, mask5)
dec = Conv2D(
filters=512,
kernel_size=kernel_size,
padding='same',
activation=None,
kernel_initializer=kernel_initializer,
name='upconv_13'
)(dec)
def classifier(
dec,
ch_out=2,
kernel_size=3,
final_activation=None,
batch_norm = False,
**kwargs
):
dec = Conv2D(
filters=64,
kernel_size=kernel_size,
activation='relu',
padding='same',
name='dec_out1'
)(dec)
#tf.function
def segnet(
inputs,
ch_out=2,
kernel_size=3,
kernel_initializer='glorot_uniform',
final_activation=None,
batch_norm = False,
**kwargs
):
pool5, mask1, mask2, mask3, mask4, mask5 = segnet_conv(
inputs,
kernel_size=3,
kernel_initializer='glorot_uniform',
batch_norm = False
)
dec = segnet_deconv(
pool5,
mask1,
mask2,
mask3,
mask4,
mask5,
kernel_size=kernel_size,
kernel_initializer=kernel_initializer,
batch_norm = batch_norm
)
output = classifier(
dec,
ch_out=2,
kernel_size=3,
final_activation=None,
batch_norm = batch_norm
)
return output
inputs = Input(shape=(*params['image_size'], params['num_channels']), name='input')
outputs = segnet(inputs, n_labels=2, kernel=3, pool_size=(2, 2), output_mode=None)
# we define our U-Net to output logits
model = Model(inputs, outputs)
Can you please help me with this problem?
I have solved the problem. In case someone needs it, here is the code for MaxUnpooling2D:
def MaxUnpooling2D(pool, ind, output_shape, batch_size, name=None):
"""
Unpooling layer after max_pool_with_argmax.
Args:
pool: max pooled output tensor
ind: argmax indices
ksize: ksize is the same as for the pool
Return:
unpool: unpooling tensor
:param batch_size:
"""
with tf.compat.v1.variable_scope(name):
pool_ = tf.reshape(pool, [-1])
batch_range = tf.reshape(tf.range(batch_size, dtype=ind.dtype), [tf.shape(pool)[0], 1, 1, 1])
b = tf.ones_like(ind) * batch_range
b = tf.reshape(b, [-1, 1])
ind_ = tf.reshape(ind, [-1, 1])
ind_ = tf.concat([b, ind_], 1)
ret = tf.scatter_nd(ind_, pool_, shape=[batch_size, output_shape[1] * output_shape[2] * output_shape[3]])
# The reason we use tf.scatter_nd: if we use tf.sparse_tensor_to_dense, the gradient is None, which cuts off the network.
# But if we use tf.scatter_nd, the gradients for all the trainable variables are tensors instead of None.
# tf.scatter_nd creates a new tensor by applying the sparse updates (the pooled values) to individual values or slices of a
# zero tensor of the given shape (the flattened output shape) according to the indices (ind_). If we keep the original code, the only change needed is
# from tf.sparse_tensor_to_dense(sparse_tensor) to tf.sparse_add(tf.zeros(output_shape), sparse_tensor), which gives us the gradients.
ret = tf.reshape(ret, [tf.shape(pool)[0], output_shape[1], output_shape[2], output_shape[3]])
return ret
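A small usage sketch of the function above (my own example with made-up shapes): pair it with tf.nn.max_pool_with_argmax, pass the pre-pooling shape as output_shape, and pass batch_size explicitly because the symbolic batch dimension may be None inside a model:
import tensorflow as tf

batch_size = 2
x = tf.random.normal([batch_size, 8, 8, 3])
pool, ind = tf.nn.max_pool_with_argmax(x, ksize=2, strides=2, padding='SAME')
unpooled = MaxUnpooling2D(pool, ind, output_shape=[batch_size, 8, 8, 3],
                          batch_size=batch_size, name='unpool_1')
print(unpooled.shape)   # (2, 8, 8, 3)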
I'm using a V-Net to train a model. I want to train the model faster and with less memory, so I replaced the standard 3x3 convolution with a combination of [1x1, 3x3, 1x1] convolutions: the first 1x1 conv reduces the channels to 1/N to cut the memory cost. The code is below.
The first two classes are the bottleneck structure and the standard convolution. When I replace the standard convolution with the bottleneck structure, the model size and FLOPs decrease, but the real GPU memory cost and training time increase.
For example, I got :
Using standard convolution..........
Total parameters : 10,052,609 float, model size : 39,268.00390625M
191.78 GFLOPs
end : 10.62517523765564s
Max memory allocated : 3818.25341796875M
Using bottleneck...........
Total parameters : 1,145,061 float, model size : 4,472.89453125M
16.05 GFLOPs
end : 16.890745162963867s
Max memory allocated : 4408.35107421875 M
However, in the inference stage the bottleneck structure does accelerate the network to some extent.
Does anyone know why this happens, and how to accelerate the network in both the training and inference stages?
Code :
import torch
import torch.nn as nn
import torch.nn.functional as F
def groupNorm(channel, num_groups=16):
return nn.GroupNorm(num_groups=num_groups, num_channels=channel)
Norm = nn.BatchNorm3d
BottleNeck_Ratio = 4
class BottleNeck(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, drop, stride=1, padding=1, N=BottleNeck_Ratio):
super(BottleNeck, self).__init__()
self.conv_1 = nn.Conv3d(in_channels=in_channels, out_channels=out_channels // N, kernel_size=1, stride=1)
self.conv_2 = nn.Conv3d(in_channels=out_channels // N, out_channels=out_channels // N, kernel_size=kernel_size,
stride=stride, padding=padding)
self.conv_3 = nn.Conv3d(in_channels=out_channels // N, out_channels=out_channels, kernel_size=1, stride=1)
self.norm = Norm(out_channels)
self.relu = nn.ReLU()
self.drop = nn.Dropout3d(drop)
def forward(self, input):
x = self.conv_1(input)
x = self.conv_2(x)
x = self.conv_3(x)
return self.drop(self.relu(self.norm(x)))
class CBR(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, drop, stride=1, padding=1):
super(CBR, self).__init__()
self.conv = nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size,
stride=stride, padding=padding)
self.norm = Norm(out_channels)
self.relu = nn.ReLU()
self.drop = nn.Dropout3d(drop)
def forward(self, input):
return self.drop(self.relu(self.norm(self.conv(input))))
ConvBnReluDrop = BottleNeck
class ResidualDown(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, drop, conv_nums, down=True):
super(ResidualDown, self).__init__()
if down:
self.down = ConvBnReluDrop(in_channels, out_channels, kernel_size=2, stride=2, padding=0, drop=drop)
else:
self.down = ConvBnReluDrop(in_channels, out_channels, kernel_size=3, stride=1, padding=1, drop=drop)
self.convs = nn.ModuleList()
for i in range(conv_nums):
self.convs.append(ConvBnReluDrop(out_channels, out_channels, kernel_size, drop))
self.has_down = down
def forward(self, x):
# downsample
res = self.down(x)
# convolution
out = res
for conv in self.convs:
out = conv(out)
# residual
return out + res
class ResidualUp(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, drop, conv_nums, up=True):
super(ResidualUp, self).__init__()
if up:
self.deconv = nn.ConvTranspose3d(in_channels, out_channels, kernel_size=2, stride=2)
else:
self.deconv = ConvBnReluDrop(in_channels, out_channels, kernel_size=3, stride=1, padding=1, drop=drop)
self.convs = nn.ModuleList()
self.convs.append(ConvBnReluDrop(2 * out_channels, out_channels, kernel_size, drop))
for i in range(conv_nums - 1):
self.convs.append(ConvBnReluDrop(out_channels, out_channels, kernel_size, drop))
def forward(self, big, small):
x = self.deconv(small)
# interpolate to prevent size not match
x = F.interpolate(x, big.size()[-3:], mode='trilinear', align_corners=False)
# save x as residual, [out_ch]
res = x
# skip connection, concat and conv to small's channel
# [2*out_ch] => [out_ch]
x = torch.cat([big, x], 1)
for conv in self.convs:
x = conv(x)
return x + res
class VBNet(nn.Module):
def __init__(self, in_ch=1, nclass=1, drop=0.01, level=5, bn='batch', bottleneck=False):
super(VBNet, self).__init__()
# levels
self.level = level
# Normalization layer
global Norm
if bn == 'batch':
Norm = nn.BatchNorm3d
elif bn == 'group':
Norm = groupNorm
# elif bn == 'syncbn':
# Norm = SyncBN3d
else:
raise Exception("Error for bn")
global ConvBnReluDrop
if bottleneck:
ConvBnReluDrop = BottleNeck
else:
ConvBnReluDrop = CBR
# down 2
self.downs = nn.ModuleList()
self.downs.append(ResidualDown(in_ch, 16, 3, drop, 1, False))
self.downs.append(ResidualDown(16, 32, 3, drop, 2))
# down layers
channels = 32
for i in range(level - 2):
self.downs.append(ResidualDown(channels, channels * 2, 3, drop, 3))
channels *= 2
# up layers
self.ups = nn.ModuleList()
for i in range(level - 3):
self.ups.append(ResidualUp(channels, channels // 2, 3, drop, 3))
channels = channels // 2
# up 2
self.ups.append(ResidualUp(channels, channels // 2, 3, drop, 2))
channels = channels // 2
self.ups.append(ResidualUp(channels, channels // 2, 3, drop, 1, False))
channels = channels // 2
# classifier
self.classifier = nn.Conv3d(channels, nclass, kernel_size=1)
def forward(self, x): # 4,472.89453125M
outs = []
for layer in self.downs:
x = layer(x)
outs.append(x)
small = outs[-1]
for i in range(len(self.ups)):
layer = self.ups[i]
big = outs[self.level - i - 2]
small = layer(big, small)
out = self.classifier(small)
return out
def get_net_size(net):
params = list(net.parameters())
k = 0
for i in params:
l = 1
for j in i.size():
l *= j
k = k + l
s = ("Total parameters : {:,} float, model size : {:,}M".format(k, k * 4 / 1024))
return s
if __name__ == '__main__':
# count_ops is taken from : https://github.com/1adrianb/pytorch-estimate-flops/blob/master/pthflops/ops.py
from ops import count_ops
import os
import time
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
# 4003728896
print("Using standard convolution..........")
a = torch.randn(6, 1, 32, 128, 128)
net = VBNet(bn='batch', bottleneck=False)
print(get_net_size(net))
print(count_ops(net, a))
net = net.cuda()
start = time.time()
for i in range(10):
a = torch.randn(6, 1, 32, 128, 128).cuda()
b = net(a)
b.sum().backward()
print('end : {}s'.format(time.time() - start))
print("Max memory allocated : {}M".format(torch.cuda.max_memory_allocated(0) / (1024.**2)))
# 4543840768 4622491136
print("\nUsing bottleneck...........")
# torch.cuda.reset_max_memory_allocated(0)
a = torch.randn(6, 1, 32, 128, 128)
net = VBNet(bn='batch', bottleneck=True)
print(get_net_size(net))
print(count_ops(net, a))
net = net.cuda()
start = time.time()
for i in range(10):
a = torch.randn(6, 1, 32, 128, 128).cuda()
b = net(a)
b.sum().backward()
print('end : {}s'.format(time.time() - start))
print("Max memory allocated : {} M".format(torch.cuda.max_memory_allocated(0) / (1024.**2)))
I compared three convolutions (standard convolution, bottleneck structure, and separable convolution) and got the following performance results:
For standard Convolution :
Total parameters : 13920 float, model size : 54.3750M
2.75 GFLOPs, for input size : (6, 16, 32, 32, 32)
-------------------Train analyze----------------
total train time : 8.0517 s
Total iteration : 250
mean forward time : 0.0003 s
mean backward time : 0.0007 s
Max memory allocated : 120.1846 M
-------------------Test analyze----------------
total test time : 7.6900 s
Total iteration : 250
mean data time : 0.0305 s
mean forward time : 0.0003 s
Max memory allocated : 72.1826 M
For bottleneck :
Total parameters : 7872 float, model size : 30.7500M
1.56 GFLOPs, for input size : (6, 16, 32, 32, 32)
-------------------Train analyze----------------
total train time : 8.7080 s
Total iteration : 250
mean forward time : 0.0009 s
mean backward time : 0.0016 s
Max memory allocated : 168.0767 M
-------------------Test analyze----------------
total test time : 8.8901 s
Total iteration : 250
mean data time : 0.0348 s
mean forward time : 0.0008 s
Max memory allocated : 72.0728 M
For Separable Convolution :
Total parameters : 1088 float, model size : 4.2500M
0.23 GFLOPs, for input size : (6, 16, 32, 32, 32)
-------------------Train analyze----------------
total train time : 8.3567 s
Total iteration : 250
mean forward time : 0.0009 s
mean backward time : 0.0014 s
Max memory allocated : 144.2021 M
-------------------Test analyze----------------
total test time : 7.9258 s
Total iteration : 250
mean data time : 0.0309 s
mean forward time : 0.0008 s
Max memory allocated : 72.1992 M
We can see that the standard convolution is about twice as fast as the bottleneck structure and the separable convolution, and its memory cost is no larger than that of the other two methods.
I guess the reason is that during the forward and backward passes of training, the bottleneck and separable structures, which contain more convolution modules, use more memory to store the intermediate inputs needed for backpropagation, and they also perform more convolution operations than the standard convolution. So neither the memory cost nor the speed of these two structures can beat the standard convolution.
Another reason the separable convolution is slower could be that the cuDNN library doesn't directly support depthwise separable convolutions.
But these two structures do reduce the model size dramatically compared to the standard convolution, which is very useful for mobile devices.
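To see the activation-memory effect in isolation, here is a small check (a sketch with made-up channel sizes, separate from the benchmark code below): the three convolutions of a bottleneck keep three intermediate activations alive for the backward pass, while a single 3x3x3 convolution keeps one, so peak memory can grow even though the weights shrink.
import torch
import torch.nn as nn

x = torch.randn(6, 16, 32, 32, 32, device='cuda', requires_grad=True)
standard = nn.Conv3d(16, 32, kernel_size=3, padding=1).cuda()
bottleneck = nn.Sequential(
    nn.Conv3d(16, 8, kernel_size=1),
    nn.Conv3d(8, 8, kernel_size=3, padding=1),
    nn.Conv3d(8, 32, kernel_size=1),
).cuda()

for name, net in [('standard', standard), ('bottleneck', bottleneck)]:
    torch.cuda.reset_max_memory_allocated(0)
    net(x).sum().backward()
    print(name, torch.cuda.max_memory_allocated(0) / 1024 ** 2, 'M')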
The code is as follows. First, the three different convolutions:
import torch
import torch.nn as nn
from analyze_network_performance import analyze_network_performance
import functools
Norm = nn.BatchNorm3d
class CBRSeq(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=1, N=2):
super(CBRSeq, self).__init__()
self.seq = nn.Sequential(
nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding),
Norm(out_channels),
nn.ReLU(inplace=True),
)
def forward(self, input):
return self.seq(input)
class BottleNeckSeq(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=1, N=2):
super(BottleNeckSeq, self).__init__()
self.seq = nn.Sequential(
nn.Conv3d(in_channels=in_channels, out_channels=out_channels//N, kernel_size=1, stride=1),
Norm(out_channels//N),
nn.ReLU(inplace=True),
nn.Conv3d(in_channels=out_channels//N, out_channels=out_channels//N, kernel_size=kernel_size, stride=stride, padding=padding),
Norm(out_channels//N),
nn.ReLU(inplace=True),
nn.Conv3d(in_channels=out_channels//N, out_channels=out_channels, kernel_size=1),
Norm(out_channels),
nn.ReLU(inplace=True),
)
def forward(self, input):
return self.seq(input)
class GroupSeq(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=1, N=2):
super(GroupSeq, self).__init__()
self.seq = nn.Sequential(
nn.Conv3d(in_channels=in_channels, out_channels=in_channels, groups=in_channels,
kernel_size=kernel_size, stride=stride, padding=padding),
Norm(in_channels),
nn.ReLU(inplace=True),
nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=1),
Norm(out_channels),
nn.ReLU(inplace=True),
)
def forward(self, input):
return self.seq(input)
def test_bottleneck():
data_gen = functools.partial(torch.randn, 6, 16, 32, 32, 32)
a = BottleNeckSeq(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
b = CBRSeq(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
c = GroupSeq(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
print('BottleNeck Structure ....')
analyze_network_performance(a, data_gen, train_time=250, test_time=250)
print('\nStandard Convolution ....')
analyze_network_performance(b, data_gen, train_time=250, test_time=250)
print('\nSeparable Convolution ...')
analyze_network_performance(c, data_gen, train_time=250, test_time=250)
if __name__ == '__main__':
test_bottleneck()
And the analyze_network_performance code:
import time
# count_ops is taken from : https://github.com/1adrianb/pytorch-estimate-flops/blob/master/pthflops/ops.py
from ops import count_ops
import torch
import numpy as np
def get_net_size(net):
params = list(net.parameters())
k = 0
for i in params:
l = 1
for j in i.size():
l *= j
k = k + l
s = ("Total parameters : {:} float, model size : {:.4f}M".format(k, k * 4 / 1024))
return s
class Timer(object):
def __init__(self, verbose=False):
self.start_time = time.time()
self.verbose = verbose
self.duration = 0
def restart(self):
self.duration = self.start_time = time.time()
return self.duration
def stop(self):
return time.time() - self.start_time
def get_last_duration(self):
return self.duration
def __enter__(self):
self.restart()
def __exit__(self, exc_type, exc_val, exc_tb):
self.duration = self.stop()
if self.verbose:
print('{:^.4f} s'.format(self.stop()))
def to_cuda(data, device):
if device < 0:
return data
else:
return data.cuda(device)
def network_train_analyze(net, data_generate_func, cuda=0, train_time=10, forward_verbose=False):
t1 = Timer(verbose=True)
t2 = Timer(forward_verbose)
t3 = Timer(verbose=False)
if cuda >= 0:
torch.cuda.reset_max_memory_allocated(cuda)
forward_times = []
backward_times = []
with t1:
for i in range(train_time):
a = to_cuda(data_generate_func(), cuda)
with t3:
b = net(a)
if forward_verbose:
print('forward : ', end='')
forward_times.append(t3.get_last_duration())
with t2:
b.sum().backward()
if forward_verbose:
print('backward : ', end='')
backward_times.append(t2.get_last_duration())
print('total train time : ', end='')
print("Total iteration : {}".format(train_time))
print('mean forward time : {:^.4f} s'.format(np.mean(forward_times[1:])))
print('mean backward time : {:^.4f} s'.format(np.mean(backward_times[1:])))
if cuda >= 0:
print("Max memory allocated : {:^.4f} M".format(torch.cuda.max_memory_allocated(cuda) / (1024.**2)))
def network_test_analyze(net, data_generate_func, cuda=0, test_time=50, forward_verbose=False):
t1 = Timer(verbose=True)
t2 = Timer(verbose=forward_verbose)
t3 = Timer(verbose=False)
if cuda >= 0:
torch.cuda.reset_max_memory_allocated(cuda)
forward_times = []
data_times = []
with t1:
with torch.no_grad():
for i in range(test_time):
with t3:
a = to_cuda(data_generate_func(), cuda)
data_times.append(t3.get_last_duration())
with t2:
net(a)
if forward_verbose:
print('forward : ', end='')
forward_times.append(t2.get_last_duration())
print('total test time : ', end='')
print("Total iteration : {}".format(test_time))
print('mean data time : {:^.4f} s'.format(np.mean(data_times[1:])))
print('mean forward time : {:^.4f} s'.format(np.mean(forward_times[1:])))
if cuda >= 0:
print("Max memory allocated : {:^.4f} M".format(torch.cuda.max_memory_allocated(cuda) / (1024.**2)))
def analyze_network_performance(net, data_generate_func, cuda=0, train_time=10, test_time=20, forward_verbose=False):
print('============ Analyzing network performance ==============')
print(get_net_size(net))
net = to_cuda(net, cuda)
a = data_generate_func()
a = to_cuda(a, cuda)
print(count_ops(net, a))
print('-------------------Train analyze----------------')
network_train_analyze(net, data_generate_func, cuda, train_time, forward_verbose)
print('-------------------Test analyze----------------')
network_test_analyze(net, data_generate_func, cuda, test_time, forward_verbose)