Self-attention module occupying too much VRAM with large input - deep-learning

I was trying to add a self-attention module to Progressive GAN (ProGAN), placed as the last layer before toRGB. After running a simple test file I found that once the model grows to 256x256 output, the process is killed.
Then I tried the following test code:
import torch
import torch.nn as nn


class SelfAttention(nn.Module):
    def __init__(self, channels):
        super(SelfAttention, self).__init__()
        self.channels = channels
        num_heads = 4
        self.mha = nn.MultiheadAttention(channels, num_heads, batch_first=True)
        self.ln = nn.LayerNorm([channels])
        self.ff_self = nn.Sequential(
            nn.LayerNorm([channels]),
            nn.Linear(channels, channels),
            nn.GELU(),
            nn.Linear(channels, channels)
        )

    def forward(self, x):
        size = x.shape[3]
        print("SIZE", size)
        print("CHANNELS", self.channels)
        x = x.view(-1, self.channels, size * size).swapaxes(1, 2)
        print()
        x_ln = self.ln(x)
        attention_value, _ = self.mha(x_ln, x_ln, x_ln)
        attention_value = attention_value + x
        attention_value = self.ff_self(attention_value) + attention_value
        return attention_value.swapaxes(2, 1).view(-1, self.channels, size, size)


class SelfAttention2(nn.Module):
    def __init__(self, channels):
        super(SelfAttention2, self).__init__()
        self.query = nn.Conv2d(channels, channels // 8, kernel_size=1)
        self.key = nn.Conv2d(channels, channels // 8, kernel_size=1)
        self.value = nn.Conv2d(channels, channels, kernel_size=1)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        N, C, H, W = x.size()
        query = self.query(x).view(N, -1, W*H).permute(0, 2, 1)  # (N, H*W, C//8)
        key = self.key(x).view(N, -1, W*H)                       # (N, C//8, H*W)
        energy = torch.bmm(query, key)                           # (N, H*W, H*W)
        attention = self.softmax(energy)
        value = self.value(x).view(N, -1, W*H)                   # (N, C, H*W)
        out = torch.bmm(value, attention.permute(0, 2, 1))
        out = out.view(N, C, H, W)
        return out


if __name__ == '__main__':
    x = torch.randn((1, 64, 256, 256))
    sa1 = SelfAttention(64)
    sa1(x)
    sa2 = SelfAttention2(64)
    sa2(x)
Neither module worked: each one tries to allocate 16 GB of VRAM. (With this single module taking up 16 GB, I cannot even run the whole model on a 3090.)
And I am told explicitly that the method itself, i.e. "add attention to ProGAN or StyleGAN", will work and has been done.
So, is my understanding of the idea wrong, or does the implementation have a flaw?
Also, I have trained the model up to 32x32 output and it worked fine.
Suggestions on my understanding or my coding are welcome.
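For reference, a rough back-of-the-envelope estimate (float32 assumed, counting only the attention map itself) shows where the 16 GB comes from once a 256x256 feature map is flattened into a sequence:

H = W = 256
seq_len = H * W                     # 65,536 tokens after flattening the feature map
bytes_per_float = 4                 # float32

# a single (seq_len x seq_len) attention map, e.g. the result of torch.bmm(query, key)
attn_bytes = seq_len * seq_len * bytes_per_float
print(attn_bytes / 1024 ** 3)       # ~16.0 GiB for one attention map alone

which matches the ~16 GB allocation I see from both modules.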

Related

Temporal sequence feature extraction CNN, batches with different dimensions

I am using a CNN to extract features from temporal data of different lengths. I am using pad_sequence to pad the data in a batch. However, as the max length in a batch changes, the padded sequence length differs from batch to batch. This creates errors when I flatten the data for the FCN layer (as the dimension of the flattened vector changes). I am currently handling this with an adaptive average pooling layer before the FCN layers. As this is a global averaging, it fixes the output dimension for the FCN. However, I am not sure if this is the correct thing to do.
Code is:
## pad tensors
def pad_collate(batch):
    sequences = [item[0] for item in batch]
    lengths = [len(seq) for seq in sequences]
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded_sequences, lengths

## Create dataloader
trainData = Sequence(root=path)
trainDataLoader = DataLoader(trainData, batch_size=BATCH_SIZE, collate_fn=pad_collate)

## CNN model
class FeatureExtractor(nn.Module):
    def __init__(self, block, layers):
        super(FeatureExtractor, self).__init__()
        self.inplanes = 6
        ## 1st CONV layers
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3, stride=2, padding=4)
        self.bn1 = nn.BatchNorm2d(6)
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        ## residual blocks
        self.layer0 = self._make_layer(block, 12, layers[0], stride=1)
        self.layer1 = self._make_layer(block, 24, layers[1], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((5, 5))  ##### MY CURRENT SOLUTION #####
        self.fc = nn.Linear(600, 128)

    def _make_layer(self, block, planes, blocks, stride):
        downsample = None
        if stride != 1 or self.inplanes != planes:
            downsample = nn.Sequential(nn.Conv2d(self.inplanes, planes, kernel_size=1, stride=stride),
                                       nn.BatchNorm2d(planes))
        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))
        return nn.Sequential(*layers)

    def forward(self, x):
        ## first conv
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.maxpool1(x)
        ## conv blocks
        x = self.layer0(x)
        x = self.layer1(x)
        ## FCN layer
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        output = self.fc(x)
        return output
Any other comments are also welcome (I am self-taught).
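For what it's worth, a minimal sketch (shapes are made up for illustration) of why the adaptive pooling fixes the flattened size regardless of the padded length per batch:

import torch
import torch.nn as nn

pool = nn.AdaptiveAvgPool2d((5, 5))
fc = nn.Linear(24 * 5 * 5, 128)   # 600 inputs, as in self.fc

for h in (37, 52, 80):            # padded lengths that differ between batches
    x = torch.randn(4, 24, h, 9)  # (N, C, H, W) feature maps of varying H
    out = fc(torch.flatten(pool(x), 1))
    print(out.shape)              # always torch.Size([4, 128])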

ValueError: Expected input batch_size (59) to match target batch_size (1)

I'm trying to build a semantic segmentation model with pytorch. However, I encounter this error and do not know how to fix it.
This is the model:
class SegmentationNN(pl.LightningModule):
    def __init__(self, num_classes=23, hparams=None):
        super().__init__()
        self.hparams = hparams
        self.model = models.alexnet(pretrained=True).features
        self.conv = nn.Conv2d(256, 3, kernel_size=1)
        self.upsample = nn.Upsample(size=(240, 240))

    def forward(self, x):
        print('Input:', x.shape)
        x = self.model(x)
        print('After Alexnet convs:', x.shape)
        x = self.conv(x)
        print('After 1-conv:', x.shape)
        x = self.upsample(x)
        print('After upsampling:', x.shape)
        return x

    def training_step(self, batch, batch_idx):
        images, targets = batch
        # targets = targets.view(targets.size(0), -1)
        out = self.forward(images)
        loss_func = nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
        loss = loss_func(out, targets.unsqueeze(0))
        tensorboard_logs = {'loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        images, targets = batch
        # targets = targets.view(targets.size(0), -1)
        out = self.forward(images)
        loss_func = nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
        loss = loss_func(out, targets.unsqueeze(0))
        tensorboard_logs = {'loss': loss}
        return {'loss': loss, 'log': tensorboard_logs}

    def configure_optimizers(self):
        optim = torch.optim.Adam(self.parameters(), lr=self.hparams['learning_rate'])
        return optim
And this is the training and fit:
train_dataloader = DataLoader(train_data, batch_size=hparams['batch_size'])
val_dataloader = DataLoader(val_data, batch_size=hparams['batch_size'])

trainer = pl.Trainer(
    max_epochs=50,
    gpus=1 if torch.cuda.is_available() else None
)

trainer.fit(model, train_dataloader, val_dataloader)
These are the sizes of the tensor after each layer:
Input: torch.Size([59, 3, 240, 240])
After Alexnet convs: torch.Size([59, 256, 6, 6])
After 1-conv: torch.Size([59, 3, 6, 6])
After upsampling: torch.Size([59, 3, 240, 240])
I am pretty much a beginner with Pytorch and Pytorch Lightning, so any advice would be appreciated!
Can you delete the unsqueeze(0) part here: loss = loss_func(out, targets.unsqueeze(0))?
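A minimal sketch (tensor contents are random, just to illustrate the shapes) of what nn.CrossEntropyLoss expects for segmentation, which shows why the extra unsqueeze(0) trips the batch-size check:

import torch
import torch.nn as nn

loss_func = nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')

out = torch.randn(59, 3, 240, 240)             # logits: (N, C, H, W)
targets = torch.randint(0, 3, (59, 240, 240))  # class indices: (N, H, W)

print(loss_func(out, targets))                 # works: batch sizes match (59 vs 59)
# loss_func(out, targets.unsqueeze(0))         # fails: target batch size becomes 1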

Is it possible to combine 2 neural networks?

I have a net like this (example from here):
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify with a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = torch.flatten(x, 1)  # flatten all dimensions except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
net = Net()
and another net like this (example from here):
class binaryClassification(nn.Module):
    def __init__(self):
        super(binaryClassification, self).__init__()
        # Number of input features is 12.
        self.layer_1 = nn.Linear(12, 64)
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)

    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        return x
I'd like to change, for example, "self.fc2 = nn.Linear(120, 84)" so that it has 121 inputs, where the 121st is the x (output) of the binaryClassification network.
The idea is: I'd like to use a CNN network and a non-CNN network at the same time, training both, with each influencing the other.
Is it possible? How can I do that? (Keras or Pytorch examples are both ok.)
Or maybe the idea is crazy and there is an easier way to mix data and images as input to a single network?
It is a perfectly valid approach: you are taking two different input data sources, processing them, and combining the results to solve a common goal (in this case it seems to be 10-class image classification). You can define the input to your Net network to be a tuple of the image needed by the original Net and the 12-value feature vector for your binaryClassification. Example code would be:
import torch
import torch.nn as nn
import torch.nn.functional as F

class binaryClassification(nn.Module):
    ...  # same as above

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension
        self.binClas = binaryClassification()
        self.fc2 = nn.Linear(121, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, inputs):
        x, features = inputs  # split tuple
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify with a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = torch.flatten(x, 1)  # flatten all dimensions except the batch dimension
        # Concatenate with binaryClassification along the feature dimension: 120 + 1 = 121
        x = torch.cat([F.relu(self.fc1(x)), self.binClas(features)], dim=1)
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

net = Net()
However! Be careful about training them together: it is hard to balance both branches of the network so that each of them learns. I would recommend training them separately for a while before plugging them together (generally speaking, the hyperparameters of one part of the network will probably not be optimal for the other). To do this, you could freeze one part of the network while training the other, and vice versa (check this link to see how to freeze parts of a torch nn).
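As a rough illustration of the freezing idea (the attribute names simply follow the Net defined above):

# Freeze the convolutional branch while the binaryClassification branch is trained.
for p in net.conv1.parameters():
    p.requires_grad = False
for p in net.conv2.parameters():
    p.requires_grad = False
for p in net.fc1.parameters():
    p.requires_grad = False

# Hand only the still-trainable parameters to the optimizer; swap which part is frozen later.
optimizer = torch.optim.SGD(
    (p for p in net.parameters() if p.requires_grad), lr=0.01
)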
The most naive way to do it would be to instantiate both models, sum the two predictions, and compute the loss on the result. This will backpropagate through both models:
net1 = Net1()
net2 = Net2()
bce = torch.nn.BCEWithLogitsLoss()
params = list(net1.parameters()) + list(net2.parameters())
optimizer = optim.SGD(params, lr=0.01)

for x, ground_truth in your_data_loader:
    optimizer.zero_grad()
    prediction = net1(x) + net2(x)  # the 2 models must output tensors of the same shape
    loss = bce(prediction, ground_truth)
    loss.backward()
    optimizer.step()
You could also e.g.
implement the layers of Net1 and Net2 in a single model
train Net1 and Net2 separately and ensemble them later (a rough sketch of the ensembling is below)
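A minimal sketch of the ensembling option (assuming net1 and net2 are already trained and x is a batch of inputs for which both produce logits of the same shape):

import torch

net1.eval()
net2.eval()

with torch.no_grad():
    logits = (net1(x) + net2(x)) / 2           # average the two models' outputs
    prediction = torch.sigmoid(logits) > 0.5   # final binary decision, matching BCEWithLogitsLoss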

Why is a bottleneck structure slower and more memory-hungry when training a network compared to standard convolution?

I'm using vnet to train a model. I want to train the model faster and with less memory, so I replaced the standard 3x3 convolution with a combination of [1x1, 3x3, 1x1] convolutions; the first 1x1 conv reduces the channel count to 1/N to cut the memory cost. The code is as follows.
The first two classes are the bottleneck structure and the standard convolution. When I replace the standard convolution with the bottleneck structure, although the model size and FLOPs decrease, the real GPU memory cost and training time increase.
For example, I got:
Using standard convolution..........
Total parameters : 10,052,609 float, model size : 39,268.00390625M
191.78 GFLOPs
end : 10.62517523765564s
Max memory allocated : 3818.25341796875M
Using bottleneck...........
Total parameters : 1,145,061 float, model size : 4,472.89453125M
16.05 GFLOPs
end : 16.890745162963867s
Max memory allocated : 4408.35107421875 M
However, in the inference stage, the bottleneck structure does accelerate the network to some extent.
Does anyone know why this happens and how to accelerate the network in both the training and inference stages?
Code :
import torch
import torch.nn as nn
import torch.nn.functional as F
def groupNorm(channel, num_groups=16):
return nn.GroupNorm(num_groups=num_groups, num_channels=channel)
Norm = nn.BatchNorm3d
BottleNeck_Ratio = 4
class BottleNeck(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, drop, stride=1, padding=1, N=BottleNeck_Ratio):
super(BottleNeck, self).__init__()
self.conv_1 = nn.Conv3d(in_channels=in_channels, out_channels=out_channels // N, kernel_size=1, stride=1)
self.conv_2 = nn.Conv3d(in_channels=out_channels // N, out_channels=out_channels // N, kernel_size=kernel_size,
stride=stride, padding=padding)
self.conv_3 = nn.Conv3d(in_channels=out_channels // N, out_channels=out_channels, kernel_size=1, stride=1)
self.norm = Norm(out_channels)
self.relu = nn.ReLU()
self.drop = nn.Dropout3d(drop)
def forward(self, input):
x = self.conv_1(input)
x = self.conv_2(x)
x = self.conv_3(x)
return self.drop(self.relu(self.norm(x)))
class CBR(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, drop, stride=1, padding=1):
super(CBR, self).__init__()
self.conv = nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size,
stride=stride, padding=padding)
self.norm = Norm(out_channels)
self.relu = nn.ReLU()
self.drop = nn.Dropout3d(drop)
def forward(self, input):
return self.drop(self.relu(self.norm(self.conv(input))))
ConvBnReluDrop = BottleNeck
class ResidualDown(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, drop, conv_nums, down=True):
super(ResidualDown, self).__init__()
if down:
self.down = ConvBnReluDrop(in_channels, out_channels, kernel_size=2, stride=2, padding=0, drop=drop)
else:
self.down = ConvBnReluDrop(in_channels, out_channels, kernel_size=3, stride=1, padding=1, drop=drop)
self.convs = nn.ModuleList()
for i in range(conv_nums):
self.convs.append(ConvBnReluDrop(out_channels, out_channels, kernel_size, drop))
self.has_down = down
def forward(self, x):
# downsample
res = self.down(x)
# convolution
out = res
for conv in self.convs:
out = conv(out)
# residual
return out + res
class ResidualUp(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, drop, conv_nums, up=True):
super(ResidualUp, self).__init__()
if up:
self.deconv = nn.ConvTranspose3d(in_channels, out_channels, kernel_size=2, stride=2)
else:
self.deconv = ConvBnReluDrop(in_channels, out_channels, kernel_size=3, stride=1, padding=1, drop=drop)
self.convs = nn.ModuleList()
self.convs.append(ConvBnReluDrop(2 * out_channels, out_channels, kernel_size, drop))
for i in range(conv_nums - 1):
self.convs.append(ConvBnReluDrop(out_channels, out_channels, kernel_size, drop))
def forward(self, big, small):
x = self.deconv(small)
# interpolate to prevent size not match
x = F.interpolate(x, big.size()[-3:], mode='trilinear', align_corners=False)
# save x as residual, [out_ch]
res = x
# skip connection, concat and conv to small's channel
# [2*out_ch] => [out_ch]
x = torch.cat([big, x], 1)
for conv in self.convs:
x = conv(x)
return x + res
class VBNet(nn.Module):
def __init__(self, in_ch=1, nclass=1, drop=0.01, level=5, bn='batch', bottleneck=False):
super(VBNet, self).__init__()
# levels
self.level = level
# Normalization layer
global Norm
if bn == 'batch':
Norm = nn.BatchNorm3d
elif bn == 'group':
Norm = groupNorm
# elif bn == 'syncbn':
# Norm = SyncBN3d
else:
raise Exception("Error for bn")
global ConvBnReluDrop
if bottleneck:
ConvBnReluDrop = BottleNeck
else:
ConvBnReluDrop = CBR
# down 2
self.downs = nn.ModuleList()
self.downs.append(ResidualDown(in_ch, 16, 3, drop, 1, False))
self.downs.append(ResidualDown(16, 32, 3, drop, 2))
# down layers
channels = 32
for i in range(level - 2):
self.downs.append(ResidualDown(channels, channels * 2, 3, drop, 3))
channels *= 2
# up layers
self.ups = nn.ModuleList()
for i in range(level - 3):
self.ups.append(ResidualUp(channels, channels // 2, 3, drop, 3))
channels = channels // 2
# up 2
self.ups.append(ResidualUp(channels, channels // 2, 3, drop, 2))
channels = channels // 2
self.ups.append(ResidualUp(channels, channels // 2, 3, drop, 1, False))
channels = channels // 2
# classifier
self.classifier = nn.Conv3d(channels, nclass, kernel_size=1)
def forward(self, x): # 4,472.89453125M
outs = []
for layer in self.downs:
x = layer(x)
outs.append(x)
small = outs[-1]
for i in range(len(self.ups)):
layer = self.ups[i]
big = outs[self.level - i - 2]
small = layer(big, small)
out = self.classifier(small)
return out
def get_net_size(net):
params = list(net.parameters())
k = 0
for i in params:
l = 1
for j in i.size():
l *= j
k = k + l
s = ("Total parameters : {:,} float, model size : {:,}M".format(k, k * 4 / 1024))
return s
if __name__ == '__main__':
# count_ops is taken from : https://github.com/1adrianb/pytorch-estimate-flops/blob/master/pthflops/ops.py
from ops import count_ops
import os
import time
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
# 4003728896
print("Using standard convolution..........")
a = torch.randn(6, 1, 32, 128, 128)
net = VBNet(bn='batch', bottleneck=False)
print(get_net_size(net))
print(count_ops(net, a))
net = net.cuda()
start = time.time()
for i in range(10):
a = torch.randn(6, 1, 32, 128, 128).cuda()
b = net(a)
b.sum().backward()
print('end : {}s'.format(time.time() - start))
print("Max memory allocated : {}M".format(torch.cuda.max_memory_allocated(0) / (1024.**2)))
# 4543840768 4622491136
print("\nUsing bottleneck...........")
# torch.cuda.reset_max_memory_allocated(0)
a = torch.randn(6, 1, 32, 128, 128)
net = VBNet(bn='batch', bottleneck=True)
print(get_net_size(net))
print(count_ops(net, a))
net = net.cuda()
start = time.time()
for i in range(10):
a = torch.randn(6, 1, 32, 128, 128).cuda()
b = net(a)
b.sum().backward()
print('end : {}s'.format(time.time() - start))
print("Max memory allocated : {} M".format(torch.cuda.max_memory_allocated(0) / (1024.**2)))
I compared three convolutions: standard convolution, bottleneck structure, and separable convolution, and got the following performance results:
For standard Convolution :
Total parameters : 13920 float, model size : 54.3750M
2.75 GFLOPs, for input size : (6, 16, 32, 32, 32)
-------------------Train analyze----------------
total train time : 8.0517 s
Total iteration : 250
mean forward time : 0.0003 s
mean backward time : 0.0007 s
Max memory allocated : 120.1846 M
-------------------Test analyze----------------
total test time : 7.6900 s
Total iteration : 250
mean data time : 0.0305 s
mean forward time : 0.0003 s
Max memory allocated : 72.1826 M
For bottleneck :
Total parameters : 7872 float, model size : 30.7500M
1.56 GFLOPs, for input size : (6, 16, 32, 32, 32)
-------------------Train analyze----------------
total train time : 8.7080 s
Total iteration : 250
mean forward time : 0.0009 s
mean backward time : 0.0016 s
Max memory allocated : 168.0767 M
-------------------Test analyze----------------
total test time : 8.8901 s
Total iteration : 250
mean data time : 0.0348 s
mean forward time : 0.0008 s
Max memory allocated : 72.0728 M
For Separable Convolution :
Total parameters : 1088 float, model size : 4.2500M
0.23 GFLOPs, for input size : (6, 16, 32, 32, 32)
-------------------Train analyze----------------
total train time : 8.3567 s
Total iteration : 250
mean forward time : 0.0009 s
mean backward time : 0.0014 s
Max memory allocated : 144.2021 M
-------------------Test analyze----------------
total test time : 7.9258 s
Total iteration : 250
mean data time : 0.0309 s
mean forward time : 0.0008 s
Max memory allocated : 72.1992 M
We can see that the standard convolution is about twice as fast as the bottleneck structure and the separable convolution, and its memory cost is no larger than that of the other two methods.
I guess the reason could be that, during the forward and backward passes in training, the bottleneck and separable structures contain more convolution modules, so they use more memory to save intermediate inputs for back-propagation and also launch more convolution operations than the standard convolution. So neither the memory cost nor the speed of these two structures can surpass the standard convolution.
Another reason why the separable convolution is slower could be that the cuDNN library doesn't directly support depthwise separable convolutions.
But these two structures do reduce the model size dramatically compared to the standard convolution, which is very useful for mobile devices.
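As a rough sanity check of that explanation, here is a small sketch (float32 assumed, sizes matching the (6, 16, 32, 32, 32) test input with out_channels = 32 and N = 2, norm/activation buffers ignored) that counts the activations each block must keep for back-propagation:

# Rough per-block activation memory: input (6, 16, 32, 32, 32), out_channels = 32, N = 2
batch, d, h, w = 6, 32, 32, 32
out_ch, n = 32, 2
voxels = batch * d * h * w

def mb(elems):
    return elems * 4 / 1024 ** 2   # float32 elements -> MiB

# Standard conv block: one output activation is kept for the backward pass.
standard = mb(voxels * out_ch)

# Bottleneck block: the outputs of all three convs are kept.
bottleneck = mb(voxels * (out_ch // n) * 2 + voxels * out_ch)

print(f"standard ~ {standard:.0f} MiB, bottleneck ~ {bottleneck:.0f} MiB per block")

By this rough count the bottleneck block keeps about twice as many activation elements per block, which is consistent with the higher max memory measured during training even though it has fewer parameters and FLOPs.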
The code follows:
Three different convolutions.
import torch
import torch.nn as nn
import analyze_network_performance
import functools
Norm = nn.BatchNorm3d
class CBRSeq(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=1, N=2):
super(CBRSeq, self).__init__()
self.seq = nn.Sequential(
nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding),
Norm(out_channels),
nn.ReLU(inplace=True),
)
def forward(self, input):
return self.seq(input)
class BottleNeckSeq(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=1, N=2):
super(BottleNeckSeq, self).__init__()
self.seq = nn.Sequential(
nn.Conv3d(in_channels=in_channels, out_channels=out_channels//N, kernel_size=1, stride=1),
Norm(out_channels//N),
nn.ReLU(inplace=True),
nn.Conv3d(in_channels=out_channels//N, out_channels=out_channels//N, kernel_size=kernel_size, stride=stride, padding=padding),
Norm(out_channels//N),
nn.ReLU(inplace=True),
nn.Conv3d(in_channels=out_channels//N, out_channels=out_channels, kernel_size=1),
Norm(out_channels),
nn.ReLU(inplace=True),
)
def forward(self, input):
return self.seq(input)
class GroupSeq(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=1, N=2):
super(GroupSeq, self).__init__()
self.seq = nn.Sequential(
nn.Conv3d(in_channels=in_channels, out_channels=in_channels, groups=in_channels,
kernel_size=kernel_size, stride=stride, padding=padding),
Norm(in_channels),
nn.ReLU(inplace=True),
nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=1),
Norm(out_channels),
nn.ReLU(inplace=True),
)
def forward(self, input):
return self.seq(input)
def test_bottleneck():
data_gen = functools.partial(torch.randn, 6, 16, 32, 32, 32)
a = BottleNeckSeq(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
b = CBRSeq(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
c = GroupSeq(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
print('BottleNeck Structure ....')
analyze_network_performance(a, data_gen, train_time=250, test_time=250)
print('\nStandard Convolution ....')
analyze_network_performance(b, data_gen, train_time=250, test_time=250)
print('\nSeparable Convolution ...')
analyze_network_performance(c, data_gen, train_time=250, test_time=250)
if __name__ == '__main__':
test_bottleneck()
analyze_network_performance code.
import time
# count_ops is taken from : https://github.com/1adrianb/pytorch-estimate-flops/blob/master/pthflops/ops.py
from ops import count_ops
import torch
import numpy as np
def get_net_size(net):
params = list(net.parameters())
k = 0
for i in params:
l = 1
for j in i.size():
l *= j
k = k + l
s = ("Total parameters : {:} float, model size : {:.4f}M".format(k, k * 4 / 1024))
return s
class Timer(object):
def __init__(self, verbose=False):
self.start_time = time.time()
self.verbose = verbose
self.duration = 0
def restart(self):
self.duration = self.start_time = time.time()
return self.duration
def stop(self):
return time.time() - self.start_time
def get_last_duration(self):
return self.duration
def __enter__(self):
self.restart()
def __exit__(self, exc_type, exc_val, exc_tb):
self.duration = self.stop()
if self.verbose:
print('{:^.4f} s'.format(self.stop()))
def to_cuda(data, device):
if device < 0:
return data
else:
return data.cuda(device)
def network_train_analyze(net, data_generate_func, cuda=0, train_time=10, forward_verbose=False):
t1 = Timer(verbose=True)
t2 = Timer(forward_verbose)
t3 = Timer(verbose=False)
if cuda >= 0:
torch.cuda.reset_max_memory_allocated(cuda)
forward_times = []
backward_times = []
with t1:
for i in range(train_time):
a = to_cuda(data_generate_func(), cuda)
with t3:
b = net(a)
if forward_verbose:
print('forward : ', end='')
forward_times.append(t3.get_last_duration())
with t2:
b.sum().backward()
if forward_verbose:
print('backward : ', end='')
backward_times.append(t2.get_last_duration())
print('total train time : ', end='')
print("Total iteration : {}".format(train_time))
print('mean forward time : {:^.4f} s'.format(np.mean(forward_times[1:])))
print('mean backward time : {:^.4f} s'.format(np.mean(backward_times[1:])))
if cuda >= 0:
print("Max memory allocated : {:^.4f} M".format(torch.cuda.max_memory_allocated(cuda) / (1024.**2)))
def network_test_analyze(net, data_generate_func, cuda=0, test_time=50, forward_verbose=False):
t1 = Timer(verbose=True)
t2 = Timer(verbose=forward_verbose)
t3 = Timer(verbose=False)
if cuda >= 0:
torch.cuda.reset_max_memory_allocated(cuda)
forward_times = []
data_times = []
with t1:
with torch.no_grad():
for i in range(test_time):
with t3:
a = to_cuda(data_generate_func(), cuda)
data_times.append(t3.get_last_duration())
with t2:
net(a)
if forward_verbose:
print('forward : ', end='')
forward_times.append(t2.get_last_duration())
print('total test time : ', end='')
print("Total iteration : {}".format(test_time))
print('mean data time : {:^.4f} s'.format(np.mean(data_times[1:])))
print('mean forward time : {:^.4f} s'.format(np.mean(forward_times[1:])))
if cuda >= 0:
print("Max memory allocated : {:^.4f} M".format(torch.cuda.max_memory_allocated(cuda) / (1024.**2)))
def analyze_network_performance(net, data_generate_func, cuda=0, train_time=10, test_time=20, forward_verbose=False):
print('============ Analyzing network performance ==============')
print(get_net_size(net))
net = to_cuda(net, cuda)
a = data_generate_func()
a = to_cuda(a, cuda)
print(count_ops(net, a))
print('-------------------Train analyze----------------')
network_train_analyze(net, data_generate_func, cuda, train_time, forward_verbose)
print('-------------------Test analyze----------------')
network_test_analyze(net, data_generate_func, cuda, test_time, forward_verbose)

MXNET CNN+LSTM save/serialize to json

I'm having a hard time figuring out how to correctly define an mxnet net so that I can serialize/convert this model to a json file.
The pipeline is composed of a CNN + biLSTM + CTC.
I know I must use HybridBlock and hybridize(), but I can't seem to make it work, and I'm not sure whether it is even possible or whether there is another way around it.
I'm sure it's a lack of knowledge on my part and wonder if anyone can help.
Here is the net definition in python:
NUM_HIDDEN = 200
NUM_CLASSES = 13550
NUM_LSTM_LAYER = 1
p_dropout = 0.5
SEQ_LEN = 32

def get_featurizer():
    featurizer = gluon.nn.HybridSequential()
    # conv layer
    featurizer.add(gluon.nn.Conv2D(kernel_size=(3,3), padding=(1,1), channels=32, activation="relu"))
    featurizer.add(gluon.nn.BatchNorm())
    ....
    featurizer.hybridize()
    return featurizer

class EncoderLayer(gluon.Block):
    def __init__(self, **kwargs):
        super(EncoderLayer, self).__init__(**kwargs)
        with self.name_scope():
            self.lstm = mx.gluon.rnn.LSTM(NUM_HIDDEN, NUM_LSTM_LAYER, bidirectional=True)

    def forward(self, x):
        x = x.transpose((0,3,1,2))
        x = x.flatten()
        x = x.split(num_outputs=SEQ_LEN, axis=1)  # (SEQ_LEN, N, CHANNELS)
        x = nd.concat(*[elem.expand_dims(axis=0) for elem in x], dim=0)
        x = self.lstm(x)
        x = x.transpose((1, 0, 2))  # (N, SEQ_LEN, HIDDEN_UNITS)
        return x

def get_encoder():
    encoder = gluon.nn.Sequential()
    encoder.add(EncoderLayer())
    encoder.add(gluon.nn.Dropout(p_dropout))
    return encoder

def get_decoder():
    decoder = mx.gluon.nn.Dense(units=ALPHABET_SIZE, flatten=False)
    decoder.hybridize()
    return decoder

def get_net():
    net = gluon.nn.Sequential()
    with net.name_scope():
        net.add(get_featurizer())
        net.add(get_encoder())
        net.add(get_decoder())
    return net
Any help would be highly appreciated.
Thank you very much.
There are a few requirements for a model in Gluon to be exportable to json:
It needs to be hybridizable, meaning that each child block should be hybridizable as well and the model works in both modes
All parameters should be initialized. Since Gluon uses deferred parameter initialization, that means that you should do a forward pass at least once before you can save the model.
I made some fixes to your code, also introducing new constants where needed. The most significant changes are:
Don't use split if you can avoid it, because it returns a list of NDArrays. Use reshape, which works seamlessly with Symbol as well.
Starting from MXNet version 1.3.0, LSTM is also hybridizable, so you can wrap it in a HybridBlock instead of a plain Block.
Use HybridSequential.
Here is the adjusted code, with an example at the bottom of how to save the model and how to load it back. You can find more information in this tutorial.
import mxnet as mx
from mxnet import gluon
from mxnet import nd

BATCH_SIZE = 1
CHANNELS = 100
ALPHABET_SIZE = 1000
NUM_HIDDEN = 200
NUM_CLASSES = 13550
NUM_LSTM_LAYER = 1
p_dropout = 0.5
SEQ_LEN = 32
HEIGHT = 100
WIDTH = 100

def get_featurizer():
    featurizer = gluon.nn.HybridSequential()
    featurizer.add(
        gluon.nn.Conv2D(kernel_size=(3, 3), padding=(1, 1), channels=32, activation="relu"))
    featurizer.add(gluon.nn.BatchNorm())
    return featurizer

class EncoderLayer(gluon.HybridBlock):
    def __init__(self, **kwargs):
        super(EncoderLayer, self).__init__(**kwargs)
        with self.name_scope():
            self.lstm = mx.gluon.rnn.LSTM(NUM_HIDDEN, NUM_LSTM_LAYER, bidirectional=True)

    def hybrid_forward(self, F, x):
        x = x.transpose((0, 3, 1, 2))
        x = x.flatten()
        x = x.reshape(shape=(SEQ_LEN, -1, CHANNELS))  # instead of x.split(num_outputs=SEQ_LEN, axis=1); (SEQ_LEN, N, CHANNELS)
        x = self.lstm(x)
        x = x.transpose((1, 0, 2))  # (N, SEQ_LEN, HIDDEN_UNITS)
        return x

def get_encoder():
    encoder = gluon.nn.HybridSequential()
    encoder.add(EncoderLayer())
    encoder.add(gluon.nn.Dropout(p_dropout))
    return encoder

def get_decoder():
    decoder = mx.gluon.nn.Dense(units=ALPHABET_SIZE, flatten=False)
    return decoder

def get_net():
    net = gluon.nn.HybridSequential()
    with net.name_scope():
        net.add(get_featurizer())
        net.add(get_encoder())
        net.add(get_decoder())
    return net

if __name__ == '__main__':
    net = get_net()
    net.initialize()
    net.hybridize()
    fake_data = mx.random.uniform(shape=(BATCH_SIZE, HEIGHT, WIDTH, CHANNELS))
    out = net(fake_data)
    net.export("mymodel")
    deserialized_net = gluon.nn.SymbolBlock.imports("mymodel-symbol.json", ['data'],
                                                    "mymodel-0000.params", ctx=mx.cpu())
    out2 = deserialized_net(fake_data)
    # just to check that we get the same results
    assert (out - out2).sum().asscalar() == 0