Is there any way I can concatenate branches of different spatial dimensions in a small Inception (GoogLeNet) module?
For example, a 32 x 32 x 3 image (torch.Size([1, 3, 32, 32])) will pass through an inception module with the following branches:
a convolution with (32 channels, 1 x 1 filters)
another convolution with (32 channels, 3 x 3 filters)
merge (concatenate along the channel dimension)
[Figure: Inception module]
The issue, however, is that the resulting tensor sizes of the two convolutions are different:
(32 channels, 1 x 1 filters) -> [1, 32, 32, 32]
(32 channels, 3 x 3 filters) -> [1, 32, 30, 30]
How can I concatenate the two branches? Should I add padding? I tried ZeroPad2d, but it didn't work.
For your reference:
class Inception(nn.Module):
    def __init__(self, in_channel, ch1, ch3):
        super(Inception, self).__init__()
        self.branch1 = nn.Sequential(
            ConvBlock(in_channel, ch1, kernel_size=1, stride=1, padding=0)
        )
        self.branch2 = nn.Sequential(
            ConvBlock(in_channel, ch3, kernel_size=3, stride=1, padding=0)
        )

    def forward(self, x):
        branch1 = self.branch1(x)
        branch2 = self.branch2(x)
        return torch.cat([branch1, branch2], 1)  # <- error here
You need to match your kernel size and padding so that both branches produce the same spatial size. With stride 1, a convolution preserves the spatial size when padding = (kernel_size - 1) // 2: for kernel_size=1 no padding is needed, but for kernel_size=3 you need padding to be 1:
self.branch2 = nn.Sequential(
    ConvBlock(in_channel, ch3, kernel_size=3, stride=1, padding=1)
)
See this nice tutorial for more details.
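As a quick check, here is a minimal sketch of the fixed shapes (assuming ConvBlock simply wraps nn.Conv2d plus an activation, which the original post doesn't show): with matched padding both branches keep the 32 x 32 resolution, so the channel-wise concatenation works.

import torch
import torch.nn as nn

# Hypothetical stand-in for the ConvBlock above (Conv2d + ReLU assumed).
def conv_block(in_ch, out_ch, **kwargs):
    return nn.Sequential(nn.Conv2d(in_ch, out_ch, **kwargs), nn.ReLU())

x = torch.randn(1, 3, 32, 32)
branch1 = conv_block(3, 32, kernel_size=1, stride=1, padding=0)(x)
branch2 = conv_block(3, 32, kernel_size=3, stride=1, padding=1)(x)
print(branch1.shape)  # torch.Size([1, 32, 32, 32])
print(branch2.shape)  # torch.Size([1, 32, 32, 32])
out = torch.cat([branch1, branch2], dim=1)
print(out.shape)      # torch.Size([1, 64, 32, 32])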
Related
I'm writing a thesis and want to present a visualisation of the CNN architecture used for the analysis (written in PyTorch). I came across this cool repository PlotNeuralNet, with examples of how to generate LaTeX code for drawing neural networks in reports and presentations. However, I'm having trouble working out how exactly to define my particular architecture.
Here is an example of how one would define an architecture.
import sys
sys.path.append('../')
from pycore.tikzeng import *

# define your arch
arch = [
    to_head('..'),
    to_cor(),
    to_begin(),
    to_Conv("conv1", 512, 64, offset="(0,0,0)", to="(0,0,0)", height=64, depth=64, width=2),
    to_Pool("pool1", offset="(0,0,0)", to="(conv1-east)"),
    to_Conv("conv2", 128, 64, offset="(1,0,0)", to="(pool1-east)", height=32, depth=32, width=2),
    to_connection("pool1", "conv2"),
    to_Pool("pool2", offset="(0,0,0)", to="(conv2-east)", height=28, depth=28, width=1),
    to_SoftMax("soft1", 10, "(3,0,0)", "(pool1-east)", caption="SOFT"),
    to_connection("pool2", "soft1"),
    to_Sum("sum1", offset="(1.5,0,0)", to="(soft1-east)", radius=2.5, opacity=0.6),
    to_connection("soft1", "sum1"),
    to_end()
]

def main():
    namefile = str(sys.argv[0]).split('.')[0]
    to_generate(arch, namefile + '.tex')

if __name__ == '__main__':
    main()
However, looking at the different blocks available in the pycore module, I'm still not able to use the tool. The usage documentation is not very elaborate, so I was hoping someone here would find it trivial to define the architecture below. Otherwise, are there any good alternative ways to visualise it?
class Net20(nn.Module):
    """CNN for 20-day image.

    This particular model should have:
    - 3 blocks
    - 64 layers in the first block, multiplied by 2 in each subsequent block
    - filter size (5,3)
    - vertical stride = 3 (but only in first layer)
    - vertical dilation = 2 (but only in first layer)
    - Leaky ReLU activation function
    - max pooling (2,1) at the end of each block
    """
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Sequential(
            Conv2dSame(1, 64, kernel_size=(5,3), stride=(3,1), dilation=(2,1)),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(negative_slope=0.01, inplace=True),
            nn.MaxPool2d((2, 1), ceil_mode=True)
        )
        self.layer2 = nn.Sequential(
            Conv2dSame(64, 128, kernel_size=(5,3)),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(negative_slope=0.01, inplace=True),
            nn.MaxPool2d((2, 1), ceil_mode=True)
        )
        self.layer3 = nn.Sequential(
            Conv2dSame(128, 256, kernel_size=(5,3)),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(negative_slope=0.01, inplace=True),
            nn.MaxPool2d((2, 1), ceil_mode=True)
        )
        self.fc1 = nn.Sequential(
            nn.Dropout(p=0.5),
            nn.Linear(46080, 1),
        )

    def forward(self, x):
        x = x.reshape(-1, 1, 64, 60)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = x.reshape(-1, 46080)
        x = self.fc1(x)
        return x
You can try model.summary() or keras.utils.plot_model; note these are Keras utilities, so they apply to a Keras model rather than the PyTorch one above. You may want to check: https://machinelearningmastery.com/visualize-deep-learning-neural-network-model-keras/
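For reference, a minimal Keras sketch of that suggestion (the tiny model here is a hypothetical stand-in, not the Net20 above; plot_model also needs pydot and graphviz installed):

from tensorflow import keras

# Hypothetical stand-in model; substitute your own Keras model.
model = keras.Sequential([
    keras.layers.Conv2D(64, (5, 3), activation='relu', input_shape=(64, 60, 1)),
    keras.layers.Flatten(),
    keras.layers.Dense(1),
])

model.summary()  # prints a layer-by-layer summary table
keras.utils.plot_model(model, to_file='model.png', show_shapes=True)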
I'm building a DQN that takes a 24 x 10 array of values 0, 1, 2 (representing a Tetris board) and an int 0-5 (representing the current playable tetromino).
I flatten my array and convert it to a tensor before passing it to my convolutional layers, but this is the error I keep getting:
Expected 4-dimensional input for 4-dimensional weight [16, 3, 240, 240], but got 1-dimensional input of size [240] instead
I've tried reducing the kernel size and stride, as well as not flattening the array, but neither has worked.
For reference, this is my DQN:
class DQN(nn.Module):
    def __init__(self):
        super(DQN, self).__init__()
        self.conv1_board = nn.Conv2d(3, 16, kernel_size=240, stride=1)  # 3 input channels for 0,1,2; kernel_size 240 for length of tensor
        self.conv2_board = nn.Conv2d(16, 32, kernel_size=240, stride=1)
        self.conv3_board = nn.Conv2d(32, 6, kernel_size=240, stride=1)
        self.conv1_piece = nn.Conv2d(6, 16, kernel_size=240, stride=1)  # in channels 6 as 6 possible values
        self.conv2_piece = nn.Conv2d(16, 6, kernel_size=240, stride=1)
        self.fc1 = nn.Linear(1, 32)
        self.fc2 = nn.Linear(32, 6)
        self.flatten = nn.Flatten()

    def flt_totns(self, arr):
        flt = []
        for l in arr:
            flt.extend(l)
        return torch.FloatTensor(flt)

    def forward(self, states):  # inputs to conv layers should be tensors, not lists: convert list => tensor
        board, piece = states
        board = self.flt_totns(board)
        embed_board = flatten(self.conv3_board(self.conv2_board(self.conv1_board(board))))
        embed_piece = flatten(self.conv2_piece(self.conv1_piece(piece)))
        embed_joined = torch.cat([embed_board, embed_piece])
        return self.fc2(self.fc1(embed_joined))
I'm very new to CNNs in PyTorch, so I'm sure a lot of my reasoning is faulty. For example, I'm still not sure how kernel size relates to the shape of the input, or whether input channels still apply to array inputs. But any help would be greatly appreciated.
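For context, nn.Conv2d expects a 4-D input of shape (batch, channels, height, width), and the kernel cannot exceed the spatial dimensions. A minimal sketch of feeding a 24 x 10 board (values 0-2 one-hot encoded into 3 channels) through a small conv layer, under those assumptions:

import torch
import torch.nn as nn
import torch.nn.functional as F

board = torch.randint(0, 3, (24, 10))              # board values 0, 1, 2
x = F.one_hot(board, num_classes=3).float()        # (24, 10, 3)
x = x.permute(2, 0, 1).unsqueeze(0)                # (1, 3, 24, 10): N, C, H, W
conv = nn.Conv2d(3, 16, kernel_size=3, padding=1)  # kernel must fit inside 24 x 10
print(conv(x).shape)                               # torch.Size([1, 16, 24, 10])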
I have a net like this (example from here):
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify it with a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = torch.flatten(x, 1)  # flatten all dimensions except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

net = Net()
and another net like this (example from here):
class binaryClassification(nn.Module):
    def __init__(self):
        super(binaryClassification, self).__init__()
        # Number of input features is 12.
        self.layer_1 = nn.Linear(12, 64)
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)

    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        return x
I'd like to change, for example, self.fc2 = nn.Linear(120, 84) so that it has 121 inputs, where the 121st is the x (output) of the binaryClassification network.
The idea is to use a CNN and a non-CNN network at the same time, training both so that each influences the other.
Is it possible? How can I do that? (Keras or PyTorch examples are both fine.)
Or maybe the idea is crazy and there is an easier way to mix data and images as inputs to a single network?
It is a perfectly valid approach: you are taking two different input data sources, processing them, and combining the results to solve a common goal (in this case it seems to be 10-class image classification). You can define the input to your Net network to be a tuple of the image needed for the original Net and the 12-value feature vector for your binaryClassification. Example code:
import torch
import torch.nn as nn
import torch.nn.functional as F

class binaryClassification(nn.Module):
    ...  # same as above

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension
        self.binClas = binaryClassification()
        self.fc2 = nn.Linear(121, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, inputs):
        x, features = inputs  # split tuple
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify it with a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = torch.flatten(x, 1)  # flatten all dimensions except the batch dimension
        # Concatenate with the binaryClassification output along the feature dimension
        x = torch.cat([F.relu(self.fc1(x)), self.binClas(features)], dim=1)
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

net = Net()
However! Be careful about training them together: it is hard to balance both branches of the network so that both learn. I would recommend training them separately for a while before plugging them together (generally speaking, the hyperparameters optimal for one part of the network will probably not be optimal for the other). To do this, you could freeze one part of the network while training the other, and vice versa (check this link to see how to freeze parts of a torch nn).
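A minimal sketch of the freezing idea, using the binClas attribute from the model above: disable gradients for one branch while the other trains, then swap.

# Freeze the binary-classification branch while training the CNN part:
for param in net.binClas.parameters():
    param.requires_grad = False

# ... train the CNN branch for a while ...

# Then unfreeze it to train the other branch:
for param in net.binClas.parameters():
    param.requires_grad = True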
The most naive way to do it would be to instantiate both models, sum the two predictions, and compute the loss on the sum. This will backpropagate through both models:
import torch
from torch import optim

net1 = Net1()
net2 = Net2()

bce = torch.nn.BCEWithLogitsLoss()
params = list(net1.parameters()) + list(net2.parameters())
optimizer = optim.SGD(params, lr=0.01)  # SGD needs an explicit learning rate

for x, ground_truth in your_data_loader:
    optimizer.zero_grad()
    prediction = net1(x) + net2(x)  # the 2 models must output tensors of the same shape
    loss = bce(prediction, ground_truth)
    loss.backward()
    optimizer.step()
You could also, for example:
- implement the layers of Net1 and Net2 in a single model
- train Net1 and Net2 separately and ensemble them later (see the sketch below)
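For the ensembling option, a minimal sketch (assuming both nets are already trained and produce outputs of the same shape): average their predictions at inference time.

with torch.no_grad():
    prediction = (net1(x) + net2(x)) / 2  # simple average of the two models' outputs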
I have Python test code which predefines nz = 10 and ngf = 64:
def test_Generator_shapes():
    nz = 10
    netG = Generator(nz, ngf=64, nc=1)

    batch_size = 32
    noise = torch.randn(batch_size, nz, 1, 1)
    out = netG(noise, verbose=True)
    assert out.shape == torch.Size([batch_size, 1, 28, 28]), f"Bad shape of out: out.shape={out.shape}"
    print('Success')

test_Generator_shapes()
Now I need to reset the hidden layers and other parameters to be able to output images of size 28x28, i.e. torch.Size([batch_size, 1, 28, 28]).
Can someone please suggest what changes I should make in the following code so that it generates images of 28x28 instead of the present 64x64?
class Generator(nn.Module):
    def __init__(self, nz=10, ngf=28, nc=1, ndf=28):
        """GAN generator.

        Args:
          nz: Number of elements in the latent code.
          ngf: Base size (number of channels) of the generator layers.
          nc: Number of channels in the generated images.
        """
        ngf = 28
        super(Generator, self).__init__()
        self.ngpu = 0
        self.main = nn.Sequential(
            # input is Z, going into a convolution
            nn.ConvTranspose2d(nz, ngf * 8, 4, 1, 0, bias=False),
            nn.BatchNorm2d(ngf * 8),
            nn.ReLU(True),
            # state size. (ngf*8) x 4 x 4
            nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            # state size. (ngf*4) x 8 x 8
            nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            # state size. (ngf*2) x 16 x 16
            nn.ConvTranspose2d(ngf * 2, ngf, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            # state size. (ngf) x 32 x 32
            nn.ConvTranspose2d(ngf, nc, 4, 2, 1, bias=False),
            nn.Tanh()
            # state size. (nc) x 64 x 64
        )
        # YOUR CODE HERE
        # raise NotImplementedError()

    def forward(self, z, verbose=False):
        """Generate images by transforming the given noise tensor.

        Args:
          z of shape (batch_size, nz, 1, 1): Tensor of noise samples. We use the last two singleton
            dimensions so that we can feed z to the generator without reshaping.
          verbose (bool): Whether to print intermediate shapes (True) or not (False).

        Returns:
          out of shape (batch_size, nc, 28, 28): Generated images.
        """
        # YOUR CODE HERE
        x = self.main(z)
        print(x.size())
        return x
        # raise NotImplementedError()
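For reference, one common way to reach 28 x 28 from a (nz, 1, 1) latent code is the spatial progression 1 -> 4 -> 7 -> 14 -> 28, using the ConvTranspose2d output size (in - 1) * stride - 2 * padding + kernel_size. A minimal sketch of such a stack to replace self.main inside __init__ (these layer sizes are one possible choice, not necessarily the assignment's intended solution):

self.main = nn.Sequential(
    nn.ConvTranspose2d(nz, ngf * 4, 4, 1, 0, bias=False),       # 1x1   -> 4x4
    nn.BatchNorm2d(ngf * 4),
    nn.ReLU(True),
    nn.ConvTranspose2d(ngf * 4, ngf * 2, 3, 2, 1, bias=False),  # 4x4   -> 7x7
    nn.BatchNorm2d(ngf * 2),
    nn.ReLU(True),
    nn.ConvTranspose2d(ngf * 2, ngf, 4, 2, 1, bias=False),      # 7x7   -> 14x14
    nn.BatchNorm2d(ngf),
    nn.ReLU(True),
    nn.ConvTranspose2d(ngf, nc, 4, 2, 1, bias=False),           # 14x14 -> 28x28
    nn.Tanh()
)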
I'm trying to implement a simple multi-label image classifier using PyTorch Lightning. Here's the model definition:
import torch
from torch import nn
import pytorch_lightning as pl

# creates network class
class Net(pl.LightningModule):
    def __init__(self):
        super().__init__()

        # defines conv layers
        self.conv_layer_b1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=32,
                      kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Flatten(),
        )

        # passes dummy x matrix to find the input size of the fc layer
        x = torch.randn(1, 3, 800, 600)
        self._to_linear = None
        self.forward(x)

        # defines fc layer
        self.fc_layer = nn.Sequential(
            nn.Linear(in_features=self._to_linear,
                      out_features=256),
            nn.ReLU(),
            nn.Linear(256, 5),
        )

        # defines accuracy metric
        self.accuracy = pl.metrics.Accuracy()
        self.confusion_matrix = pl.metrics.ConfusionMatrix(num_classes=5)

    def forward(self, x):
        x = self.conv_layer_b1(x)
        if self._to_linear is None:
            # does not run fc layer if input size is not determined yet
            self._to_linear = x.shape[1]
        else:
            x = self.fc_layer(x)
        return x

    def cross_entropy_loss(self, logits, y):
        criterion = nn.CrossEntropyLoss()
        return criterion(logits, y)

    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        logits = self.forward(x)
        train_loss = self.cross_entropy_loss(logits, y)
        train_acc = self.accuracy(logits, y)
        train_cm = self.confusion_matrix(logits, y)
        self.log('train_loss', train_loss)
        self.log('train_acc', train_acc)
        self.log('train_cm', train_cm)
        return train_loss

    def validation_step(self, val_batch, batch_idx):
        x, y = val_batch
        logits = self.forward(x)
        val_loss = self.cross_entropy_loss(logits, y)
        val_acc = self.accuracy(logits, y)
        return {'val_loss': val_loss, 'val_acc': val_acc}

    def validation_epoch_end(self, outputs):
        avg_val_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        avg_val_acc = torch.stack([x['val_acc'] for x in outputs]).mean()
        self.log("val_loss", avg_val_loss)
        self.log("val_acc", avg_val_acc)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.0008)
        return optimizer
The issue is probably not the machine, since I'm using a cloud instance with 60 GB of RAM and 12 GB of VRAM. Whenever I run this model, even for a single epoch, I get an out-of-memory error. On the CPU it looks like this:
RuntimeError: [enforce fail at CPUAllocator.cpp:64] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 1966080000 bytes. Error code 12 (Cannot allocate memory)
and on the GPU it looks like this:
RuntimeError: CUDA out of memory. Tried to allocate 7.32 GiB (GPU 0; 11.17 GiB total capacity; 4.00 KiB already allocated; 2.56 GiB free; 2.00 MiB reserved in total by PyTorch)
Clearing the cache and reducing the batch size did not work. I'm a novice, so clearly something here is exploding, but I can't tell what. Any help would be appreciated.
Thank you!
Indeed, it's not a machine issue; the model itself is simply unreasonably big. Typically, if you look at common CNN models, the fc layers occur near the end, after the input has already passed through quite a few convolutional blocks (and had its spatial resolution reduced).
Assuming inputs of shape (batch, 3, 800, 600), the feature map after conv_layer_b1's MaxPool operation has shape (batch, 32, 400, 300). After flattening, that becomes (batch, 32 * 400 * 300), i.e., (batch, 3,840,000).
The immediately following fc_layer thus contains nn.Linear(3840000, 256), which is simply absurd: this single linear layer holds 3,840,000 x 256, roughly 983 million, trainable parameters. For reference, popular image-classification CNNs have roughly 3 to 30 million parameters on average, with larger variants reaching 60 to 80 million; few ever cross the 100 million mark.
You can count your model's parameters with this:

def count_params(model):
    return sum(map(lambda p: p.data.numel(), model.parameters()))
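For instance, applied to the original model above, it would report roughly 983 million parameters:

print(count_params(Net()))  # roughly 983,000,000 for the original model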
My advice: 800 x 600 is a really massive input size. Reduce it to something like 400 x 300 if possible. Furthermore, add several convolutional blocks similar to conv_layer_b1 before the FC layer. For example:
def get_conv_block(C_in, C_out):
    return nn.Sequential(
        nn.Conv2d(in_channels=C_in, out_channels=C_out,
                  kernel_size=3, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2)
    )

class Net(pl.LightningModule):
    def __init__(self):
        super().__init__()

        # defines conv layers
        self.conv_layer_b1 = get_conv_block(3, 16)
        self.conv_layer_b2 = get_conv_block(16, 32)
        self.conv_layer_b3 = get_conv_block(32, 64)
        self.conv_layer_b4 = get_conv_block(64, 128)
        self.conv_layer_b5 = get_conv_block(128, 256)

        # passes dummy x matrix to find the input size of the fc layer
        x = torch.randn(1, 3, 800, 600)
        self._to_linear = None
        self.forward(x)

        # defines fc layer
        self.fc_layer = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=self._to_linear,
                      out_features=256),
            nn.ReLU(),
            nn.Linear(256, 5)
        )

        # defines accuracy metric
        self.accuracy = pl.metrics.Accuracy()
        self.confusion_matrix = pl.metrics.ConfusionMatrix(num_classes=5)

    def forward(self, x):
        x = self.conv_layer_b1(x)
        x = self.conv_layer_b2(x)
        x = self.conv_layer_b3(x)
        x = self.conv_layer_b4(x)
        x = self.conv_layer_b5(x)
        if self._to_linear is None:
            # does not run fc layer if input size is not determined yet
            self._to_linear = nn.Flatten()(x).shape[1]
        else:
            x = self.fc_layer(x)
        return x
Here, because more conv-relu-pool blocks are applied, the input is reduced to a feature map of much smaller shape, (batch, 256, 25, 18), and the overall number of trainable parameters drops to roughly 30 million.
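A quick sanity check of those numbers, with a dummy forward pass through the revised blocks (reusing count_params from above):

net = Net()
x = torch.randn(1, 3, 800, 600)
feat = net.conv_layer_b5(net.conv_layer_b4(net.conv_layer_b3(
    net.conv_layer_b2(net.conv_layer_b1(x)))))
print(feat.shape)         # torch.Size([1, 256, 25, 18])
print(count_params(net))  # roughly 30 million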