I went through this PyTorch CNN implementation available here: https://machinelearningknowledge.ai/pytorch-conv2d-explained-with-examples/
I am unable to understand how they replace the '?' with concrete values. What is the formula for calculating the output size of a CNN layer?
You have to work this out yourself in PyTorch, whereas TensorFlow/Keras infers it for you. If there is any other blog that explains this well, please drop it in the comments.
# Implementation of CNN/ConvNet Model
class CNN(torch.nn.Module):

    def __init__(self):
        super(CNN, self).__init__()
        # L1 ImgIn shape=(?, 28, 28, 1)
        #    Conv -> (?, 28, 28, 32)
        #    Pool -> (?, 14, 14, 32)
        self.layer1 = torch.nn.Sequential(
            torch.nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=2, stride=2),
            torch.nn.Dropout(p=1 - keep_prob))
        # L2 ImgIn shape=(?, 14, 14, 32)
        #    Conv -> (?, 14, 14, 64)
        #    Pool -> (?, 7, 7, 64)
        self.layer2 = torch.nn.Sequential(
            torch.nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=2, stride=2),
            torch.nn.Dropout(p=1 - keep_prob))
        # L3 ImgIn shape=(?, 7, 7, 64)
        #    Conv -> (?, 7, 7, 128)
        #    Pool -> (?, 4, 4, 128)
        self.layer3 = torch.nn.Sequential(
            torch.nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=1),
            torch.nn.Dropout(p=1 - keep_prob))
        # L4 FC 4x4x128 inputs -> 625 outputs
        self.fc1 = torch.nn.Linear(4 * 4 * 128, 625, bias=True)
        torch.nn.init.xavier_uniform_(self.fc1.weight)
        self.layer4 = torch.nn.Sequential(
            self.fc1,
            torch.nn.ReLU(),
            torch.nn.Dropout(p=1 - keep_prob))
        # L5 Final FC 625 inputs -> 10 outputs
        self.fc2 = torch.nn.Linear(625, 10, bias=True)
        torch.nn.init.xavier_uniform_(self.fc2.weight)  # initialize parameters

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = out.view(out.size(0), -1)  # Flatten them for FC
        out = self.fc1(out)
        out = self.fc2(out)
        return out


# instantiate CNN model
model = CNN()
model
Thanks!
I assume your calculation is wrong because:
PyTorch expects images in C x H x W format (e.g. 3x32x32, not 32x32x3)
The first dimension is always the batch dimension and should be omitted from the calculation, because all nn.Modules handle it by default
So if you want to calculate the input size for the first Linear layer, you can use this trick:
conv = nn.Sequential(self.layer1, self.layer2, self.layer3, nn.Flatten())
out = conv(torch.randn(1, im_height, im_width).unsqueeze(0))
# fc_layer_in_channels = out.shape[1]
self.fc1 = torch.nn.Linear(out.shape[1], 625, bias=True)
but only if you know im_height and im_width.
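If you want the closed-form formula the question asks about, the output size per spatial dimension of Conv2d and MaxPool2d is floor((size + 2*padding - dilation*(kernel_size - 1) - 1) / stride + 1), as given in the PyTorch docs. A small sketch that applies it to the model above (the helper name is mine, just for illustration):

import math

def out_size(length, kernel_size, stride=1, padding=0, dilation=1):
    # Conv2d/MaxPool2d output length per spatial dimension (PyTorch docs formula)
    return math.floor((length + 2 * padding - dilation * (kernel_size - 1) - 1) / stride + 1)

s = 28                                   # MNIST images are 28x28
s = out_size(s, 3, stride=1, padding=1)  # layer1 conv -> 28
s = out_size(s, 2, stride=2)             # layer1 pool -> 14
s = out_size(s, 3, stride=1, padding=1)  # layer2 conv -> 14
s = out_size(s, 2, stride=2)             # layer2 pool -> 7
s = out_size(s, 3, stride=1, padding=1)  # layer3 conv -> 7
s = out_size(s, 2, stride=2, padding=1)  # layer3 pool -> 4
print(s * s * 128)                       # 2048, i.e. the 4 * 4 * 128 used by fc1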
The best practice is to use torch.nn.AdaptiveAvgPool2d.
With this layer you can always get an output of a fixed spatial size.
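For example, a rough sketch of how that could look; the channel counts and the 4x4 target size are illustrative, not taken from the post above:

import torch
import torch.nn as nn

features = nn.Sequential(
    nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(2),
    nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
    nn.ReLU(),
    nn.AdaptiveAvgPool2d((4, 4)),   # always outputs a 4x4 spatial map, whatever the input size
)
head = nn.Linear(64 * 4 * 4, 10)

for hw in (28, 32, 64):             # any input resolution works
    x = torch.randn(8, 1, hw, hw)
    out = head(torch.flatten(features(x), 1))
    print(out.shape)                # torch.Size([8, 10]) every time

so the in_features of the Linear layer no longer depends on im_height and im_width.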
I am trying to train this VGG16 model, but the loss does not decrease and it seems that the model's parameters are not being updated.
Here is the model:
import torch
import torch.nn as nn
import math
import torch.nn.functional as F
from utils import AvgPoolConv
cfg = {
    'VGG11': [16, 'M', 32, 'M', 64, 64, 'M', 128, 128, 'M', 128, 128, 'M'],
    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}


class VGG(nn.Module):
    def __init__(self, vgg_name, use_bn, num_class=100):
        super(VGG, self).__init__()
        self.features = self._make_layers(cfg[vgg_name], use_bn)
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(512, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, num_class)
        )
        #self.classifier = nn.Linear(512, num_class)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                n = m.weight.size(1)
                m.weight.data.normal_(0, 1.0 / float(n))
                m.bias.data.zero_()

    def forward(self, x):
        out = self.features(x)
        out = self.classifier(out)
        return out

    def _make_layers(self, cfg, use_bn=True):
        layers = []
        in_channels = 3
        for x in cfg:
            if x == 'M':
                layers += [nn.AvgPool2d(2)]
                #layers += [AvgPoolConv(kernel_size=2, stride=2, input_channel=in_channels)]
            else:
                layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1),
                           nn.BatchNorm2d(x) if use_bn else nn.Dropout(0.25),
                           nn.ReLU(inplace=True)]
                in_channels = x
        #layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
        return nn.Sequential(*layers)
but if I delete the first 2 FC layers from the classifier, as shown below, the model trains and the loss is optimized:
self.features = self._make_layers(cfg[vgg_name], use_bn)
self.classifier = nn.Linear(512, num_class)
Why does this happen?
First, it would be good to verify whether the parameters are really not updated, or whether the change is just small.
Different architectures might require different tuning (learning rate, weight decay if you use it, etc.). A good thing to try when debugging is the "can I overfit it" test: take a single batch (or even a single sample) and check whether you can drive the loss to 0; you might need to tweak the optimization parameters mentioned before.
Assuming everything is correct and the gradient flows, I'd say: tune the learning rate and try adding batch normalization between your linear and ReLU layers (it should make training much faster).
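A minimal sketch of that single-batch check (batch size, input resolution and optimizer settings below are placeholders, not taken from your setup):

import torch
import torch.nn as nn

model = VGG('VGG16', use_bn=True, num_class=100)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

x = torch.randn(8, 3, 32, 32)              # one fixed batch (CIFAR-sized input assumed)
y = torch.randint(0, 100, (8,))

before = [p.detach().clone() for p in model.parameters()]
for step in range(200):                    # try to drive the loss on this one batch towards 0
    optimizer.zero_grad()
    loss = criterion(model(x), y)
    loss.backward()
    optimizer.step()

changed = any((p != b).any().item() for p, b in zip(model.parameters(), before))
print(loss.item(), changed)                # loss should approach 0 and changed should be True

If the loss does not go down even here, the issue is in the model or the optimization setup rather than in the data.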
I am having a hard time understanding image segmentation. I have implemented a UNet model for image segmentation. I am using the PASCAL VOC dataset and I am trying to train my model, but I got stuck when calculating the loss. I am unsure of what the expected shapes of the output and target tensors should be. Can someone please educate me on what I am doing wrong? My only guess is that I am missing something when it comes to the ground-truth images, since I don't know how the model will learn which class is which. Thanks!
Here is my Unet class:
import torch
import torch.nn as nn
from torchvision import transforms
def x2conv(in_channels, out_channels):
    double_conv = nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=0),
        nn.ReLU(inplace=True),
        nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=0),
        nn.ReLU(inplace=True))
    return double_conv


class Encoder(nn.Module):
    def __init__(self, chs):
        super().__init__()
        self.enc_blocks = nn.ModuleList(
            [x2conv(chs[i], chs[i+1]) for i in range(len(chs)-1)])
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        ftrs = []
        for block in self.enc_blocks:
            x = block(x)
            ftrs.append(x)
            x = self.pool(x)
        return ftrs


class Decoder(nn.Module):
    def __init__(self, chs):
        super().__init__()
        self.chs = chs
        self.upconvs = nn.ModuleList(
            [nn.ConvTranspose2d(chs[i], chs[i+1], kernel_size=2, stride=2) for i in range(len(chs)-1)])
        self.dec_blocks = nn.ModuleList(
            [x2conv(chs[i], chs[i+1]) for i in range(len(chs)-1)])

    def forward(self, x, encoder_features):
        for i in range(len(self.chs)-1):
            x = self.upconvs[i](x)
            enc_ftrs = self.crop(encoder_features[i], x)
            x = torch.cat([x, enc_ftrs], dim=1)
            x = self.dec_blocks[i](x)
        return x

    def crop(self, enc_ftrs, x):
        _, _, H, W = x.shape
        enc_ftrs = transforms.CenterCrop([H, W])(enc_ftrs)
        return enc_ftrs


class UNet(nn.Module):
    def __init__(self, enc_chs, dec_chs, num_class):
        super(UNet, self).__init__()
        self.encoder = Encoder(enc_chs)
        self.decoder = Decoder(dec_chs)
        self.softmax = nn.Conv2d(dec_chs[-1], num_class, kernel_size=1)

    def forward(self, x):
        enc_ftrs = self.encoder(x)
        out = self.decoder(enc_ftrs[::-1][0], enc_ftrs[::-1][1:])
        out = self.softmax(out)
        return out
And here is my dataset class:
from PIL import Image
import torchvision
import torchvision.transforms as T  # needed for T.Compose in __getitem__ below
VOC_CLASSES = [ # How to use?
"background",
"aeroplane",
"bicycle",
"bird",
"boat",
"bottle",
"bus",
"car",
"cat",
"chair",
"cow",
"diningtable",
"dog",
"horse",
"motorbike",
"person",
"pottedplant",
"sheep",
"sofa",
"train",
"tvmonitor",
]
VOC_COLORMAP = [ # How to use?
[0, 0, 0], # Background
[128, 0, 0], # Aeroplane
[0, 128, 0], # Bicycle
[128, 128, 0], # Bird
[0, 0, 128], # Boat
[128, 0, 128], # Bottle
[0, 128, 128], # Bus
[128, 128, 128], # Car
[64, 0, 0], # Cat
[192, 0, 0], # Chair
[64, 128, 0], # Cow
[192, 128, 0], # Diningtable
[64, 0, 128], # Dog
[192, 0, 128], # Horse
[64, 128, 128], # Motorbike
[192, 128, 128], # Person
[0, 64, 0], # Pottedplant
[128, 64, 0], # Sheep
[0, 192, 0], # Sofa
[128, 192, 0], # Train
[0, 64, 128], # tvmonitor
]
class VocDataset(torchvision.datasets.VOCSegmentation):
    def __init__(self, image_set, transform, root="../data/VOCtrainval_11-May-2012/", download=False, year="2012"):
        self.transform = transform
        self.year = year
        super().__init__(root=root, image_set=image_set,
                         download=download, transform=transform, year=year)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        # open images and do transformation  img = jpg, mask = png
        img = Image.open(self.images[index]).convert("RGB")
        target = Image.open(self.masks[index]).convert("RGB")

        if self.transform:
            img = self.transform(img)
            trfm = T.Compose([T.ToTensor(), T.Resize((388, 388))])
            target = trfm(target)

        return img, target
And lastly, here is my train function:
import torch
import torch.nn as nn
import torch.optim as optim
from unet import UNet
from torch.utils.data import DataLoader
from dataset import VocDataset
import torchvision.transforms as T
import torch.nn.functional as F
# Hyperparameters etc.
STD = [0.2686, 0.2652, 0.2812] # Std for dataset
MEAN = [0.4568, 0.4431, 0.4083] # Mean for dataset
MOMENTUM = 0.9
LEARNING_RATE = 1e-4
BATCH_SIZE = 32
NUM_EPOCHS = 1
NUM_WORKERS = 2
NUM_CLASSES = 20
TRAIN_SET = "train"
VAL_SET = "val"
ENC_CHANNELS = (3, 64, 128, 256, 512, 1024) # Encoder channels
DEC_CHANNELS = (1024, 512, 256, 128, 64) # Decoder channels
TRANSFORM = T.Compose(
[T.ToTensor(), T.Resize(SIZE), T.Normalize(MEAN, STD)]
)
def main():
    training_data = VocDataset(TRAIN_SET, TRANSFORM)
    train_dataloader = DataLoader(
        training_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, drop_last=True)

    # Create instance of unet
    unet = UNet(ENC_CHANNELS, DEC_CHANNELS, NUM_CLASSES)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(
        unet.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

    for epoch in range(NUM_EPOCHS):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(train_dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data  # Shape for labels and inputs are: [32,3,388,388]

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = unet(inputs)  # output shape is [32, 32, 388, 388]
            loss = criterion(outputs, labels)  # Error here
            loss.backward()
            optimizer.step()

    # print('Finished Training')


if __name__ == "__main__":
    main()
For starters, your labels and outputs have different numbers of channels (32 vs. 3). Cross-entropy loss expects them either to have the same number of channels, or the target to have only one channel with integer values indicating the relevant class.
Let's work with the latter case. Here, we need to reduce the target to a single channel, i.e. [32 x 388 x 388] for your input and batch size. (Secondarily, the UNet should ideally have one output channel per class; your list has 21 classes including the background, so the final output layer of the UNet decoder should have 21 outputs.)
To convert the label of size [32 x 3 x 388 x 388] to [32 x 388 x 388], you need to use the colormap. That is, create a new target tensor of size [32 x 388 x 388], and for each value target[i, j, k], assign the index into VOC_COLORMAP whose RGB triple matches the pixel stored at label[i, :, j, k].
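A rough sketch of that conversion, assuming the mask tensor holds the raw 0-255 RGB values (e.g. loaded with PILToTensor rather than ToTensor, which rescales to [0, 1]) and is resized with nearest-neighbour interpolation so the colours stay exact; the helper name is mine:

import torch

def mask_to_class_index(mask_rgb, colormap=VOC_COLORMAP):
    # mask_rgb: [3, H, W] integer tensor with values in 0-255
    colors = torch.tensor(colormap, dtype=mask_rgb.dtype)                     # [num_classes, 3]
    matches = (mask_rgb.unsqueeze(0) == colors[:, :, None, None]).all(dim=1)  # [num_classes, H, W]
    return matches.float().argmax(dim=0).long()                               # [H, W]; unmatched pixels fall back to class 0

# Per batch: [32, 3, 388, 388] RGB masks -> [32, 388, 388] class indices
# target_idx = torch.stack([mask_to_class_index(m) for m in labels])
# loss = nn.CrossEntropyLoss()(outputs, target_idx)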
I get this error message and I'm not sure why. My input is (batch, 1, 312) from tabular data and this CNN is constructed for a regression prediction. I worked out the shapes for each step with the formula (input + 2*padding - filter size)/stride + 1 as in the comment below. The problem appears to occur at x = self.fc(x) and I can't figure out why. Your help is greatly appreciated. Thank you.
class CNNWeather(nn.Module):
    # input (batch, 1, 312)
    def __init__(self):
        super(CNNWeather, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=8, kernel_size=9, stride=1, padding='valid')   # (312+2*0-9)/1 + 1 = 304
        self.pool1 = nn.AvgPool1d(kernel_size=2, stride=2)                                                 # 304/2 = 302
        self.conv2 = nn.Conv1d(in_channels=8, out_channels=12, kernel_size=3, stride=1, padding='valid')   # (302-3)/1+1 = 300
        self.pool2 = nn.AvgPool1d(kernel_size=2, stride=2)                                                 # 300/2 = 150
        self.conv3 = nn.Conv1d(in_channels=12, out_channels=16, kernel_size=3, stride=1, padding='valid')  # (150-3)/1+1 = 76
        self.pool3 = nn.AvgPool1d(kernel_size=2, stride=2)                                                 # 76/2 = 38
        self.conv4 = nn.Conv1d(in_channels=16, out_channels=20, kernel_size=3, stride=1, padding='valid')  # (38-3)/1+1 = 36
        self.pool4 = nn.AvgPool1d(kernel_size=2, stride=2)                                                 # 36/2 = 18  (batch, 20, 18)
        self.fc = nn.Linear(in_features=20*18, out_features=1)

    def forward(self, x):
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = self.pool3(F.relu(self.conv3(x)))
        x = self.pool4(F.relu(self.conv4(x)))
        print(x.size())
        x = x.view(x.size(0), -1)  # flatten (batch, 20*18)
        x = self.fc(x)
        return x
The problem seems to be related to the input size of your FC layer:
self.fc = nn.Linear(in_features=20*18, out_features=1)
The output of the previous layer is 340, so you must use in_features=340.
These are the output shapes for the third and fourth conv/pool blocks (with a batch size of 5):
torch.Size([5, 16, 73]) conv3 out
torch.Size([5, 16, 36]) pool3 out
torch.Size([5, 20, 34]) conv4 out
torch.Size([5, 20, 17]) pool4 out
Notice that the "pool4" layer outputs 20 x 17, i.e. 340 elements per sample.
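If you would rather not derive this by hand, here is a small sketch (the layer stack mirrors CNNWeather above) that pushes a dummy batch through the conv/pool part and reads off the flattened size:

import torch
import torch.nn as nn

conv_part = nn.Sequential(
    nn.Conv1d(1, 8, kernel_size=9, stride=1, padding=0), nn.ReLU(), nn.AvgPool1d(2, 2),    # padding=0 is the same as 'valid'
    nn.Conv1d(8, 12, kernel_size=3, stride=1, padding=0), nn.ReLU(), nn.AvgPool1d(2, 2),
    nn.Conv1d(12, 16, kernel_size=3, stride=1, padding=0), nn.ReLU(), nn.AvgPool1d(2, 2),
    nn.Conv1d(16, 20, kernel_size=3, stride=1, padding=0), nn.ReLU(), nn.AvgPool1d(2, 2),
    nn.Flatten(),
)
with torch.no_grad():
    n_features = conv_part(torch.zeros(1, 1, 312)).shape[1]
print(n_features)                                   # 340 = 20 channels * 17 steps
fc = nn.Linear(in_features=n_features, out_features=1)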
Hello, below is the PyTorch model I am trying to run, but I am getting an error; I have posted the error trace as well. It was running well until I added the convolution layers. I am still new to deep learning and PyTorch, so I apologize if this is a silly question. I am using Conv1d, so why does Conv1d expect a 3-dimensional input, and why is it getting a 2-dimensional input, which also seems odd?
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(CROP_SIZE*CROP_SIZE*3, 512)
        self.conv1d1 = nn.Conv1d(in_channels=512, out_channels=64, kernel_size=1, stride=2)
        self.fc2 = nn.Linear(64, 128)
        self.conv1d2 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=1, stride=2)
        self.fc3 = nn.Linear(64, 256)
        self.conv1d3 = nn.Conv1d(in_channels=256, out_channels=64, kernel_size=1, stride=2)
        self.fc4 = nn.Linear(64, 256)
        self.fc4 = nn.Linear(256, 128)
        self.fc5 = nn.Linear(128, 64)
        self.fc6 = nn.Linear(64, 32)
        self.fc7 = nn.Linear(32, 64)
        self.fc8 = nn.Linear(64, frame['landmark_id'].nunique())

    def forward(self, x):
        x = F.relu(self.conv1d1(self.fc1(x)))
        x = F.relu(self.conv1d2(self.fc2(x)))
        x = F.relu(self.conv1d3(self.fc3(x)))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = F.relu(self.fc6(x))
        x = F.relu(self.fc7(x))
        x = self.fc8(x)
        return F.log_softmax(x, dim=1)
net = Net()
import torch.optim as optim
loss_function = nn.CrossEntropyLoss()
net.to(torch.device('cuda:0'))
for epoch in range(3):  # 3 full passes over the data
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    for data in tqdm(train_loader):  # `data` is a batch of data
        X = data['image'].to(device)  # X is the batch of features
        y = data['landmarks'].to(device)  # y is the batch of targets.
        optimizer.zero_grad()  # sets gradients to 0 before loss calc. You will do this likely every step.
        output = net(X.view(-1, CROP_SIZE*CROP_SIZE*3))  # pass in the reshaped batch
        # print(np.argmax(output))
        # print(y)
        loss = F.nll_loss(output, y)  # calc and grab the loss value
        loss.backward()  # apply this loss backwards thru the network's parameters
        optimizer.step()  # attempt to optimize weights to account for loss/gradients
    print(loss)  # print loss. We hope loss (a measure of wrong-ness) declines!
Error trace
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-42-f5ed7999ce57> in <module>
5 y = data['landmarks'].to(device) # y is the batch of targets.
6 optimizer.zero_grad() # sets gradients to 0 before loss calc. You will do this likely every step.
----> 7 output = net(X.view(-1,CROP_SIZE*CROP_SIZE*3)) # pass in the reshaped batch
8 # print(np.argmax(output))
9 # print(y)
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
<ipython-input-37-6d3e34d425a0> in forward(self, x)
16
17 def forward(self, x):
---> 18 x = F.relu(self.conv1d1(self.fc1(x)))
19 x = F.relu(self.conv1d2(self.fc2(x)))
20 x = F.relu(self.conv1d3(self.fc3(x)))
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/conv.py in forward(self, input)
210 _single(0), self.dilation, self.groups)
211 return F.conv1d(input, self.weight, self.bias, self.stride,
--> 212 self.padding, self.dilation, self.groups)
213
214
RuntimeError: Expected 3-dimensional input for 3-dimensional weight [64, 512, 1], but got 2-dimensional input of size [4, 512] instead
You should learn how convolutions work (e.g. see this answer) and some neural network basics (this tutorial from PyTorch).
Basically, Conv1d expects inputs of shape [batch, channels, features] (where features can be some timesteps and can vary, see example).
nn.Linear expects shape [batch, features] as it is fully connected and each input feature is connected to each output feature.
You can verify those shapes by yourself, for torch.nn.Linear:
import torch
layer = torch.nn.Linear(20, 10)
data = torch.randn(64, 20) # [batch, in_features]
layer(data).shape # [64, 10], [batch, out_features]
For Conv1d:
layer = torch.nn.Conv1d(in_channels=20, out_channels=10, kernel_size=3, padding=1)
data = torch.randn(64, 20, 15) # [batch, channels, timesteps]
layer(data).shape # [64, 10, 15], [batch, out_channels, timesteps]
layer(torch.randn(32, 20, 25)).shape # [32, 10, 25]
BTW, as you are working with images, you should use torch.nn.Conv2d instead.
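For completeness, the analogous shape check for Conv2d, which expects [batch, channels, height, width] (the numbers here are just illustrative):

layer = torch.nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
data = torch.randn(64, 3, 32, 32)   # [batch, channels, height, width]
layer(data).shape                   # [64, 16, 32, 32], spatial size preserved by padding=1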
Most of the PyTorch functions work on batch data, i.e. they accept input of size (batch_size, shape). Szymon Maszke already posted an answer related to that.
So in your case, you can use the unsqueeze and squeeze functions to add and remove the extra dimension.
Here's the sample code:
import torch
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(100, 512)
        self.conv1d1 = nn.Conv1d(in_channels=512, out_channels=64, kernel_size=1, stride=2)
        self.fc2 = nn.Linear(64, 128)

    def forward(self, x):
        x = self.fc1(x)
        x = x.unsqueeze(dim=2)      # add a length-1 "timestep" dimension: [batch, 512] -> [batch, 512, 1]
        x = F.relu(self.conv1d1(x))
        x = x.squeeze()             # remove it again before the next Linear layer
        x = self.fc2(x)
        return x
net = Net()
bsize = 4
inp = torch.randn((bsize, 100))
out = net(inp)
print(out.shape)
I'm trying to implement a pattern recognition model using a fully convolutional network (fig. 1 in https://www.sciencedirect.com/science/article/pii/S0031320318304370; I was able to get the full text without signing in or anything, but if it's a problem I can attach a picture too!), but I'm getting a size error when moving from the final Conv2d layer to the first FC layer.
Here is my error message:
RuntimeError: size mismatch, m1: [4 x 1024], m2: [4 x 1024] at /pytorch/aten/src/THC/generic/THCTensorMathBlas.cu:283
Originally, as in the figure, my first linear layer was:
nn.Linear(4*4*512, 1024)
but after getting the size mismatch, I changed it to:
nn.Linear(4,1024)
Now, I have a strange error message as written above.
For reference (if it helps), here is my code:
import torch
import torch.nn as nn
import torch.utils.model_zoo as model_zoo
class convnet(nn.Module):

    def __init__(self, num_classes=1000):
        super(convnet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.MaxPool2d(kernel_size=1),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),  # stride=2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),  # stride=2),
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),  # nn.Dropout(p=0.5)
        )
        self.classifier = nn.Sequential(
            nn.Linear(4, 1024),
            nn.Dropout(p=0.5),
            nn.ReLU(inplace=True),
            #nn.Dropout(p=0.5),
            nn.Linear(1024, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, num_classes),
        )

    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x
I suspect it's an issue with the padding and stride.
Thanks!
The error comes from a matrix multiplication, where m1 should be an m x n matrix and m2 an n x p matrix, giving an m x p result. In your case they are 4 x 1024 and 4 x 1024, which doesn't work because the inner dimensions must match and 1024 != 4.
That means your input to the first linear layer has size [4, 1024] (4 being the batch size), therefore the input features of the first linear layer should be 1024.
self.classifier = nn.Sequential(
    nn.Linear(1024, 1024),
    nn.Dropout(p=0.5),
    nn.ReLU(inplace=True),
    #nn.Dropout(p=0.5),
    nn.Linear(1024, 1024),
    nn.ReLU(inplace=True),
    nn.Linear(1024, num_classes),
)
If you are uncertain how many features your input has, you can print out its size just before the layer:
x = self.features(x)
x = torch.flatten(x,1)
print(x.size()) # => torch.Size([4, 1024])
x = self.classifier(x)
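Alternatively, if your PyTorch version has it (nn.LazyLinear was added in 1.8), you can let the first linear layer infer its in_features from the first batch that flows through it; a sketch of that variant:

self.classifier = nn.Sequential(
    nn.LazyLinear(1024),   # in_features inferred on the first forward pass
    nn.Dropout(p=0.5),
    nn.ReLU(inplace=True),
    nn.Linear(1024, 1024),
    nn.ReLU(inplace=True),
    nn.Linear(1024, num_classes),
)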