Temporal sequence feature extraction CNN, batches with different dimensions - deep-learning

I am using a CNN to extract features from temporal data of different lengths. I am using pad_sequence to pad the data in a batch. However as the max length in a batch will change, the padded sequence length differs by batch. This creates errors when i flatten the data for the FCN layer (as the dimension of the flattened vector changes). I am currently handling this by using an 'adaptive avg pooling layer' in before the FCN layers. As this is a global averaging, it fixes the output dimension for the FCN. However I am not sure if this is the correct thing to do.
Code is:
##pad tensors
def pad_collate(batch):
sequences = [item[0] for item in batch]
lengths = [len(seq) for seq in sequences]
padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)
return padded_sequences, lengths
## Create dataloader
trainData = Sequence(root = path)
trainDataLoader = DataLoader(trainData, batch_size = BATCH_SIZE, collate_fn= pad_collate)
## CNN model
class FeatureExtractor(nn.Module):
def __init__(self, block, layers):
super(FeatureExtractor, self).__init__()
self.inplanes = 6
## 1st CONV layers
self.conv1 = nn.Conv2d(in_channels = 1, out_channels = 6, kernel_size = 3, stride = 2, padding = 4)
self.bn1 = nn.BatchNorm2d(6)
self.relu1 = nn.ReLU()
self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride = 2, padding = 1)
## residual blocks
self.layer0 = self._make_layer(block, 12, layers[0], stride = 1)
self.layer1 = self._make_layer(block, 24, layers[1], stride = 2)
self.avgpool = nn.AdaptiveAvgPool2d((5,5)) ##### MY CURRENT SOLUTION #####
self.fc = nn.Linear(600, 128)
def _make_layer(self, block, planes, blocks, stride):
downsample = None
if stride != 1 or self.inplanes != planes:
downsample = nn.Sequential(nn.Conv2d(self.inplanes, planes, kernel_size=1, stride=stride),
nn.BatchNorm2d(planes))
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
## first conv
x = self.conv1(x)
x = self.bn1(x)
x = self.relu1(x)
x = self.maxpool1(x)
## conv blocks
x = self.layer0(x)
x = self.layer1(x)
##FCN layer
x = self.avgpool(x)
x = torch.flatten(x, 1)
output = self.fc(x)
return output
Any other comments are also welcome (i am self-taught)

Related

Self attention module occpuying too large VRAM with large input

I was trying to add a self-attention module for Progressive GAN (ProGAN) and put it to the last layer before to RGB. After running a simple test file I found that when the model grows to 256x256 output, the process is killed.
Then I tried the following test code:
import torch
import torch.nn as nn
class SelfAttention(nn.Module):
def __init__(self, channels):
super(SelfAttention, self).__init__()
self.channels = channels
num_heads = 4
self.mha = nn.MultiheadAttention(channels, num_heads, batch_first=True)
self.ln = nn.LayerNorm([channels])
self.ff_self = nn.Sequential(
nn.LayerNorm([channels]),
nn.Linear(channels, channels),
nn.GELU(),
nn.Linear(channels, channels)
)
def forward(self, x):
size = x.shape[3]
print("SIZE", size)
print("CHANNELS", self.channels)
x = x.view(-1, self.channels, size * size).swapaxes(1, 2)
print()
x_ln = self.ln(x)
attention_value, _ = self.mha(x_ln, x_ln, x_ln)
attention_value = attention_value + x
attention_value = self.ff_self(attention_value) + attention_value
return attention_value.swapaxes(2, 1).view(-1, self.channels, size, size)
class SelfAttention2(nn.Module):
def __init__(self, channels):
super(SelfAttention2, self).__init__()
self.query = nn.Conv2d(channels, channels // 8, kernel_size=1)
self.key = nn.Conv2d(channels, channels // 8, kernel_size=1)
self.value = nn.Conv2d(channels, channels, kernel_size=1)
self.softmax = nn.Softmax(dim=-1)
def forward(self, x):
N, C, H, W = x.size()
query = self.query(x).view(N, -1, W*H).permute(0, 2, 1) # (N, C, H*W)
key = self.key(x).view(N, -1, W*H) # (N, C, H*W)
energy = torch.bmm(query, key) # (N, H*W, H*W)
attention = self.softmax(energy)
value = self.value(x).view(N, -1, W*H) # (N, C, H*W)
out = torch.bmm(value, attention.permute(0, 2, 1))
out = out.view(N, C, H, W)
return out
if __name__ == '__main__':
x = torch.randn((1, 64, 256, 256))
sa1 = SelfAttention(64)
sa1(x)
sa2 = SelfAttention2(64)
sa2(x)
Neither module worked for trying to allocate 16G VRAM. (With this one module take up 16G I cannot even run the whole model in 3090)
And I am told explictly that the method itself, i.e. "Add attention to ProGAN or StyleGAN" will work and has been done.
So, is my understanding of the idea is false or the implementation has flaw?
Also, I have train the model to 32x32 and it worked ok.
Suggentions upon my understanding or my coding.

Is it possible to combine 2 neural networks?

I have a NET like (exemple from here)
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
# 1 input image channel, 6 output channels, 5x5 square convolution
# kernel
self.conv1 = nn.Conv2d(1, 6, 5)
self.conv2 = nn.Conv2d(6, 16, 5)
# an affine operation: y = Wx + b
self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5*5 from image dimension
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)
def forward(self, x):
# Max pooling over a (2, 2) window
x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
# If the size is a square, you can specify with a single number
x = F.max_pool2d(F.relu(self.conv2(x)), 2)
x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
net = Net()
and another net like (exemple from here)
class binaryClassification(nn.Module):
def __init__(self):
super(binaryClassification, self).__init__()
# Number of input features is 12.
self.layer_1 = nn.Linear(12, 64)
self.layer_2 = nn.Linear(64, 64)
self.layer_out = nn.Linear(64, 1)
self.relu = nn.ReLU()
self.dropout = nn.Dropout(p=0.1)
self.batchnorm1 = nn.BatchNorm1d(64)
self.batchnorm2 = nn.BatchNorm1d(64)
def forward(self, inputs):
x = self.relu(self.layer_1(inputs))
x = self.batchnorm1(x)
x = self.relu(self.layer_2(x))
x = self.batchnorm2(x)
x = self.dropout(x)
x = self.layer_out(x)
return x
I'd like to change, for exemple "self.fc2 = nn.Linear(120, 84)" in order to have 121 inputs, where the 121th is the x (output) of the binaryClassification network.
The idea is: I'd like to use in the same time, CNN network, and not-CNN network, to train both, with influence one on the other.
Is it possible? How can I perform that? (Keras or Pytorch examples are both ok).
Or maybe the idea is crazy and there is easier way to mix data and image as input of an unique network?
It is a perfectly valid approach, you are taking two different input data sources, processing them and combining the result to solve a common goal (in this case it seems like a 10-class image classification). You can define the input to your Net network to be a tuple of the image you need for the original Net and the features 12-value vector for your BinaryClassificator. An example code would be:
import torch
import torch.nn as nn
class binaryClassification(nn.Module):
#> ...same as above
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
# 1 input image channel, 6 output channels, 5x5 square convolution
# kernel
self.conv1 = nn.Conv2d(1, 6, 5)
self.conv2 = nn.Conv2d(6, 16, 5)
# an affine operation: y = Wx + b
self.fc1 = nn.Linear(16 * 5 * 5, 120) # 5*5 from image dimension
self.binClas = binaryClassification()
self.fc2 = nn.Linear(121, 84)
self.fc3 = nn.Linear(84, 10)
def forward(self, inputs):
x, features = inputs # split tuple
# Max pooling over a (2, 2) window
x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
# If the size is a square, you can specify with a single number
x = F.max_pool2d(F.relu(self.conv2(x)), 2)
x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
# Concatenate with BinaryClassification
x = torch.cat([F.relu(self.fc1(x)), self.binClas(features)])
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
net = Net()
However! Be careful about training them together, it is hard to balance both branches in the network to make them learn. I would recommend you to train them separately for a while before plugging them together (generally speaking, the hyperparameters of one part of the network will probably not be optimal for the other). To do this, you could freeze one part of the network while training the other, and viceversa. (check this link to see how to freeze parts of a torch nn)
The most naive way to do it would be to instantiate both models, sum the two predictions and compute the loss with it. This will backpropagate through both models:
net1 = Net1()
net2 = Net2()
bce = torch.nn.BCEWithLogitsLoss()
params = list(net1.parameters()) + list(net2.parameters())
optimizer = optim.SGD(params)
for (x, ground_truth) in enumerate(your_data_loader):
optimizer.zero_grad()
prediction = net1(x) + net2(x) # the 2 models must output tensors of same shape
loss = bce(prediction, ground_truth)
train_loss.backward()
optimizer.step()
You could also e.g.
implement the layers of Net1 and Net2 in a single model
train Net1 and Net2 separately and ensemble them later

Neural Network cannot overfit even one sample

I am using neural network for a regression task.
My input is an gray image whose size is 100x70x1.
The gray area has a unique value 60.
The input will go through a preprocessing layer, which multiply 1./255 on every pixel value.
My output is just three double number: [0.87077969, 0.98989031, 0.98888382]
I used ResNet152 model as shown below:
class Bottleneck(tf.keras.Model):
expansion = 4
def __init__(self, in_channels, out_channels, strides=1):
super(Bottleneck, self).__init__()
self.conv1 = tf.keras.layers.Conv2D(out_channels, 1, 1, use_bias=False)
self.bn1 = tf.keras.layers.BatchNormalization()
self.conv2 = tf.keras.layers.Conv2D(out_channels, 3, strides, padding="same", use_bias=False)
self.bn2 = tf.keras.layers.BatchNormalization()
self.conv3 = tf.keras.layers.Conv2D(out_channels*self.expansion, 1, 1, use_bias=False)
self.bn3 = tf.keras.layers.BatchNormalization()
if strides != 1 or in_channels != self.expansion * out_channels:
self.shortcut = tf.keras.Sequential([
tf.keras.layers.Conv2D(self.expansion*out_channels, kernel_size=1,
strides=strides, use_bias=False),
tf.keras.layers.BatchNormalization()]
)
else:
self.shortcut = lambda x,_: x
def call(self, x, training=False):
out = tf.nn.elu(self.bn1(self.conv1(x), training))
out = tf.nn.elu(self.bn2(self.conv2(out), training))
out = self.bn3(self.conv3(out), training)
out += self.shortcut(x, training)
return tf.nn.elu(out)
class ResNet(tf.keras.Model):
def __init__(self, block, num_blocks):
super(ResNet, self).__init__()
self.in_channels = 64
self.conv1 = tf.keras.layers.Conv2D(64, 7, 2, padding="same", use_bias=False) # 60x60
self.bn1 = tf.keras.layers.BatchNormalization()
self.pool1 = tf.keras.layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same') # 30x30
self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
self.avg_pool2d = tf.keras.layers.GlobalAveragePooling2D()
self.flatten = tf.keras.layers.Flatten()
def _make_layer(self, block, out_channels, num_blocks, stride):
strides = [stride] + [1] * (num_blocks - 1)
layers = []
for stride in strides:
layers.append(block(self.in_channels, out_channels, stride))
self.in_channels = out_channels * block.expansion
return tf.keras.Sequential(layers)
def call(self, x, training=False):
out = self.pool1(tf.nn.elu(self.bn1(self.conv1(x), training)))
out = self.layer1(out, training=training)
out = self.layer2(out, training=training)
out = self.layer3(out, training=training)
out = self.layer4(out, training=training)
# For classification
out = self.flatten(out)
# out = tf.keras.layers.Reshape((out.shape[-1],))(out)
#out = self.linear(out)
return out
def model(self):
x = tf.keras.layers.Input(shape=(100,70,1))
return tf.keras.Model(inputs=[x], outputs=self.call(x))
def ResNet152():
return ResNet(Bottleneck, [3,8,36,3])
I used elu as activation function and changed the GlobalAveragePooling layer into flatten layer at the end of ResNet.
Before output I stack two Dense layer(2048 units and 3 units) on top of the ResNet model.
For training I used adam optimizer and inital learning rate is 1e-4, which will decreasing by factor 10 when the val_loss not decreasing for 3 epoch.
The loss is just mse error.
After early stopping while learning rate is 1e-8, the mse loss is still very high:8.6225
The prediction is [2.92318237, 5.53124916, 3.00686643] which is far away from the ground truth: [0.87077969, 0.98989031, 0.98888382]
I don't know why such a deep network cannot overfit such a sample.
Is this the reason that my input image has too few information? Could someone help me?

DenseNet, Sizes of tensors must match

would you know how I can adapt this code so that sizes of tensors must match because I have this error: x = torch.cat([x1,x2],1) RuntimeError: Sizes of tensors must match except in dimension 0. Got 32 and 1 (The offending index is 0).
My images are size 416x416.
Thank you in advance for your help,
num_classes = 20
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.inc = models.inception_v3(pretrained=True)
self.inc.aux_logits = False
for child in list(self.inc.children())[:-5]:
for param in child.parameters():
param.requires_grad = False
self.inc.fc = nn.Sequential()
self.dens121 = models.densenet121(pretrained=True)
for child in list(self.dens121.children())[:-6]:
for param in child.parameters():
param.requires_grad = False
self.dens121 = nn.Sequential(*list(self.dens121.children())[:-1])
self.SiLU = nn.SiLU()
self.linear = nn.Linear(4096, num_classes)
self.dropout = nn.Dropout(0.2)
def forward(self, x):
x1 = self.SiLU(self.dens121(x))
x1 = x1.view(-1, 2048)
x2 = self.inc(x).view(-1, 2048)
x = torch.cat([x1,x2],1)
return self.linear(self.dropout(x))
The shapes of the two tensors are very different and that's why the torch.cat() fails. I tried to run your code with the following example:
def forward(self, x):
x1 = self.SiLU(self.dens121(x))
x1 = x1.view(-1, 2048)
x2 = self.inc(x).view(-1, 2048)
print(x1.shape, x2.shape)
x = torch.cat([x1,x2], dim=1)
return self.linear(self.dropout(x))
Here's the driver code
inputs = torch.randn(2, 3, 416, 416)
model = Net()
outputs = model(inputs)
The shapes of x1 of x2 are as follows:
torch.Size([169, 2048]) torch.Size([2, 2048])
Either your DenseNet should output the same shape as the output of Inceptionv3 or vice-versa. The output from DenseNet is of shape torch.Size([2, 1024, 13, 13]) and the output from Inceptionv3 is of shape torch.Size([2, 2048]).
EDIT
Add this line to the init method:
self.conv_reshape= nn.Conv2d(1024, 2048, kernel_size=13, stride=1)
Add these lines to your forward():
x1 = self.SiLU(self.dens121(x))
out = self.conv_reshape(x1)
x1 = out.view(-1, out.size(1))
x2 = self.inc(x).view(-1, 2048)

pytorch cnn Test result is sticked

TRAINING CODE
if os.path.isfile(PATH):
print("checkpoint training '{}' ...".format(PATH))
checkpoint = torch.load(PATH)
start_epoch = checkpoint['epoch']
start_i = checkpoint['i']
net.load_state_dict(checkpoint['state_dict'])
print("=> loaded checkpoint '{}' (trained for {} epochs, {} i)".format(PATH, checkpoint['epoch'],
checkpoint['i']))
else:
print('new training')
for epoch in range(num_epochs): # loop over the dataset multiple times
running_loss = 0.0
for i in range(len(train_folder_list2)):
# get the inputs; data is a list of [inputs, labels]
# net.train()
inputs, labels = train_input[i], train_list[i]
inputs = torch.as_tensor(inputs).cuda()
inputs = inputs.transpose(1, 3)
labels = torch.as_tensor(labels).cuda()
# forward + backward + optimize
outputs = net(inputs)
loss = criterion(outputs, labels)
# zero the parameter gradients
optimizer.zero_grad()
loss.backward()
optimizer.step()
running_loss += loss.item()
if i % 100 == 1:
save_checkpoint({
'epoch': start_epoch + epoch + 1,
'i': start_i + i + 1,
'state_dict': net.state_dict(),
})
TEST CODE
PATH = './checkpoint.pth'
model = Net().cuda()
if os.path.isfile(PATH):
print('checkpoint check!')
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['state_dict'])
model.eval()
for k in range(len(train_folder_list2)):
inputs = train_input[k]
inputs = torch.as_tensor(inputs).cuda()
inputs = inputs.transpose(1, 3)
outputs = model(inputs)
result = outputs.cpu().detach().numpy()
This is the code to find the edges of the image.
If I run the training code, train it, and test it with the test code, it doesn't seem to find any edges in the image. The edges are on the same side, whatever image i put.
**ADD
CNN CODE
In addition, we added cnn code to give you information. Data input was put in the list separately from the image and label.
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(3, 6, 5)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.fc1 = nn.Linear(293904, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 18)
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = x.view(-1, 293904)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
x = x.view(18)
return x