I'm building a DQN that takes a 24x10 array of 0,1,2 (representing a tetris board) and a int 0-5 (representing the current playable tetramino)
I flatten my array and convert it to a Tensor before inputting it to my convolutional layers but this is the error I keep on getting
Expected 4-dimensional input for 4-dimensional weight [16, 3, 240, 240], but got 1-dimensional input of size [240] instead
I've tried reducing the Kernel size and stride as well as not flattening the array but neither has worked.
For reference this is my DQN
class DQN(nn.Module):
def __init__(self):
super(DQN, self).__init__()
self.conv1_board = nn.Conv2d(3, 16, kernel_size=240, stride=1) #3 input channels for 0,1,2 . kernel_size 240 for length of tensor
self.conv2_board = nn.Conv2d(16, 32, kernel_size=240, stride=1)
self.conv3_board = nn.Conv2d(32, 6, kernel_size=240, stride=1)
self.conv1_piece = nn.Conv2d(6, 16, kernel_size=240, stride=1) #in channels 6 as 6 possible values
self.conv2_piece = nn.Conv2d(16, 6, kernel_size=240, stride=1)
self.fc1 = nn.Linear(1, 32)
self.fc2 = nn.Linear(32, 6)
self.flatten = nn.Flatten()
def flt_totns(self, arr):
flt = []
for l in arr:
flt.extend(l)
return torch.FloatTensor(flt)
def forward(self, states): #inputs to conv layers should be Tensors not list. convert list => tensor
board, piece = states
board = self.flt_totns(board)
embed_board = flatten(self.conv3_board(self.conv2_board(self.conv1_board(board))))
embed_piece = flatten(self.conv2_piece(self.conv1_piece(piece)))
embed_joined = torch.cat([embed_board, embed_piece])
return self.fc2(self.fc1(embed_joined))
I'm very new to CNNs in pytorch so I'm sure a lot of my reasoning is faulty. For example I'm still not sure how Kernel size exactly relates to the shape of your input, or if input channels still applies to array inputs. Buy any help would be greatly appreciated.
Related
I am using neural network for a regression task.
My input is an gray image whose size is 100x70x1.
The gray area has a unique value 60.
The input will go through a preprocessing layer, which multiply 1./255 on every pixel value.
My output is just three double number: [0.87077969, 0.98989031, 0.98888382]
I used ResNet152 model as shown below:
class Bottleneck(tf.keras.Model):
expansion = 4
def __init__(self, in_channels, out_channels, strides=1):
super(Bottleneck, self).__init__()
self.conv1 = tf.keras.layers.Conv2D(out_channels, 1, 1, use_bias=False)
self.bn1 = tf.keras.layers.BatchNormalization()
self.conv2 = tf.keras.layers.Conv2D(out_channels, 3, strides, padding="same", use_bias=False)
self.bn2 = tf.keras.layers.BatchNormalization()
self.conv3 = tf.keras.layers.Conv2D(out_channels*self.expansion, 1, 1, use_bias=False)
self.bn3 = tf.keras.layers.BatchNormalization()
if strides != 1 or in_channels != self.expansion * out_channels:
self.shortcut = tf.keras.Sequential([
tf.keras.layers.Conv2D(self.expansion*out_channels, kernel_size=1,
strides=strides, use_bias=False),
tf.keras.layers.BatchNormalization()]
)
else:
self.shortcut = lambda x,_: x
def call(self, x, training=False):
out = tf.nn.elu(self.bn1(self.conv1(x), training))
out = tf.nn.elu(self.bn2(self.conv2(out), training))
out = self.bn3(self.conv3(out), training)
out += self.shortcut(x, training)
return tf.nn.elu(out)
class ResNet(tf.keras.Model):
def __init__(self, block, num_blocks):
super(ResNet, self).__init__()
self.in_channels = 64
self.conv1 = tf.keras.layers.Conv2D(64, 7, 2, padding="same", use_bias=False) # 60x60
self.bn1 = tf.keras.layers.BatchNormalization()
self.pool1 = tf.keras.layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same') # 30x30
self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
self.avg_pool2d = tf.keras.layers.GlobalAveragePooling2D()
self.flatten = tf.keras.layers.Flatten()
def _make_layer(self, block, out_channels, num_blocks, stride):
strides = [stride] + [1] * (num_blocks - 1)
layers = []
for stride in strides:
layers.append(block(self.in_channels, out_channels, stride))
self.in_channels = out_channels * block.expansion
return tf.keras.Sequential(layers)
def call(self, x, training=False):
out = self.pool1(tf.nn.elu(self.bn1(self.conv1(x), training)))
out = self.layer1(out, training=training)
out = self.layer2(out, training=training)
out = self.layer3(out, training=training)
out = self.layer4(out, training=training)
# For classification
out = self.flatten(out)
# out = tf.keras.layers.Reshape((out.shape[-1],))(out)
#out = self.linear(out)
return out
def model(self):
x = tf.keras.layers.Input(shape=(100,70,1))
return tf.keras.Model(inputs=[x], outputs=self.call(x))
def ResNet152():
return ResNet(Bottleneck, [3,8,36,3])
I used elu as activation function and changed the GlobalAveragePooling layer into flatten layer at the end of ResNet.
Before output I stack two Dense layer(2048 units and 3 units) on top of the ResNet model.
For training I used adam optimizer and inital learning rate is 1e-4, which will decreasing by factor 10 when the val_loss not decreasing for 3 epoch.
The loss is just mse error.
After early stopping while learning rate is 1e-8, the mse loss is still very high:8.6225
The prediction is [2.92318237, 5.53124916, 3.00686643] which is far away from the ground truth: [0.87077969, 0.98989031, 0.98888382]
I don't know why such a deep network cannot overfit such a sample.
Is this the reason that my input image has too few information? Could someone help me?
Trying to implement a simple multi-label image classifier using Pytorch Lightning. Here's the model definition:
import torch
from torch import nn
# creates network class
class Net(pl.LightningModule):
def __init__(self):
super().__init__()
# defines conv layers
self.conv_layer_b1 = nn.Sequential(
nn.Conv2d(in_channels=3, out_channels=32,
kernel_size=3, padding=1),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2, stride=2),
nn.Flatten(),
)
# passes dummy x matrix to find the input size of the fc layer
x = torch.randn(1, 3, 800, 600)
self._to_linear = None
self.forward(x)
# defines fc layer
self.fc_layer = nn.Sequential(
nn.Linear(in_features=self._to_linear,
out_features=256),
nn.ReLU(),
nn.Linear(256, 5),
)
# defines accuracy metric
self.accuracy = pl.metrics.Accuracy()
self.confusion_matrix = pl.metrics.ConfusionMatrix(num_classes=5)
def forward(self, x):
x = self.conv_layer_b1(x)
if self._to_linear is None:
# does not run fc layer if input size is not determined yet
self._to_linear = x.shape[1]
else:
x = self.fc_layer(x)
return x
def cross_entropy_loss(self, logits, y):
criterion = nn.CrossEntropyLoss()
return criterion(logits, y)
def training_step(self, train_batch, batch_idx):
x, y = train_batch
logits = self.forward(x)
train_loss = self.cross_entropy_loss(logits, y)
train_acc = self.accuracy(logits, y)
train_cm = self.confusion_matrix(logits, y)
self.log('train_loss', train_loss)
self.log('train_acc', train_acc)
self.log('train_cm', train_cm)
return train_loss
def validation_step(self, val_batch, batch_idx):
x, y = val_batch
logits = self.forward(x)
val_loss = self.cross_entropy_loss(logits, y)
val_acc = self.accuracy(logits, y)
return {'val_loss': val_loss, 'val_acc': val_acc}
def validation_epoch_end(self, outputs):
avg_val_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
avg_val_acc = torch.stack([x['val_acc'] for x in outputs]).mean()
self.log("val_loss", avg_val_loss)
self.log("val_acc", avg_val_acc)
def configure_optimizers(self):
optimizer = torch.optim.Adam(self.parameters(), lr=0.0008)
return optimizer
The issue is probably not the machine since I'm using a cloud instance with 60 GBs of RAM and 12 GBs of VRAM. Whenever I run this model even for a single epoch, I get an out of memory error. On the CPU it looks like this:
RuntimeError: [enforce fail at CPUAllocator.cpp:64] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 1966080000 bytes. Error code 12 (Cannot allocate memory)
and on the GPU it looks like this:
RuntimeError: CUDA out of memory. Tried to allocate 7.32 GiB (GPU 0; 11.17 GiB total capacity; 4.00 KiB already allocated; 2.56 GiB free; 2.00 MiB reserved in total by PyTorch)
Clearing the cache and reducing the batch size did not work. I'm a novice so clearly something here is exploding but I can't tell what. Any help would be appreciated.
Thank you!
Indeed, it's not a machine issue; the model itself is simply unreasonably big. Typically, if you take a look at common CNN models, the fc layers occur near the end, after the inputs already pass through quite a few convolutional blocks (and have their spatial resolutions reduced).
Assuming inputs are of shape (batch, 3, 800, 600), while passing the conv_layer_b1 layer, the feature map shape would be (batch, 32, 400, 300) after the MaxPool operation. After flattening, the inputs become (batch, 32 * 400 * 300), ie, (batch, 3840000).
The immediately following fc_layer thus contains nn.Linear(3840000, 256), which is simply absurd. This single linear layer contains ~983 million trainable parameters! For reference, popular image classification CNNs roughly have 3 to 30 million parameters on average, with larger variants reaching 60 to 80 million. Few ever really cross the 100 million mark.
You can count your model params with this:
def count_params(model):
return sum(map(lambda p: p.data.numel(), model.parameters()))
My advice: 800 x 600 is really a massive input size. Reduce it to something like 400 x 300, if possible. Furthermore, add several convolutional blocks similar to conv_layer_b1, before the FC layer. For example:
def get_conv_block(C_in, C_out):
return nn.Sequential(
nn.Conv2d(in_channels=C_in, out_channels=C_out,
kernel_size=3, padding=1),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2, stride=2)
)
class Net(pl.LightningModule):
def __init__(self):
super().__init__()
# defines conv layers
self.conv_layer_b1 = get_conv_block(3, 16)
self.conv_layer_b2 = get_conv_block(16, 32)
self.conv_layer_b3 = get_conv_block(32, 64)
self.conv_layer_b4 = get_conv_block(64, 128)
self.conv_layer_b5 = get_conv_block(128, 256)
# passes dummy x matrix to find the input size of the fc layer
x = torch.randn(1, 3, 800, 600)
self._to_linear = None
self.forward(x)
# defines fc layer
self.fc_layer = nn.Sequential(
nn.Flatten(),
nn.Linear(in_features=self._to_linear,
out_features=256),
nn.ReLU(),
nn.Linear(256, 5)
)
# defines accuracy metric
self.accuracy = pl.metrics.Accuracy()
self.confusion_matrix = pl.metrics.ConfusionMatrix(num_classes=5)
def forward(self, x):
x = self.conv_layer_b1(x)
x = self.conv_layer_b2(x)
x = self.conv_layer_b3(x)
x = self.conv_layer_b4(x)
x = self.conv_layer_b5(x)
if self._to_linear is None:
# does not run fc layer if input size is not determined yet
self._to_linear = nn.Flatten()(x).shape[1]
else:
x = self.fc_layer(x)
return x
Here, because more conv-relu-pool layers are applied, the input is reduced to a feature map of a much smaller shape, (batch, 256, 25, 18), and the overall number of trainable parameters would be reduced to about ~30 million parameters.
is there any way I can concatenate branches of different dimensions of a small inception(googlenet)?
For example, at a 32 x 32 x 3 image (torch.Size([1, 3, 32, 32])), it will pass through an inception module with the following branches:
a convolution with (32 channels, 1 x 1 filters)
another convolution with (32 channels, 3 x3 filters)
merge (concatenate along the channel dimension)
Inception Module
The issue however is that the torch sizes as a result of the two convolutions are different.
(32 channels, 1 x 1 filters) -> [1, 32, 30, 30]
(32 channels, 3 x3 filters) -> [1, 32, 28, 28]
How can I concatenate the two branches?
Should I add padding? I tried ZeroPad2d. It’s not working.
For your reference:
class Inception(nn.Module):
def __init__(self, in_channel, ch1, ch3):
super(Inception, self).__init__()
self.branch1 = nn.Sequential(
ConvBlock(in_channel, ch1, kernel_size = 1,stride=1, padding=0)
)
self.branch2 = nn.Sequential(
ConvBlock(in_channel, ch3, kernel_size = 3,stride=1, padding=0)
)
def forward(self, x):
branch1 = self.branch1(x)
branch2 = self.branch2(x)
return torch.cat([branch1, branch2], 1) *(error here)*
You need to match your kernel size and padding.
For kernel_size=1 no padding is needed, but for kernel_size=3 you need padding to be 1:
self.branch2 = nn.Sequential(
ConvBlock(in_channel, ch3, kernel_size=3,stride=1, padding=1)
)
See this nice tutorial for more details.
I am trying to implement the dot product and general implementation of calculating similarity scores from encoder and decoder output and hidden states respectively in keras.
I have got the idea to do the product of tf.keras.layers.dot(encoder_output,decoder_state) for calculating product score but there is error in multiplication of these two values.
class Attention(tf.keras.Model):
def __init__(self,units):
super().__init__()
self.units = units
def call(self, decoder_state, encoder_output):
score = tf.keras.layers.dot([encoder_output,decoder_state], axes=[2, 1])
attention_weights = tf.nn.softmax(score, axis=1)
context_vector = attention_weights * encoder_output
context_vector = tf.reduce_sum(context_vector, axis=1)
return context_vector, attention_weights
batch_size = 16
units = 32
input_length = 20
decoder_state = tf.random.uniform(shape=[batch_size, units])
encoder_output = tf.random.uniform(shape=[batch_size, input_length, units])
attention = Attention(units)
context_vector, attention_weights = attention(decoder_state, encoder_output)
I am getting the following error:
/usr/local/lib/python3.6/dist-packages/six.py in raise_from(value, from_value)
InvalidArgumentError: Incompatible shapes: [16,20] vs. [16,20,32] [Op:Mul]
It is a very simple fix but as I am new to this I am not able to get the exact method needed to be called here.
I have tried reshaping the values of encoder_output but still this does not work.
Request to help me fix this.
I am just putting #Ayush Srivastava's comment as a response so that the post gets an answer.
Basically, the error occurs because you are trying to multiply 2 tensors (namely attention_weights and encoder_output) with different shapes, so you need to reshape the decoder_state.
Here is the full answer:
class Attention(tf.keras.Model):
def __init__(self,units):
super().__init__()
self.units = units
def call(self, decoder_state, encoder_output):
decoder_state = tf.keras.layers.Reshape((decoder_state.shape[1], 1))(decoder_state)
score = tf.keras.layers.dot([encoder_output, decoder_state],[2, 1])
attention_weights = tf.nn.softmax(score, axis=1)
context_vector = attention_weights * encoder_output
context_vector = tf.reduce_sum(context_vector, axis=1)
return context_vector, attention_weights
Shapes:
decoder_state before reshape: (16, 32)
decoder_state after reshape: (16, 32, 1)
enc_output: (16, 20, 32)
score: (16, 20, 1)
attention_weights: (16, 20, 1)
context_vector before sum: (16, 20, 32)
I have a csv file with 339732 rows and two columns :
the first being 29 feature values, i.e. X
the second being a binary label value, i.e. Y
dataframe = pd.read_csv("features.csv", header = None)
dataset = dataframe.values
X = dataset[:, 0:29].astype(float)
Y = dataset[:,29]
X_train, y_train, X_test, y_test = train_test_split(X,Y, random_state = 42)
I am trying to train it on a 1D convolutional layer:
model = Sequential()
model.add(Conv1D(64, 3, activation='relu', input_shape=(X_train.shape[0], 29)))
model.add(Conv1D(64, 3, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Conv1D(128, 3, activation='relu'))
model.add(Conv1D(128, 3, activation='relu'))
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
optimizer='rmsprop',
metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=16, epochs=2)
score = model.evaluate(X_test, y_test, batch_size=16)
Since, the Conv1D layer expects a 3-D input, I transformed my input as follows:
X_train = np.reshape(X_train, (1, X_train.shape[0], X_train.shape[1]))
X_test = np.reshape(X_test, (1, X_test.shape[0], X_test.shape[1]))
However, this still throws error:
ValueError: Negative dimension size caused by subtracting 3 from 1 for 'conv1d_1/convolution/Conv2D' (op: 'Conv2D') with input shapes: [?,1,1,29], [1,3,29,64].
Is there any way to feed my input correctly?
As far as I know 1D Convolution layer accepts inputs of the form Batchsize x Width x Channels. You are reshaping with
X_train = np.reshape(X_train, (1, X_train.shape[0], X_train.shape[1]))
But X_train.shape[0] is your batchsize I guess.I think the problem is somewhere here. Can you please tell what is the shape of X_train before reshape?
You have to think about if your data have some progression relation between the 339732 entries or the 29 features, this means if the order matters. If not I don't think that CNN is suitable for this case.
If the 29 features "indicates the progression of something":
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1],1))
If the 29 features are independent, then is like the channels on the image, but doesn't make sense convolute with only 1.
X_train = X_train.reshape((X_train.shape[0],1, X_train.shape[1]))
If you want to pick the 339732 entries like in blocks where the order matters (clip the 339732 or add zero padding in order to be divisible by timesteps):
X_train = X_train.reshape((int(X_train.shape[0]/timesteps),timesteps, X_train.shape[1],1))