I am using the Swin Transformer to build a facial expression recognition model. The complete code, by Rishit Dagli, is available at 'https://keras.io/examples/vision/swin_transformers/'.
In the SwinTransformer class, the author uses two Dense layers, and I want to replace them with ResNet50. I learned that the input to ResNet50 needs to have shape (None, None, 3). But when the images of shape (96, 96, 3) flow through the initial layers - 'PatchExtract' and 'PatchEmbedding' - they are converted to shape (576, 64). Then, when that tensor is passed to the SwinTransformer containing ResNet50, it raises an error.
Please suggest how I can embed a ResNet model as a layer inside SwinTransformer.
Swin Transformer class:
class SwinTransformer(layers.Layer):
def __init__(
self,
dim,
num_patch,
num_heads,
window_size=7,
shift_size=0,
num_mlp=1024,
qkv_bias=True,
dropout_rate=0.0,
**kwargs,
):
super(SwinTransformer, self).__init__(**kwargs)
self.dim = dim # number of input dimensions
self.num_patch = num_patch # number of embedded patches
self.num_heads = num_heads # number of attention heads
self.window_size = window_size # size of window
self.shift_size = shift_size # size of window shift
self.num_mlp = num_mlp # number of MLP nodes
self.norm1 = layers.LayerNormalization(epsilon=1e-5)
self.attn = WindowAttention(
dim,
window_size=(self.window_size, self.window_size),
num_heads=num_heads,
qkv_bias=qkv_bias,
dropout_rate=dropout_rate,
)
self.drop_path = DropPath(dropout_rate)
self.norm2 = layers.LayerNormalization(epsilon=1e-5)
self.mlp = keras.Sequential(
[
# I am trying to add pretrained ResNet50 here
layers.Dense(num_mlp),
layers.Activation(keras.activations.gelu),
layers.Dropout(dropout_rate),
layers.Dense(dim),
layers.Dropout(dropout_rate),
]
)
if min(self.num_patch) < self.window_size:
self.shift_size = 0
self.window_size = min(self.num_patch)
def build(self, input_shape):
if self.shift_size == 0:
self.attn_mask = None
else:
height, width = self.num_patch
h_slices = (
slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None),
)
w_slices = (
slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None),
)
mask_array = np.zeros((1, height, width, 1))
count = 0
for h in h_slices:
for w in w_slices:
mask_array[:, h, w, :] = count
count += 1
mask_array = tf.convert_to_tensor(mask_array)
# mask array to windows
mask_windows = window_partition(mask_array, self.window_size)
mask_windows = tf.reshape(
mask_windows, shape=[-1, self.window_size * self.window_size]
)
attn_mask = tf.expand_dims(mask_windows, axis=1) - tf.expand_dims(
mask_windows, axis=2
)
attn_mask = tf.where(attn_mask != 0, -100.0, attn_mask)
attn_mask = tf.where(attn_mask == 0, 0.0, attn_mask)
self.attn_mask = tf.Variable(initial_value=attn_mask, trainable=False)
def call(self, x):
height, width = self.num_patch
_, num_patches_before, channels = x.shape
x_skip = x
x = self.norm1(x)
x = tf.reshape(x, shape=(-1, height, width, channels))
if self.shift_size > 0:
shifted_x = tf.roll(
x, shift=[-self.shift_size, -self.shift_size], axis=[1, 2]
)
else:
shifted_x = x
x_windows = window_partition(shifted_x, self.window_size)
x_windows = tf.reshape(
x_windows, shape=(-1, self.window_size * self.window_size, channels)
)
attn_windows = self.attn(x_windows, mask=self.attn_mask)
attn_windows = tf.reshape(
attn_windows, shape=(-1, self.window_size, self.window_size, channels)
)
shifted_x = window_reverse(
attn_windows, self.window_size, height, width, channels
)
if self.shift_size > 0:
x = tf.roll(
shifted_x, shift=[self.shift_size, self.shift_size], axis=[1, 2]
)
else:
x = shifted_x
x = tf.reshape(x, shape=(-1, height * width, channels))
x = self.drop_path(x)
x = x_skip + x
x_skip = x
x = self.norm2(x)
x = self.mlp(x)
x = self.drop_path(x)
x = x_skip + x
return x
Model:
input = layers.Input(input_shape)
x = layers.RandomCrop(image_dimension, image_dimension)(input)
x = layers.RandomFlip("horizontal")(x)
x = PatchExtract(patch_size)(x)
x = PatchEmbedding(num_patch_x * num_patch_y, embed_dim)(x)
x = SwinTransformer(
dim=embed_dim,
num_patch=(num_patch_x, num_patch_y),
num_heads=num_heads,
window_size=window_size,
shift_size=0,
num_mlp=num_mlp,
qkv_bias=qkv_bias,
dropout_rate=dropout_rate,
)(x)
x = SwinTransformer(
dim=embed_dim,
num_patch=(num_patch_x, num_patch_y),
num_heads=num_heads,
window_size=window_size,
shift_size=shift_size,
num_mlp=num_mlp,
qkv_bias=qkv_bias,
dropout_rate=dropout_rate,
)(x)
x = PatchMerging((num_patch_x, num_patch_y), embed_dim=embed_dim)(x)
x = layers.GlobalAveragePooling1D()(x)
output = layers.Dense(num_classes, activation="softmax")(x)
I added the ResNet50 layer and got the error below:
ValueError: Exception encountered when calling layer "swin_transformer_2" (type SwinTransformer).
in user code:
File "<ipython-input-12-4a18cac0b25c>", line 121, in call *
x = self.mlp(x)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 67, in error_handler **
raise e.with_traceback(filtered_tb) from None
File "/usr/local/lib/python3.7/dist-packages/keras/engine/input_spec.py", line 214, in assert_input_compatibility
raise ValueError(f'Input {input_index} of layer "{layer_name}" '
ValueError: Exception encountered when calling layer "resnet50" (type Functional).
Input 0 of layer "conv1_pad" is incompatible with the layer: expected ndim=4, found ndim=3. Full shape received: (None, 576, 64)
Call arguments received by layer "resnet50" (type Functional):
• inputs=tf.Tensor(shape=(None, 576, 64), dtype=float32)
• training=False
• mask=None
Call arguments received by layer "swin_transformer_2" (type SwinTransformer):
• x=tf.Tensor(shape=(None, 576, 64), dtype=float32)
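One way to bridge the shape gap is to turn the token sequence back into an image-like tensor before it reaches ResNet50. Below is a minimal sketch (my own assumption, not part of the keras.io example) that assumes num_patch_x = num_patch_y = 24 (since 24 * 24 = 576) and embed_dim = 64; the 1x1 Conv2D channel projection and the Resizing step are additions of mine, because ResNet50 expects 3 channels and at least a 32x32 spatial size:

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

tokens = layers.Input(shape=(576, 64))        # output shape of PatchEmbedding
x = layers.Reshape((24, 24, 64))(tokens)      # restore the spatial patch grid
x = layers.Conv2D(3, kernel_size=1)(x)        # project 64 channels down to 3
x = layers.Resizing(96, 96)(x)                # ResNet50 needs >= 32x32 inputs
resnet = keras.applications.ResNet50(
    include_top=False, weights=None, input_shape=(96, 96, 3), pooling="avg"
)
features = resnet(x)                          # (batch, 2048)
bridge = keras.Model(tokens, features)
print(bridge(tf.zeros((2, 576, 64))).shape)   # (2, 2048)

Note that if ResNet50 replaces the two Dense layers inside the SwinTransformer block itself, its output would also have to be reshaped and projected back to (576, 64), because the block adds x_skip to the MLP output.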
Related
I am using a CNN to extract features from temporal data of different lengths. I use pad_sequence to pad the data in each batch. However, since the max length changes from batch to batch, the padded sequence length differs between batches. This creates errors when I flatten the data for the FCN layer (the dimension of the flattened vector changes). I am currently handling this with an 'adaptive avg pooling' layer before the FCN layers. Since this is a global averaging, it fixes the output dimension for the FCN. However, I am not sure if this is the correct thing to do.
Code is:
##pad tensors
def pad_collate(batch):
sequences = [item[0] for item in batch]
lengths = [len(seq) for seq in sequences]
padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)
return padded_sequences, lengths
## Create dataloader
trainData = Sequence(root = path)
trainDataLoader = DataLoader(trainData, batch_size = BATCH_SIZE, collate_fn= pad_collate)
## CNN model
class FeatureExtractor(nn.Module):
def __init__(self, block, layers):
super(FeatureExtractor, self).__init__()
self.inplanes = 6
## 1st CONV layers
self.conv1 = nn.Conv2d(in_channels = 1, out_channels = 6, kernel_size = 3, stride = 2, padding = 4)
self.bn1 = nn.BatchNorm2d(6)
self.relu1 = nn.ReLU()
self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride = 2, padding = 1)
## residual blocks
self.layer0 = self._make_layer(block, 12, layers[0], stride = 1)
self.layer1 = self._make_layer(block, 24, layers[1], stride = 2)
self.avgpool = nn.AdaptiveAvgPool2d((5,5)) ##### MY CURRENT SOLUTION #####
self.fc = nn.Linear(600, 128)
def _make_layer(self, block, planes, blocks, stride):
downsample = None
if stride != 1 or self.inplanes != planes:
downsample = nn.Sequential(nn.Conv2d(self.inplanes, planes, kernel_size=1, stride=stride),
nn.BatchNorm2d(planes))
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
## first conv
x = self.conv1(x)
x = self.bn1(x)
x = self.relu1(x)
x = self.maxpool1(x)
## conv blocks
x = self.layer0(x)
x = self.layer1(x)
##FCN layer
x = self.avgpool(x)
x = torch.flatten(x, 1)
output = self.fc(x)
return output
Any other comments are also welcome (I am self-taught).
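For what it's worth, adaptive pooling before the fully connected layers is exactly what torchvision's ResNets do (they use nn.AdaptiveAvgPool2d((1, 1)) before their fc), so this is a standard way to handle varying input sizes. A quick sanity check with toy shapes (my own example; the 24 channels match self.layer1 above):

import torch
import torch.nn as nn

pool = nn.AdaptiveAvgPool2d((5, 5))           # same layer as in the model above
fc = nn.Linear(24 * 5 * 5, 128)               # 600 -> 128, as in self.fc

for height, width in [(17, 17), (40, 23), (9, 31)]:    # sizes vary by batch
    feats = torch.randn(4, 24, height, width)          # (batch, channels, H, W)
    out = fc(torch.flatten(pool(feats), 1))
    print(out.shape)                                    # torch.Size([4, 128]) every time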
I am trying to apply a Restricted Boltzmann Machine (RBM) to the MNIST dataset. Although I am getting small reconstruction errors during training, when I reconstruct the test set at the same dimensions as the original, the reconstructed images differ considerably from their originals.
class RBM:
def __init__(self, n_input_neurons, n_output_neurons):
self.W = np.random.normal(loc=0.0, scale=1.0, size=(n_input_neurons, n_output_neurons)).astype(np.float32)
self.visible_bias = np.random.rand(1, n_input_neurons) #visible layer bias
self.hidden_bias = np.random.rand(1, n_output_neurons) #hidden layer bias
def __sample(self, probability_distribution):
random_dist = np.random.uniform(0, 1, probability_distribution.shape)
example = probability_distribution - random_dist
example[example > 0] = 1.0
example[example <= 0] = 0.0
return example
def __sigmoid(self, x):
return 1 / (1 + np.e ** (-x))
def __encode(self, X):
probability_distribution = self.__sigmoid(X @ self.W + self.hidden_bias) # probabilities of the hidden units
return probability_distribution, self.__sample(probability_distribution)
def __decode(self, X):
probability_distribution = self.__sigmoid(X @ self.W.T + self.visible_bias) # probabilities of the visible units
return probability_distribution, self.__sample(probability_distribution)
def getReconstructedOutput(self, X):
encode_probability, encode_sample = self.__encode(X)
decode_probability, decode_sample = self.__decode(encode_sample)
return decode_sample
def train(self, X, loss_function, lr=.01, epochs= 500, verbose = False):
epoch = 0
history = []
while epoch < epochs:
h0_prob, h0_state = self.__encode(X)
positive_associations = X.T.dot(h0_prob)
v1_prob, v1_state = self.__decode(h0_state)
h1_prob, h1_state = self.__encode(v1_state)
negative_associations = v1_state.T.dot(h1_prob)
#Updating weights
self.W+= lr * (positive_associations - negative_associations)
self.hidden_bias+= (lr * (h0_prob.sum(axis = 0) - h1_prob.sum(axis = 0)) )
self.visible_bias+= (lr * (X.sum(axis=0) - v1_state.sum(axis=0)) )
epoch+=1
loss = loss_function(v1_state, X) #loss
history.append(loss)
if verbose:
print(f'Epoch {epoch} ==> Loss: {loss}')
return history
What am I doing wrong?
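One thing that stands out (an educated guess, not a verified fix): getReconstructedOutput returns a binary sample of the visible units, so every reconstructed pixel is hard 0/1 Bernoulli noise even when the learned probabilities are good. Returning the probabilities instead usually makes MNIST reconstructions look far closer to the originals. Inside the RBM class the method would become:

def getReconstructedOutput(self, X):
    encode_probability, encode_sample = self.__encode(X)
    decode_probability, decode_sample = self.__decode(encode_sample)
    return decode_probability   # probabilities instead of a hard 0/1 sample

If the inputs are raw pixel intensities in [0, 1], it may also help to binarize them first (e.g. X = (X > 0.5).astype(np.float32)), since the visible units here are Bernoulli.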
I'm trying to build a hybrid binary text classification model using a multi-head attention mechanism with a CNN-LSTM. However, I'm facing an issue when trying to pass the values obtained from the CNN-LSTM to the attention layer.
This is what I tried.
Here's the code defining the multi-head attention layer:
class TransformerBlock(layers.Layer):
def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
super().__init__()
self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
self.ffn = keras.Sequential(
[layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
)
self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = layers.Dropout(rate)
self.dropout2 = layers.Dropout(rate)
def call(self, inputs, training):
attn_output = self.att(inputs, inputs)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(inputs + attn_output)
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
return self.layernorm2(out1 + ffn_output)
This one is for positional encoding and tokenization
class TokenAndPositionEmbedding(layers.Layer):
def __init__(self, maxlen, vocab_size, embed_dim):
super().__init__()
self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)
def call(self, x):
maxlen = tf.shape(x)[-1]
positions = tf.range(start=0, limit=maxlen, delta=1)
positions = self.pos_emb(positions)
x = self.token_emb(x)
return x + positions
Defining the whole architecture:
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
x = Conv1D(256, 3, activation='relu')(x)
x = Dropout(0.5)(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = Dropout(0.5)(x)
x = LSTM(64, return_sequences=True)(x)
x = Dropout(0.5)(x)
x = LSTM(32)(x)
x = TransformerBlock(256, num_heads, ff_dim)(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs=inputs, outputs=outputs)
The error
StagingError Traceback (most recent call last)
<ipython-input-19-ea38c3c52d0e> in <module>
10 x = Dropout(0.5)(x)
11 x = LSTM(32)(x)
---> 12 x = TransformerBlock(256, num_heads, ff_dim)(x)
13 x = layers.GlobalAveragePooling1D()(x)
14 x = layers.Dropout(0.1)(x)
1 frames
/usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/impl/api.py in wrapper(*args, **kwargs)
690 except Exception as e: # pylint:disable=broad-except
691 if hasattr(e, 'ag_error_metadata'):
--> 692 raise e.ag_error_metadata.to_exception(e)
693 else:
694 raise
StagingError: Exception encountered when calling layer "transformer_block" (type TransformerBlock).
in user code:
File "<ipython-input-17-4c5de9a08c11>", line 14, in call *
attn_output = self.att(inputs, inputs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 67, in error_handler **
raise e.with_traceback(filtered_tb) from None
File "/usr/local/lib/python3.8/dist-packages/keras/layers/activation/softmax.py", line 98, in call
return backend.softmax(inputs, axis=self.axis[0])
IndexError: Exception encountered when calling layer "softmax" (type Softmax).
tuple index out of range
Call arguments received by layer "softmax" (type Softmax):
• inputs=tf.Tensor(shape=(None, 8), dtype=float32)
• mask=None
Call arguments received by layer "transformer_block" (type TransformerBlock):
• inputs=tf.Tensor(shape=(None, 32), dtype=float32)
• training=False
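The traceback is about tensor rank rather than the attention weights themselves: MultiHeadAttention expects a 3-D (batch, seq_len, features) input, but LSTM(32) without return_sequences=True emits a 2-D (None, 32) tensor, which is what the softmax trips over. A sketch of the fix under that reading (embed_dim must also match the last LSTM's width so the residual additions inside TransformerBlock line up):

x = LSTM(64, return_sequences=True)(x)
x = Dropout(0.5)(x)
x = LSTM(32, return_sequences=True)(x)          # keep the sequence axis: (None, seq, 32)
x = TransformerBlock(32, num_heads, ff_dim)(x)  # embed_dim = 32 to match
x = layers.GlobalAveragePooling1D()(x)          # pool over time afterwards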
Would you know how I can adapt this code so that the sizes of the tensors match? I am getting this error: x = torch.cat([x1,x2],1) RuntimeError: Sizes of tensors must match except in dimension 0. Got 32 and 1 (The offending index is 0).
My images are of size 416x416.
Thank you in advance for your help.
num_classes = 20
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.inc = models.inception_v3(pretrained=True)
self.inc.aux_logits = False
for child in list(self.inc.children())[:-5]:
for param in child.parameters():
param.requires_grad = False
self.inc.fc = nn.Sequential()
self.dens121 = models.densenet121(pretrained=True)
for child in list(self.dens121.children())[:-6]:
for param in child.parameters():
param.requires_grad = False
self.dens121 = nn.Sequential(*list(self.dens121.children())[:-1])
self.SiLU = nn.SiLU()
self.linear = nn.Linear(4096, num_classes)
self.dropout = nn.Dropout(0.2)
def forward(self, x):
x1 = self.SiLU(self.dens121(x))
x1 = x1.view(-1, 2048)
x2 = self.inc(x).view(-1, 2048)
x = torch.cat([x1,x2],1)
return self.linear(self.dropout(x))
The shapes of the two tensors are very different, and that's why torch.cat() fails. I ran your code with the following example:
def forward(self, x):
x1 = self.SiLU(self.dens121(x))
x1 = x1.view(-1, 2048)
x2 = self.inc(x).view(-1, 2048)
print(x1.shape, x2.shape)
x = torch.cat([x1,x2], dim=1)
return self.linear(self.dropout(x))
Here's the driver code
inputs = torch.randn(2, 3, 416, 416)
model = Net()
outputs = model(inputs)
The shapes of x1 and x2 are as follows:
torch.Size([169, 2048]) torch.Size([2, 2048])
Either your DenseNet should output the same shape as the output of Inceptionv3 or vice-versa. The output from DenseNet is of shape torch.Size([2, 1024, 13, 13]) and the output from Inceptionv3 is of shape torch.Size([2, 2048]).
EDIT
Add this line to the __init__ method:
self.conv_reshape= nn.Conv2d(1024, 2048, kernel_size=13, stride=1)
Add these lines to your forward():
x1 = self.SiLU(self.dens121(x))
out = self.conv_reshape(x1)
x1 = out.view(-1, out.size(1))
x2 = self.inc(x).view(-1, 2048)
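Re-running the earlier driver with these changes should make the two branches agree (shapes as I work them out, worth double-checking on your side): conv_reshape turns the (2, 1024, 13, 13) DenseNet map into (2, 2048, 1, 1), which flattens to (2, 2048) and matches the Inceptionv3 branch:

inputs = torch.randn(2, 3, 416, 416)
model = Net()
outputs = model(inputs)
print(outputs.shape)    # torch.Size([2, 20]); the cat gives (2, 4096) into self.linear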
I am trying to concatenate an embedding layer with other features. It doesn't give me any error, but it doesn't do any training either. Is anything wrong with this model definition? How can I debug this?
Note: The last column (feature) in my X is the feature with word2ix (a single word).
Note: The net works fine without the embedding feature/layer.
Originally posted on the PyTorch forum.
class Net(torch.nn.Module):
def __init__(self, n_features, h_sizes, num_words, embed_dim, out_size, dropout=None):
super().__init__()
self.num_layers = len(h_sizes) # hidden + input
self.embedding = torch.nn.Embedding(num_words, embed_dim)
self.hidden = torch.nn.ModuleList()
self.bnorm = torch.nn.ModuleList()
if dropout is not None:
self.dropout = torch.nn.ModuleList()
else:
self.dropout = None
for k in range(len(h_sizes)):
if k == 0:
self.hidden.append(torch.nn.Linear(n_features, h_sizes[0]))
self.bnorm.append(torch.nn.BatchNorm1d(h_sizes[0]))
if self.dropout is not None:
self.dropout.append(torch.nn.Dropout(p=dropout))
else:
if k == 1:
input_dim = h_sizes[0] + embed_dim
else:
input_dim = h_sizes[k-1]
self.hidden.append(torch.nn.Linear(input_dim, h_sizes[k]))
self.bnorm.append(torch.nn.BatchNorm1d(h_sizes[k]))
if self.dropout is not None:
self.dropout.append(torch.nn.Dropout(p=dropout))
# Output layer
self.out = torch.nn.Linear(h_sizes[-1], out_size)
def forward(self, inputs):
# Feedforward
for l in range(self.num_layers):
if l == 0:
x = self.hidden[l](inputs[:, :-1])
x = self.bnorm[l](x)
if self.dropout is not None:
x= self.dropout[l](x)
embeds = self.embedding(inputs[:,-1])#.view((1, -1)
x = torch.cat((embeds, x),dim=1)
else:
x = self.hidden[l](x)
x = self.bnorm[l](x)
if self.dropout is not None:
x = self.dropout[l](x)
x = F.relu(x)
output= self.out(x)
return output
There were a few issues. The key one was data type: I mixed float features and int indices in a single tensor.
Sample data and training before the fix:
NUM_TARGETS = 4
NUM_FEATURES = 3
NUM_TEXT_FEATURES = 1
x = np.random.rand(5, NUM_FEATURES)
y = np.random.rand(5, NUM_TARGETS)
word_ix = np.arange(5).reshape(-1,1).astype(int)
x_train = np.append(x, word_ix, axis=1)
x_train = torch.from_numpy(x_train).float().to(device)
y_train = torch.from_numpy(y).float().to(device)
h_sizes = [2,2]
net = Net(x_train.shape[1] , h_sizes=h_sizes, num_words=5, embed_dim=2, out_size=y_train.shape[1],dropout=.01) # define the network
print(net) # net architecture
net = net.float()
net.to(device)
optimizer = torch.optim.Adam(net.parameters(), lr=0.0001, weight_decay=.01)
loss_func = torch.nn.MSELoss() # this is for regression mean squared loss
# one training loop
prediction = net(x_train) # input x and predict based on x
loss = loss_func(prediction, y_train) # must be (1. nn output, 2. target)
optimizer.zero_grad() # clear gradients for next train
loss.backward() # backpropagation, compute gradients
optimizer.step() # apply gradients
# train_losses.append(loss.detach().to('cpu').numpy())
To resolve this, I separated the word index feature from x and also removed net.float().
The dtype conversion changed to:
x_train = torch.from_numpy(x).float().to(device)
y_train = torch.from_numpy(y).float().to(device)
# NOTE: word index needs to be long
word_ix = torch.from_numpy(word_ix).to(torch.long).to(device)
and the forward method changed to:
def forward(self, inputs, word_ix):
# Feedforward
for l in range(self.num_layers):
if l == 0:
x = self.hidden[l](inputs)
x = self.bnorm[l](x)
if self.dropout is not None:
x = self.dropout[l](x)
embeds = self.embedding(word_ix)
# NOTE:
# embeds has a shape of (batch_size, 1, embed_dim);
# in order to merge it with x, reshape it to
# (batch_size, embed_dim)
embeds = embeds.view(embeds.shape[0], embeds.shape[2])
x = torch.cat((x, embeds.view(x.shape)),dim=1)
else:
x = self.hidden[l](x)
x = self.bnorm[l](x)
if self.dropout is not None:
x = self.dropout[l](x)
x = F.relu(x)
output= self.out(x)
return output
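With the new signature, the training step passes the word indices as a separate tensor. A minimal call that mirrors the sample data above (shapes assumed from that snippet; adjust to your pipeline):

net = Net(NUM_FEATURES, h_sizes=h_sizes, num_words=5, embed_dim=2,
          out_size=NUM_TARGETS, dropout=.01).to(device)
prediction = net(x_train, word_ix)    # float features and long indices kept separate
loss = loss_func(prediction, y_train)
optimizer.zero_grad()
loss.backward()
optimizer.step()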