How to add a multihead attention layer to a CNN-LSTM model?

How to add a multihead attention layer to a CNN-LSTM model? - deep-learning

I'm trying to make a hybrid binary text classification model using a multi-head attention mechanism with CNN-LSTM. However, I'm facing an issue when trying to pass the values obtained from CNN-LSTM to the attention layer.
This was what I tried:
Here's the code for defining multihead attention layer.
class TransformerBlock(layers.Layer):
def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
super().__init__()
self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
self.ffn = keras.Model(
[layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
)
self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = layers.Dropout(rate)
self.dropout2 = layers.Dropout(rate)
def call(self, inputs, training):
attn_output = self.att(inputs, inputs)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(inputs + attn_output)
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
return self.layernorm2(out1 + ffn_output)
This one is for positional encoding and tokenization
class TokenAndPositionEmbedding(layers.Layer):
def __init__(self, maxlen, vocab_size, embed_dim):
super().__init__()
self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)
def call(self, x):
maxlen = tf.shape(x)[-1]
positions = tf.range(start=0, limit=maxlen, delta=1)
positions = self.pos_emb(positions)
x = self.token_emb(x)
return x + positions
Defining the whole arhitecture
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
x = Conv1D(256, 3, activation='relu')(x)
x = Dropout(0.5)(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = Dropout(0.5)(x)
x = LSTM(64, return_sequences=True)(x)
x = Dropout(0.5)(x)
x = LSTM(32)(x)
x = TransformerBlock(256, num_heads, ff_dim)(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs=inputs, outputs=outputs)
The error
StagingError Traceback (most recent call last)
<ipython-input-19-ea38c3c52d0e> in <module>
10 x = Dropout(0.5)(x)
11 x = LSTM(32)(x)
---> 12 x = TransformerBlock(256, num_heads, ff_dim)(x)
13 x = layers.GlobalAveragePooling1D()(x)
14 x = layers.Dropout(0.1)(x)
1 frames
/usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/impl/api.py in wrapper(*args, **kwargs)
690 except Exception as e: # pylint:disable=broad-except
691 if hasattr(e, 'ag_error_metadata'):
--> 692 raise e.ag_error_metadata.to_exception(e)
693 else:
694 raise
StagingError: Exception encountered when calling layer "transformer_block" (type TransformerBlock).
in user code:
File "<ipython-input-17-4c5de9a08c11>", line 14, in call *
attn_output = self.att(inputs, inputs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 67, in error_handler **
raise e.with_traceback(filtered_tb) from None
File "/usr/local/lib/python3.8/dist-packages/keras/layers/activation/softmax.py", line 98, in call
return backend.softmax(inputs, axis=self.axis[0])
IndexError: Exception encountered when calling layer "softmax" (type Softmax).
tuple index out of range
Call arguments received by layer "softmax" (type Softmax):
• inputs=tf.Tensor(shape=(None, 8), dtype=float32)
• mask=None
Call arguments received by layer "transformer_block" (type TransformerBlock):
• inputs=tf.Tensor(shape=(None, 32), dtype=float32)
• training=False

Related

What is wrong with my neural net model with LSTM for regression problem that it doesn't return the model as output?

So, the questionn is this:
What I am doing wrong when defining the neural net architecture? Look at sections Define the neural network model and Define the learning rate scheduler train the model
Details:
I have written the code of this where revenue_data shape is (1749, 2) while weather_data shape is (86990, 10) X_train shape is ([69010, 14]), y_train is ([69010]), X_val is ([17253, 14]), y_val = ([17253]) and have done the preprocesing, scaling, removing oputliers and splitting the data as here:
Convert date and time columns to datetime format
revenue_data['Date'] = pd.to_datetime(revenue_data['Date'], format='%Y%m%d')
weather_data['dt'] = pd.to_datetime(weather_data['dt'], format='%Y%m%d')
weather_data['time'] = pd.to_datetime(weather_data['time'], format='%H:%M:%S')
Convert wind and condition columns to embeddings
wind_embeddings = nn.Embedding(len(weather_data['wind'].unique()), 5)
weather_data['wind_code'] = weather_data['wind'].astype('category').cat.codes
wind_vectors = wind_embeddings(torch.tensor(weather_data['wind_code'].values, dtype=torch.long))
weather_data['wind_x'] = wind_vectors[:, 0].detach().numpy()
weather_data['wind_y'] = wind_vectors[:, 1].detach().numpy()
weather_data['wind_z'] = wind_vectors[:, 2].detach().numpy()
weather_data['wind_t'] = wind_vectors[:, 3].detach().numpy()
weather_data['wind_u'] = wind_vectors[:, 4].detach().numpy()
condition_embeddings = nn.Embedding(len(weather_data['condition'].unique()), 3)
weather_data['condition_code'] = weather_data['condition'].astype('category').cat.codes
condition_vectors = condition_embeddings(torch.tensor(weather_data['condition_code'].values, dtype=torch.long))
weather_data['condition_x'] = condition_vectors[:, 0].detach().numpy()
weather_data['condition_y'] = condition_vectors[:, 1].detach().numpy()
weather_data['condition_z'] = condition_vectors[:, 2].detach().numpy()
Group the weather data by date and hour and calculate the mean for each date and hour
weather_data = weather_data.groupby(['dt', 'time']).mean()
weather_data = weather_data.reset_index()
weather_data['Date'] = weather_data['dt']
weather_data.drop(['dt', 'time', 'wind_code', 'condition_code'], axis=1, inplace=True)
Merge the revenue and weather data on the 'Date' column and drop 'Date'
merged_data = pd.merge(revenue_data, weather_data, on='Date')
merged_data.drop('Date', axis=1, inplace=True)
merged_data.head()
Scale the data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(merged_data)
Split the data into input and target sets
X = scaled_data[:, 1:]
y = scaled_data[:, 0]
from scipy.stats import zscore
Calculate z-scores for each feature | Remove outliers that have z-scor bigger that 3
z_scores = zscore(X)
Identify rows where any feature has a z-score > 3
mask = (z_scores > 3).any(axis=1)
Remove rows with high z-scores from the x and y
features = X[~mask, :]
target = y[~mask]
Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
Convert the data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.float32)
but the I am struggling to realise what is wrong with the neural net architecture defined:
Define the neural network model
class RevenuePredictor(nn.Module):
def __init__(self):
super().__init__()
self.lstm = nn.LSTM(input_size=14, hidden_size=32, num_layers=1, batch_first=True)
self.fc1 = nn.Linear(32, 16)
self.fc2 = nn.Linear(16, 1)
def forward(self, x, lengths):
print('x shape:', x.shape)
Get the lengths of the input sequences
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lengths = lengths.to(device)
lengths = lengths.cpu()
print('lengths shape:', lengths.shape)
Sort the input sequences by length
sorted_lengths, sorted_idx = torch.sort(lengths, descending=True)
sorted_x = x[sorted_idx]
Pack the sorted input sequences
packed_x = nn.utils.rnn.pack_padded_sequence(sorted_x, sorted_lengths, batch_first=True)
Convert the packed sequence to a tensor with two dimensions
x_data, batch_sizes = nn.utils.rnn.pad_packed_sequence(packed_x, batch_first=True)
Convert the packed sequence to a tensor with two dimensions
x_data, batch_sizes = x.data, x.batch_sizes
seq_len = batch_sizes[0]
batch_size = len(batch_sizes)
x = x_data.new_zeros((batch_size, seq_len, 14))
s = 0
for i, l in enumerate(batch_sizes):
x[i, :l] = x_data[s:(s+l)]
s += l
Pass the packed input sequences through the LSTM
lstm_output, (h, c) = self.lstm(packed_x)
Unpack the LSTM output sequences
unpacked_output, _ = nn.utils.rnn.pad_packed_sequence(lstm_output, batch_first=True)
Re-sort the output sequences to their original order
unsorted_idx = sorted_idx.sort(0)
output = unpacked_output[unsorted_idx]
Pass the output sequences through the fully connected layers
output = nn.functional.relu(self.fc1(output[:, -1, :]))
output = self.fc2(output)
return output
Then Create the model
model = RevenuePredictor()
followed by loss and metrics
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
metrics = {
'mse': MeanSquaredError(),
'mae': MeanAbsoluteError(),
'r2': R2Score(),
}
Define the learning rate scheduler train the model
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=True)
best_val_loss = np.inf
for epoch in range(num_epochs):
# Set the model to training mode
model.train()
train_loss = 0.0
num_batches = 0
for X_train, y_train in train_loader:
lengths = torch.ones(X_train.shape[0], dtype=torch.long)
optimizer.zero_grad()
output = model(X_train, lengths)
loss = loss_fn(output, y_train)
loss.backward()
optimizer.step()
train_loss += loss.item()
num_batches += 1
val_loss = 0.0
for X_val, y_val in val_loader:
lengths = torch.ones(X_val.shape[0], dtype=torch.long)
output = model(X_val, lengths)
loss = loss_fn(output, y_val)
val_loss += loss.item()
scheduler.step(val_loss)
val_loss /= len(val_loader)
val_mse = metrics['mse'].compute()
val_mae = metrics['mae'].compute()
val_r2 = metrics['r2'].compute()
for metric in metrics.values():
metric.reset()
if (epoch+1) % 100 == 0:
print('Epoch [{}/{}], Train Loss: {:.4f}, Val Loss: {:.4f}, MSE: {:.4f}, MAE: {:.4f}, R2: {:.4f}'
.format(epoch+1, num_epochs, train_loss/num_batches, val_loss, val_mse, val_mae, val_r2))
I get this error which I think is because of something being wwrong in defining neural netwrok model:
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-164-e20b93c25048>", line 3, in <module>
output = model(X_train, lengths)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "<ipython-input-163-43b2ef5c15db>", line 38, in forward
lstm_output, (h, c) = self.lstm(packed_x)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/rnn.py", line 772, in forward
self.check_forward_args(input, hx, batch_sizes)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/rnn.py", line 697, in check_forward_args
self.check_input(input, batch_sizes)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/rnn.py", line 206, in check_input
raise RuntimeError(
# RuntimeError: input must have 2 dimensions, got 1
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 2040, in showtraceback
stb = value._render_traceback_()
AttributeError: 'RuntimeError' object has no attribute '_render_traceback_'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/IPython/core/ultratb.py", line 1101, in get_records
return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
File "/usr/local/lib/python3.8/dist-packages/IPython/core/ultratb.py", line 319, in wrapped
return f(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/IPython/core/ultratb.py", line 353, in _fixed_getinnerframes
records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))
File "/usr/lib/python3.8/inspect.py", line 1515, in getinnerframes
frameinfo = (tb.tb_frame,) + getframeinfo(tb, context)
File "/usr/lib/python3.8/inspect.py", line 1473, in getframeinfo
filename = getsourcefile(frame) or getfile(frame)
File "/usr/lib/python3.8/inspect.py", line 708, in getsourcefile
if getattr(getmodule(object, filename), '__loader__', None) is not None:
File "/usr/lib/python3.8/inspect.py", line 737, in getmodule
file = getabsfile(object, _filename)
File "/usr/lib/python3.8/inspect.py", line 721, in getabsfile
return os.path.normcase(os.path.abspath(_filename))
File "/usr/lib/python3.8/posixpath.py", line 379, in abspath
cwd = os.getcwd()
FileNotFoundError: [Errno 2] No such file or directory
---------------------------------------------------------------------------
I tried coverting the packed sequence to a tensor with two dimensions in different way:
x_data, ba
tch_sizes = x.data, x.batch_sizes
seq_len = batch_sizes[0]
batch_size = len(batch_sizes)
x = x_data.new_zeros((batch_size, seq_len, 14))
s = 0
for i, l in enumerate(batch_sizes):
x[i, :l] = x_data[s:(s+l)]
s += l
Didnt work.
Then tried rehsaping x to have three dimensions like:
batch_size, seq_len, input_size = x.shape
Didn't work and finally tried:
unsqueze(-1) on output after I defined model like:
model = REvenuePredictor()
output = model(X_train, lengths).unsqueeze(-1)

ValueError: Graph disconnected: cannot obtain value for tensor KerasTensor(type_spec=TensorSpec(shape=(None, None, None, 3)

I am working on a MetaLearning approach to solving an image segmentation problem. I trained 7 different models on different classes related to the eye dataset provided under Oculus SBVPI (iris, pupil, sclera, canthus, periocular, vessels, eyelashes). I want to connect these several models together by having seven input heads corresponding to seven pre-trained models and then concatenating their outputs to get the final result.
I want to create a model architecture with 7 input heads every input head connects with its own encoder block and then at the end of the encoder block the output is concatenated and passed to a single decoder block to produce output.
I am not able to figure out the problem, any suggestions at all will be of great help.
I tried other similar approaches as given here and here
My code:
import tensorflow as tf
# Assume all libraries are imported
class PretrainedMetaUNet:
def __init__(self, config, save_model=True, show_summary=True, sigmoid=True, **kwargs):
super(PretrainedMetaUNet, self).__init__(**kwargs)
self.configuration = config
self.x_size = config["dataset"]["width"] # 128
self.y_size = config["dataset"]["height"] # 128
self.channels = config["dataset"]["img_channels"] # 3
self.save_model = save_model
self.add_sigmoid = sigmoid
self.meta_classes = config["dataset"]["meta_channels"] # 1
self.show_summary = show_summary
self.model_img = config["Network"]["modelpath"]
self.dropout_factor = config["Network"]['dropout']
self.maxpool = tf.keras.layers.MaxPooling2D()
def ConvolutionBN(self, input_tensor, filters):
x = tf.keras.layers.Conv2D(filters, 3, padding='same', kernel_initializer='he_normal')(input_tensor)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.ReLU()(x)
if self.dropout_factor != 0.0:
x = tf.keras.layers.Dropout(self.dropout_factor)(x)
x = tf.keras.layers.Conv2D(filters, 3, padding='same', kernel_initializer='he_normal')(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.ReLU()(x)
if self.dropout_factor != 0.0:
x = tf.keras.layers.Dropout(self.dropout_factor)(x)
return x
def return_pretrained_model_encoder_weights(self):
root_path = self.configuration['train']['Output']['weight']
path = glob.glob(os.path.join(root_path, '*_weights'), recursive=True)
# path variable has absolute address of all the seven models ['/path to sclera model', '/path to iris model', ...]
# Every model is trained on the input images of spatial dimension 128x128x3 predicting a segmentation mask of 128x128x1.
encoder_weights = []
for idx, model_path in enumerate(path):
model = tf.keras.models.load_model(filepath=model_path, compile=False)
class_name = model_path.split('/')[-1].split('_')[0] # takes class name like iris, sclera etc..
for layer in model.layers:
layer._name = layer.name + str('_' + class_name)
# I used segmentation_models library ==> sm.UNet('resnet34', encoder_weights='imagenet')
# Accessing different encoding stages of the model. Every model is Unet pretrained with resnet34 weights.
s1 = model.get_layer(name='data'+str('_' + class_name)).output
s2 = model.get_layer(name='relu0'+str('_' + class_name)).output
s3 = model.get_layer(name='stage2_unit1_relu1'+str('_' + class_name)).output
s4 = model.get_layer(name='stage3_unit1_relu1'+str('_' + class_name)).output
b1 = model.get_layer(name='stage4_unit1_relu1'+str('_' + class_name)).output
encoder_weights.append([b1, s1, s2, s3, s4])
del model
return encoder_weights
def DecoderBlock(self, input_tensor, skip_features, filters, name=None):
if name is None:
x = tf.keras.layers.Conv2DTranspose(filters, kernel_size=2, strides=2,
padding='SAME', kernel_initializer='he_normal')(input_tensor)
else:
x = tf.keras.layers.Conv2DTranspose(filters, kernel_size=2, strides=2, name=name,
padding='SAME', kernel_initializer='he_normal')(input_tensor)
x = tf.keras.layers.Concatenate()([x, skip_features])
x = self.ConvolutionBN(x, filters)
return x
def return_model(self):
input_1 = tf.keras.layers.Input(shape=(self.x_size, self.y_size, self.channels)) # (128,128,3)
input_2 = tf.keras.layers.Input(shape=(self.x_size, self.y_size, self.channels)) # (128,128,3)
input_3 = tf.keras.layers.Input(shape=(self.x_size, self.y_size, self.channels)) # (128,128,3)
input_4 = tf.keras.layers.Input(shape=(self.x_size, self.y_size, self.channels)) # (128,128,3)
input_5 = tf.keras.layers.Input(shape=(self.x_size, self.y_size, self.channels)) # (128,128,3)
input_6 = tf.keras.layers.Input(shape=(self.x_size, self.y_size, self.channels)) # (128,128,3)
input_7 = tf.keras.layers.Input(shape=(self.x_size, self.y_size, self.channels)) # (128,128,3)
inputs = [input_1, input_2, input_3, input_4, input_5, input_6, input_7]
encoder_weights = self.return_pretrained_model_encoder_weights()
# Concatenating all the relative weights together
full_b = [layer[0] for layer in encoder_weights]
full_s1 = [layer[1] for layer in encoder_weights]
full_s2 = [layer[2] for layer in encoder_weights]
full_s3 = [layer[3] for layer in encoder_weights]
full_s4 = [layer[4] for layer in encoder_weights]
concatenated_b = tf.keras.layers.Concatenate(axis=-1)(full_b)
concatenated_s1 = tf.keras.layers.Concatenate(axis=-1)(full_s1)
concatenated_s2 = tf.keras.layers.Concatenate(axis=-1)(full_s2)
concatenated_s3 = tf.keras.layers.Concatenate(axis=-1)(full_s3)
concatenated_s4 = tf.keras.layers.Concatenate(axis=-1)(full_s4)
# Creating final decoder block
d1 = self.DecoderBlock(concatenated_b, concatenated_s4, 512, name='decoder_start')
d2 = self.DecoderBlock(d1, concatenated_s3, 256, name='block2')
d3 = self.DecoderBlock(d2, concatenated_s2, 128, name='block3')
d4 = self.DecoderBlock(d3, concatenated_s1, 64, name='block4')
""" Output """
if self.add_sigmoid:
activation = 'sigmoid'
else:
activation = None
output = tf.keras.layers.Conv2D(self.meta_classes, 1, padding='same', activation=activation)(d4)
model = tf.keras.Model(inputs=inputs, outputs=output)
if self.show_summary:
print(model.summary())
if self.save_model:
tf.keras.utils.plot_model(model, show_dtype=True, show_layer_names=True, show_shapes=True,
to_file=self.model_img)
return model
if __name__ == '__main__':
# Loaded Configuration file...
pretrained_unet = PretrainedMetaUNet(config=configuration)
model = pretrained_unet.return_model()
Error:
ValueError: Graph disconnected: cannot obtain value for tensor KerasTensor(type_spec=TensorSpec(shape=(None, None, None, 3), dtype=tf.float32, name='data'), name='data', description="created by layer 'data_pupil'") at layer "bn_data_pupil". The following previous layers were accessed without issue: []

Swin Transformer for Facial expression recognition

I am using Swin transformer to build a Facial expression recognition model. The complete code is shared by Author: Rishit Dagli on 'https://keras.io/examples/vision/swin_transformers/'.
In Swin transformer class, the authors have used two Dense layers and I want to replace it with ResNet50. I came to know that the input to Resnet50 needs to be of the shape (none, none, 3). But when the images of shape (96, 96, 3) in the model are flowing through the initial layers -'PatchExtract' and 'PatchEmbedding', its getting converted to shape (576, 64). Then, when its getting passed to SwinTransformer with ResNet50, its giving an error:
Please suggest how can I embed Resnet model as a layer in SwinTransformer.
Swin Transformer class:
class SwinTransformer(layers.Layer):
def __init__(
self,
dim,
num_patch,
num_heads,
window_size=7,
shift_size=0,
num_mlp=1024,
qkv_bias=True,
dropout_rate=0.0,
**kwargs,
):
super(SwinTransformer, self).__init__(**kwargs)
self.dim = dim # number of input dimensions
self.num_patch = num_patch # number of embedded patches
self.num_heads = num_heads # number of attention heads
self.window_size = window_size # size of window
self.shift_size = shift_size # size of window shift
self.num_mlp = num_mlp # number of MLP nodes
self.norm1 = layers.LayerNormalization(epsilon=1e-5)
self.attn = WindowAttention(
dim,
window_size=(self.window_size, self.window_size),
num_heads=num_heads,
qkv_bias=qkv_bias,
dropout_rate=dropout_rate,
)
self.drop_path = DropPath(dropout_rate)
self.norm2 = layers.LayerNormalization(epsilon=1e-5)
self.mlp = keras.Sequential(
[
# I am trying to add pretrained ResNet50 here
layers.Dense(num_mlp),
layers.Activation(keras.activations.gelu),
layers.Dropout(dropout_rate),
layers.Dense(dim),
layers.Dropout(dropout_rate),
]
)
if min(self.num_patch) < self.window_size:
self.shift_size = 0
self.window_size = min(self.num_patch)
def build(self, input_shape):
if self.shift_size == 0:
self.attn_mask = None
else:
height, width = self.num_patch
h_slices = (
slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None),
)
w_slices = (
slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None),
)
mask_array = np.zeros((1, height, width, 1))
count = 0
for h in h_slices:
for w in w_slices:
mask_array[:, h, w, :] = count
count += 1
mask_array = tf.convert_to_tensor(mask_array)
# mask array to windows
mask_windows = window_partition(mask_array, self.window_size)
mask_windows = tf.reshape(
mask_windows, shape=[-1, self.window_size * self.window_size]
)
attn_mask = tf.expand_dims(mask_windows, axis=1) - tf.expand_dims(
mask_windows, axis=2
)
attn_mask = tf.where(attn_mask != 0, -100.0, attn_mask)
attn_mask = tf.where(attn_mask == 0, 0.0, attn_mask)
self.attn_mask = tf.Variable(initial_value=attn_mask, trainable=False)
def call(self, x):
height, width = self.num_patch
_, num_patches_before, channels = x.shape
x_skip = x
x = self.norm1(x)
x = tf.reshape(x, shape=(-1, height, width, channels))
if self.shift_size > 0:
shifted_x = tf.roll(
x, shift=[-self.shift_size, -self.shift_size], axis=[1, 2]
)
else:
shifted_x = x
x_windows = window_partition(shifted_x, self.window_size)
x_windows = tf.reshape(
x_windows, shape=(-1, self.window_size * self.window_size, channels)
)
attn_windows = self.attn(x_windows, mask=self.attn_mask)
attn_windows = tf.reshape(
attn_windows, shape=(-1, self.window_size, self.window_size, channels)
)
shifted_x = window_reverse(
attn_windows, self.window_size, height, width, channels
)
if self.shift_size > 0:
x = tf.roll(
shifted_x, shift=[self.shift_size, self.shift_size], axis=[1, 2]
)
else:
x = shifted_x
x = tf.reshape(x, shape=(-1, height * width, channels))
x = self.drop_path(x)
x = x_skip + x
x_skip = x
x = self.norm2(x)
x = self.mlp(x)
x = self.drop_path(x)
x = x_skip + x
return x
Model:
input = layers.Input(input_shape)
x = layers.RandomCrop(image_dimension, image_dimension)(input)
x = layers.RandomFlip("horizontal")(x)
x = PatchExtract(patch_size)(x)
x = PatchEmbedding(num_patch_x * num_patch_y, embed_dim)(x)
x = SwinTransformer(
dim=embed_dim,
num_patch=(num_patch_x, num_patch_y),
num_heads=num_heads,
window_size=window_size,
shift_size=0,
num_mlp=num_mlp,
qkv_bias=qkv_bias,
dropout_rate=dropout_rate,
)(x)
x = SwinTransformer(
dim=embed_dim,
num_patch=(num_patch_x, num_patch_y),
num_heads=num_heads,
window_size=window_size,
shift_size=shift_size,
num_mlp=num_mlp,
qkv_bias=qkv_bias,
dropout_rate=dropout_rate,
)(x)
x = PatchMerging((num_patch_x, num_patch_y), embed_dim=embed_dim)(x)
x = layers.GlobalAveragePooling1D()(x)
output = layers.Dense(num_classes, activation="softmax")(x)
I added ResNet50 layer but got the below error:
ValueError: Exception encountered when calling layer "swin_transformer_2" (type SwinTransformer).
in user code:
File "<ipython-input-12-4a18cac0b25c>", line 121, in call *
x = self.mlp(x)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 67, in error_handler **
raise e.with_traceback(filtered_tb) from None
File "/usr/local/lib/python3.7/dist-packages/keras/engine/input_spec.py", line 214, in assert_input_compatibility
raise ValueError(f'Input {input_index} of layer "{layer_name}" '
ValueError: Exception encountered when calling layer "resnet50" (type Functional).
Input 0 of layer "conv1_pad" is incompatible with the layer: expected ndim=4, found ndim=3. Full shape received: (None, 576, 64)
Call arguments received by layer "resnet50" (type Functional):
• inputs=tf.Tensor(shape=(None, 576, 64), dtype=float32)
• training=False
• mask=None
Call arguments received by layer "swin_transformer_2" (type SwinTransformer):
• x=tf.Tensor(shape=(None, 576, 64), dtype=float32)

DenseNet, Sizes of tensors must match

would you know how I can adapt this code so that sizes of tensors must match because I have this error: x = torch.cat([x1,x2],1) RuntimeError: Sizes of tensors must match except in dimension 0. Got 32 and 1 (The offending index is 0).
My images are size 416x416.
Thank you in advance for your help,
num_classes = 20
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.inc = models.inception_v3(pretrained=True)
self.inc.aux_logits = False
for child in list(self.inc.children())[:-5]:
for param in child.parameters():
param.requires_grad = False
self.inc.fc = nn.Sequential()
self.dens121 = models.densenet121(pretrained=True)
for child in list(self.dens121.children())[:-6]:
for param in child.parameters():
param.requires_grad = False
self.dens121 = nn.Sequential(*list(self.dens121.children())[:-1])
self.SiLU = nn.SiLU()
self.linear = nn.Linear(4096, num_classes)
self.dropout = nn.Dropout(0.2)
def forward(self, x):
x1 = self.SiLU(self.dens121(x))
x1 = x1.view(-1, 2048)
x2 = self.inc(x).view(-1, 2048)
x = torch.cat([x1,x2],1)
return self.linear(self.dropout(x))

The shapes of the two tensors are very different and that's why the torch.cat() fails. I tried to run your code with the following example:
def forward(self, x):
x1 = self.SiLU(self.dens121(x))
x1 = x1.view(-1, 2048)
x2 = self.inc(x).view(-1, 2048)
print(x1.shape, x2.shape)
x = torch.cat([x1,x2], dim=1)
return self.linear(self.dropout(x))
Here's the driver code
inputs = torch.randn(2, 3, 416, 416)
model = Net()
outputs = model(inputs)
The shapes of x1 of x2 are as follows:
torch.Size([169, 2048]) torch.Size([2, 2048])
Either your DenseNet should output the same shape as the output of Inceptionv3 or vice-versa. The output from DenseNet is of shape torch.Size([2, 1024, 13, 13]) and the output from Inceptionv3 is of shape torch.Size([2, 2048]).
EDIT
Add this line to the init method:
self.conv_reshape= nn.Conv2d(1024, 2048, kernel_size=13, stride=1)
Add these lines to your forward():
x1 = self.SiLU(self.dens121(x))
out = self.conv_reshape(x1)
x1 = out.view(-1, out.size(1))
x2 = self.inc(x).view(-1, 2048)

MXNET CNN+LSTM save/serialize to json

I'm finding a hardtime figuring out how to correctly define a mxnet net so that i can serialize/convert this model to a json file.
The pipeline is composed of a CNN + biLSTM + CTC.
I now i must use HybridBlock and hybridize() but i can't seem to make it work or if its even possible or if there is any other way around.
I'm sure its lack of knowledge on my part and wonder is anyone can help.
Here is the net definition in python:
NUM_HIDDEN = 200
NUM_CLASSES = 13550
NUM_LSTM_LAYER = 1
p_dropout = 0.5
SEQ_LEN = 32
def get_featurizer():
featurizer = gluon.nn.HybridSequential()
# conv layer
featurizer.add(gluon.nn.Conv2D(kernel_size=(3,3), padding=(1,1), channels=32, activation="relu"))
featurizer.add(gluon.nn.BatchNorm())
....
featurizer.hybridize()
return featurizer
class EncoderLayer(gluon.Block):
def __init__(self, **kwargs):
super(EncoderLayer, self).__init__(**kwargs)
with self.name_scope():
self.lstm = mx.gluon.rnn.LSTM(NUM_HIDDEN, NUM_LSTM_LAYER, bidirectional=True)
def forward(self, x):
x = x.transpose((0,3,1,2))
x = x.flatten()
x = x.split(num_outputs=SEQ_LEN, axis = 1) # (SEQ_LEN, N, CHANNELS)
x = nd.concat(*[elem.expand_dims(axis=0) for elem in x], dim=0)
x = self.lstm(x)
x = x.transpose((1, 0, 2)) # (N, SEQ_LEN, HIDDEN_UNITS)
return x
def get_encoder():
encoder = gluon.nn.Sequential()
encoder.add(EncoderLayer())
encoder.add(gluon.nn.Dropout(p_dropout))
return encoder
def get_decoder():
decoder = mx.gluon.nn.Dense(units=ALPHABET_SIZE, flatten=False)
decoder.hybridize()
return decoder
def get_net():
net = gluon.nn.Sequential()
with net.name_scope():
net.add(get_featurizer())
net.add(get_encoder())
net.add(get_decoder())
return net
Any help would be highly appreciated.
Thank you very much.

There are few requirements for a model in Gluon to be exportable to json:
It needs to be hybridizable, meaning that each children block should be hybridizable as well and the model works in both modes
All parameters should be initialized. Since Gluon uses deferred parameter initialization, that means that you should do forward pass at least once before you can save the model.
I did some fixes for your code also introducing new constants when I needed. The most significant changes are:
Don't use split if you can avoid it, because it returns list of NDArrays. Use reshape, which works seemlessly with Symbol as well.
Starting from 1.3.0 version of MXNet, LSTM is also hybridizable, so you can wrap it in a HybridBlock instead of just a Block.
Use HybridSequential.
Here is the adjusted code with an example at the bottom how to save the model and how to load it back. You can find more information in this tutorial.
import mxnet as mx
from mxnet import gluon
from mxnet import nd
BATCH_SIZE = 1
CHANNELS = 100
ALPHABET_SIZE = 1000
NUM_HIDDEN = 200
NUM_CLASSES = 13550
NUM_LSTM_LAYER = 1
p_dropout = 0.5
SEQ_LEN = 32
HEIGHT = 100
WIDTH = 100
def get_featurizer():
featurizer = gluon.nn.HybridSequential()
featurizer.add(
gluon.nn.Conv2D(kernel_size=(3, 3), padding=(1, 1), channels=32, activation="relu"))
featurizer.add(gluon.nn.BatchNorm())
return featurizer
class EncoderLayer(gluon.HybridBlock):
def __init__(self, **kwargs):
super(EncoderLayer, self).__init__(**kwargs)
with self.name_scope():
self.lstm = mx.gluon.rnn.LSTM(NUM_HIDDEN, NUM_LSTM_LAYER, bidirectional=True)
def hybrid_forward(self, F, x):
x = x.transpose((0, 3, 1, 2))
x = x.flatten()
x = x.reshape(shape=(SEQ_LEN, -1, CHANNELS)) #x.split(num_outputs=SEQ_LEN, axis=1) # (SEQ_LEN, N, CHANNELS)
x = self.lstm(x)
x = x.transpose((1, 0, 2)) # (N, SEQ_LEN, HIDDEN_UNITS)
return x
def get_encoder():
encoder = gluon.nn.HybridSequential()
encoder.add(EncoderLayer())
encoder.add(gluon.nn.Dropout(p_dropout))
return encoder
def get_decoder():
decoder = mx.gluon.nn.Dense(units=ALPHABET_SIZE, flatten=False)
return decoder
def get_net():
net = gluon.nn.HybridSequential()
with net.name_scope():
net.add(get_featurizer())
net.add(get_encoder())
net.add(get_decoder())
return net
if __name__ == '__main__':
net = get_net()
net.initialize()
net.hybridize()
fake_data = mx.random.uniform(shape=(BATCH_SIZE, HEIGHT, WIDTH, CHANNELS))
out = net(fake_data)
net.export("mymodel")
deserialized_net = gluon.nn.SymbolBlock.imports("mymodel-symbol.json", ['data'],
"mymodel-0000.params", ctx=mx.cpu())
out2 = deserialized_net(fake_data)
# just to check that we get the same results
assert (out - out2).sum().asscalar() == 0

We Keep Coding

html mysql json google-apps-script actionscript-3 ms-access google-chrome google-maps reporting-services sql-server-2008

How to add a multihead attention layer to a CNN-LSTM model? - deep-learning

Related

What is wrong with my neural net model with LSTM for regression problem that it doesn't return the model as output?

ValueError: Graph disconnected: cannot obtain value for tensor KerasTensor(type_spec=TensorSpec(shape=(None, None, None, 3)

Swin Transformer for Facial expression recognition

DenseNet, Sizes of tensors must match

MXNET CNN+LSTM save/serialize to json

Categories

Resources