How to make a multi-input and multi-output neural network model - deep-learning

I converted the following code from Keras to PyTorch. The main challenge for me is building a multi-input and multi-output model similar to keras.models.Model, i.e. how to implement the following code in PyTorch so that it accepts multiple inputs and produces multiple outputs.
from tensorflow import keras as k
import tensorflow as tf

class NetworkKeys:
    NUM_UNITS = "num_units"
    ACTIVATION = "activation"
    L2_REG_FACT = "l2_reg_fact"
    DROP_PROB = "drop_prob"
    BATCH_NORM = "batch_norm"

def build_dense_network(input_dim, output_dim,
                        output_activation, params, with_output_layer=True):
    model = k.models.Sequential()
    activation = params.get(NetworkKeys.ACTIVATION, "relu")
    l2_reg_fact = params.get(NetworkKeys.L2_REG_FACT, 0.0)
    regularizer = k.regularizers.l2(l2_reg_fact) if l2_reg_fact > 0 else None
    drop_prob = params.get(NetworkKeys.DROP_PROB, 0.0)
    batch_norm = params.get(NetworkKeys.BATCH_NORM, False)

    last_dim = input_dim
    for i in range(len(params[NetworkKeys.NUM_UNITS])):
        model.add(k.layers.Dense(units=params[NetworkKeys.NUM_UNITS][i],
                                 kernel_regularizer=regularizer,
                                 input_dim=last_dim))
        if batch_norm:
            model.add(k.layers.BatchNormalization())
        model.add(k.layers.Activation(activation))
        last_dim = params[NetworkKeys.NUM_UNITS][i]
        if drop_prob > 0.0:
            model.add(k.layers.Dropout(rate=drop_prob))

    if with_output_layer:
        model.add(k.layers.Dense(units=output_dim, activation=output_activation))

    return model

ldre_net = build_dense_network(input_dim=input_dim, output_dim=1,
                               output_activation=k.activations.linear,
                               params=hidden_params)

p_samples = k.layers.Input(shape=(input_dim,))
q_samples = k.layers.Input(shape=(input_dim,))

train_model = k.models.Model(inputs=[p_samples, q_samples],
                             outputs=[ldre_net(p_samples), ldre_net(q_samples)])
Here is my attempt to convert the above code to Pytorch code:
import torch
import torch.nn as nn

def l2_penalty(model, l2_lambda=0.001):
    """Returns the L2 penalty of the params."""
    l2_norm = sum(p.pow(2).sum() for p in model.parameters())
    return l2_lambda * l2_norm

def build_dense_network(input_dim, output_dim,
                        output_activation, params, with_output_layer=True):
    activation = params.get(NetworkKeys.ACTIVATION, "relu")
    l2_reg_fact = params.get(NetworkKeys.L2_REG_FACT, 0.0)
    drop_prob = params.get(NetworkKeys.DROP_PROB, 0.0)
    batch_norm = params.get(NetworkKeys.BATCH_NORM, False)

    layers = []
    last_dim = input_dim
    for i in range(len(params[NetworkKeys.NUM_UNITS])):
        layers.append(nn.Linear(last_dim, params[NetworkKeys.NUM_UNITS][i]))
        if batch_norm:
            layers.append(torch.nn.BatchNorm1d(params[NetworkKeys.NUM_UNITS][i]))
        if activation == "relu":
            layers.append(nn.ReLU())
        elif activation == "LeakyRelu":
            layers.append(nn.LeakyReLU(0.1, inplace=True))
        else:
            pass
        last_dim = params[NetworkKeys.NUM_UNITS][i]
        if drop_prob > 0.0:
            layers.append(torch.nn.Dropout(p=drop_prob))

    if with_output_layer:
        layers.append(nn.Linear(params[NetworkKeys.NUM_UNITS][-1], output_dim))

    model = nn.Sequential(*layers)
    regularizer = l2_penalty(model, l2_lambda=0.001) if l2_reg_fact > 0 else None
    return model, regularizer

class Split(torch.nn.Module):
    def __init__(self, module, n_parts: int, dim=1):
        super().__init__()
        self._n_parts = n_parts
        self._dim = dim
        self._module = module

    def forward(self, inputs):
        output = self._module(inputs)
        chunk_size = output.shape[self._dim] // self._n_parts
        return torch.split(output, chunk_size, dim=self._dim)

class Net(nn.Module):
    def __init__(self, hidden_params, input_dim):
        self._ldre_net, ldre_regularizer = build_dense_network(input_dim=input_dim,
            output_dim=1, output_activation="linear", params=hidden_params)
        self._p_samples = nn.Linear(input_dim, input_dim)
        self._q_samples = nn.Linear(input_dim, input_dim)
        self._split_layers = Split(
            self._ldre_net,
            n_parts=2,
            dim=0
        )

    def forward(self, x, inTrain=True):
        if inTrain:
            p = self._p_samples(x)
            q = self._q_samples(x)
            p = x[:, 0, :]
            q = x[:, 1, :]
            combined = torch.cat((p.view(p.size(0), -1),
                                  q.view(q.size(0), -1)), dim=0)
            p_output, q_output = self._split_layers(combined)
            return p_output, q_output
        else:
            return self._ldre_net(x)
I am wondering whether my implementation in the Net class is correct or not?

TL;DR: In PyTorch you control the number of inputs and outputs yourself, in the form of tensors (or any number of variables) passed to and returned from forward. The missing super initialization and the order of operations in forward should be fixed. I also don't particularly like the way the arguments are passed; I recommend using *args and **kwargs.
Explanation
There were a few things I had to change to make it run. The NetworkKeys constants are used to access the dictionary that is passed through, which seems like an overly complicated way to do things: you tried to provide default values, but it still throws exceptions if some keys are missing (namely num_units). I recommend just using args and kwargs and passing the dictionary as keyword arguments (a sketch of that approach follows the example below). I tried it with the following example:
values = {NetworkKeys.BATCH_NORM: False,
          NetworkKeys.L2_REG_FACT: 0.0,
          NetworkKeys.DROP_PROB: 0.0,
          NetworkKeys.ACTIVATION: "relu",
          NetworkKeys.NUM_UNITS: [10, 10]
          }

print(values)
Net(values, 10)
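As a sketch of the kwargs-based approach mentioned above (this signature, its name and its defaults are illustrative, not part of the original code), the dictionary lookup can be replaced by plain keyword arguments with defaults:

import torch.nn as nn

def build_dense_network_kw(input_dim, output_dim, num_units=(10, 10),
                           activation="relu", drop_prob=0.0, batch_norm=False,
                           with_output_layer=True):
    # Same layer stack as build_dense_network, but configured via keyword arguments
    layers = []
    last_dim = input_dim
    for units in num_units:
        layers.append(nn.Linear(last_dim, units))
        if batch_norm:
            layers.append(nn.BatchNorm1d(units))
        layers.append(nn.ReLU() if activation == "relu" else nn.LeakyReLU(0.1))
        if drop_prob > 0.0:
            layers.append(nn.Dropout(p=drop_prob))
        last_dim = units
    if with_output_layer:
        layers.append(nn.Linear(last_dim, output_dim))
    return nn.Sequential(*layers)

# A plain config dict can then be splatted into the call:
# config = {"num_units": [10, 10], "activation": "relu"}
# net = build_dense_network_kw(10, 1, **config)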
There were a few things to fix in the Net class:
It needs initialization of super (e.g. super(Net, self).__init__()).
The order of operations in the forward pass didn't make sense: you were overwriting the outputs of the linear layers with slices of x. Note that we now call self._p_samples(p), where p is one of the slices, p = x[:, 0, :].
class Net(nn.Module):
    def __init__(self, hidden_params, input_dim):
        super(Net, self).__init__()
        self._ldre_net, ldre_regularizer = build_dense_network(input_dim=input_dim,
            output_dim=1, output_activation="linear", params=hidden_params)
        self._p_samples = nn.Linear(input_dim, input_dim)
        self._q_samples = nn.Linear(input_dim, input_dim)
        self._split_layers = Split(
            self._ldre_net,
            n_parts=2,
            dim=0
        )

    def forward(self, x, inTrain=True):
        if inTrain:
            p = x[:, 0, :]
            q = x[:, 1, :]
            p = self._p_samples(p)
            q = self._q_samples(q)
            combined = torch.cat((p.view(p.size(0), -1),
                                  q.view(q.size(0), -1)), dim=0)
            p_output, q_output = self._split_layers(combined)
            return p_output, q_output
        else:
            return self._ldre_net(x)
Displaying the network, we get a successful forward pass with an input of torch.randn((1, 2, 10)) (a sketch of how to produce such a summary follows the table):
==========================================================================================
Layer (type:depth-idx)                   Output Shape              Param #
==========================================================================================
Net                                      --                        --
├─Linear: 1-1                            [1, 10]                   110
├─Linear: 1-2                            [1, 10]                   110
├─Split: 1-3                             [1, 1]                    --
├─Sequential: 1-4                        [2, 1]                    --
│    └─Linear: 2-1                       [2, 10]                   110
│    └─ReLU: 2-2                         [2, 10]                   --
│    └─Linear: 2-3                       [2, 10]                   110
│    └─ReLU: 2-4                         [2, 10]                   --
│    └─Linear: 2-5                       [2, 1]                    11
==========================================================================================
Total params: 451
Trainable params: 451
Non-trainable params: 0
Total mult-adds (M): 0.00
==========================================================================================
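The table above is the kind of summary a model-summary tool prints; a minimal sketch of producing it, assuming the torchinfo package is installed:

from torchinfo import summary

net = Net(values, 10)
summary(net, input_size=(1, 2, 10))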
The example output will be of the form:
(tensor([[-0.0699]], grad_fn=<SplitBackward0>),
tensor([[0.0394]], grad_fn=<SplitBackward0>))
Note: I didn't try to overfit this model (which you should do) to validate that it can indeed learn what you want; a minimal sketch of such a sanity check follows below.
Also, as a side note: if you really want multiple auxiliary outputs that aren't part of one tensor and have to be computed separately, you can simply return x, y in the forward pass.
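As a minimal sketch of the overfitting sanity check recommended above (the MSE loss, Adam optimizer and dummy targets here are placeholders for whatever your real objective is):

import torch
import torch.nn.functional as F

net = Net(values, 10)
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

x = torch.randn(8, 2, 10)      # one fixed batch: (batch, two sample sets, input_dim)
target_p = torch.randn(8, 1)   # dummy targets, only to check the wiring can fit them
target_q = torch.randn(8, 1)

for step in range(200):
    optimizer.zero_grad()
    p_out, q_out = net(x)
    loss = F.mse_loss(p_out, target_p) + F.mse_loss(q_out, target_q)
    loss.backward()
    optimizer.step()
# the loss should drop towards zero if the two-input/two-output wiring is correct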

Related

Self attention module occpuying too large VRAM with large input

I was trying to add a self-attention module to Progressive GAN (ProGAN), placed in the last layer before toRGB. After running a simple test file, I found that when the model grows to 256x256 output, the process is killed.
Then I tried the following test code:
import torch
import torch.nn as nn

class SelfAttention(nn.Module):
    def __init__(self, channels):
        super(SelfAttention, self).__init__()
        self.channels = channels
        num_heads = 4
        self.mha = nn.MultiheadAttention(channels, num_heads, batch_first=True)
        self.ln = nn.LayerNorm([channels])
        self.ff_self = nn.Sequential(
            nn.LayerNorm([channels]),
            nn.Linear(channels, channels),
            nn.GELU(),
            nn.Linear(channels, channels)
        )

    def forward(self, x):
        size = x.shape[3]
        print("SIZE", size)
        print("CHANNELS", self.channels)
        x = x.view(-1, self.channels, size * size).swapaxes(1, 2)
        print()
        x_ln = self.ln(x)
        attention_value, _ = self.mha(x_ln, x_ln, x_ln)
        attention_value = attention_value + x
        attention_value = self.ff_self(attention_value) + attention_value
        return attention_value.swapaxes(2, 1).view(-1, self.channels, size, size)

class SelfAttention2(nn.Module):
    def __init__(self, channels):
        super(SelfAttention2, self).__init__()
        self.query = nn.Conv2d(channels, channels // 8, kernel_size=1)
        self.key = nn.Conv2d(channels, channels // 8, kernel_size=1)
        self.value = nn.Conv2d(channels, channels, kernel_size=1)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        N, C, H, W = x.size()
        query = self.query(x).view(N, -1, W*H).permute(0, 2, 1)  # (N, C, H*W)
        key = self.key(x).view(N, -1, W*H)                       # (N, C, H*W)
        energy = torch.bmm(query, key)                           # (N, H*W, H*W)
        attention = self.softmax(energy)
        value = self.value(x).view(N, -1, W*H)                   # (N, C, H*W)
        out = torch.bmm(value, attention.permute(0, 2, 1))
        out = out.view(N, C, H, W)
        return out

if __name__ == '__main__':
    x = torch.randn((1, 64, 256, 256))
    sa1 = SelfAttention(64)
    sa1(x)
    sa2 = SelfAttention2(64)
    sa2(x)
Neither module worked: both try to allocate about 16 GB of VRAM. (With this one module taking up 16 GB, I cannot even run the whole model on a 3090.)
I am told explicitly that the method itself, i.e. "add attention to ProGAN or StyleGAN", will work and has been done.
So, is my understanding of the idea false, or does the implementation have a flaw?
Also, I have trained the model up to 32x32 output and it worked OK.
Suggestions on either my understanding or my code are welcome.
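For reference, the 16 GB figure is consistent with the memory cost of a single full attention matrix at 256x256: both modules score every pixel against every other pixel, so the energy tensor in SelfAttention2 (and each per-head attention map inside nn.MultiheadAttention) has shape (H*W, H*W) = (65536, 65536). A quick back-of-the-envelope check:

# Memory for one float32 attention matrix over all pixels of a 256x256 feature map
H = W = 256
tokens = H * W                        # 65536 pixel tokens
bytes_per_matrix = tokens ** 2 * 4    # float32 -> 4 bytes per element
print(bytes_per_matrix / 1024 ** 3)   # ~16 GiB for a single attention map, before anything else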

PyTorch: Batch size=1 but model gives 2 outputs

Here is some part of my PyTorch code:
test_loader = DataLoader(dataset=test_loader_hibiscus, batch_size=1, shuffle=False, num_workers=0)
test_losses = []
y_pred_list = []
feat_list = []

with torch.no_grad():
    model.eval()
    test_loss = 0.0
    if expe_temoin == False:
        for test_dwi, test_adc, test_tmax, test_cbf, test_cbv, test_label in test_loader:
            test_dwi = test_dwi.to(device)
            test_adc = test_adc.to(device)
            test_tmax = test_tmax.to(device)
            test_cbf = test_cbf.to(device)
            test_cbv = test_cbv.to(device)
            # note: this concatenates the train_* tensors, not the test_* tensors loaded just above
            in_imgs = torch.cat((train_dwi, train_adc, train_tmax, train_cbf, train_cbv), dim=1)
            out_recon, my_feat = model(in_imgs)
            print("my_feat", my_feat[0].shape)
But it prints:
my_feat torch.Size([2, 512, 1, 24, 24])
Could someone please tell me why it is 2 (the batch dimension)? Thanks!
Hint: when I run with a test data size of 26 it is OK, but with a data size of 25 the batch gets messed up! Is there something about it being odd or even?!
Here is the 3D U-Net model for 3D reconstruction and segmentation:
class Abstract3DUNet(nn.Module):
    def __init__(self, in_channels, out_channels, final_sigmoid, basic_module, f_maps=64, layer_order='gcr',
                 num_groups=8, num_levels=4, is_segmentation=True, testing=False,
                 conv_kernel_size=3, pool_kernel_size=2, conv_padding=1, **kwargs):
        super(Abstract3DUNet, self).__init__()
        self.testing = testing

        if isinstance(f_maps, int):
            f_maps = number_of_features_per_level(f_maps, num_levels=num_levels)

        assert isinstance(f_maps, list) or isinstance(f_maps, tuple)
        assert len(f_maps) > 1, "Required at least 2 levels in the U-Net"

        # create encoder path
        self.encoders = create_encoders(in_channels, f_maps, basic_module, conv_kernel_size, conv_padding,
                                        layer_order, num_groups, pool_kernel_size)

        # create decoder path
        self.decoders = create_decoders(f_maps, basic_module, conv_kernel_size, conv_padding, layer_order,
                                        num_groups, upsample=True)

        # in the last layer a 1x1 convolution reduces the number of output
        # channels to the number of labels
        self.final_conv = nn.Conv3d(f_maps[0], out_channels, 1)

        if is_segmentation:
            # semantic segmentation problem
            if final_sigmoid:
                self.final_activation = nn.Sigmoid()
            else:
                self.final_activation = nn.Softmax(dim=1)
        else:
            # regression problem
            self.final_activation = None

    def forward(self, x):
        # encoder part
        encoders_features = []
        my_feat = []
        for encoder in self.encoders:
            x = encoder(x)
            # reverse the encoder outputs to be aligned with the decoder
            encoders_features.insert(0, x)

        # remove the last encoder's output from the list
        # !!remember: it's the 1st in the list
        my_feat = encoders_features[0:]
        encoders_features = encoders_features[1:]

        # decoder part
        for decoder, encoder_features in zip(self.decoders, encoders_features):
            # pass the output from the corresponding encoder and the output
            # of the previous decoder
            x = decoder(encoder_features, x)

        x = self.final_conv(x)

        # apply final_activation (i.e. Sigmoid or Softmax) only during prediction. During training the network
        # outputs logits and it's up to the user to normalize it before visualising with tensorboard or
        # computing validation metric
        if self.testing and self.final_activation is not None:
            x = self.final_activation(x)

        return x, my_feat

class UNet3D(Abstract3DUNet):
    def __init__(self, in_channels, out_channels, final_sigmoid=True, f_maps=64, layer_order='gcr',
                 num_groups=8, num_levels=4, is_segmentation=True, conv_padding=1, **kwargs):
        super(UNet3D, self).__init__(in_channels=in_channels,
                                     out_channels=out_channels,
                                     final_sigmoid=final_sigmoid,
                                     basic_module=DoubleConv,
                                     f_maps=f_maps,
                                     layer_order=layer_order,
                                     num_groups=num_groups,
                                     num_levels=num_levels,
                                     is_segmentation=is_segmentation,
                                     conv_padding=conv_padding,
                                     **kwargs)
My train batch size was 3! When I changed it to 2 or 4, the problem was solved!

PyTorch: Target 1 is out of bounds

I am new to deep learning and am wondering how to modify my model to fix this.
It says "Target 1 is out of bounds", so what parameter should I change to make it work? When the output size is changed to 2, it runs. However, the goal of the model is a 2-class classification. Also, when the output size is 2, the training loss becomes nan.
The data is a dataframe of shape (15958, 4) transformed into tensor format.
Sorry, SplitNN is a class:
# SplitNN
# to protect privacy and split
class SplitNN:
    def __init__(self, models, optimizers):
        self.models = models
        self.optimizers = optimizers
        self.data = []
        self.remote_tensors = []

    def forward(self, x):
        data = []
        remote_tensors = []

        data.append(self.models[0](x))
        if data[-1].location == self.models[1].location:
            remote_tensors.append(data[-1].detach().requires_grad_())
        else:
            remote_tensors.append(
                data[-1].detach().move(self.models[1].location).requires_grad_()
            )

        i = 1
        while i < (len(models) - 1):
            data.append(self.models[i](remote_tensors[-1]))
            if data[-1].location == self.models[i + 1].location:
                remote_tensors.append(data[-1].detach().requires_grad_())
            else:
                remote_tensors.append(
                    data[-1].detach().move(self.models[i + 1].location).requires_grad_()
                )
            i += 1

        data.append(self.models[i](remote_tensors[-1]))

        self.data = data
        self.remote_tensors = remote_tensors
        return data[-1]

    def backward(self):
        for i in range(len(models) - 2, -1, -1):
            if self.remote_tensors[i].location == self.data[i].location:
                grads = self.remote_tensors[i].grad.copy()
            else:
                grads = self.remote_tensors[i].grad.copy().move(self.data[i].location)
            self.data[i].backward(grads)

    def zero_grads(self):
        for opt in self.optimizers:
            opt.zero_grad()

    def step(self):
        for opt in self.optimizers:
            opt.step()
Below are the codes:
Model setup: the model is a sequential deep learning model, in which I tried to use nn.Linear to generate a binary prediction.
torch.manual_seed(0)

# Define our model segments
input_size = 3
hidden_sizes = [128, 640]
output_size = 1

# original models
models = [
    nn.Sequential(
        nn.Linear(input_size, hidden_sizes[0]),
        nn.ReLU(),
        nn.Linear(hidden_sizes[0], hidden_sizes[1]),
        nn.ReLU(),
    ),
    nn.Sequential(nn.Linear(hidden_sizes[1], output_size), nn.LogSoftmax(dim=1)),
]

# Create optimisers for each segment and link to them
optimizers = [
    optim.SGD(model.parameters(), lr=0.03,)
    for model in models
]
The train function is here:
def train(x, target, splitNN):
    #1) Zero our grads
    splitNN.zero_grads()

    #2) Make a prediction
    pred = splitNN.forward(x)

    #3) Figure out how much we missed by
    criterion = nn.NLLLoss()
    loss = criterion(pred, target)

    #4) Backprop the loss on the end layer
    loss.backward()

    #5) Feed gradients backward through the network
    splitNN.backward()

    #6) Change the weights
    splitNN.step()

    return loss, pred
Finally, the training part, which is also where the problem happens.
The send function assigns a model to each node, because this is set up to simulate federated learning.
for i in range(epochs):
    running_loss = 0
    correct_preds = 0
    total_preds = 0

    for (data, ids1), (labels, ids2) in dataloader:
        # Train a model
        data = data.send(models[0].location)
        data = data.view(data.shape[0], -1)
        labels = labels.send(models[-1].location)

        # Call model
        loss, preds = train(data.float(), labels, splitNN)

        # Collect statistics
        running_loss += loss.get()
        correct_preds += preds.max(1)[1].eq(labels).sum().get().item()
        total_preds += preds.get().size(0)

    print(f"Epoch {i} - Training loss: {running_loss/len(dataloader):.3f} - Accuracy: {100*correct_preds/total_preds:.3f}")
The error shows that the problem occurs at loss, preds = train(data.float(), labels, splitNN).
The actual error message:
During handling of the above exception, another exception occurred:
IndexError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
1836 .format(input.size(0), target.size(0)))
1837 if dim == 2:
-> 1838 ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
1839 elif dim == 4:
1840 ret = torch._C._nn.nll_loss2d(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
IndexError: Target 1 is out of bounds.
Please help me. Thank you
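For context on the error itself: nn.NLLLoss uses each target value as an index into the class dimension of its input, so with output_size = 1 the model only provides a column for class 0, and any label equal to 1 is out of range. A minimal sketch reproducing this (the shapes are illustrative, not taken from the original code):

import torch
import torch.nn as nn

criterion = nn.NLLLoss()
log_probs = torch.log_softmax(torch.randn(4, 1), dim=1)  # only one class column, as with output_size = 1
labels = torch.tensor([0, 1, 0, 1])                      # label "1" has no matching column
criterion(log_probs, labels)                             # IndexError: Target 1 is out of bounds.

A two-class classifier trained with LogSoftmax + NLLLoss therefore needs two output units; the alternative is a single output unit trained with BCEWithLogitsLoss.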

What does Lambda do in this code (Python Keras)?

def AdaIN(x):
    # Normalize x[0] (image representation)
    mean = K.mean(x[0], axis=[1, 2], keepdims=True)
    std = K.std(x[0], axis=[1, 2], keepdims=True) + 1e-7
    y = (x[0] - mean) / std

    # Reshape scale and bias parameters
    pool_shape = [-1, 1, 1, y.shape[-1]]
    scale = K.reshape(x[1], pool_shape)
    bias = K.reshape(x[2], pool_shape)

    # Multiply by x[1] (GAMMA) and add x[2] (BETA)
    return y * scale + bias

def g_block(input_tensor, latent_vector, filters):
    gamma = Dense(filters, bias_initializer='ones')(latent_vector)
    beta = Dense(filters)(latent_vector)

    out = UpSampling2D()(input_tensor)
    out = Conv2D(filters, 3, padding='same')(out)
    out = Lambda(AdaIN)([out, gamma, beta])
    out = Activation('relu')(out)
    return out
Please see the code above. I am currently studying StyleGAN and trying to convert this code to PyTorch, but I can't seem to understand what Lambda does in g_block. Based on its declaration, AdaIN takes only one input, so how are gamma and beta also used as inputs? Please explain what Lambda does in this code.
Thank you very much.
Lambda layers in Keras are used to call custom functions inside the model. In g_block, Lambda calls the AdaIN function and passes out, gamma, and beta as arguments inside a list. AdaIN receives these 3 tensors encapsulated in a single list as x, and accesses them by indexing into that list (x[0], x[1], x[2]).
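As a toy illustration of that calling convention (not part of the StyleGAN code): when a Lambda layer is called on a list of tensors, the wrapped function receives the whole list as its single argument.

from tensorflow.keras.layers import Input, Lambda

a = Input(shape=(4,))
b = Input(shape=(4,))
# x inside the lambda is the list [a, b], indexed just like in AdaIN
summed = Lambda(lambda x: x[0] + x[1])([a, b])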
Here's the PyTorch equivalent:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AdaIN(nn.Module):
    def forward(self, out, gamma, beta):
        bs, ch = out.size()[:2]
        mean = out.reshape(bs, ch, -1).mean(dim=2).reshape(bs, ch, 1, 1)
        std = out.reshape(bs, ch, -1).std(dim=2).reshape(bs, ch, 1, 1) + 1e-7
        y = (out - mean) / std
        bias = beta.unsqueeze(-1).unsqueeze(-1).expand_as(out)
        scale = gamma.unsqueeze(-1).unsqueeze(-1).expand_as(out)
        return y * scale + bias

class g_block(nn.Module):
    def __init__(self, filters, latent_vector_shape, input_tensor_channels):
        super().__init__()
        self.gamma = nn.Linear(in_features=latent_vector_shape, out_features=filters)
        # Initializes all bias to 1
        self.gamma.bias.data = torch.ones(filters)
        self.beta = nn.Linear(in_features=latent_vector_shape, out_features=filters)
        # calculate appropriate padding
        self.conv = nn.Conv2d(input_tensor_channels, filters, 3, 1, padding=1)  # calc padding
        self.adain = AdaIN()

    def forward(self, input_tensor, latent_vector):
        gamma = self.gamma(latent_vector)
        beta = self.beta(latent_vector)
        # check default interpolation mode in keras and replace mode below if different
        out = F.interpolate(input_tensor, scale_factor=2, mode='nearest')
        out = self.conv(out)
        out = self.adain(out, gamma, beta)
        out = torch.relu(out)
        return out

# Sample:
input_tensor = torch.randn((1, 3, 10, 10))
latent_vector = torch.randn((1, 5))
g = g_block(3, latent_vector.shape[1], input_tensor.shape[1])
out = g(input_tensor, latent_vector)
print(out)
Note: you need to pass latent_vector and input_tensor shapes while creating g_block.

MXNET CNN+LSTM save/serialize to json

I'm having a hard time figuring out how to correctly define an MXNet net so that I can serialize/convert this model to a JSON file.
The pipeline is composed of a CNN + biLSTM + CTC.
I know I must use HybridBlock and hybridize(), but I can't seem to make it work, or tell whether it is even possible, or whether there is another way around it.
I'm sure it's a lack of knowledge on my part and wonder if anyone can help.
Here is the net definition in python:
NUM_HIDDEN = 200
NUM_CLASSES = 13550
NUM_LSTM_LAYER = 1
p_dropout = 0.5
SEQ_LEN = 32

def get_featurizer():
    featurizer = gluon.nn.HybridSequential()
    # conv layer
    featurizer.add(gluon.nn.Conv2D(kernel_size=(3,3), padding=(1,1), channels=32, activation="relu"))
    featurizer.add(gluon.nn.BatchNorm())
    ....
    featurizer.hybridize()
    return featurizer

class EncoderLayer(gluon.Block):
    def __init__(self, **kwargs):
        super(EncoderLayer, self).__init__(**kwargs)
        with self.name_scope():
            self.lstm = mx.gluon.rnn.LSTM(NUM_HIDDEN, NUM_LSTM_LAYER, bidirectional=True)

    def forward(self, x):
        x = x.transpose((0, 3, 1, 2))
        x = x.flatten()
        x = x.split(num_outputs=SEQ_LEN, axis=1)  # (SEQ_LEN, N, CHANNELS)
        x = nd.concat(*[elem.expand_dims(axis=0) for elem in x], dim=0)
        x = self.lstm(x)
        x = x.transpose((1, 0, 2))  # (N, SEQ_LEN, HIDDEN_UNITS)
        return x

def get_encoder():
    encoder = gluon.nn.Sequential()
    encoder.add(EncoderLayer())
    encoder.add(gluon.nn.Dropout(p_dropout))
    return encoder

def get_decoder():
    decoder = mx.gluon.nn.Dense(units=ALPHABET_SIZE, flatten=False)
    decoder.hybridize()
    return decoder

def get_net():
    net = gluon.nn.Sequential()
    with net.name_scope():
        net.add(get_featurizer())
        net.add(get_encoder())
        net.add(get_decoder())
    return net
Any help would be highly appreciated.
Thank you very much.
There are a few requirements for a model in Gluon to be exportable to JSON:
It needs to be hybridizable, meaning each child block should be hybridizable as well, and the model must work in both modes.
All parameters should be initialized. Since Gluon uses deferred parameter initialization, that means you should do a forward pass at least once before you can save the model.
I made some fixes to your code, introducing new constants where needed. The most significant changes are:
Don't use split if you can avoid it, because it returns a list of NDArrays. Use reshape, which works seamlessly with Symbol as well.
Starting from version 1.3.0 of MXNet, LSTM is also hybridizable, so you can wrap it in a HybridBlock instead of just a Block.
Use HybridSequential.
Here is the adjusted code, with an example at the bottom of how to save the model and how to load it back. You can find more information in this tutorial.
import mxnet as mx
from mxnet import gluon
from mxnet import nd

BATCH_SIZE = 1
CHANNELS = 100
ALPHABET_SIZE = 1000
NUM_HIDDEN = 200
NUM_CLASSES = 13550
NUM_LSTM_LAYER = 1
p_dropout = 0.5
SEQ_LEN = 32
HEIGHT = 100
WIDTH = 100

def get_featurizer():
    featurizer = gluon.nn.HybridSequential()
    featurizer.add(
        gluon.nn.Conv2D(kernel_size=(3, 3), padding=(1, 1), channels=32, activation="relu"))
    featurizer.add(gluon.nn.BatchNorm())
    return featurizer

class EncoderLayer(gluon.HybridBlock):
    def __init__(self, **kwargs):
        super(EncoderLayer, self).__init__(**kwargs)
        with self.name_scope():
            self.lstm = mx.gluon.rnn.LSTM(NUM_HIDDEN, NUM_LSTM_LAYER, bidirectional=True)

    def hybrid_forward(self, F, x):
        x = x.transpose((0, 3, 1, 2))
        x = x.flatten()
        x = x.reshape(shape=(SEQ_LEN, -1, CHANNELS))  # was: x.split(num_outputs=SEQ_LEN, axis=1) -> (SEQ_LEN, N, CHANNELS)
        x = self.lstm(x)
        x = x.transpose((1, 0, 2))  # (N, SEQ_LEN, HIDDEN_UNITS)
        return x

def get_encoder():
    encoder = gluon.nn.HybridSequential()
    encoder.add(EncoderLayer())
    encoder.add(gluon.nn.Dropout(p_dropout))
    return encoder

def get_decoder():
    decoder = mx.gluon.nn.Dense(units=ALPHABET_SIZE, flatten=False)
    return decoder

def get_net():
    net = gluon.nn.HybridSequential()
    with net.name_scope():
        net.add(get_featurizer())
        net.add(get_encoder())
        net.add(get_decoder())
    return net

if __name__ == '__main__':
    net = get_net()
    net.initialize()
    net.hybridize()

    fake_data = mx.random.uniform(shape=(BATCH_SIZE, HEIGHT, WIDTH, CHANNELS))
    out = net(fake_data)

    net.export("mymodel")

    deserialized_net = gluon.nn.SymbolBlock.imports("mymodel-symbol.json", ['data'],
                                                    "mymodel-0000.params", ctx=mx.cpu())
    out2 = deserialized_net(fake_data)

    # just to check that we get the same results
    assert (out - out2).sum().asscalar() == 0