convert pytorch model with multiple networks to onnx - deep-learning

I am trying to convert pytorch model with multiple networks to ONNX, and encounter some problem.
The git repo: https://github.com/InterDigitalInc/HRFAE
The Trainer Class:
class Trainer(nn.Module):
def __init__(self, config):
super(Trainer, self).__init__()
# Load Hyperparameters
self.config = config
# Networks
self.enc = Encoder()
self.dec = Decoder()
self.mlp_style = Mod_Net()
self.dis = Dis_PatchGAN()
...
Here is how the trained model process image:
def gen_encode(self, x_a, age_a, age_b=0, training=False, target_age=0):
if target_age:
self.target_age = target_age
age_modif = self.target_age*torch.ones(age_a.size()).type_as(age_a)
else:
age_modif = self.random_age(age_a, diff_val=25)
# Generate modified image
self.content_code_a, skip_1, skip_2 = self.enc(x_a)
style_params_a = self.mlp_style(age_a)
style_params_b = self.mlp_style(age_modif)
x_a_recon = self.dec(self.content_code_a, style_params_a, skip_1, skip_2)
x_a_modif = self.dec(self.content_code_a, style_params_b, skip_1, skip_2)
return x_a_recon, x_a_modif, age_modif
And as following is how I did to convert to onnx:
enc = Encoder()
dec = Decoder()
mlp = Mod_Net()
layers = [enc, mlp, dec]
model = torch.nn.Sequential(*layers)
# here is my confusion: how do I specify the inputs of each layer??
# E.g. one of the outputs of 'enc' layer should be input of 'mlp' layer,
# or the outputs of 'enc' layer should be part of inputs of 'dec' layer...
params = torch.load('./logs/001/checkpoint')
model[0].load_state_dict(params['enc_state_dict'])
model[1].load_state_dict(params['mlp_style_state_dict'])
model[2].load_state_dict(params['dec_state_dict'])
torch.onnx.export(model, torch.randn([1, 3, 1024, 1024]), 'trained_hrfae.onnx', do_constant_folding=True)
Maybe the convert-part code is in wrong way??
Could anyone help, many thanks!
#20210629-11:52GMT Edit:
I found there's constraint of using torch.nn.Sequential. The output of former layer in Sequential should be consistent with latter input.
So my code shouldn't work at all because the output of 'enc' layer is not consistent with input of 'mlp' layer.
Could anyone help how to convert this type of pytorch model to onnx? Many thanks, again :)

After research and try, I found a method which maybe in correct way:
Convert each net(Encoder, Mod_Net, Decoder) to onnx model, and handle their input/output in latter logic-process or any further procedure (e.g convert to tflite model).
I'm trying to port onto Android using this method.
#Edit 20210705-03:52GMT#
Another approach may be better: write a new net combines the three nets. I've prove the output is same as origin pytorch model.
class HRFAE(nn.Module):
def __init__(self):
super(HRFAE, self).__init__()
self.enc = Encoder()
self.mlp_style = Mod_Net()
self.dec = Decoder()
def forward(self, x, age_modif):
content_code_a, skip_1, skip_2 = self.enc(x)
style_params_b = self.mlp_style(age_modif)
x_a_modif = self.dec(content_code_a, style_params_b, skip_1, skip_2)
return x_a_modif
and then convert use following:
net = HRFAE()
params = torch.load('./logs/002/checkpoint')
net.enc.load_state_dict(params['enc_state_dict'])
net.mlp_style.load_state_dict(params['mlp_style_state_dict'])
net.dec.load_state_dict(params['dec_state_dict'])
net.eval()
torch.onnx.export(net, (torch.randn([1, 3, 512, 512]), torch.randn([1]).type(torch.long)), 'test_hrfae.onnx')
This should be the answer.

Related

NMT , 'KerasTensor' object is not callable'

Here I share a code snippet for training Encoder_Decoder Model for machine translation. While Using the Embedding layer (trained previously) during inference mode( on test_data) . It threw the following error --->
# Encoder
encoder_inputs = Input(shape=(None ,))
enc_emb = Embedding(eng_vocab_size, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb = Embedding(deu_vocab_size, latent_dim, mask_zero = True)(decoder_inputs)
# decoder return full output sequences, and internal states as well.
# We don't use the return states in the training model,
# but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
initial_state=encoder_states)
decoder_dense = Dense(deu_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Compile the model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)
# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
dec_emb2= dec_emb(decoder_inputs) # reusing embedding layer
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs) # reusing lstm layer
decoder_outputs2 = decoder_dense(decoder_outputs2) # softmax_layer to generate prob_dist. over target vocab
# Final decoder model
decoder_model = Model(
[decoder_inputs] + decoder_states_inputs,
[decoder_outputs2] )
ERROR
8 decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
---> 10 dec_emb2= dec_emb(decoder_inputs) # reusing embedding layer
11
12 decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs) # reusing lstm layer
TypeError: 'KerasTensor' object is not callableenter image description here
I read through various solutions available for this issue , but couldn't understand what 2 modes of model they were talking about and what their soltion was effectively doing .
Pls explain in detail. Thanks in advance

Why is my Transformer implementation losing to a BiLSTM?

I am dealing with a sequence tagging problem and I am using a single Transformer Encoder to obtain logits from each element of the sequence. Having experimented both with Transformer and BiLSTM it looks like in my case BiLSTM is working better, so I was wondering if maybe it is because my Transformer implementation has some problem... Below is my implementation of the Transformer Encoder and related functions for creating padding mask and positional embeddings:
def create_mask(src, lengths):
"""Create a mask hiding future tokens
Parameters:
src (tensor): the source tensor having shape [batch_size, number_of_steps, features_dimensions]
length (list): a list of integers representing the length (i.e. number_of_steps) of each sample in the batch."""
mask = []
max_len = src.shape[1]
for index, i in enumerate(src):
# The mask consists in tensors having false at the step number that doesn't need to be hidden and true otherwise
mask.append([False if (i+1)>lengths[index] else True for i in range(max_len)])
return torch.tensor(mask)
class PositionalEncoding(nn.Module):
def __init__(self, d_model, dropout=0.1, max_len=5000, device = 'cpu'):
super().__init__()
self.dropout = nn.Dropout(p=dropout)
self.device = device
position = torch.arange(max_len).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
pe = torch.zeros(1, max_len, d_model)
pe[0, :, 0::2] = torch.sin(position * div_term)
pe[0, :, 1::2] = torch.cos(position * div_term)
self.register_buffer('pe', pe)
def forward(self, x):
x = x + self.pe[:, :x.size(1), :].to(self.device)
return self.dropout(x)
class Transformer(nn.Module):
"""Class implementing transformer ecnoder, partially based on
https://pytorch.org/tutorials/beginner/transformer_tutorial.html"""
def __init__(self, in_dim, h_dim, n_heads, n_layers, dropout=0.2, drop_out = 0.0, batch_first = True, device = 'cpu', positional_encoding = True):
super(Transformer, self).__init__()
self.model_type = 'Transformer'
self.pos_encoder = PositionalEncoding(in_dim, dropout, device = device)
encoder_layers = nn.TransformerEncoderLayer(in_dim, n_heads, h_dim, dropout)
self.transformer_encoder = nn.TransformerEncoder(encoder_layers, n_layers, norm=nn.LayerNorm(in_dim))
self.in_dim = in_dim
self.drop_out = drop_out
self.positional_encoding = positional_encoding
def forward(self, src, mask = None, line_len=None):
src = src * math.sqrt(self.in_dim)
if self.positional_encoding:
src = self.pos_encoder(src)
if line_len is not None and mask is None:
mask = create_mask(src, line_len)
else:
mask = None
output = self.transformer_encoder(src, src_key_padding_mask = mask)
if self.drop_out:
output = F.dropout(output, p = self.drop_out)
return src, output
As it can be seen, the above network outputs the hidden states and then I pass them into an additional linear layer and train with a CrossEntropy loss over two classes and Adam optimizer. I have tried multiple combinations of hyperparameters but the BiLSTM still performs better. Can anyone spot anything off in my Transformer or suggest why I experience such a counterintuitive result?
This may be surprising, but Transformers don't always beat LSTMs. For example, Language Models with Transformers states:
Transformer architectures are suboptimal for language model itself. Neither self-attention nor the positional encoding in the Transformer is able to efficiently incorporate the word-level sequential context crucial to language modeling.
If you run the Transformer tutorial code itself (on which your code is based), you'll also see LSTM do better there. See this thread on stats.SE for more discussion on this topic (disclaimer: both the question and the answer there are mine)

CNN + RNN architecture for video recognition

I am trying to replicate the ConvNet + LSTM approach presented in this paper using pytorch. But I am struggling to find the correct way to combine the CNN and the LSTM in my model. Here is my attempt :
class VideoRNN(nn.Module):
def __init__(self, hidden_size, n_classes):
super(VideoRNN, self).__init__()
self.hidden_size = hidden_size
vgg = models.vgg16(pretrained=True)
embed = nn.Sequential(*list(vgg.classifier.children())[:-1])
vgg.classifier = embed
for param in vgg.parameters():
param.requires_grad = False
self.embedding = vgg
self.GRU = nn.GRU(4096, hidden_size)
def forward(self, input, hidden=None):
embedded = self.embedding(input)
output, hidden = self.gru(output, hidden)
output = self.classifier(output.view(-1, 4096))
return output, hidden
As my videos have variable length, I provide a PackedSequence as an input. It is created from a Tensor with shape (M,B,C,H,W) where M is the maximum sequence length and B the batch size. The C,H,W are the channels, height and width of each frame.
I want the pre-trained CNN to be part of the model as I may later unfreeze some layer to finetune the CNN for my task. That's why I didn't compute the embedding of the images separately.
My questions are then the following :
Is the shape of my input data correct in order to handle batches of videos in my context or should I use something else than a PackedSequence?
In my forward function, how can I handle the batch of sequences of images with my VGG and my GRU unit ? I cannot feed directly the PackedSequence as an input to my VGG so how can I proceed?
Does this approach seem to respect the "pytorch way of doing things" or should is my approach flawed?
I finally found the solution to make it works. Here is a simplified yet complete example of how I managed to create a VideoRNN able to use packedSequence as an input :
class VideoRNN(nn.Module):
def __init__(self, n_classes, batch_size, device):
super(VideoRNN, self).__init__()
self.batch = batch_size
self.device = device
# Loading a VGG16
vgg = models.vgg16(pretrained=True)
# Removing last layer of vgg 16
embed = nn.Sequential(*list(vgg.classifier.children())[:-1])
vgg.classifier = embed
# Freezing the model 3 last layers
for param in vgg.parameters():
param.requires_grad = False
self.embedding = vgg
self.gru = nn.LSTM(4096, 2048, bidirectional=True)
# Classification layer (*2 because bidirectionnal)
self.classifier = nn.Sequential(
nn.Linear(2048 * 2, 256),
nn.ReLU(),
nn.Linear(256, n_classes),
)
def forward(self, input):
hidden = torch.zeros(2, self.batch , 2048).to(
self.device
)
c_0 = torch.zeros(self.num_layer * 2, self.batch, 2048).to(
self.device
)
embedded = self.simple_elementwise_apply(self.embedding, input)
output, hidden = self.gru(embedded, (hidden, c_0))
hidden = hidden[0].view(-1, 2048 * 2)
output = self.classifier(hidden)
return output
def simple_elementwise_apply(self, fn, packed_sequence):
return torch.nn.utils.rnn.PackedSequence(
fn(packed_sequence.data), packed_sequence.batch_sizes
)
the key is the simple_elementwise_apply methods allowing to feed the PackedSequence in the CNN networks and to retrieve a new PackedSequence made of embedding as an output.
I hope you'll find it useful.

How to test the correctness of a Keras custom layer?

After creating a Keras custom layer with training weight, how can one test the correctness of the code? It does not seem to be described in Keras' manual.
For example, to test the expected behavior of a function, one can write a unit test. How can we do this for a Keras custom layer?
You can still do something like unit test by getting the output of the custom layer for the given input and verifying it against the manually calculated output,
Let's say your custom layer Custom takes (None, 3, 200) as input shape and returns (None, 3)
from keras.layers import Input
from keras.models import Model
inp = Input(shape=(3, 200))
out = Custom()(inp)
model = Model(inp, out)
output = model.predict(your_input)
You can verify the layer output output with your expected output for a known input your_input.
layer_test in keras utils.
https://github.com/keras-team/keras/blob/master/keras/utils/test_utils.py
They provide following code, which tests the shape, the actual result, serializing and training:
def layer_test(layer_cls, kwargs={}, input_shape=None, input_dtype=None,
input_data=None, expected_output=None,
expected_output_dtype=None, fixed_batch_size=False):
"""Test routine for a layer with a single input tensor
and single output tensor.
"""
# generate input data
if input_data is None:
assert input_shape
if not input_dtype:
input_dtype = K.floatx()
input_data_shape = list(input_shape)
for i, e in enumerate(input_data_shape):
if e is None:
input_data_shape[i] = np.random.randint(1, 4)
input_data = (10 * np.random.random(input_data_shape))
input_data = input_data.astype(input_dtype)
else:
if input_shape is None:
input_shape = input_data.shape
if input_dtype is None:
input_dtype = input_data.dtype
if expected_output_dtype is None:
expected_output_dtype = input_dtype
# instantiation
layer = layer_cls(**kwargs)
# test get_weights , set_weights at layer level
weights = layer.get_weights()
layer.set_weights(weights)
expected_output_shape = layer.compute_output_shape(input_shape)
# test in functional API
if fixed_batch_size:
x = Input(batch_shape=input_shape, dtype=input_dtype)
else:
x = Input(shape=input_shape[1:], dtype=input_dtype)
y = layer(x)
assert K.dtype(y) == expected_output_dtype
# check with the functional API
model = Model(x, y)
actual_output = model.predict(input_data)
actual_output_shape = actual_output.shape
for expected_dim, actual_dim in zip(expected_output_shape,
actual_output_shape):
if expected_dim is not None:
assert expected_dim == actual_dim
if expected_output is not None:
assert_allclose(actual_output, expected_output, rtol=1e-3)
# test serialization, weight setting at model level
model_config = model.get_config()
recovered_model = model.__class__.from_config(model_config)
if model.weights:
weights = model.get_weights()
recovered_model.set_weights(weights)
_output = recovered_model.predict(input_data)
assert_allclose(_output, actual_output, rtol=1e-3)
# test training mode (e.g. useful when the layer has a
# different behavior at training and testing time).
if has_arg(layer.call, 'training'):
model.compile('rmsprop', 'mse')
model.train_on_batch(input_data, actual_output)
# test instantiation from layer config
layer_config = layer.get_config()
layer_config['batch_input_shape'] = input_shape
layer = layer.__class__.from_config(layer_config)
# for further checks in the caller function
return actual_output

Using a metamodel in a design process using a nested approach

We are interested in using a surrogate model in an aircraft design process implemented in OpenMDAO. Basically we want to use an aerodynamic code (such as VSPaero in our aim) to produce a database (using a DOE ) and then built a surrogate that will be used in the design process. It looks like your proposal 2) in use of MOE in openMDAO and we also want to access to the "gradient" information of the surrogate to be used in the full design problem .
We started from the code you have provided in nested problem question and try to built a mock up case with simplified component for aerodynamic . The example code is below (using kriging) and we have two concerns to finish it:
we need to implement a "linearize" function in our component if we want to use surrogate gradient information: I guess we should use the "calc_gradient" function of problem to do this . Is it right ?
in our example code, the training will be done each time we call the component what is not very efficient : is there a way to call it only once or to do the surrogate training only after the setup() of the bigger problem (aircraft design in our case )?
Here is the code (sorry it is a bit long):
from openmdao.api import IndepVarComp, Group, Problem, ScipyOptimizer, ExecComp, DumpRecorder, Component, NLGaussSeidel,ScipyGMRES, Newton,SqliteRecorder,MetaModel, \
KrigingSurrogate, FloatKrigingSurrogate
from openmdao.drivers.latinhypercube_driver import LatinHypercubeDriver, OptimizedLatinHypercubeDriver
from openmdao.solvers.solver_base import NonLinearSolver
import numpy as np
import sys
alpha_test = np.array([0.56, 0.24, 0.30, 0.32, 0.20])
eta_test = np.array([-0.30, -0.14, -0.19, -0.18, -0.12])
num_elem = len(alpha_test)
class SysAeroSurrogate(Component):
""" Simulates the presence of an aero surrogate mode using linear aerodynamic model """
""" coming from pymission code """
""" https://github.com/OpenMDAO-Plugins/pyMission/blob/master/src/pyMission/aerodynamics.py """
def __init__(self, num_elem=1):
super(SysAeroSurrogate, self).__init__()
self.add_param('alpha', 0.5)
self.add_param('eta', -0.33)
self.add_param('AR', 0.0)
self.add_param('oswald', 0.0)
self.add_output('CL', val=0.0)
self.add_output('CD', val=0.0) ## Drag Coefficient
def solve_nonlinear(self, params, unknowns, resids):
""" Compute lift and drag coefficient using angle of attack and tail
rotation angles. Linear aerodynamics is assumed."""
alpha = params['alpha']
eta = params['eta']
aspect_ratio = params['AR']
oswald = params['oswald']
lift_c0 = 0.30
lift_ca = 6.00
lift_ce = 0.27
drag_c0 = 0.015
unknowns['CL'] = lift_c0 + lift_ca*alpha*1e-1 + lift_ce*eta*1e-1
unknowns['CD'] = (drag_c0 + (unknowns['CL'])**2 /(np.pi * aspect_ratio * oswald))/1e-1
class SuroMM(Group):
def __init__(self):
super(SuroMM, self).__init__()
#kriging
AeroMM = self.add("AeroMM", MetaModel())
AeroMM.add_param('alpha', val=0.)
AeroMM.add_param('eta', val=0.)
AeroMM.add_output('CL_MM', val=0., surrogate=FloatKrigingSurrogate())
AeroMM.add_output('CD_MM', val=0., surrogate=FloatKrigingSurrogate())
class SurrogateAero(Component):
def __init__(self):
super(SurrogateAero, self).__init__()
## Inputs to this subprob
self.add_param('alpha', val=0.5*np.ones(num_elem)) ## Angle of attack
self.add_param('eta', val=0.5*np.ones(num_elem)) ## Tail rotation angle
self.add_param('AR', 0.0)
self.add_param('oswald', 0.0)
## Unknowns for this sub prob
self.add_output('CD', val=np.zeros(num_elem))
self.add_output('CL', val=np.zeros(num_elem))
#####
self.problem = prob = Problem()
prob.root = Group()
prob.root.add('d1', SuroMM(), promotes=['*'])
prob.setup()
#### training of metamodel
prob['AeroMM.train:alpha'] = DOEX1
prob['AeroMM.train:eta'] = DOEX2
prob['AeroMM.train:CL_MM'] = DOEY1
prob['AeroMM.train:CD_MM'] =DOEY2
def solve_nonlinear(self, params, unknowns, resids):
CL_temp=np.zeros(num_elem)
CD_temp=np.zeros(num_elem)
prob = self.problem
# Pass values into our problem
for i in range(len(params['alpha'])):
prob['AeroMM.alpha'] = params['alpha'][i]
prob['AeroMM.eta'] = params['eta'][i]
# Run problem
prob.run()
CL_temp[i] = prob['AeroMM.CL_MM']
CD_temp[i] = prob['AeroMM.CD_MM']
# Pull values from problem
unknowns['CL'] = CL_temp
unknowns['CD'] = CD_temp
if __name__ == "__main__":
###### creation of database with DOE #####
top = Problem()
root = top.root = Group()
root.add('comp', SysAeroSurrogate(), promotes=['*'])
root.add('p1', IndepVarComp('alpha', val=0.50), promotes=['*'])
root.add('p2', IndepVarComp('eta',val=0.50), promotes=['*'])
root.add('p3', IndepVarComp('AR', 10.), promotes=['*'])
root.add('p4', IndepVarComp('oswald', 0.92), promotes=['*'])
top.driver = OptimizedLatinHypercubeDriver(num_samples=16, seed=0, population=20, generations=4, norm_method=2)
top.driver.add_desvar('alpha', lower=-5.0*(np.pi/180.0)*1e-1, upper=15.0*(np.pi/180.0)*1e-1)
top.driver.add_desvar('eta', lower=-5.0*(np.pi/180.0)*1e-1, upper=15.0*(np.pi/180.0)*1e-1)
top.driver.add_objective('CD')
recorder = SqliteRecorder('Aero')
recorder.options['record_params'] = True
recorder.options['record_unknowns'] = True
recorder.options['record_resids'] = False
recorder.options['record_metadata'] = False
top.driver.add_recorder(recorder)
top.setup()
top.run()
import sqlitedict
db = sqlitedict.SqliteDict( 'Aero', 'openmdao' )
print( list( db.keys() ) )
DOEX1 = []
DOEX2 = []
DOEY1 = []
DOEY2 = []
for i in list(db.keys()):
data = db[i]
p = data['Parameters']
DOEX1.append(p['comp.alpha'])
DOEX2.append(p['comp.eta'])
p = data['Unknowns']
DOEY1.append(p['CL'])
DOEY2.append(p['CD'])
################ use of surrogate model ######
prob2 = Problem(root=Group())
prob2.root.add('SurrAero', SurrogateAero(), promotes=['*'])
prob2.root.add('v1', IndepVarComp('alpha', val=alpha_test), promotes=['*'])
prob2.root.add('v2', IndepVarComp('eta',val=eta_test), promotes=['*'])
prob2.setup()
prob2.run()
print'CL predicted:', prob2['CL']
print'CD predicted:', prob2['CD']
The way you have your model set up seems correct. The MetaModel component will only train its data one time (the first pass through the model), as you can see in this part of the source code. Every subsequent iteration, it just uses the trained surrogate thats already there.
The meta-model is also already setup to provide analytic derivatives of the predicted output with respect to the input independent variables. Derivatives of the prediction with respect to the training point values are not available in the base implementation. That requires a more complex setup that, at least for the moment, will require some custom setup that is not in the standard library.