Calculate output size in CNN

What will be the output size of each layer in the following model?
'''
model = Sequential()
model.add(Conv2D(32, (8, 8), padding='same', strides=(4, 4), input_shape=(80,80,4)))
model.add(Activation('relu'))
model.add(Conv2D(64, (4, 4), padding='same', strides=(2, 2)))
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 3), padding='same', strides=(1, 1)))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dense(2))
'''

TL;DR: The output shape of each layer is (20, 20, 32), (10, 10, 64), (10, 10, 64), 6400 after Flatten, 512, and 2 (with padding='same', the spatial size is the input size divided by the stride, rounded up). If by "output size" you mean the number of parameters, here is that, plus the memory size of the model and the memory required to train:
-------------------------------------
Layer                   # of Params
=====================================
Conv2D                  8224
-------------------------------------
Conv2D                  32832
-------------------------------------
Conv2D                  36928
-------------------------------------
Dense                   3277312
-------------------------------------
Dense                   1026
=====================================
Total # of Params: 3356322
Model Size: 12.80 MiB
Memory Required to Train: 412.90 MiB
-------------------------------------
and here are the relevant equations to compute each:
Model parameters:
2D Conv layer:
    nw = kw * kh * d_prev * d
    nb = d
    n  = nw + nb
Dense layer:
    nw = ni * no
    nb = no
    n  = nw + nb
Training (stored activations):
2D Conv layer:
    nt = wo * ho * d
Dense layer:
    nt = no
Legend:
    nw     = number of weight parameters
    nb     = number of bias parameters
    n      = total number of parameters
    nt     = number of training (activation) values per layer
    kw, kh = kernel width and height
    d      = depth (number of filters) of the current layer
    d_prev = depth of the previous layer
    wo, ho = output width and height of the current layer
    ni     = number of inputs
    no     = number of outputs
For example, the first Conv2D layer has nw = 8 * 8 * 4 * 32 = 8192 weights and nb = 32 biases, so n = 8224; note that strides change the output size, not the parameter count.
For a batch size of 1, that comes to about 12.90 MiB to train; for a batch size of 32, about 412.90 MiB.
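If you have TensorFlow/Keras installed, you can confirm both the per-layer output shapes and the parameter counts directly; here is a minimal sketch (the tensorflow.keras import path is an assumption, adjust it to your installation):
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Activation, Flatten, Dense

model = Sequential()
model.add(Conv2D(32, (8, 8), padding='same', strides=(4, 4), input_shape=(80, 80, 4)))
model.add(Activation('relu'))
model.add(Conv2D(64, (4, 4), padding='same', strides=(2, 2)))
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 3), padding='same', strides=(1, 1)))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dense(2))

# prints each layer's output shape and parameter count
model.summary()

# or inspect the layers programmatically
for layer in model.layers:
    print(layer.name, layer.output_shape, layer.count_params())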
A great resource is Memory Usage Computational Considerations, by Kevin McGuinness. You can watch his presentation on YouTube here. He links to the slides in the info section of the YouTube post, but they are here for your reference (look for D2L1, Day 2, Lecture 1).
It depends on four things:
The size of your data types,
The number of parameters in your model,
The number of parameters required to store training outputs, and
Your batch size
By default, TensorFlow uses 32-bit floating-point data types (these are 4 bytes in size, since there are 8 bits to a byte).
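You can check the byte size of the data type yourself; a small sketch using NumPy (the TensorFlow lines assume TensorFlow is installed):
import numpy as np

print(np.dtype('float32').itemsize)  # 4 bytes per 32-bit float

# with TensorFlow:
# import tensorflow as tf
# print(tf.float32.size)  # 4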
Here's the code I wrote to calculate it. The parameter counts match what Keras reports in model.summary(), and it also includes the memory requirements.
Source Code:
def model_params_Conv2D(d_n, k_w, k_h, d_n_prev):
    """Calculate the number of model parameters in a 2D convolution layer

    Note: strides change the output size, not the number of parameters.

    Args:
        d_n (int): Depth of current layer
        k_w (int): Kernel width
        k_h (int): Kernel height
        d_n_prev (int): Depth of previous layer

    Returns:
        int: Number of layer parameters
    """
    n_w = d_n * k_w * k_h * d_n_prev  # Number of weight parameters
    n_b = d_n  # Number of bias parameters
    return n_w + n_b


def model_params_Dense(n_o, n_i):
    """Calculate the number of model parameters in a dense layer

    Args:
        n_o (int): Number of outputs
        n_i (int): Number of inputs

    Returns:
        int: Number of layer parameters
    """
    n_w = n_o * n_i  # Number of weight parameters
    n_b = n_o  # Number of bias parameters
    return n_w + n_b


def training_params_Conv2D(w_o, h_o, d_l):
    """Calculate the number of training (activation) values in a 2D convolution layer

    Args:
        w_o (int): Output width of the layer
        h_o (int): Output height of the layer
        d_l (int): Layer depth
    """
    return w_o * h_o * d_l


def training_params_Dense(n_o):
    """Calculate the number of training (activation) values in a dense layer

    Args:
        n_o (int): Number of outputs
    """
    return n_o


def memory_requirement(n_p, m_dt=4):
    """Size of a set of parameters in bytes

    Args:
        n_p (int): Number of parameters
        m_dt (int): Memory size of the data type in bytes

    Returns:
        int: Memory consumption in bytes
    """
    return n_p * m_dt


def SI2ibi(mem):
    """Convert a byte count into binary (IEC) units

    Computers use powers of 2, so memory is reported in kibi/mebi/gibibytes,
    while SI prefixes are powers of 1000:
        kibi (KiB) = (2^10)^1, kilo (KB) = (10^3)^1  (1 KiB = 1.024 KB)
        mebi (MiB) = (2^10)^2, mega (MB) = (10^3)^2  (1 MiB = 1.048576 MB)
        gibi (GiB) = (2^10)^3, giga (GB) = (10^3)^3  (1 GiB = 1.073741824 GB)

    Args:
        mem (int): Memory size in bytes
    """
    KB = 10 ** 3
    MB = KB ** 2
    GB = KB ** 3
    KB2KiB = 1 / 1.024
    MB2MiB = 1 / 1.048576
    GB2GiB = 1 / 1.073741824
    if mem >= GB:
        mem = mem / GB * GB2GiB
        units = "GiB"
    elif mem >= MB:
        mem = mem / MB * MB2MiB
        units = "MiB"
    else:  # mem >= KB
        mem = mem / KB * KB2KiB
        units = "KiB"
    return mem, units
if __name__ == "__main__":
    # NOTE: Activation layers don't require any parameters. Use the depth of
    # the input as d_n_prev of the first layer.
    input_shape = (80, 80, 4)
    w_i = input_shape[0]  # Input width
    h_i = input_shape[1]  # Input height
    d_i = input_shape[2]  # Input depth

    # Output spatial size of each conv layer: with padding='same' it is the
    # input size divided by the stride (rounded up; everything divides evenly here)
    w_1, h_1 = w_i // 4, h_i // 4  # 20 x 20 after the stride-4 conv
    w_2, h_2 = w_1 // 2, h_1 // 2  # 10 x 10 after the stride-2 conv
    w_3, h_3 = w_2, h_2            # 10 x 10 after the stride-1 conv

    conv01_params = model_params_Conv2D(d_n=32, k_w=8, k_h=8, d_n_prev=d_i)
    conv02_params = model_params_Conv2D(d_n=64, k_w=4, k_h=4, d_n_prev=32)
    conv03_params = model_params_Conv2D(d_n=64, k_w=3, k_h=3, d_n_prev=64)
    # The first Dense layer sees the flattened output of the last conv layer
    dense01_params = model_params_Dense(n_i=w_3 * h_3 * 64, n_o=512)
    dense02_params = model_params_Dense(n_i=512, n_o=2)
    num_model_params = (
        conv01_params + conv02_params + conv03_params + dense01_params + dense02_params
    )

    header_ = "Layer\t\t\t# of Params"
    len_header_ = len(repr(header_.expandtabs()))
    bar_eq_ = "=" * len_header_
    bar_dash_ = "-" * len_header_

    num_training_params = training_params_Conv2D(w_1, h_1, 32)
    num_training_params += training_params_Conv2D(w_2, h_2, 64)
    num_training_params += training_params_Conv2D(w_3, h_3, 64)
    num_training_params += training_params_Dense(512)
    num_training_params += training_params_Dense(2)

    model_memory = memory_requirement(num_model_params)
    training_memory = memory_requirement(num_training_params)
    total_memory = model_memory + training_memory
    batch_size = 32
    mem, units = SI2ibi(model_memory)
    mem32, units32 = SI2ibi(total_memory * batch_size)

    print(f"{bar_dash_}")
    print(f"{header_}")
    print(f"{bar_eq_}")
    print(f"Conv2D\t\t\t{conv01_params}")
    print(f"{bar_dash_}")
    print(f"Conv2D\t\t\t{conv02_params}")
    print(f"{bar_dash_}")
    print(f"Conv2D\t\t\t{conv03_params}")
    print(f"{bar_dash_}")
    print(f"Dense\t\t\t{dense01_params}")
    print(f"{bar_dash_}")
    print(f"Dense\t\t\t{dense02_params}")
    print(f"{bar_eq_}")
    print(f"Total # of Params: {num_model_params}")
    print(f"Model Size: {mem:.2f} {units}")
    print(f"Memory Required to Train: {mem32:.2f} {units32}")
    print(f"{bar_dash_}")

Related

Iterative loss function Autoencoders

I am trying to implement a custom loss function in a Pytorch Autoencoder.
The loss function tries to maximize the cosine similarity between a given output tensor U (a vector) and 100 random vectors J where both U and J have the same dimension of [300]. This is repeated for each batch.
Suppose we have 30 items per batch, then the output tensor is
train_Y.shape = [30,300]
Random_vectors.shape = [30,100,300]
I can implement the loss function in two ways:
All_Y = []
for Y, z_r in zip(train_y, random_vectors):
    Y_cosine_list = []
    for z in z_r:
        cosi = torch.dot(Y, z) / (torch.norm(Y) * torch.norm(z))
        Y_cosine_list.append(cosi)
    All_Y.append(Y_cosine_list)

All_Y = torch.tensor(All_Y).to(device)
train_loss = torch.sum(torch.abs(All_Y)) / dim_0
train_loss = torch.tensor(train_loss.data, requires_grad=True)
or
train_Y = torch.zeros([dim_0, 100])
for i, (Y, z_r) in enumerate(zip(train_Y, random_vectors)):
    for j, z in enumerate(z_r):
        train_Y[i, j] = cos(Y, z)

train_Y = train_Y.to(device)
train_loss = torch.sum(torch.abs(train_Y)) / dim_0
The second one is more elegant and to the point. However, it gives a "CUDA illegal memory access" error. I have checked that the memory is not exceeded in either case. Is there anything wrong with the second implementation?
The first implementation is inelegant and I am not sure that it makes sense from a neural net optimization perspective, but it does not give errors and I am able to complete training for all the epochs.
PS: I have tried encapsulating this code block in a loss_fn method, but I get the same illegal memory access error.
I have tried everything I could find for the illegal memory access error (changing GPUs, removing a torch.stack block, etc.), but I can't seem to get rid of the problem.
Here is a vectorized way to do it:
import torch
import torch.nn as nn


class CosineLoss(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x, y):
        """
        Args:
            x (torch.tensor): [batchsize, N, M] - tensor.
            y (torch.tensor): [batchsize, M] - tensor.

        Returns:
            torch.tensor: scalar mean cosine loss
        """
        # dot product along dimension 'm', i.e. multiply and sum along 'm'
        dotp = torch.einsum("bm, bnm -> bn", y, x)
        # L2 norm along dimension 'm' and multiply by broadcasting
        length = torch.norm(y, dim=-1)[:, None] * torch.norm(x, dim=-1)
        # cosine = dot product of unit vectors
        cos = dotp / length
        return cos.mean()


def test():
    b, n, m = 30, 100, 300
    train_Y = torch.randn(b, m, device='cuda')
    random_vectors = torch.randn(b, n, m, requires_grad=True, device='cuda')
    print(f'{random_vectors.grad = }')
    cosineloss = CosineLoss()
    loss = cosineloss(random_vectors, train_Y)
    print(f'{loss = }')
    loss.backward()
    print(f'{random_vectors.grad.shape = }')
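For reference, here is a sketch of the same computation using torch.nn.functional.cosine_similarity; this relies on broadcasting support in recent PyTorch versions and is not part of the answer above:
import torch
import torch.nn.functional as F

y = torch.randn(30, 300)                          # output tensors U (train_Y)
x = torch.randn(30, 100, 300, requires_grad=True) # random vectors J

# y.unsqueeze(1) has shape [30, 1, 300] and broadcasts against x along dim 1
cos = F.cosine_similarity(x, y.unsqueeze(1), dim=-1)  # shape [30, 100]
loss = cos.mean()
loss.backward()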
References:
einsum
broadcasting

My code does not run on multiple GPUs in PyTorch

I used Faster R-CNN from PyTorch to train on a dataset. It works well with one GPU, but I have access to a system with 4 GPUs and want to use all of them. However, when I check GPU usage, only one GPU is used.
I select the device in this manner:
if torch.cuda.is_available() == False and device_name == 'gpu':
    raise ValueError('GPU is not available!')
elif device_name == 'cpu':
    device = torch.device('cpu')
elif device_name == 'gpu':
    if batch_size % torch.cuda.device_count() != 0:
        raise ValueError('Batch size is not divisible by the number of GPUs')
    device = torch.device('cuda')
After that I do this:
# multi GPUs
if torch.cuda.device_count() > 1 and device_name == 'gpu':
    print('=' * 50)
    print('=' * 50)
    print('=' * 50)
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    # model = nn.DataParallel(model, device_ids=[i for i in range(torch.cuda.device_count())])
    model = nn.DataParallel(model)
    print('=' * 50)
    print('=' * 50)
    print('=' * 50)

# transfer model to selected device
model.to(device)
I move data to the device in this way:
# iterate over all batches
counter_batches = 0
for images, targets in metric_logger.log_every(data_loader, print_freq, header):
    # transfer tensors to device (gpu, if not available cpu)
    images = list(image.to(device) for image in images)
    targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

    # in train mode, faster r-cnn gives losses
    loss_dict = model(images, targets)

    # sum of losses
    losses = sum(loss for loss in loss_dict.values())
I do not know what I did wrong.
Also, I get this warning:
/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.
warnings.warn('Was asked to gather along dimension 0, but all '
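A note on that warning (my own sketch, not part of the question): with nn.DataParallel, each replica returns its own loss dict, and the scalar losses get gathered into per-GPU vectors, which is exactly what the message describes. Assuming the loss_dict values are those gathered vectors, reducing each entry back to a scalar before summing keeps the backward pass working:
# each value in loss_dict may be a vector with one entry per GPU under DataParallel,
# so reduce it to a scalar before summing and calling backward()
losses = sum(loss.mean() for loss in loss_dict.values())
For multi-GPU training, the PyTorch documentation generally recommends DistributedDataParallel over DataParallel.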

pytorch nllloss function target shape mismatch

I'm training an LSTM model using PyTorch with a batch size of 256 and NLLLoss() as the loss function.
The loss function is having a problem with the data shape.
The softmax output from the forward passing has shape of torch.Size([256, 4, 1181]) where 256 is batch size, 4 is sequence length, and 1181 is vocab size.
The target is in the shape of torch.Size([256, 4]) where 256 is batch size and 4 is the output sequence length.
When I was testing earlier with a batch size of 1, the model worked fine, but when I increase the batch size, it breaks. I read that NLLLoss() can take class indices as the target instead of a one-hot encoded target.
Am I misunderstanding it? Or did I not format the shape of the target correctly?
class LSTM(nn.Module):
    def __init__(self, embed_size=100, hidden_size=100, vocab_size=1181, embedding_matrix=...):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.word_embeddings = nn.Embedding(vocab_size, embed_size)
        self.word_embeddings.load_state_dict({'weight': torch.Tensor(embedding_matrix)})
        self.word_embeddings.weight.requires_grad = False
        self.lstm = nn.LSTM(embed_size, hidden_size)
        self.hidden2out = nn.Linear(hidden_size, vocab_size)

    def forward(self, tokens):
        batch_size, num_steps = tokens.shape
        embeds = self.word_embeddings(tokens)
        lstm_out, _ = self.lstm(embeds.view(batch_size, num_steps, -1))
        out_space = self.hidden2out(lstm_out.view(batch_size, num_steps, -1))
        out_scores = F.log_softmax(out_space, dim=1)
        return out_scores


model = LSTM(self.config.embed_size, self.config.hidden_size, self.config.vocab_size, self.embedding_matrix)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=self.config.lr)
Error:
~/anaconda3/lib/python3.7/site-packages/torch/nn/functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
   1846         if target.size()[1:] != input.size()[2:]:
   1847             raise ValueError('Expected target size {}, got {}'.format(
-> 1848                 out_size, target.size()))
   1849         input = input.contiguous().view(n, c, 1, -1)
   1850         target = target.contiguous().view(n, 1, -1)

ValueError: Expected target size (256, 554), got torch.Size([256, 4])
Your input shape to the loss function is (N, d, C) = (256, 4, 1181) and your target shape is (N, d) = (256, 4), however, according to the docs on NLLLoss the input should be (N, C, d) for a target of (N, d).
Supposing x is your network output and y is the target then you can compute loss by transposing the incorrect dimensions of x as follows:
loss = loss_function(x.transpose(1, 2), y)
Alternatively, since NLLLoss is just averaging all the responses anyway, you can reshape x and y to be (N*d, C) and (N*d). This gives the same result without creating temporary copies of your tensors.
loss = loss_function(x.reshape(N*d, C), y.reshape(N*d))
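As a quick sanity check (my own sketch, not from the answer above), both forms give the same value with the default mean reduction:
import torch
import torch.nn as nn

N, d, C = 256, 4, 1181
x = torch.randn(N, d, C).log_softmax(dim=-1)  # stand-in for the network output
y = torch.randint(0, C, (N, d))               # stand-in for the targets
loss_function = nn.NLLLoss()

loss_a = loss_function(x.transpose(1, 2), y)                   # (N, C, d) input, (N, d) target
loss_b = loss_function(x.reshape(N * d, C), y.reshape(N * d))  # flattened
print(torch.allclose(loss_a, loss_b))  # True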

TPU utilization low due to output fusion

I am training a U-Net on a Google Cloud TPU. It works but the utilization is very low.
Since I cannot upload the traced profile here, a screenshot of the most time-consuming part is shown here:
The output fusion is the most harmful part, with 58% of the time but only 12% utilization. The next most time-consuming part (9%) is "convolution", with a utilization of 74%. I am not really sure which operations I need to tweak to get better utilization out of the output fusion.
Below is my code for creating the U-Net; maybe there is a slow layer inside:
class UNet:
    def create(self, input, start_ch, depth, inc_rate,
               dropout, batchnorm, maxpool, upconv, residual, leaky_relu_alpha):
        with tf.variable_scope('Generator', reuse=tf.AUTO_REUSE):
            o = self._level_block(input, start_ch, depth, inc_rate, dropout, batchnorm, maxpool, upconv, residual,
                                  leaky_relu_alpha)
            out_ch = input.shape[3]
            o = tf.layers.conv2d(o, out_ch, 1)
            o = tf.tanh(o)
            return o

    def _conv_block(self, m, dim, bn, res, leaky_relu_alpha, do=0):
        n = tf.layers.conv2d(m, dim, 3, padding='same')
        n = tf.nn.leaky_relu(n, alpha=leaky_relu_alpha)
        n = tf.layers.batch_normalization(n) if bn else n
        n = tf.layers.dropout(n, do) if do else n
        n = tf.layers.conv2d(n, dim, 3, padding='same')
        n = tf.nn.leaky_relu(n, alpha=leaky_relu_alpha)
        n = tf.layers.batch_normalization(n) if bn else n
        return tf.concat([m, n], axis=-1) if res else n

    def _level_block(self, m, dim, depth, inc, do, bn, mp, up, res, leaky_relu_alpha):
        if depth > 0:
            n = self._conv_block(m, dim, bn, res, leaky_relu_alpha)
            m = tf.layers.max_pooling2d(n, [2, 2], [2, 2]) if mp else tf.layers.conv2d(n, dim, 3, strides=2, padding='same')
            m = self._level_block(m, int(inc * dim), depth - 1, inc, do, bn, mp, up, res, leaky_relu_alpha)
            if up:
                m = tf.image.resize_nearest_neighbor(m, (2 * m.shape[1], 2 * m.shape[2]))
                m = tf.layers.conv2d(m, dim, 2, padding='same')
                m = tf.nn.leaky_relu(m, alpha=leaky_relu_alpha)
            else:
                m = tf.layers.conv2d_transpose(m, dim, 3, strides=2, padding='same')
                m = tf.nn.leaky_relu(m, alpha=leaky_relu_alpha)
            n = tf.concat([n, m], axis=-1)
            m = self._conv_block(n, dim, bn, res, leaky_relu_alpha)
        else:
            m = self._conv_block(m, dim, bn, res, leaky_relu_alpha, do)
        return m
My input batch size is 128 and the U-Net depth is 4. I am using no BatchNorm layers (batchnorm=False), conv2d_transpose for upsampling (upconv=False), residual=False, and maxpool=True. So the U-Net consists only of Conv2D, Conv2D_Transpose, Dropout, Leaky ReLU, Max Pooling and Concatenation layers.
Any idea what I need to tweak to get better "output fusion" utilization? Or at least, what affects the output fusion?
I can see that convolution.114 has a lot of padding, 16*1 out of 128*8. Since your global batch size is 128, the local batch size on each core is only 16 (128 / 8). Would it be possible for you to increase your model's batch size to 1024? That would give each of the 8 cores a local batch of 128.

Visualizing features and activations in CNN using Keras example

I'm following the Keras blog post code to visualize the features learned and the activations at different layers. The code randomly generates a gray image of dimension (1, 3, img_width, img_height) and visualizes it. Here it is:
from __future__ import print_function

from scipy.misc import imsave
import numpy as np
import time
from keras.applications import vgg16
from keras import backend as K

# dimensions of the generated pictures for each filter.
img_width = 128
img_height = 128

# the name of the layer we want to visualize
# (see model definition at keras/applications/vgg16.py)
layer_name = 'block5_conv1'

# util function to convert a tensor into a valid image
def deprocess_image(x):
    # normalize tensor: center on 0., ensure std is 0.1
    x -= x.mean()
    x /= (x.std() + 1e-5)
    x *= 0.1

    # clip to [0, 1]
    x += 0.5
    x = np.clip(x, 0, 1)

    # convert to RGB array
    x *= 255
    if K.image_data_format() == 'channels_first':
        x = x.transpose((1, 2, 0))
    x = np.clip(x, 0, 255).astype('uint8')
    return x

# build the VGG16 network with ImageNet weights
model = vgg16.VGG16(weights='imagenet', include_top=False)
print('Model loaded.')
model.summary()

# this is the placeholder for the input images
input_img = model.input

# get the symbolic outputs of each "key" layer (we gave them unique names).
layer_dict = dict([(layer.name, layer) for layer in model.layers[1:]])

def normalize(x):
    # utility function to normalize a tensor by its L2 norm
    return x / (K.sqrt(K.mean(K.square(x))) + 1e-5)

kept_filters = []
for filter_index in range(0, 200):
    # we only scan through the first 200 filters,
    # but there are actually 512 of them
    print('Processing filter %d' % filter_index)
    start_time = time.time()

    # we build a loss function that maximizes the activation
    # of the nth filter of the layer considered
    layer_output = layer_dict[layer_name].output
    if K.image_data_format() == 'channels_first':
        loss = K.mean(layer_output[:, filter_index, :, :])
    else:
        loss = K.mean(layer_output[:, :, :, filter_index])

    # we compute the gradient of the input picture wrt this loss
    grads = K.gradients(loss, input_img)[0]

    # normalization trick: we normalize the gradient
    grads = normalize(grads)

    # this function returns the loss and grads given the input picture
    iterate = K.function([input_img], [loss, grads])

    # step size for gradient ascent
    step = 1.

    # we start from a gray image with some random noise
    if K.image_data_format() == 'channels_first':
        input_img_data = np.random.random((1, 3, img_width, img_height))
    else:
        input_img_data = np.random.random((1, img_width, img_height, 3))
    input_img_data = (input_img_data - 0.5) * 20 + 128

    # we run gradient ascent for 20 steps
    for i in range(20):
        loss_value, grads_value = iterate([input_img_data])
        input_img_data += grads_value * step

        print('Current loss value:', loss_value)
        if loss_value <= 0.:
            # some filters get stuck to 0, we can skip them
            break

    # decode the resulting input image
    if loss_value > 0:
        img = deprocess_image(input_img_data[0])
        kept_filters.append((img, loss_value))
    end_time = time.time()
    print('Filter %d processed in %ds' % (filter_index, end_time - start_time))

# we will stitch the best 64 filters on a 8 x 8 grid.
n = 8

# the filters that have the highest loss are assumed to be better-looking.
# we will only keep the top 64 filters.
kept_filters.sort(key=lambda x: x[1], reverse=True)
kept_filters = kept_filters[:n * n]

# build a black picture with enough space for
# our 8 x 8 filters of size 128 x 128, with a 5px margin in between
margin = 5
width = n * img_width + (n - 1) * margin
height = n * img_height + (n - 1) * margin
stitched_filters = np.zeros((width, height, 3))

# fill the picture with our saved filters
for i in range(n):
    for j in range(n):
        img, loss = kept_filters[i * n + j]
        stitched_filters[(img_width + margin) * i: (img_width + margin) * i + img_width,
                         (img_height + margin) * j: (img_height + margin) * j + img_height, :] = img

# save the result to disk
imsave('stitched_filters_%dx%d.png' % (n, n), stitched_filters)
Could you please let me know how I can modify these statements in the code:
input_img_data = np.random.random((1, img_width, img_height, 3))
input_img_data = (input_img_data - 0.5) * 20 + 128
to insert my own data and visualize the features learned and the activations? My image is an RGB image of dimensions 150 x 150. Thanks for your assistance.
If you want to process a single image:
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

img = load_img('data/XXXX.jpg')  # this is a PIL image
x = img_to_array(img)
x = x.reshape((1,) + x.shape)
If you want to process a batch:
from keras.preprocessing.image import ImageDataGenerator

data_gen_args = dict(featurewise_center=True,
                     featurewise_std_normalization=True,
                     rotation_range=90.,
                     width_shift_range=0.1,
                     height_shift_range=0.1,
                     zoom_range=0.2)
image_datagen = ImageDataGenerator(**data_gen_args)

image_generator = image_datagen.flow_from_directory(
    'data/images',
    class_mode=None,
    seed=seed)
See the documentation: https://keras.io/preprocessing/image/#imagedatagenerator
Update
# we start from a gray image with some random noise
if K.image_data_format() == 'channels_first':
    img = load_img('images/1/1.png')  # this is a PIL image
    x = img_to_array(img)
    x = x.reshape((1,) + x.shape)
else:
    #input_img_data = np.random.random((1, img_width, img_height, 3))
    img = load_img('images/1/1.png')  # this is a PIL image
    x = img_to_array(img)
    x = x.reshape((1,) + x.shape)
input_img_data = x
input_img_data = (input_img_data - 0.5) * 20 + 128
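One extra note from me (not part of the answer above): since your image is 150 x 150 but the script works with img_width x img_height inputs, load_img can resize it at load time via its target_size argument, for example:
img = load_img('images/1/1.png', target_size=(img_width, img_height))  # resize while loading
x = img_to_array(img)
x = x.reshape((1,) + x.shape)
input_img_data = x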