I'm following the Keras blog post code to visualize the features learned and the activations at different layers. The code randomly generates a gray image of dimension (1, 3, img_width, img_height) and visualizes it. Here it is:
from __future__ import print_function
from scipy.misc import imsave
import numpy as np
import time
from keras.applications import vgg16
from keras import backend as K
# Dimensions (in pixels) of the picture generated for each filter; these
# values size both the random start image and the tiles of the final grid.
img_width = 128
img_height = 128
# Name of the layer whose filters we want to visualize; it is used as a key
# into the name -> layer dictionary built from the model
# (see model definition at keras/applications/vgg16.py)
layer_name = 'block5_conv1'
# util function to convert a tensor into a valid image
def deprocess_image(x):
    """Turn a single generated image tensor into a displayable uint8 image.

    The values are centred on 0, rescaled to a standard deviation of 0.1,
    shifted to 0.5, clipped to [0, 1] and finally stretched to [0, 255].

    Args:
        x: array holding one image (no batch axis). With the
            'channels_first' data format the channel axis is moved last.

    Returns:
        A new uint8 array with values in [0, 255]. The argument itself is
        left untouched.
    """
    # Work on a floating-point copy: the in-place arithmetic below would
    # otherwise overwrite the caller's buffer and would fail outright on an
    # integer array.
    x = np.asarray(x)
    dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float64
    x = np.array(x, dtype=dtype)

    # normalize tensor: center on 0., ensure std is 0.1
    x -= x.mean()
    x /= (x.std() + 1e-5)  # epsilon avoids dividing by zero on a flat image
    x *= 0.1

    # clip to [0, 1]
    x += 0.5
    x = np.clip(x, 0, 1)

    # convert to RGB array
    x *= 255
    if K.image_data_format() == 'channels_first':
        x = x.transpose((1, 2, 0))
    return np.clip(x, 0, 255).astype('uint8')
# Instantiate VGG16 with its ImageNet weights, without the top
# (include_top=False), and report its structure.
model = vgg16.VGG16(weights='imagenet', include_top=False)
print('Model loaded.')
model.summary()

# Symbolic placeholder through which images are fed to the network.
input_img = model.input

# Map every layer name to its layer object so a layer can be picked by name;
# the first entry of model.layers is skipped.
layer_dict = {layer.name: layer for layer in model.layers[1:]}
def normalize(x):
    """Scale a backend tensor by its root-mean-square value.

    The divisor is sqrt(mean(x ** 2)) plus a small epsilon that keeps the
    division well defined when the tensor is all zeros.
    """
    rms = K.sqrt(K.mean(K.square(x)))
    return x / (rms + 1e-5)
kept_filters = []
# The data format is queried once and reused for every filter below.
channels_first = K.image_data_format() == 'channels_first'

# Only the first 200 filters are scanned, although the layer has 512 of them.
for filter_index in range(200):
    print('Processing filter %d' % filter_index)
    start_time = time.time()

    # Loss: mean activation of the chosen filter; maximizing it yields the
    # input pattern the filter responds to most strongly.
    layer_output = layer_dict[layer_name].output
    if channels_first:
        filter_activation = layer_output[:, filter_index, :, :]
    else:
        filter_activation = layer_output[:, :, :, filter_index]
    loss = K.mean(filter_activation)

    # Gradient of the loss with respect to the input picture, rescaled by
    # normalize() before it is used as the ascent direction.
    grads = normalize(K.gradients(loss, input_img)[0])

    # Callable returning (loss, gradient) for a given input picture.
    iterate = K.function([input_img], [loss, grads])

    # Step size for gradient ascent.
    step = 1.

    # Start from a gray picture (128) carrying a little uniform noise (+/-10).
    if channels_first:
        noise_shape = (1, 3, img_width, img_height)
    else:
        noise_shape = (1, img_width, img_height, 3)
    input_img_data = (np.random.random(noise_shape) - 0.5) * 20 + 128

    # Run at most 20 gradient-ascent steps.
    for _ in range(20):
        loss_value, grads_value = iterate([input_img_data])
        input_img_data += grads_value * step
        print('Current loss value:', loss_value)
        if loss_value <= 0.:
            # Some filters get stuck at 0; give up on them early.
            break

    # Keep the decoded picture only when the filter actually got activated.
    if loss_value > 0:
        kept_filters.append((deprocess_image(input_img_data[0]), loss_value))

    end_time = time.time()
    print('Filter %d processed in %ds' % (filter_index, end_time - start_time))
# We stitch the best filters onto an n x n grid (8 x 8 = 64 tiles).
n = 8

# Filters with the highest loss are assumed to be the better-looking ones,
# so sort by loss (descending) and keep at most n * n of them.
kept_filters.sort(key=lambda x: x[1], reverse=True)
kept_filters = kept_filters[:n * n]

# Build a black picture with enough room for n x n tiles of
# img_width x img_height pixels, separated by a 5px margin.
margin = 5
width = n * img_width + (n - 1) * margin
height = n * img_height + (n - 1) * margin
stitched_filters = np.zeros((width, height, 3))

# Fill the picture tile by tile. Iterating over the filters that were
# actually kept (instead of indexing all n * n grid cells) avoids an
# IndexError when fewer than n * n filters reached a positive loss; the
# unused cells simply stay black.
for index, (img, _) in enumerate(kept_filters):
    i, j = divmod(index, n)
    row_start = (img_width + margin) * i
    col_start = (img_height + margin) * j
    stitched_filters[row_start: row_start + img_width,
                     col_start: col_start + img_height, :] = img

# save the result to disk
imsave('stitched_filters_%dx%d.png' % (n, n), stitched_filters)
Could you please let me know how can I modify these statements in the code:
input_img_data = np.random.random((1, img_width, img_height, 3))
input_img_data = (input_img_data - 0.5) * 20 + 128
to insert my own data and visualize the features learned and activations? My image is an RGB image of dimensions 150 x 150. Thanks for your assistance.
If you want to process single image:
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
# Load one image from disk and turn it into a batch containing that image.
img = load_img('data/XXXX.jpg') # this is a PIL image
# Convert the PIL image into an array.
x = img_to_array(img)
# Prepend an axis of length 1 so the array carries a leading batch dimension.
x = x.reshape((1,) + x.shape)
If you want to process batch:
from keras.preprocessing.image import ImageDataGenerator
# Seed forwarded to flow_from_directory(); it was referenced but never
# defined in the original snippet, which raised a NameError.
seed = 1

# Augmentation settings forwarded to ImageDataGenerator.
# NOTE(review): the featurewise_* options presumably require calling
# image_datagen.fit(<sample array>) before the generator is used -- confirm
# against the Keras documentation.
data_gen_args = dict(featurewise_center=True,
                     featurewise_std_normalization=True,
                     rotation_range=90.,
                     width_shift_range=0.1,
                     height_shift_range=0.1,
                     zoom_range=0.2)
image_datagen = ImageDataGenerator(**data_gen_args)

# Stream batches of images found under 'data/images'; class_mode=None is
# passed because only the images themselves are wanted here.
image_generator = image_datagen.flow_from_directory(
    'data/images',
    class_mode=None,
    seed=seed)
To see documentation : https://keras.io/preprocessing/image/#imagedatagenerator
Update
# Start the gradient ascent from our own picture instead of random gray noise.
# Both branches of the original if/else were identical, so a single load is
# enough.
# NOTE(review): img_to_array is assumed to follow K.image_data_format() by
# default, which is why no per-format handling is done here -- confirm.
img = load_img('images/1/1.png')  # this is a PIL image
x = img_to_array(img)
x = x.reshape((1,) + x.shape)  # add the leading batch axis
# The "(data - 0.5) * 20 + 128" rescaling of the original script only exists
# to turn uniform noise from [0, 1) into gray +/- 10. It must not be applied
# to a real image whose pixels are already in the 0..255 range.
input_img_data = x
Related
What will be the output size of each layer in the following model?
# Model under discussion (kept as a quoted string, not executed).
# NOTE(review): assuming Keras 'same' padding yields ceil(input / stride) per
# spatial axis, the per-layer output shapes for the (80, 80, 4) input would be:
#   Conv2D(32, 8x8, strides 4) -> (20, 20, 32)
#   Conv2D(64, 4x4, strides 2) -> (10, 10, 64)
#   Conv2D(64, 3x3, strides 1) -> (10, 10, 64)
#   Flatten                    -> (6400,)
#   Dense(512)                 -> (512,)
#   Dense(2)                   -> (2,)
# Activation layers keep the shape of their input. Confirm with
# model.summary().
'''
model = Sequential()
model.add(Conv2D(32, (8, 8), padding='same', strides=(4, 4), input_shape=(80,80,4)))
model.add(Activation('relu'))
model.add(Conv2D(64, (4, 4), padding='same', strides=(2, 2)))
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 3), padding='same', strides=(1, 1)))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dense(2))
'''
Full Traceback (most recent call)
TL;DR If "output size" means number of parameters, here's that plus memory size of model and memory required to train:
-------------------------------------
Layer # of Params
=====================================
Conv2D 544
-------------------------------------
Conv2D 8256
-------------------------------------
Conv2D 36928
-------------------------------------
Dense 13107712
-------------------------------------
Dense 1026
=====================================
Total # of Params: 13154466
Model Size: 59.47 MiB
Memory Required to Train: 1.95 GiB
-------------------------------------
and here are the relevant equations to compute each:
Model:
2D Conv Layer:
nw = kw * kh * d * d_prev
nb = d
n = nw + nb
Dense Layer:
nw = no * ni
nb = no
n = nw + nb
Training:
2D Conv Layer:
n = wi * hi * dl
Dense Layer:
n = no
Legend:
nw = Number of Weight Parameters
nb = Number of Bias Parameters
kw = Kernel Width
kh = Kernel Height
d = Depth of Current Layer
d_prev = Depth of Previous Layer
no = Number of outputs
ni = Number of inputs
nt = Number of training parameters
n = Total number of parameters
For a batch size of 1, 59.47 MiB to train. For a batch size of 32, 1.95 GiB to train.
A great resource is Memory Usage Computational Considerations, by Kevin McGuinness. You can watch his presentation on YouTube here. He gives a link to the slides in the info portion of the YouTube post, but they are here for your reference (look for D2L1: Day 2, Lecture 1).
It depends on four things:
The size of your data types,
The number of parameters in your model,
The number of parameters required to store training outputs, and
Your batch size
By default, tensorflow uses 32-bit floating point data types (these are 4 bytes in size since there are 8 bits to a byte).
Here's the code I wrote to calculate it. It's pretty much the same as what keras will output, but also includes memory requirements
Source Code:
def model_params_Conv2D(d_n, k_w, k_h, d_n_prev, s_x=1, s_y=1):
    """Calculate the number of model parameters in a 2D Convolution Layer

    Every output channel owns one k_w x k_h kernel per input channel plus a
    single bias, i.e. n = k_w * k_h * d_n * d_n_prev + d_n.

    Args:
        d_n (int): Depth of current layer
        k_w (int): Kernel width
        k_h (int): Kernel height
        d_n_prev (int): Depth of previous layer
        s_x (int): Strides in x-direction. Accepted for backward
            compatibility only: strides change the size of the layer's
            output, not the number of weights.
        s_y (int): Strides in y-direction (ignored, see ``s_x``).

    Returns:
        int: Number of layer parameters
    """
    # The original divided the weight count by s_x * s_y, which
    # under-counted every strided layer (e.g. 544 instead of 8224).
    n_w = d_n * k_w * k_h * d_n_prev  # Number of weight parameters
    n_b = d_n  # Number of bias parameters
    return n_w + n_b
def model_params_Dense(n_o, n_i):
    """Count the trainable parameters of a fully connected layer.

    Args:
        n_o (int): Number of outputs
        n_i (int): Number of inputs

    Returns:
        int: One weight per (output, input) pair plus one bias per output.
    """
    weights = n_o * n_i
    biases = n_o
    return weights + biases
def training_params_Conv2D(w_i, h_i, d_l):
    """Count the values stored for a 2D Convolution layer during training.

    Args:
        w_i (int): Input width
        h_i (int): Input height
        d_l (int): Layer depth

    Returns:
        The product width * height * depth.
    """
    plane = w_i * h_i
    return plane * d_l
def training_params_Dense(n_o):
    """Count the values stored for a Dense layer during training.

    Args:
        n_o (int): Number of outputs

    Returns:
        The number of outputs, unchanged: one stored value per output.
    """
    outputs = n_o
    return outputs
def memory_requirement(n_p, m_dt=4):
    """Return the number of bytes needed to store ``n_p`` values.

    Args:
        n_p (int): Number of parameters
        m_dt (int): Memory size of one value in bytes (default 4)

    Returns:
        int: Memory consumption in bytes
    """
    total_bytes = n_p * m_dt
    return total_bytes
def SI2ibi(mem):
    """Express a size given in bytes in binary (IEC) units.

    Computers use powers of 2, so memory is reported in KiB/MiB/GiB rather
    than in the SI units, which are powers of 1000:
        kibi (KiB) = (2^10)^1 bytes, kilo (KB) = (10^3)^1 bytes (1 KiB = 1.024 KB)
        mebi (MiB) = (2^10)^2 bytes, mega (MB) = (10^3)^2 bytes (1 MiB = 1.048576 MB)
        gibi (GiB) = (2^10)^3 bytes, giga (GB) = (10^3)^3 bytes (1 GiB = 1.073741824 GB)

    Args:
        mem (int): Memory size in bytes

    Returns:
        tuple: (value, units) where units is "GiB", "MiB" or "KiB". The
        largest unit for which the value is at least 1 is chosen; anything
        below 1 MiB is reported in (possibly fractional) KiB.
    """
    KiB = 2 ** 10
    MiB = 2 ** 20
    GiB = 2 ** 30
    # The original divided by (10**9 / 1.073741824), i.e. it multiplied the
    # size by the GB->GiB factor instead of dividing by it, over-reporting
    # every result (by ~7% for GiB, ~5% for MiB, ~2% for KiB).
    if mem >= GiB:
        return mem / GiB, "GiB"
    if mem >= MiB:
        return mem / MiB, "MiB"
    return mem / KiB, "KiB"
if __name__ == "__main__":
    # NOTE: Activation layers don't require any parameters. Use depth of
    # input as d_n_prev of first layer.
    input_shape = (80, 80, 4)
    w_i = input_shape[0]  # Input width
    h_i = input_shape[1]  # Input height
    d_i = input_shape[2]  # Input depth

    # Spatial size of each Conv2D output. Every convolution in the model uses
    # padding='same', for which the output size is ceil(input / stride);
    # -(-a // b) is that ceiling in integer arithmetic.
    w_1, h_1 = -(-w_i // 4), -(-h_i // 4)  # strides (4, 4): 80 -> 20
    w_2, h_2 = -(-w_1 // 2), -(-h_1 // 2)  # strides (2, 2): 20 -> 10
    w_3, h_3 = w_2, h_2                    # strides (1, 1): unchanged

    conv01_params = model_params_Conv2D(
        d_n=32, k_w=8, k_h=8, d_n_prev=d_i, s_x=4, s_y=4
    )
    conv02_params = model_params_Conv2D(d_n=64, k_w=4, k_h=4, d_n_prev=32, s_x=2, s_y=2)
    conv03_params = model_params_Conv2D(d_n=64, k_w=3, k_h=3, d_n_prev=64)
    # The first Dense layer is fed by Flatten() of the last convolution's
    # output (w_3 * h_3 * 64 values), not by the raw network input.
    dense01_params = model_params_Dense(n_i=w_3 * h_3 * 64, n_o=512)
    dense02_params = model_params_Dense(n_i=512, n_o=2)
    num_model_params = (
        conv01_params + conv02_params + conv03_params + dense01_params + dense02_params
    )

    header_ = "Layer\t\t\t# of Params"
    # repr() adds the two quote characters, so the bars end up two columns
    # wider than the header text itself.
    len_header_ = len(repr(header_.expandtabs()))
    bar_eq_ = "=" * len_header_
    bar_dash_ = "-" * len_header_

    # Values stored per sample during training: one per output element of
    # every layer, using each layer's own (down-sampled) output size.
    num_training_params = training_params_Conv2D(w_1, h_1, 32)
    num_training_params += training_params_Conv2D(w_2, h_2, 64)
    num_training_params += training_params_Conv2D(w_3, h_3, 64)
    num_training_params += training_params_Dense(512)
    num_training_params += training_params_Dense(2)

    model_memory = memory_requirement(num_model_params)
    training_memory = memory_requirement(num_training_params)
    total_memory = model_memory + training_memory  # batch size of 1
    batch_size = 32
    mem, units = SI2ibi(total_memory)
    # Only the per-sample training outputs grow with the batch size; the
    # model weights are stored once regardless of how many samples are fed.
    mem32, units32 = SI2ibi(model_memory + training_memory * batch_size)

    print(f"{bar_dash_}")
    print(f"{header_}")
    print(f"{bar_eq_}")
    print(f"Conv2D\t\t\t{conv01_params}")
    print(f"{bar_dash_}")
    print(f"Conv2D\t\t\t{conv02_params}")
    print(f"{bar_dash_}")
    print(f"Conv2D\t\t\t{conv03_params}")
    print(f"{bar_dash_}")
    print(f"Dense\t\t\t{dense01_params}")
    print(f"{bar_dash_}")
    print(f"Dense\t\t\t{dense02_params}")
    print(f"{bar_eq_}")
    print(f"Total # of Params: {num_model_params}")
    print(f"Model Size: {mem:.2f} {units}")
    print(f"Memory Required to Train: {mem32:.2f} {units32}")
    print(f"{bar_dash_}")
So, I have applied kalman filter on this dataset. Following is my code (please note I am adding entire code for one to reproduce results on his/her machine).
# Multi dimensional Kalman filter
import os
from math import *
class matrix:
    """Small dense matrix stored as a list of row lists.

    Attributes:
        value: the rows, e.g. [[1., 2.], [3., 4.]].
        dimx: number of rows (0 for the empty placeholder ``[[]]``).
        dimy: number of columns.

    The code is written so that it runs unchanged on Python 2 and Python 3.
    """

    def __init__(self, value):
        # ``matrix([[]])`` is the placeholder used throughout the class before
        # calling zero() / identity(): it has no rows.
        self.value = value
        self.dimx = len(value)
        self.dimy = len(value[0])
        if value == [[]]:
            self.dimx = 0

    def zero(self, dimx, dimy):
        """Resize in place to a dimx x dimy matrix filled with zeros.

        Raises:
            ValueError: if either dimension is smaller than 1.
        """
        if dimx < 1 or dimy < 1:
            raise ValueError("Invalid size of matrix")
        self.dimx = dimx
        self.dimy = dimy
        self.value = [[0 for _ in range(dimy)] for _ in range(dimx)]

    def identity(self, dim):
        """Resize in place to the dim x dim identity matrix.

        Raises:
            ValueError: if dim is smaller than 1.
        """
        if dim < 1:
            raise ValueError("Invalid size of matrix")
        self.dimx = dim
        self.dimy = dim
        self.value = [[0 for _ in range(dim)] for _ in range(dim)]
        for i in range(dim):
            self.value[i][i] = 1

    def show(self):
        """Print the matrix row by row, followed by a spacer line."""
        for i in range(self.dimx):
            print(self.value[i])
        print(' ')

    def __add__(self, other):
        """Element-wise sum; raises ValueError on a shape mismatch."""
        if self.dimx != other.dimx or self.dimy != other.dimy:
            raise ValueError("Matrices must be of equal dimensions to add")
        res = matrix([[]])
        res.zero(self.dimx, self.dimy)
        for i in range(self.dimx):
            for j in range(self.dimy):
                res.value[i][j] = self.value[i][j] + other.value[i][j]
        return res

    def __sub__(self, other):
        """Element-wise difference; raises ValueError on a shape mismatch."""
        if self.dimx != other.dimx or self.dimy != other.dimy:
            raise ValueError("Matrices must be of equal dimensions to subtract")
        res = matrix([[]])
        res.zero(self.dimx, self.dimy)
        for i in range(self.dimx):
            for j in range(self.dimy):
                res.value[i][j] = self.value[i][j] - other.value[i][j]
        return res

    def __mul__(self, other):
        """Matrix product (m x n) * (n x p); raises ValueError otherwise."""
        if self.dimy != other.dimx:
            raise ValueError("Matrices must be m*n and n*p to multiply")
        res = matrix([[]])
        res.zero(self.dimx, other.dimy)
        for i in range(self.dimx):
            for j in range(other.dimy):
                for k in range(self.dimy):
                    res.value[i][j] += self.value[i][k] * other.value[k][j]
        return res

    def transpose(self):
        """Return a new matrix with rows and columns swapped."""
        res = matrix([[]])
        res.zero(self.dimy, self.dimx)
        for i in range(self.dimx):
            for j in range(self.dimy):
                res.value[j][i] = self.value[i][j]
        return res

    # Thanks to Ernesto P. Adorio for use of Cholesky and CholeskyInverse functions

    def Cholesky(self, ztol=1.0e-5):
        """Return the upper triangular Cholesky factor of this matrix.

        Args:
            ztol: magnitudes below this tolerance are treated as zero.

        Raises:
            ValueError: if a diagonal pivot is negative (the matrix is not
                positive-definite).
            ZeroDivisionError: if a pivot is (numerically) zero and an entry
                to its right still has to be computed.
        """
        res = matrix([[]])
        res.zero(self.dimx, self.dimx)
        for i in range(self.dimx):
            S = sum([(res.value[k][i]) ** 2 for k in range(i)])
            d = self.value[i][i] - S
            if abs(d) < ztol:
                res.value[i][i] = 0.0
            else:
                if d < 0.0:
                    raise ValueError("Matrix not positive-definite")
                res.value[i][i] = sqrt(d)
            for j in range(i + 1, self.dimx):
                # Entries of res that have not been computed yet are still
                # zero, so summing over every k adds nothing extra.
                S = sum([res.value[k][i] * res.value[k][j] for k in range(self.dimx)])
                if abs(S) < ztol:
                    S = 0.0
                res.value[i][j] = (self.value[i][j] - S) / res.value[i][i]
        return res

    def CholeskyInverse(self):
        """Return the inverse of the matrix whose upper Cholesky factor is self."""
        res = matrix([[]])
        res.zero(self.dimx, self.dimx)
        # Backward step for inverse: start at the last row/column.
        for j in reversed(range(self.dimx)):
            tjj = self.value[j][j]
            S = sum([self.value[j][k] * res.value[j][k] for k in range(j + 1, self.dimx)])
            res.value[j][j] = 1.0 / tjj ** 2 - S / tjj
            for i in reversed(range(j)):
                # The result is symmetric, so both mirrored entries are set.
                res.value[j][i] = res.value[i][j] = -sum(
                    [self.value[i][k] * res.value[k][j] for k in range(i + 1, self.dimx)]
                ) / self.value[i][i]
        return res

    def inverse(self):
        """Return the inverse, computed through the Cholesky factorization."""
        aux = self.Cholesky()
        res = aux.CholeskyInverse()
        return res

    def __repr__(self):
        return repr(self.value)
########################################
# filter function
def kalman_filter(x, P):
    """Run one measurement update followed by one prediction step.

    Besides its arguments the function reads the module-level names
    ``measurements``, ``H``, ``R``, ``I`` and ``F``.

    Args:
        x: current state estimate.
        P: current state covariance.

    Returns:
        The pair (x, P) after the update and the prediction.
    """
    # measurement update
    H_t = H.transpose()
    residual = measurements - H * x
    residual_cov = H * P * H_t + R
    gain = P * H_t * residual_cov.inverse()
    x = x + gain * residual
    P = (I - gain * H) * P

    # prediction
    x = F * x
    P = F * P * F.transpose()
    return x, P
files = []
# NOTE(review): the original comment called x "location and velocity", but
# the three entries are fed with the three acceleration readings -- confirm.
x = matrix([[0.], [0.], [0.]])  # initial state
P = matrix([[1000., 0., 0.], [0., 1000., 0.], [0., 0., 1000.]])  # initial uncertainty
u = matrix([[0.], [0.]])  # external motion
F = matrix([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])  # next state function
H = matrix([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])  # measurement function
R = matrix([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])  # measurement uncertainty
I = matrix([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])  # identity matrix

data_dir = "/home/fatima/Downloads/thesis/HMP_Dataset/Climb_stairs"
for i in os.listdir(data_dir):
    if i.endswith('.txt'):
        files.append(i)

# Every file is processed (the original range(len(files)-1) silently skipped
# the last one).
for j in range(len(files)):
    print("iterationh number")
    print(j)
    with open(os.path.join(data_dir, files[j])) as f:
        content = f.readlines()
    # Every line is processed as well; blank or short lines are skipped
    # explicitly instead of dropping the final line unconditionally.
    for line in content:
        content2 = line.strip().split(" ")
        if len(content2) < 3:
            continue
        # Map each raw reading from 0..63 to -14.709..+14.709.
        content2[0] = -14.709 + (float(content2[0]) / 63) * (2 * 14.709)
        content2[2] = -14.709 + (float(content2[2]) / 63) * (2 * 14.709)
        content2[1] = -14.709 + (float(content2[1]) / 63) * (2 * 14.709)
        measurements = matrix([[float(content2[0])], [float(content2[1])], [float(content2[2])]])
        print(measurements)
        # The covariance must be written back to P; the original assigned it
        # to a lower-case ``p``, so the filter restarted from the initial
        # uncertainty on every sample.
        x, P = kalman_filter(x, P)
        print(x)
Kalman filter is at the end of the code. I haven't added any noise in the prediction of state matrix as I am unsure of how to.
About Dataset:
It is an ADL wrist-worn accelerometer dataset. There are a total of 14 activities and the acceleration is recorded in x,y and z-direction. Values of acceleration vary from 0 to 63. For the calculation, it is normalized to -14.7 to +14.7.
Question:
My question is whether I am headed in the right direction or not. Does the output seem to be correct? Any improvements?
The code looks good and you are definitely heading in the right direction!
Here are some things you can do to check your implementation:
Consider simple cases for which you know the solution, for example H = I, P = sigma² * I and R = sigma'² * I. x should tend toward the x of the previous time step if sigma tends to zero, and x should tend toward the measurements if sigma' tends to zero. Or, if sigma = sigma', then the Kalman Filter analysis should be the average of the previous state and the measurements, and P should be reduced by a half. You might want to write a unit test for these cases.
Check that the matrix P always stays symmetric and positive definite (all eigenvalues are positive).
Implement an alternate Kalman Filter update step, see equation (28) on page 39 of the document http://modb.oce.ulg.ac.be/mediawiki/upload/Alex/AssimLecture/assim_lecture.pdf. Essentially you can compute the Kalman gain also as:
K = inv(inv(P) + H' * inv(R) H ) H' inv(R)
where inv(P) is the inverse of the matrix P. Both should be equal up to the numerical precision.
For the model noise, you can just add a small covariance matrix Q to the prediction equation: P = F * P * F' + Q. It is often a matrix proportional to the identity matrix. If Q is too small, the measurements will no longer affect the analysis after some iterations. If Q is too large, your model state x will be quite noisy.
Some coding remarks:
If measurements, H and R are also parameters of the Kalman_filter function, then your function would be more easily portable to a different case.
Are you aware of numpy for matrix operations? It has extensive support for matrix and array operations and uses highly optimized libraries for the computation.
Let me know if this helps!
I'd like to force to zero all elements of a vector which are below a certain threshold. And I'd like to do it so that I can still propagate gradient through non-zero ones.
For example, in theano I could write:
B = theano.tensor.switch(A < .1, 0, A)
Is there a solution for that in pytorch?
As of PyTorch 0.4+, you can do it easily with torch.where (see doc, Merged PR).
It is as easy as in Theano. See yourself with an example:
import torch
from torch.autograd import Variable
# Only floating-point tensors can require gradients, so x is created as
# float32; tensors track gradients directly, no Variable wrapper is needed.
x = torch.arange(4, dtype=torch.float32, requires_grad=True)  # x = [0 1 2 3]
zeros = torch.zeros_like(x)  # zeros = [0 0 0 0]
y = x**2  # y = [0 1 4 9]
z = torch.where(y < 5, zeros, y)  # z = [0 0 0 9]
# dz/dx = (dz/dy)(dy/dx) = (y < 5)(0) + (y ≥ 5)(2x) = 2x(x**2 ≥ 5)
# z is not a scalar, so backward() needs an upstream gradient with z's shape.
z.backward(torch.ones_like(z))
x.grad  # (dz/dx) = [0 0 0 6]
I do not suppose that switch is implemented by default in PyTorch. But, you can define your own function in PyTorch by extending the torch.autograd.Function
So, the switch function will look something like
class switchFunction(Function):
    """Autograd function that overwrites the masked entries of a tensor.

    ``switch(flag, value, tensor)`` returns a copy of ``tensor`` in which the
    entries selected by the boolean mask ``flag`` are replaced by ``value``.
    Gradients flow back unchanged through the entries that were kept and are
    zero for the entries that were replaced.
    """

    @staticmethod
    def forward(ctx, flag, value, tensor):
        ctx.save_for_backward(flag)
        # Write into a copy so the caller's tensor is not modified.
        out = tensor.clone()
        out[flag] = value
        return out

    @staticmethod
    def backward(ctx, grad_output):
        flag, = ctx.saved_tensors
        # Copy before masking: the incoming gradient is not ours to modify.
        grad_input = grad_output.clone()
        grad_input[flag] = 0
        # One entry per forward() argument: flag and value get no gradient.
        return None, None, grad_input


switch = switchFunction.apply
Now, you can simply call switch as switch(A < 0.1, 0, A)
Edit
There is actually a function that does this. It is called Threshold. You can use it like
import torch.nn as nn
# Build a Threshold module with threshold 0.1 and replacement value 0.
# NOTE(review): per the PyTorch documentation Threshold keeps x only where
# x > threshold, so entries exactly equal to 0.1 would be replaced as well
# (unlike the `A < .1` condition of the question) -- confirm.
m = nn.Threshold(0.1, 0)
# Apply the module to A; A is assumed to be a tensor defined by the caller.
B = m(A)
I've a keras convolutional neural network model. When I execute:
def get_model_weights():
    """Collect the weights of every layer of ``model`` that has any.

    Reads the module-level ``model``. Layers whose ``get_weights()`` result
    is empty are skipped; the remaining ones are stored under the keys
    'W1', 'W2', ... in the order in which the layers are visited.

    Returns:
        dict mapping 'W<k>' to the value returned by ``layer.get_weights()``.
    """
    collected = {}
    index = 0
    for layer in model.layers:
        layer_weights = layer.get_weights()
        if len(layer_weights) == 0:
            continue
        index += 1
        collected['W' + str(index)] = layer_weights
    return collected
# Build a one-sample batch: a (1, 1, D, D) array holding the single-channel
# image cur_x (D and cur_x are defined elsewhere).
x = np.zeros((1, 1, D, D))
x[0, 0, :, :] = cur_x #cur_x is a grayscale (1 channel) image (D x D pixels)
# Target for that sample: a (1, 2) array set to [1, 0].
y = np.zeros((1, 2))
y[0, :] = [1, 0]
# Snapshot the weights, train on the single sample, then snapshot again so
# the two dictionaries can be compared.
init_weights = get_model_weights()
# NOTE(review): `nb_epoch` looks like the Keras 1 spelling of `epochs` --
# confirm against the installed Keras version.
model.fit(x, y, batch_size = 1, nb_epoch = 1, verbose = 0)
cur_weights = get_model_weights()
cur_weights and init_weights are the same! It seems that weights are not updated! Can anyone explain this behaviour?! Is the "fit" implementation correct?!
Thanks
P.S. I'm using "stochastic gradient descent" as optimiser and "binary cross-entropy" as loss function
I have just begun using lasagne and Theano to do some machine learning on Python.
I am trying to modify the softmax class in Theano. I want to change how the activation function(softmax) is calculated. Instead of dividing e_x by e_x.sum(axis=1), I want to divide e_x by sum of three consecutive numbers.
For instance, the result will be as follows:
sm[0] = e_x[0]/(e_x[0]+e_x[1]+e_x[2])
sm[1] = e_x[1]/(e_x[0]+e_x[1]+e_x[2])
sm[2] = e_x[2]/(e_x[0]+e_x[1]+e_x[2])
sm[3] = e_x[3]/(e_x[3]+e_x[4]+e_x[5])
sm[4] = e_x[4]/(e_x[3]+e_x[4]+e_x[5])
sm[5] = e_x[5]/(e_x[3]+e_x[4]+e_x[5])
and so on...
The problem is that I cannot quite grasp how theano carries out the computation.
Here is my main question. Does it suffice to just change the perform() function in the softmax class?
Here is the original perform() function:
def perform(self, node, input_storage, output_storage):
    """Compute a row-wise softmax of the single input and store the result.

    Args:
        node: unused here.
        input_storage: one-element sequence holding a 2-D array.
        output_storage: nested container; the result is written to
            ``output_storage[0][0]``.
    """
    (x,) = input_storage
    # Subtracting each row's maximum keeps the exponentials from overflowing.
    row_max = x.max(axis=1)[:, None]
    exps = numpy.exp(x - row_max)
    row_total = exps.sum(axis=1)[:, None]
    output_storage[0][0] = exps / row_total
Here is my modified perform()
def myPerform(self, node, input_storage, output_storage):
    """Softmax computed independently over consecutive groups of 3 columns.

    For every row, columns 0-2 are normalized by their own sum, columns 3-5
    by theirs, and so on.

    Args:
        node: unused here.
        input_storage: one-element sequence holding a 2-D array whose number
            of columns is a multiple of 3.
        output_storage: nested container; the result (same shape as the
            input) is written to ``output_storage[0][0]``.

    Raises:
        ValueError: if the number of columns is not a multiple of 3.

    NOTE(review): this only covers the forward computation; whether the
    gradient of the op has to be adapted as well is not visible here.
    """
    x, = input_storage
    n_rows, n_cols = x.shape
    if n_cols % 3 != 0:
        raise ValueError("number of columns must be a multiple of 3")
    # View each row as (groups, 3). The original indexed e_x[3*i], which
    # selects whole rows (samples) rather than the columns of one sample, and
    # relied on an external symbolCount for the number of groups.
    grouped = x.reshape(n_rows, n_cols // 3, 3)
    # Subtract each group's own maximum for numerical stability.
    e_x = numpy.exp(grouped - grouped.max(axis=2, keepdims=True))
    sm = e_x / e_x.sum(axis=2, keepdims=True)
    output_storage[0][0] = sm.reshape(n_rows, n_cols)
With the current code, I am getting 'unorderable types:int()>str()' error when I use the predict method in lasagne.
For something like this you're probably better off constructing a custom softmax via symbolic expressions rather than creating (or modifying) an operation.
Your custom softmax can be defined in terms of symbolic expressions. Doing it this way will give you gradients (and other Theano operation bits and pieces) "for free" but might run slightly slower than a custom operation could.
Here's an example:
import numpy
import theano
import theano.tensor as tt
# Symbolic matrix input shared by the three softmax variants below.
x = tt.matrix()
# Use the built in softmax operation
y1 = tt.nnet.softmax(x)
# A regular softmax operation defined via ordinary Theano symbolic expressions:
# exponentiate, then divide every row by its own sum.
y2 = tt.exp(x)
y2 = y2 / y2.sum(axis=1)[:, None]
# Custom softmax operation
def custom_softmax(a):
    """Softmax normalized separately over columns 0-2 and columns 3 onward.

    The input is exponentiated, split at column 3, each part is divided by
    its own row sums, and the two parts are concatenated back along axis 1.
    """
    exps = tt.exp(a)
    head = exps[:, :3]
    tail = exps[:, 3:]
    head_norm = head / head.sum(axis=1)[:, None]
    tail_norm = tail / tail.sum(axis=1)[:, None]
    return tt.concatenate([head_norm, tail_norm], axis=1)
# Build the grouped softmax on the same symbolic input and compile a single
# function that returns all three variants.
y3 = custom_softmax(x)
f = theano.function([x], outputs=[y1, y2, y3])
# Two sample rows with six columns each, i.e. two groups of three per row.
x_value = [[.1, .2, .3, .4, .5, .6], [.1, .3, .5, .2, .4, .6]]
y1_value, y2_value, y3_value = f(x_value)
# The hand-written softmax must agree with the built-in one.
assert numpy.allclose(y1_value, y2_value)
# The grouped softmax keeps the shape of the regular one.
assert y3_value.shape == y1_value.shape
# Expected per-group denominators: a and b for the first row (columns 0-2 and
# 3-5), c and d for the second row.
a = numpy.exp(.1) + numpy.exp(.2) + numpy.exp(.3)
b = numpy.exp(.4) + numpy.exp(.5) + numpy.exp(.6)
c = numpy.exp(.1) + numpy.exp(.3) + numpy.exp(.5)
d = numpy.exp(.2) + numpy.exp(.4) + numpy.exp(.6)
# Each entry must equal its exponential divided by its own group's sum; the
# computed values are used as the assertion message on failure.
assert numpy.allclose(y3_value, [
[numpy.exp(.1) / a, numpy.exp(.2) / a, numpy.exp(.3) / a, numpy.exp(.4) / b, numpy.exp(.5) / b, numpy.exp(.6) / b],
[numpy.exp(.1) / c, numpy.exp(.3) / c, numpy.exp(.5) / c, numpy.exp(.2) / d, numpy.exp(.4) / d, numpy.exp(.6) / d]
]), y3_value