I have applied a Kalman filter to this dataset. Below is my code (note that I am including the entire code so that the results can be reproduced on another machine).
# Multi dimensional Kalman filter
import os
from math import *

class matrix:
    # implements basic operations of a matrix class
    def __init__(self, value):
        if isinstance(value, basestring):
            print "lol"
        self.value = value
        self.dimx = len(value)
        self.dimy = len(value[0])
        if value == [[]]:
            self.dimx = 0

    def zero(self, dimx, dimy):
        # check if valid dimensions
        if dimx < 1 or dimy < 1:
            raise ValueError, "Invalid size of matrix"
        else:
            self.dimx = dimx
            self.dimy = dimy
            self.value = [[0 for row in range(dimy)] for col in range(dimx)]

    def identity(self, dim):
        # check if valid dimension
        if dim < 1:
            raise ValueError, "Invalid size of matrix"
        else:
            self.dimx = dim
            self.dimy = dim
            self.value = [[0 for row in range(dim)] for col in range(dim)]
            for i in range(dim):
                self.value[i][i] = 1

    def show(self):
        for i in range(self.dimx):
            print self.value[i]
        print ' '

    def __add__(self, other):
        # check if correct dimensions
        if self.dimx != other.dimx or self.dimy != other.dimy:
            raise ValueError, "Matrices must be of equal dimensions to add"
        else:
            # add if correct dimensions
            res = matrix([[]])
            res.zero(self.dimx, self.dimy)
            for i in range(self.dimx):
                for j in range(self.dimy):
                    res.value[i][j] = self.value[i][j] + other.value[i][j]
            return res

    def __sub__(self, other):
        # check if correct dimensions
        if self.dimx != other.dimx or self.dimy != other.dimy:
            raise ValueError, "Matrices must be of equal dimensions to subtract"
        else:
            # subtract if correct dimensions
            res = matrix([[]])
            res.zero(self.dimx, self.dimy)
            for i in range(self.dimx):
                for j in range(self.dimy):
                    res.value[i][j] = self.value[i][j] - other.value[i][j]
            return res

    def __mul__(self, other):
        # check if correct dimensions
        if self.dimy != other.dimx:
            raise ValueError, "Matrices must be m*n and n*p to multiply"
        else:
            # multiply if correct dimensions
            res = matrix([[]])
            res.zero(self.dimx, other.dimy)
            for i in range(self.dimx):
                for j in range(other.dimy):
                    for k in range(self.dimy):
                        res.value[i][j] += self.value[i][k] * other.value[k][j]
            return res

    def transpose(self):
        # compute transpose
        res = matrix([[]])
        res.zero(self.dimy, self.dimx)
        for i in range(self.dimx):
            for j in range(self.dimy):
                res.value[j][i] = self.value[i][j]
        return res

    # Thanks to Ernesto P. Adorio for use of Cholesky and CholeskyInverse functions
    def Cholesky(self, ztol=1.0e-5):
        # Computes the upper triangular Cholesky factorization of
        # a positive definite matrix.
        res = matrix([[]])
        res.zero(self.dimx, self.dimx)
        for i in range(self.dimx):
            S = sum([(res.value[k][i])**2 for k in range(i)])
            d = self.value[i][i] - S
            if abs(d) < ztol:
                res.value[i][i] = 0.0
            else:
                if d < 0.0:
                    raise ValueError, "Matrix not positive-definite"
                res.value[i][i] = sqrt(d)
            for j in range(i+1, self.dimx):
                S = sum([res.value[k][i] * res.value[k][j] for k in range(self.dimx)])
                if abs(S) < ztol:
                    S = 0.0
                res.value[i][j] = (self.value[i][j] - S)/res.value[i][i]
        return res

    def CholeskyInverse(self):
        # Computes inverse of matrix given its Cholesky upper Triangular
        # decomposition of matrix.
        res = matrix([[]])
        res.zero(self.dimx, self.dimx)
        # Backward step for inverse.
        for j in reversed(range(self.dimx)):
            tjj = self.value[j][j]
            S = sum([self.value[j][k]*res.value[j][k] for k in range(j+1, self.dimx)])
            res.value[j][j] = 1.0/tjj**2 - S/tjj
            for i in reversed(range(j)):
                res.value[j][i] = res.value[i][j] = -sum([self.value[i][k]*res.value[k][j] for k in range(i+1, self.dimx)])/self.value[i][i]
        return res

    def inverse(self):
        aux = self.Cholesky()
        res = aux.CholeskyInverse()
        return res

    def __repr__(self):
        return repr(self.value)

########################################
# filter function
def kalman_filter(x, P):
    # measurement update
    y = measurements - H * x
    s = H * P * H.transpose() + R
    K = P * H.transpose() * s.inverse()
    x = x + K * y
    P = (I - K * H) * P
    # prediction
    x = F * x
    P = F * P * F.transpose()
    return x, P

files = []
x = matrix([[0.], [0.], [0.]])  # initial state (location and velocity)
P = matrix([[1000., 0., 0.], [0., 1000., 0.], [0., 0., 1000.]])  # initial uncertainty
u = matrix([[0.], [0.]])  # external motion
F = matrix([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])  # next state function
H = matrix([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])  # measurement function
R = matrix([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])  # measurement uncertainty
I = matrix([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])  # identity matrix

# collect the .txt files of the activity
for i in os.listdir("/home/fatima/Downloads/thesis/HMP_Dataset/Climb_stairs"):
    if i.endswith('.txt'):
        files.append(i)

for j in range(len(files)-1):
    print "iteration number"
    print j
    with open("/home/fatima/Downloads/thesis/HMP_Dataset/Climb_stairs/"+files[j]) as f:
        content = f.readlines()
    for e in range(len(content)-1):
        content1 = [z.strip() for z in content]
        content2 = content1[e].split(" ")
        # rescale raw counts (0..63) to the range -14.709 .. +14.709
        content2[0] = -14.709 + (float(content2[0])/63)*(2*14.709)
        content2[2] = -14.709 + (float(content2[2])/63)*(2*14.709)
        content2[1] = -14.709 + (float(content2[1])/63)*(2*14.709)
        measurements = matrix([[float(content2[0])], [float(content2[1])], [float(content2[2])]])
        print measurements
        x, P = kalman_filter(x, P)
        print x
The Kalman filter function is at the end of the code. I haven't added any noise to the state prediction because I am not sure how to do that.
About the dataset:
It is an ADL wrist-worn accelerometer dataset. There are 14 activities in total, and acceleration is recorded in the x, y and z directions. The acceleration values range from 0 to 63; for the calculation they are normalized to -14.7 to +14.7.
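For reference, the rescaling applied to each raw sample in the code above is the linear mapping below (shown here as a small standalone helper):

def rescale(raw):
    # map a raw count in [0, 63] linearly onto [-14.709, +14.709]
    return -14.709 + (float(raw) / 63) * (2 * 14.709)

assert abs(rescale(0) + 14.709) < 1e-9
assert abs(rescale(63) - 14.709) < 1e-9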
Question:
My question is whether I am headed in the right direction or not. Does the output seem to be correct? Any improvements?
The code looks good and you are definitely heading in the right direction!
Here are some things you can do to check your implementation:
Consider simple cases for which you know the solution, for example H = I, P = sigma² * I and R = sigma'² * I. The analysis x should tend toward the x of the previous time step as sigma tends to zero, and x should tend toward the measurements as sigma' tends to zero. Or, if sigma = sigma', the Kalman filter analysis should be the average of the previous state and the measurements, and P should be reduced by half. You might want to write a unit test for these cases (see the NumPy sketch after this list).
Check that the matrix P always stays symmetric and positive definite (all eigenvalues are positive).
Implement an alternate Kalman filter update step; see equation (28) on page 39 of the document http://modb.oce.ulg.ac.be/mediawiki/upload/Alex/AssimLecture/assim_lecture.pdf. Essentially, you can also compute the Kalman gain as:
K = inv(inv(P) + H' * inv(R) * H) * H' * inv(R)
where inv(P) is the inverse of the matrix P. Both forms should be equal up to numerical precision.
For the model noise, you can just add a small covariance matrix Q to the prediction equation: P = F * P * F' + Q. It is often a matrix proportional to the identity matrix. If Q is too small, the measurements will no longer affect the analysis after some iterations. If Q is too large, your model state x will be quite noisy.
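Here is a small NumPy sketch of these checks (the state and measurement values below are made up for illustration): it verifies the equal-noise averaging case, compares the two forms of the Kalman gain, and adds a process-noise term Q in the prediction.

import numpy as np

sigma2 = 4.0                              # sigma^2 = sigma'^2
x = np.array([[1.0], [2.0], [3.0]])       # prior state (made-up values)
z = np.array([[2.0], [0.0], [5.0]])       # measurement (made-up values)
H = np.eye(3)
P = sigma2 * np.eye(3)
R = sigma2 * np.eye(3)

# standard Kalman gain
S = H @ P @ H.T + R
K = P @ H.T @ np.linalg.inv(S)

# alternative form of the gain, equation (28) of the lecture notes above
K_alt = np.linalg.inv(np.linalg.inv(P) + H.T @ np.linalg.inv(R) @ H) @ H.T @ np.linalg.inv(R)
assert np.allclose(K, K_alt)

# measurement update
x_a = x + K @ (z - H @ x)
P_a = (np.eye(3) - K @ H) @ P
assert np.allclose(x_a, 0.5 * (x + z))    # average of prior state and measurement
assert np.allclose(P_a, 0.5 * P)          # uncertainty halved

# prediction with a small model-noise covariance Q added
F = np.eye(3)
Q = 0.01 * np.eye(3)                      # proportional to the identity (made-up value)
x_f = F @ x_a
P_f = F @ P_a @ F.T + Q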
Some coding remarks:
If measurements, H and R were also parameters of the kalman_filter function, your function would be more easily portable to a different case.
Are you aware of numpy for matrix operations? It has extensive support for matrices and arrays and uses highly optimized libraries for the computation (see the sketch below).
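For illustration, a NumPy version of your update with the measurement, H, R and Q passed in as arguments could look like the sketch below (a sketch only, not a drop-in replacement for your matrix class):

import numpy as np

def kalman_filter(x, P, measurement, F, H, R, Q):
    # measurement update
    y = measurement - H @ x
    S = H @ P @ H.T + R
    K = P @ H.T @ np.linalg.inv(S)
    x = x + K @ y
    P = (np.eye(len(x)) - K @ H) @ P
    # prediction
    x = F @ x
    P = F @ P @ F.T + Q
    return x, P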
Let me know if this helps!
I am trying to implement a custom loss function in a PyTorch autoencoder.
The loss function tries to maximize the cosine similarity between a given output tensor U (a vector) and 100 random vectors J where both U and J have the same dimension of [300]. This is repeated for each batch.
Suppose we have 30 items per batch, then the output tensor is
train_Y.shape = [30,300]
Random_vectors.shape = [30,100,300]
I can implement the loss function in two ways:
All_Y = []
for Y, z_r in zip(train_y, random_vectors):
    Y_cosine_list = []
    for z in z_r:
        cosi = torch.dot(Y, z) / (torch.norm(Y)*torch.norm(z))
        Y_cosine_list.append(cosi)
    All_Y.append(Y_cosine_list)
All_Y = torch.tensor(All_Y).to(device)
train_loss = torch.sum(torch.abs(All_Y))/dim_0
train_loss = torch.tensor(train_loss.data, requires_grad=True)
or
train_Y = torch.zeros([dim_0, 100])
for i, (Y, z_r) in enumerate(zip(train_Y, random_vectors)):
    for j, z in enumerate(z_r):
        train_Y[i, j] = cos(Y, z)
train_Y = train_Y.to(device)
train_loss = torch.sum(torch.abs(train_Y))/dim_0
The second one is more elegant and to the point. However, it gives a "CUDA illegal memory access" error. I have checked that the memory is not exceeded in either case. Is there anything wrong with the second implementation?
The first implementation is inelegant and I am not sure that it makes sense from a neural-net optimization perspective, but it does not give errors and I am able to complete training for all the epochs.
PS: I have tried encapsulating this code block in a loss_fn method but I get the same illegal memory access error.
I have tried everything I could find for the illegal memory access error (changing GPUs, removing a torch.stack block, etc.), but I can't seem to get rid of the problem.
Here is a vectorized way to do it
import torch
import torch.nn as nn

class CosineLoss(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x, y):
        """
        Args:
            x (torch.tensor): [batchsize, N, M] - tensor.
            y (torch.tensor): [batchsize, M] - tensor.
        Returns:
            torch.tensor: scalar mean cosine loss
        """
        # dot product along dimension 'm', i.e. multiply and sum along 'm'
        dotp = torch.einsum("bm, bnm -> bn", y, x)
        # L2 norm along dimension 'm', combined by broadcasting
        length = torch.norm(y, dim=-1)[:, None] * torch.norm(x, dim=-1)
        # cosine = dot product of unit vectors
        cos = dotp / length
        return cos.mean()

def test():
    b, n, m = 30, 100, 300
    train_Y = torch.randn(b, m, device='cuda')
    random_vectors = torch.randn(b, n, m, requires_grad=True, device='cuda')
    print(f'{random_vectors.grad = }')

    cosineloss = CosineLoss()
    loss = cosineloss(random_vectors, train_Y)
    print(f'{loss = }')

    loss.backward()
    print(f'{random_vectors.grad.shape = }')
References:
einsum
broadcasting
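If you prefer to avoid einsum, the same per-pair cosine values can also be computed with the built-in torch.nn.functional.cosine_similarity by expanding y to the shape of x (a rough sketch, assuming a reasonably recent PyTorch):

import torch
import torch.nn.functional as F

def cosine_loss(x, y):
    # x: [batchsize, N, M], y: [batchsize, M]
    y_exp = y.unsqueeze(1).expand_as(x)          # [batchsize, N, M]
    cos = F.cosine_similarity(x, y_exp, dim=-1)  # [batchsize, N]
    return cos.mean()

# quick check on random data with the shapes from the question
x = torch.randn(30, 100, 300)
y = torch.randn(30, 300)
print(cosine_loss(x, y))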
I want to calculate the relative error between two arrays. The pure numpy code is:
# a1, a2 are the two arrays
np.abs( 1-a2/a1 ).max()
How can I use numba.cuda to accelerate the above code?
My idea so far:
@cuda.jit
def calculate(a1, a2):
    start = cuda.blockDim.x*cuda.blockIdx.x + cuda.threadIdx.x
    grid = cuda.gridDim.x*cuda.blockDim.x
    for id in range(start, a1.size, grid):
        r = abs(1 - a2[id]/a1[id])

ca1 = cuda.to_device(a1)
ca2 = cuda.to_device(a2)
But how can I compare the r values between different threads?
One possible method to do this is to write your own shared memory parallel reduction.
As indicated in the comments, another possible method is to use numba's built-in reduce decorator.
Here is an example demonstrating both:
$ cat t79.py
from numba import cuda, float32, vectorize
import numpy as np
from numpy import random

# values of 0..10 are legal here
TPBP2 = 9
TPB = 2**TPBP2
TPBH = TPB//2
ds = 4096

# method 1: standard cuda parallel max-finding reduction
@cuda.jit
def max_error(a1, a2, err):
    s = cuda.shared.array(shape=(TPB), dtype=float32)
    x = cuda.grid(1)
    st = cuda.gridsize(1)
    tx = cuda.threadIdx.x
    s[tx] = 0
    cuda.syncthreads()
    for i in range(x, a1.size, st):
        s[tx] = max(s[tx], abs(1-a2[i]/a1[i]))
    mid = TPBH
    for i in range(TPBP2):
        cuda.syncthreads()
        if tx < mid:
            s[tx] = max(s[tx], s[tx+mid])
        mid >>= 1
    if tx == 0:
        err[cuda.blockIdx.x] = s[0]

# data
# for best performance we should choose blocks based on GPU occupancy
# but for demonstration since we don't know the GPU:
blocks = (ds+TPB-1)//TPB
a1 = np.random.rand(ds).astype(np.float32)
a1 += 1
a2 = np.random.rand(ds).astype(np.float32)
err = np.zeros(blocks).astype(np.float32)
# Start the kernel
max_error[blocks, TPB](a1, a2, err)
# we could perform another stage of GPU reduction here, but for simplicity:
my_err = np.max(err)
print(my_err)

# method 2: using numba features
@vectorize(['float32(float32,float32)'], target='cuda')
def my_error(a1, a2):
    return abs(1-a2/a1)

@cuda.reduce
def max_reduce(a, b):
    return max(a, b)

r = my_error(a1, a2)
my_err = max_reduce(r)
print(my_err)
$ python t79.py
0.9999707
0.9999707
$
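As a quick sanity check, you can compare both GPU results against your original NumPy expression (using the same a1 and a2 arrays):

# reference result computed on the host
ref = np.abs(1 - a2/a1).max()
print(ref, np.isclose(ref, my_err))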
I have to solve the following boundary value problem:
-d²u/dx² + 6*u = (-4*x^2 - 6)*exp(x^2) on [-1, 1], with u(-1) = 0 and u(1) = 0
It is also defined in my Matlab code below, but my code doesn't work: I don't get the approximate solution of my system.
I want to know where the problem in my code is, or whether the version of Matlab I have simply cannot handle the kind of function I have used. Thanks.
Explanation of the method I have used: I used the finite element method, i.e. the Galerkin method, based on the assembly of the elementary and stiffness matrices. I multiplied the system by a weight (test) function which satisfies the boundary conditions, then integrated over the elements (integration of the elementary matrices over the range ]-1, 1[). I have four elementary matrices. For more information about the method I used, please check this paper (pages 6-8).
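For reference, multiplying the equation by a test function v with v(-1) = v(1) = 0 and integrating by parts gives the weak form that the elementary matrices in the code are meant to discretize (the standard derivation, written here for clarity):

\int_{-1}^{1} u'(x)\, v'(x)\, dx + 6 \int_{-1}^{1} u(x)\, v(x)\, dx = \int_{-1}^{1} (-4x^2 - 6)\, e^{x^2}\, v(x)\, dx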
Note: the error I get when running my code is
The current use of "MatElt2Nd" is inconsistent with its previous use or definition in line 7
Code
function [U] = EquaDiff2(n)
% ----------------------------------
% -d²u/dx² + 6*u = (-4*x^2-6)exp(x^2)
% u(-1) = 0   u(1) = 0
%----------------------------------
function [Ke, Fe] = MatElt2Nd(x1,x2)
% declaration of the function:
% computes the elementary matrices (used for the assembly matrix)
% ----------------------------------
x = [-1:2/n:1]';       % modification of the integration bounds
K = zeros(n+1);
F = zeros(n+1,1);
for i = 1:n
    j = i+1;
    t = [i j];
    x1 = x(i);
    x2 = x(j);
    [Ke, Fe] = MatElt2Nd(x1,x2);
    K(t,t) = K(t,t) + Ke;
    F(t) = F(t) + Fe;
end
K(1,:) = [];
K(:,1) = [];
F(1) = [];
U = K\F;
U = [0.0; U];
t = 0:0.01:1;
return
%-------------------------------------------
% calculation of matrix Ke and vector Fe
%-------------------------------------------
function [Ke,Fe] = MatElt2Nd0(x1,x2)
% newly named nested function is introduced
Ke1 = 1/(x2-x1)*[ 1 -1      % no modification done:
                 -1  1 ];   % essentially only the elementary
Ke2 = (x2-x1)*[ 2 1         % matrices
                1 2 ];
N = [(x-x2)/(x1-x2)  (x-x1)/(x2-x1)]                       % shape functions
Fe = simple( int(N' * (-4*x^2-6)*exp(x^2), x, x1, x2) )    % vector Fe
Ke = Ke1 + 6*Ke2;
return
Edit: I have found a general code for this, but I can't modify it to solve my system. Any help?
General Code
% au'(x)+bu"(x)=0 for 0<=x<=d
% BC: u(0)=0 and u(d)=h
%==============================================================
% ======Example======
% Finding an approximate solution to the following BVP using 4 elements of
% equal length.
% u'(x)-u"(x)=0 : 0<=x<=1
% BC: u(0)=0 and u(1)=1
% Solution:
% >> Galerkin(4,1,-1,1,1)
% ==============================================================
% The output of this program is
% 1- The approximate solution (plotted in blue)
% 2- The exact solution (plotted in red)
% 3- The percentage error (plotted in magenta)
%=======================Program Begin==========================
function Galerkin(ne1,a,b,d,h) % Declare function
clc % Clear workspace
% Define the coefficients of the exact solution
% The exact solution is: u(x)=C1+C2*exp(-ax/b)
% where C2=h/(exp(-a*d/b)-1) and C1=-C2
C2 = h/((exp(-a*d/b))-1);
C1 = -C2;
% Define element length
le = d/ne1;
% Define x matrix
x = zeros(ne1+1,1);
for i = 2:ne1+1
    x(i,1) = x(i-1,1)+le;
end
% K1 matrix corresponding to the diffusion term (u"(x))
K1 = (b/le) * [1,-1;-1,1]
% K2 matrix corresponding to the convection term (u'(x))
K2 = a*[-1/2 1/2;-1/2 1/2]
% Element stiffness matrix
Ke = K1+K2
% Global stiffness matrix
%********************Begin Assembly***************************
k = zeros(ne1+1);
for i = 1:ne1+1
    for j = 1:ne1+1
        if (i==j)
            if (i==1)
                k(i,j) = Ke(1,1);
            elseif (i==ne1+1)
                k(i,j) = Ke(2,2);
            else
                k(i,j) = Ke(1,1)+Ke(2,2);
            end
        elseif (i==j+1)
            k(i,j) = Ke(1,2);
        elseif (j==i+1)
            k(i,j) = Ke(2,1);
        else
            k(i,j) = 0;
        end
    end
end
%********************End Assembly*****************************
% The global f matrix
f = zeros(ne1+1,1);
% BC apply u(0) = 0
f(1,1) = 0;
% BC apply u(d) = h
f(ne1+1,1) = h;
% Display the global stiffness matrix before striking row
K_Global = k
% Striking first row (u1=0)
k(1,1) = 1;
for i = 2:ne1+1
    k(1,i) = 0;
    k(ne1+1,i) = 0;
end
k(ne1+1,ne1+1) = 1;
% Display the solvable stiffness matrix
K_strike = k
% Solving the result and finding the displacement matrix, {u}
u = inv(k)*f
hold on
% ======Calculating Approximate Solution and plotting============
syms X
U_sym = sym(zeros(ne1,1));
dU_sym = sym(zeros(ne1,1));
for i = 1:ne1
    N1x = 1-((X-x(i))/le);
    N2x = (X-x(i))/le;
    U_X = (u(i)*N1x)+(u(i+1)*N2x);
    U_sym(i) = U_X;
    dU_sym(i) = diff(U_sym(i));
    subplot(3,1,1)
    hold on
    ezplot(U_sym(i),[x(i) x(i+1)])
    subplot(3,1,2)
    hold on
    % du/dx approximate
    ezplot(dU_sym(i),[x(i) x(i+1)])
end
I'm following the Keras blog post code to visualize the features learned and the activations at different layers. The code randomly generates a gray image of dimension (1, 3, img_width, img_height) and visualizes it. Here it is:
from __future__ import print_function

from scipy.misc import imsave
import numpy as np
import time
from keras.applications import vgg16
from keras import backend as K

# dimensions of the generated pictures for each filter.
img_width = 128
img_height = 128

# the name of the layer we want to visualize
# (see model definition at keras/applications/vgg16.py)
layer_name = 'block5_conv1'

# util function to convert a tensor into a valid image
def deprocess_image(x):
    # normalize tensor: center on 0., ensure std is 0.1
    x -= x.mean()
    x /= (x.std() + 1e-5)
    x *= 0.1

    # clip to [0, 1]
    x += 0.5
    x = np.clip(x, 0, 1)

    # convert to RGB array
    x *= 255
    if K.image_data_format() == 'channels_first':
        x = x.transpose((1, 2, 0))
    x = np.clip(x, 0, 255).astype('uint8')
    return x

# build the VGG16 network with ImageNet weights
model = vgg16.VGG16(weights='imagenet', include_top=False)
print('Model loaded.')
model.summary()

# this is the placeholder for the input images
input_img = model.input

# get the symbolic outputs of each "key" layer (we gave them unique names).
layer_dict = dict([(layer.name, layer) for layer in model.layers[1:]])

def normalize(x):
    # utility function to normalize a tensor by its L2 norm
    return x / (K.sqrt(K.mean(K.square(x))) + 1e-5)

kept_filters = []
for filter_index in range(0, 200):
    # we only scan through the first 200 filters,
    # but there are actually 512 of them
    print('Processing filter %d' % filter_index)
    start_time = time.time()

    # we build a loss function that maximizes the activation
    # of the nth filter of the layer considered
    layer_output = layer_dict[layer_name].output
    if K.image_data_format() == 'channels_first':
        loss = K.mean(layer_output[:, filter_index, :, :])
    else:
        loss = K.mean(layer_output[:, :, :, filter_index])

    # we compute the gradient of the input picture wrt this loss
    grads = K.gradients(loss, input_img)[0]

    # normalization trick: we normalize the gradient
    grads = normalize(grads)

    # this function returns the loss and grads given the input picture
    iterate = K.function([input_img], [loss, grads])

    # step size for gradient ascent
    step = 1.

    # we start from a gray image with some random noise
    if K.image_data_format() == 'channels_first':
        input_img_data = np.random.random((1, 3, img_width, img_height))
    else:
        input_img_data = np.random.random((1, img_width, img_height, 3))
    input_img_data = (input_img_data - 0.5) * 20 + 128

    # we run gradient ascent for 20 steps
    for i in range(20):
        loss_value, grads_value = iterate([input_img_data])
        input_img_data += grads_value * step

        print('Current loss value:', loss_value)
        if loss_value <= 0.:
            # some filters get stuck to 0, we can skip them
            break

    # decode the resulting input image
    if loss_value > 0:
        img = deprocess_image(input_img_data[0])
        kept_filters.append((img, loss_value))
    end_time = time.time()
    print('Filter %d processed in %ds' % (filter_index, end_time - start_time))

# we will stitch the best 64 filters on a 8 x 8 grid.
n = 8

# the filters that have the highest loss are assumed to be better-looking.
# we will only keep the top 64 filters.
kept_filters.sort(key=lambda x: x[1], reverse=True)
kept_filters = kept_filters[:n * n]

# build a black picture with enough space for
# our 8 x 8 filters of size 128 x 128, with a 5px margin in between
margin = 5
width = n * img_width + (n - 1) * margin
height = n * img_height + (n - 1) * margin
stitched_filters = np.zeros((width, height, 3))

# fill the picture with our saved filters
for i in range(n):
    for j in range(n):
        img, loss = kept_filters[i * n + j]
        stitched_filters[(img_width + margin) * i: (img_width + margin) * i + img_width,
                         (img_height + margin) * j: (img_height + margin) * j + img_height, :] = img

# save the result to disk
imsave('stitched_filters_%dx%d.png' % (n, n), stitched_filters)
Could you please let me know how I can modify these statements in the code:
input_img_data = np.random.random((1, img_width, img_height, 3))
input_img_data = (input_img_data - 0.5) * 20 + 128
so that I can insert my own data and visualize the features learned and the activations? My image is an RGB image of dimensions 150 x 150. Thanks for your assistance.
If you want to process a single image:
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
img = load_img('data/XXXX.jpg') # this is a PIL image
x = img_to_array(img)
x = x.reshape((1,) + x.shape)
If you want to process a batch:
from keras.preprocessing.image import ImageDataGenerator

data_gen_args = dict(featurewise_center=True,
                     featurewise_std_normalization=True,
                     rotation_range=90.,
                     width_shift_range=0.1,
                     height_shift_range=0.1,
                     zoom_range=0.2)
image_datagen = ImageDataGenerator(**data_gen_args)
image_generator = image_datagen.flow_from_directory(
    'data/images',
    class_mode=None,
    seed=seed)
See the documentation: https://keras.io/preprocessing/image/#imagedatagenerator
Update
# we start from a gray image with some random noise
if K.image_data_format() == 'channels_first':
    img = load_img('images/1/1.png')  # this is a PIL image
    x = img_to_array(img)
    x = x.reshape((1,) + x.shape)
else:
    # input_img_data = np.random.random((1, img_width, img_height, 3))
    img = load_img('images/1/1.png')  # this is a PIL image
    x = img_to_array(img)
    x = x.reshape((1,) + x.shape)
input_img_data = x
input_img_data = (input_img_data - 0.5) * 20 + 128
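Note that img_to_array already returns pixel values in [0, 255], so for a real image you probably do not want to keep the (input_img_data - 0.5) * 20 + 128 rescaling that was written for random noise. For a 150 x 150 RGB image, something like the sketch below should work (the path is a placeholder; also set img_width and img_height accordingly at the top of the script):

from keras.preprocessing.image import load_img, img_to_array

img_width = img_height = 150                    # match your image size
img = load_img('images/1/1.png',                # placeholder path
               target_size=(img_width, img_height))
input_img_data = img_to_array(img).reshape((1, img_width, img_height, 3))
# no "(input_img_data - 0.5) * 20 + 128" rescaling here: the array is already in [0, 255]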
I have just begun using Lasagne and Theano to do some machine learning in Python.
I am trying to modify the softmax class in Theano. I want to change how the activation function (softmax) is calculated. Instead of dividing e_x by e_x.sum(axis=1), I want to divide e_x by the sum of three consecutive numbers.
For instance, the result will be as follows:
sm[0] = e_x[0]/(e_x[0]+e_x[1]+e_x[2])
sm[1] = e_x[1]/(e_x[0]+e_x[1]+e_x[2])
sm[2] = e_x[2]/(e_x[0]+e_x[1]+e_x[2])
sm[3] = e_x[3]/(e_x[3]+e_x[4]+e_x[5])
sm[4] = e_x[4]/(e_x[3]+e_x[4]+e_x[5])
sm[5] = e_x[5]/(e_x[3]+e_x[4]+e_x[5])
and so on...
The problem is that I cannot quite grasp how theano carries out the computation.
Here is my main question. Does it suffice to just change the perform() function in the softmax class?
Here is the original perform() function:
def perform(self, node, input_storage, output_storage):
    x, = input_storage
    e_x = numpy.exp(x - x.max(axis=1)[:, None])
    sm = e_x / e_x.sum(axis=1)[:, None]
    output_storage[0][0] = sm
Here is my modified perform()
def myPerform(self, node, input_storage, output_storage):
    x, = input_storage
    e_x = numpy.exp(x - x.max(axis=1)[:, None])
    sm = numpy.zeros_like(e_x)
    for i in range(0, symbolCount):
        total = e_x[3*i] + e_x[3*i+1] + e_x[3*i+2]
        sm[3*i] = e_x[3*i]/total
        sm[3*i+1] = e_x[3*i+1]/total
        sm[3*i+2] = e_x[3*i+2]/total
    output_storage[0][0] = sm
With the current code, I am getting an 'unorderable types: int() > str()' error when I use the predict method in Lasagne.
For something like this you're probably better off constructing a custom softmax via symbolic expressions rather than creating (or modifying) an operation.
Your custom softmax can be defined in terms of symbolic expressions. Doing it this way will give you gradients (and other Theano operation bits and pieces) "for free" but might run slightly slower than a custom operation could.
Here's an example:
import numpy
import theano
import theano.tensor as tt

x = tt.matrix()

# Use the built in softmax operation
y1 = tt.nnet.softmax(x)

# A regular softmax operation defined via ordinary Theano symbolic expressions
y2 = tt.exp(x)
y2 = y2 / y2.sum(axis=1)[:, None]

# Custom softmax operation
def custom_softmax(a):
    b = tt.exp(a)
    b1 = b[:, :3] / b[:, :3].sum(axis=1)[:, None]
    b2 = b[:, 3:] / b[:, 3:].sum(axis=1)[:, None]
    return tt.concatenate([b1, b2], axis=1)

y3 = custom_softmax(x)

f = theano.function([x], outputs=[y1, y2, y3])

x_value = [[.1, .2, .3, .4, .5, .6], [.1, .3, .5, .2, .4, .6]]
y1_value, y2_value, y3_value = f(x_value)
assert numpy.allclose(y1_value, y2_value)
assert y3_value.shape == y1_value.shape

a = numpy.exp(.1) + numpy.exp(.2) + numpy.exp(.3)
b = numpy.exp(.4) + numpy.exp(.5) + numpy.exp(.6)
c = numpy.exp(.1) + numpy.exp(.3) + numpy.exp(.5)
d = numpy.exp(.2) + numpy.exp(.4) + numpy.exp(.6)
assert numpy.allclose(y3_value, [
    [numpy.exp(.1) / a, numpy.exp(.2) / a, numpy.exp(.3) / a, numpy.exp(.4) / b, numpy.exp(.5) / b, numpy.exp(.6) / b],
    [numpy.exp(.1) / c, numpy.exp(.3) / c, numpy.exp(.5) / c, numpy.exp(.2) / d, numpy.exp(.4) / d, numpy.exp(.6) / d]
]), y3_value
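If you need this for an arbitrary number of three-column groups (assuming the number of columns is a multiple of three), the same idea can be written more generally by reshaping, applying the ordinary softmax, and reshaping back; continuing the example above:

def grouped_softmax(a, group_size=3):
    # fold each run of `group_size` consecutive columns into its own row,
    # apply the regular softmax there, then restore the original shape
    b = a.reshape((-1, group_size))
    return tt.nnet.softmax(b).reshape(a.shape)

y4 = grouped_softmax(x)
g = theano.function([x], y4)
assert numpy.allclose(g(x_value), y3_value)   # same grouping as custom_softmax for 6 columns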