Cython code using prange unexpectedly runs only on a single thread

I have Cython code that I am trying to parallelize using a prange loop. The code compiles, but when I run it, it uses only a single thread/core. I read in other posts that this is most often caused by the GIL not being properly released, but when I look at my code I do not see where that happens. Do you have any idea what is wrong with my code?
UPDATE:
compiler: gcc 7.5
cython: 0.29.21
OS: ubuntu 20.04
Cython code:
import cython

from cython.parallel import prange

cimport numpy as cnp
import numpy as np

cdef extern from "math.h" nogil:
    double floor(double x)
    double ceil(double x)
    double sqrt(double x)

cdef inline double round(double r) nogil:
    return floor(r + 0.5) if (r > 0.0) else ceil(r - 0.5)

@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
cdef int atoms_in_shell_inner(double[:,:] coords, int[:,:] indexes, double[:,:] cell, double[:,:] rcell, double[:] boxed_center, double r2, int mol_idx) nogil:

    cdef double r, rx, ry, rz, sdx, sdy, sdz, x, y, z, x_boxed, y_boxed, z_boxed
    cdef int at_idx, i

    # Loop over the selected atoms j of molecule i
    for 0 <= i < indexes.shape[0]:

        if indexes[mol_idx,i] == -1:
            return 0

        at_idx = indexes[mol_idx,i]

        x = coords[at_idx,0]
        y = coords[at_idx,1]
        z = coords[at_idx,2]

        # Convert real coordinates to box coordinates
        x_boxed = x*rcell[0,0] + y*rcell[0,1] + z*rcell[0,2]
        y_boxed = x*rcell[1,0] + y*rcell[1,1] + z*rcell[1,2]
        z_boxed = x*rcell[2,0] + y*rcell[2,1] + z*rcell[2,2]

        sdx = x_boxed - boxed_center[0]
        sdy = y_boxed - boxed_center[1]
        sdz = z_boxed - boxed_center[2]

        # Apply the PBC to the box-coordinate distance vector between atom j and the center of the shell
        sdx -= round(sdx)
        sdy -= round(sdy)
        sdz -= round(sdz)

        # Convert the box-coordinate distance vector back to a real-coordinate distance vector
        rx = sdx*cell[0,0] + sdy*cell[0,1] + sdz*cell[0,2]
        ry = sdx*cell[1,0] + sdy*cell[1,1] + sdz*cell[1,2]
        rz = sdx*cell[2,0] + sdy*cell[2,1] + sdz*cell[2,2]

        # Compute the squared norm of the distance vector in real coordinates
        r = rx*rx + ry*ry + rz*rz

        # If the distance is below the cutoff, mark molecule i as being in the shell
        if r < r2:
            return 1

    return 0
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def atoms_in_shell(double[:,:] coords,
                   double[:,:] cell,
                   double[:,:] rcell,
                   int[:,:] indexes,
                   cnp.int32_t center,
                   cnp.float64_t radius):

    cdef int i, n_molecules

    cdef double[:] shell_center = coords[center,:]
    cdef double[:] boxed_center = np.empty(3, dtype=np.float64)
    cdef int[:] in_shell = np.zeros(indexes.shape[0], dtype=np.int32)

    n_molecules = indexes.shape[0]

    boxed_center[0] = shell_center[0]*rcell[0,0] + shell_center[1]*rcell[0,1] + shell_center[2]*rcell[0,2]
    boxed_center[1] = shell_center[0]*rcell[1,0] + shell_center[1]*rcell[1,1] + shell_center[2]*rcell[1,2]
    boxed_center[2] = shell_center[0]*rcell[2,0] + shell_center[1]*rcell[2,1] + shell_center[2]*rcell[2,2]

    # Loop over the molecules
    for i in prange(n_molecules, nogil=True):
        in_shell[i] = atoms_in_shell_inner(coords, indexes, cell, rcell, boxed_center, radius*radius, i)

    return in_shell.base
setup.py file:
from Cython.Distutils import build_ext
from distutils.core import setup, Extension
import numpy

INCLUDE_DIR = [numpy.get_include()]

EXTENSIONS = [Extension('atoms_in_shell',
                        include_dirs=INCLUDE_DIR,
                        sources=["atoms_in_shell.pyx"],
                        extra_compile_args=["-O3", "-ffast-math", "-march=native", "-fopenmp"],
                        extra_link_args=['-fopenmp']),
              ]

setup(ext_modules=EXTENSIONS, cmdclass={'build_ext': build_ext})
Python code:
from atoms_in_shell import atoms_in_shell
import numpy as np
coords = np.random.uniform(-1000.0,1000.0,(500000000,3))
cell = np.identity(3)
rcell = np.identity(3)
indexes = np.empty((10000,6),dtype=np.int32)
indexes.fill(-1)
for row in indexes:
    n_atoms = np.random.randint(1, 6)
    row[:n_atoms] = np.random.choice(coords.shape[0]-1, n_atoms)
print(atoms_in_shell(coords, cell, rcell, indexes, 5, 1))
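A quick way to check whether the OpenMP runtime is actually being used (a minimal diagnostic sketch, not part of the original code; it assumes the extension is built with the same -fopenmp flags as in the setup.py above) is to record which thread handles each iteration of a prange loop:
import numpy as np
from cython.parallel import prange, threadid
cimport openmp

def check_threads(int n):
    cdef int i
    cdef int[:] tid = np.zeros(n, dtype=np.int32)
    print("max OpenMP threads:", openmp.omp_get_max_threads())
    for i in prange(n, nogil=True):
        tid[i] = threadid()
    # More than one distinct id here means the loop really ran in parallel.
    print("thread ids seen:", np.unique(np.asarray(tid)))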

Related

Why is the time identical for the Cython and Python versions of this function?

I am just starting out learning about Cython. I have some code written in pure Python, and started to convert it to Cython. The first function is below (both versions). I expected Cython to be faster, but there is no difference in speed. Why is this? Am I doing something wrong, or is it expected?
import numpy as np
cimport numpy as np
import time

ctypedef np.int8_t DTYPE_int

cpdef np.ndarray[DTYPE_int, ndim=2] init_population(int N, int pop_size):
    cdef np.ndarray[DTYPE_int, ndim=2] p
    cdef np.ndarray[DTYPE_int, ndim=2] idx
    cdef double t1

    t1 = time.time()
    p = np.full((pop_size, N), np.arange(N, dtype=np.int8))
    idx = np.random.rand(p.shape[0], p.shape[1]).argsort(axis=1).astype(np.int8)
    print("%.20f" % (time.time() - t1))
    return np.take_along_axis(p, idx, axis=1)

def py_init_population(N, pop_size):
    t1 = time.time()
    p = np.full((pop_size, N), np.arange(N, dtype=np.int8))
    idx = np.random.rand(p.shape[0], p.shape[1]).argsort(axis=1).astype(np.int8)
    print("%.20f" % (time.time() - t1))
    return np.take_along_axis(p, idx, axis=1)

init_population(1000, 1000)
py_init_population(1000, 1000)
Output:
7.24499845504760742188
7.26293945312500000000
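(Not part of the original post.) Both versions above spend essentially all their time inside NumPy calls (np.full, argsort, take_along_axis), which already run as compiled code, so typing the wrapper in Cython changes little. Cython typically pays off when there is an explicit Python-level loop for it to compile away, roughly like this sketch:
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef double sum_of_squares(double[:] x):
    # The explicit loop below compiles to plain C; a pure-Python version of the
    # same loop would pay Python-object overhead on every iteration.
    cdef double total = 0.0
    cdef Py_ssize_t i
    for i in range(x.shape[0]):
        total += x[i] * x[i]
    return total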

Integrating a function with very large parameter values in Python

I have a function defined as below
\begin{equation}
f(x) = e^{-k1/x}x^{-2}(k1/x+56)^{81}
\end{equation}
Now I want to integrate the function from 0 to infinity.
\begin{equation}
S = \int^{\infty}_{0} f(x)\, dx
\end{equation}
And then I want to find the cumulative function defined as below
\begin{equation}
CDF(p) = \int^{p}_{0} \frac{f(x)}{S} dx
\end{equation}
To do so, I wrote a program in Python.
from matplotlib import pyplot as plt
from scipy.integrate import quad
from math import pi, exp
import numpy as np

def func(x, k1, n):
    w = -1.8*n + 15  # scale the function down.
    return (10**w)*exp(-k1/x)*x**(-2)*(k1/x+56)**n

def S(k1, n):
    return quad(func, 0, 1e+28, args=(k1, n))[0] + quad(func, 1e+28, 1e+33, args=(k1, n))[0]

def CDF(x, k1, n):
    return quad(func, 0, x, args=(k1, n))[0]/S(k1, n)

k1 = 7.7e+27  # When it's <3, CDF does not generate error.
n = 81
print(S(k1, n))
print(CDF(1.1e+27, k1, n))
But unfortunately, the CDF(1.1e+27) throws the error "results out of range".
How could I obtain CDF(1.1e+27)?
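(A sketch, not from the original post.) One way to tame the huge parameter is the substitution u = k1/x, which gives dx = -(k1/u^2) du and therefore
\begin{equation}
\int^{p}_{0} f(x)\, dx = \frac{1}{k1}\int^{\infty}_{k1/p} e^{-u}(u+56)^{81}\, du,
\end{equation}
where the 1/k1 factor cancels in the CDF. The transformed integrand peaks near u = 25 and, evaluated in log space, stays within double precision, so quad can handle it directly:
from scipy.integrate import quad
import numpy as np

k1 = 7.7e+27
n = 81

def g(u):
    # log-space evaluation of e**(-u) * (u + 56)**n, safe for all u >= 0
    return np.exp(-u + n*np.log(u + 56.0))

norm, _ = quad(g, 0.0, np.inf)        # equals k1*S

def CDF(p):
    tail, _ = quad(g, k1/p, np.inf)   # equals k1 times the integral of f from 0 to p
    return tail / norm

print(CDF(1.1e+27))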

Bug in the translation of a C file to a Cython file

I've succeeded in converting a C file to Cython, but the two codes give me different results and I really cannot find where the bug is.
The relevant C code of my script is the following:
double empirical_measure(int N, double K, double d, double T, double dt, FILE *store){
    /* Define variables */
    int kt;
    int kt_max = (int) ((double)T/(double)dt);

    double *xt;
    xt = (double *)malloc(N*sizeof(double));

    double bruit;
    double S = 0.0;
    double xtmp;
    double xtilde;
    double x_diff;
    double xi;

    int i;
    int j;
    int l;

    /* Initial condition */
    for(i=0; i<N; i++){
        xt[i] = rand()/((double) RAND_MAX)*2*M_PI;
    }

    /* Compute trajectories and empirical measure */
    for(kt=0; kt<kt_max; kt++){
        for(i=0; i<N; i++){
            S = 0.0;
            xi = xt[i];
            for(j=0; j<N; j++){
                x_diff = xt[j] - xi;
                S = S + sin(x_diff);
            }
            bruit = d*sqrt(dt)*gaussian();
            xtilde = xi + ((K/N)*S)*dt + bruit;
            xt[i] = fmod(xtilde, 2.0*M_PI);
        }
    }
    return 0;
}
The gaussian function returns a random number drawn from a Normal(0,1) distribution.
I translated it into the following Cython code (where the output is a matrix holding xt[:,k] for every k, rather than, at each k, a single vector as in the C code):
def simul(int N, double K, double d, double T, double dt):
    cdef int kt_max = int(T/dt)
    cdef double S1
    cdef double xtilde, x_diff, xtmp
    cdef double[:] bruit = d*sqrt(dt)*np.random.standard_normal(N)  # bruit generator
    cdef double[:, ::1] xt = np.zeros((N, kt_max), dtype=np.float64)
    cdef int kt, i, j, k

    # initial conditions
    X = np.random.uniform(0, 2*np.pi, N)
    for i in range(N):
        xt[i, 0] = X[i]

    # Compute trajectories and empirical measure
    for kt in range(kt_max-1):
        for i in range(N):
            S1 = 0.0
            for j in range(N):
                x_diff = xt[j, kt] - xt[i, kt]
                S1 = S1 + sin(x_diff)
            xtilde = xt[i, kt] + ((K/N)*S1)*dt + bruit[i]
            xt[i, kt+1] = xtilde % (2*np.pi)
    return xt
The problem is that if I run the two scripts with the same values, I get really different results. For example, given:
N=600
K=5
T=2
dt=0.01
d=1
For the last k, the C code gives one distribution while the Cython code gives a clearly different one (histograms not shown here). I really can't find the problem in the code. Where could the bug be?
Update
If I run the code with d=0 (which means that the "bruit part" can be neglected) I still obtain different results:
Histogram for d=0 (figure not shown): blue is the C simulation and the other colors are three simulations from Python.
This means that there's something wrong in this section:
for kt in range(kt_max-1):
    for i in range(N):
        S1 = 0.0
        for j in range(N):
            x_diff = xt[j, kt] - xt[i, kt]
            S1 = S1 + sin(x_diff)
        xtilde = xt[i, kt] + ((K/N)*S1)*dt
        xt[i, kt+1] = xtilde % (2*np.pi)
Any ideas? Does the sin function need some special treatment of its argument in the code?
I've also computed the sin sum on its own ("sin sum simulations" plot not shown: black is the C code, the rest are the Python simulations).
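(Not from the original post.) Two concrete differences between the listings above, shown as a sketch of a translation that stays closer to the C loop: the C code updates xt[i] in place, so atoms later in the same time step already see the updated values, and it draws fresh Gaussian noise for every atom at every step, while the Cython version reads only the previous column and reuses one noise vector for all steps; also C's fmod keeps the sign of its argument, whereas Python's % maps into [0, 2*pi), so the raw values live on different ranges.
import numpy as np
from libc.math cimport sin, fmod, M_PI

def simul_like_c(int N, double K, double d, double T, double dt):
    cdef int kt_max = int(T/dt)
    cdef double[:] xt = np.random.uniform(0.0, 2.0*np.pi, N)
    cdef double[:, ::1] history = np.zeros((N, kt_max))
    cdef double[:] bruit
    cdef double S, xi, xtilde
    cdef int kt, i, j
    for kt in range(kt_max):
        # fresh Gaussian noise at every time step, as in the C code
        bruit = d*np.sqrt(dt)*np.random.standard_normal(N)
        for i in range(N):
            S = 0.0
            xi = xt[i]
            for j in range(N):
                S += sin(xt[j] - xi)        # xt[j] for j < i is already updated, as in C
            xtilde = xi + ((K/N)*S)*dt + bruit[i]
            xt[i] = fmod(xtilde, 2.0*M_PI)  # fmod keeps the sign, like the C code
            history[i, kt] = xt[i]
    return np.asarray(history)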

Modifying perform function in Theano.tensor.nnet.softmax

I have just begun using Lasagne and Theano to do some machine learning in Python.
I am trying to modify the softmax class in Theano. I want to change how the activation function (softmax) is calculated: instead of dividing e_x by e_x.sum(axis=1), I want to divide e_x by the sum of three consecutive numbers.
For instance, the result will be as follows:
sm[0] = e_x[0]/(e_x[0]+e_x[1]+e_x[2])
sm[1] = e_x[1]/(e_x[0]+e_x[1]+e_x[2])
sm[2] = e_x[2]/(e_x[0]+e_x[1]+e_x[2])
sm[3] = e_x[3]/(e_x[3]+e_x[4]+e_x[5])
sm[4] = e_x[4]/(e_x[3]+e_x[4]+e_x[5])
sm[5] = e_x[5]/(e_x[3]+e_x[4]+e_x[5])
and so on...
The problem is that I cannot quite grasp how Theano carries out the computation.
Here is my main question: does it suffice to just change the perform() function in the softmax class?
Here is the original perform() function:
def perform(self, node, input_storage, output_storage):
    x, = input_storage
    e_x = numpy.exp(x - x.max(axis=1)[:, None])
    sm = e_x / e_x.sum(axis=1)[:, None]
    output_storage[0][0] = sm
Here is my modified perform()
def myPerform(self, node, input_storage, output_storage):
    x, = input_storage
    e_x = numpy.exp(x - x.max(axis=1)[:, None])
    sm = numpy.zeros_like(e_x)
    for i in range(0, symbolCount):
        total = e_x[3*i] + e_x[3*i+1] + e_x[3*i+2]
        sm[3*i] = e_x[3*i]/total
        sm[3*i+1] = e_x[3*i+1]/total
        sm[3*i+2] = e_x[3*i+2]/total
    output_storage[0][0] = sm
With the current code, I am getting an 'unorderable types: int() > str()' error when I use the predict method in Lasagne.
For something like this you're probably better off constructing a custom softmax via symbolic expressions rather than creating (or modifying) an operation.
Your custom softmax can be defined in terms of symbolic expressions. Doing it this way will give you gradients (and other Theano operation bits and pieces) "for free" but might run slightly slower than a custom operation could.
Here's an example:
import numpy
import theano
import theano.tensor as tt

x = tt.matrix()

# Use the built-in softmax operation
y1 = tt.nnet.softmax(x)

# A regular softmax operation defined via ordinary Theano symbolic expressions
y2 = tt.exp(x)
y2 = y2 / y2.sum(axis=1)[:, None]

# Custom softmax operation
def custom_softmax(a):
    b = tt.exp(a)
    b1 = b[:, :3] / b[:, :3].sum(axis=1)[:, None]
    b2 = b[:, 3:] / b[:, 3:].sum(axis=1)[:, None]
    return tt.concatenate([b1, b2], axis=1)

y3 = custom_softmax(x)

f = theano.function([x], outputs=[y1, y2, y3])

x_value = [[.1, .2, .3, .4, .5, .6], [.1, .3, .5, .2, .4, .6]]
y1_value, y2_value, y3_value = f(x_value)
assert numpy.allclose(y1_value, y2_value)
assert y3_value.shape == y1_value.shape

a = numpy.exp(.1) + numpy.exp(.2) + numpy.exp(.3)
b = numpy.exp(.4) + numpy.exp(.5) + numpy.exp(.6)
c = numpy.exp(.1) + numpy.exp(.3) + numpy.exp(.5)
d = numpy.exp(.2) + numpy.exp(.4) + numpy.exp(.6)
assert numpy.allclose(y3_value, [
    [numpy.exp(.1) / a, numpy.exp(.2) / a, numpy.exp(.3) / a, numpy.exp(.4) / b, numpy.exp(.5) / b, numpy.exp(.6) / b],
    [numpy.exp(.1) / c, numpy.exp(.3) / c, numpy.exp(.5) / c, numpy.exp(.2) / d, numpy.exp(.4) / d, numpy.exp(.6) / d]
]), y3_value

Why can't I pass a C array to a function that expects a memoryview in a nogil context?

cdef double testB(double[:] x) nogil:
    return x[0]

def test():
    cdef double xx[2]
    with nogil:
        testB(xx)
        # compiler error: Operation not allowed without gil
With the GIL it works fine.
Is it because passing in a C array creates a memoryview, and that creation actually requires the GIL? So a memoryview is not entirely a C-level object?
Update
%%cython --annotate
cimport cython

cdef double testA(double[:] x) nogil:
    return x[0]

cpdef myf():
    cdef double pd[8]
    cdef double[:] x = pd
    testA(x)
cdef double[:] x = pd is compiled to:
__pyx_t_3 = __pyx_format_from_typeinfo(&__Pyx_TypeInfo_double);
__pyx_t_2 = Py_BuildValue((char*) "(" __PYX_BUILD_PY_SSIZE_T ")", ((Py_ssize_t)8));
if (unlikely(!__pyx_t_3 || !__pyx_t_2 || !PyBytes_AsString(__pyx_t_3))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_t_3);
__Pyx_GOTREF(__pyx_t_2);
__pyx_t_1 = __pyx_array_new(__pyx_t_2, sizeof(double), PyBytes_AS_STRING(__pyx_t_3), (char *) "fortran", (char *) __pyx_v_pd);
if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_t_1);
__Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
__Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
__pyx_t_4 = __Pyx_PyObject_to_MemoryviewSlice_ds_double(((PyObject *)__pyx_t_1));
if (unlikely(!__pyx_t_4.memview)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_DECREF(((PyObject *)__pyx_t_1)); __pyx_t_1 = 0;
__pyx_v_x = __pyx_t_4;
__pyx_t_4.memview = NULL;
__pyx_t_4.data = NULL;
There is a call to __Pyx_PyObject_to_MemoryviewSlice_ds_double, so it seems that binding a memoryview does require the GIL.
You should use a NumPy array, as your cdef double[:] declaration gets wrapped by a Python object, and its use is restricted without the GIL. You can see this by trying to slice a double[:]:
def test():
    cdef double[:] asd
    with nogil:
        asd[:1]
Your output will be:
with nogil:
asd[:1]
^
------------------------------------------------------------
prueba.pyx:16:11: Slicing Python object not allowed without gil
Using a NumPy array would compile: NumPy uses the Python buffer protocol and is smoothly integrated with Cython (a Google Summer of Code project was funded for this), so no wrapping conflict arises inside the def:
import numpy as np

cdef double testA(double[:] x) nogil:
    return x[0]

cpdef test():
    xx = np.zeros(2, dtype='double')
    with nogil:
        a = testA(xx)
    print(a)
This will build your module with test() in it. But it crashes, and in an ugly way (at least on my PC):
Process Python segmentation fault (core dumped)
If I may insist on my (now deleted) previous answer: in my own experience, when dealing with Cython memoryviews and C arrays, passing pointers works just as one would expect in C, and most wrapping is avoided (you are writing the code to pass exactly the addresses you want, which makes the wrapping unnecessary). This compiles and works as expected:
cdef double testB(double* x) nogil:
    return x[0]

def test():
    cdef double asd[2]
    asd[0] = 1
    asd[1] = 2
    with nogil:
        a = testB(asd)
    print(a)
And, after compiling:
In [5]: import prueba
In [6]: prueba.test()
1.0
Memoryviews are not, by themselves, Python objects, but they can be wrapped in one. I am not a proficient Cython programmer, so sometimes I get unexpected wrappings, or code that stays at the Python level when I assumed it would be at the C level. Trial and error led me to the pointer strategy.
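A related pattern (my own sketch, not from the original answer): you can keep a NumPy buffer, bind the memoryview while the GIL is still held, and then hand a plain C pointer to the nogil function, so nothing Python-related happens inside the nogil block:
import numpy as np

cdef double testC(double* x) nogil:
    return x[0]

def test_view():
    cdef double[:] view = np.zeros(2, dtype='double')  # wrapping happens here, with the GIL held
    cdef double* p = &view[0]                          # plain C pointer into the buffer
    cdef double a
    view[0] = 1.0
    with nogil:
        a = testC(p)                                   # no Python interaction needed here
    print(a)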