Template parse error compiling with thrust - cuda

I am trying to compile some code which allows some CPU routines to call a function which uses the GPU to speed up some calculations. The GPU code uses Thrust, specifically reduce and device_ptr. When I build the GPU code as a standalone using nvcc, there are no problems. However, attempting to integrate the GPU code with CPU C++ code causes the following compiler error when compiling the final "wrapper":
nvcc -O2 -c NLC_2D_TFIM.cpp -lcuda -lcudart -lcublas -lcusparse -L../CUDA/Lanczos/sort/sort/gnu/release -lmgpusort
In file included from /usr/local/cuda/bin/../include/thrust/pair.h:265:0,
from /usr/local/cuda/bin/../include/thrust/tuple.h:35,
from /usr/local/cuda/bin/../include/thrust/detail/functional/actor.h:29,
from /usr/local/cuda/bin/../include/thrust/detail/functional/placeholder.h:20,
from /usr/local/cuda/bin/../include/thrust/functional.h:26,
from /usr/local/cuda/bin/../include/thrust/system/detail/error_category.inl:22,
from /usr/local/cuda/bin/../include/thrust/system/error_code.h:516,
from /usr/local/cuda/bin/../include/thrust/system/cuda_error.h:26,
from /usr/local/cuda/bin/../include/thrust/detail/backend/cuda/malloc.inl:26,
from /usr/local/cuda/bin/../include/thrust/detail/backend/cuda/malloc.h:50,
from /usr/local/cuda/bin/../include/thrust/detail/backend/dispatch/malloc.h:22,
from /usr/local/cuda/bin/../include/thrust/detail/device_malloc.inl:23,
from /usr/local/cuda/bin/../include/thrust/device_malloc.h:102,
from /usr/local/cuda/bin/../include/thrust/detail/backend/internal_allocator.h:22,
from /usr/local/cuda/bin/../include/thrust/detail/uninitialized_array.h:23,
from /usr/local/cuda/bin/../include/thrust/detail/backend/cuda/copy_cross_space.inl:20,
from /usr/local/cuda/bin/../include/thrust/detail/backend/cuda/copy_cross_space.h:57,
from /usr/local/cuda/bin/../include/thrust/detail/backend/cuda/dispatch/copy.h:23,
from /usr/local/cuda/bin/../include/thrust/detail/backend/cuda/copy.h:21,
from /usr/local/cuda/bin/../include/thrust/detail/backend/dispatch/copy.h:24,
from /usr/local/cuda/bin/../include/thrust/detail/backend/copy.inl:20,
from /usr/local/cuda/bin/../include/thrust/detail/backend/copy.h:44,
from /usr/local/cuda/bin/../include/thrust/detail/copy.inl:20,
from /usr/local/cuda/bin/../include/thrust/detail/copy.h:39,
from /usr/local/cuda/bin/../include/thrust/detail/reference_base.inl:18,
from /usr/local/cuda/bin/../include/thrust/detail/reference_base.h:138,
from /usr/local/cuda/bin/../include/thrust/device_reference.h:27,
from /usr/local/cuda/bin/../include/thrust/detail/device_ptr.inl:23,
from /usr/local/cuda/bin/../include/thrust/device_ptr.h:181,
from ../CUDA/Lanczos/hamiltonian.h:32,
from ../CUDA/Lanczos/lanczos.h:8,
from NLC_2D_TFIM.cpp:17:
/usr/local/cuda/bin/../include/thrust/detail/pair.inl: In function ‘bool thrust::operator<(const thrust::pair<T1, T2>&, const thrust::pair<T1, T2>&)’:
/usr/local/cuda/bin/../include/thrust/detail/pair.inl:72:22: error: ‘.’ cannot appear in a constant-expression
/usr/local/cuda/bin/../include/thrust/detail/pair.inl:72:46: error: ‘.’ cannot appear in a constant-expression
/usr/local/cuda/bin/../include/thrust/detail/pair.inl:72:36: error: parse error in template argument list
/usr/local/cuda/bin/../include/thrust/detail/pair.inl:72:36: error: ‘.’ cannot appear in a constant-expression
/usr/local/cuda/bin/../include/thrust/detail/pair.inl:72:58: error: ‘.’ cannot appear in a constant-expression
/usr/local/cuda/bin/../include/thrust/detail/pair.inl:72:69: error: ‘.’ cannot appear in a constant-expression
/usr/local/cuda/bin/../include/thrust/detail/pair.inl:72:12: error: parse error in template argument list
make: *** [NLC_2D_TFIM.o] Error 1
NLC_2D_TFIM works with another module (Graphs) which uses std::pair, but none of those pairs are passed to the module that talks to the GPU. Every header uses the std namespace, not thrust. All the parameters I'm passing to the GPU handler are plain C arrays, ints, etc.
The lines referenced above are:
#include"lanczos.h"
Which uses:
#include"hamiltonian.h"
And then from there:
#include<thrust/device_ptr.h>
In NLC_2D_TFIM.cu, the "wrapper", the code is:
ReadGraphsFromFile(fileGraphs, "rectanglegraphs.dat", TypeFlag); // grabs the information generated by the Graphs module
double J = 1.;

for (int hh = 1; hh < 10; hh++)
{
    h = hh;
    // Create some storage for things to be used in GPU functions
    d_hamiltonian* HamilLancz = (d_hamiltonian*)malloc(HowMany*sizeof(d_hamiltonian));
    parameters* data = (parameters*)malloc(HowMany*sizeof(parameters));
    double** groundstates = (double**)malloc(HowMany*sizeof(double*));
    double** eigenvalues = (double**)malloc(HowMany*sizeof(double*));
    int* NumElem = (int*)malloc(HowMany*sizeof(int));
    int** Bonds = (int**)malloc(HowMany*sizeof(int*));

    // Go through each graph we read in earlier
    unsigned int i = 1;
    while (i < fileGraphs.size() && fileGraphs.at(i)->Order < 14) // skip the zeroth graph
    {
        // CPU gets the energy for smaller graphs
        GENHAM HV(fileGraphs.at(i)->Order, J, h, fileGraphs.at(i)->AdjacencyList, TypeFlag);
        LANCZOS lancz(HV.Vdim); // dimension of reduced Hilbert space (Sz sector)
        HV.SparseHamJQ(); // generates sparse matrix Hamiltonian for Lanczos
        energy = lancz.Diag(HV, 1, prm.valvec_, eVec);
        i++;
    }

    if (argv[0] == "--gpu" || argv[0] == "-g")
    {
        while (i < fileGraphs.size())
        {
            i += 30;
            for (int j = 0; j < HowMany; j++)
            {
                Bonds[j] = (int*)malloc(sizeof(int)*3*fileGraphs.at(i - j)->Order);
                for (unsigned int k = 0; k < fileGraphs.at(i - j)->Order; k++)
                {
                    Bonds[j][k] = k;
                    Bonds[j][k + fileGraphs.at(i - j)->Order] = fileGraphs.at(i - j)->AdjacencyList.at(2*k).second;
                    Bonds[j][k + 2*fileGraphs.at(i - j)->Order] = fileGraphs.at(i - j)->AdjacencyList.at(2*k + 1).second;
                }
                data[j].Sz = 0;
                data[j].dimension = 2;
                data[j].J1 = J;
                data[j].J2 = h;
                data[j].modelType = 2;
                eigenvalues[j] = (double*)malloc(sizeof(double));
            }
            // Calls the CPU functions which will talk to the GPU, including Thrust
            ConstructSparseMatrix(HowMany, Bonds, HamilLancz, data, NumElem, 1);
            lanczos(HowMany, NumElem, HamilLancz, groundstates, eigenvalues, 200, 1, 1e-12);
So there's nothing with an std::pair that's getting passed to the GPU. Here are the Thrust calls:
for (int i = 0; i < howMany; i++)
{
    thrust::device_ptr<int> red_ptr(d_H[i].set);
    numElem[i] = thrust::reduce(red_ptr, red_ptr + rawSize[i]);
}
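For reference, this reduce-over-a-device_ptr pattern compiles cleanly on its own; here is a minimal standalone .cu sketch (with a hypothetical array size) along the lines of the handler above:

#include <thrust/device_ptr.h>
#include <thrust/reduce.h>
#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    const int rawSize = 1024; // hypothetical size
    int *d_set;
    cudaMalloc((void **)&d_set, rawSize * sizeof(int));
    cudaMemset(d_set, 0, rawSize * sizeof(int));

    // Wrap the raw device pointer and reduce, just as in the handler above
    thrust::device_ptr<int> red_ptr(d_set);
    int numElem = thrust::reduce(red_ptr, red_ptr + rawSize);

    printf("%d\n", numElem);
    cudaFree(d_set);
    return 0;
}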

I'm not sure this is the whole answer, but if your file extension is .cpp, doesn't nvcc just pass it to the regular C++ compiler? What happens if you rename the file to .cu?
(Also, I'm not sure that having -c and all the libraries in the same compile command makes sense: -c usually means no linking is done, so the -l flags have no effect there.)

It turned out the problem was that I was linking against code using Blitz. Removing all the Blitz data structures and the include statements for it cleared up my compilation problem. Blitz uses its own namespace, so perhaps something in there was conflicting with Thrust, or there is a missing } or > somewhere in its headers.
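For anyone who hits this later, one way to sidestep this class of header clash entirely (a sketch of the general pattern, not what this project actually did) is to make sure host-compiled translation units never include Thrust at all: expose the GPU routines through a plain header, and keep every Thrust include inside the .cu files that nvcc compiles.

// gpu_handler.h -- hypothetical wrapper header; the only one the .cpp includes.
// Deliberately free of Thrust/CUDA/Blitz includes, so the host compiler
// never parses them alongside other libraries' headers.
#ifndef GPU_HANDLER_H
#define GPU_HANDLER_H
#include "hamiltonian_types.h" // hypothetical header with plain structs only
void ConstructSparseMatrix(int howMany, int** Bonds, d_hamiltonian* H,
                           parameters* data, int* numElem, int device);
void lanczos(int howMany, int* numElem, d_hamiltonian* H, double** groundstates,
             double** eigenvalues, int maxIter, int numEig, double convReq);
#endif

// gpu_handler.cu -- compiled by nvcc; Thrust never leaks out of here.
#include "gpu_handler.h"
#include <thrust/device_ptr.h>
#include <thrust/reduce.h>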

Related

#pragma acc host_data use_device() with complex variables

I need to pass an element of an array of pointers to #pragma acc host_data use_device():
static double *send_bufL[3];
static double *recv_bufL[3];
send_bufL[IDIR] = ARRAY_1D(NVAR*grid->nghost[IDIR]*nx2*nx3, double);
recv_bufL[IDIR] = ARRAY_1D(NVAR*grid->nghost[IDIR]*nx2*nx3, double);
#pragma acc enter data copyin(send_bufL[IDIR:1][:NVAR*grid->nghost[IDIR]*nx2*nx3], \
recv_bufL[IDIR:1][:NVAR*grid->nghost[IDIR]*nx2*nx3])
#pragma acc parallel loop collapse(4) present(d, grid, send_bufL[1:1][:NVAR*grid->nghost[JDIR]*nx1*nx3])
for (nv = 0; nv < NVAR; nv++){
    for (k = kbeg; k <= kend; k++){
        for (i = ibeg; i <= iend; i++){
            for (j = 0; j < grid->nghost[JDIR]; j++){
                index = nv*nx3*nx1*grid->nghost[JDIR] + (k-kbeg)*nx1*grid->nghost[JDIR] + (i-ibeg)*grid->nghost[JDIR] + j;
                send_bufL[JDIR][index] = d->Vc[nv][k][jbeg+j][i];
}}}}
count = NVAR*nx3*nx2*nghost;
#pragma acc host_data use_device(send_bufL, recv_bufL)
{
    MPI_Isend (send_bufL[IDIR], count, MPI_DOUBLE, nrnks[IDIR][0], 0, MPI_COMM_WORLD, &req[0]);
    MPI_Irecv (recv_bufL[IDIR], count, MPI_DOUBLE, nrnks[IDIR][0], 0, MPI_COMM_WORLD, &req[1]);
}
Written like this, with only send_bufL and recv_bufL in the use_device clause, I get:
[marco-Inspiron-7501:41130:0:41130] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x7f7f4fafa608)
[marco-Inspiron-7501:41131:0:41131] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x7fd697afa608)
Trying to add dimensions to the variables in the clause, I get compile errors. What can I do?
I should add that, apparently, for some elements of the array of pointers there is no need to use #pragma acc host_data use_device(): the compiler seems able to use the device buffers correctly. For other elements, though, this doesn't work and the host buffers are used instead, generating wrong results.
The problem is with the "send_bufL[IDIR]" and "recv_bufL[IDIR]". Since these are device pointers, dereferencing them on the host will give a seg fault.
I'm thinking that the best solution here would be to use temp pointers to the correct element in the buffers. Something like:
double * sbtmp;
double * rbtmp;
...
sbtmp = send_bufL[IDIR];
rbtmp = recv_bufL[IDIR];
#pragma acc host_data use_device(sbtmp, rbtmp)
{
    MPI_Isend (sbtmp, count, MPI_DOUBLE, nrnks[IDIR][0], 0, MPI_COMM_WORLD, &req[0]);
    MPI_Irecv (rbtmp, count, MPI_DOUBLE, nrnks[IDIR][0], 0, MPI_COMM_WORLD, &req[1]);
}
If this doesn't work, please post a small reproducing example and I'll see if we can find a solution.
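In the meantime, here is a self-contained sketch of that temp-pointer pattern (hypothetical buffer length, two MPI ranks assumed; passing device addresses to MPI also requires a CUDA-aware MPI build):

#include <mpi.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank;
    const int n = 1000; // hypothetical buffer length
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    double *send_bufL[3];
    double *recv_bufL[3];
    send_bufL[0] = (double *)malloc(n * sizeof(double));
    recv_bufL[0] = (double *)malloc(n * sizeof(double));
    #pragma acc enter data copyin(send_bufL[0:1][0:n], recv_bufL[0:1][0:n])

    // Take plain host copies of the element pointers...
    double *sbtmp = send_bufL[0];
    double *rbtmp = recv_bufL[0];

    MPI_Request req[2];
    // ...so host_data can substitute their device counterparts cleanly.
    #pragma acc host_data use_device(sbtmp, rbtmp)
    {
        MPI_Isend(sbtmp, n, MPI_DOUBLE, 1 - rank, 0, MPI_COMM_WORLD, &req[0]);
        MPI_Irecv(rbtmp, n, MPI_DOUBLE, 1 - rank, 0, MPI_COMM_WORLD, &req[1]);
    }
    MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
    MPI_Finalize();
    return 0;
}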

Nested Directives in OpenACC

I'm trying to use the nesting feature of OpenACC to activate dynamic parallelism on my GPU. I have a Tesla K40c, and my OpenACC compiler is PGI version 15.7.
My code is quite simple. When I try to compile the following code, the compiler returns these messages:
PGCC-S-0155-Illegal context for pragma: acc parallel loop (test.cpp: 158)
PGCC/x86 Linux 15.7-0: compilation completed with severe errors
My code structure:
#pragma acc parallel loop
for (i = 0; i < N; i++)
{
    // << computation >>
    int ss = A[tid].start;
    int ee = A[tid].end;
    #pragma acc parallel loop
    for (j = ss; j < (ee + ss); j++)
    {
        // << computation >>
    }
}
I've also tried changing my code to use the routine directive, but I couldn't get that to compile either:
#pragma acc routine worker
foo(...)
{
    #pragma acc parallel loop
    for (j = ss; j < (ee + ss); j++)
    {
        // << computation >>
    }
}

#pragma acc parallel loop
for (i = 0; i < N; i++)
{
    // << computation >>
    int ss = A[tid].start;
    int ee = A[tid].end;
    foo(...);
}
I've also tried, of course, using only routine (seq, worker, gang) without the inner parallel loop directive. That compiles, but dynamic parallelism isn't activated:
37, Generating acc routine worker
Generating Tesla code
42, #pragma acc loop vector, worker /* threadIdx.x threadIdx.y */
Loop is parallelizable
How am I supposed to use dynamic parallelism in OpenACC?
How am I supposed to use dynamic parallelism in OpenACC?
Although nested regions (which would presumably use dynamic parallelism) are a new feature in the OpenACC 2.0 specification, I don't believe they are implemented yet in PGI 15.7. PGI 15.7 represents a partial implementation of the OpenACC 2.0 specification.
This limitation is documented in the PGI 15.7 release notes that should ship with your PGI 15.7 compiler (pgirn157.pdf) in section 2.7 (those release notes are currently available here):
OpenACC 2.0 Missing Features
‣ The declare link directive for global data is not implemented.
‣ Nested parallelism (parallel and kernels constructs within a parallel or kernels region) is not implemented.
Based on the comments, there is some concern about #pragma acc routine worker, so here is a fully worked example with PGI 15.7 of that:
$ cat t1.c
#include <stdio.h>
#include <stdlib.h>
#define D1 4096
#define D2 4096
#define OFFS 2
#pragma acc routine worker
void my_set(int *d, int len, int val){
    int i;
    for (i = 0; i < len; i++) d[i] += val+OFFS;
}
int main(){
    int i,*data;
    data = (int *)malloc(D1*D2*sizeof(int));
    for (i = 0; i < D1*D2; i++) data[i] = 1;
    #pragma acc kernels copy(data[0:D1*D2])
    for (i = 0; i < D1; i++)
        my_set(data+(i*D2), D2, 1);
    printf("%d\n", data[0]);
    return 0;
}
$ pgcc -acc -ta=tesla -Minfo=accel t1.c -o t1
my_set:
8, Generating acc routine worker
Generating Tesla code
10, #pragma acc loop vector, worker /* threadIdx.x threadIdx.y */
Loop is parallelizable
main:
20, Generating copy(data[:16777216])
21, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
21, #pragma acc loop gang /* blockIdx.x */
$ ./t1
4
$
Note that the gang parallelism has been performed at the outer loop, and the worker parallelism has been performed in the inner (routine) loop.
This method does not depend on dynamic parallelism (instead, it relies on a partitioning of parallelism between worker at the routine level and gang at the caller level) and will not invoke dynamic parallelism.
The native use of dynamic parallelism (CDP) is not currently supported in PGI 15.7. It should be possible to call (i.e. interoperate with) other functions (e.g. CUDA, or libraries) that make use of CDP from OpenACC code, but natively it is not used (and not supported) in PGI 15.7.
Try replacing the inner "#pragma acc parallel loop" with "#pragma acc loop".
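In other words (a sketch of that suggestion, reusing the question's hypothetical A, N, ss and ee, with i in place of tid):

#pragma acc parallel loop gang
for (i = 0; i < N; i++)
{
    // << computation >>
    int ss = A[i].start;
    int ee = A[i].end;
    // A plain loop directive lets the compiler map this level onto the
    // worker/vector parallelism of the enclosing parallel region, instead
    // of nesting a second parallel construct (which PGI 15.7 rejects).
    #pragma acc loop vector
    for (j = ss; j < (ee + ss); j++)
    {
        // << computation >>
    }
}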

Heisenbug in CUDA kernel, global memory access

About two years ago, I wrote a kernel for work on several numerical grids simultaneously. Some very strange behaviour emerged, which resulted in wrong results. When hunting down the bug utilizing printf()-statements inside the kernel, the bug vanished.
Due to deadline constraints, I kept it that way, though I recently figured that this was not an appropriate coding style. So I revisited my kernel and boiled it down to what you see below.
__launch_bounds__(672, 2)
__global__ void heisenkernel(float *d_u, float *d_r, float *d_du, int radius,
                             int numNodesPerGrid, int numBlocksPerSM, int numGridsPerSM, int numGrids)
{
    __syncthreads();
    int id_sm        = blockIdx.x / numBlocksPerSM; // (arbitrary) ID of Streaming Multiprocessor (SM) this thread works upon - (constant over lifetime of thread)
    int id_blockOnSM = blockIdx.x % numBlocksPerSM; // Block number on this specific SM - (constant over lifetime of thread)
    int id_r         = id_blockOnSM * (blockDim.x - 2*radius) + threadIdx.x - radius; // Grid point number this thread is to work upon - (constant over lifetime of thread)
    int id_grid      = id_sm * numGridsPerSM; // Grid ID this thread is to work upon - (not constant over lifetime of thread)

    while(id_grid < numGridsPerSM * (id_sm + 1)) // this loops over numGridsPerSM grids
    {
        __syncthreads();
        int id_numInArray = id_grid * numNodesPerGrid + id_r; // Entry in array this thread is responsible for (read and possibly write) - (not constant over lifetime of thread)
        float uchange = 0.0f;
        //uchange = 1.0f; // if this line is uncommented, results will be computed correctly ("Solution 1")
        float du = 0.0f;

        if((threadIdx.x > radius-1) && (threadIdx.x < blockDim.x - radius) && (id_r < numNodesPerGrid) && (id_grid < numGrids))
        {
            if (id_r == 0) // FO-forward difference
                du = (d_u[id_numInArray+1] - d_u[id_numInArray])/(d_r[id_numInArray+1] - d_r[id_numInArray]);
            else if (id_r == numNodesPerGrid - 1) // FO-rearward difference
                du = (d_u[id_numInArray] - d_u[id_numInArray-1])/(d_r[id_numInArray] - d_r[id_numInArray-1]);
            else if (id_r == 1 || id_r == numNodesPerGrid - 2) // SO-central difference
                du = (d_u[id_numInArray+1] - d_u[id_numInArray-1])/(d_r[id_numInArray+1] - d_r[id_numInArray-1]);
            else if(id_r > 1 && id_r < numNodesPerGrid - 2)
                du = d_fourpoint_constant * ((d_u[id_numInArray+1] - d_u[id_numInArray-1])/(d_r[id_numInArray+1] - d_r[id_numInArray-1])) + (1-d_fourpoint_constant) * ((d_u[id_numInArray+2] - d_u[id_numInArray-2])/(d_r[id_numInArray+2] - d_r[id_numInArray-2]));
            else
                du = 0;
        }
        __syncthreads();

        if((threadIdx.x > radius-1 && threadIdx.x < blockDim.x - radius) && (id_r < numNodesPerGrid) && (id_grid < numGrids))
        {
            d_u[ id_numInArray] = d_u[id_numInArray] * uchange; // if this line is commented out, results will be computed correctly ("Solution 2")
            d_du[id_numInArray] = du;
        }
        __syncthreads();
        ++id_grid;
    }
}
This kernel computes the derivative of some value at all grid points for a number of numerical 1D-grids.
Things to consider: (see full code base at the bottom)
• a grid consists of 1300 grid points
• each grid has to be worked upon by two blocks (due to memory/register limitations)
• each block successively works on 37 grids (or better: grid halves; the while-loop takes care of that)
• each thread is responsible for the same grid point in each grid
• for the derivative to be computed, the threads need access to data from the four next grid points
• in order to keep the blocks independent from each other, a small overlap on the grid is introduced (grid points 666, 667, 668, 669 of each grid are read from by two threads from different blocks, though only one thread is writing to them; it is this overlap where the problems occur)
• due to the boiling-down process, the two threads on each side of the blocks do no computations; in the original, they are responsible for writing the corresponding grid values to shared memory
The values of the grids are stored in u_arr, du_arr and r_arr (and their corresponding device arrays d_u, d_du and d_r).
Each grid occupies 1300 consecutive values in each of these arrays.
The while-loop in the kernel iterates over 37 grids for each block.
To evaluate the workings of the kernel, each grid is initialized with the exact same values, so a deterministic program will produce the same result for each grid.
This does not happen with my code.
The weirdness of the Heisenbug:
I compared the computed values of grid 0 with each of the other grids, and there are differences at the overlap (grid points 666-669), though not consistently. Some grids have the right values, some do not. Two consecutive runs will mark different grids as erroneous.
The first thing that came to mind was that two threads at this overlap try to concurrently write to memory, though that does not seem to be the case (I checked.... and re-checked).
Commenting or un-commenting lines or using printf() for debugging purposes will alter the outcome of the program as well: when "asking" the threads responsible for the grid points in question, they tell me that everything is all right, and they are actually correct. As soon as I force a thread to print out its variables, they will be computed (and more importantly: stored) correctly.
The same goes for debugging with Nsight Eclipse.
Memcheck / Racecheck:
cuda-memcheck (memcheck and racecheck) reports no memory/race-condition problems, though even the use of one of these tools can impact the correctness of the results.
Valgrind gives some warnings, though I think they have something to do with the CUDA API which I can not influence and which seem unrelated to my problem.
(Update)
As pointed out, cuda-memcheck --tool racecheck only works for shared memory race conditions, whereas the problem at hand has a race condition on d_u, i.e., global memory.
Testing environment:
The original kernel has been tested on different CUDA devices and with different compute capabilities (2.0, 3.0 and 3.5) with the bug showing up in every configuration (in some form or another).
My (main) testsystem is the following:
2 x GTX 460, tested both on the GPU that ran the X server and on the other one
Driver Version: 340.46
Cuda Toolkit 6.5
Linux Kernel 3.11.0-12-generic (Linux Mint 16 - Xfce)
State of solution:
By now I am pretty sure that some memory access is the culprit, maybe some optimization by the compiler or the use of uninitialized values, and that I obviously do not understand some fundamental CUDA paradigm.
The fact that printf() statements inside the kernel (which through some dark magic have to utilize device and host memory as well) and the memcheck tools (cuda-memcheck and valgrind) influence the behavior points in the same direction.
I am sorry for this somewhat complicated kernel, but I boiled the original kernel and invocation down as much as I could, and this is as far as I got. By now I have learned to admire this problem, and I am looking forward to learning what is going on here.
Two "solutions", which force the kernel do work as intended, are marked in the code.
(Update) As mentioned in the correct answer below, the problem with my code is a race condition at the border of the thread blocks: two blocks work on each grid and there is no guarantee as to which block works first, so the result depends on scheduling, as outlined in the answer. This also explains the correct results when employing "Solution 1" as marked in the code, because the input/output value d_u is not altered when uchange = 1.0.
The simple solution is to split this kernel into two kernels, one computing d_u, the other computing the derivative d_du. It would be more desirable to have just one kernel invocation instead of two, though I do not know how to accomplish this with -arch=sm_20. With -arch=sm_35 one could probably use dynamic parallelism to achieve that, though the overhead for the second kernel invocation is negligible.
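A minimal sketch of that two-kernel structure (simplified to a single pass of central differences on one grid, just to illustrate the split; not the full kernel):

// Pass 1: derivatives are computed while d_u is read-only everywhere.
__global__ void compute_du(const float *d_u, const float *d_r, float *d_du, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i > 0 && i < n - 1)
        d_du[i] = (d_u[i+1] - d_u[i-1]) / (d_r[i+1] - d_r[i-1]);
}

// Pass 2: only after the first launch has fully drained may d_u change.
__global__ void update_u(float *d_u, int n, float uchange)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        d_u[i] *= uchange;
}

// Host side: back-to-back launches on the same stream provide the grid-wide
// ordering guarantee that the single fused kernel could not:
//   compute_du<<<blocks, TPB>>>(d_u, d_r, d_du, n);
//   update_u<<<blocks, TPB>>>(d_u, n, uchange);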
heisenbug.cu:
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
const float r_sol = 6.955E8f;
__constant__ float d_fourpoint_constant = 0.2f;
__launch_bounds__(672, 2)
__global__ void heisenkernel(float *d_u, float *d_r, float *d_du, int radius,
                             int numNodesPerGrid, int numBlocksPerSM, int numGridsPerSM, int numGrids)
{
    __syncthreads();
    int id_sm        = blockIdx.x / numBlocksPerSM; // (arbitrary) ID of Streaming Multiprocessor (SM) this thread works upon - (constant over lifetime of thread)
    int id_blockOnSM = blockIdx.x % numBlocksPerSM; // Block number on this specific SM - (constant over lifetime of thread)
    int id_r         = id_blockOnSM * (blockDim.x - 2*radius) + threadIdx.x - radius; // Grid point number this thread is to work upon - (constant over lifetime of thread)
    int id_grid      = id_sm * numGridsPerSM; // Grid ID this thread is to work upon - (not constant over lifetime of thread)

    while(id_grid < numGridsPerSM * (id_sm + 1)) // this loops over numGridsPerSM grids
    {
        __syncthreads();
        int id_numInArray = id_grid * numNodesPerGrid + id_r; // Entry in array this thread is responsible for (read and possibly write) - (not constant over lifetime of thread)
        float uchange = 0.0f;
        //uchange = 1.0f; // if this line is uncommented, results will be computed correctly ("Solution 1")
        float du = 0.0f;

        if((threadIdx.x > radius-1) && (threadIdx.x < blockDim.x - radius) && (id_r < numNodesPerGrid) && (id_grid < numGrids))
        {
            if (id_r == 0) // FO-forward difference
                du = (d_u[id_numInArray+1] - d_u[id_numInArray])/(d_r[id_numInArray+1] - d_r[id_numInArray]);
            else if (id_r == numNodesPerGrid - 1) // FO-rearward difference
                du = (d_u[id_numInArray] - d_u[id_numInArray-1])/(d_r[id_numInArray] - d_r[id_numInArray-1]);
            else if (id_r == 1 || id_r == numNodesPerGrid - 2) // SO-central difference
                du = (d_u[id_numInArray+1] - d_u[id_numInArray-1])/(d_r[id_numInArray+1] - d_r[id_numInArray-1]);
            else if(id_r > 1 && id_r < numNodesPerGrid - 2)
                du = d_fourpoint_constant * ((d_u[id_numInArray+1] - d_u[id_numInArray-1])/(d_r[id_numInArray+1] - d_r[id_numInArray-1])) + (1-d_fourpoint_constant) * ((d_u[id_numInArray+2] - d_u[id_numInArray-2])/(d_r[id_numInArray+2] - d_r[id_numInArray-2]));
            else
                du = 0;
        }
        __syncthreads();

        if((threadIdx.x > radius-1 && threadIdx.x < blockDim.x - radius) && (id_r < numNodesPerGrid) && (id_grid < numGrids))
        {
            d_u[ id_numInArray] = d_u[id_numInArray] * uchange; // if this line is commented out, results will be computed correctly ("Solution 2")
            d_du[id_numInArray] = du;
        }
        __syncthreads();
        ++id_grid;
    }
}
bool gridValuesEqual(float *matarray, uint id0, uint id1, const char *label, int numNodesPerGrid){
    bool retval = true;
    for(uint i=0; i<numNodesPerGrid; ++i)
        if(matarray[id0 * numNodesPerGrid + i] != matarray[id1 * numNodesPerGrid + i])
        {
            printf("value %s at position %u of grid %u not equal that of grid %u: %E != %E, diff: %E\n",
                   label, i, id0, id1, matarray[id0 * numNodesPerGrid + i], matarray[id1 * numNodesPerGrid + i],
                   matarray[id0 * numNodesPerGrid + i] - matarray[id1 * numNodesPerGrid + i]);
            retval = false;
        }
    return retval;
}
int main(int argc, const char* argv[])
{
    float *d_u;
    float *d_du;
    float *d_r;
    float *u_arr;
    float *du_arr;
    float *r_arr;
    int numNodesPerGrid = 1300;
    int numBlocksPerSM  = 2;
    int numGridsPerSM   = 37;
    int numSM           = 7;
    int TPB             = 672;
    int radius          = 2;
    int numGrids        = 259;
    int memsize_grid    = sizeof(float) * numNodesPerGrid;
    int numBlocksPerGrid = numNodesPerGrid / (TPB - 2 * radius) + (numNodesPerGrid%(TPB - 2 * radius) == 0 ? 0 : 1);

    printf("---------------------------------------------------------------------------\n");
    printf("--- Heisenbug Extermination Tracker ---------------------------------------\n");
    printf("---------------------------------------------------------------------------\n\n");

    cudaSetDevice(0);
    cudaDeviceReset();
    cudaMalloc((void **) &d_u, memsize_grid * numGrids);
    cudaMalloc((void **) &d_du, memsize_grid * numGrids);
    cudaMalloc((void **) &d_r, memsize_grid * numGrids);
    u_arr  = new float[numGrids * numNodesPerGrid];
    du_arr = new float[numGrids * numNodesPerGrid];
    r_arr  = new float[numGrids * numNodesPerGrid];

    for(uint k=0; k<numGrids; ++k)
        for(uint i=0; i<numNodesPerGrid; ++i)
        {
            uint index = k * numNodesPerGrid + i;
            if (i < 585)
                r_arr[index] = i * (6000.0f);
            else
            {
                if (i == 585)
                    r_arr[index] = r_arr[index - 1] + 8.576E-6f * r_sol;
                else
                    r_arr[index] = r_arr[index - 1] + 1.02102f * ( r_arr[index - 1] - r_arr[index - 2] );
            }
            u_arr[index]  = 1E-10f * (i+1);
            du_arr[index] = 0.0f;
        }

    /*
    printf("\n\nbefore kernel start\n\n");
    for(uint k=0; k<numGrids; ++k)
        printf("matrix->du_arr[k*paramH.numNodes + 668]:\t%E\n", du_arr[k*numNodesPerGrid + 668]);//*/

    bool equal = true;
    for(int k=1; k<numGrids; ++k)
    {
        equal &= gridValuesEqual(u_arr, 0, k, "u", numNodesPerGrid);
        equal &= gridValuesEqual(du_arr, 0, k, "du", numNodesPerGrid);
        equal &= gridValuesEqual(r_arr, 0, k, "r", numNodesPerGrid);
    }
    if(!equal)
        printf("Input values are not identical for different grids!\n\n");
    else
        printf("All grids contain the same values at same grid points.!\n\n");

    cudaMemcpy(d_u, u_arr, memsize_grid * numGrids, cudaMemcpyHostToDevice);
    cudaMemcpy(d_du, du_arr, memsize_grid * numGrids, cudaMemcpyHostToDevice);
    cudaMemcpy(d_r, r_arr, memsize_grid * numGrids, cudaMemcpyHostToDevice);

    printf("Configuration:\n\n");
    printf("numNodesPerGrid:\t%i\nnumBlocksPerSM:\t\t%i\nnumGridsPerSM:\t\t%i\n", numNodesPerGrid, numBlocksPerSM, numGridsPerSM);
    printf("numSM:\t\t\t\t%i\nTPB:\t\t\t\t%i\nradius:\t\t\t\t%i\nnumGrids:\t\t\t%i\nmemsize_grid:\t\t%i\n", numSM, TPB, radius, numGrids, memsize_grid);
    printf("numBlocksPerGrid:\t%i\n\n", numBlocksPerGrid);
    printf("Kernel launch parameters:\n\n");
    printf("moduleA2_3<<<%i, %i, %i>>>(...)\n\n", numBlocksPerSM * numSM, TPB, 0);
    printf("Launching Kernel...\n\n");

    heisenkernel<<<numBlocksPerSM * numSM, TPB, 0>>>(d_u, d_r, d_du, radius, numNodesPerGrid, numBlocksPerSM, numGridsPerSM, numGrids);
    cudaDeviceSynchronize();

    cudaMemcpy(u_arr, d_u, memsize_grid * numGrids, cudaMemcpyDeviceToHost);
    cudaMemcpy(du_arr, d_du, memsize_grid * numGrids, cudaMemcpyDeviceToHost);
    cudaMemcpy(r_arr, d_r, memsize_grid * numGrids, cudaMemcpyDeviceToHost);

    /*
    printf("\n\nafter kernel finished\n\n");
    for(uint k=0; k<numGrids; ++k)
        printf("matrix->du_arr[k*paramH.numNodes + 668]:\t%E\n", du_arr[k*numNodesPerGrid + 668]);//*/

    equal = true;
    for(int k=1; k<numGrids; ++k)
    {
        equal &= gridValuesEqual(u_arr, 0, k, "u", numNodesPerGrid);
        equal &= gridValuesEqual(du_arr, 0, k, "du", numNodesPerGrid);
        equal &= gridValuesEqual(r_arr, 0, k, "r", numNodesPerGrid);
    }
    if(!equal)
        printf("Results are wrong!!\n");
    else
        printf("All went well!\n");

    cudaFree(d_u);
    cudaFree(d_du);
    cudaFree(d_r);
    delete [] u_arr;
    delete [] du_arr;
    delete [] r_arr;
    return 0;
}
Makefile:
CUDA = 1
DEFINES =
ifeq ($(CUDA), 1)
    DEFINES += -DCUDA
    CUDAPATH = /usr/local/cuda-6.5
    CUDAINCPATH = -I$(CUDAPATH)/include
    CUDAARCH = -arch=sm_20
endif

CXX = g++
CXXFLAGS = -pipe -g -std=c++0x -fPIE -O0 $(DEFINES)

VALGRIND = valgrind
VALGRIND_FLAGS = -v --leak-check=yes --log-file=out.memcheck
CUDAMEMCHECK = cuda-memcheck
CUDAMC_FLAGS = --tool memcheck
RACECHECK = $(CUDAMEMCHECK)
RACECHECK_FLAGS = --tool racecheck

INCPATH = -I. $(CUDAINCPATH)
LINK = g++
LFLAGS = -O0
LIBS =

ifeq ($(CUDA), 1)
    NVCC = $(CUDAPATH)/bin/nvcc
    LIBS += -L$(CUDAPATH)/lib64/
    LIBS += -lcuda -lcudart -lcudadevrt
    NVCCFLAGS = -g -G -O0 --ptxas-options=-v
    NVCCFLAGS += -lcuda -lcudart -lcudadevrt -lineinfo --machine 64 -x cu $(CUDAARCH) $(DEFINES)
endif

all:
	$(NVCC) $(NVCCFLAGS) $(INCPATH) -c -o $(DST_DIR)heisenbug.o $(SRC_DIR)heisenbug.cu
	$(LINK) $(LFLAGS) -o heisenbug heisenbug.o $(LIBS)

clean:
	rm heisenbug.o
	rm heisenbug

memrace: all
	./heisenbug > out
	$(VALGRIND) $(VALGRIND_FLAGS) ./heisenbug > out.memcheck.log
	$(CUDAMEMCHECK) $(CUDAMC_FLAGS) ./heisenbug > out.cudamemcheck
	$(RACECHECK) $(RACECHECK_FLAGS) ./heisenbug > out.racecheck
Note that in the entirety of your writeup, I do not see a question being explicitly asked, therefore I am responding to:
I am looking forward to learning what is going on here.
You have a race condition on d_u.
by your own statement:
• in order to keep the blocks independent from each other, a small overlap on the grid is introduced (grid points 666, 667, 668, 669 of each grid are read from by two threads from different blocks, though only one thread is writing to them; it is this overlap where the problems occur)
Furthermore, if you comment out the write to d_u, according to your statement in the code, the problem disappears.
CUDA threadblocks can execute in any order. You have at least 2 different blocks that are reading from grid points 666, 667, 668, 669. The results will be different depending on which case actually occurs:
• both blocks read the value before any writes occur
• one block reads the value, then a write occurs, then the other block reads the value
The blocks are not independent of each other (contrary to your statement) if one block is reading a value that can be written to by another block. The order of block execution will determine the result in this case, and CUDA does not specify the order of block execution.
Note that cuda-memcheck with the --tool racecheck option only captures race conditions related to __shared__ memory usage. Your kernel as posted uses no __shared__ memory, therefore I would not expect cuda-memcheck to report anything.
cuda-memcheck, in order to gather its data, does influence the order of block execution, so it's not surprising that it affects the behavior.
in-kernel printf represents a costly function call, writing to a global memory buffer, so it also affects execution behavior/patterns. And if you are printing out a large amount of data, exceeding the buffer size, the effect is extremely costly (in terms of execution time) in the event of buffer overflow.
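(If the printf buffer itself is overflowing, it can be enlarged before any launch; the 16 MB value below is just an arbitrary example:)

// Enlarge the device-side printf FIFO (1 MB by default) before launching
// kernels that print heavily.
cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 16 * 1024 * 1024);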
As an aside, Linux Mint is not a supported distro for CUDA, as far as I can see. However I don't think this is relevant to your issue; I can reproduce the behavior on a supported config.

Why is my rather trivial CUDA program erring with certain arguments?

I made a simple CUDA program for practice. It simply copies over data from one array to another:
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from pycuda.compiler import SourceModule

# Global constants
N = 2**20  # size of array a
a = np.linspace(0, 1, N)
e = np.empty_like(a)
block_size_x = 512

# Instantiate block and grid sizes.
block_size = (block_size_x, 1, 1)
grid_size = (N / block_size_x, 1)

# Create the CUDA kernel, and run it.
mod = SourceModule("""
__global__ void D2x_kernel(double* a, double* e, int N) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid > 0 && tid < N - 1) {
        e[tid] = a[tid];
    }
}
""")
func = mod.get_function('D2x_kernel')
func(a, cuda.InOut(e), np.int32(N), block=block_size, grid=grid_size)
print str(e)
However, I get this error: pycuda._driver.LogicError: cuLaunchKernel failed: invalid value
When I get rid of the second argument double* e in my kernel function and invoke the kernel without the argument e, the error goes away. Why is that? What does this error mean?
Your a array does not exist in device memory, so I suspect that PyCUDA is ignoring (or otherwise handling) the first argument to your kernel invocation and only passing in e and N...so you get an error because the kernel was expecting three arguments and it has only received two. Removing double* e from your kernel definition might eliminate the error message you're getting, but your kernel still won't work properly.
A quick fix to this should be to wrap a in a cuda.In() call, which instructs PyCUDA to copy a to the device before launching the kernel. That is, your kernel launch line should be:
func(cuda.In(a), cuda.InOut(e), np.int32(N), block=block_size, grid=grid_size)
Edit: Also, do you realize that your kernel is not copying the first and last elements of a to e? Your if (tid > 0 && tid < N - 1) statement is preventing that. To cover the entire array, it should be if (tid < N).
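Putting both fixes together, the kernel embedded in the SourceModule string becomes:

__global__ void D2x_kernel(double* a, double* e, int N) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    // cover every element, including the first and last
    if (tid < N) {
        e[tid] = a[tid];
    }
}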

Use of shared memory with OpenACC

I'm trying to use shared memory to cache things with OpenACC.
Basically what I'm working on is a matrix multiplication, and what I have is this:
typedef float ff;

// Multiplies two square row-major matrices a and b, puts the result in c.
void mmul(const ff* restrict a,
          const ff* restrict b,
          ff* restrict c,
          const int n) {
    #pragma acc data copyin(a[0:n*n], b[0:n*n]) copy(c[0:n*n])
    {
        #pragma acc region
        {
            #pragma acc loop independent vector(16)
            for (int i = 0; i < n; ++i) {
                #pragma acc loop independent vector(16)
                for (int j = 0; j < n; ++j) {
                    ff sum = 0;
                    for (int k = 0; k < n; ++k) {
                        sum += a[i + n * k] * b[k + n * j];
                    }
                    c[i + n * j] = sum;
                }
            }
        }
    }
}
What I would like to do is use shared memory to cache tiles of the matrices 'a' and 'b' to use in the computation of 'c', in a similar fashion to what the CUDA mmul algorithm does.
Basically on CUDA I would know the exact size of my blocks, and would be able to:
• declare a shared memory with the size of the block
• copy the 'relevant' part of the data to the block
• use this data
I understand I can use the #pragma acc cached directive, and that I can specify block sizes with the vector and gang options, but I'm having some trouble understanding how that's going to be mapped to the CUDA architecture.
Is there a way to achieve something similar with OpenACC? Is there a good tutorial/resource on the use of the cached directive or on how to map some of the power of shared memory from CUDA to OpenACC?
If you are using the PGI Accelerator compiler, you can dump out the generated PTX file and see what is going on under the hood during execution:
pgcc -acc -fast -Minfo -ta=nvidia,cc13,keepptx matrixMult.c -o matrixMult
The generated PTX will be stored in the current directory.
EDIT: You may prefer to see the high-level code (CUDA C or Fortran). In that case, use the following: -ta=nvidia,cc13,keepptx,keepgpu
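As for the directive itself: in OpenACC it is spelled cache rather than cached, and it appears at the top of a loop body, naming the subarray to keep in fast memory. A sketch of how it might be applied to a strip-mined inner loop of the mmul above (reusing the question's ff typedef; n assumed a multiple of 16 for brevity; whether the compiler actually maps this to CUDA shared memory is implementation-dependent):

typedef float ff;

void mmul_cached(const ff* restrict a, const ff* restrict b,
                 ff* restrict c, const int n) {
    #pragma acc data copyin(a[0:n*n], b[0:n*n]) copy(c[0:n*n])
    {
        #pragma acc parallel loop
        for (int i = 0; i < n; ++i) {
            #pragma acc loop vector(16)
            for (int j = 0; j < n; ++j) {
                ff sum = 0;
                // Strip-mine k so each 16-element strip of b (contiguous
                // for a fixed j) can be named in a cache directive.
                for (int kk = 0; kk < n; kk += 16) {
                    #pragma acc cache(b[kk + n*j : 16])
                    for (int k = kk; k < kk + 16; ++k)
                        sum += a[i + n*k] * b[k + n*j];
                }
                c[i + n*j] = sum;
            }
        }
    }
}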