CUDA kernel launch fails when using various offsets into input data

CUDA kernel launch fails when using various offsets into input data - cuda

My code is giving an error message and I am trying to track down the cause of it. To make it easier to find the problem, I have stripped away code that apparently is not relevant to causing the error message. If you can tell me why the following simple code produces an error message, then I think I should be able to fix my original code:
#include "cuComplex.h"
#include <cutil.h>
// Carves one raw device buffer into consecutive typed sub-arrays and zeroes the
// first btplus element that belongs to sample `isample`.
//   data     - base address of the device buffer
//   isample  - sample index; selects a 21021-element slice of btplus_all
//   nsamples - number of samples; scales the length of every per-sample array
// NOTE(review): the three int arrays below span nsamples*(21+21+12)*4 bytes.
// For nsamples = 101 (the value main passes) that is 21816 bytes: a multiple of
// 8 but not of 16. Every cuDoubleComplex pointer derived after them is
// therefore only 8-byte aligned. cuDoubleComplex is presumably a 16-byte
// aligned double2 - confirm; if so, the store to btplus[0] is misaligned.
__device__ void compute_energy(void *data, int isample, int nsamples) {
// 101 complex values, then 3 complex values (1664 bytes in total).
cuDoubleComplex * const nminusarray = (cuDoubleComplex*)data;
cuDoubleComplex * const f = (cuDoubleComplex*)(nminusarray+101);
// Two double arrays of nsamples*51 elements each.
double * const abs_est_errorrow_all = (double*)(f+3);
double * const rel_est_errorrow_all = (double*)(abs_est_errorrow_all+nsamples*51);
// Three int arrays of nsamples*21, nsamples*21 and nsamples*12 elements.
int * const iid_all = (int*)(rel_est_errorrow_all+nsamples*51);
int * const iiu_all = (int*)(iid_all+nsamples*21);
int * const piv_all = (int*)(iiu_all+nsamples*21);
// Complex arrays again; their start depends on the byte size of the int arrays.
cuDoubleComplex * const energyrow_all = (cuDoubleComplex*)(piv_all+nsamples*12);
cuDoubleComplex * const refinedenergyrow_all = (cuDoubleComplex*)(energyrow_all+nsamples*51);
cuDoubleComplex * const btplus_all = (cuDoubleComplex*)(refinedenergyrow_all+nsamples*51);
// Per-sample slice of btplus_all.
cuDoubleComplex * const btplus = btplus_all+isample*21021;
btplus[0] = make_cuDoubleComplex(0.0, 0.0);
}
// Kernel entry point: each block handles the sample whose index equals its
// blockIdx.x, forwarding `nlambda` as the sample count.
__global__ void computeLamHeight(void *data, int nlambda) {
    const int sampleIndex = blockIdx.x;
    compute_energy(data, sampleIndex, nlambda);
}
// Repro driver: allocates one device buffer, launches computeLamHeight and
// waits for it so that any kernel fault is reported.
int main(int argc, char *argv[]) {
void *device_data;
// CUT_DEVICE_INIT / CUDA_SAFE_CALL come from cutil.h (SDK helper, not shown here).
CUT_DEVICE_INIT(argc, argv);
CUDA_SAFE_CALL(cudaMalloc(&device_data, 184465640));
// 101 blocks of 512 threads, 45000 bytes of dynamic shared memory; nlambda = 101.
computeLamHeight<<<dim3(101, 1, 1), dim3(512, 1, 1), 45000>>>(device_data, 101);
// The launch is asynchronous; the failure surfaces at this synchronizing call.
CUDA_SAFE_CALL(cudaThreadSynchronize());
}
I am using a GeForce GTX 480 and I am compiling the code like so:
nvcc -L /soft/cuda-sdk/4.0.17/C/lib -I /soft/cuda-sdk/4.0.17/C/common/inc -lcutil_x86_64 -arch sm_13 -O3 -Xopencc "-Wall" Main.cu
The output is:
Using device 0: GeForce GTX 480
Cuda error in file 'Main.cu' in line 31 : unspecified launch failure.
EDIT: I have now further simplified the code. The following simpler code still produces the error message:
#include <cutil.h>
// Minimal repro: stores a double at an offset of 101 ints from `data`.
// 101 * sizeof(int) is 404 bytes, which is not a multiple of 8, so this 8-byte
// store is the access cuda-memcheck reports as misaligned.
__global__ void compute_energy(void *data) {
*(double*)((int*)data+101) = 0.0;
}
// Repro driver: allocates room for 101 ints followed by one double, runs the
// kernel with a single thread and waits for completion.
int main(int argc, char *argv[]) {
    const size_t bufferBytes = 101*sizeof(int)+sizeof(double);
    const dim3 grid(1, 1, 1);
    const dim3 block(1, 1, 1);

    void *buffer;
    CUT_DEVICE_INIT(argc, argv);
    CUDA_SAFE_CALL(cudaMalloc(&buffer, bufferBytes));

    compute_energy<<<grid, block>>>(buffer);

    // The kernel fault is reported by this synchronizing call.
    CUDA_SAFE_CALL(cudaThreadSynchronize());
}
Now it is easy to see that the offset should be valid. I tried running cuda-memcheck and it says the following:
========= CUDA-MEMCHECK
Using device 0: GeForce GTX 480
Cuda error in file 'Main.cu' in line 13 : unspecified launch failure.
========= Invalid __global__ write of size 8
========= at 0x00000020 in compute_energy
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x200200194 is misaligned
=========
========= ERROR SUMMARY: 1 error
I tried searching the internet to find what is meant by the address being misaligned, but I failed to find an explanation. What is the deal?

It was very hard to parse your original code with all of those magic constants, but your updated repro case makes the problem immediately obvious. The GPU architecture requires every memory access to be naturally aligned, i.e. the address must be a multiple of the size of the type being accessed. Your kernel contains a pointer access which is not correctly aligned. Doubles are a 64-bit type, and your address is not aligned to a 64-bit (8-byte) boundary: (int*)data+101 is a byte offset of 404, which is not a multiple of 8. This:
*(double*)((int*)data+100) = 0.0; // 50th double
or this:
*(double*)((int*)data+102) = 0.0; // 51st double
are both legal. This:
*(double*)((int*)data+101) = 0.0; // not aligned to a 64 bit boundary
is not.

The error indicates an out-of-bounds memory access; please check the offset value.

Related

CUDA mapped memory: device -> host writes are not visible on host

What I am trying to do is have a kernel modify a variable that resides in mapped memory, which should cause the main program to exit.
Instead, the main program keeps spinning on the while (var == 0) ; line. I don't know how to get the new value flushed out so that it becomes visible on the host side too.
Btw. the variable is declared as volatile everywhere and I tried using the __threadfence_system() function with no success.
The host -> device direction works well.
System: Windows 7 x64, driver 358.50, GTX 560
Here is the piece of code that I can't get working:
// Reports a failed CUDA runtime call and terminates the program.
//   err  - status returned by the call
//   file - source file of the call site (the macro passes __FILE__)
//   line - source line of the call site (the macro passes __LINE__)
static void handleCUDAError(cudaError_t err, const char *file, int line)
{
    if (err == cudaSuccess)
        return;  // nothing to report

    const char *description = cudaGetErrorString(err);
    printf("%s in %s at line %d\n", description, file, line);
    exit(EXIT_FAILURE);
}
#define CUDA_ERROR_CHECK(err) (handleCUDAError(err, __FILE__, __LINE__ ))
// Signals the host by writing 1 through `semaphore`.
//   semaphore - device-side pointer to the int the host polls
__global__ void echoKernel(volatile int* semaphore)
{
*semaphore = 1;
// Fence after the store; presumably intended to make the write visible
// system-wide (i.e. to the host) - see the __threadfence_system() docs.
__threadfence_system();
}
// Registers a host stack variable as mapped memory, launches echoKernel on its
// device alias and spins until the kernel's write is observed on the host.
int main()
{
CUDA_ERROR_CHECK(cudaSetDevice(0));
CUDA_ERROR_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost));
volatile int var = 0;
volatile int *devptr;
// Map `var` into the device address space and fetch the device-side pointer.
CUDA_ERROR_CHECK(cudaHostRegister((int*)&var, sizeof (int), cudaHostRegisterMapped));
CUDA_ERROR_CHECK(cudaHostGetDevicePointer(&devptr, (int*)&var, 0));
echoKernel <<< 1, 1 >>> (devptr);
// No runtime API call is made between the launch and this spin loop; this is
// where the program is reported to hang on Windows.
while (var == 0) ;
CUDA_ERROR_CHECK(cudaDeviceSynchronize());
CUDA_ERROR_CHECK(cudaHostUnregister((int*)&var));
CUDA_ERROR_CHECK(cudaDeviceReset());
return 0;
}

When I run your code on linux, it runs as-is without issue.
However, on Windows there is a problem related to WDDM command batching. In effect, your kernel has not actually been launched by the time you enter the while-loop, so the loop hangs.
The WDDM command queue is a queue of commands that will eventually go to the GPU device. Various events will cause this queue to be "flushed" and the contents to be delivered as a "batch" of commands to the GPU.
Various cuda runtime API calls may effectively force the "flushing" of the command queue, such as cudaDeviceSynchronize() or cudaMemcpy(). However after the kernel launch, you are not issuing any runtime API calls before entering your while-loop. As a result, in this scenario it seems that the kernel call is getting "stuck" in the queue and never "flushed".
You can work around this in a variety of ways, for example by recording an event after the launch of the kernel and then querying the status of that event. This will have the effect of flushing the queue, which will launch the kernel.
Here's an example modification of your code that works for me:
#include <stdio.h>
// Reports a failed CUDA runtime call and terminates the program.
//   err  - status returned by the call
//   file - source file of the call site (the macro passes __FILE__)
//   line - source line of the call site (the macro passes __LINE__)
static void handleCUDAError(cudaError_t err, const char *file, int line)
{
    if (err == cudaSuccess)
        return;  // nothing to report

    const char *description = cudaGetErrorString(err);
    printf("%s in %s at line %d\n", description, file, line);
    exit(EXIT_FAILURE);
}
#define CUDA_ERROR_CHECK(err) (handleCUDAError(err, __FILE__, __LINE__ ))
// Signals the host by writing 1 through `semaphore`.
//   semaphore - device-side pointer to the int the host polls
__global__ void echoKernel(volatile int* semaphore)
{
*semaphore = 1;
// Fence after the store; presumably intended to make the write visible
// system-wide (i.e. to the host) - see the __threadfence_system() docs.
__threadfence_system();
}
// Registers a host stack variable as mapped memory, launches echoKernel on its
// device alias and spins until the kernel's write is observed on the host.
// An event is recorded and queried right after the launch so that the pending
// command queue is flushed and the kernel actually starts.
int main()
{
    CUDA_ERROR_CHECK(cudaSetDevice(0));
    CUDA_ERROR_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost));

    volatile int var = 0;
    volatile int *devptr;
    CUDA_ERROR_CHECK(cudaHostRegister((int*)&var, sizeof(int), cudaHostRegisterMapped));
    CUDA_ERROR_CHECK(cudaHostGetDevicePointer(&devptr, (int*)&var, 0));

    cudaEvent_t my_event;
    CUDA_ERROR_CHECK(cudaEventCreate(&my_event));

    echoKernel<<<1, 1>>>(devptr);
    // A rejected launch would never set `var`; fail here instead of spinning forever.
    CUDA_ERROR_CHECK(cudaGetLastError());
    CUDA_ERROR_CHECK(cudaEventRecord(my_event));
    // Deliberately unchecked: a not-ready status is the expected result while
    // the event is still pending. The call is only made to flush the queue.
    cudaEventQuery(my_event);

    while (var == 0);

    CUDA_ERROR_CHECK(cudaDeviceSynchronize());
    CUDA_ERROR_CHECK(cudaEventDestroy(my_event));
    CUDA_ERROR_CHECK(cudaHostUnregister((int*)&var));
    CUDA_ERROR_CHECK(cudaDeviceReset());
    return 0;
}
Tested on CUDA 7.5, Driver 358.50, Win7 x64 release project, GTX460M.
Note that we don't wrap the cudaEventQuery call in a standard error checker, because the expected behavior for it is to return a non-zero status when the event has not been completed yet.

Loading multiple modules in JCuda is not working

In jCuda one can load cuda files as PTX or CUBIN format and call(launch) __global__ functions (kernels) from Java.
With keeping that in mind, I want to develop a framework with JCuda that gets user's __device__ function in a .cu file at run-time, loads and runs it.
And I have already implemented a __global__ function, in which each thread finds out the start point of its related data, perform some computation, initialization and then call user's __device__ function.
Here is my kernel pseudo code:
// Pseudo-code: `args` is a placeholder, not a real parameter list.
// userFunc is only declared here; its definition lives in a separate .cu file.
extern "C" __device__ void userFunc(args);
// Framework kernel: per-thread setup, then a call into the user-supplied function.
extern "C" __global__ void kernel(){
// initialize
userFunc(args);
// rest of the kernel
}
And user's __device__ function:
// Pseudo-code: user-supplied device function called from the framework kernel.
// `args` is a placeholder, not a real parameter list.
extern "C" __device__ void userFunc(args){
// do something
}
And in Java side, here is the part that I load the modules(modules are made from ptx files which are successfully created from cuda files with this command: nvcc -m64 -ptx path/to/cudaFile -o cudaFile.ptx)
// Create one module handle per PTX file, then load each file into its handle.
// The trailing numbers are the line labels used in the text.
CUmodule kernelModule = new CUmodule(); // 1
CUmodule userFuncModule = new CUmodule(); // 2
cuModuleLoad(kernelModule, ptxKernelFileName); // 3
cuModuleLoad(userFuncModule, ptxUserFuncFileName); // 4
When I try to run it I got error at line 3 : CUDA_ERROR_NO_BINARY_FOR_GPU. After some searching I get that my ptx file has some syntax error. After running this suggested command:
ptxas -arch=sm_30 kernel.ptx
I got:
ptxas fatal : Unresolved extern function 'userFunc'
Even when I replace line 3 with 4 to load userFunc before kernel I get this error. I got stuck at this phase. Is this the correct way to load multiple modules that need to be linked together in JCuda? Or is it even possible?
Edit:
Second part of the question is here

The really short answer is: No, you can't load multiple modules into a context in the runtime API.
You can do what you want, but it requires explicit setup and execution of a JIT linking call. I have no idea how (or even whether) that has been implemented in JCUDA, but I can show you how to do it with the standard driver API. Hold on...
If you have a device function in one file, and a kernel in another, for example:
// test_function.cu
#include <math.h>
// Returns sin(x) + cos(y) + sqrt(z) for the three referenced values.
__device__ float mathop(float &x, float &y, float &z)
{
    return sin(x) + cos(y) + sqrt(z);
}
and
// test_kernel.cu
extern __device__ float mathop(float & x, float & y, float & z);
// Applies mathop element-wise: res[i] = mathop(xvals[i], yvals[i], zvals[i]),
// where i is the thread's flat global index. There is no bounds check, so the
// launch must cover exactly the number of elements in the arrays.
__global__ void kernel(float *xvals, float * yvals, float * zvals, float *res)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const float value = mathop(xvals[idx], yvals[idx], zvals[idx]);
    res[idx] = value;
}
You can compile them to PTX as usual:
$ nvcc -arch=sm_30 -ptx test_function.cu
$ nvcc -arch=sm_30 -ptx test_kernel.cu
$ head -14 test_kernel.ptx
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19324607
// Cuda compilation tools, release 7.0, V7.0.27
// Based on LLVM 3.4svn
//
.version 4.2
.target sm_30
.address_size 64
// .globl _Z6kernelPfS_S_S_
.extern .func (.param .b32 func_retval0) _Z6mathopRfS_S_
At runtime, your code must create a JIT link session, add each PTX to the linker session, then finalise the linker session. This will give you a handle to a compiled cubin image which can be loaded as a module as usual. The simplest possible driver API code to put this together looks like this:
#include <cstdio>
#include <cuda.h>
#define drvErrChk(ans) { drvAssert(ans, __FILE__, __LINE__); }
// Reports a failed CUDA driver API call on stderr.
//   code  - status returned by the driver call
//   file  - source file of the call site (the macro passes __FILE__)
//   line  - source line of the call site (the macro passes __LINE__)
//   abort - when true (the default) the process exits with status -1 after
//           reporting; when false the function returns so the caller can
//           handle the failure itself
inline void drvAssert(CUresult code, const char *file, int line, bool abort=true)
{
    if (code == CUDA_SUCCESS)
        return;

    fprintf(stderr, "Driver API Error %04d at %s %d\n", int(code), file, line);
    if (abort)
        exit(-1);
}
// JIT-links test_function.ptx and test_kernel.ptx into one cubin image, loads
// it as a module and looks up the kernel by its mangled name.
int main()
{
    // Driver initialisation can fail (e.g. no driver present), so check it too.
    drvErrChk( cuInit(0) );

    CUdevice device;
    drvErrChk( cuDeviceGet(&device, 0) );

    CUcontext context;
    drvErrChk( cuCtxCreate(&context, 0, device) );

    // Link session: add every PTX input, then finalise to obtain the image.
    CUlinkState state;
    drvErrChk( cuLinkCreate(0, 0, 0, &state) );
    drvErrChk( cuLinkAddFile(state, CU_JIT_INPUT_PTX, "test_function.ptx", 0, 0, 0) );
    drvErrChk( cuLinkAddFile(state, CU_JIT_INPUT_PTX, "test_kernel.ptx" , 0, 0, 0) );

    size_t sz;
    char * image;
    drvErrChk( cuLinkComplete(state, (void **)&image, &sz) );

    // The image belongs to the link state, so load it before destroying the state.
    CUmodule module;
    drvErrChk( cuModuleLoadData(&module, image) );
    drvErrChk( cuLinkDestroy(state) );

    CUfunction function;
    drvErrChk( cuModuleGetFunction(&function, module, "_Z6kernelPfS_S_S_") );

    // Release what was created above instead of relying on process exit.
    drvErrChk( cuModuleUnload(module) );
    drvErrChk( cuCtxDestroy(context) );
    return 0;
}
You should be able to compile and run this as posted and verify it works OK. It should serve as a template for a JCUDA implementation, if they have JIT linking support implemented.

cudaMemcpyFromSymbol on a device variable

I am trying to apply a kernel function on a __device__ variable, which, according to the specs, resides "in global memory"
#include <stdio.h>
#include "sys_data.h"
#include "my_helper.cuh"
#include "helper_cuda.h"
#include <cuda_runtime.h>
double X[10] = {1,-2,3,-4,5,-6,7,-8,9,-10};
double Y[10] = {0};
__device__ double DEV_X[10];
// Copies X into the __device__ array DEV_X, runs vector_projection on it and
// copies the result back into Y.
int main(void) {
checkCudaErrors(cudaMemcpyToSymbol(DEV_X, X,10*sizeof(double)));
// DEV_X is named directly in host code as a kernel argument here; this is the
// line the accompanying answer identifies as passing an invalid address.
vector_projection<double><<<1,10>>>(DEV_X, 10);
getLastCudaError("oops");
// The reported cudaErrorLaunchFailure surfaces at this call.
checkCudaErrors(cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double)));
return 0;
}
The kernel function vector_projection is defined in my_helper.cuh as follows:
// Replaces every negative element of dx[0..n) with zero.
//   dx - array to clamp in place
//   n  - number of elements; threads whose flat index is >= n do nothing
template<typename T> __global__ void vector_projection(T *dx, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n)
        return;  // outside the array
    if (dx[idx] < 0)
        dx[idx] = (T) 0;
}
As you can see, I use cudaMemcpyToSymbol and cudaMemcpyFromSymbol to transfer data to and from the device. However, I'm getting the following error:
CUDA error at ../src/vectorAdd.cu:19 code=4(cudaErrorLaunchFailure)
"cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double))"
Footnote: I can of course avoid to use __device__ variables and go for something like this which works fine; I just want to see how to do the same thing (if possible) with __device__ variables.
Update: The output of cuda-memcheck can be found at http://pastebin.com/AW9vmjFs. The error messages I get are as follows:
========= Invalid __global__ read of size 8
========= at 0x000000c8 in /home/ubuntu/Test0001/Debug/../src/my_helper.cuh:75:void vector_projection<double>(double*, int)
========= by thread (9,0,0) in block (0,0,0)
========= Address 0x000370e8 is out of bounds

The root of the problem is that you are not allowed to take the address of a device variable in ordinary host code:
vector_projection<double><<<1,10>>>(DEV_X, 10);
^
Although this seems to compile correctly, the actual address passed is garbage.
To take the address of a device variable in host code, we can use cudaGetSymbolAddress
Here is a worked example that compiles and runs correctly for me:
$ cat t577.cu
#include <stdio.h>
double X[10] = {1,-2,3,-4,5,-6,7,-8,9,-10};
double Y[10] = {0};
__device__ double DEV_X[10];
// Replaces every negative element of dx[0..n) with zero.
//   dx - array to clamp in place
//   n  - number of elements; threads whose flat index is >= n do nothing
template<typename T> __global__ void vector_projection(T *dx, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n)
        return;  // outside the array
    if (dx[idx] < 0)
        dx[idx] = (T) 0;
}
// Prints a diagnostic to stderr and returns false when `status` is not cudaSuccess.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Copies X into the __device__ array DEV_X, clamps its negative entries to
// zero on the GPU, copies the result back into Y and prints it.
// Returns 1 as soon as any CUDA call or the launch reports an error.
int main(void) {
    if (!cudaOk(cudaMemcpyToSymbol(DEV_X, X, 10*sizeof(double)), "cudaMemcpyToSymbol"))
        return 1;

    // A device symbol's address must be obtained through the runtime; it
    // cannot be taken directly in host code.
    double *my_dx;
    if (!cudaOk(cudaGetSymbolAddress((void **)&my_dx, DEV_X), "cudaGetSymbolAddress"))
        return 1;

    vector_projection<double><<<1,10>>>(my_dx, 10);
    // Launch-configuration errors are only visible through cudaGetLastError().
    if (!cudaOk(cudaGetLastError(), "vector_projection launch"))
        return 1;

    // Blocking copy: also reports any fault raised while the kernel ran.
    if (!cudaOk(cudaMemcpyFromSymbol(Y, DEV_X, 10*sizeof(double)), "cudaMemcpyFromSymbol"))
        return 1;

    for (int i = 0; i < 10; i++)
        printf("%d: %f\n", i, Y[i]);
    return 0;
}
$ nvcc -arch=sm_35 -o t577 t577.cu
$ cuda-memcheck ./t577
========= CUDA-MEMCHECK
0: 1.000000
1: 0.000000
2: 3.000000
3: 0.000000
4: 5.000000
5: 0.000000
6: 7.000000
7: 0.000000
8: 9.000000
9: 0.000000
========= ERROR SUMMARY: 0 errors
$
This is not the only way to address this. It is legal to take the address of a device variable in device code, so you could modify your kernel with a line something like this:
T *dx = DEV_X;
and forgo passing of the device variable as a kernel parameter. As suggested in the comments, you could also modify your code to use Unified Memory.
Regarding error checking, if you deviate from proper cuda error checking and are not careful in your deviations, the results may be confusing. Most cuda API calls can, in addition to errors arising from their own behavior, return an error that resulted from some previous CUDA asynchronous activity (usually kernel calls).

CUDA invalid device symbol error

The code below compiles just fine, but when I try to run it, I get:
GPUassert: invalid device symbol file.cu 114
When I comment out the lines marked with (!!!), the error no longer shows up. My question is: what is causing this error? It makes no sense to me.
Compiling with nvcc file.cu -arch compute_11
#include "stdio.h"
#include <algorithm>
#include <ctime>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
#define THREADS 64
#define BLOCKS 256
// Count of candidate values assigned to each thread.
#define _dif (((1ll<<32)-121)/(THREADS*BLOCKS)+1)
#define HASH_SIZE 1024
#define ROUNDS 16
// ceil(HASH_SIZE / ROUNDS). Fully parenthesized so the macro stays a single
// operand inside larger expressions (e.g. HASH_ROW*2).
#define HASH_ROW ((HASH_SIZE/ROUNDS)+(HASH_SIZE%ROUNDS==0?0:1))
// Parenthesized for the same reason (e.g. x/HASH_COL).
#define HASH_COL (1000000000/HASH_SIZE)
typedef unsigned long long ull;
// Reports a failed CUDA runtime call on stdout.
//   code  - status returned by the call
//   file  - source file of the call site; const because the macro passes the
//           string literal __FILE__
//   line  - source line of the call site
//   abort - when true (the default) the process exits with `code` as its status
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
__device__ unsigned int primes[1024];
//__device__ unsigned char primes[(1<<28)+1];
__device__ long long n = 1ll<<32;
__device__ ull dev_base;
__device__ unsigned int dev_hash;
__device__ unsigned int dev_index;
time_t curtime;
// Bucket index for x: x shifted right by one bit, reduced modulo 1024.
__device__ int hashh(long long x) {
    const long long halved = x >> 1;
    return halved % 1024;
}
// Modular exponentiation by repeated squaring: returns (x^e) % n.
// Products are formed in 64-bit unsigned arithmetic before being reduced, so
// they wrap silently if an intermediate product exceeds 64 bits.
__device__ ull mulmod(ull x,ull e,ull n) {
    ull result = 1;
    for (; e > 0; e >>= 1) {
        if (e & 1)
            result = (result * x) % n;  // this bit of the exponent is set
        x = (x * x) % n;                // square for the next bit
    }
    return result;
}
// Strong-probable-prime test: returns 1 when n is a strong probable prime to
// base a, 0 otherwise. n must be odd.
__device__ int is_SPRP(ull a,ull n) {
    // Write n-1 as oddPart * 2^twos.
    int twos = 0;
    ull oddPart = n - 1;
    for (; oddPart % 2 == 0; oddPart >>= 1)
        ++twos;

    ull x = mulmod(a, oddPart, n);
    if (x == 1)
        return 1;

    // Square up to `twos` times, looking for n-1.
    for (int round = 0; round < twos; ++round, x = (x * x) % n) {
        if (x == n - 1)
            return 1;
    }
    return 0;
}
// Runs is_SPRP on x, using as base the entry of the global `primes` table
// selected by a multiplicative hash of x.
__device__ int prime(long long x) {
    const long long slot = (((long long)0xAFF7B4 * x) >> 7) % 1024;
    const unsigned long long base = (unsigned long long)primes[slot];
    return is_SPRP(base, (unsigned long long)x);
}
// Scans this thread's range of odd candidates and appends the ones that pass
// the filters below to per-row regions of `out`.
//   out - output array, addressed in per-row regions of (HASH_COL)*4 elements
//   c   - per-row counters, advanced with atomicAdd to reserve output slots
// Per-thread storage: buff is HASH_ROW x 256 unsigned ints (64 * 256 * 4 =
// 65536 bytes) and local_c is HASH_ROW ints (256 bytes), 0x10100 bytes total.
__global__ void find(unsigned int *out,unsigned int *c) {
unsigned int buff[HASH_ROW][256];
int local_c[HASH_ROW];
for(int i=0;i<HASH_ROW;++i) local_c[i]=0;
// [b, e) is this thread's slice of candidates, starting at 121.
long long b = 121+(threadIdx.x+blockIdx.x*blockDim.x)*_dif;
long long e = b+_dif;
// Only odd candidates are visited.
if(b%2==0) ++b;
for(long long i=b;i<e && i<n;i+=2) {
// Skip multiples of 3, 5 and 7.
if(i%3==0 || i%5==0 || i%7==0) continue;
// Row of this candidate relative to the window selected by dev_hash.
int hash_num = hashh(i)-(dev_hash*(HASH_ROW));
if(0<=hash_num && hash_num<HASH_ROW) {
// NOTE(review): candidates for which prime() returns non-zero are skipped,
// so only values that fail the test are stored - confirm this is intended.
if(prime(i)) continue;
buff[hash_num][local_c[hash_num]++]=(unsigned int)i;
// Row buffer is full: reserve space in `out` and flush it.
if(local_c[hash_num]==256) {
int start = atomicAdd(c+hash_num,local_c[hash_num]);
// The whole thread stops once a row's region would be exceeded.
if(start+local_c[hash_num]>=HASH_COL) return;
unsigned int *out_offset = out+hash_num*(HASH_COL)*4;
// The inner `i` shadows the outer candidate variable within this loop only.
for(int i=0;i<local_c[hash_num];++i) out_offset[i+start]=buff[hash_num][i]; //(!!!)
local_c[hash_num]=0;
}
}
}
// Flush whatever is left in every row buffer.
for(int i=0;i<HASH_ROW;++i) {
int start = atomicAdd(c+i,local_c[i]);
if(start+local_c[i]>=HASH_COL) return;
unsigned int *out_offset = out+i*(HASH_COL)*4;
for(int j=0;j<local_c[i];++j) out_offset[j+start]=buff[i][j]; //(!!!)
}
}
// Prints the derived hash-table dimensions, then initialises the device
// variable dev_base through its symbol address. The find kernel is not launched.
int main(void) {
printf("HASH_ROW: %d\nHASH_COL: %d\nPRODUCT: %d\n",(int)HASH_ROW,(int)HASH_COL,(int)(HASH_ROW)*(HASH_COL));
ull *base_adr;
// This is the call that reports "invalid device symbol" in the question.
gpuErrchk(cudaGetSymbolAddress((void**)&base_adr,dev_base));
// Zero the first 7 bytes of dev_base, then set its first byte to 0x02.
gpuErrchk(cudaMemset(base_adr,0,7));
gpuErrchk(cudaMemset(base_adr,0x02,1));
}

A rather unusual error.
The failure is occurring because:
By specifying a virtual architecture only (-arch compute_11) you defer the PTX compile step until runtime (i.e. you are forcing JIT-compile)
The JIT-compile is failing (at runtime)
The failure of the JIT-compile (and link) means device symbols cannot be properly established
Due to the problem with device symbols, the operation cudaGetSymbolAddress on the device symbol dev_base fails, and throws an error.
Why is the JIT-compile failing? You can find out yourself by triggering the machine code compile (which runs the ptxas assembler) by specifying -arch=sm_11 instead of -arch compute_11. If you do that, you'll get this result:
ptxas error : Entry function '_Z4findPjS_' uses too much local data (0x10100 bytes, 0x4000 max)
So even though your code doesn't call the find kernel, it must compile successfully to have a sane device environment for symbols.
Why does this compile error occur? Because you are requesting too much local memory per thread. cc 1.x devices are limited to 16KB local memory per thread, and your find kernel is requesting quite a bit more than that (over 64KB).
When I initially tried it on my device, I was using a cc2.0 device which has a higher limit (512KB per thread) and so the JIT-compile step succeeded.
In general, I would recommend specifying both a virtual architecture and a machine architecture, and the shorthand way to do that is:
nvcc -arch=sm_11 ....
(for a cc1.1 device)
This question/answer may also be of interest, and the nvcc manual has more details about virtual vs. machine architecture, and how to specify the compilation phases for each.
I believe the reason the error goes away when you comment out those particular lines in the kernel, is that with those commented out, the compiler is able to optimize-out the accesses to those local memory areas, and optimize-out the instantiation of the local memory. This allows the JIT-compile step to complete successfully, and your code runs "without runtime error".
You can verify this by commenting those lines out and then specify a full compile (nvcc -arch=sm_11 ...), where -arch is short for --gpu-architecture.

This error usually means the kernel has been compiled for the wrong architecture. You need to find out what the compute capability of your GPU is, and then compile it for that architecture. E.g. if your GPU has compute capability 1.1, compile it with -arch=sm_11. You can also build an executable for more than one architecture.

printf() in my CUDA kernel doesn't produce any output

I have added some printf() statements in my CUDA program
// Forward declaration of the kernel defined in Kernel.cu.
// NOTE(review): __device__ and __global__ are combined here; the CUDA
// Programming Guide presumably disallows that combination and plain
// __global__ is likely intended - confirm.
__device__ __global__ void Kernel(float *, float * ,int );
// Host wrapper that launches Kernel; the elided lines (.....) are not shown.
void DeviceFunc(float *temp_h , int numvar , float *temp1_h)
{ .....
//Kernel call
printf("calling kernel\n");
// The launch is asynchronous and is followed by no error check or
// synchronization in the visible code.
Kernel<<<dimGrid , dimBlock>>>(a_d , b_d , numvar);
printf("kernel called\n");
....
}
// Program entry point; prints markers around the DeviceFunc call.
int main(int argc , char **argv)
{ ....
printf("beforeDeviceFunc\n\n");
DeviceFunc(a_h , numvar , b_h); //Showing the data
printf("after DeviceFunc\n\n");
....
}
Also in the Kernel.cu, I wrote:
#include<cuda.h>
#include <stdio.h>
// Copies one element per thread from a_d into a 16x16 shared tile and prints
// the thread's indices; the remainder of the body is elided (....).
//   a_d  - input array, read with a row stride of size+1
//   b_d  - not used in the visible part of the body
//   size - used for the row stride and printed
__device__ __global__ void Kernel(float *a_d , float *b_d ,int size)
{
int idx = threadIdx.x ;
int idy = threadIdx.y ;
//Allocating memory in the share memory of the device
__shared__ float temp[16][16];
//Copying the data to the shared memory
// temp is 16x16, so idx and idy must both stay below 16.
temp[idy][idx] = a_d[(idy * (size+1)) + idx] ;
printf("idx=%d, idy=%d, size=%d", idx, idy, size);
....
}
Then I compile using -arch=sm_20 like this:
nvcc -c -arch sm_20 main.cu
nvcc -c -arch sm_20 Kernel.cu
nvcc -arch sm_20 main.o Kernel.o -o main
Now when I run the program, I see:
beforeDeviceFunc
calling kernel
kernel called
after DeviceFunc
So the printf() inside the kernel is not printed. How can I fix that?

printf() output is only displayed if the kernel finishes successfully, so check the return codes of all CUDA function calls and make sure no errors are reported.
Furthermore printf() output is only displayed at certain points in the program. Appendix B.32.2 of the Programming Guide lists these as
Kernel launch via <<<>>> or cuLaunchKernel() (at the start of the launch, and if the CUDA_LAUNCH_BLOCKING environment variable is set to 1, at the end of the launch as well),
Synchronization via cudaDeviceSynchronize(), cuCtxSynchronize(), cudaStreamSynchronize(), cuStreamSynchronize(), cudaEventSynchronize(), or cuEventSynchronize(),
Memory copies via any blocking version of cudaMemcpy*() or cuMemcpy*(),
Module loading/unloading via cuModuleLoad() or cuModuleUnload(),
Context destruction via cudaDeviceReset() or cuCtxDestroy().
Prior to executing a stream callback added by cudaStreamAddCallback() or cuStreamAddCallback().
To check this is your problem, put the following code after your kernel invocation:
{
    // Wait for the kernel to finish; an error raised while it ran is returned here.
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    if (syncStatus != cudaSuccess) {
        const char *reason = cudaGetErrorString(syncStatus);
        printf("kernel launch failed with error \"%s\".\n", reason);
    }
}
You should then see either the output of your kernel or an error message.
More conveniently, cuda-memcheck will automatically check all return codes for you if you run your executable under it. While you should always check for errors anyway, this comes handy when resolving concrete issues.

I had the same error just now and decreasing the block size to 512 helped. According to documentation maximum block size can be either 512 or 1024.
I have written a simple test that showed that my GTX 1070 has a maximum block size of 1024. UPD: you can check whether your kernel was launched at all by calling cudaError_t cudaPeekAtLastError(), which returns cudaSuccess if the kernel has started successfully; only after that is it worth calling cudaError_t cudaDeviceSynchronize().
Testing block size of 1023
Testing block size of 1024
Testing block size of 1025
CUDA error: invalid configuration argument
Block maximum size is 1024
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
// Each thread writes 1 into the element of t selected by its threadIdx.x.
__global__
void set1(int* t)
{
    const unsigned int slot = threadIdx.x;
    t[slot] = 1;
}
// Returns true and prints the error text to stderr when `error` is not
// cudaSuccess; returns false without printing otherwise.
inline bool failed(cudaError_t error)
{
    const bool isFailure = (error != cudaSuccess);
    if (isFailure)
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(error));
    return isFailure;
}
// Returns true when one block of `blockSize` threads can be launched, runs to
// completion, and every thread wrote its element. Any CUDA error is printed by
// failed(); the managed buffer is released on every path that allocated it.
static bool blockSizeWorks(int blockSize)
{
    int* t = 0;
    if (failed(cudaMallocManaged(&t, blockSize * sizeof(int))))
        return false;  // nothing was allocated, so there is nothing to free

    // Clear every element so a thread that never ran is detectable.
    for (int i = 0; i < blockSize; i++)
        t[i] = 0;

    set1 <<<1, blockSize>>> (t);

    // First the launch itself (bad configuration), then the execution.
    bool ok = !failed(cudaPeekAtLastError()) && !failed(cudaDeviceSynchronize());
    if (ok)
    {
        for (int i = 0; i < blockSize; i++)
            if (1 != t[i])
            {
                printf("CUDA error: t[%d] = %d but not 1\n", i, t[i]);
                ok = false;
                break;
            }
    }

    failed(cudaFree(t));
    return ok;
}

// Probes increasing block sizes until one fails and reports the largest size
// that worked. Returns 1 when not even a single-thread block works.
int main()
{
    int blockSize;
    for (blockSize = 1; blockSize < 1 << 12; blockSize++)
    {
        printf("Testing block size of %d\n", blockSize);
        if (!blockSizeWorks(blockSize))
            break;
    }

    // The loop stops on the first failing size, so step back to the last good one.
    blockSize--;
    if(blockSize <= 0)
    {
        printf("CUDA error: block size cannot be 0\n");
        return 1;
    }
    printf("Block maximum size is %d", blockSize);
    return 0;
}
P.S. Please note that the only thing that matters in block sizing is warp granularity, which is 32 nowadays, so if 0 == yourBlockSize % 32 the warps are used pretty efficiently. The only reason to make blocks bigger than 32 is when the code needs synchronization: synchronization is available only among the threads of a single block, which pushes a developer towards a single large block instead of many small ones. So running with a higher number of smaller blocks can be even more efficient than running with a lower number of larger blocks.

We Keep Coding

html mysql json google-apps-script actionscript-3 ms-access google-chrome google-maps reporting-services sql-server-2008