Kernel fails on launch cause of kernel pararameters - cuda

I made a simple CUDA kernel which fails to launch for some reason I don't understand.
Below you see my global vars.
unsigned int volume[256*256*256];//contains volume data of source
unsigned int target[256*256*256];//contains volume data of target
unsigned int* d_volume=NULL;//source data on device
unsigned int* d_target=NULL;//target data on device
The next function is a kernel launcher.
void launch_kernel(){
cudaMalloc(&d_volume,256*256*256*sizeof(unsigned int));
cudaMemcpy(d_volume, volume, 256*256*256*sizeof(unsigned int),cudaMemcpyHostToDevice);
cudaMalloc(&d_target,256*256*256*sizeof(unsigned int));
cudaMemcpy(d_target, target, 256*256*256*sizeof(unsigned int),cudaMemcpyHostToDevice);
dim3 threads(256,1,1);
dim3 blocks(256,256,1);
simple_kernel<<<blocks,threads>>>(d_volume,d_target);
cudaError_t cudaResult;
cudaResult = cudaGetLastError();
if (cudaResult != cudaSuccess)
{
cout<<"kernel failed"<<endl;
}
cudaMemcpy(volume, d_volume, 256*256*256*sizeof( int),cudaMemcpyDeviceToHost);
cudaFree(d_volume);
cudaMemcpy(target, d_target 256*256*256*sizeof( int),cudaMemcpyDeviceToHost);
cudaFree(d_target);
}
Problem seems to be on d_target cause if I launch the kernel like that:
simple_kernel<<<blocks,threads>>>(d_volume,d_volume);
it is working perfectly (passes on to the device the values that must be passed) and no message appears. Any idea why could that happen?
Kernel declaration follows below.
__global__ void simple_kernel(unsigned int* src,unsigned int* tgt){
//i dont think it matters what it is for.
int x = threadIdx.x;
int y = blockIdx.x;
int z = blockIdx.y;
if(x!=0 || x!=255 || y!=0 || y!=255 || z!=0 || z!=255 ){//in bound of memory allocated
if( src[x*256*256+y*256+z]==tgt[x*256*256+y*256+z])
if(tgt[(x+1)*256*256+y*256+z]==1 || tgt[(x-1)*256*256+y*256+z]==1 || tgt[(x-1)*256*256+(y+1)*256+z] ||tgt[(x-1)*256*256+(y-1)*256+z])
src[x*256*256+y*256+z]=1;
else
src[x*256*256+y*256+z]=0;
}
}

CUDA can return error also in a case of out-of-bounds read access to global memory. You perform this out-of-bounds read access in:
if(tgt[(x+1)*256*256+y*256+z]==1 || ...) e.g. for x = y = z = 255 which go through your out-of-bounds checking.
In a case you launch your kernel as
simple_kernel<<<blocks,threads>>>(d_volume,d_volume);
during out-of-bounds read access you actually access global memory which has already been allocated for d_target as arrays d_volume and d_target are stored consecutively, hence, no error occurs.
Confirm my opinion by further error-checking or launch your program with cuda-memcheck.

Related

CUDA multi-gpu p2p sync

I try to implement a producer-consumer relationship between
two GPUs, my application makes the producer GPU record an event and then
the consumer GPU inserts a stream-wait on that event into its command
stream. It will stopprocessing its commands when consumer GPU encounters the stream-wait,
until the producer GPU has passed the 'point of execution' where cudaEventRecord was called.
I started with a staging buffer like this:
cudaError_t chCpyP2P(void *_dst, int dstDevice, const void *_src, int srcDevice, size_t N) {
cudaError_t status;
char *dst = (char*) _dst;
const char *src = (const char*) _src;
int stagingIndex = 0;
size_t copySize = min(N, STAGING_BUFFER_SIZE);
while ( N ) {
cudaSetDevice(srcDevice);
cudaStreamWaitEvent(0, g_events[dstDevice][stagingIndex],0);
cudaMemcpyAsync(g_hostBuffers[stagingIndex], src, copySize, cudaMemcpyDeviceToHost, NULL);
cudaEventRecord(g_events[srcDevice][stagingIndex] );
cudaSetDevice(dstDevice);
cudaMemcpyAsync(dst, g_hostBuffers[stagingIndex], copySize, cudaMemcpyHostToDevice, NULL);
dst += copySize;
src += copySize;
N -= copySize;
stagingIndex = 1 - stagingIndex;
}
}
But I am missing somehow an essential step as it is not working as expected. I do not find any place where to
overthink my plans.
Does anyone know what I could do ?
Thanks for help, hope my question isn't too dumb.
on the host, the staging buffer is allocated and the memcpy starts by having
the source GPU copy source data into the staging buffer and recording a event.
But: Unlike the host2device memcpy, the CPU doesn't need to
synchronize as all synchronization will be done by the GPUs. Because memcpy
and the event-record are asynchronous, directly after the initial memcpy,
the CPU can request the destination-GPU to wait on that initial event and start a memcpy of the same buffer.
In order to let the two GPUs can use the staging buffers concurrently, two staging buffers and two CUDA events are needed. The CPU loops over the input buffer and output buffers, issuing memcpy and event-record
commands, until it has requested copies for all bytes, waiting for both GPUs to finish processing.
cudaError_t chCpyP2P(void *_dst,int dstDevice,const void *_src,int srcDevice,size_t N)
{
cudaError_t status;
char *dst = (char *) _dst;
const char *src = (const char *) _src;
int stg_idx = 0; // staging-index
while (N) {
size_t sz_cpy = min(N,STAGING_BUFFER_SIZE);
cudaSetDevice( srcDevice );
cudaStreamWaitEvent(0,g_events[dstDevice][stg_idx],0);
cudaMemcpyAsync(g_hostBuffers[stg_idx],src,sz_cpy,cudaMemcpyDeviceToHost,NULL);
cudaEventRecord(g_events[srcDevice][stg_idx]);
cudaSetDevice(dstDevice);
cudaStreamWaitEvent(0,g_events[srcDevice][stg_idx],0);
cudaMemcpyAsync(dst,g_hostBuffers[stg_idx],sz_cpy,cudaMemcpyHostToDevice,NULL);
cudaEventRecord(g_events[dstDevice][stg_idx]);
dst += sz_cpy;
src += sz_cpy;
N -= sz_cpy;
stg_idx = 1 - stg_idx;
}
cudaSetDevice(srcDevice);
cudaDeviceSynchronize();
cudaSetDevice(dstDevice);
cudaDeviceSynchronize();
return status;
}
You also need to define size_t sz_cpy outside the loop ;-)

CUDA invalid device symbol error

the code below compiles just fine. But when i try to run it, i got
GPUassert: invalid device symbol file.cu 114
When i comment lines marked by (!!!) the error wont show up. My question is what is causing this error because it gives me no sense.
Compiling with nvcc file.cu -arch compute_11
#include "stdio.h"
#include <algorithm>
#include <ctime>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
#define THREADS 64
#define BLOCKS 256
#define _dif (((1ll<<32)-121)/(THREADS*BLOCKS)+1)
#define HASH_SIZE 1024
#define ROUNDS 16
#define HASH_ROW (HASH_SIZE/ROUNDS)+(HASH_SIZE%ROUNDS==0?0:1)
#define HASH_COL 1000000000/HASH_SIZE
typedef unsigned long long ull;
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
//fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__device__ unsigned int primes[1024];
//__device__ unsigned char primes[(1<<28)+1];
__device__ long long n = 1ll<<32;
__device__ ull dev_base;
__device__ unsigned int dev_hash;
__device__ unsigned int dev_index;
time_t curtime;
__device__ int hashh(long long x) {
return (x>>1)%1024;
}
// compute (x^e)%n
__device__ ull mulmod(ull x,ull e,ull n) {
ull ans = 1;
while(e>0) {
if(e&1) ans = (ans*x)%n;
x = (x*x)%n;
e>>=1;
}
return ans;
}
// determine whether n is strong probable prime base a or not.
// n is ODD
__device__ int is_SPRP(ull a,ull n) {
int d=0;
ull t = n-1;
while(t%2==0) {
++d;
t>>=1;
}
ull x = mulmod(a,t,n);
if(x==1) return 1;
for(int i=0;i<d;++i) {
if(x==n-1) return 1;
x=(x*x)%n;
}
return 0;
}
__device__ int prime(long long x) {
//unsigned long long b = 2;
//return is_SPRP(b,(unsigned long long)x);
return is_SPRP((unsigned long long)primes[(((long long)0xAFF7B4*x)>>7)%1024],(unsigned long long)x);
}
__global__ void find(unsigned int *out,unsigned int *c) {
unsigned int buff[HASH_ROW][256];
int local_c[HASH_ROW];
for(int i=0;i<HASH_ROW;++i) local_c[i]=0;
long long b = 121+(threadIdx.x+blockIdx.x*blockDim.x)*_dif;
long long e = b+_dif;
if(b%2==0) ++b;
for(long long i=b;i<e && i<n;i+=2) {
if(i%3==0 || i%5==0 || i%7==0) continue;
int hash_num = hashh(i)-(dev_hash*(HASH_ROW));
if(0<=hash_num && hash_num<HASH_ROW) {
if(prime(i)) continue;
buff[hash_num][local_c[hash_num]++]=(unsigned int)i;
if(local_c[hash_num]==256) {
int start = atomicAdd(c+hash_num,local_c[hash_num]);
if(start+local_c[hash_num]>=HASH_COL) return;
unsigned int *out_offset = out+hash_num*(HASH_COL)*4;
for(int i=0;i<local_c[hash_num];++i) out_offset[i+start]=buff[hash_num][i]; //(!!!)
local_c[hash_num]=0;
}
}
}
for(int i=0;i<HASH_ROW;++i) {
int start = atomicAdd(c+i,local_c[i]);
if(start+local_c[i]>=HASH_COL) return;
unsigned int *out_offset = out+i*(HASH_COL)*4;
for(int j=0;j<local_c[i];++j) out_offset[j+start]=buff[i][j]; //(!!!)
}
}
int main(void) {
printf("HASH_ROW: %d\nHASH_COL: %d\nPRODUCT: %d\n",(int)HASH_ROW,(int)HASH_COL,(int)(HASH_ROW)*(HASH_COL));
ull *base_adr;
gpuErrchk(cudaGetSymbolAddress((void**)&base_adr,dev_base));
gpuErrchk(cudaMemset(base_adr,0,7));
gpuErrchk(cudaMemset(base_adr,0x02,1));
}
A rather unusual error.
The failure is occurring because:
By specifying a virtual architecture only (-arch compute_11) you defer the PTX compile step until runtime (i.e. you are forcing JIT-compile)
The JIT-compile is failing (at runtime)
The failure of the JIT-compile (and link) means device symbols cannot be properly established
Due to the problem with device symbols, the operation cudaGetSymbolAddress on the device symbol dev_base fails, and throws an error.
Why is the JIT-compile failing? You can find out yourself by triggering the machine code compile (which runs the ptxas assembler) by specifying -arch=sm_11 instead of -arch compute_11. If you do that, you'll get this result:
ptxas error : Entry function '_Z4findPjS_' uses too much local data (0x10100 bytes, 0x4000 max)
So even though your code doesn't call the find kernel, it must compile successfully to have a sane device environment for symbols.
Why does this compile error occur? Because you are requesting too much local memory per thread. cc 1.x devices are limited to 16KB local memory per thread, and your find kernel is requesting quite a bit more than that (over 64KB).
When I initially tried it on my device, I was using a cc2.0 device which has a higher limit (512KB per thread) and so the JIT-compile step succeeded.
In general, I would recommend specifying both a virtual architecture and a machine architecture, and the shorthand way to do that is:
nvcc -arch=sm_11 ....
(for a cc1.1 device)
This question/answer may also be of interest, and the nvcc manual has more details about virtual vs. machine architecture, and how to specify the compilation phases for each.
I believe the reason the error goes away when you comment out those particular lines in the kernel, is that with those commented out, the compiler is able to optimize-out the accesses to those local memory areas, and optimize-out the instantiation of the local memory. This allows the JIT-compile step to complete successfully, and your code runs "without runtime error".
You can verify this by commenting those lines out and then specify a full compile (nvcc -arch=sm_11 ...), where -arch is short for --gpu-architecture.
This error usually means the kernel has been compiled for the wrong architecture. You need to find out what the compute capability of your GPU is, and then compile it for that architecture. E.g. if your GPU has compute capability 1.1, compile it with -arch=sm_11. You can also build an executable for more than one architecture.

Shared memory address passed to device function is still shared memory?

Let's say i have this __device__ function:
__device__ unsigned char* dev_kernel(unsigned char* array_sh, int params){
return array_sh + params;
}
And within the __global__ kernel i use it in this way:
uarray = dev_kernel (uarray, params);
Where uarray is an array located in shared memory.
But when i use cuda-gdb to see the addresss of uarray within __global__ kernel i get:
(#generic unsigned char * #shared) 0x1000010 "z\377*"
And within __device__ kernel i get:
(unsigned char * #generic) 0x1000010 <Error reading address 0x1000010: Operation not permitted>
Despite the error, the program in running ok (maybe it is some limitation of cuda-gdb).
So, i want to know: Within the __device__ kernel, uarray is shared yet? I'm changing the array from global to shared memory and the time is almost the same (with shared memory the time is a little worse).
So, i want to know: Within the __device__ kernel, uarray is shared yet?
Yes, when you pass a pointer to shared memory to a device function this way, it still points to the same place in shared memory.
In response to the questions posted below which are perplexing me, I elected to show a simple example:
$ cat t249.cu
#include <stdio.h>
#define SSIZE 256
__device__ unsigned char* dev_kernel(unsigned char* array_sh, int params){
return array_sh + params;
}
__global__ void mykernel(){
__shared__ unsigned char myshared[SSIZE];
__shared__ unsigned char *u_array;
for (int i = 0; i< SSIZE; i++)
myshared[i] = (unsigned char) i;
unsigned char *loc = dev_kernel(myshared, 5);
u_array = loc;
printf("val = %d\n", *loc);
printf("val = %d\n", *u_array);
}
int main(){
mykernel<<<1,1>>>();
cudaDeviceSynchronize();
return 0;
}
$ nvcc -arch=sm_20 -g -G -o t249 t249.cu
$ cuda-gdb ./t249
NVIDIA (R) CUDA Debugger
5.5 release
....
Reading symbols from /home/user2/misc/t249...done.
(cuda-gdb) break mykernel
Breakpoint 1 at 0x4025dc: file t249.cu, line 9.
(cuda-gdb) run
Starting program: /home/user2/misc/t249
[Thread debugging using libthread_db enabled]
Breakpoint 1, mykernel () at t249.cu:9
9 __global__ void mykernel(){
(cuda-gdb) break 14
Breakpoint 2 at 0x4025e1: file t249.cu, line 14.
(cuda-gdb) continue
Continuing.
[New Thread 0x7ffff725a700 (LWP 26184)]
[Context Create of context 0x67e360 on Device 0]
[Launch of CUDA Kernel 0 (mykernel<<<(1,1,1),(1,1,1)>>>) on Device 0]
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 2, warp 0, lane 0]
Breakpoint 1, mykernel<<<(1,1,1),(1,1,1)>>> () at t249.cu:12
12 for (int i = 0; i< SSIZE; i++)
(cuda-gdb) continue
Continuing.
Breakpoint 2, mykernel<<<(1,1,1),(1,1,1)>>> () at t249.cu:14
14 unsigned char *loc = dev_kernel(myshared, 5);
(cuda-gdb) print &(myshared[0])
$1 = (#shared unsigned char *) 0x8 ""
^
|
cuda-gdb is telling you that this pointer is defined in a __shared__ statement, and therefore it's storage is implicit and it is unmodifiable.
(cuda-gdb) print &(u_array)
$2 = (#generic unsigned char * #shared *) 0x0
^ ^
| u_array is stored in shared memory.
u_array is a generic pointer, meaning it can point to anything.
(cuda-gdb) step
dev_kernel(unsigned char * #generic, int) (array_sh=0x1000008 "", params=5)
at t249.cu:6
6 return array_sh + params;
(cuda-gdb) print array_sh
$3 = (#generic unsigned char * #register) 0x1000008 ""
^ ^
| array_sh is stored in a register.
array_sh is a generic pointer, it can point to anything.
(cuda-gdb) print u_array
No symbol "u_array" in current context.
(note that I can't access u_array from inside the __device__ function, so I don't understand your comment there.)
(cuda-gdb) step
mykernel<<<(1,1,1),(1,1,1)>>> () at t249.cu:15
15 u_array = loc;
(cuda-gdb) step
16 printf("val = %d\n", *loc);
(cuda-gdb) print u_array
$4 = (
#generic unsigned char * #shared) 0x100000d ......
^ ^
| u_array is stored in shared memory
u_array is a generic pointer, it can point to anything
(cuda-gdb)
Although you haven't provided it, I am assuming your definition of u_array is similar to mine, based on the cuda-gdb output you are getting.
Note that the indicators like #shared are not telling you what kind of memory a pointer is pointing to, they are telling you either what kind of pointer it is (defined implicitly in a __shared__ statement) or else where it is stored (in shared memory).
If this doesn't sort out your questions, please provide a complete example, along with complete cuda-gdb session output, just as I have.

CUDA kernel call in a simple sample

It's the first parallel code of cuda by example .
Can any one describe me about the kernel call : <<< N , 1 >>>
This is the code with important points :
#define N 10
__global__ void add( int *a, int *b, int *c ) {
int tid = blockIdx.x; // this thread handles the data at its thread id
if (tid < N)
c[tid] = a[tid] + b[tid];
}
int main( void ) {
int a[N], b[N], c[N];
int *dev_a, *dev_b, *dev_c;
// allocate the memory on the GPU
// fill the arrays 'a' and 'b' on the CPU
// copy the arrays 'a' and 'b' to the GPU
add<<<N,1>>>( dev_a, dev_b, dev_c );
// copy the array 'c' back from the GPU to the CPU
// display the results
// free the memory allocated on the GPU
return 0;
}
Why it used of <<< N , 1 >>> that it means we used of N blocks and 1 thread in each block ?? since we can write this <<< 1 , N >>> and used 1 block and N thread in this block for more optimization.
For this little example, there is no particular reason (as Bart already told you in the comments). But for a larger, more realistic example you should always keep in mind that the number of threads per block is limited. That is, if you use N = 10000, you could not use <<<1,N>>> anymore, but <<<N,1>>> would still work.

CUDA-GDB crashes in Kernel

I've been trying to debug my code, as I know something is going wrong in the Kernel, and I've been trying to figure out what specifically. If I try to step into the kernel it seems to completely step over the kernel functions, and will eventually cause an error on quitting:
Single stepping until exit from function dyld_stub_cudaSetupArgument,
which has no line number information.
[Launch of CUDA Kernel 0 (incrementArrayOnDevice<<<(3,1,1),(4,1,1)>>>) on
Device 0]
[Termination of CUDA Kernel 0 (incrementArrayOnDevice<<<(3,1,1),
(4,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 1 (fillinBoth<<<(40,1,1),(1,1,1)>>>) on Device 0]
[Termination of CUDA Kernel 1 (fillinBoth<<<(40,1,1),(1,1,1)>>>) on Device 0]
add (below=0x124400, newtip=0x124430, newfork=0x125ac0) at test.cu:1223
And if I try to break in the Kernel my entire computer crashes and I have to restart it.
I figure there must be something wrong with the way I'm calling the kernel, but I can't figure out what.
The code is rather long, so I'm only including an excerpt of it:
__global__ void fillinOne(seqptr qset, long max) {
int i, j;
aas aa;
int idx = blockIdx.x;
__shared__ long qs[3];
if(idx < max)
{
memcpy(qs, qset[idx], sizeof(long[3]));
for (i = 0; i <= 1; i++)
{
for (aa = ala; (long)aa <= (long)stop; aa = (aas)((long)aa + 1))
{
if (((1L << ((long)aa)) & qs[i]) != 0)
{
for (j = i + 1; j <= 2; j++)
qs[j] |= cudaTranslate[(long)aa - (long)ala][j - i];
}
}
}
}
}
//Kernel for left!= NULL and rt != NULL
void fillin(node *p, node *left, node *rt)
{
cudaError_t err = cudaGetLastError();
size_t stepsize = chars * sizeof(long);
size_t sitesize = chars * sizeof(sitearray);
//int i, j;
if (left == NULL)
{
//copy rt->numsteps into p->numsteps--doesn't actually require CUDA, because no computation to do
memcpy(p->numsteps, rt->numsteps, stepsize);
checkCUDAError("memcpy");
//allocate siteset (array of sitearrays) on device
seqptr qsites; //as in array of qs's
cudaMalloc((void **) &qsites, sitesize);
checkCUDAError("malloc");
//copy rt->siteset into device array (equivalent to memcpy(qs, rs) but for whole array)
cudaMemcpy(qsites, rt->siteset, sitesize, cudaMemcpyHostToDevice);
checkCUDAError("memcpy");
//do loop in device
int block_size = 1; //each site operated on independently
int n_blocks = chars;
fillinOne <<< n_blocks, block_size>>> (qsites, chars);
cudaThreadSynchronize();
//put qset in p->siteset--equivalent to memcpy(p->siteset[m], qs)
cudaMemcpy(p->siteset, qsites, sitesize, cudaMemcpyDeviceToHost);
checkCUDAError("memcpy");
//Cleanup
cudaFree(qsites);
}
If anyone has any ideas at all, please resond! Thanks in advance!
I suppose you have a single card configuration. When you are debugging a cuda kernel and you break inside it you effectively put the display driver in pause. That causes what you think is a crash. If you want to use the cuda-gdb with only one graphics card you must use it in command line mode (don't start X or press ctrl-alt-fn from X).
If you have two cards you must run the code in the card not running the display. Use cudaSelectDevice(n).