NVCC ignoring CUDA code? - cuda

I have just installed CUDA 5.5 on my notebook and trying out using NVCC to compile a basic hello world program from this link http://computer-graphics.se/hello-world-for-cuda.html
The code I'm trying out is this:
// This is the REAL "hello world" for CUDA!
// It takes the string "Hello ", prints it, then passes it to CUDA with an array
// of offsets. Then the offsets are added in parallel to produce the string "World!"
// By Ingemar Ragnemalm 2010
#include <stdio.h>
const int N = 16;
const int blocksize = 16;
// Adds the integer offset b[i] to the character a[i], one element per thread.
// Only threadIdx.x is used as the index and there is no bounds check, so the
// launch must supply no more threads per block than both arrays have elements.
__global__
void hello(char *a, int *b)
{
    const unsigned int lane = threadIdx.x;
    a[lane] = a[lane] + b[lane];
}
// Reports a failed CUDA runtime call on stderr and terminates the program.
// `what` names the operation that produced `status`.
static void checkHello(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Prints "Hello ", lets the GPU add the offsets in b to the characters of a,
// then prints the transformed string. Every CUDA call is checked so that a
// non-working runtime/driver is reported instead of silently leaving `a`
// unmodified (which shows up as "Hello Hello").
int main()
{
    char a[N] = "Hello \0\0\0\0\0\0";
    int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    char *ad;
    int *bd;
    const int csize = N*sizeof(char);
    const int isize = N*sizeof(int);

    printf("%s", a);

    checkHello(cudaMalloc( (void**)&ad, csize ), "cudaMalloc(ad)");
    checkHello(cudaMalloc( (void**)&bd, isize ), "cudaMalloc(bd)");
    checkHello(cudaMemcpy( ad, a, csize, cudaMemcpyHostToDevice ), "cudaMemcpy(a -> ad)");
    checkHello(cudaMemcpy( bd, b, isize, cudaMemcpyHostToDevice ), "cudaMemcpy(b -> bd)");

    dim3 dimBlock( blocksize, 1 );
    dim3 dimGrid( 1, 1 );
    hello<<<dimGrid, dimBlock>>>(ad, bd);
    // A kernel launch returns nothing; configuration/launch errors must be
    // fetched explicitly.
    checkHello(cudaGetLastError(), "hello kernel launch");

    // The blocking copy also surfaces errors raised while the kernel ran.
    checkHello(cudaMemcpy( a, ad, csize, cudaMemcpyDeviceToHost ), "cudaMemcpy(ad -> a)");
    checkHello(cudaFree( ad ), "cudaFree(ad)");
    checkHello(cudaFree( bd ), "cudaFree(bd)");

    printf("%s\n", a);
    return EXIT_SUCCESS;
}
It is supposed to print out "Hello world!", but after I compiled using "nvcc hello.cu -o a.out", my output is "Hello Hello", can someone tell me what is going on?

This was caused by a broken CUDA driver installation. A corrected installation allowed what was otherwise correct code to run without error.
[This community wiki entry was assembled from comments to get this question off the unanswered queue]

Related

CUDA device runtime api cudaMemsetAsync doesn't work

I am trying to call cudaMemsetAsync from kernel (so called "dynamic parallelism"). But no matter what value I use, it always set memory to 0.
Here is my test code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_device_runtime_api.h"
#include <stdio.h>
const int size = 5;
// Calls the device-runtime cudaMemsetAsync on the first `size * 4` bytes of c,
// passing NULL as the stream argument.
// NOTE(review): memset-style calls write a byte value, so at most the low byte
// of 0x7FFFFFFF could be applied; the question reports the memory ends up zero.
__global__ void kernel(int *c)
{
    const size_t byteCount = size * 4;
    cudaMemsetAsync(c, 0x7FFFFFFF, byteCount, NULL);
}
// Copies five ints to the device, runs `kernel` on them, copies them back and
// prints the final CUDA status code followed by the array contents.
// The first failing CUDA call stops the remaining device work and is reported
// on stderr; previously each status was overwritten by the next call, so only
// the result of cudaDeviceReset() was ever visible.
int main()
{
    cudaError_t cudaStatus;
    int c[size] = { 12, 12, 12, 12, 12 };
    int *dev_c = 0;

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus == cudaSuccess) {
        kernel <<< 1, 1 >>>(dev_c);
        // Launch errors are not returned by the launch itself.
        cudaStatus = cudaGetLastError();
    }
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);

    if (cudaStatus != cudaSuccess)
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(cudaStatus));

    // Cleanup runs on every path; dev_c is still 0 if allocation never happened.
    cudaFree(dev_c);
    const cudaError_t resetStatus = cudaDeviceReset();
    if (cudaStatus == cudaSuccess)
        cudaStatus = resetStatus;

    printf("%d\n", cudaStatus);
    printf("{%d,%d,%d,%d,%d}\n", c[0], c[1], c[2], c[3], c[4]);
    return cudaStatus == cudaSuccess ? 0 : 1;
}
And if I run it, I got output like this:
>nvcc -run kernel.cu -gencode=arch=compute_35,code=\"sm_35,compute_35\" -rdc=true -lcudadevrt
kernel.cu
Creating library a.lib and object a.exp
0
{0,0,0,0,0}
When I call memory set, I use value 0x7FFFFFFF. I'm expecting non-zero numbers, but it always shows zero.
Is this a bug? or I did something wrong? I'm using CUDA 8.0
I can confirm this appears not to work in CUDA 8 on the systems I tested it with.
If you want a single thread to perform the operation, you can use memset directly in device code (it, like memcpy, has been supported forever). The compiler will emit a byte-sized loop inline within your kernel, and the operation will be performed by each running thread that executes it.
If you want a dynamic parallelism style memset operation, then the easiest thing is to make your own. A trivial (and very, very lightly tested) implementation in the code you posted might look like this:
#include <cstring>
#include <cstdio>
const int size = 5;
// Writes the byte `val` into each of the `sz` bytes starting at `p`.
// Each thread starts at its global index and advances by the total number of
// launched threads (grid-stride loop), so any 1D launch shape covers the range.
__global__ void myMemset_kernel(void* p, unsigned char val, size_t sz)
{
    unsigned char* const bytes = static_cast<unsigned char*>(p);
    size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
    while (idx < sz) {
        bytes[idx] = val;
        idx += blockDim.x * gridDim.x;
    }
}
// Device-side memset: launches myMemset_kernel as a child grid on stream `s`
// to fill `sz` bytes at `p` with the low byte of `val` (memset semantics).
//   p   - destination buffer, must be writable by the child kernel
//   val - fill value; only the least significant byte is used
//   sz  - number of bytes to fill; 0 is a no-op
//   s   - stream for the child launch (default: NULL stream)
// The launch is asynchronous; the caller is responsible for synchronising
// before reading the buffer.
__device__ void myMemset(void* p, unsigned int val, size_t sz, cudaStream_t s=NULL)
{
    // A zero-sized fill would compute a grid of 0 blocks, which is not a
    // valid launch configuration, so there is nothing to launch.
    if (sz == 0) {
        return;
    }
    const dim3 blocksz(256,1,1);
    const size_t nblocks = (sz + blocksz.x - 1) / blocksz.x;   // ceil(sz / 256)
    const unsigned char byteval = static_cast<unsigned char>(val & 0xff);
    myMemset_kernel<<< dim3(nblocks,1,1), blocksz, 0, s >>>(p, byteval, sz);
}
// Parent kernel: creates a non-blocking device-side stream, fills the first
// `size * 4` bytes of c through myMemset on that stream, waits for the child
// work to finish and then releases the stream.
// NOTE(review): device-side cudaDeviceSynchronize() is kept from the original
// answer; newer CUDA toolkits deprecate calling it from device code - confirm
// against the toolkit version in use.
__global__ void kernel(int *c)
{
    cudaStream_t s;
    // Do not launch into an uninitialised stream handle if creation failed.
    if (cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking) != cudaSuccess) {
        return;
    }
    myMemset(c, 0x7FFFFFFF, size * 4, s);
    cudaDeviceSynchronize();
    // The stream was previously never destroyed.
    cudaStreamDestroy(s);
}
// Fills a five-int host array with the byte pattern produced by memset, prints
// it, round-trips it through the device while `kernel` overwrites it there,
// and prints the result.
int main()
{
    int *dev_c;
    int c[size];
    const size_t bytes = size * sizeof(int);

    // memset stores a single byte value, so each int becomes four equal bytes.
    memset(&c[0], 0xffffff0c, bytes);
    printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]);

    cudaMalloc((void**)&dev_c, bytes);
    cudaMemcpy(dev_c, c, bytes, cudaMemcpyHostToDevice);
    kernel <<< 1, 1 >>>(dev_c);
    cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    cudaFree(dev_c);

    printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]);
    return 0;
}
which compiles and does this:
$ nvcc -rdc=true -arch=sm_52 -o memset memset.cu -lcudadevrt
$ ./memset
{0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c}
{ffffffff,ffffffff,ffffffff,ffffffff,ffffffff}
A final point -- note the values above and read this question and answer. In your code, it is not possible to use cudaMemset to apply a value of 0x7FFFFFFF. Although the value argument is an unsigned integer, cudaMemset and its relatives work like regular memset and set byte values. Only the least significant byte of the 32 bit argument is used to set values. If your objective is to set 32 bit values, then you will need to make your own version of memset for that purpose anyway.

invalid device symbol cudaMemcpyFromSymbol CUDA

I want to calculate the sum of all elements of an array in CUDA. I came up with this code. It compiles without any error. But the result is always zero. I've got the invalid device symbol from cudaMemcpyFromSymbol. I cannot use any libraries like Thrust or Cublas.
#define TRIALS_PER_THREAD 4096
#define NUM_BLOCKS 256
#define NUM_THREADS 256
double *dev;
__device__ volatile double pi_gpu = 0;
// Adds array[tid] to the device global pi_gpu, where tid is the thread's
// global 1D index. There is no bounds check: the launch must not create more
// threads than `array` has elements.
//
// The accumulation is done with an atomicCAS loop on the 64-bit pattern of
// pi_gpu. The former `pi_gpu = pi_gpu + array[tid]` was an unsynchronised
// read-modify-write executed by every thread, so concurrent updates were lost.
// The trailing __syncthreads() was dropped: it guarded no shared data.
__global__ void ArraySum(double *array)
{
    const unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
    const double term = array[tid];

    unsigned long long int *target = (unsigned long long int *)&pi_gpu;
    unsigned long long int observed = *target;
    unsigned long long int assumed;
    do {
        assumed = observed;
        observed = atomicCAS(target, assumed,
                             __double_as_longlong(term + __longlong_as_double(assumed)));
        // Compare the integer patterns, not the doubles, so a NaN cannot make
        // the loop spin forever.
    } while (assumed != observed);
}
// Allocates NUM_BLOCKS * NUM_THREADS doubles on the device, runs ArraySum over
// them and tries to read the device variable pi_gpu back into pi_gpu_h.
// NOTE(review): nothing in this function writes to `dev` before the kernel
// reads it, and `dev` is not freed here.
int main (int argc, char *argv[]) {
cudaMalloc((void **) &dev, NUM_BLOCKS * NUM_THREADS * sizeof(double));
double pi_gpu_h;
ArraySum<<<NUM_BLOCKS, NUM_THREADS>>>(dev);
cudaDeviceSynchronize();
// NOTE(review): this is the call that reports "invalid device symbol". It
// passes &pi_gpu rather than the symbol pi_gpu, and cudaMemcpyDeviceToHost
// sits in the fourth position, where the corrected call quoted in the answer
// has an offset of 0 followed by the copy kind.
cudaError err = cudaMemcpyFromSymbol(&pi_gpu_h, &pi_gpu, sizeof(double), cudaMemcpyDeviceToHost);
if( cudaSuccess != err )
{
fprintf( stderr, "cudaMemcpyFromSymbolfailed : %s\n", cudaGetErrorString( err ) );
exit( -1 );
}
// The double result is converted to main's int return value.
return pi_gpu_h; // this is always zero!!!
}
The symbol argument in the copy from symbol call is incorrect. It should look like this:
cudaMemcpyFromSymbol(&pi_gpu_h, pi_gpu, sizeof(double), 0, cudaMemcpyDeviceToHost)

Shortest paths by BFS, porting a code from CUDA to openCL

I am currently porting a CUDA code that finds shortest paths from each node to other nodes in a (undirected) graph.
So basically, the CUDA code constructs a graph read from a text file. Then it proceeds to build the adjacency arrays h_v and h_e.
For example
A B
A C
B C
gives
h_v[0] = 0, h_e[0]=1
h_v[1] = 0, h_e[1]=2
h_v[2] = 1, h_e[2]=2
Then it calls the kernel to compute shortest paths from each node using BFS.
The cuda host code is as follow:
// Host driver: for every node s, runs a level-by-level BFS over the edge list
// (h_v[i], h_e[i]) on the GPU, one kernel launch per BFS level.
//   n_count  - number of nodes (size of the distance array)
//   e_count  - number of edges (size of h_v / h_e)
//   h_v, h_e - host edge endpoint arrays, copied to the device once up front
//   h_cc, ec - not used in the visible part of the function
// NOTE(review): the snippet is truncated by its author; the read-back code and
// the function's closing brace and return statement are not shown.
int cc_bfs(int n_count, int e_count, int *h_v, int *h_e, float *h_cc, bool ec){
int *d_v, *d_e;
cudaCheckError(cudaMalloc((void **)&d_v, sizeof(int)*e_count));
cudaCheckError(cudaMalloc((void **)&d_e, sizeof(int)*e_count));
cudaCheckError(cudaMemcpy(d_v, h_v, sizeof(int)*e_count, cudaMemcpyHostToDevice));
cudaCheckError(cudaMemcpy(d_e, h_e, sizeof(int)*e_count, cudaMemcpyHostToDevice));
// d_d: per-node distance array; d_dist: single int holding the current level.
int *d_d, *d_dist;
cudaCheckError(cudaMalloc((void **)&d_d, sizeof(int)*n_count));
cudaCheckError(cudaMalloc((void **)&d_dist, sizeof(int)));
int *h_d;
h_d=(int *)malloc(sizeof(int)*n_count);
// d_continue: device flag the kernel sets when it labelled at least one node.
bool *d_continue;
cudaCheckError(cudaMalloc((void**)&d_continue, sizeof(bool)));
for(int s=0; s<n_count; s++){ //BIG FOR LOOP
//////code to initalize h_d[i]
// -1 marks "not reached yet"; the source node s starts at distance 0.
for(int i=0; i<n_count; i++)
h_d[i]=-1;
h_d[s]=0; //for marking the root
cudaCheckError(cudaMemcpy(d_d, h_d, sizeof(int)*n_count, cudaMemcpyHostToDevice));
//////////////////////////////
///////////////////////////////
// Launch shape: one thread per edge, in a single block unless e_count exceeds
// MAX_THREADS_PER_BLOCK, in which case ceil(e_count / MAX_THREADS_PER_BLOCK)
// full-size blocks are used.
int threads_per_block=e_count;
int blocks=1;
if(e_count>MAX_THREADS_PER_BLOCK){
blocks = (int)ceil(e_count/(float)MAX_THREADS_PER_BLOCK);
threads_per_block = MAX_THREADS_PER_BLOCK;
}
dim3 grid(blocks);
dim3 threads(threads_per_block);
/////////////////////////////////
bool h_continue;
int h_dist=0;
cudaCheckError(cudaMemset(d_dist, 0, sizeof(int)));
// One iteration per BFS level; stops when a launch sets no continue flag.
do{
h_continue=false;
cudaCheckError(cudaMemcpy(d_continue, &h_continue, sizeof(bool), cudaMemcpyHostToDevice));
cc_bfs_kernel<<<grid, threads>>>(d_v, d_e, d_d, d_continue, d_dist, e_count);
checkCUDAError("Kernel invocation");
// NOTE(review): cudaThreadSynchronize() is the deprecated spelling of
// cudaDeviceSynchronize() - confirm against the toolkit in use.
cudaThreadSynchronize();
h_dist++;
// Publishes the next level to the device; the kernel is passed d_dist and
// the next launch needs the incremented value.
cudaCheckError(cudaMemcpy(d_dist, &h_dist, sizeof(int), cudaMemcpyHostToDevice));//for what?
cudaCheckError(cudaMemcpy(&h_continue, d_continue, sizeof(bool), cudaMemcpyDeviceToHost));
}while(h_continue);
///////////////////
//then code to read back h_d from device
}
And here is cuda kernel
// One BFS relaxation step with one thread per edge (u, w) = (d_v[tid], d_e[tid]).
// If u sits on the current level (*d_dist) and w is still unlabelled (-1),
// w is assigned level + 1 and *d_continue is raised so the host runs another
// step. Threads whose global index is not below e_count do nothing.
__global__ void cc_bfs_kernel(int *d_v, int *d_e, int *d_d,
bool *d_continue, int *d_dist, int e_count){
    const int edge = blockIdx.x*blockDim.x + threadIdx.x;
    if (edge >= e_count) {
        return;
    }

    const int u = d_v[edge];
    const int w = d_e[edge];
    const int level = *d_dist;

    // Only edges leaving the current frontier may label their target.
    if (d_d[u] != level) {
        return;
    }
    if (d_d[w] == -1) {
        *d_continue = true;
        d_d[w] = level + 1;
    }
}
Here is my effort to port it to openCL. I am just an amateur in openCL, so I am trying the best to port the original code line by line :(
openCL host code
// OpenCL port of the CUDA host loop: creates the device buffers, uploads the
// edge arrays, then for every source node s repeatedly enqueues cc_bfs_kernel
// until the kernel stops raising the continue flag.
// NOTE(review): every clCreateBuffer call in this fragment combines
// CL_MEM_USE_HOST_PTR with a NULL host pointer and passes NULL for the error
// output, so a failed creation goes unnoticed; the answer below expects
// CL_INVALID_HOST_PTR for these calls.
// NOTE(review): `err` is overwritten by each call and only printed once (after
// the read of `id`); host_v / host_e are not declared in the visible fragment.
cl_mem d_d= clCreateBuffer(context,CL_MEM_WRITE_ONLY| CL_MEM_USE_HOST_PTR,sizeof(int)*n_count, NULL,NULL);
cl_mem d_dist= clCreateBuffer(context,CL_MEM_READ_WRITE| CL_MEM_USE_HOST_PTR,sizeof(int), NULL,NULL);
int *h_d;
h_d=(int *)malloc(sizeof(int)*n_count);
cl_mem d_continue = clCreateBuffer(context,CL_MEM_READ_WRITE| CL_MEM_USE_HOST_PTR,sizeof(bool), NULL,NULL);
float* h_cc;
h_cc = (float *)malloc(sizeof(float)*n_count);
cl_mem d_v= clCreateBuffer(context,CL_MEM_READ_ONLY| CL_MEM_USE_HOST_PTR,sizeof(int)*e_count, NULL,NULL);
cl_mem d_e= clCreateBuffer(context,CL_MEM_READ_ONLY| CL_MEM_USE_HOST_PTR,sizeof(int)*e_count, NULL,NULL);
// Blocking uploads (CL_TRUE) of the two edge endpoint arrays.
err = clEnqueueWriteBuffer(queue, d_v, CL_TRUE, 0, e_count * sizeof(int), host_v, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, d_e, CL_TRUE, 0, e_count * sizeof(int), host_e, 0, NULL, NULL);
// One work-item per edge; the local size is left to the implementation (NULL
// is passed to clEnqueueNDRangeKernel below).
size_t global_size= e_count;
for(int s=0; s<n_count; s++)
{ //BIG LOOP
//initalize h_d[i]
// -1 marks "not reached yet"; the source node s starts at distance 0.
for(int i=0; i<n_count; i++)
h_d[i]=-1;
h_d[s]=0;
//copy h_d to d_d
err = clEnqueueWriteBuffer(queue, d_d, CL_TRUE, 0,
n_count * sizeof(int), h_d, 0, NULL, NULL);
bool h_continue;
int h_dist=0;
int mark = 0;
// Debug buffer the kernel fills with work-item ids.
// NOTE(review): h_id and id are created on every outer iteration and no
// free / clReleaseMemObject is visible in this fragment.
int* h_id;
h_id= (int*) malloc(sizeof(int)*e_count);
cl_mem id= clCreateBuffer(context,CL_MEM_WRITE_ONLY| CL_MEM_USE_HOST_PTR,
sizeof(int)*e_count, NULL,NULL);
// NOTE(review): unlike the CUDA version, which clears d_dist with cudaMemset
// before this loop, d_dist is first written only after the first launch.
do{
h_continue=false;
err = clEnqueueWriteBuffer(queue, d_continue, CL_TRUE, 0,
sizeof(bool), &h_continue, 0, NULL, NULL);
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_v);
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_e);
err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&d_d);
err = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&d_continue);
err = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&d_dist);
err = clSetKernelArg(kernel, 5, sizeof(int), (void *)&e_count);
err = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&id);
/////EXECUTE
// Enqueue the kernel and block on its completion event.
cl_event sync1;
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
&global_size, NULL, 0, NULL, &sync1); //wait for this to finish (to synchronize)
err = clWaitForEvents(1, &sync1);
clReleaseEvent(sync1);
///////////////////
// Debug read-back and dump of the work-item ids written by the kernel.
err = clEnqueueReadBuffer(queue, id, CL_TRUE, 0,
sizeof(int)*e_count, h_id, 0, NULL, NULL);
printf("e_count = %d error : %d\n",e_count, err);//check error?
for(int j = 0; j< e_count; j++)
{
printf("%d ",h_id[j]);
}
h_dist++;
mark++;//for debug
// Publish the next BFS level, then fetch the continue flag set by the kernel.
err = clEnqueueWriteBuffer(queue, d_dist, CL_TRUE, 0,
sizeof(int), &h_dist, 0, NULL, NULL);
err = clEnqueueReadBuffer(queue, d_continue, CL_TRUE, 0,
sizeof(bool), &h_continue, 0, NULL, NULL);
}
while(h_continue);
// Read the distances for source s back to the host.
// NOTE(review): the closing brace of the outer for loop is not part of the
// posted fragment.
err = clEnqueueReadBuffer(queue, d_d, CL_TRUE, 0,
n_count*sizeof(int), h_d, 0, NULL, NULL);
and openCL kernel
// OpenCL version of the BFS relaxation step, one work-item per edge.
// `id` is a debug output: every work-item first writes i into id[i] for all
// i < e_count, then work-items with tid < e_count store their own index again.
// After that, edge (u, w) = (d_v[tid], d_e[tid]) labels w with level + 1 and
// raises *d_continue when u is on the current level (*d_dist) and w is still -1.
__kernel void cc_bfs_kernel(__global int *d_v, __global int *d_e, __global int *d_d,
__global bool *d_continue, __global int *d_dist, const int e_count, __global int *id)
{
    const int tid = get_global_id(0)-get_global_offset(0);
    //barrier(CLK_GLOBAL_MEM_FENCE);

    // Debug fill, executed in full by every work-item.
    int slot = 0;
    while (slot < e_count) {
        id[slot] = slot;
        ++slot;
    }

    if (tid >= e_count) {
        return;
    }
    id[tid] = tid;

    /* for each edge (u, w) */
    const int u = d_v[tid];
    const int w = d_e[tid];

    // Only edges leaving the current frontier may label their target.
    if (d_d[u] != *d_dist) {
        return;
    }
    if (d_d[w] == -1) {
        *d_continue = true;
        d_d[w] = *d_dist + 1;
    }
}
The code can't give the correct result, so I debugged it by printing some values (the tid inside the kernel, and the mark value to check how many times the code goes through the while loop). Sadly, the tid gives rubbish values, and it goes through the while loop only once. Could you please point out what I am missing here?
I have another doubt:
How can I do something similar to cudaThreadSynchronize()? In this OpenCL version, I associate clEnqueueNDRangeKernel with a command event and wait for it, but apparently it does not seem to work :(
Thank you greatly.
First you should ensure that every step is correct by checking the error codes.
As an example, AFAIK, this code is not valid :
clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(int)*e_count, NULL,NULL)
because you are asking to use an allocated memory area whereas you're not providing any : the host_ptr parameter is indeed NULL.
Either remove this flag or if you really want host memory specify : CL_MEM_ALLOC_HOST_PTR.
Check the API documentation to know for each function how to retrieve status : either using the return value or a dedicated parameter (the last one) like clCreateBuffer does : http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clCreateBuffer.html
For your code it should give a CL_INVALID_HOST_PTR error.

HANDLE_ERROR not found error in Cuda

// Stores the sum of a and b in the int pointed to by c.
// Every thread that runs writes the same location with the same value.
__global__ void add( int a, int b, int *c ) {
    const int sum = a + b;
    *c = sum;
}
// Computes 2 + 7 on the device with a single-thread launch of `add` and prints
// the result. Every CUDA runtime call goes through HANDLE_ERROR, including the
// launch status and the final free, which were previously unchecked.
int main( void ) {
    int c;
    int *dev_c;

    HANDLE_ERROR( cudaMalloc( (void**)&dev_c, sizeof(int) ) );

    add<<<1,1>>>( 2, 7, dev_c );
    // A launch returns no status; fetch launch errors explicitly.
    HANDLE_ERROR( cudaGetLastError() );

    // The blocking copy also reports errors raised while the kernel ran.
    HANDLE_ERROR( cudaMemcpy( &c, dev_c, sizeof(int), cudaMemcpyDeviceToHost ) );
    printf( "2 + 7 = %d\n", c );

    HANDLE_ERROR( cudaFree( dev_c ) );
    return 0;
}
This is the code.
A "HANDLE_ERROR not found" error is being generated. I don't know how to solve it. I tried to grab some header files but can't figure it out...
Any Help Please!!!
If I had to guess, I'd say you're using the book CUDA By Example, which defines the HANDLE_ERROR macro as follows:
// Prints the CUDA error string together with the source location and exits
// with EXIT_FAILURE when `err` is anything other than cudaSuccess; does
// nothing on success.
static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err == cudaSuccess) {
        return;
    }
    printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
            file, line );
    exit( EXIT_FAILURE );
}
// Wraps a CUDA call so the call site's file and line are passed to HandleError.
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
Make sure that this code appears somewhere in your source, or somewhere in a header you #include.
You can download the source code for the book here.
The source code also contains the header files (in the common folder), where the missing macros are defined, and which the book quotes in the source codes as (for example)
#include "../common/book.h"
If the links become unavailable, search for the book's title on the NVIDIA Developer site or the CUDA site; you will find the direct link to the book's page, where the source code can be found.

CUDA random number generating

I would like to generate random numbers in my __device__ function, and keep it in my int Board[500] , I found some examples , but they used some type named curandState. I only need a function like a rand() in C++.
here is my code , I have N3[40000] array in my device memory , I generate some random numbers in my kernel working for one thread (I mean this "kernel <<<1,1>>> ... ") , then I copy it to my N2[40000] from CPU, and print it ,so here is the code
#include <iostream>
#include <Cuda.h>
#include<curand.h>
#include<curand_kernel.h>
int n = 200;
using namespace std;
// Draws one uniform float from generator state number `ind`.
// The state is copied to a local, advanced by curand_uniform, and written back
// so the next call with the same `ind` continues the sequence.
__device__ float generate( curandState* globalState, int ind )
{
    curandState scratch = globalState[ind];
    const float sample = curand_uniform( &scratch );
    globalState[ind] = scratch;
    return sample;
}
// Number of random values produced by kernel(), and therefore the number of
// generator states it reads.
const int kNumSamples = 40000;
// Threads per block used when launching setup_kernel.
const int kSetupThreads = 256;

// Returns true when `status` is cudaSuccess; otherwise reports the failed
// operation `what` on stderr and returns false.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return true;
    }
    cerr << what << " failed: " << cudaGetErrorString(status) << endl;
    return false;
}

// Initialises one curandState per launched thread, using the thread's global
// 1D index both as the array slot and as the curand sequence number.
// Precondition: `state` must hold at least gridDim.x * blockDim.x elements;
// there is no bounds check because the signature carries no element count.
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
    const int id = blockIdx.x * blockDim.x + threadIdx.x;
    curand_init ( seed, id, 0, &state[id] );
}

// Fills N[0 .. kNumSamples) with random integers stored as floats, using
// generator state i for element i. Written to be run by a single thread.
// Each draw is scaled by 100000 and reduced by repeatedly subtracting n*n-1
// until it no longer exceeds n*n-1.
__global__ void kernel(float* N, curandState* globalState, int n)
{
    const int limit = n*n-1;
    // generate random numbers
    for(int i=0;i<kNumSamples;i++)
    {
        int k = generate(globalState, i) * 100000;
        if (limit > 0)
        {
            while(k > limit)
            {
                k-=limit;
            }
        }
        else
        {
            // With n*n-1 <= 0 the subtraction loop could never terminate;
            // the only value that cannot exceed the range is 0.
            k = 0;
        }
        N[i] = k;
    }
}

// Seeds kNumSamples generator states, produces kNumSamples random values on
// the device with a single thread, copies them to the host and prints them one
// per line. Returns 0 on success and 1 if any CUDA operation failed.
int main()
{
    const int N = kNumSamples;
    // A single block cannot hold 40000 threads, so the seeding launch is
    // spread over several blocks. The state array is padded to a whole number
    // of blocks because setup_kernel has no bounds check.
    const int setupBlocks = (N + kSetupThreads - 1) / kSetupThreads;
    const size_t stateCount = size_t(setupBlocks) * kSetupThreads;

    curandState* devStates = NULL;
    float* N3 = NULL;
    // Static storage keeps the 160 KB result buffer off the stack.
    static float N2[kNumSamples];
    int exitCode = 1;

    if (cudaOk(cudaMalloc ( &devStates, stateCount*sizeof( curandState ) ), "cudaMalloc(devStates)") &&
        cudaOk(cudaMalloc((void**) &N3, sizeof(float)*N), "cudaMalloc(N3)"))
    {
        // setup seeds
        setup_kernel <<< setupBlocks, kSetupThreads >>> ( devStates,unsigned(time(NULL)) );
        if (cudaOk(cudaGetLastError(), "setup_kernel launch"))
        {
            kernel<<<1,1>>> (N3, devStates, n);
            // The blocking copy also reports errors raised while the kernels ran.
            if (cudaOk(cudaGetLastError(), "kernel launch") &&
                cudaOk(cudaMemcpy(N2, N3, sizeof(float)*N, cudaMemcpyDeviceToHost), "cudaMemcpy(N3 -> N2)"))
            {
                for(int i=0;i<N;i++)
                {
                    cout<<N2[i]<<endl;
                }
                exitCode = 0;
            }
        }
    }

    // Runs on every path; the pointers are still NULL if never allocated.
    cudaFree(N3);
    cudaFree(devStates);
    return exitCode;
}
You may use curand library to generate random numbers in device memory and then run your kernel without even having to copy those values to the host.