Including C standard headers in CUDA NVRTC code - cuda

I'm writing a CUDA kernel that is compiled at runtime using NVRTC (CUDA version 9.2 with NVRTC version 7.5), which needs the stdint.h header, in order to have the int32_t etc. types.
If I write the kernel source code without the include, it works correctly. For example the kernel
extern "C" __global__ void f() { ... }
Compiles to PTX code where f is defined as .visible .entry f.
But if the kernel source code is
#include <stdint.h>
extern "C" __global__ void f() { ... }
it reports A function without execution space annotations (__host__/__device__/__global__) is considered a host function, and host functions are not allowed in JIT mode. (also without extern "C").
Passing -default-device makes the PTX code .visible .func f, so the function cannot be called from the host.
Is there a way to include headers in the source code, and still have a __global__ entry function? Or alternately, a way to know which integer size convention is used on the by the NVRTC compiler, so that the int32_t etc. types can be manually defined?
Edit:
Example program that shows the problem:
#include <cstdlib>
#include <string>
#include <vector>
#include <memory>
#include <cassert>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <nvrtc.h>
[[noreturn]] void fail(const std::string& msg, int code) {
std::cerr << "error: " << msg << " (" << code << ')' << std::endl;
std::exit(EXIT_FAILURE);
}
std::unique_ptr<char[]> compile_to_ptx(const char* program_source) {
nvrtcResult rv;
// create nvrtc program
nvrtcProgram prog;
rv = nvrtcCreateProgram(
&prog,
program_source,
"program.cu",
0,
nullptr,
nullptr
);
if(rv != NVRTC_SUCCESS) fail("nvrtcCreateProgram", rv);
// compile nvrtc program
std::vector<const char*> options = {
"--gpu-architecture=compute_30"
};
//options.push_back("-default-device");
rv = nvrtcCompileProgram(prog, options.size(), options.data());
if(rv != NVRTC_SUCCESS) {
std::size_t log_size;
rv = nvrtcGetProgramLogSize(prog, &log_size);
if(rv != NVRTC_SUCCESS) fail("nvrtcGetProgramLogSize", rv);
auto log = std::make_unique<char[]>(log_size);
rv = nvrtcGetProgramLog(prog, log.get());
if(rv != NVRTC_SUCCESS) fail("nvrtcGetProgramLog", rv);
assert(log[log_size - 1] == '\0');
std::cerr << "Compile error; log:\n" << log.get() << std::endl;
fail("nvrtcCompileProgram", rv);
}
// get ptx code
std::size_t ptx_size;
rv = nvrtcGetPTXSize(prog, &ptx_size);
if(rv != NVRTC_SUCCESS) fail("nvrtcGetPTXSize", rv);
auto ptx = std::make_unique<char[]>(ptx_size);
rv = nvrtcGetPTX(prog, ptx.get());
if(rv != NVRTC_SUCCESS) fail("nvrtcGetPTX", rv);
assert(ptx[ptx_size - 1] == '\0');
nvrtcDestroyProgram(&prog);
return ptx;
}
const char program_source[] = R"%%%(
//#include <stdint.h>
extern "C" __global__ void f(int* in, int* out) {
out[threadIdx.x] = in[threadIdx.x];
}
)%%%";
int main() {
CUresult rv;
// initialize CUDA
rv = cuInit(0);
if(rv != CUDA_SUCCESS) fail("cuInit", rv);
// compile program to ptx
auto ptx = compile_to_ptx(program_source);
std::cout << "PTX code:\n" << ptx.get() << std::endl;
}
When //#include <stdint.h> in the kernel source is uncommented it no longer compiles. When //options.push_back("-default-device"); is uncommented it compiles but does not mark the function f as .entry.
CMakeLists.txt to compile it (needs CUDA driver API + NVRTC)
cmake_minimum_required(VERSION 3.4)
project(cudabug CXX)
find_package(CUDA REQUIRED)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED 14)
add_executable(cudabug cudabug.cc)
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
link_directories(${CUDA_LIBRARY_DIRS})
target_link_libraries(cudabug PUBLIC ${CUDA_LIBRARIES} nvrtc cuda)

[Preface: this is a very hacky answer, and is specific to the GNU toolchain (although I suspect the problem in the question is also specific to the GNU toolchain)].
It would appear that the problem here is with the GNU standard header features.h, which gets pulled into stdint.h and then winds up defining a lot of stub functions which have the default __host__ compilation space. This causes nvrtc to blow up. It also seems that the -default-device option will result in a resolved glibC compiler feature set which makes the whole nvrtc compiler fail.
You can defeat this (in a very hacky way) by predefining a feature set for the standard library which excludes all the host functions. Changing your JIT kernel code to
const char program_source[] = R"%%%(
#define __ASSEMBLER__
#define __extension__
#include <stdint.h>
extern "C" __global__ void f(int32_t* in, int32_t* out) {
out[threadIdx.x] = in[threadIdx.x];
}
)%%%";
got me this:
$ nvcc -std=c++14 -ccbin=g++-7 jit_header.cu -o jitheader -lnvrtc -lcuda
$ ./jitheader
PTX code:
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-24330188
// Cuda compilation tools, release 9.2, V9.2.148
// Based on LLVM 3.4svn
//
.version 6.2
.target sm_30
.address_size 64
// .globl f
.visible .entry f(
.param .u64 f_param_0,
.param .u64 f_param_1
)
{
.reg .b32 %r<3>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [f_param_0];
ld.param.u64 %rd2, [f_param_1];
cvta.to.global.u64 %rd3, %rd2;
cvta.to.global.u64 %rd4, %rd1;
mov.u32 %r1, %tid.x;
mul.wide.u32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.u32 %r2, [%rd6];
add.s64 %rd7, %rd3, %rd5;
st.global.u32 [%rd7], %r2;
ret;
}
Big caveat: This worked on the glibC system I tried it on. It probably won't work with other toolchains or libC implementations (if, indeed, they have this problem).

Another alternative is creating stand-ins, for some of the standard library headers. NVRTC's API supports your specifying header file contents as strings, associated with header names - before it will go looking through the filesystem for you. This approach is adopted in NVIDIA JITify, and I've adopted it myself working on something else which may or may not be released.
The easy way to do this You can just take the JITify header stubs for stdint.h, limits.h , from here, which I'm also attaching since it's not very long. Alternatively, you can generate this stub yourself to make sure you're not missing out on anything that's relevant from the standard. Here's how that works:
Start with your stdint.h file (or cstdint file as the case may be);
For each include directive in the file (and recursively, for each include in an include etc):
2.1 Figure out whether you can skip including the file altogether (possibly by making a few defines which are known to hold on the GPU).
2.2 If you're not sure you can skip the file - include it entirely and recurse to (2.), or keep it as its own separate header (and apply the whole process in (1.) to it).
You now have a header file which only includes device-safe header files (or none at all)
Partially-preprocess the file, dropping everything that won't be used on a GPU
Remove the lines which might be problematic on a GPU (e.g. #pragma's), and add __device__ __host__ or just __host__ as appropriate to each function declaration.
Important note: Doing this requires paying attention to licenses and copyrights. You would be creating a "derivative work" of glibc and/or JITify and/or StackOverflow contributions etc.
Now, the stdint.h and limits.h from NVIDIA JITify I promised. I've adapted them to not have namespaces:
stdint.h:
#pragma once
#include <limits.h>
typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;
typedef signed long long int64_t;
typedef signed char int_fast8_t;
typedef signed short int_fast16_t;
typedef signed int int_fast32_t;
typedef signed long long int_fast64_t;
typedef signed char int_least8_t;
typedef signed short int_least16_t;
typedef signed int int_least32_t;
typedef signed long long int_least64_t;
typedef signed long long intmax_t;
typedef signed long intptr_t; //optional
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef unsigned char uint_fast8_t;
typedef unsigned short uint_fast16_t;
typedef unsigned int uint_fast32_t;
typedef unsigned long long uint_fast64_t;
typedef unsigned char uint_least8_t;
typedef unsigned short uint_least16_t;
typedef unsigned int uint_least32_t;
typedef unsigned long long uint_least64_t;
typedef unsigned long long uintmax_t;
#define INT8_MIN SCHAR_MIN
#define INT16_MIN SHRT_MIN
#if defined _WIN32 || defined _WIN64
#define WCHAR_MIN SHRT_MIN
#define WCHAR_MAX SHRT_MAX
typedef unsigned long long uintptr_t; //optional
#else
#define WCHAR_MIN INT_MIN
#define WCHAR_MAX INT_MAX
typedef unsigned long uintptr_t; //optional
#endif
#define INT32_MIN INT_MIN
#define INT64_MIN LLONG_MIN
#define INT8_MAX SCHAR_MAX
#define INT16_MAX SHRT_MAX
#define INT32_MAX INT_MAX
#define INT64_MAX LLONG_MAX
#define UINT8_MAX UCHAR_MAX
#define UINT16_MAX USHRT_MAX
#define UINT32_MAX UINT_MAX
#define UINT64_MAX ULLONG_MAX
#define INTPTR_MIN LONG_MIN
#define INTMAX_MIN LLONG_MIN
#define INTPTR_MAX LONG_MAX
#define INTMAX_MAX LLONG_MAX
#define UINTPTR_MAX ULONG_MAX
#define UINTMAX_MAX ULLONG_MAX
#define PTRDIFF_MIN INTPTR_MIN
#define PTRDIFF_MAX INTPTR_MAX
#define SIZE_MAX UINT64_MAX
limits.h:
#pragma once
#if defined _WIN32 || defined _WIN64
#define __WORDSIZE 32
#else
#if defined __x86_64__ && !defined __ILP32__
#define __WORDSIZE 64
#else
#define __WORDSIZE 32
#endif
#endif
#define MB_LEN_MAX 16
#define CHAR_BIT 8
#define SCHAR_MIN (-128)
#define SCHAR_MAX 127
#define UCHAR_MAX 255
enum {
_JITIFY_CHAR_IS_UNSIGNED = (char)-1 >= 0,
CHAR_MIN = _JITIFY_CHAR_IS_UNSIGNED ? 0 : SCHAR_MIN,
CHAR_MAX = _JITIFY_CHAR_IS_UNSIGNED ? UCHAR_MAX : SCHAR_MAX,
};
#define SHRT_MIN (-32768)
#define SHRT_MAX 32767
#define USHRT_MAX 65535
#define INT_MIN (-INT_MAX - 1)
#define INT_MAX 2147483647
#define UINT_MAX 4294967295U
#if __WORDSIZE == 64
# define LONG_MAX 9223372036854775807L
#else
# define LONG_MAX 2147483647L
#endif
#define LONG_MIN (-LONG_MAX - 1L)
#if __WORDSIZE == 64
#define ULONG_MAX 18446744073709551615UL
#else
#define ULONG_MAX 4294967295UL
#endif
#define LLONG_MAX 9223372036854775807LL
#define LLONG_MIN (-LLONG_MAX - 1LL)
#define ULLONG_MAX 18446744073709551615ULL

Related

My C code won't compile against C code in a `.cu` file

I have 2 files that form a small CUDA library from my previous program (which works well, btw) written on C++.
The header for this library is:
#ifndef __cudaLU__
#define __cudaLU__
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include <cusolverSp.h>
#include <cusparse.h>
#include <cuComplex.h>
#include <stdlib.h>
void denseLS(int dim,
std::complex<float> * A,
std::complex<float> * b );
void sparseLS(int dim,
std::complex<float> *csrVal,
int *csrRowPtr,
int *csrColInd,
std::complex<float> *vecVal);
#endif
And I want to use this library in my old-as-the-hills C program just by setting procedure in the head of my main.c file:
extern void denseLS(int dim, float complex *A, float complex *b);
And it fails with a bunch of similar errors. Few of them are:
..NA/cudaLS.cu(115): error: namespace "std" has no member "complex"
..NA/cudaLS.cu(115): error: expected a ")"
..NA/cudaLS.cu(137): error: identifier "csrRowPtr" is undefined
..NA/cudaLS.cu(169): error: identifier "csrColInd" is undefined
..NA/cudaLS.cu(170): error: identifier "csrVal" is undefined
..NA/cudaLS.cu(171): error: identifier "vecVal" is undefined
I tried to make a change std::complex -> float complex and nothing works. Still same errors (without std error, ofc).
The cmake instructions file
cmake_minimum_required(VERSION 3.8)
project(NA)
set(CMAKE_C_STANDARD 11)
find_package(GSL REQUIRED)
find_package(CUDA REQUIRED)
include_directories("${CUDA_INCLUDE_DIRS}")
cuda_add_library(solvers STATIC
cudaLS.cu
cudaLS.h)
target_link_libraries(solvers ${CUDA_LIBRARIES} ${CUDA_cusparse_LIBRARY} ${CUDA_cusolver_LIBRARY})
target_compile_features(solvers PUBLIC cxx_std_11)
set_target_properties( solvers
PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
add_executable(NA main.c)
set_target_properties(NA PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(NA PRIVATE GSL::gsl m solvers)
What am I doing wrong pals?
UPD:
g++/gcc - 7.3
Linux
Well, I found what exactly I did in a wrong way.
Cmake is OK. But headers in the .h file have to be modified to
extern "C" void denseLS(int dim, cuComplex *A, cuComplex *b );
The cuda functions in .c have to be decleared in the head (or separate .h-file) as
void denseLS(int dim, float complex *A, float complex *b);

Why is this not copying from device to host in Cuda?

I'm working through the examples of the "CUDA by Example" book. The following code doesn't give me an answer and work as it should. Where's the mistake?
Will appreciate your help and answers.
I get an output,which reads
Calculation done on GPU yields the answer: &d
Press enter to stop
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
using namespace std;
__global__ void add_integers_cuda(int a, int b, int *c)
{
*c = a + b;
}
int main(void)
{
int c;
int *dev_ptr;
cudaMalloc((void **)&dev_ptr, sizeof(int)); //allocate sizeof(int) bytes of contiguous memory in the gpu device and return the address of first byte to dev_ptr.
// call the kernel
add_integers_cuda <<<1,1>>>(2,7,dev_ptr);
cudaMemcpy(&c, dev_ptr, sizeof(int), cudaMemcpyDeviceToHost);
printf("Calculation done on GPU yields the answer: &d\n",c );
cudaFree(dev_ptr);
printf("Press enter to stop.");
cin.ignore(255, '\n');
return 0;
}
"
&d is not a correct printf formatting character here:
printf("Calculation done on GPU yields the answer: &d\n",c );
You won't get the output you are expecting.
You should use %d instead:
printf("Calculation done on GPU yields the answer: %d\n",c );
This particular issue has nothing to do with CUDA of course.
You may also want to run CUDA codes with cuda-memcheck and/or use proper CUDA error checking if you are just learning and having trouble. Neither of those would have pointed out the above error, however.

Loading multiple modules in JCuda is not working

In jCuda one can load cuda files as PTX or CUBIN format and call(launch) __global__ functions (kernels) from Java.
With keeping that in mind, I want to develop a framework with JCuda that gets user's __device__ function in a .cu file at run-time, loads and runs it.
And I have already implemented a __global__ function, in which each thread finds out the start point of its related data, perform some computation, initialization and then call user's __device__ function.
Here is my kernel pseudo code:
extern "C" __device__ void userFunc(args);
extern "C" __global__ void kernel(){
// initialize
userFunc(args);
// rest of the kernel
}
And user's __device__ function:
extern "C" __device__ void userFunc(args){
// do something
}
And in Java side, here is the part that I load the modules(modules are made from ptx files which are successfully created from cuda files with this command: nvcc -m64 -ptx path/to/cudaFile -o cudaFile.ptx)
CUmodule kernelModule = new CUmodule(); // 1
CUmodule userFuncModule = new CUmodule(); // 2
cuModuleLoad(kernelModule, ptxKernelFileName); // 3
cuModuleLoad(userFuncModule, ptxUserFuncFileName); // 4
When I try to run it I got error at line 3 : CUDA_ERROR_NO_BINARY_FOR_GPU. After some searching I get that my ptx file has some syntax error. After running this suggested command:
ptxas -arch=sm_30 kernel.ptx
I got:
ptxas fatal : Unresolved extern function 'userFunc'
Even when I replace line 3 with 4 to load userFunc before kernel I get this error. I got stuck at this phase. Is this the correct way to load multiple modules that need to be linked together in JCuda? Or is it even possible?
Edit:
Second part of the question is here
The really short answer is: No, you can't load multiple modules into a context in the runtime API.
You can do what you want, but it requires explicit setup and execution of a JIT linking call. I have no idea how (or even whether) that has been implemented in JCUDA, but I can show you how to do it with the standard driver API. Hold on...
If you have a device function in one file, and a kernel in another, for example:
// test_function.cu
#include <math.h>
__device__ float mathop(float &x, float &y, float &z)
{
float res = sin(x) + cos(y) + sqrt(z);
return res;
}
and
// test_kernel.cu
extern __device__ float mathop(float & x, float & y, float & z);
__global__ void kernel(float *xvals, float * yvals, float * zvals, float *res)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
res[tid] = mathop(xvals[tid], yvals[tid], zvals[tid]);
}
You can compile them to PTX as usual:
$ nvcc -arch=sm_30 -ptx test_function.cu
$ nvcc -arch=sm_30 -ptx test_kernel.cu
$ head -14 test_kernel.ptx
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19324607
// Cuda compilation tools, release 7.0, V7.0.27
// Based on LLVM 3.4svn
//
.version 4.2
.target sm_30
.address_size 64
// .globl _Z6kernelPfS_S_S_
.extern .func (.param .b32 func_retval0) _Z6mathopRfS_S_
At runtime, your code must create a JIT link session, add each PTX to the linker session, then finalise the linker session. This will give you a handle to a compiled cubin image which can be loaded as a module as usual. The simplest possible driver API code to put this together looks like this:
#include <cstdio>
#include <cuda.h>
#define drvErrChk(ans) { drvAssert(ans, __FILE__, __LINE__); }
inline void drvAssert(CUresult code, const char *file, int line, bool abort=true)
{
if (code != CUDA_SUCCESS) {
fprintf(stderr, "Driver API Error %04d at %s %d\n", int(code), file, line);
exit(-1);
}
}
int main()
{
cuInit(0);
CUdevice device;
drvErrChk( cuDeviceGet(&device, 0) );
CUcontext context;
drvErrChk( cuCtxCreate(&context, 0, device) );
CUlinkState state;
drvErrChk( cuLinkCreate(0, 0, 0, &state) );
drvErrChk( cuLinkAddFile(state, CU_JIT_INPUT_PTX, "test_function.ptx", 0, 0, 0) );
drvErrChk( cuLinkAddFile(state, CU_JIT_INPUT_PTX, "test_kernel.ptx" , 0, 0, 0) );
size_t sz;
char * image;
drvErrChk( cuLinkComplete(state, (void **)&image, &sz) );
CUmodule module;
drvErrChk( cuModuleLoadData(&module, image) );
drvErrChk( cuLinkDestroy(state) );
CUfunction function;
drvErrChk( cuModuleGetFunction(&function, module, "_Z6kernelPfS_S_S_") );
return 0;
}
You should be able to compile and run this as posted and verify it works OK. It should serve as a template for a JCUDA implementation, if they have JIT linking support implemented.

CUDA invalid device symbol error

the code below compiles just fine. But when i try to run it, i got
GPUassert: invalid device symbol file.cu 114
When i comment lines marked by (!!!) the error wont show up. My question is what is causing this error because it gives me no sense.
Compiling with nvcc file.cu -arch compute_11
#include "stdio.h"
#include <algorithm>
#include <ctime>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
#define THREADS 64
#define BLOCKS 256
#define _dif (((1ll<<32)-121)/(THREADS*BLOCKS)+1)
#define HASH_SIZE 1024
#define ROUNDS 16
#define HASH_ROW (HASH_SIZE/ROUNDS)+(HASH_SIZE%ROUNDS==0?0:1)
#define HASH_COL 1000000000/HASH_SIZE
typedef unsigned long long ull;
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
//fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__device__ unsigned int primes[1024];
//__device__ unsigned char primes[(1<<28)+1];
__device__ long long n = 1ll<<32;
__device__ ull dev_base;
__device__ unsigned int dev_hash;
__device__ unsigned int dev_index;
time_t curtime;
__device__ int hashh(long long x) {
return (x>>1)%1024;
}
// compute (x^e)%n
__device__ ull mulmod(ull x,ull e,ull n) {
ull ans = 1;
while(e>0) {
if(e&1) ans = (ans*x)%n;
x = (x*x)%n;
e>>=1;
}
return ans;
}
// determine whether n is strong probable prime base a or not.
// n is ODD
__device__ int is_SPRP(ull a,ull n) {
int d=0;
ull t = n-1;
while(t%2==0) {
++d;
t>>=1;
}
ull x = mulmod(a,t,n);
if(x==1) return 1;
for(int i=0;i<d;++i) {
if(x==n-1) return 1;
x=(x*x)%n;
}
return 0;
}
__device__ int prime(long long x) {
//unsigned long long b = 2;
//return is_SPRP(b,(unsigned long long)x);
return is_SPRP((unsigned long long)primes[(((long long)0xAFF7B4*x)>>7)%1024],(unsigned long long)x);
}
__global__ void find(unsigned int *out,unsigned int *c) {
unsigned int buff[HASH_ROW][256];
int local_c[HASH_ROW];
for(int i=0;i<HASH_ROW;++i) local_c[i]=0;
long long b = 121+(threadIdx.x+blockIdx.x*blockDim.x)*_dif;
long long e = b+_dif;
if(b%2==0) ++b;
for(long long i=b;i<e && i<n;i+=2) {
if(i%3==0 || i%5==0 || i%7==0) continue;
int hash_num = hashh(i)-(dev_hash*(HASH_ROW));
if(0<=hash_num && hash_num<HASH_ROW) {
if(prime(i)) continue;
buff[hash_num][local_c[hash_num]++]=(unsigned int)i;
if(local_c[hash_num]==256) {
int start = atomicAdd(c+hash_num,local_c[hash_num]);
if(start+local_c[hash_num]>=HASH_COL) return;
unsigned int *out_offset = out+hash_num*(HASH_COL)*4;
for(int i=0;i<local_c[hash_num];++i) out_offset[i+start]=buff[hash_num][i]; //(!!!)
local_c[hash_num]=0;
}
}
}
for(int i=0;i<HASH_ROW;++i) {
int start = atomicAdd(c+i,local_c[i]);
if(start+local_c[i]>=HASH_COL) return;
unsigned int *out_offset = out+i*(HASH_COL)*4;
for(int j=0;j<local_c[i];++j) out_offset[j+start]=buff[i][j]; //(!!!)
}
}
int main(void) {
printf("HASH_ROW: %d\nHASH_COL: %d\nPRODUCT: %d\n",(int)HASH_ROW,(int)HASH_COL,(int)(HASH_ROW)*(HASH_COL));
ull *base_adr;
gpuErrchk(cudaGetSymbolAddress((void**)&base_adr,dev_base));
gpuErrchk(cudaMemset(base_adr,0,7));
gpuErrchk(cudaMemset(base_adr,0x02,1));
}
A rather unusual error.
The failure is occurring because:
By specifying a virtual architecture only (-arch compute_11) you defer the PTX compile step until runtime (i.e. you are forcing JIT-compile)
The JIT-compile is failing (at runtime)
The failure of the JIT-compile (and link) means device symbols cannot be properly established
Due to the problem with device symbols, the operation cudaGetSymbolAddress on the device symbol dev_base fails, and throws an error.
Why is the JIT-compile failing? You can find out yourself by triggering the machine code compile (which runs the ptxas assembler) by specifying -arch=sm_11 instead of -arch compute_11. If you do that, you'll get this result:
ptxas error : Entry function '_Z4findPjS_' uses too much local data (0x10100 bytes, 0x4000 max)
So even though your code doesn't call the find kernel, it must compile successfully to have a sane device environment for symbols.
Why does this compile error occur? Because you are requesting too much local memory per thread. cc 1.x devices are limited to 16KB local memory per thread, and your find kernel is requesting quite a bit more than that (over 64KB).
When I initially tried it on my device, I was using a cc2.0 device which has a higher limit (512KB per thread) and so the JIT-compile step succeeded.
In general, I would recommend specifying both a virtual architecture and a machine architecture, and the shorthand way to do that is:
nvcc -arch=sm_11 ....
(for a cc1.1 device)
This question/answer may also be of interest, and the nvcc manual has more details about virtual vs. machine architecture, and how to specify the compilation phases for each.
I believe the reason the error goes away when you comment out those particular lines in the kernel, is that with those commented out, the compiler is able to optimize-out the accesses to those local memory areas, and optimize-out the instantiation of the local memory. This allows the JIT-compile step to complete successfully, and your code runs "without runtime error".
You can verify this by commenting those lines out and then specify a full compile (nvcc -arch=sm_11 ...), where -arch is short for --gpu-architecture.
This error usually means the kernel has been compiled for the wrong architecture. You need to find out what the compute capability of your GPU is, and then compile it for that architecture. E.g. if your GPU has compute capability 1.1, compile it with -arch=sm_11. You can also build an executable for more than one architecture.

cudaMemset fails on __device__ variable

I am having trouble using cudaMemset on a device variable. Is it possible to use the reference to the device variable for cudaMemset, or is it just a matter of missing compiler flags, or libraries.. I am using cuda 4.1, and
NVRM version: NVIDIA UNIX x86_64 Kernel Module 285.05.33 Thu Jan 19
14:07:02 PST 2012
This is my sample code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// device variable and kernel
__device__ float d_test;
int main() {
if (cudaMemset(&d_test,0,sizeof(float)) !=cudaSuccess)
printf("Error!\n");
}
which outputs:
Error!
Your problem is that d_test (as it appears in the host symbol table) isn't a valid device address and the runtime cannot access it directly. The solution is to use the cudaGetSymbolAddress API function to read the address of the device symbol from the context at runtime. Here is a slightly expanded version of your demonstration case which should work correctly:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// device variable and kernel
__device__ float d_test;
inline void gpuAssert(cudaError_t code, char * file, int line, bool Abort=true)
{
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
if (Abort) exit(code);
}
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
int main()
{
float * _d_test;
gpuErrchk( cudaFree(0) );
gpuErrchk( cudaGetSymbolAddress((void **)&_d_test, "d_test") );
gpuErrchk( cudaMemset(_d_test,0,sizeof(float)) );
gpuErrchk( cudaThreadExit() );
return 0;
}
Here, we read the address of the device symbol d_test from the context into a host pointer _d_test. This can then be passed to host side API functions like cudaMemset, cudaMemcpy, etc.
Edit to note that the form of cudaGetSymbolAddress shown in this answer has been deprecated and removed from the CUDA runtime API. For modern CUDA, the call would be:
gpuErrchk( cudaGetSymbolAddress((void **)&_d_test, d_test) );
I believe you can also use cudaMemcpyFromSymbol:
A function, such as the following kernel, can change the value of the variable declared in global memory (outside of the main function)
__global__ void kernel1() { d_test = 1.0; }
Inside your main, you can obtain the value using cudaMemcpyFromSymbol
cudaMemcpyFromSymbol(&h_test,"d_test",sizeof(float),0,cudaMemcpyDeviceToHost);
Of course, there is also cudaMemcpyToSymbol to change the value of the global variable.
The idea came from here: Having problem assigning a device variable in CUDA