Load device function from shared library with dlopen - cuda

I'm relatively new to CUDA programming and can't find a solution to my problem.
I'm trying to have a shared library, let's call it func.so, that defines a device function
__device__ void hello(){ printf("hello");}
I then want to be able to access that library via dlopen, and use that function in my program. I tried something along the following lines:
func.cu
#include <stdio.h>
typedef void(*pFCN)();
__device__ void dhello(){
printf("hello\n")
}
__device__ pFCN ptest = dhello;
pFCN h_pFCN;
extern "C" pFCN getpointer(){
cudaMemcpyFromSymbol(&h_pFCN, ptest, sizeof(pFCN));
return h_pFCN;
}
main.cu
#include <dlfcn.h>
#include <stdio.h>
typedef void (*fcn)();
typedef fcn (*retpt)();
retpt hfcnpt;
fcn hfcn;
__device__ fcn dfcn;
__global__ void foo(){
(*dfcn)();
}
int main() {
void * m_handle = dlopen("gputest.so", RTLD_NOW);
hfcnpt = (retpt) dlsym( m_handle, "getpointer");
hfcn = (*hfcnpt)();
cudaMemcpyToSymbol(dfcn, &hfcn, sizeof(fcn), 0, cudaMemcpyHostToDevice);
foo<<<1,1>>>();
cudaThreadSynchronize();
return 0;
}
But this way I get the following error when debugging with cuda-gdb:
CUDA Exception: Warp Illegal Instruction
Program received signal CUDA_EXCEPTION_4, Warp Illegal Instruction.
0x0000000000806b30 in dtest () at func.cu:5
I appreciate any help you all can give me! :)

Calling a __device__ function in one compilation unit from device code in another compilation unit requires separate compilation and device linking with nvcc (the -rdc=true/-dc compile options).
However, such usage with libraries only works with static libraries.
Therefore, if the target __device__ function is in the .so library and the calling code is outside the .so library, your approach cannot work with the current nvcc toolchain.
The only "workarounds" I can suggest are to put the desired target function in a static library, or else to put both caller and target inside the same .so library. There are a number of questions/answers on the cuda tag that give examples of these alternate approaches; a sketch of the static-library route follows.

Related

Problems in CUDA function cudaMemcpyToSymbol()

I'm transferring data to a specific CUDA symbol. My CUDA version is 10.1 and the GPU is a Tesla K80. I compiled the code in VS2017, with code generated for compute_35 & sm_35. When I wrote my code like this,
<.h>
#include <cuda_runtime.h>
__device__ __constant__ float scoreRatio;
<.cpp>
const float ScoreRatio;
cudaErr=cudaMemcpyToSymbol(&scoreRatio,&ScoreRatio,sizeof(ScoreRatio));
printf("%d: %s.\n",cudaErr,cudaGetErorString(cudaErr));
it compiled well but got cudaErrInvalidSymbol when I run the program,
13: Invalid device symbol
If I modified my code like this,
<.h>
#include <cuda_runtime.h>
__device__ __constant__ float scoreRatio;
<.cpp>
const float ScoreRatio;
cudaErr=cudaMemcpyToSymbol(scoreRatio,&ScoreRatio,sizeof(ScoreRatio));
then the compile would fail due to an incompatible parameter type, since the first argument is a float while the function expects a const void*. Here is the function declaration I found in cuda_runtime_api.h:
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice));
Could anyone please give some advice? Much appreciated.
This:
<.h>
#include <cuda_runtime.h>
__device__ __constant__ float scoreRatio;
<.cpp>
const float ScoreRatio;
cudaErr=cudaMemcpyToSymbol(&scoreRatio,&ScoreRatio,sizeof(ScoreRatio));
printf("%d: %s.\n",cudaErr,cudaGetErorString(cudaErr));
is illegal/wrong in two ways: you must use nvcc to compile the code via a device-code-aware trajectory, and the first argument of the cudaMemcpyToSymbol call is incorrect. If you simply rename your .cpp source file to have a .cu file extension and change the contents so that it looks like this:
<.cu>
#include <.h>
....
const float ScoreRatio;
cudaErr=cudaMemcpyToSymbol(scoreRatio, &ScoreRatio, sizeof(ScoreRatio));
printf("%d: %s.\n", cudaErr, cudaGetErorString(cudaErr));
it will both compile and run correctly. See here for an explanation of why it is necessary to change the first argument of the cudaMemcpyToSymbol call.
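For reference, here is a self-contained sketch of the working pattern, compiled entirely as a single .cu file (the initializer value 0.5f is made up for the example):
// scoreRatio.cu, build with: nvcc scoreRatio.cu -o scoreRatio
#include <cstdio>
#include <cuda_runtime.h>
__device__ __constant__ float scoreRatio;   // device-side symbol
__global__ void show(){ printf("scoreRatio = %f\n", scoreRatio); }
int main(){
    const float ScoreRatio = 0.5f;          // hypothetical host value
    // pass the symbol itself, not its address, as the first argument
    cudaError_t cudaErr = cudaMemcpyToSymbol(scoreRatio, &ScoreRatio, sizeof(ScoreRatio));
    printf("%d: %s.\n", (int)cudaErr, cudaGetErrorString(cudaErr));
    show<<<1,1>>>();
    cudaDeviceSynchronize();
    return 0;
}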

cuda::cub error calling a __host__ function from a __device__ function is not allowed

I use cub::DeviceReduce::Sum to compute the summation of a vector, but it gave me these errors:
error: calling a __host__ function("cub::DeviceReduce::Sum<double *, double *> ") from a __device__ function("dotcubdev") is not allowed
error: identifier "cub::DeviceReduce::Sum<double *, double *> " is undefined in device code
The code sample is as follows:
__device__ void sumcubdev(double* a, double *sum, int N)
{
// Declare, allocate, and initialize device-accessible pointers
//for input and output
// Determine temporary device storage requirements
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, a, sum, N);
// Allocate temporary storage
cudaMalloc(&d_temp_storage, temp_storage_bytes);
// Run sum-reduction
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, a, sum, N);
}
The code runs successfully in the body of main(), but it doesn't work inside this __device__ function.
To use a cub device-wide function from device code, it is necessary to build your project to support CUDA dynamic parallelism. In the cub documentation, this is indicated here:
Usage Considerations
Dynamic parallelism. DeviceReduce methods can be called within kernel code on devices in which CUDA dynamic parallelism is supported.
For example, you can compile the code you have shown with:
$ cat t1364.cu
#include <cub/cub.cuh>
__device__ void sumcubdev(double* a, double *sum, int N)
{
// Declare, allocate, and initialize device-accessible pointers
//for input and output
// Determine temporary device storage requirements
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, a, sum, N);
// Allocate temporary storage
cudaMalloc(&d_temp_storage, temp_storage_bytes);
// Run sum-reduction
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, a, sum, N);
}
$ nvcc -arch=sm_35 -dc t1364.cu
$
(CUDA 9.2, CUB 1.8.0)
This means CUB will be launching child kernels to get the work done.
This is not a complete tutorial on how to use CUDA Dynamic Parallelism (CDP). The above is the compile command only and omits the link step. There are many questions on the cuda tag that discuss CDP, you can read about it in two blog articles and the programming guide, and there are CUDA sample projects showing how to compile and use it.
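For completeness, a plausible full build line under the same assumptions (CDP needs relocatable device code and a link against the device runtime library; this presumes the file also contains a main() and a kernel that calls sumcubdev):
$ nvcc -arch=sm_35 -rdc=true t1364.cu -lcudadevrt -o t1364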

Access to CUDA library functions inside specialized instantiations of __device__ function templates

I have the following template __device__ function in CUDA:
template<typename T>
__device__ void MyatomicAdd(T *address, T val){
atomicAdd(address , val);
}
that compiles and runs just fine if instantiated with T as a float, i.e.
__global__ void myKernel(float *a, float b){
MyatomicAdd<float>(a,b);
}
will run without a problem.
I wanted to specialize this function, as there is no atomicAdd() for doubles, so I can hand-code a double-precision implementation. Ignoring the double-precision specialization for now, the single-precision specialization and template look like this:
template<typename T>
__device__ void MyatomicAdd(T *address, T val){
};
template<>
__device__ void MyatomicAdd<float>(float *address, float val){
atomicAdd(address , val);
}
Now the compiler complains that atomicAdd() is undefined in my specialization, and the same applies when I try to use any CUDA functions like __syncthreads() within the specialization. Any ideas? Thanks.
It ended up being a linking problem with some OpenGL code developed by a colleague. Forcing the specializations to be inline fixed the problem, although obviously not the root cause. Still, it'll do for now until I can be bothered to dig through the other guy's code.
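For concreteness, "forcing the specializations to be inline" presumably amounts to something like this sketch based on the code above:
template<>
__device__ inline void MyatomicAdd<float>(float *address, float val){
    // inline linkage sidesteps the duplicate/unresolved symbol trouble at link time
    atomicAdd(address, val);
}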

invalid resource in a windows cuda project

I've ported a CUDA project from Linux to Windows (basically just added a few defines and typedefs in the header file). I'm using Visual Studio 2008 and the CUDA runtime API custom build rules from the SDK. The code is C, not C++ (and I'm compiling with /TC, not /TP).
I'm having scope issues that I didn't have on Linux: global variables in my header file aren't shared between the .c files and the .cu files.
I've created a simplified project, and here is all of the code:
main.h:
#ifndef MAIN_H
#define MAIN_H
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
cudaEvent_t cudaEventStart;
#if defined __cplusplus
extern "C" void func(void);
#else
extern void func(void);
#endif
#endif
main.c:
#include "main.h"
int main(void)
{
int iDevice = 0;
cudaSetDevice(iDevice);
cudaFree(0);
cudaGetDevice(&iDevice);
printf("device: %d\n", iDevice);
cudaEventCreate(&cudaEventStart);
printf("create event: %d\n", (int) cudaEventStart);
func();
cudaEventDestroy(cudaEventStart);
printf("destroy event: %d\n", (int) cudaEventStart);
return cudaThreadExit();
}
kernel.cu:
#include "main.h"
void func()
{
printf("event in cu: %d\n", (int) cudaEventStart);
}
output:
device: 0
create event: 44199920
event in cu: 0
event destroy: 441999920
Any ideas about what I am doing wrong here? How do I need to change my setup so that it works in visual studio? Ideally, I'd like a setup that works multi-platform.
CUDA 3.2, GTX 480, 64-bit Win7, 263.06 general
What you are trying to do would not work even without CUDA: try renaming kernel.cu to kernel.c and recompiling. You will get a linker error because cudaEventStart will be multiply defined, once in each compilation unit (.c file) that includes the header. You would need to make the variable static, and initialize it in only one compilation unit.
It compiles in CUDA because CUDA currently does not have a linker, and therefore code in compilation units compiled by nvcc (.cu files) cannot reference symbols in other compilation units; note that CUDA doesn't currently support static global variables either. In the future CUDA will have a linker, but today it does not.
What is happening is that each compilation unit gets its own, non-conflicting instance of cudaEventStart.
What you can do is get rid of the global variable (make it a local variable in main()), add a cudaEvent_t parameter to the functions that need to use the event, and then pass the event around, as in the sketch below.
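A minimal sketch of that suggestion, reusing the names from the question:
main.h:
#ifndef MAIN_H
#define MAIN_H
#include <stdio.h>
#include <cuda_runtime.h>
#if defined __cplusplus
extern "C" void func(cudaEvent_t ev);
#else
extern void func(cudaEvent_t ev);
#endif
#endif
main.c:
#include "main.h"
int main(void)
{
cudaEvent_t cudaEventStart;  /* local, not global */
cudaEventCreate(&cudaEventStart);
func(cudaEventStart);        /* pass the event explicitly */
cudaEventDestroy(cudaEventStart);
return 0;
}
kernel.cu:
#include "main.h"
void func(cudaEvent_t ev)
{
printf("event in cu: %d\n", (int) ev);
}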
BTW, in your second post, you have circular #includes...
I modified my simplified example (with success) by including the .cu file in the header and removing the forward declaration of the .cu file's function from the header.
main.h:
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "kernel.cu"
cudaEvent_t cudaEventStart;
main.c:
#include "main.h"
int main(void)
{
int iDevice = 0;
cudaSetDevice(iDevice);
cudaFree(0);
cudaGetDevice(&iDevice);
printf("device: %d\n", iDevice);
cudaEventCreate(&cudaEventStart);
printf("create event: %d\n", (int) cudaEventStart);
func();
cudaEventDestroy(cudaEventStart);
printf("destroy event: %d\n", (int) cudaEventStart);
return cudaThreadExit();
}
kernel.cu:
#ifndef KERNEL_CU
#define KERNEL_CU
#include "main.h"
void func(void);
void func()
{
printf("event in cu: %d\n", (int) cudaEventStart);
}
#endif
output:
device: 0
create event: 42784024
event in cu: 42784024
event destroy: 42784024
About to see if it works in my real project, and whether the solution is portable back to linux.

Using assert within kernel invocation

Is there a convenient way to use asserts within kernel invocations in device mode?
CUDA now has a native assert function: use assert(...). If its argument is zero, it will stop kernel execution and return an error (or trigger a breakpoint if a CUDA debugger is attached).
Make sure to include "assert.h". Also, this requires compute capability 2.x or higher and is not supported on MacOS. For more details see the CUDA C Programming Guide, Section B.16.
The programming guide also includes this example:
#include <assert.h>
__global__ void testAssert(void)
{
int is_one = 1;
int should_be_one = 0;
// This will have no effect
assert(is_one);
// This will halt kernel execution
assert(should_be_one);
}
int main(int argc, char* argv[])
{
testAssert<<<1,1>>>();
cudaDeviceSynchronize();
return 0;
}
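As a side note (not part of the guide's example), a failed device-side assert surfaces as an error on the next runtime call, so a check along these lines makes the failure visible:
cudaError_t err = cudaDeviceSynchronize();
if (err != cudaSuccess)
printf("kernel failed: %s\n", cudaGetErrorString(err)); // e.g. cudaErrorAssert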
On devices that predate native assert support, a crude substitute is a macro that simply returns from the kernel when the condition fails:
#define MYASSERT(condition) \
if (!(condition)) { return; }
MYASSERT(condition);
If you need something fancier, you can use cuPrintf(), which is available from the CUDA site for registered developers.