I need to create a shared library for CUDA. Compiling the library works fine, but when I try to use it in my program, nvcc reports a linker or ptxas error.
I reduced the problem to the following code. The library is meant to reimplement several C functions (here: memset).
The library consists of three C++ files:
FileA.h
#ifndef FILEA_H_
#define FILEA_H_
namespace A {
__device__
void* memset(void* _in, int _val, int _size);
};
#endif
FileA.cpp
#include "FileA.h"
__device__
void* A::memset(void* _in, int _val, int _size) {
char* tmp = (char*)_in;
for(int i = 0; i < _size; i++) tmp[i] = _val;
return _in;
}
TempClass.h
#ifndef TEMPCLASS_H_
#define TEMPCLASS_H_
#include "FileA.h"
namespace A {
template <typename T>
class TC {
public:
__device__
TC() {
data = new T[10];
}
__device__
~TC(){
delete [] data;
}
__device__
void clear(){
A::memset(data, 0, 10*sizeof(T));
}
T* data;
};
};
#endif
Using the following commands I create a shared library:
nvcc -Xcompiler -fPIC -x cu -rdc=true -c FileA.cpp -o FileA.o
nvcc -Xcompiler -fPIC --shared -o libTestA.so FileA.o -lcudart
This library should be used in a main program:
main.cpp
#include <cuda.h>
#include <TempClass.h>
#include <iostream>
__device__
int doSomething() {
A::TC<int>* tc = new A::TC<int>();
tc->clear();
for (int i = 0; i < 5; i++) tc->data[i] = i;
int sum = 0;
for (int i = 0; i < 5; i++) sum += tc->data[i];
delete tc;
return sum;
}
__global__
void kernel(int* _res) {
_res[0] = doSomething();
}
int main(int argc, char** argv) {
int* devVar;
int* hostVar;
hostVar = new int[1];
hostVar[0] = -1;
cudaMalloc(&devVar, sizeof(int));
cudaMemcpy(devVar, hostVar, sizeof(int), cudaMemcpyHostToDevice);
kernel<<< 1, 1>>> (devVar);
cudaMemcpy(hostVar, devVar, sizeof(int), cudaMemcpyDeviceToHost);
std::cout << "kernel done. sum " << *hostVar << std::endl;
return 0;
}
If I try to compile the program with the commands:
nvcc -Xcompiler -fPIC -I. -L. -rdc=true -x cu -c main.cpp -o main.o
nvcc -Xcompiler -fPIC -I. -L. main.o -o main -lTestA
I receive the error message:
nvlink error : Undefined reference to '_ZN1A6memsetEPvii' in 'main.o'
I receive the same error if I try to compile the file directly:
nvcc -Xcompiler -fPIC -I. -L. -rdc=true -x cu main.cpp -o main -lTestA
The command nm libTestA.so shows that the library contains the function symbol _ZN1A6memsetEPvii.
When I remove the -rdc=true option while linking I receive a ptxas error:
ptxas fatal : Unresolved extern function '_ZN1A6memsetEPvii'
In my case static linking is not an option; I need a shared library. I've also tried making memset an extern "C" function, but that collides with the original C function. The code compiles correctly with g++. Do you have any suggestions for solving this problem?
It appears that you are attempting to do device-code linking across a library boundary. Currently, that is only possible with a static library.
The options that I am aware of would be to switch to a static library/link arrangement, or else refactor your code so that you do not need to link device code across a dynamic library boundary.
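For reference, a minimal sketch of the static-library arrangement, using the same file names as above (-dc implies -rdc=true -c, and -lib produces a static archive that the final nvcc call can device-link against):
nvcc -Xcompiler -fPIC -x cu -dc FileA.cpp -o FileA.o
nvcc -lib FileA.o -o libTestA.a
nvcc -Xcompiler -fPIC -I. -x cu -dc main.cpp -o main.o
nvcc main.o libTestA.a -o main
The last command performs the device link across main.o and libTestA.a, which is exactly the step that cannot currently cross a .so boundary.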
What do we have to do to use cuPrintf()? (device compute capability 1.2, Ubuntu 12) I couldn't find "cuPrintf.cu" and "cuPrintf.cuh", so I downloaded their code and included them:
#include "cuPrintf.cuh"
#include "cuPrintf.cu"
By the way this is the rest of the code:
__global__ void hello_kernel (float f) {
printf ("Thread number %d. f = %d\n", threadIdx.x, f);
}
int main () {
dim3 gridSize = dim3 (1);
dim3 blockSize = dim3 (16);
cudaPrintfInit ();
hello_kernel <<< gridSize, blockSize >>> (1.2345f);
cudaPrintfDisplay (stdout, true);
cudaPrintfEnd ();
return (0);
}
But nvcc still gives an error:
max#max-Lenovo-G560:~/CUDA/matrixMult$ nvcc printfTest.cu -o printfTest
printfTest.cu(5): error: calling a __host__ function("printf") from a __global__
function("hello_kernel") is not allowed
Thanks!
In your kernel instead of this:
printf ("Thread number %d. f = %d\n", threadIdx.x, f);
you should do this:
cuPrintf ("Thread number %d. f = %d\n", threadIdx.x, f);
Other than that, I believe your code is correct (it works for me).
This SO question/answer gives more tips about using cuPrintf properly.
Include <stdio.h> and compile with -arch=sm_20.
Details:
code:
#include <stdio.h>
__global__ void hello_kernel (float f) {
printf ("Thread number %d. f = %d\n", threadIdx.x, f);
}
int main(){
return 0;
}
compilations:
nvcc -arch=sm_20 -o printfTest printfTest.cu
If I have a simple test CUDA device function in a hello.cu file as:
extern "C" __device__ float radians( float f ){
return f*3.14159265;
}
And test OpenACC code in mainacc.c:
#include <stdio.h>
#include <stdlib.h>
#define N 10
#pragma acc routine seq
extern float radians( float );
int main() {
int i;
float *hptr, *dptr;
hptr = (float *) calloc(N, sizeof(float));
#pragma acc parallel loop copy(hptr[0:N])
for(i=0; i<N; i++) {
hptr[i] = radians(i*0.1f);
}
for( i=0; i< N; i++)
printf("\n %dth value : %f", i, hptr[i]);
return 0;
}
If I try to compile this code as shown below, I get link-time errors:
nvcc hello.cu -c
cc -hacc -hlist=a mainacc.c hello.o
nvlink error : Undefined reference to 'radians' in '/tmp/pe_20271//app_cubin_20271.omainacc_1.o__sec.cubin'
cuda_link: nvlink fatal error
I tried nvcc with the "--relocatable-device-code true" option etc., but had no success. Loaded modules are:
craype-accel-nvidia35
cudatoolkit/6.5
PrgEnv-cray/5.2.40
Could you tell me the correct way to use a CUDA device function within OpenACC?
I've been able to make this sort of mixing work with PGI, but I've not yet been able to produce a sample that works with the Cray compiler. Here's a simple example that works for PGI.
This is the file containing the CUDA.
// saxpy_cuda_device.cu
extern "C"
__device__
float saxpy_dev(float a, float x, float y)
{
return a * x + y;
}
This is the file containing OpenACC.
// openacc_cuda_device.cpp
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#pragma acc routine seq
extern "C" float saxpy_dev(float, float, float);
int main(int argc, char **argv)
{
float *x, *y, tmp;
int n = 1<<20, i;
x = (float*)malloc(n*sizeof(float));
y = (float*)malloc(n*sizeof(float));
#pragma acc data create(x[0:n]) copyout(y[0:n])
{
#pragma acc kernels
{
for( i = 0; i < n; i++)
{
x[i] = 1.0f;
y[i] = 0.0f;
}
}
#pragma acc parallel loop
for( i = 0; i < n; i++ )
{
y[i] = saxpy_dev(2.0, x[i], y[i]);
}
}
fprintf(stdout, "y[0] = %f\n",y[0]);
return 0;
}
Below is the compilation command.
$ make
nvcc -rdc true -c saxpy_cuda_device.cu
pgc++ -fast -acc -ta=nvidia:rdc,cuda7.0 -c openacc_cuda_device.cpp
pgc++ -o openacc_cuda_device -fast -acc -ta=nvidia:rdc,cuda7.0 saxpy_cuda_device.o openacc_cuda_device.o -Mcuda
You can use the -Wc command line option to add the generated ptx file to the CUDA link line. I've opened a bug to make sure we document how to do this.
nvcc hello.cu -ptx -arch=sm_35
cc -hacc -hlist=a mainacc.c -Wc,hello.ptx
One suggestion is to provide both a host and device version of the subroutine and then use the "bind" clause to indicate which version to call from a compute region. This will allow you to maintain portability with the host code.
For example:
% cat radians.cu
extern "C" __device__ float cuda_radians( float f ){
return f*3.14159265;
}
extern "C" float radians( float f ){
return f*3.14159265;
}
% cat test.c
#include <stdio.h>
#include <stdlib.h>
#define N 10
#pragma acc routine (radians) bind(cuda_radians) seq
extern float radians( float f);
int main() {
int i;
float *hptr, *dptr;
hptr = (float *) calloc(N, sizeof(float));
#pragma acc parallel loop copy(hptr[0:N])
for(i=0; i<N; i++) {
hptr[i] = radians(i*0.1f);
}
for( i=0; i< N; i++)
printf("\n %dth value : %f", i, hptr[i]);
return 0;
}
% nvcc -c radians.cu --relocatable-device-code true
% pgcc -acc -ta=tesla:cuda7.0 -Minfo=accel test.c radians.o -V15.7 -Mcuda
test.c:
main:
15, Generating copy(hptr[:10])
Accelerator kernel generated
Generating Tesla code
16, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
% a.out
0th value : 0.000000
1th value : 0.314159
2th value : 0.628319
3th value : 0.942478
4th value : 1.256637
5th value : 1.570796
6th value : 1.884956
7th value : 2.199115
8th value : 2.513274
9th value : 2.827434
I'm using CUDA and attempting to use a function pointer to pass a CUDA function to a library that later uses this function in its device kernel, similar to the CUDA function pointer example.
The important sections of the code are:
/** Type definition for the execution function in #qsched_run. */
typedef void (*qsched_funtype)( int , void * );
__device__ void gpuTest(int type , void *data)
{
....
}
__device__ qsched_funtype function = gpuTest;
void main(...)
{
//Various initialization setup.
if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess)
error("Failed to copy function pointer from device");
qsched_run_CUDA( &s , func );
}
The qsched_run_CUDA function is a library function that does some initialization, copies the function pointer to the device (to a variable it can see), and then runs a kernel that at some point calls the gpuTest function through that function pointer.
The code compiles correctly provided I use -G with the following nvcc call:
nvcc -g -G -m64 -I../src ../src/.libs/libquicksched_cuda.a -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda -DWITH_CUDA -gencode arch=compute_30,code=sm_30 -lgomp test_gpu_simple.cu -o out.out
where
../src/.libs/libquicksched_cuda.a
is the library containing the qsched_run_CUDA function.
The moment I remove the -G flag from my nvcc call, it all suddenly breaks: the kernel run in qsched_run_CUDA crashes with an invalid program counter error, and the function pointer (including in my own .cu file) is set to 0x4.
Presumably I need to use separate compilation in CUDA ( http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#using-separate-compilation-in-cuda ) as explained vaguely in Cuda function pointer consistency - however, I'm not sure how to do this when using library functions; neither nvcc's guide nor the stackoverflow link makes it obvious.
Does anyone have any experience with this? I briefly attempted to work out how to use nvlink for this, but I didn't get far (it didn't seem happy with me passing it a library).
Yes, you will need to use separate compilation. I put together a simple test case based on what you have shown so far, and using the nvcc separate compilation library example from the documentation. Here is the code:
kernel_lib.cu:
#include <stdio.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
/** Type definition for the execution function in #qsched_run. */
typedef void (*qsched_funtype)( int , void * );
__global__ void mykernel(int type, void *data, void *func){
((qsched_funtype)func)(type, data);
}
int qsched_run_CUDA(int val, void *d_data, void *func)
{
mykernel<<<1,1>>>(val, d_data, func);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
return 0;
}
main.cu:
#include <stdio.h>
#define DATA_VAL 5
int qsched_run_CUDA(int, void*, void*);
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
/** Type definition for the execution function in #qsched_run. */
typedef void (*qsched_funtype)( int , void * );
__device__ void gpuTest(int type , void *data)
{
((int *)data)[0] = type;
}
__device__ qsched_funtype function = gpuTest;
int main()
{
void *func;
cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype));
cudaCheckErrors("Failed to copy function pointer from device");
int h_data = 0;
int *d_data;
cudaMalloc((void **)&d_data, sizeof(int));
cudaCheckErrors("cudaMalloc fail");
cudaMemset(d_data, 0, sizeof(int));
cudaCheckErrors("cudaMemset fail");
int return_val = qsched_run_CUDA(DATA_VAL, (void *)d_data, func);
if (return_val != 0) printf("return code error\n");
cudaMemcpy(&h_data, d_data, sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy fail");
if (h_data != DATA_VAL) {printf("Fail! %d\n", h_data); return 1;}
printf("Success!\n");
return 0;
}
compile commands and result:
$ nvcc -arch=sm_20 -dc kernel_lib.cu
$ nvcc -lib kernel_lib.o -o test.a
$ nvcc -arch=sm_20 -dc main.cu
$ nvcc -arch=sm_20 main.o test.a -o test
$ ./test
Success!
$
I used CUDA 5.0 for this test.
I am trying to use the thrust::copy_if function on mapped memory. However, since I get a runtime error whose cause I have not been able to find, before spending a lot of time on debugging I would like confirmation that it is actually allowed to pass a pointer to a mapped memory location to the thrust::device_ptr wrapper.
Here is an example of what I mean:
int size=1024;
int* v_locked;
int* v_device;
int* stencil_device;
device_ptr<int> v_wrapper;
device_ptr<int> v_wrapper_end;
device_ptr<int> stencil_wrapper;
cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped));
cudaHostGetDevicePointer(&v_device, &v_locked, 0);
cudaMalloc((void**)&stencil_device, size*sizeof(int));
/*
kernel assigning stencil_device elements ...
*/
v_wrapper = device_pointer_cast(v_device);
stencil_wrapper = device_pointer_cast(stencil_device);
v_wrapper_end = copy_if(make_counting_iterator<int>(0), make_counting_iterator<int>(size), stencil_wrapper, v_wrapper, _1 == 1);
Is this a correct usage of mapped memory with thrust library?
Thank you.
Yes, it is possible.
I believe there were several problems with your code.
You don't appear to be doing any proper CUDA error checking. If you were, you would have detected that although your calls to cudaHostGetDevicePointer seem to compile correctly, they were not set up correctly.
As mentioned above, your calls to cudaHostGetDevicePointer() were not set up correctly. The second argument is passed as a single pointer (*), not a double pointer (**). Refer to the documentation. The call as written would throw a CUDA runtime error which you can trap.
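With the variable names from your code, the corrected call (as in the sample below) is:
cudaHostGetDevicePointer(&v_device, v_locked, 0);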
Prior to your cudaHostAlloc calls, you should call cudaSetDeviceFlags(cudaDeviceMapHost); to enable this feature.
Here is a sample code which seems to work correctly for me, and has the above problems fixed:
$ cat t281.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/copy.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
template<typename T>
struct is_one : thrust::unary_function<T, bool>
{
__host__ __device__
bool operator()(const T &x)
{
return (x==1);
}
};
int main(){
int size=1024;
int* v_locked;
int* v_device;
int* stencil_locked;
int* stencil_device;
cudaSetDeviceFlags(cudaDeviceMapHost);
cudaCheckErrors("cudaSetDeviceFlags");
cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped);
cudaCheckErrors("cudaHostAlloc 1");
cudaHostGetDevicePointer(&v_device, v_locked, 0);
cudaCheckErrors("cudaHostGetDevicePointer 1");
cudaHostAlloc((void**)&stencil_locked, size*sizeof(int), cudaHostAllocMapped);
cudaCheckErrors("cudaHostAlloc 2");
cudaHostGetDevicePointer(&stencil_device, stencil_locked, 0);
cudaCheckErrors("cudaHostGetDevicePointer 2");
for (int i = 0; i < size; i++){
v_locked[i] = i;
stencil_locked[i] = i%2;}
thrust::device_ptr<int> v_wrapper = thrust::device_pointer_cast(v_device);
thrust::device_ptr<int> stencil_wrapper = thrust::device_pointer_cast(stencil_device);
thrust::device_ptr<int> v_wrapper_end = v_wrapper + size;
thrust::device_vector<int> result(size);
thrust::device_vector<int>::iterator result_end = copy_if(v_wrapper, v_wrapper_end, stencil_wrapper, result.begin(), is_one<int>());
int result_size = result_end - result.begin();
thrust::host_vector<int> h_result(result_size);
thrust::copy_n(result.begin(), result_size, h_result.begin());
thrust::copy_n(h_result.begin(), 10, std::ostream_iterator<int>(std::cout, " "));
std::cout << std::endl;
return 0;
}
$ nvcc -arch=sm_20 -o t281 t281.cu
$ ./t281
1 3 5 7 9 11 13 15 17 19
$
I'm trying to compile a kernel that uses dynamic parallelism to run CUBLAS, producing a cubin file.
When I try to compile the code using the command
nvcc -cubin -m64 -lcudadevrt -lcublas_device -gencode arch=compute_35,code=sm_35 -o test.cubin -c test.cu
I get ptxas fatal : Unresolved extern function 'cublasCreate_v2'
If I add the -rdc=true compile option it compiles fine, but when I try to load the module using cuModuleLoad I get error 500: CUDA_ERROR_NOT_FOUND. From cuda.h:
/**
* This indicates that a named symbol was not found. Examples of symbols
* are global/constant variable names, texture names, and surface names.
*/
CUDA_ERROR_NOT_FOUND = 500,
The kernel code:
#include <stdio.h>
#include <cublas_v2.h>
extern "C" {
__global__ void a() {
cublasHandle_t cb_handle = NULL;
cudaStream_t stream;
if( threadIdx.x == 0 ) {
cublasStatus_t status = cublasCreate_v2(&cb_handle);
cublasSetPointerMode_v2(cb_handle, CUBLAS_POINTER_MODE_HOST);
if (status != CUBLAS_STATUS_SUCCESS) {
return;
}
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
cublasSetStream_v2(cb_handle, stream);
}
__syncthreads();
int jp;
double A[3];
A[0] = 4.0f;
A[1] = 5.0f;
A[2] = 6.0f;
cublasIdamax_v2(cb_handle, 3, A, 1, &jp );
}
}
NOTE: The scope of A is local, so the data at the pointer given to cublasIdamax_v2 is undefined, and so jp ends up as a more or less random value in this code. The correct way to do it would be to have A in global memory.
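A minimal, untested sketch of that fix, using device-side malloc (which allocates from the global-memory heap) and leaving the handle setup unchanged:
// hypothetical fix: place A in global memory via device-side malloc
double* A = (double*)malloc(3 * sizeof(double));
A[0] = 4.0; A[1] = 5.0; A[2] = 6.0;
cublasIdamax_v2(cb_handle, 3, A, 1, &jp);
free(A); // release the heap allocation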
Host code:
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
int main() {
CUresult error;
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUfunction testkernel;
// Initialize
error = cuInit(0);
if (error != CUDA_SUCCESS) printf("ERROR: cuInit, %i\n", error);
error = cuDeviceGet(&cuDevice, 0);
if (error != CUDA_SUCCESS) printf("ERROR: cuInit, %i\n", error);
error = cuCtxCreate(&cuContext, 0, cuDevice);
if (error != CUDA_SUCCESS) printf("ERROR: cuCtxCreate, %i\n", error);
error = cuModuleLoad(&cuModule, "test.cubin");
if (error != CUDA_SUCCESS) printf("ERROR: cuModuleLoad, %i\n", error);
error = cuModuleGetFunction(&testkernel, cuModule, "a");
if (error != CUDA_SUCCESS) printf("ERROR: cuModuleGetFunction, %i\n", error);
return 0;
}
The host code is compiled using nvcc -lcuda test.cpp.
If I replace the kernel with a simple kernel (below) and compile it without -rdc=true, it works fine.
Simple working kernel
#include <stdio.h>
extern "C" {
__global__ void a() {
printf("hello\n");
}
}
Thanks in advance
Soren
You are just missing -dlink in your first approach; without a device-link step the kernel's references into cublas_device are never resolved, which is why the cubin fails to load. Add it like this:
nvcc -cubin -m64 -lcudadevrt -lcublas_device -gencode arch=compute_35,code=sm_35 -o test.cubin -c test.cu -dlink
You can also do that in two steps:
nvcc -m64 test.cu -gencode arch=compute_35,code=sm_35 -o test.o -dc
nvcc -dlink test.o -arch sm_35 -lcublas_device -lcudadevrt -cubin -o test.cubin
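As a quick sanity check (my suggestion, beyond the original commands), you can dump the device-linked cubin and confirm that the kernel a is present:
cuobjdump -elf test.cubin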