CUDA 5.0: CUBIN and CUBLAS_device, compute capability 3.5 - cuda

I'm trying to compile a kernel that uses dynamic parallelism to call CUBLAS into a cubin file.
When I try to compile the code using the command
nvcc -cubin -m64 -lcudadevrt -lcublas_device -gencode arch=compute_35,code=sm_35 -o test.cubin -c test.cu
I get ptxas fatal : Unresolved extern function 'cublasCreate_v2'
If I add the -rdc=true compile option it compiles fine, but when I try to load the module using cuModuleLoad I get error 500: CUDA_ERROR_NOT_FOUND. From cuda.h:
/**
* This indicates that a named symbol was not found. Examples of symbols
* are global/constant variable names, texture names, and surface names.
*/
CUDA_ERROR_NOT_FOUND = 500,
The kernel code:
#include <stdio.h>
#include <cublas_v2.h>
extern "C" {
// Kernel that creates a device-side cuBLAS handle and runs cublasIdamax_v2 on
// a three-element array.
//
// Only thread 0 creates the handle; cb_handle is a per-thread local, so every
// other thread keeps the NULL it was initialised with and does no cuBLAS work.
// Requires device-side cuBLAS (the surrounding build uses -lcublas_device and
// compute capability 3.5).
__global__ void a() {
    cublasHandle_t cb_handle = NULL;
    cudaStream_t stream;
    bool stream_created = false;

    if (threadIdx.x == 0) {
        cublasStatus_t status = cublasCreate_v2(&cb_handle);
        if (status == CUBLAS_STATUS_SUCCESS) {
            // Configure the handle only once creation is known to have succeeded.
            cublasSetPointerMode_v2(cb_handle, CUBLAS_POINTER_MODE_HOST);
            if (cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) == cudaSuccess) {
                stream_created = true;
                cublasSetStream_v2(cb_handle, stream);
            }
        } else {
            // Do not keep a handle value that cuBLAS did not hand out.
            cb_handle = NULL;
        }
    }

    // Every thread of the block reaches the barrier; no thread returns before it.
    __syncthreads();

    // Threads without a valid handle (all but thread 0, or thread 0 after a
    // failed create) have nothing to do.
    if (cb_handle == NULL) {
        return;
    }

    int jp;
    // NOTE(review): A is a thread-local array; data handed to cuBLAS from a
    // kernel should presumably live in global memory instead.
    double A[3];
    A[0] = 4.0;
    A[1] = 5.0;
    A[2] = 6.0;
    cublasIdamax_v2(cb_handle, 3, A, 1, &jp);

    // Release what this thread created.
    cublasDestroy_v2(cb_handle);
    if (stream_created) {
        cudaStreamDestroy(stream);
    }
}
}
NOTE: The scope of A is local, so the data at the pointer given to cublasIdamax_v2 is undefined, and so jp ends up as a more or less random value in this code. The correct way to do it would be to have A in global memory.
Host code:
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
// Loads test.cubin with the CUDA driver API and looks up the kernel "a".
//
// Returns 0 when every driver call succeeds and 1 otherwise. Each failure is
// reported on stdout together with the numeric CUresult code.
int main() {
    CUresult error;
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction testkernel;
    int exit_code = 0;

    // Initialize
    error = cuInit(0);
    if (error != CUDA_SUCCESS) {
        printf("ERROR: cuInit, %i\n", error);
        return 1;
    }
    error = cuDeviceGet(&cuDevice, 0);
    if (error != CUDA_SUCCESS) {
        // Name the call that actually failed.
        printf("ERROR: cuDeviceGet, %i\n", error);
        return 1;
    }
    error = cuCtxCreate(&cuContext, 0, cuDevice);
    if (error != CUDA_SUCCESS) {
        printf("ERROR: cuCtxCreate, %i\n", error);
        return 1;
    }

    error = cuModuleLoad(&cuModule, "test.cubin");
    if (error != CUDA_SUCCESS) {
        printf("ERROR: cuModuleLoad, %i\n", error);
        exit_code = 1;
    } else {
        // Only query the module when it was actually loaded.
        error = cuModuleGetFunction(&testkernel, cuModule, "a");
        if (error != CUDA_SUCCESS) {
            printf("ERROR: cuModuleGetFunction, %i\n", error);
            exit_code = 1;
        }
        cuModuleUnload(cuModule);
    }

    // The context was created above, so release it on every remaining path.
    cuCtxDestroy(cuContext);
    return exit_code;
}
The host code is compiled using nvcc -lcuda test.cpp.
If I replace the kernel with a simple kernel (below) and compile it without -rdc=true, it works fine.
Simple working kernel
#include <stdio.h>
extern "C" {
// Minimal kernel: every launched thread prints "hello" via device-side printf.
__global__ void a() {
printf("hello\n");
}
}
Thanks in advance
Soren

You are just missing -dlink in your first approach:
nvcc -cubin -m64 -lcudadevrt -lcublas_device -gencode arch=compute_35,code=sm_35 -o test.cubin -c test.cu -dlink
You can also do that in two steps:
nvcc -m64 test.cu -gencode arch=compute_35,code=sm_35 -o test.o -dc
nvcc -dlink test.o -arch sm_35 -lcublas_device -lcudadevrt -cubin -o test.cubin

Related

Compute Capability printf Function [duplicate]

What do we have to do to use cuPrintf()? (device compute capability 1.2, Ubuntu 12) I couldn't find "cuPrintf.cu" and "cuPrintf.cuh", so I downloaded their code and included them:
#include "cuPrintf.cuh"
#include "cuPrintf.cu"
By the way this is the rest of the code:
// Kernel from the question: prints the thread index and the float argument f.
// NOTE(review): "%d" is paired with the float argument f; "%f" is presumably
// what was intended.
__global__ void hello_kernel (float f) {
printf ("Thread number %d. f = %d\n", threadIdx.x, f);
}
// Host entry point: launches hello_kernel on one block of 16 threads between
// the cuPrintf init / display / end calls (presumably provided by the included
// cuPrintf sources).
int main()
{
    const dim3 blocks(1);
    const dim3 threadsPerBlock(16);

    cudaPrintfInit();
    hello_kernel<<<blocks, threadsPerBlock>>>(1.2345f);
    cudaPrintfDisplay(stdout, true);
    cudaPrintfEnd();

    return 0;
}
But nvcc still reports an error:
max@max-Lenovo-G560:~/CUDA/matrixMult$ nvcc printfTest.cu -o printfTest
printfTest.cu(5): error: calling a __host__ function("printf") from a __global__
function("hello_kernel") is not allowed
Thanks!
In your kernel instead of this:
printf ("Thread number %d. f = %d\n", threadIdx.x, f);
you should do this:
cuPrintf ("Thread number %d. f = %d\n", threadIdx.x, f);
Other than that, I believe your code is correct (it works for me).
This SO question/answer gives more tips about using cuPrintf properly.
Include <stdio.h> and compile with -arch=sm_20.
Details:
code:
#include <stdio.h>
// Prints each thread's index and the float argument f with device-side printf
// (the accompanying text says to compile with -arch=sm_20).
__global__ void hello_kernel (float f) {
    // %u matches the unsigned threadIdx.x; %f matches f, which is promoted to
    // double when passed through the variadic argument list.
    printf ("Thread number %u. f = %f\n", threadIdx.x, f);
}
// Launches hello_kernel on one block of 16 threads and waits for it to finish.
// Returns 0 on success, 1 if the launch or the execution reports a CUDA error.
int main(){
    hello_kernel<<<1, 16>>>(1.2345f);

    // Launch-configuration errors are only visible through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        // Wait for the kernel to finish so its printf output is emitted
        // before the process exits.
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
compilations:
nvcc -arch=sm_20 -o printfTest printfTest.cu

CUDA build shared library

I need to create a shared library for cuda. The compilation of the library works fine but when I try to use it in my program nvcc returns a linker or ptxas error.
I reduced the problem to the following code. The library must replace different C functions (here: memset).
The library consists of three C++ files:
FileA.h
#ifndef FILEA_H_
#define FILEA_H_
namespace A {

// Device-side memset replacement.
// Takes the destination pointer, the fill value and the size, and returns a
// void pointer.
__device__ void* memset(void* _in, int _val, int _size);

}  // namespace A
#endif
FileA.cpp
#include "FileA.h"
// Writes _val (converted to char) into each of the first _size bytes at _in
// and returns _in. Nothing is written when _size is zero or negative.
__device__ void* A::memset(void* _in, int _val, int _size)
{
    char* cursor = static_cast<char*>(_in);
    int remaining = _size;
    while (remaining > 0) {
        *cursor = _val;
        ++cursor;
        --remaining;
    }
    return _in;
}
TempClass.h
#ifndef TEMPCLASS_H_
#define TEMPCLASS_H_
#include "FileA.h"
namespace A {

// Device-side container that owns a heap array of kCapacity elements of T.
//
// The array is allocated in the constructor and released in the destructor.
// Copying is disabled: an implicit copy would leave two objects owning the
// same array, and both destructors would delete it.
template <typename T>
class TC {
public:
    // Number of elements in the owned array.
    enum { kCapacity = 10 };

    // Allocates the array.
    __device__
    TC() : data(new T[kCapacity]) {
    }

    // Releases the array (delete [] on a null pointer is a no-op).
    __device__
    ~TC() {
        delete [] data;
    }

    // Zero-fills the whole array through A::memset.
    __device__
    void clear() {
        // Skip the fill when there is no array to write to.
        if (data != 0) {
            A::memset(data, 0, kCapacity * sizeof(T));
        }
    }

    T* data;

private:
    // Declared but intentionally not defined: TC is non-copyable.
    TC(const TC&);
    TC& operator=(const TC&);
};

}  // namespace A
#endif
Using the following commands I create a shared library:
nvcc -Xcompiler -fPIC -x cu -rdc=true -c FileA.cpp -o FileA.o
nvcc -Xcompiler -fPIC --shared -o libTestA.so FileA.o -lcudart
This library should be used in a main program:
main.cpp
#include <cuda.h>
#include <TempClass.h>
#include <iostream>
// Allocates an A::TC<int>, clears it, stores 0..4 in its first five elements,
// and returns the sum of those five elements. The object is deleted before
// returning.
__device__ int doSomthing()
{
    A::TC<int>* holder = new A::TC<int>();
    holder->clear();

    int total = 0;
    for (int k = 0; k < 5; ++k) {
        holder->data[k] = k;
        total += holder->data[k];
    }

    delete holder;
    return total;
}
// Stores the value returned by doSomthing() in the first element of _res.
__global__ void kernel(int* _res)
{
    const int value = doSomthing();
    *_res = value;
}
// Runs kernel<<<1, 1>>> on a single device int and prints the value it wrote.
//
// Returns 0 on success and 1 if any CUDA call fails. The device allocation is
// released on every path.
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    int hostVar = -1;   // a stack value replaces the original heap array that was never freed
    int* devVar = 0;

    cudaError_t err = cudaMalloc(&devVar, sizeof(int));
    if (err == cudaSuccess) {
        err = cudaMemcpy(devVar, &hostVar, sizeof(int), cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        kernel<<< 1, 1>>> (devVar);
        // Launch errors are only reported through cudaGetLastError().
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // The blocking copy also reports errors raised while the kernel ran.
        err = cudaMemcpy(&hostVar, devVar, sizeof(int), cudaMemcpyDeviceToHost);
    }

    // cudaFree accepts a null pointer, so this is safe even if cudaMalloc failed.
    cudaFree(devVar);

    if (err != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    std::cout << "kernel done. sum " << hostVar << std::endl;
    return 0;
}
If I try to compile the program with the commands:
nvcc -Xcompiler -fPIC -I. -L. -rdc=true -x cu -c main.cpp -o main.o
nvcc -Xcompiler -fPIC -I. -L. main.o -o main -lTestA
I receive the error message:
nvlink error : Undefined reference to '_ZN1A6memsetEPvii' in 'main.o'
I receive the same error if I try to compile the file directly:
nvcc -Xcompiler -fPIC -I. -L. -rdc=true -x cu main.cpp -o main -lTestA
The command nm libTestA.so shows that the library contains the function symbol _ZN1A6memsetEPvii.
When I remove the -rdc=true option while linking I receive a ptxas error:
ptxas fatal : Unresolved extern function '_ZN1A6memsetEPvii'
In my case static linking is not an option; I need a shared library. I've also tried to make memset an extern "C" function, but this collides with the original C function. The code compiles correctly with g++. Do you have any suggestions on how to solve this problem?
It appears that you are attempting to do device-code linking across a library boundary. Currently, that is only possible with a static library.
The options that I am aware of would be to switch to a static library/link arrangement, or else refactor your code so that you do not need to link device code across a dynamic library boundary.

Passing CUDA function pointers with libraries

I'm using CUDA and attempting to use a function pointer to pass a CUDA function to a library that later uses this function in its device kernel, similar to the CUDA function pointer example.
The important sections of the code are:
/** Type definition for the execution function in #qsched_run. */
typedef void (*qsched_funtype)( int , void * );
__device__ void gpuTest(int type , void *data)
{
....
}
__device__ qsched_funtype function = gpuTest;
// Abbreviated host entry point from the question; the declarations of func and
// s are elided.
void main(...)
{
//Various initialization setup.
// Copy the value stored in the __device__ variable `function` into the host
// variable func.
if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess)
error("Failed to copy function pointer from device");
// Pass the copied pointer value to the library entry point.
qsched_run_CUDA( &s , func );
}
The qsched_run_CUDA function is a library function that does some initialization, copies the function pointer to the device (to a variable it can see) and then runs a kernel that at some points calls the gpuTest function using that function pointer.
The code compiles correctly provided I use -G with the following nvcc call:
nvcc -g -G -m64 -I../src ../src/.libs/libquicksched_cuda.a -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda -DWITH_CUDA -gencode arch=compute_30,code=sm_30 -lgomp test_gpu_simple.cu -o out.out
where
../src/.libs/libquicksched_cuda.a
is the library containing the qsched_run_CUDA function.
The moment I remove the -G flag from my nvcc call then suddenly it all breaks, and the kernel run in qsched_run_CUDA crashes with an invalid program counter error, and the function pointer (including in my own .cu file) is set to 0x4.
Presumably I need to use separate compilation in CUDA ( http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#using-separate-compilation-in-cuda ), as explained vaguely in "Cuda function pointer consistency" - however, I'm not sure how to do this when using library functions; neither nvcc's guide nor the Stack Overflow link makes it obvious how to do this.
Has anyone any experience with this? I attempted to briefly try to work out nvlink to do this but I didn't get far (it didn't seem happy with my passing it a library).
Yes, you will need to use separate compilation. I put together a simple test case based on what you have shown so far, and using the nvcc separate compilation library example from the documentation. Here is the code:
kernel_lib.cu:
#include <stdio.h>
// Reads the most recent CUDA runtime error with cudaGetLastError(). If it is
// not cudaSuccess, prints msg, the error string and the call site to stderr
// and terminates the process with exit(1).
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t last_err_ = cudaGetLastError(); \
        if (cudaSuccess != last_err_) { \
            fprintf(stderr, \
                    "Fatal error: %s (%s at %s:%d)\n" \
                    "*** FAILED - ABORTING\n", \
                    msg, cudaGetErrorString(last_err_), \
                    __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)
/** Type definition for the execution function in #qsched_run. */
typedef void (*qsched_funtype)( int , void * );
// Kernel: treats func as a qsched_funtype and calls it with (type, data).
__global__ void mykernel(int type, void *data, void *func)
{
    qsched_funtype callee = (qsched_funtype)func;
    callee(type, data);
}
// Launches mykernel with a single thread, forwarding val, d_data and func,
// waits for the device, and checks for errors via cudaCheckErrors (which
// exits the process on failure). Returns 0.
int qsched_run_CUDA(int val, void *d_data, void *func)
{
    const dim3 grid(1);
    const dim3 block(1);

    mykernel<<<grid, block>>>(val, d_data, func);
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");

    return 0;
}
main.cu:
#include <stdio.h>
#define DATA_VAL 5
int qsched_run_CUDA(int, void*, void*);
// Reads the most recent CUDA runtime error with cudaGetLastError(). If it is
// not cudaSuccess, prints msg, the error string and the call site to stderr
// and terminates the process with exit(1).
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t last_err_ = cudaGetLastError(); \
        if (cudaSuccess != last_err_) { \
            fprintf(stderr, \
                    "Fatal error: %s (%s at %s:%d)\n" \
                    "*** FAILED - ABORTING\n", \
                    msg, cudaGetErrorString(last_err_), \
                    __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)
/** Type definition for the execution function in #qsched_run. */
typedef void (*qsched_funtype)( int , void * );
// Device function: writes type into the int that data points to.
__device__ void gpuTest(int type , void *data)
{
    int *slot = static_cast<int *>(data);
    *slot = type;
}
__device__ qsched_funtype function = gpuTest;
// Copies the device function pointer stored in `function` to the host, passes
// it to qsched_run_CUDA together with a device int, and verifies that the
// device int ends up holding DATA_VAL.
//
// Returns 0 on success and 1 when qsched_run_CUDA reports a non-zero code or
// the value read back is wrong. CUDA errors terminate the process through
// cudaCheckErrors.
int main()
{
    void *func;
    cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype));
    cudaCheckErrors("Failed to copy function pointer from device");

    int h_data = 0;
    int *d_data;
    cudaMalloc((void **)&d_data, sizeof(int));
    cudaCheckErrors("cudaMalloc fail");
    cudaMemset(d_data, 0, sizeof(int));
    cudaCheckErrors("cudaMemset fail");

    int return_val = qsched_run_CUDA(DATA_VAL, (void *)d_data, func);
    if (return_val != 0) printf("return code error\n");

    cudaMemcpy(&h_data, d_data, sizeof(int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy fail");

    // Release the device buffer before any return path below.
    cudaFree(d_data);
    cudaCheckErrors("cudaFree fail");

    // A reported failure must not end in "Success!" and exit code 0.
    if (return_val != 0) return 1;
    if (h_data != DATA_VAL) {printf("Fail! %d\n", h_data); return 1;}
    printf("Success!\n");
    return 0;
}
compile commands and result:
$ nvcc -arch=sm_20 -dc kernel_lib.cu
$ nvcc -lib kernel_lib.o -o test.a
$ nvcc -arch=sm_20 -dc main.cu
$ nvcc -arch=sm_20 main.o test.a -o test
$ ./test
Success!
$
I used CUDA 5.0 for this test.

Is it possible to use thrust::device_ptr on a mapped array?

I am trying to use the thrust::copy_if function on mapped memory. However, I get a runtime error whose cause I have not been able to find, so before spending a lot of time debugging I would like confirmation that it is actually allowed to pass a pointer to a mapped memory location to the thrust::device_ptr wrapper.
Here is an example of what I mean:
int size=1024;
int* v_locked;
int* v_device;
int* stencil_device;
device_ptr<int> v_wrapper;
device_ptr<int> v_wrapper_end;
device_ptr<int> stencil_wrapper;
cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped));
cudaHostGetDevicePointer(&v_device, &v_locked, 0);
cudaMalloc((void**)&stencil_device, size*sizeof(int));
/*
kernel assigning stencil_device elements ...
*/
v_wrapper = device_pointer_cast(v_device);
stencil_wrapper = device_pointer_cast(stencil_device);
v_wrapper_end = copy_if(make_counting_iterator<int>(0), make_counting_iterator<int>(size), stencil_wrapper, v_wrapper, _1 == 1);
Is this a correct usage of mapped memory with thrust library?
Thank you.
Yes, it is possible.
I believe there were several problems with your code.
You don't appear to be doing any proper CUDA error checking. If you were, you would have detected that although your calls to cudaHostGetDevicePointer seem to compile correctly, they were not set up correctly.
As mentioned above, your calls to cudaHostGetDevicePointer() were not set up correctly. The second pointer argument is passed as a single pointer (*), not a double pointer (**). Refer to the documentation. This call as written would throw a CUDA runtime error which you can trap.
Prior to your cudaHostAlloc calls, you should use the cudaSetDeviceFlags(cudaDeviceMapHost); call to enable this feature.
Here is a sample code which seems to work correctly for me, and has the above problems fixed:
$ cat t281.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/copy.h>
// Reads the most recent CUDA runtime error with cudaGetLastError(). If it is
// not cudaSuccess, prints msg, the error string and the call site to stderr
// and terminates the process with exit(1).
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t last_err_ = cudaGetLastError(); \
        if (cudaSuccess != last_err_) { \
            fprintf(stderr, \
                    "Fatal error: %s (%s at %s:%d)\n" \
                    "*** FAILED - ABORTING\n", \
                    msg, cudaGetErrorString(last_err_), \
                    __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)
template<typename T>
// Unary predicate usable on host and device: true when the argument equals 1.
struct is_one : thrust::unary_function<T, bool>
{
    // const-qualified so the predicate can also be invoked through a const
    // object or reference; the call does not modify any state.
    __host__ __device__
    bool operator()(const T &x) const
    {
        return (x==1);
    }
};
// Demonstrates thrust::copy_if reading from mapped (zero-copy) host memory.
//
// Two mapped pinned buffers are allocated: v_locked holds 0..size-1 and
// stencil_locked holds i%2. Their device aliases are wrapped in
// thrust::device_ptr and the elements of v whose stencil value is 1 are copied
// into a device_vector. Up to the first 10 results are printed.
// CUDA errors terminate the process through cudaCheckErrors.
int main(){
    int size=1024;
    int* v_locked;
    int* v_device;
    int* stencil_locked;
    int* stencil_device;

    // Set the map-host device flag before making the mapped allocations.
    cudaSetDeviceFlags(cudaDeviceMapHost);
    cudaCheckErrors("cudaSetDeviceFlags");
    cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped);
    cudaCheckErrors("cudaHostAlloc 1");
    // The second argument is the host pointer itself, not its address.
    cudaHostGetDevicePointer(&v_device, v_locked, 0);
    cudaCheckErrors("cudaHostGetDevicePointer 1");
    cudaHostAlloc((void**)&stencil_locked, size*sizeof(int), cudaHostAllocMapped);
    cudaCheckErrors("cudaHostAlloc 2");
    cudaHostGetDevicePointer(&stencil_device, stencil_locked, 0);
    cudaCheckErrors("cudaHostGetDevicePointer 2");

    for (int i = 0; i < size; i++){
        v_locked[i] = i;
        stencil_locked[i] = i%2;
    }

    thrust::device_ptr<int> v_wrapper = thrust::device_pointer_cast(v_device);
    thrust::device_ptr<int> stencil_wrapper = thrust::device_pointer_cast(stencil_device);
    thrust::device_ptr<int> v_wrapper_end = v_wrapper + size;

    thrust::device_vector<int> result(size);
    // Qualified call: does not depend on argument-dependent lookup.
    thrust::device_vector<int>::iterator result_end =
        thrust::copy_if(v_wrapper, v_wrapper_end, stencil_wrapper, result.begin(), is_one<int>());
    int result_size = result_end - result.begin();

    thrust::host_vector<int> h_result(result_size);
    thrust::copy_n(result.begin(), result_size, h_result.begin());

    // Never print more elements than were actually produced.
    const int print_count = (result_size < 10) ? result_size : 10;
    thrust::copy_n(h_result.begin(), print_count, std::ostream_iterator<int>(std::cout, " "));
    std::cout << std::endl;

    // Release the pinned host allocations.
    cudaFreeHost(v_locked);
    cudaCheckErrors("cudaFreeHost 1");
    cudaFreeHost(stencil_locked);
    cudaCheckErrors("cudaFreeHost 2");
    return 0;
}
$ nvcc -arch=sm_20 -o t281 t281.cu
$ ./t281
1 3 5 7 9 11 13 15 17 19
$

How do we use cuPrintf()?

What do we have to do to use cuPrintf()? (device compute capability 1.2, Ubuntu 12) I couldn't find "cuPrintf.cu" and "cuPrintf.cuh", so I downloaded their code and included them:
#include "cuPrintf.cuh"
#include "cuPrintf.cu"
By the way this is the rest of the code:
// Kernel from the question: prints the thread index and the float argument f.
// NOTE(review): "%d" is paired with the float argument f; "%f" is presumably
// what was intended.
__global__ void hello_kernel (float f) {
printf ("Thread number %d. f = %d\n", threadIdx.x, f);
}
// Host entry point: launches hello_kernel on one block of 16 threads between
// the cuPrintf init / display / end calls (presumably provided by the included
// cuPrintf sources).
int main()
{
    const dim3 blocks(1);
    const dim3 threadsPerBlock(16);

    cudaPrintfInit();
    hello_kernel<<<blocks, threadsPerBlock>>>(1.2345f);
    cudaPrintfDisplay(stdout, true);
    cudaPrintfEnd();

    return 0;
}
But nvcc still reports an error:
max@max-Lenovo-G560:~/CUDA/matrixMult$ nvcc printfTest.cu -o printfTest
printfTest.cu(5): error: calling a __host__ function("printf") from a __global__
function("hello_kernel") is not allowed
Thanks!
In your kernel instead of this:
printf ("Thread number %d. f = %d\n", threadIdx.x, f);
you should do this:
cuPrintf ("Thread number %d. f = %d\n", threadIdx.x, f);
Other than that, I believe your code is correct (it works for me).
This SO question/answer gives more tips about using cuPrintf properly.
Include <stdio.h> and compile with -arch=sm_20.
Details:
code:
#include <stdio.h>
// Prints each thread's index and the float argument f with device-side printf
// (the accompanying text says to compile with -arch=sm_20).
__global__ void hello_kernel (float f) {
    // %u matches the unsigned threadIdx.x; %f matches f, which is promoted to
    // double when passed through the variadic argument list.
    printf ("Thread number %u. f = %f\n", threadIdx.x, f);
}
// Launches hello_kernel on one block of 16 threads and waits for it to finish.
// Returns 0 on success, 1 if the launch or the execution reports a CUDA error.
int main(){
    hello_kernel<<<1, 16>>>(1.2345f);

    // Launch-configuration errors are only visible through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        // Wait for the kernel to finish so its printf output is emitted
        // before the process exits.
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
compilations:
nvcc -arch=sm_20 -o printfTest printfTest.cu