Can't I call a __host__ __device__ function from a __device__ function? - cuda

In the CUDA documentation I found that cudaDeviceGetAttribute is a __host__ __device__ function. So I thought I could call it in my __global__ function to get some attributes of my device. Sadly it seems to mean something different, because I get a compile error even if I put it into a __device__ function and call that one from my global.
Is it possible to call cudaDeviceGetAttribute on my GPU? Or what else does __host__ __device__ mean?
Here is my source code:
__device__ void GetAttributes(int* unique)
{
    cudaDeviceAttr attr = cudaDevAttrMaxThreadsPerBlock;
    cudaDeviceGetAttribute(unique, attr, 0);
}

__global__ void ClockTest(int* a, int* b, long* return_time, int* unique)
{
    clock_t start = clock();

    //some complex calculations
    *a = *a + *b;
    *b = *a + *a;
    GetAttributes(unique);
    *a = *a + *b - *a;

    clock_t end = clock();
    *return_time = end - start;
}

int main()
{
    int a = 2;
    int b = 3;
    long time = 0;
    int uni;

    int* dev_a;
    int* dev_b;
    long* dev_time;
    int* unique;

    for (int i = 0; i < 10; ++i) {
        cudaMalloc(&dev_a, sizeof(int));
        cudaMalloc(&dev_b, sizeof(int));
        cudaMalloc(&dev_time, sizeof(long));
        cudaMalloc(&unique, sizeof(int));

        cudaMemcpy(dev_a, &a, sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(dev_b, &b, sizeof(int), cudaMemcpyHostToDevice);

        ClockTest<<<1,1>>>(dev_a, dev_b, dev_time, unique);

        cudaMemcpy(&a, dev_a, sizeof(int), cudaMemcpyDeviceToHost);
        cudaMemcpy(&time, dev_time, sizeof(long), cudaMemcpyDeviceToHost);
        cudaMemcpy(&uni, unique, sizeof(int), cudaMemcpyDeviceToHost);

        cudaFree(&dev_a);
        cudaFree(&dev_b);
        cudaFree(&dev_time);
        cudaFree(&unique);

        printf("%d\n", time);
        printf("unique: %d\n", uni);
        cudaDeviceReset();
    }
    return 0;
}

EDIT: sorry, my previous answer was not correct. There does seem to be a problem in nvcc (see below).
cudaDeviceGetAttribute can work correctly in device code. Here is a worked example on a K20X with CUDA 8.0.61:
$ cat t1305.cu
#include <stdio.h>
__global__ void tkernel(){

    int val;
    cudaError_t err = cudaDeviceGetAttribute(&val, cudaDevAttrMaxThreadsPerBlock, 0);
    printf("err = %d, %s\n", err, cudaGetErrorString(err));
    printf("val = %d\n", val);
}

int main(){

    tkernel<<<1,1>>>();
    cudaDeviceSynchronize();
}
$ nvcc -arch=sm_35 -o t1305 t1305.cu -rdc=true -lcudadevrt
$ cuda-memcheck ./t1305
========= CUDA-MEMCHECK
err = 0, no error
val = 1024
========= ERROR SUMMARY: 0 errors
$
There are various runtime API functions supported for use in device code.
For the supported runtime API functions, it's generally necessary to:
- compile for a cc 3.5 or higher device
- compile with relocatable device code
- link against the CUDA device runtime library
In addition, your code has another coding error: we do not pass the address of the pointer to cudaFree, just the pointer itself.
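For reference, the cleanup calls in the question's loop should therefore read:

cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_time);
cudaFree(unique);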
Caveats for this particular function:
There appears to be a problem in the CUDA compiler: if this device runtime API call is used without any other runtime API call in the kernel code, the code generation will not happen correctly. The workaround at this time is to make sure your kernel contains at least one other CUDA runtime API call. In my above example I used cudaGetErrorString, but you could e.g. use cudaDeviceSynchronize() or anything else, I think. I have filed an internal NVIDIA bug to report this issue.
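To illustrate the workaround, here is a minimal sketch (I'm assuming any other runtime API call works equally well in place of cudaDeviceSynchronize()):

__global__ void tkernel(){
    int val;
    cudaDeviceGetAttribute(&val, cudaDevAttrMaxThreadsPerBlock, 0);
    // any additional runtime API call in the kernel avoids the code generation problem
    cudaDeviceSynchronize();
}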
There appears to be a documentation error in the list of device runtime API calls supported in the CDP section of the programming guide (link above). The function cudaGetDeviceProperty does not exist; I believe it should refer to cudaDeviceGetAttribute. I have filed an internal NVIDIA bug for this documentation error.

Related

Can we get cuda kernel function name in cudaLaunchKernel?

I was trying to insert some code into cudaLaunchKernel and need to store the name of the kernel being launched, but I cannot find a direct API that can help me get the kernel function name. I have considered CUPTI, but it uses a callback function to deliver the information, so I cannot change the behavior of the kernel launch (or would need heavy inter-process communication, which is ugly).
Is there any way I can get the function name in cudaLaunchKernel (maybe from the function pointer)?
An example is as follows.
cudaLaunchKernel(...) {
    kernel_name = getKernelNameBySomeMethods(); // it's what I want..
    send_to_other_processes(kernel_name);
    return ::cudaLaunchKernel(...);
}
// for the other process
receive_kernel_name_from_other_process;
store_it;
Edit: An identifier is also OK. I may send the ID to another process to store, so I need to distinguish different CUDA kernels.
There are no APIs to do this, either public or private, AFAIK. The compiler emits a lot of static host-side boilerplate to perform the runtime API magic we take for granted; it isn't done by the runtime library itself.
However, the nature of that boilerplate means you can build your own lookup table pretty easily -- some hacking over a lunch break got me this partial proof of concept, which does what I think you want:
#include <cstdio>
#include <map>
#include <string>
#include <iostream>

__global__ void kernel_1(float *in, float *out, int N)
{
    int tidx = threadIdx.x + blockDim.x * blockIdx.x;
    if (tidx == 0) printf("Running kernel_1\n");
    if (tidx < N) out[tidx] = in[tidx];
}

__global__ void kernel_2(float *in, float *out, int N)
{
    int tidx = threadIdx.x + blockDim.x * blockIdx.x;
    if (tidx == 0) printf("Running kernel_2\n");
    if (tidx < N) out[tidx] = 2.f * in[tidx];
}

__global__ void kernel_3(float *in, float *out, int N)
{
    int tidx = threadIdx.x + blockDim.x * blockIdx.x;
    if (tidx == 0) printf("Running kernel_3\n");
    if (tidx < N) out[tidx] = 3.f * in[tidx];
}

void notakernel(float *in, float *out, int N)
{
    printf("Something bad happened\n");
}

std::map<void*, std::string> ktable = {
    { (void*)kernel_1, "kernel_1" },
    { (void*)kernel_2, "kernel_2" },
    { (void*)kernel_3, "kernel_3" } };

cudaError_t MyLaunchKernel(void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream)
{
    auto it = ktable.find(func);
    if (it != ktable.end()) {
        std::cout << "Received request to call " << it->second << std::endl;
    } else {
        std::cout << "Received request to call unknown function!" << std::endl;
    }
    return cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream);
}

int main()
{
    int N = 100;
    float* a; cudaMalloc<float>(&a, N * sizeof(float));
    float* b; cudaMalloc<float>(&b, N * sizeof(float));
    void* args[] = { (void*)&a, (void*)&b, (void*)&N };

    MyLaunchKernel((void*)kernel_1, dim3(1), dim3(1), args, 0, NULL);
    cudaDeviceSynchronize();

    MyLaunchKernel((void*)kernel_2, dim3(1), dim3(1), args, 0, NULL);
    cudaDeviceSynchronize();

    MyLaunchKernel((void*)kernel_3, dim3(1), dim3(1), args, 0, NULL);
    cudaDeviceSynchronize();

    MyLaunchKernel((void*)notakernel, dim3(1), dim3(1), args, 0, NULL);
    cudaDeviceSynchronize();

    return 0;
}
which appears to work:
$ nvcc -std=c++11 -arch=sm_52 -o lookup lookup.cu
$ cuda-memcheck ./lookup
========= CUDA-MEMCHECK
Received request to call kernel_1
Running kernel_1
Received request to call kernel_2
Running kernel_2
Received request to call kernel_3
Running kernel_3
Received request to call unknown function!
========= Program hit cudaErrorInvalidDeviceFunction (error 98) due to "invalid device function" on CUDA API call to cudaLaunchKernel.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3b9803]
========= Host Frame:./lookup [0x4ca95]
========= Host Frame:./lookup [0x746c]
========= Host Frame:./lookup [0x769f]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:./lookup [0x722a]
=========
========= ERROR SUMMARY: 1 error
Obviously things would need to be a bit more complex in a complete implementation for your use case -- you would need a reverse lookup so that another caller can go from a name/ID back to a pointer (see the sketch below), and if you have multiple source files compiled separately, you would need to concatenate the per-file lists when constructing the working table at runtime. But it is important to remember that the function pointers you are passing are actually host pointers, not device pointers (thanks to the runtime API magic), so the cost and complexity of the runtime setup is trivial when you can use pre-baked C++ standard library containers, algorithms, and function adapters to do most of the heavy lifting.
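As a sketch of that reverse direction (assuming the ktable above; nothing here is a CUDA API, just standard containers):

// Hypothetical inverse of ktable: maps a kernel name/ID received from
// another process back to the host-side function pointer.
std::map<std::string, void*> make_reverse_table(const std::map<void*, std::string>& fwd)
{
    std::map<std::string, void*> rev;
    for (const auto& kv : fwd) rev[kv.second] = kv.first;
    return rev;
}

// usage: void* func = make_reverse_table(ktable).at("kernel_2");
//        MyLaunchKernel(func, dim3(1), dim3(1), args, 0, NULL);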

Unified memory and struct with arrays

I have a big struct of arrays of structs in CUDA that is constant and read-only for my application. A quite simplified example would be
struct Node{
    int* pos;
    int nPos;
};

struct Graph{
    Node * nodes;
    int nNode;
};
My kernels need to navigate this graph and query it. As you know, copying this struct to GPU memory with cudaMalloc and cudaMemcpy is just a lot of code, which unified memory is supposed to remove the need for.
In my code, I generated the graph on the CPU and then, for testing, I designed the following kernel
__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nNode;
};
being called as:
// using malloc for testing to make sure I know what I am doing
int * d_res,* h_res;
cudaMalloc((void **)&d_res,sizeof(int));
h_res=(int*)malloc(sizeof(int));
testKernel<<<1,1>>>(graph,d_res);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk(cudaMemcpy(h_res,d_res,sizeof(int),cudaMemcpyDeviceToHost));
with the error checks from here.
When I use testKernel as shown, it works fine, but if I change the kernel to:
__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nodes[0].nPos;
};
I get illegal memory access errors.
Is this because the unified memory does not handle this type of data correctly?
Is there a way to make sure I can avoid writing all the explicit copies to GPU memory?
Full MCVE:
#include <algorithm>
#include <stdio.h>
#include <cuda_runtime_api.h>
#include <cuda.h>

typedef struct node{
    int* pos;
    int nPos;
}Node;

typedef struct Graph{
    Node * nodes;
    int nNode;
}Graph;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nNode;
    // d_res[0] = graph.nodes[0].nPos; // Not working
};

int main(void){
    // fake data, this comes from another process
    Graph graph;
    graph.nodes = (Node*)malloc(2 * sizeof(Node));
    graph.nNode = 2;
    for (int i = 0; i < 2; i++){
        // They can have different sizes in the original code
        graph.nodes[i].pos = (int*)malloc(3 * sizeof(int));
        graph.nodes[i].pos[0] = 0;
        graph.nodes[i].pos[1] = 1;
        graph.nodes[i].pos[2] = 2;
        graph.nodes[i].nPos = 3;
    }
    printf("%d\n", graph.nNode); // Change to the kernel variable for comparison

    int * d_res, *h_res;
    cudaMalloc((void **)&d_res, sizeof(int));
    h_res = (int*)malloc(sizeof(int));
    testKernel<<<1, 1>>>(graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost));
    printf("%d", h_res[0]);
    return 0;
}
Your code isn't using CUDA unified memory. UM is not "automatic" in any way. It requires specific programming steps to take advantage of it and it has specific system requirements.
All of this is covered in the UM section of the programming guide.
Is there a way to make sure I can avoid writing all the explicit copies to GPU memory?
Proper use of UM should allow this. Here is a fully worked example. The only thing I have done is mechanically convert your malloc operations in host code to equivalent cudaMallocManaged operations.
$ cat t1389.cu
#include <algorithm>
#include <stdio.h>

typedef struct node{
    int* pos;
    int nPos;
}Node;

typedef struct Graph{
    Node * nodes;
    int nNode;
}Graph;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void testKernel(const Graph graph, int * d_res){
    d_res[0] = graph.nNode;
    d_res[0] = graph.nodes[0].nPos; // works now that the allocations are managed
};

int main(void){
    // fake data, this comes from another process
    Graph graph;
    cudaMallocManaged(&(graph.nodes), 2*sizeof(Node));
    graph.nNode = 2;
    for (int i = 0; i < 2; i++){
        // They can have different sizes in the original code
        cudaMallocManaged(&(graph.nodes[i].pos), 3 * sizeof(int));
        graph.nodes[i].pos[0] = 0;
        graph.nodes[i].pos[1] = 1;
        graph.nodes[i].pos[2] = 2;
        graph.nodes[i].nPos = 3;
    }
    printf("%d\n", graph.nNode); // Change to the kernel variable for comparison

    int * d_res, *h_res;
    cudaMalloc((void **)&d_res, sizeof(int));
    h_res = (int*)malloc(sizeof(int));
    testKernel<<<1, 1>>>(graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost));
    printf("%d", h_res[0]);
    return 0;
}
$ nvcc t1389.cu -o t1389
$ cuda-memcheck ./t1389
========= CUDA-MEMCHECK
2
3========= ERROR SUMMARY: 0 errors
$
UM has a number of system requirements that are documented. I'm not going to try to recite them all here. Primarily, you need a cc 3.0 or higher GPU. Your MCVE did not include standard error checking on every API call, and I didn't try to add it. But if you still have problems with this code, be sure to use proper CUDA error checking and run it with cuda-memcheck.
If your entire data structure, including embedded pointers, is allocated using ordinary host allocators, and you have no control over that, then you won't be able to use it directly in a UM regime without doing some sort of involved copying. The exception here would be an IBM Power9 system, as mentioned in section K.1.6 of the programming guide section linked above.
Before attempting to use a host allocator (e.g. malloc) with UM, you should first test the pageableMemoryAccessUsesHostPageTables property, as mentioned in that section.
That property currently won't be set on any system except a properly configured IBM Power9 system. No x86 system currently has this property set/available.
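For reference, a minimal sketch of that check via the runtime attribute API (cudaDevAttrPageableMemoryAccessUsesHostPageTables is available from CUDA 9.2 onwards):

#include <stdio.h>

int main(){
    int v = 0;
    // 1 only where pageable host memory is accessible through the host's
    // page tables (e.g. a properly configured Power9 system)
    cudaDeviceGetAttribute(&v, cudaDevAttrPageableMemoryAccessUsesHostPageTables, 0);
    printf("pageableMemoryAccessUsesHostPageTables: %d\n", v);
    return 0;
}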

CUDA: How to add a device function from host to an array declared in Device? [duplicate]

I would like to create a list of function pointers dynamically on the CPU (with some sort of push_back() method called from main()) and copy it to a GPU __constant__ or __device__ array, without needing to resort to static __device__ function pointers. I believe this question is related to my problem; however, my goal is to create the __host__ function pointer array iteratively and then copy it to the __constant__ function pointer array instead of initialising the latter on declaration.
A working code example with static function pointers (as seen here or here) would be:
common.h:
#ifndef COMMON_H
#define COMMON_H

#include <stdio.h>
#include <iostream>

#define num_functions 3

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// fptr_t: Pointer to void function that takes two integer lvalues
typedef void (*fptr_t)(int&, int&);

// some examples of void(int&, int&) functions...
__device__ void Add(int &a, int &b) {printf("Add... %i + %i = %i\n", a, b, a+b);}
__device__ void Subtract(int &a, int &b) {printf("Subtract... %i - %i = %i\n", a, b, a-b);}
__device__ void Multiply(int &a, int &b) {printf("Multiply... %i * %i = %i\n", a, b, a*b);}

// List of function pointers in device memory
__constant__ fptr_t constant_fList[num_functions];

// Kernel called from main(): choose the function to apply whose index is equal to thread ID
__global__ void kernel(int a, int b) {
    fptr_t f;
    if (threadIdx.x < num_functions) {
        f = constant_fList[threadIdx.x];
        f(a,b);
    }
}

#endif
main.cu:
#include "common.h"
// Static device function pointers
__device__ fptr_t p_Add = Add;
__device__ fptr_t p_Sub = Subtract;
__device__ fptr_t p_Mul = Multiply;

// Load function list to constant memory
void loadList_staticpointers() {
    fptr_t h_fList[num_functions];
    gpuErrchk( cudaMemcpyFromSymbol(&h_fList[0], p_Add, sizeof(fptr_t)) );
    gpuErrchk( cudaMemcpyFromSymbol(&h_fList[1], p_Sub, sizeof(fptr_t)) );
    gpuErrchk( cudaMemcpyFromSymbol(&h_fList[2], p_Mul, sizeof(fptr_t)) );
    gpuErrchk( cudaMemcpyToSymbol(constant_fList, h_fList, num_functions * sizeof(fptr_t)) );
}

int main() {
    loadList_staticpointers();
    int a = 12, b = 15;
    kernel<<<1,3>>>(a, b);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());
    return 0;
}
Specs: GeForce GTX 670, compiled for -arch=sm_30, CUDA 6.5, Ubuntu 14.04
I wish to avoid the use of static device function pointers, as appending each function would require code maintenance on the user side - declaration of a new static pointer like p_Add or p_Mul, manipulation of void loadList_functionpointers(), etc. To make it clear, I am trying something like the following (crashing) code:
main_wrong.cu:
#include "common.h"
#include <vector>
// Global variable: list of function pointers in host memory
std::vector<fptr_t> vec_fList;

// Add function to functions list
void addFunc(fptr_t f) {vec_fList.push_back(f);}

// Upload the functions in the std::vector<fptr_t> to GPU memory
// Copies CPU-side pointers to constant_fList, therefore crashes on kernel call
void UploadVector() {
    fptr_t* h_vpointer = vec_fList.data();
    gpuErrchk( cudaMemcpyToSymbol(constant_fList, h_vpointer, vec_fList.size() * sizeof(fptr_t)) );
}

int main() {
    addFunc(Add);
    addFunc(Subtract);
    addFunc(Multiply);
    int a = 12, b = 15;
    UploadVector();

    kernel<<<1,3>>>(a, b); // Wrong to call a host-side function pointer from a kernel
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());
    return 0;
}
My understanding is that function pointers pointing to host addresses are copied to the GPU and are unusable by the kernel, which needs pointers to GPU addresses when the function f(a,b) is called. Populating a host-side array with device-side pointers works for me with raw data (see this question) but not with function pointers. Trivial attempts with Unified Memory have failed as well... So far, I have only found static device-side pointers to work. Is there no other way to copy a dynamically created CPU array of function pointers onto the GPU?
If you can use C++11 (supported since CUDA 7), you could use the following to auto-generate the function table:
template <fptr_t... Functions>
__global__ void kernel(int a, int b)
{
    constexpr auto num_f = sizeof...(Functions);
    constexpr fptr_t table[] = { Functions... };
    if (threadIdx.x < num_f)
    {
        fptr_t f = table[threadIdx.x];
        f(a,b);
    }
}
You would then call this kernel using
kernel<Add, Subtract, Multiply><<<1,3>>>(a, b);
Inspired by m.s.'s answer, I chose to pass the function pointer as a template parameter - this was in fact the key to solving my problem - and discovered that filling a __device__ array of function pointers dev_fList from the main() function iteratively, without the help of static function pointers, is indeed possible; C++11 compatibility is not even needed!
Here is a working example on a __device__ array in global memory. I have not tried its constant memory counterpart yet, but once a global memory array has been satisfactorily created, my guess is that a cudaMemcpyToSymbol(..., cudaMemcpyDeviceToDevice) should do the trick.
A kernel kernel() creates a GPU address for the function pointer dev_f and copies the function f that was passed as a template argument. Since this is an iterative process from the CPU, only one thread (thread 0) is involved in this kernel, which is launched with configuration <<<1,1>>>. The static variable count_f takes care of indexing into dev_fList.
common.h:
#ifndef COMMON_H
#define COMMON_H

#include <stdio.h>
#include <iostream>

#define num_functions 3

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// fptr_t: Pointer to void function that takes two integer lvalues
typedef void (*fptr_t)(int&, int&);

// some examples of void(int&, int&) functions...
__device__ void Add(int &a, int &b) {printf("Add... %i + %i = %i\n", a, b, a+b);}
__device__ void Subtract(int &a, int &b) {printf("Subtract... %i - %i = %i\n", a, b, a-b);}
__device__ void Multiply(int &a, int &b) {printf("Multiply... %i * %i = %i\n", a, b, a*b);}

// List of function pointers in device memory
// Note that, in my example, it resides in global memory space, not constant memory
__device__ fptr_t dev_fList[num_functions];

#endif
main.cu:
#include "common.h"
// Index in dev_fList[] == number of times addFunc<>() was launched
static int count_f = 0;

// Kernel that copies function f to the GPU
template<fptr_t f>
__global__ void kernel(int a, int b, int idx) {
    fptr_t dev_f = f;       // Create device function pointer
    dev_fList[idx] = dev_f; // Populate the GPU array of function pointers
    dev_fList[idx](a,b);    // Make sure that the array was populated correctly
}

// Add function to functions list
template<fptr_t f>
void addFunc(const int &a, const int &b) {
    if (count_f >= num_functions) {
        std::cout << "Error: not enough memory statically allocated on device!\n";
        exit(EXIT_FAILURE);
    }
    kernel<f><<<1,1>>>(a,b,count_f);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());
    count_f++;
}

int main() {
    int a = 12, b = 15;
    addFunc<Add>(a,b);
    addFunc<Subtract>(a,b);
    addFunc<Multiply>(a,b);
    return 0;
}
Edit: Added copy of the array of function pointers to constant memory
For what it's worth, here is how to copy our dev_fList array to constant memory:
In common.h:
__constant__ fptr_t cst_fList[num_functions];

__global__ void cst_test(int a, int b, int idx) {
    if (threadIdx.x < idx) cst_fList[threadIdx.x](a,b);
}
In main.cu's main() function, after all desired functions have been added:
fptr_t temp[num_functions];
gpuErrchk( cudaMemcpyFromSymbol(temp, dev_fList, count_f * sizeof(fptr_t)) );
gpuErrchk( cudaMemcpyToSymbol(cst_fList, temp, count_f * sizeof(fptr_t)) );

cst_test<<<1,count_f>>>(a, b, count_f);
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaDeviceSynchronize());
It may look ugly, as memory is transferred to the host via temp and then back to the device; more elegant suggestions are welcome.
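One possibility that avoids the host round trip (a sketch I have not tested: cudaGetSymbolAddress retrieves the global-memory address of dev_fList, so the copy can stay on the device):

void* src = nullptr;
gpuErrchk( cudaGetSymbolAddress(&src, dev_fList) );
gpuErrchk( cudaMemcpyToSymbol(cst_fList, src, count_f * sizeof(fptr_t), 0, cudaMemcpyDeviceToDevice) );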
It is impossible to use dynamically created CUDA device function pointers (at least not without a crash or UB). The template-based solutions work at compile time, not dynamically. The CUDA device function pointer approaches you see everywhere need device symbols in global space. This means that for every function, a device function pointer must already be declared. This also means you cannot use normal C function pointers as the reference, e.g. ones set at runtime. In summary, using CUDA device function pointers is questionable. Template-based approaches look user-friendly, but are by definition not dynamic.
Example showing structure with function pointers:
This example shows a structure holding some function pointers. In normal C++ code, you can set and change the function pointers while the program is running (dynamically). With CUDA, the example below is impossible, because the function pointers in the struct are not valid device symbols. This means they cannot be used with cudaMemcpyFromSymbol. To circumvent this, either the original function (the target of the function pointer) or a global CUDA device function pointer must be used. Neither is dynamic.
This is dynamic assignment:
typedef float (*pDistanceFu) (float, float);
typedef float (*pDecayFu) (float, float, float);

// In C++ you can set and reset the function pointer during run time whenever you want ..
struct DistFunction {
    /*__host__ __device__*/ pDistanceFu distance; // uncomment for NVCC ..
    /*__host__ __device__*/ pDecayFu rad_decay;
    /*__host__ __device__*/ pDecayFu lrate_decay;
};

// you can do what you want ..
DistFunction foo, bar;
foo.distance = bar.distance;
// ..
This is how it should be with CUDA, but it will fail, because there is no valid device symbol :(
pDistanceFu hDistance;
pDecayFu hRadDay;
pDecayFu hLRateDecay;

void DeviceAssign(DistFunction &dist) {
    cudaMemcpyFromSymbol(&hDistance, dist.distance, sizeof(pDistanceFu) );
    cudaMemcpyFromSymbol(&hRadDay, dist.rad_decay, sizeof(pDecayFu) );
    cudaMemcpyFromSymbol(&hLRateDecay, dist.lrate_decay, sizeof(pDecayFu) );

    dist.distance = hDistance;
    dist.rad_decay = hRadDay;
    dist.lrate_decay = hLRateDecay;
}
Here is the classical way, but you will notice it is not dynamic anymore, because the device symbol must refer to the function reference, not to a pointer which may change during run time.
// .. and this would work
#ifdef __CUDACC__
__host__ __device__
#endif
inline float fcn_rad_decay (float sigma0, float T, float lambda) {
return std::floor(sigma0*exp(-T/lambda) + 0.5f);
}
__device__ pDistanceFu pFoo= fcn_rad_decay; // pointer must target a reference, no host pointer possible
void DeviceAssign2(DistFunction &dist) {
cudaMemcpyFromSymbol(&hLRateDecay, &fcn_rad_decay, sizeof(pDecayFu) );
// the same:
// cudaMemcpyFromSymbol(&hLRateDecay, pFoo, sizeof(pDecayFu) );
// ..
dist.lrate_decay = hLRateDecay;
// ..
}

CUDA shared object between threads

I am totally new to CUDA. I want to create one object on the device and access its members from different threads. I use nvcc -arch=sm_20 (on a Tesla M2090), and when I run my code I get an 'unspecified launch failure'. Here is my code:
#include <stdio.h>
#include <string>
using namespace std;

#ifdef __CUDACC__
#define CUDA_CALLABLE __host__ __device__
#else
#define CUDA_CALLABLE
#endif

class SimpleClass {
public:
    int i;
    CUDA_CALLABLE SimpleClass(){i=1;};
    CUDA_CALLABLE ~SimpleClass(){};
};

__global__ void initkernel(SimpleClass *a){
    a = new SimpleClass();
}

__global__ void delkernel(SimpleClass *a){
    delete a;
}

__global__ void kernel(SimpleClass *a){
    printf("%d\n", a->i);
}

int main() {
    SimpleClass *a;
    initkernel<<<1,1>>>(a);
    cudaThreadSynchronize();

    kernel<<<1,10>>>(a);
    cudaThreadSynchronize();

    delkernel<<<1,1>>>(a);
    cudaThreadSynchronize();

    cudaError_t error = cudaGetLastError();
    string lastError = cudaGetErrorString(error);
    printf("%s\n", lastError.c_str());
    return 0;
}
You get the 'unspecified launch failure' during your first kernel launch because 'a' is a pointer stored on the host, but you want to give it a value from a device function. If you want to allocate the object on the device, then you first have to allocate a pointer on the device, and then you can read and write it from device (kernel) code, but be careful because it requires double indirection.
Your code should look something like this (the rest of the functions should be modified similarly; see the sketch below):
__global__ void initkernel(SimpleClass** a){
    *a = new SimpleClass();
}

int main() {
    SimpleClass** a;
    cudaMalloc((void**)&a, sizeof(SimpleClass*));
    initkernel<<<1,1>>>(a);
    cudaThreadSynchronize();
}
PS: pQB is absolutely right that you should do an error check after each kernel launch to detect errors as soon as possible (and, for now, to find the exact location of the error in your code).
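A minimal version of that pattern (one common way to write it; the macro name is arbitrary):

#define CUDA_CHECK(call) do { \
    cudaError_t e = (call); \
    if (e != cudaSuccess) \
        printf("CUDA error '%s' at %s:%d\n", cudaGetErrorString(e), __FILE__, __LINE__); \
} while (0)

// after each launch:
// kernel<<<1,10>>>(a);
// CUDA_CHECK(cudaGetLastError());      // catches launch/configuration errors
// CUDA_CHECK(cudaDeviceSynchronize()); // catches errors raised inside the kernel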