Assignment of function pointer with unified memory in CUDA

I am trying to implement dynamic binding of functions in CUDA under the convenient unified memory model. Here we have a struct Parameters containing a single member, a function pointer void (*p_func)().
#include <cstdio>

struct Parameters {
    void (*p_func)();
};
The struct is allocated in unified memory, and we assign the actual function func_A to p_func.
__host__ __device__
void func_A() {
    printf("func_A is correctly invoked!\n");
    return;
}
The problem arises when we run the following code: if assignment 1 runs, i.e. para->p_func = func_A on the host, the pointer ends up holding the host function address for both host and device calls. In contrast, if assignment 2 runs, it holds the device address in both cases.
__global__ void assign_func_pointer(Parameters* para) {
    para->p_func = func_A;
}

__global__ void run_on_device(Parameters* para) {
    printf("run on device with address %p\n", para->p_func);
    para->p_func();
}

void run_on_host(Parameters* para) {
    printf("run on host with address %p\n", para->p_func);
    para->p_func();
}
int main(int argc, char* argv[]) {
    Parameters* para;
    cudaMallocManaged(&para, sizeof(Parameters));

    // assignment 1: if only this section runs, p_func points to the host address
    para->p_func = func_A;
    printf("addr#host: %p\n", para->p_func);

    // assignment 2: if only this section runs, p_func points to the device address
    assign_func_pointer<<<1,1>>>(para);
    cudaDeviceSynchronize();
    printf("addr#device: %p\n", para->p_func);

    run_on_device<<<1,1>>>(para);
    cudaDeviceSynchronize();
    run_on_host(para);
    cudaFree(para);
    return 0;
}
The question now is: under the unified memory model, is it possible for the function pointer to hold the correct function address on the device and on the host, respectively?

Leaving aside the technicalities of unified memory for a moment, your question is effectively "can one variable simultaneously hold two different values?", and the answer to that is obviously no.
In more detail: CUDA unified memory fundamentally ensures that a given managed allocation will have consistent values (under certain constraints) when accessed from both host and device. What you are asking for is the complete opposite of that, and it obviously isn't supported.
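To see that consistency guarantee in action, here is a minimal sketch of my own (not from the answer): an int in managed memory reads back the same value from host and device.

#include <cstdio>

__global__ void bump(int *v) { *v += 1; }

int main() {
    int *v;
    cudaMallocManaged(&v, sizeof(int));
    *v = 41;            // host write
    bump<<<1,1>>>(v);   // device write to the same address
    cudaDeviceSynchronize();
    printf("%d\n", *v); // host read: prints 42
    cudaFree(v);
    return 0;
}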

With some modifications to the struct definition, something like this may be possible:
$ cat t1288.cu
#include <cstdio>

struct Parameters {
    void (*p_hfunc)();
    void (*p_dfunc)();
    __host__ __device__
    void p_func() {
#ifdef __CUDA_ARCH__
        (*p_dfunc)();
#else
        (*p_hfunc)();
#endif
    }
};

__host__ __device__
void func_A() {
    printf("func_A is correctly invoked!\n");
    return;
}

__global__ void assign_func_pointer(Parameters* para) {
    para->p_dfunc = func_A;
}

__global__ void run_on_device(Parameters* para) {
    printf("run on device\n"); // with address %p\n", para->p_dfunc);
    para->p_func();
}

void run_on_host(Parameters* para) {
    printf("run on host\n"); // with address %p\n", para->p_func);
    para->p_func();
}

int main(int argc, char* argv[]) {
    Parameters* para;
    cudaMallocManaged(&para, sizeof(Parameters));

    // assignment 1: host-side function pointer
    para->p_hfunc = func_A;
    printf("addr#host: %p\n", para->p_hfunc);

    // assignment 2: device-side function pointer
    assign_func_pointer<<<1,1>>>(para);
    cudaDeviceSynchronize();
    printf("addr#device: %p\n", para->p_dfunc);

    run_on_device<<<1,1>>>(para);
    cudaDeviceSynchronize();
    run_on_host(para);
    cudaFree(para);
    return 0;
}
$ nvcc -arch=sm_35 -o t1288 t1288.cu
$ cuda-memcheck ./t1288
========= CUDA-MEMCHECK
addr#host: 0x402add
addr#device: 0x8
run on device
func_A is correctly invoked!
run on host
func_A is correctly invoked!
========= ERROR SUMMARY: 0 errors
$
I concur with the other answer that it is currently not possible, even with managed memory, to have a single numerical function pointer that works correctly in both host code and device code.
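If a single consistent value is required anyway, one workaround sketch of my own (not proposed in the answers above) is to store a function index rather than an address; the managed value is then genuinely identical on both sides, and each side resolves it to its local function address.

enum FuncId { FUNC_A }; // hypothetical index type

__host__ __device__
void call_func(FuncId id) {
    // The switch maps the index to the correct address separately
    // in the host and device compilation passes.
    switch (id) {
        case FUNC_A: func_A(); break;
    }
}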

Related

CUDA: How to add a device function from host to an array declared in Device? [duplicate]

I would like to create a list of function pointers dynamically on the CPU (with some sort of push_back() method called from main()) and copy it to a GPU __constant__ or __device__ array, without needing to resort to static __device__ function pointers. I believe this question is related to my problem; however, my goal is to create the __host__ function pointer array iteratively and then copy it to the __constant__ function pointer array instead of initialising the latter on declaration.
A working code example with static function pointers (as seen here or here) would be:
common.h:
#ifndef COMMON_H
#define COMMON_H

#include <stdio.h>
#include <iostream>

#define num_functions 3

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// fptr_t: pointer to a void function that takes two integer lvalues
typedef void (*fptr_t)(int&, int&);

// some examples of void(int&, int&) functions...
__device__ void Add(int &a, int &b)      {printf("Add... %i + %i = %i\n", a, b, a+b);}
__device__ void Subtract(int &a, int &b) {printf("Subtract... %i - %i = %i\n", a, b, a-b);}
__device__ void Multiply(int &a, int &b) {printf("Multiply... %i * %i = %i\n", a, b, a*b);}

// List of function pointers in device memory
__constant__ fptr_t constant_fList[num_functions];

// Kernel called from main(): apply the function whose index equals the thread ID
__global__ void kernel(int a, int b) {
    fptr_t f;
    if (threadIdx.x < num_functions) {
        f = constant_fList[threadIdx.x];
        f(a,b);
    }
}

#endif
main.cu:
#include "common.h"
// Static device function pointers
__device__ fptr_t p_Add = Add;
__device__ fptr_t p_Sub = Subtract;
__device__ fptr_t p_Mul = Multiply;
// Load function list to constant memory
void loadList_staticpointers() {
fptr_t h_fList[num_functions];
gpuErrchk( cudaMemcpyFromSymbol(&h_fList[0], p_Add, sizeof(fptr_t)) );
gpuErrchk( cudaMemcpyFromSymbol(&h_fList[1], p_Sub, sizeof(fptr_t)) );
gpuErrchk( cudaMemcpyFromSymbol(&h_fList[2], p_Mul, sizeof(fptr_t)) );
gpuErrchk( cudaMemcpyToSymbol(constant_fList, h_fList, num_functions * sizeof(fptr_t)) );
}
int main() {
loadList_staticpointers();
int a = 12, b = 15;
kernel<<<1,3>>>(a, b);
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaDeviceSynchronize());
return 0;
}
Specs: GeForce GTX 670, compiled for -arch=sm_30, CUDA 6.5, Ubuntu 14.04
I wish to avoid the use of static device function pointers, as appending each function would require code maintenance on the user side: declaration of a new static pointer like p_Add or p_Mul, modification of loadList_staticpointers(), etc. To make it clear, I am trying something like the following (crashing) code:
main_wrong.cu:
#include "common.h"
#include <vector>
// Global variable: list of function pointers in host memory
std::vector<fptr_t> vec_fList;
// Add function to functions list
void addFunc(fptr_t f) {vec_fList.push_back(f);}
// Upload the functions in the std::vector<fptr_t> to GPU memory
// Copies CPU-side pointers to constant_fList, therefore crashes on kernel call
void UploadVector() {
fptr_t* h_vpointer = vec_fList.data();
gpuErrchk( cudaMemcpyToSymbol(constant_fList, h_vpointer, vec_fList.size() * sizeof(fptr_t)) );
}
int main() {
addFunc(Add);
addFunc(Subtract);
addFunc(Multiply);
int a = 12, b = 15;
UploadVector();
kernel<<<1,3>>>(a, b); // Wrong to call a host-side function pointer from a kernel
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaDeviceSynchronize());
return 0;
}
My understanding is that function pointers pointing to host addresses are copied to the GPU and are unusable by the kernel, which needs pointers pointing to GPU addresses when the function f(a,b) is called. Populating a host-side array with device-side pointers would work for me with raw data (see this question) but not with function pointers. Trivial attempts with Unified Memory have failed as well... so far, I have only found static device-side pointers to work. Is there no other way to copy a dynamically created CPU array of function pointers onto the GPU?
If you can use C++11 (supported since CUDA 7), you could use the following to auto-generate the function table:
template <fptr_t... Functions>
__global__ void kernel(int a, int b)
{
    constexpr auto num_f = sizeof...(Functions);
    constexpr fptr_t table[] = { Functions... };
    if (threadIdx.x < num_f)
    {
        fptr_t f = table[threadIdx.x];
        f(a,b);
    }
}
You would then call this kernel using
kernel<Add, Subtract, Multiply><<<1,3>>>(a, b);
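For context, a minimal usage sketch (my own, assuming the fptr_t typedef and the __device__ functions Add, Subtract and Multiply from common.h above):

int main() {
    int a = 12, b = 15;
    // The function table is baked in at compile time from the template arguments.
    kernel<Add, Subtract, Multiply><<<1,3>>>(a, b);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());
    return 0;
}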
Inspired by m.s.'s answer, I chose to pass the function pointer as a template parameter (this was in fact the key to solving my problem) and discovered that filling a __device__ array of function pointers dev_fList iteratively from the main() function, without the help of static function pointers, is indeed possible; moreover, C++11 compatibility is not even needed!
Here is a working example using a __device__ array in global memory. I have not tried its constant memory counterpart yet, but once the global memory array has been satisfactorily created, my guess is that a cudaMemcpyToSymbol(..., cudaMemcpyDeviceToDevice) should do the trick.
A kernel kernel() creates a GPU address for the function pointer dev_f and copies the function f that was passed as a template argument. Since this is an iterative process from the CPU, only one thread (thread 0) is involved in each launch of this kernel, which uses configuration <<<1,1>>>. The static variable count_f takes care of indexing into dev_fList.
common.h:
#ifndef COMMON_H
#define COMMON_H

#include <stdio.h>
#include <iostream>

#define num_functions 3

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// fptr_t: pointer to a void function that takes two integer lvalues
typedef void (*fptr_t)(int&, int&);

// some examples of void(int&, int&) functions...
__device__ void Add(int &a, int &b)      {printf("Add... %i + %i = %i\n", a, b, a+b);}
__device__ void Subtract(int &a, int &b) {printf("Subtract... %i - %i = %i\n", a, b, a-b);}
__device__ void Multiply(int &a, int &b) {printf("Multiply... %i * %i = %i\n", a, b, a*b);}

// List of function pointers in device memory.
// Note that, in my example, it resides in global memory space, not constant memory.
__device__ fptr_t dev_fList[num_functions];

#endif
main.cu:
#include "common.h"
// Index in dev_fList[] == number of times addFunc<>() was launched
static int count_f = 0;
// Kernel that copies function f to the GPU
template<fptr_t f>
__global__ void kernel(int a, int b, int idx) {
fptr_t dev_f = f; // Create device function pointer
dev_fList[idx] = dev_f; // Populate the GPU array of function pointers
dev_fList[idx](a,b); // Make sure that the array was populated correctly
}
// Add function to functions list
template<fptr_t f>
void addFunc(const int &a, const int &b) {
if (count_f >= num_functions) {
std::cout << "Error: not enough memory statically allocated on device!\n";
exit(EXIT_FAILURE);
}
kernel<f><<<1,1>>>(a,b,count_f);
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaDeviceSynchronize());
count_f++;
}
int main() {
int a = 12, b = 15;
addFunc<Add>(a,b);
addFunc<Subtract>(a,b);
addFunc<Multiply>(a,b);
return 0;
}
Edit: Added copy of the array of function pointers to constant memory
For what it's worth, here is how to copy our dev_fList array to constant memory:
In common.h:
__constant__ fptr_t cst_fList[num_functions];

__global__ void cst_test(int a, int b, int idx) {
    if (threadIdx.x < idx) cst_fList[threadIdx.x](a,b);
}
In main.cu main() function, after all desired functions have been added:
fptr_t temp[num_functions];
gpuErrchk( cudaMemcpyFromSymbol(temp, dev_fList, count_f * sizeof(fptr_t)) );
gpuErrchk( cudaMemcpyToSymbol(cst_fList, temp, count_f * sizeof(fptr_t)) );
cst_test<<<1,count_f>>>(a, b, count_f);
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaDeviceSynchronize());
It may look ugly as I understand that memory is transferred to the host via temp and then back to the device; more elegant suggestions are welcome.
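One possibility (an untested sketch of my own): skip the host round trip entirely by taking the device address of dev_fList with cudaGetSymbolAddress and copying device-to-device:

void *d_src;
gpuErrchk( cudaGetSymbolAddress(&d_src, dev_fList) );
gpuErrchk( cudaMemcpyToSymbol(cst_fList, d_src, count_f * sizeof(fptr_t), 0,
                              cudaMemcpyDeviceToDevice) );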
It is impossible to use dynamically created CUDA device function pointers (at least not without a crash or UB). The template-based solutions work at compile time and are therefore not dynamic. The CUDA device function pointer approaches you see everywhere require device symbols in global scope: for every function, a device function pointer must already be declared. This also means you cannot use normal C function pointers as the source, e.g. ones that are set at runtime. In summary, using CUDA device function pointers is questionable. Template-based approaches look user-friendly, but are by definition not dynamic.
Example showing a structure with function pointers:
This example shows a structure holding some function pointers. In normal C++ code, you can set and change the function pointers while the program is running (dynamically). With CUDA, the example below is impossible, because the function pointers in the struct are not valid device symbols, which means they cannot be used with cudaMemcpyFromSymbol. To circumvent this, either the original function (the target of the function pointer) or a global CUDA device function pointer must be referenced. Neither is dynamic.
This is dynamic assignment:
typedef float (*pDistanceFu)(float, float);
typedef float (*pDecayFu)(float, float, float);

// In C++ you can set and reset the function pointers at run time whenever you want:
struct DistFunction {
    /*__host__ __device__*/ pDistanceFu distance; // uncomment for NVCC ..
    /*__host__ __device__*/ pDecayFu rad_decay;
    /*__host__ __device__*/ pDecayFu lrate_decay;
};

// you can do what you want ..
DistFunction foo, bar;
foo.distance = bar.distance;
// ..
This is how it would need to look with CUDA, but it will fail, because there is no valid device symbol :(
pDistanceFu hDistance;
pDecayFu hRadDay;
pDecayFu hLRateDecay;

void DeviceAssign(DistFunction &dist) {
    cudaMemcpyFromSymbol(&hDistance, dist.distance, sizeof(pDistanceFu));
    cudaMemcpyFromSymbol(&hRadDay, dist.rad_decay, sizeof(pDecayFu));
    cudaMemcpyFromSymbol(&hLRateDecay, dist.lrate_decay, sizeof(pDecayFu));

    dist.distance = hDistance;
    dist.rad_decay = hRadDay;
    dist.lrate_decay = hLRateDecay;
}
Here is the classical way, but notice that it is not dynamic anymore, because the device symbol must refer to the function itself, not to a pointer that may change during run time.
// .. and this would work
#ifdef __CUDACC__
__host__ __device__
#endif
inline float fcn_rad_decay(float sigma0, float T, float lambda) {
    return std::floor(sigma0*exp(-T/lambda) + 0.5f);
}

__device__ pDecayFu pFoo = fcn_rad_decay; // the pointer must target the function itself; no host pointer possible

void DeviceAssign2(DistFunction &dist) {
    cudaMemcpyFromSymbol(&hLRateDecay, &fcn_rad_decay, sizeof(pDecayFu));
    // the same:
    // cudaMemcpyFromSymbol(&hLRateDecay, pFoo, sizeof(pDecayFu));
    // ..
    dist.lrate_decay = hLRateDecay;
    // ..
}

How to implement device side CUDA virtual functions?

I see that CUDA doesn't allow for classes with virtual functions to be passed into kernel functions. Are there any work-arounds to this limitation?
I would really like to be able to use polymorphism within a kernel function.
Thanks!
The most important part of Robert Crovella's comment is:
The objects simply need to be created on the device.
So keeping that in mind, I was dealing with a situation where I had an abstract class Function and then some implementations of it encapsulating different functions and their evaluation. This is a simplified version of how I achieved polymorphism in my situation, but I am not saying it cannot be done better... It will hopefully help you to get the idea:
class Function
{
public:
    __device__ Function() {}
    __device__ virtual ~Function() {}
    __device__ virtual void Evaluate(const real* __restrict__ positions,
                                     real* fitnesses,
                                     const SIZE_TYPE particlesCount) const = 0;
};

class FunctionRsj : public Function
{
private:
    SIZE_TYPE m_DimensionsCount;
    SIZE_TYPE m_PointsCount;
    real* m_Y;
    real* m_X;

public:
    __device__ FunctionRsj(const SIZE_TYPE dimensionsCount,
                           const SIZE_TYPE pointsCount,
                           real* configFileData)
        : m_DimensionsCount(dimensionsCount),
          m_PointsCount(pointsCount),
          m_Y(configFileData),
          m_X(configFileData + pointsCount) {}

    __device__ ~FunctionRsj()
    {
        // m_Y points to the beginning of the config
        // file data; use it for destruction as this
        // object took ownership of configFileData.
        delete[] m_Y;
    }

    __device__ void Evaluate(const real* __restrict__ positions,
                             real* fitnesses,
                             const SIZE_TYPE particlesCount) const
    {
        // Implement evaluation of FunctionRsj here.
    }
};

__global__ void evaluate_fitnesses(
    const real* __restrict__ positions,
    real* fitnesses,
    Function const* const* __restrict__ function,
    const SIZE_TYPE particlesCount)
{
    // This whole kernel is just a proxy, as kernels
    // cannot be member functions.
    (*function)->Evaluate(positions, fitnesses, particlesCount);
}

__global__ void create_function(
    Function** function,
    SIZE_TYPE dimensionsCount,
    SIZE_TYPE pointsCount,
    real* configFileData)
{
    // It is necessary to create the object representing the function
    // directly in global memory of the GPU device for virtual
    // functions to work correctly, i.e. the virtual function table
    // HAS to be on the GPU as well.
    if (threadIdx.x == 0 && blockIdx.x == 0)
    {
        (*function) = new FunctionRsj(dimensionsCount, pointsCount, configFileData);
    }
}

__global__ void delete_function(Function** function)
{
    delete *function;
}

int main()
{
    // Let's just assume d_FunctionConfigData, d_Positions,
    // d_Fitnesses are arrays already allocated on the GPU ...

    // Create the function object.
    Function** d_Function;
    cudaMalloc(&d_Function, sizeof(Function*));
    create_function<<<1, 1>>>(d_Function, 10, 10, d_FunctionConfigData);

    // Evaluate using the proxy kernel.
    evaluate_fitnesses<<<
        m_Configuration.GetEvaluationGridSize(),
        m_Configuration.GetEvaluationBlockSize(),
        m_Configuration.GetEvaluationSharedMemorySize()>>>(
        d_Positions,
        d_Fitnesses,
        d_Function,
        m_Configuration.GetParticlesCount());

    // Delete the function object on the GPU.
    delete_function<<<1, 1>>>(d_Function);
}
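One detail the example leaves out is releasing the Function** cell itself; a small cleanup sketch (my addition, not part of the original answer), to go at the end of main():

    delete_function<<<1, 1>>>(d_Function);
    cudaDeviceSynchronize(); // make sure the device-side delete has finished
    cudaFree(d_Function);    // release the pointer-to-pointer storage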

Copy an object to device?

Can I copy a C++ object to the device?
Say I have:
class CudaClass
{
public:
    int* data;
    CudaClass(int x) {
        data = new int[1]; data[0] = x;
    }
};

__global__ void useClass(CudaClass cudaClass)
{
    printf("%d", cudaClass.data[0]);
}

int main()
{
    CudaClass c(1);
}
Now how do I copy "c" to device memory and launch kernel "useClass"?
Yes, you can copy an object to the device for use on the device. When the object has embedded pointers to dynamically allocated regions, the process requires some extra steps.
See my answer here for a discussion of what is involved. That answer also has a few sample code answers linked to it.
Also, in your class definition, if you want certain functions to be usable on the device, you should decorate those functions appropriately (i.e. probably with __host__ __device__).
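To make the decoration concrete, a small sketch of my own (not from the original answer), with a member function callable on both sides:

class CudaClass
{
public:
    int* data;
    CudaClass(int x) { data = new int[1]; data[0] = x; }
    __host__ __device__ int value() const { return data[0]; } // usable in kernels and host code
};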
EDIT: In response to a question (now deleted) here is the simplest sample code I could come up with based on the supplied code:
#include <stdio.h>

class CudaClass
{
public:
    int* data;
    CudaClass(int x) {
        data = new int[1]; data[0] = x;
    }
};

__global__ void useClass(CudaClass *cudaClass)
{
    printf("%d\n", cudaClass->data[0]);
}

int main()
{
    CudaClass c(1);

    // create class storage on device and copy top-level class
    CudaClass *d_c;
    cudaMalloc((void **)&d_c, sizeof(CudaClass));
    cudaMemcpy(d_c, &c, sizeof(CudaClass), cudaMemcpyHostToDevice);

    // make an allocated region on device for use by the pointer in the class
    int *hostdata;
    cudaMalloc((void **)&hostdata, sizeof(int));
    cudaMemcpy(hostdata, c.data, sizeof(int), cudaMemcpyHostToDevice);

    // copy the pointer to the allocated device storage into the device class
    cudaMemcpy(&(d_c->data), &hostdata, sizeof(int *), cudaMemcpyHostToDevice);

    useClass<<<1,1>>>(d_c);
    cudaDeviceSynchronize();
    return 0;
}
In the interest of brevity/clarity I have dispensed with the usual cuda error checking.
Responding to the question: you cannot allocate storage directly from the host using the pointer in the device-resident class. This is because cudaMalloc expects an ordinary pointer variable stored in host memory, such as what you get with:
int *hostdata;
cudaMalloc cannot work with a pointer whose storage is already on the device. This will not work:
cudaMalloc(&(d_c->data), sizeof(int));
because it requires dereferencing a device pointer (d_c) in host code, which is not allowed.
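An alternative sketch of my own (not from the original answer): perform the inner allocation from a kernel, where dereferencing d_c is legal; device-side new draws from the device heap.

__global__ void allocData(CudaClass *c, int x)
{
    // Legal here: we dereference the device pointer in device code.
    c->data = new int[1];
    c->data[0] = x;
}

// host side, replacing the hostdata staging steps:
//   allocData<<<1,1>>>(d_c, 1);
//   cudaDeviceSynchronize();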

CUDA shared object between threads

I am totally new to CUDA. I want to create one object on the device, and access its member from different threads. I use nvcc -arch=sm_20 (on Tesla M2090), and if I run my code I get an 'unspecified launch failure'. Here is my code:
#include <stdio.h>
#include <string>
using namespace std;

#ifdef __CUDACC__
#define CUDA_CALLABLE __host__ __device__
#else
#define CUDA_CALLABLE
#endif

class SimpleClass {
public:
    int i;
    CUDA_CALLABLE SimpleClass(){i=1;};
    CUDA_CALLABLE ~SimpleClass(){};
};

__global__ void initkernel(SimpleClass *a){
    a = new SimpleClass();
}

__global__ void delkernel(SimpleClass *a){
    delete a;
}

__global__ void kernel(SimpleClass *a){
    printf("%d\n", a->i);
}

int main() {
    SimpleClass *a;
    initkernel<<<1,1>>>(a);
    cudaThreadSynchronize();
    kernel<<<1,10>>>(a);
    cudaThreadSynchronize();
    delkernel<<<1,1>>>(a);
    cudaThreadSynchronize();
    cudaError_t error = cudaGetLastError();
    string lastError = cudaGetErrorString(error);
    printf("%s\n", lastError.c_str());
    return 0;
}
You get the 'unspecified launch failure' during your first kernel launch because 'a' is a pointer stored on the host, but you want to give it a value from a device function. If you want to allocate the object on the device, then you first have to allocate a pointer on the device, and then you can read and write it from device (kernel) code, but be careful because it will require double indirection.
Your code should look something like this (the rest of the functions should be modified similarly):
__global__ void initkernel(SimpleClass** a){
    *a = new SimpleClass();
}

int main() {
    SimpleClass** a;
    cudaMalloc((void**)&a, sizeof(SimpleClass*));
    initkernel<<<1,1>>>(a);
    cudaThreadSynchronize();
}
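The remaining kernels follow the same double-indirection pattern; here is a sketch of my own completing them (not from the original answer):

__global__ void kernel(SimpleClass** a){
    printf("%d\n", (*a)->i);
}

__global__ void delkernel(SimpleClass** a){
    delete *a;
}

// and in main(), after initkernel:
//   kernel<<<1,10>>>(a);
//   cudaThreadSynchronize();
//   delkernel<<<1,1>>>(a);
//   cudaThreadSynchronize();
//   cudaFree(a);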
PS: pQB is absolutely right that you should do an error check after each kernel launch, to detect errors as soon as possible (and, for now, to find the exact location of the error in your code).
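For reference, a minimal sketch of that per-launch error check (my own illustration):

kernel<<<1,10>>>(a);
cudaError_t err = cudaGetLastError(); // catches invalid launch configurations
if (err != cudaSuccess) printf("launch: %s\n", cudaGetErrorString(err));
err = cudaThreadSynchronize();        // catches errors raised while the kernel ran
if (err != cudaSuccess) printf("run: %s\n", cudaGetErrorString(err));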