Passing Host Function as a function pointer in __global__ OR __device__ function in CUDA

I am currently developing a GPU version of a CPU function
(e.g. Calc(int a, int b, double* c, double* d, CalcInvFunction GetInv)), in which a host function is passed as a function pointer (in the example above, GetInv is a host function of type CalcInvFunction). My question is: if I have to put the Calc() function entirely on the GPU, I have to pass the GetInv function as a function pointer argument to a device function/kernel function. Is that possible?

Yes, for a GPU implementation of Calc, you should pass GetInv as a __device__ function pointer.
It is possible; here are some worked examples:
Ex. 1
Ex. 2
Ex. 3
Most of the above examples demonstrate bringing the device function pointer all the way back to host code. That may not be necessary for your particular case, but they also show how to take the address of a __device__ function (in device code) and use it in a kernel, as in the sketch below.

Finally, I have been able to pass a host function as a function pointer to a CUDA kernel (__global__ function). Thanks to Robert Crovella and njuffa for the answer. I have been able to pass a class member function (a CPU function) as a function pointer to a CUDA kernel. The main problem is that I can only pass static class member functions; I am not able to pass functions not declared as static.
For Example:
__host__ __device__
static int
CellfunPtr(
    void* ptr, int a
);
The above works because this member function is declared static. If I do not declare it as a static member, as in
__host__ __device__
int
CellfunPtr(
    void* ptr, int a
);
then it does not work.
The complete code has four files.
First file
/*start of fundef.h file*/
typedef int (*pFunc_t)(void* ptr, int N);
/*end of fundef.h file*/
Second file
/*start of solver.h file*/
class CalcVars {
    int eqnCount;
    int numCell;
    int numTri;
    int numTet;
public:
    double* cellVel;
    double* cellPre;
    /** Constructor */
    CalcVars(
        const int eqnCount_,
        const int numCell_,
        const int numTri_,
        const int numTet_
    );
    /** Destructor */
    ~CalcVars(void);
public:
    void
    CalcAdv();
    __host__ __device__
    static int
    CellfunPtr(
        void* ptr, int a
    );
};
/*end of solver.h file*/
Third file
/*start of solver.cu file*/
#include "solver.h"
__device__ pFunc_t pF1_d = CalcVars::CellfunPtr;
pFunc_t pF1_h;
__global__ void kernel(int* a, pFunc_t func, void* thisPtr_){
    int tid = threadIdx.x;
    a[tid] = (*func)(thisPtr_, a[tid]);
}
/* Constructor */
CalcVars::CalcVars(
    const int eqnCount_,
    const int numCell_,
    const int numTri_,
    const int numTet_
)
{
    this->eqnCount = eqnCount_;
    this->numCell = numCell_;
    this->numTri = numTri_;
    this->numTet = numTet_;
    this->cellVel = (double*) calloc((size_t) eqnCount, sizeof(double));
    this->cellPre = (double*) calloc((size_t) eqnCount, sizeof(double));
}
/* Destructor */
CalcVars::~CalcVars(void)
{
    free(this->cellVel);
    free(this->cellPre);
}
void
CalcVars::CalcAdv(
){
    /*int b1 = 0;
    b1 = CellfunPtr(this, 1);*/
    int Num = 50;
    int *a1, *a1_dev;
    a1 = (int *)malloc(Num*sizeof(int));
    cudaMalloc((void**)&a1_dev, Num*sizeof(int));
    for(int i = 0; i < Num; i++){
        a1[i] = i;
    }
    cudaMemcpy(a1_dev, a1, Num*sizeof(int), cudaMemcpyHostToDevice);
    //copy the address of the device function to the host
    cudaMemcpyFromSymbol(&pF1_h, pF1_d, sizeof(pFunc_t));
    kernel<<<1,Num>>>(a1_dev, pF1_h, this); // launch one thread per element
    cudaDeviceSynchronize();
    cudaMemcpy(a1, a1_dev, Num*sizeof(int), cudaMemcpyDeviceToHost);
}
int
CalcVars::CellfunPtr(
    void* ptr, int a
){
    //CalcVars* ClsPtr = (CalcVars*)ptr;
    printf("Printing from CPU function\n");
    //int eqn_size = ClsPtr->eqnCount;
    //printf("The number is %d", eqn_size);
    return a-1;
}
/*end of solver.cu file*/
Fourth file
/*start of main.cpp file*/
#include "solver.h"
int main(){
    int n_Eqn, n_cell, n_tri, n_tetra;
    n_Eqn = 100;
    n_cell = 200;
    n_tri = 300;
    n_tetra = 400;
    CalcVars* calcvars;
    calcvars = new CalcVars(n_Eqn, n_cell, n_tri, n_tetra);
    calcvars->CalcAdv();
    system("pause");
}
/*end of main.cpp file*/

Related

cudaMemcpy returns success but does not copy anything

Below are the things I have checked with cuda-gdb:
the contents of src are correct
cudaMalloc, malloc, and file I/O are successful
cudaMemcpy returns cudaSuccess
the problematic cudaMemcpy is called and throws no errors or exceptions
destination is allocated (cudaMalloc) successfully
Below are the relevant parts of the code: wavenet_server.cc mallocs the source, copies data from a file into it, and calls make_wavenet. wavenet_infer.cu calls the constructor of MyWaveNet and then calls setEmbeddings.
wavenet_server.cc:
#include "wavenet_infer.h"
void readArrayFromBinary(void* array, size_t len, size_t num_bytes_per_elem, const char* file_name) {
    FILE* file = fopen(file_name, "rb");
    fread(array, num_bytes_per_elem, len, file);
    fclose(file);
}
void setEmbeddingCurr(const char* fileName, size_t len) {
    this->embedding_curr = (float*)malloc(sizeof(float) * len);
    readArrayFromBinary((void*)this->embedding_curr, len, sizeof(float), fileName);
}
void setWavenet(void) {
    this->wavenet = make_wavenet(this->num_samples,
                                 this->batch_size,
                                 this->embedding_prev,
                                 this->embedding_curr,
                                 this->num_layers,
                                 this->max_dilation,
                                 this->dilate_weights_prev,
                                 this->dilate_weights_curr,
                                 this->dilate_biases,
                                 this->res_weights,
                                 this->res_biases,
                                 this->skip_weights,
                                 this->skip_biases,
                                 this->conv_out,
                                 this->conv_end,
                                 this->is_using_embed_tanh,
                                 this->implementation);
}
wavenet_infer.cu:
#include "nv_wavenet.cuh"
typedef nvWavenetInfer<float,float, R, S, A> MyWaveNet;
void* make_wavenet(int sample_count,
                   int batch_size,
                   float* embedding_prev,
                   float* embedding_curr,
                   int num_layers,
                   int max_dilation,
                   float** in_layer_weights_prev,
                   float** in_layer_weights_curr,
                   float** in_layer_biases,
                   float** res_layer_weights,
                   float** res_layer_biases,
                   float** skip_layer_weights,
                   float** skip_layer_biases,
                   float* conv_out_weight,
                   float* conv_end_weight,
                   bool use_embed_tanh,
                   int implementation
                   ) {
    MyWaveNet* wavenet = new MyWaveNet(num_layers, max_dilation, batch_size, sample_count,
                                       implementation, use_embed_tanh);
    wavenet->setEmbeddings(embedding_prev, embedding_curr);
    // We didn't use biases on our outputs
    std::vector<float> dummy_bias_first(S, 0);
    std::vector<float> dummy_bias_second(A, 0);
    wavenet->setOutWeights(conv_out_weight,
                           dummy_bias_first.data(),
                           conv_end_weight,
                           dummy_bias_second.data());
    for (int l = 0; l < num_layers; l++) {
        wavenet->setLayerWeights(l, in_layer_weights_prev[l],
                                 in_layer_weights_curr[l],
                                 in_layer_biases[l],
                                 res_layer_weights[l],
                                 res_layer_biases[l],
                                 skip_layer_weights[l],
                                 skip_layer_biases[l]);
    }
    return (void*)wavenet;
}
nv_wavenet.cuh:
nvWavenetInfer (int numLayers, int maxDilation, int batchSize, int numSamples, int impl=0, bool tanhEmbed=true)
    : m_numLayers(numLayers), m_maxBatch(batchSize), m_maxSamples(numSamples),
      m_implementation((nvWavenetInfer::Implementation)impl), m_tanhEmbed(tanhEmbed) {
    m_maxDilation = maxDilation;
    /*
    gpuErrChk(cudaMalloc(&m_yOut, numSamples*batchSize*sizeof(int))); // one-hot vector represented as single value indicating which value is set
    gpuErrChk(cudaMemset(m_yOut, 0, numSamples*batchSize*sizeof(int)));
    */
    gpuErrChk(cudaMalloc(&m_outputSelectors, numSamples*batchSize*sizeof(float)));
    gpuErrChk(cudaMalloc(&m_embedPrev, A*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_embedCur, A*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_Wprev, numLayers*2*R*R*sizeof(T_weight)));
    gpuErrChk(cudaMalloc(&m_Wcur, numLayers*2*R*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_Bh, numLayers*2*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_Lh, numSamples*numLayers*batchSize*2*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_Wres, numLayers*R*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_Bres, numLayers*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_Wskip, numLayers*S*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_Bskip, numLayers*S*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_XtOut, numLayers*R*batchSize*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_skipOut, numLayers*S*batchSize*sizeof(T_data)));
    // For now, just burn memory as though all layers had the maximum dilation value
    gpuErrChk(cudaMalloc(&m_XtIn, (m_maxDilation+1)*(numLayers+1)*R*batchSize*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_hOut, numLayers*batchSize*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_aPrev, numLayers*batchSize*2*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_skipIn, numLayers*S*batchSize*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_skipOutFinalAccumulate, A*batchSize*S/R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_outAccumulate, A*batchSize*A/R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_yInPrev, batchSize*sizeof(int))); // one-hot vector represented as single value indicating which value is set
    gpuErrChk(cudaMalloc(&m_yInCur, batchSize*sizeof(int))); // one-hot vector represented as single value indicating which value is set
    gpuErrChk(cudaMalloc(&m_WskipOut, A*S*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_BskipOut, A*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_Wout, A*A*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_Bout, A*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_skipOutFinal, A*batchSize*S/R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_out, A*batchSize*A/R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_p, A*batchSize*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_h, numLayers*batchSize*R*sizeof(T_data)));
    gpuErrChk(cudaMalloc(&m_hSample, numLayers*batchSize*sizeof(int)));
    gpuErrChk(cudaMalloc(&m_ySample, batchSize*sizeof(int)));
    if (impl == PERSISTENT) {
        gpuErrChk(cudaMalloc(&m_skipOutFinalAccumulate, A*batchSize*S/R*sizeof(T_data)));
        gpuErrChk(cudaMalloc(&m_outAccumulate, A*batchSize*A/R*sizeof(T_data)));
    }
}
virtual void setEmbeddings (float* embedPrev, float* embedCur) {
    setActivation(m_embedPrev, embedPrev, A*R);
    setActivation(m_embedCur, embedCur, A*R);
}
void setActivation(float* dst, float* src, size_t size) {
    gpuErrChk(cudaMemcpy(dst, src, size*sizeof(float), cudaMemcpyHostToDevice));
}
Turns out that cudaMemcpy was not the issue. When examining device global memory with cuda-gdb, you cannot use: x/10fw float_array; it will give incorrect values. To view the contents, try this instead: p ((@global float*) float_array)[0]@10

Using host class member pointing to device memory in device code

I want to have an instance of a Container class that allocates some device and host memory on initialization. I want to use the allocated memory in device code without passing the actual pointer (for API reasons).
How do I create a global __device__ pointer to the member pointing to the device memory? I am happy to use thrust if that helps.
Here is a small example:
#include <iostream>
struct Container {
    int *h_int = (int*)malloc(4*sizeof(int));
    int *d_int;
    Container() {
        h_int[0] = 6; h_int[1] = 6; h_int[2] = 6; h_int[3] = 6;
        cudaMalloc(&d_int, 4*sizeof(int));
        memcpyHostToDevice();
    }
    void memcpyHostToDevice() {
        cudaMemcpy(d_int, h_int, 4*sizeof(int), cudaMemcpyHostToDevice);
    }
    void memcpyDeviceToHost() {
        cudaMemcpy(h_int, d_int, 4*sizeof(int), cudaMemcpyDeviceToHost);
    }
};
Container stuff;
__device__ auto d_int = &stuff.d_int; // How do I get that right?
__global__ void edit() { // To keep the API simple I do not want to pass the pointer
    auto i = blockIdx.x*blockDim.x + threadIdx.x;
    d_int[i] = 1 + 2*(i > 0) + 4*(i > 2);
}
int main(int argc, char const *argv[]) {
    edit<<<4, 1>>>();
    stuff.memcpyDeviceToHost();
    std::cout << stuff.h_int[0] << stuff.h_int[1] << stuff.h_int[2] << stuff.h_int[3] << "\n";
    return 0;
}
There are two problems here:
You can't statically initialize a __device__ variable in the way you are trying to (and the value you are trying to assign isn't correct either). The CUDA runtime API contains a function for initializing global-scope device symbols. Use that instead.
Your global-scope declaration of stuff shouldn't work either, for a number of subtle reasons discussed here (it is technically undefined behaviour). Declare it at main scope instead.
Putting these two things together should lead you to do something like this instead:
__device__ int* d_int;
// ...
int main(int argc, char const *argv[]) {
    Container stuff;
    cudaMemcpyToSymbol(d_int, &stuff.d_int, sizeof(int*));
    edit<<<4, 1>>>();
    // ...
Here is a fully worked example:
$ cat t1199.cu
#include <iostream>
struct Container {
    int *h_int = (int*)malloc(4*sizeof(int));
    int *d_int;
    Container() {
        h_int[0] = 6; h_int[1] = 6; h_int[2] = 6; h_int[3] = 6;
        cudaMalloc(&d_int, 4*sizeof(int));
        memcpyHostToDevice();
    }
    void memcpyHostToDevice() {
        cudaMemcpy(d_int, h_int, 4*sizeof(int), cudaMemcpyHostToDevice);
    }
    void memcpyDeviceToHost() {
        cudaMemcpy(h_int, d_int, 4*sizeof(int), cudaMemcpyDeviceToHost);
    }
};
//Container stuff;
__device__ int *d_int; // = &stuff.d_int; // How do I get that right?
__global__ void edit() { // To keep the API simple I do not want to pass the pointer
    auto i = blockIdx.x*blockDim.x + threadIdx.x;
    d_int[i] = 1 + 2*(i > 0) + 4*(i > 2);
}
int main(int argc, char const *argv[]) {
    Container stuff;
    cudaMemcpyToSymbol(d_int, &stuff.d_int, sizeof(int *));
    edit<<<4, 1>>>();
    stuff.memcpyDeviceToHost();
    std::cout << stuff.h_int[0] << stuff.h_int[1] << stuff.h_int[2] << stuff.h_int[3] << "\n";
    return 0;
}
$ nvcc -std=c++11 -o t1199 t1199.cu
$ cuda-memcheck ./t1199
========= CUDA-MEMCHECK
1337
========= ERROR SUMMARY: 0 errors
$

CUDA: Copy dynamically created array of function pointers on the CPU to GPU memory

I would like to create a list of function pointers dynamically on the CPU (with some sort of push_back() method called from main()) and copy it to a GPU __constant__ or __device__ array, without needing to resort to static __device__ function pointers. I believe this question is related to my problem; however, my goal is to create the __host__ function pointer array iteratively and then copy it to the __constant__ function pointer array instead of initialising the latter on declaration.
A working code example with static function pointers (as seen here or here) would be:
common.h:
#ifndef COMMON_H
#define COMMON_H
#include <stdio.h>
#include <iostream>
#define num_functions 3
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// fptr_t: Pointer to void function that takes two integer lvalues
typedef void (*fptr_t)(int&, int&);
// some examples of void(int&, int&) functions...
__device__ void Add(int &a, int &b) {printf("Add... %i + %i = %i\n", a, b, a+b);}
__device__ void Subtract(int &a, int &b) {printf("Subtract... %i - %i = %i\n", a, b, a-b);}
__device__ void Multiply(int &a, int &b) {printf("Multiply... %i * %i = %i\n", a, b, a*b);}
// List of function pointers in device memory
__constant__ fptr_t constant_fList[num_functions];
// Kernel called from main(): choose the function to apply whose index is equal to thread ID
__global__ void kernel(int a, int b) {
    fptr_t f;
    if (threadIdx.x < num_functions) {
        f = constant_fList[threadIdx.x];
        f(a,b);
    }
}
#endif
main.cu:
#include "common.h"
// Static device function pointers
__device__ fptr_t p_Add = Add;
__device__ fptr_t p_Sub = Subtract;
__device__ fptr_t p_Mul = Multiply;
// Load function list to constant memory
void loadList_staticpointers() {
    fptr_t h_fList[num_functions];
    gpuErrchk( cudaMemcpyFromSymbol(&h_fList[0], p_Add, sizeof(fptr_t)) );
    gpuErrchk( cudaMemcpyFromSymbol(&h_fList[1], p_Sub, sizeof(fptr_t)) );
    gpuErrchk( cudaMemcpyFromSymbol(&h_fList[2], p_Mul, sizeof(fptr_t)) );
    gpuErrchk( cudaMemcpyToSymbol(constant_fList, h_fList, num_functions * sizeof(fptr_t)) );
}
int main() {
    loadList_staticpointers();
    int a = 12, b = 15;
    kernel<<<1,3>>>(a, b);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());
    return 0;
}
Specs: GeForce GTX 670, compiled for -arch=sm_30, CUDA 6.5, Ubuntu 14.04
I wish to avoid the use of static device function pointers, as appending each function would require code maintenance on the user side: declaring a new static pointer like p_Add or p_Mul, updating loadList_staticpointers(), and so on. To make this clear, I am trying something like the following (crashing) code:
main_wrong.cu:
#include "common.h"
#include <vector>
// Global variable: list of function pointers in host memory
std::vector<fptr_t> vec_fList;
// Add function to functions list
void addFunc(fptr_t f) {vec_fList.push_back(f);}
// Upload the functions in the std::vector<fptr_t> to GPU memory
// Copies CPU-side pointers to constant_fList, therefore crashes on kernel call
void UploadVector() {
    fptr_t* h_vpointer = vec_fList.data();
    gpuErrchk( cudaMemcpyToSymbol(constant_fList, h_vpointer, vec_fList.size() * sizeof(fptr_t)) );
}
int main() {
    addFunc(Add);
    addFunc(Subtract);
    addFunc(Multiply);
    int a = 12, b = 15;
    UploadVector();
    kernel<<<1,3>>>(a, b); // Wrong to call a host-side function pointer from a kernel
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());
    return 0;
}
My understanding is that function pointers pointing to host addresses are copied to the GPU and are unusable by the kernel, which needs pointers pointing to GPU addresses when the function f(a,b) is called. Populating a host-side array with device-side pointers would work for me with raw data (see this question) but not with function pointers. Trivial attempts with Unified Memory have failed as well... so far, I have only found static device-side pointers to work. Is there no other way to copy a dynamically created CPU array of function pointers onto the GPU?
If you can use C++11 (supported since CUDA 7), you could use the following to auto-generate the function table:
template <fptr_t... Functions>
__global__ void kernel(int a, int b)
{
    constexpr auto num_f = sizeof...(Functions);
    constexpr fptr_t table[] = { Functions... };
    if (threadIdx.x < num_f)
    {
        fptr_t f = table[threadIdx.x];
        f(a,b);
    }
}
You would then call this kernel using
kernel<Add, Subtract, Multiply><<<1,3>>>(a, b);
Inspired by m.s.'s answer, I chose to pass the function pointer as a template parameter (this was in fact the key to solving my problem) and discovered that filling a __device__ array of function pointers dev_fList iteratively from the main() function, without the help of static function pointers, is indeed possible; C++11 compatibility is not even needed!
Here is a working example on a __device__ array in global memory. I have not tried its constant memory counterpart yet, but once a global memory array has been satisfactorily created, my guess is that a cudaMemcpyToSymbol(..., cudaMemcpyDeviceToDevice) should do the trick.
A kernel kernel() creates a GPU address for the function pointer dev_f and stores the function f that was passed as a template argument. Since this is an iterative process driven from the CPU, only one thread (thread 0) is involved in this kernel, which is launched with configuration <<<1,1>>>. The static variable count_f takes care of indexing into dev_fList.
common.h:
#ifndef COMMON_H
#define COMMON_H
#include <stdio.h>
#include <iostream>
#define num_functions 3
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// fptr_t: Pointer to void function that takes two integer lvalues
typedef void (*fptr_t)(int&, int&);
// some examples of void(int&, int&) functions...
__device__ void Add(int &a, int &b) {printf("Add... %i + %i = %i\n", a, b, a+b);}
__device__ void Subtract(int &a, int &b) {printf("Subtract... %i - %i = %i\n", a, b, a-b);}
__device__ void Multiply(int &a, int &b) {printf("Multiply... %i * %i = %i\n", a, b, a*b);}
// List of function pointers in device memory
// Note that, in my example, it resides in global memory space, not constant memory
__device__ fptr_t dev_fList[num_functions];
#endif
main.cu:
#include "common.h"
// Index in dev_fList[] == number of times addFunc<>() was launched
static int count_f = 0;
// Kernel that copies function f to the GPU
template<fptr_t f>
__global__ void kernel(int a, int b, int idx) {
    fptr_t dev_f = f; // Create device function pointer
    dev_fList[idx] = dev_f; // Populate the GPU array of function pointers
    dev_fList[idx](a,b); // Make sure that the array was populated correctly
}
// Add function to functions list
template<fptr_t f>
void addFunc(const int &a, const int &b) {
    if (count_f >= num_functions) {
        std::cout << "Error: not enough memory statically allocated on device!\n";
        exit(EXIT_FAILURE);
    }
    kernel<f><<<1,1>>>(a,b,count_f);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());
    count_f++;
}
int main() {
    int a = 12, b = 15;
    addFunc<Add>(a,b);
    addFunc<Subtract>(a,b);
    addFunc<Multiply>(a,b);
    return 0;
}
Edit: Added copy of the array of function pointers to constant memory
For what it's worth, here is how to copy our dev_fList array to constant memory:
In common.h:
__constant__ fptr_t cst_fList[num_functions];
__global__ void cst_test(int a, int b, int idx) {
    if (threadIdx.x < idx) cst_fList[threadIdx.x](a,b);
}
In main.cu main() function, after all desired functions have been added:
fptr_t temp[num_functions];
gpuErrchk( cudaMemcpyFromSymbol(temp, dev_fList, count_f * sizeof(fptr_t)) );
gpuErrchk( cudaMemcpyToSymbol(cst_fList, temp, count_f * sizeof(fptr_t)) );
cst_test<<<1,count_f>>>(a,b, count_f);
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaDeviceSynchronize());
It may look ugly, as memory is transferred to the host via temp and then back to the device; more elegant suggestions are welcome, such as the sketch below.
It is impossible to use dynamically created CUDA device function pointers (at least not without a crash or undefined behaviour). The template-based solutions work at compile time, not dynamically. The CUDA device-function-pointer approaches you see everywhere need device symbols at global scope. This means that for every function, a device function pointer must already be declared. It also means you cannot use normal C function pointers as the source, e.g. ones set at runtime. In summary, using CUDA device function pointers is questionable. Template-based approaches look user-friendly, but are by definition not dynamic.
Example showing structure with function pointers:
This example shows a structure holding some function pointers. In normal C++ code, you can set and change the function pointers while the program is running (dynamically). With CUDA, the example below is impossible, because the function pointers in the struct are not valid device symbols, so they cannot be used with cudaMemcpyFromSymbol. To circumvent this, either the original function (the target of the function pointer) or a global CUDA device function pointer must be referenced. Neither is dynamic.
This is dynamic assignment:
typedef float (*pDistanceFu) (float, float);
typedef float (*pDecayFu) (float, float, float);
// In C++ you can set and reset the function pointer during run time whenever you want ..
struct DistFunction {
    /*__host__ __device__*/ pDistanceFu distance; // uncomment for NVCC ..
    /*__host__ __device__*/ pDecayFu rad_decay;
    /*__host__ __device__*/ pDecayFu lrate_decay;
};
// you can do what you want ..
DistFunction foo, bar;
foo.distance = bar.distance;
// ..
This is how it would have to look with CUDA, but it will fail, because there is no valid device symbol:
pDistanceFu hDistance;
pDecayFu hRadDay;
pDecayFu hLRateDecay;
void DeviceAssign(DistFunction &dist) {
    cudaMemcpyFromSymbol(&hDistance, dist.distance, sizeof(pDistanceFu) );
    cudaMemcpyFromSymbol(&hRadDay, dist.rad_decay, sizeof(pDecayFu) );
    cudaMemcpyFromSymbol(&hLRateDecay, dist.lrate_decay, sizeof(pDecayFu) );
    dist.distance = hDistance;
    dist.rad_decay = hRadDay;
    dist.lrate_decay = hLRateDecay;
}
Here is the classical way; but notice that it is not dynamic anymore, because the device symbol must refer to the function itself, not to a pointer which may change during run time:
// .. and this would work
#ifdef __CUDACC__
__host__ __device__
#endif
inline float fcn_rad_decay (float sigma0, float T, float lambda) {
    return std::floor(sigma0*exp(-T/lambda) + 0.5f);
}
__device__ pDecayFu pFoo = fcn_rad_decay; // the pointer must target the function itself; no host pointer is possible
void DeviceAssign2(DistFunction &dist) {
    cudaMemcpyFromSymbol(&hLRateDecay, &fcn_rad_decay, sizeof(pDecayFu) );
    // the same:
    // cudaMemcpyFromSymbol(&hLRateDecay, pFoo, sizeof(pDecayFu) );
    // ..
    dist.lrate_decay = hLRateDecay;
    // ..
}

CUDA function pointers

I was trying to make something like this (actually I need to write some integration functions) in CUDA:
#include <iostream>
using namespace std;
float f1(float x) {
    return x * x;
}
float f2(float x) {
    return x;
}
void tabulate(float p_f(float)) {
    for (int i = 0; i != 10; ++i) {
        std::cout << p_f(i) << ' ';
    }
    std::cout << std::endl;
}
int main() {
    tabulate(f1);
    tabulate(f2);
    return 0;
}
output:
0 1 4 9 16 25 36 49 64 81
0 1 2 3 4 5 6 7 8 9
I tried the following, but I only got the error:
Error: Function pointers and function template parameters are not supported in sm_1x.
float f1(float x) {
    return x;
}
__global__ void tabulate(float lower, float upper, float p_function(float), float* result) {
    for (; lower < upper; lower++) {
        *result = *result + p_function(lower);
    }
}
int main() {
    float res;
    float* dev_res;
    cudaMalloc( (void**)&dev_res, sizeof(float) );
    tabulate<<<1,1>>>(0.0, 5.0, f1, dev_res);
    cudaMemcpy(&res, dev_res, sizeof(float), cudaMemcpyDeviceToHost);
    printf("%f\n", res);
    /************************************************************************/
    scanf("%s");
    return 0;
}
To get rid of your compile error, you'll have to use -gencode arch=compute_20,code=sm_20 as a compiler argument when compiling your code. But then you'll likely have some runtime problems:
Taken from the CUDA Programming Guide http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#functions
Function pointers to __global__ functions are supported in host code, but not in device code.
Function pointers to __device__ functions are only supported in device code compiled for devices of compute capability 2.x and higher.
It is not allowed to take the address of a __device__ function in host code.
so you can have something like this (adapted from the "FunctionPointers" sample):
//your function pointer type - returns unsigned char, takes parameters of type unsigned char and float
typedef unsigned char(*pointFunction_t)(unsigned char, float);
//some device function to be pointed to
__device__ unsigned char
Threshold(unsigned char in, float thresh)
{
    ...
}
//pComputeThreshold is a device-side function pointer to your __device__ function
__device__ pointFunction_t pComputeThreshold = Threshold;
//the host-side function pointer to your __device__ function
pointFunction_t h_pointFunction;
//in host code: copy the function pointers to their host equivalent
cudaMemcpyFromSymbol(&h_pointFunction, pComputeThreshold, sizeof(pointFunction_t));
You can then pass the h_pointFunction as a parameter to your kernel, which can use it to call your __device__ function.
//your kernel taking your __device__ function pointer as a parameter
__global__ void kernel(pointFunction_t pPointOperation)
{
    unsigned char tmp;
    ...
    tmp = (*pPointOperation)(tmp, 150.0);
    ...
}
//invoke the kernel in host code, passing in your host-side __device__ function pointer
kernel<<<...>>>(h_pointFunction);
Hopefully that made some sense. In all, it looks like you would have to change your f1 function to be a __device__ function and follow a similar procedure (the typedefs aren't necessary, but they do make the code nicer) to get a valid function pointer on the host side to pass to your kernel, as in the adaptation sketched below. I'd also advise giving the FunctionPointers CUDA sample a look over.
Even though you may be able to compile this code (see @Robert Crovella's answer), it will not work as written. You cannot pass function pointers from host code, as the host compiler has no way of figuring out the device address of the function.
Here is a simple class for function pointers that are callable from within a kernel I wrote based on this question:
template <typename T>
struct cudaCallableFunctionPointer
{
public:
    cudaCallableFunctionPointer(T* f_)
    {
        T* host_ptr = (T*)malloc(sizeof(T));
        cudaMalloc((void**)&ptr, sizeof(T));
        cudaMemcpyFromSymbol(host_ptr, *f_, sizeof(T));
        cudaMemcpy(ptr, host_ptr, sizeof(T), cudaMemcpyHostToDevice);
        free(host_ptr); // host_ptr came from malloc, so release it with free, not cudaFree
    }
    ~cudaCallableFunctionPointer()
    {
        cudaFree(ptr);
    }
    T* ptr;
};
you could use it like this:
__device__ double func1(double x)
{
    return x + 1.0f;
}
typedef double (*func)(double x);
__device__ func f_ = func1;
__global__ void test_kernel(func* f)
{
    double x = (*f)(2.0);
    printf("%g\n", x);
}
int main()
{
    cudaCallableFunctionPointer<func> f(&f_);
    test_kernel<<<1, 1>>>(f.ptr);
}
output:
3