How to write an OpenCL function - function

I'm new to OpenCL and I'm trying to implement a simple function in OpenCL. The function is supposed to be called from a kernel function.
void swap(int *a, int *b)
{
int *temp = a;
b = a;
a = temp;
}
However upon calling it, the swap doesn't work.
Is there a way to pass parameters by reference?

The way you have written the function, it is not doing anything. You are just assigning the pointers around. You need to have this:
void swap(int *a, int *b)
{
int temp = *a;
*a = *b;
*b = temp;
}
Reference parameters are not allowed, as far as I recall.

Related

How to implement properly an inline function in the device that returns a vector to another device function?

I want to implement properly an inlined device function that fill out a vector of dynamic size and return the filled vector like:
__device__ inline thrust::device_vector<double> make_array(double zeta, int l)
{
thrust::device_vector<double> ret;
int N =(int)(5*l+zeta); //the size of the array will depend on l and zeta, in a complex way...
// Make sure of sufficient memory allocation
ret.reserve(N);
// Resize array
ret.resize(N);
//fill it:
//for(int i=0;i<N;i++)
// ...;
return ret;
}
My goal is to use the content of the returned vector in another device function like:
__device__ inline double use_array(double zeta,int l)
{
thrust::device_vector<double> array = make_array(zeta, l);
double result = 0;
for(int i=0; i<array.size(); i++)
result += array[i];
return result;
}
How can I do it properly? my feeling is that a thrust vector is designed for this type of task, but I want to do it properly. What is the standard CUDA approach to this task?
thrust::device_vector is not usable in device code.
However you can return a pointer to a dynamically allocated area, like so:
#include <assert.h>
template <typename T>
__device__ T* make_array(T zeta, int l)
{
int N =(int)(5*l+zeta); //the size of the array will depend on l and zeta, in a complex way...
T *ret = (T *)malloc(N*sizeof(T));
assert(ret != NULL); // error checking
//fill it:
//for(int i=0;i<N;i++)
// ret[i] = ...;
return ret;
}
The inline keyword should not be necessary. The compiler will aggressively inline functions wherever possible.

Passing Host Function as a function pointer in __global__ OR __device__ function in CUDA

I am currently developing a GPU version of a CPU function
(e.g. function Calc(int a, int b, double* c, souble* d, CalcInvFunction GetInv )), in which a host function is passes as a function pointer(e.g. in above example GetInv is the host function of CalcInvFunction type). My question is, if i have to put Calc() function entirely in GPU, i have to pass the GetInv function as a function pointer argument in device function/kernel function, and is that possible?
Yes, for a GPU implementation of Calc, you should pass the GetInv as a __device__ function pointer.
It is possible, here are some worked examples:
Ex. 1
Ex. 2
Ex. 3
Most of the above examples demonstrate bringing the device function pointer all the way back to the host code. This may not be necessary for your particular case. But it should be fairly obvious from above how to grab a __device__ function pointer (in device code) and use it in a kernel.
Finally, i have been able to pass a host function as a function pointer in cuda kernel function (__global__ function). Thanks to Robert Crovella and njuffa for the answer. I have been able to pass a class member function(cpu function) as a function pointer to a cuda kernel. But, the main problem is, i can only pass the static class member function. I am not being able to pass the function not declared as static.
For Example:
/**/
__host__ __device__
static int
CellfunPtr(
void*ptr, int a
);
/**/
The above function work because this member function is declared as static member function. If i do not declare this member function as a static member as ,
/**/
__host__ __device__
int
CellfunPtr(
void*ptr, int a
);
/**/
then it doesnt work.
The complete code has four files.
First file
/*start of fundef.h file*/
typedef int (*pFunc_t)(void* ptr, int N);
/*end of fundef.h file*/
Second file
/*start of solver.h file*/
class CalcVars {
int eqnCount;
int numCell;
int numTri;
int numTet;
public:
double* cellVel;
double* cellPre;
/** Constructor */
CalcVars(
const int eqnCount_,
const int numCell_,
const int numTri_,
const int numTet_
);
/** Destructor */
~CalcVars(void);
public:
void
CalcAdv();
__host__ __device__
static int
CellfunPtr(
void*ptr, int a
);
};
/*end of solver.h file*/
Third file
/*start of solver.cu file*/
#include "solver.h"
__device__ pFunc_t pF1_d = CalcVars::CellfunPtr;
pFunc_t pF1_h ;
__global__ void kernel(int*a, pFunc_t func, void* thisPtr_){
int tid = threadIdx.x;
a[tid] = (*func)(thisPtr_, a[tid]);
};
/* Constructor */
CalcVars::CalcVars(
const int eqnCount_,
const int numCell_,
const int numTri_,
const int numTet_
)
{
this->eqnCount = eqnCount_;
this->numCell = numCell_;
this->numTri = numTri_;
this->cellVel = (double*) calloc((size_t) eqnCount, sizeof(double));
this->cellPre = (double*) calloc((size_t) eqnCount, sizeof(double));
}
/* Destructor */
CalcVars::~CalcVars(void)
{
free(this->cellVel);
free(this->cellPre);
}
void
CalcVars::CalcAdv(
){
/*int b1 = 0;
b1 = CellfunPtr(this, 1);*/
int Num = 50;
int *a1, *a1_dev;
a1 = (int *)malloc(Num*sizeof(int));
cudaMalloc((void**)&a1_dev, Num*sizeof(int));
for(int i = 0; i <Num; i++){
a1[i] = i;
}
cudaMemcpy(a1_dev, a1, Num*sizeof(int), cudaMemcpyHostToDevice);
//copy addresses of device functions to host
cudaMemcpyFromSymbol(&pF1_h, pF1_d, sizeof(pFunc_t));
kernel<<<1,42>>>(a1_dev, pF1_h, this);
cudaDeviceSynchronize();
cudaMemcpy(a1, a1_dev, Num*sizeof(int), cudaMemcpyDeviceToHost);
};
int
CalcVars::CellfunPtr(
void* ptr, int a
){
//CalcVars* ClsPtr = (CalcVars*)ptr;
printf("Printing from CPU function\n");
//int eqn_size = ClsPtr->eqnCount;
//printf("The number is %d",eqn_size);
return a-1;
};
/*end of solver.cu file*/
Fourth file
/*start of main.cpp file*/
#include "solver.h"
int main(){
int n_Eqn, n_cell, n_tri, n_tetra;
n_Eqn = 100;
n_cell = 200;
n_tri = 300;
n_tetra = 400;
CalcVars* calcvars;
calcvars = new CalcVars(n_Eqn, n_cell, n_tri, n_tetra );
calcvars->CalcAdv();
system("pause");
}
/*end of main.cpp file*/

C++ Passing Polymorphic Pointer of Pointers to Function

In trying to shorted my code for readability, I wound up changing too much and making mistakes. This is still condensed but taken straight from my code.
My problem is that I have a class called "function" and a derived class "pwfunction" which both have the virtual () operator. I'd like to pass an array of pointers to my "function" objects to various actual functions and use the () operator.
Final edit: This is a SSCCE version of what I'm talking about.
#include <iostream>
using namespace std;
class function
{
public:
virtual double operator () (double x) {return 1.5;}
};
class pwfunction : public function
{
public:
virtual double operator() (double x) {return 2.0;}
};
void interface();
void definefuncs (function** funcs, long unsigned numfuncs);
void interpolate(function* infunc);
void solvefuncs(function** funcs, long unsigned numfuncs);
int main()
{
interface();
return 0;
}
void interface()
{
long unsigned numfuncs = 1;
function* funcs[numfuncs];
definefuncs(funcs, numfuncs);
solvefuncs(funcs, numfuncs);
}
void definefuncs (function** funcs, long unsigned numfuncs)
{
interpolate(funcs[0]);
}
void interpolate(function* infunc)
{
infunc = new pwfunction();
cout<< (*infunc)(1.5)<<endl; //works
}
void solvefuncs(function** funcs, long unsigned numfuncs)
{
cout<< (*funcs[0])(1.5); //Error Message: Segmentation fault
}
The problem comes from the following:
void interpolate(function* infunc)
{
infunc = new pwfunction();
cout<< (*infunc)(1.5)<<endl; //works
}
is probably not doing what you want. infunc is allocated locally, and this does not affect anything else outside or this function (and is btw a memory leak). Interpolate should either return infunc, or allocate the original variable, such as
void interpolate(function*& infunc) ...
You don't allocate array for the funclist data in somefunction, so anything can happen. Perhaps you mean
func* funclist[1];
to indicate a one-element array of func pointers.

CUDA function pointers

I was trying to make somtehing like this (actually I need to write some integration functions) in CUDA
#include <iostream>
using namespace std;
float f1(float x) {
return x * x;
}
float f2(float x) {
return x;
}
void tabulate(float p_f(float)) {
for (int i = 0; i != 10; ++i) {
std::cout << p_f(i) << ' ';
}
std::cout << std::endl;
}
int main() {
tabulate(f1);
tabulate(f2);
return 0;
}
output:
0 1 4 9 16 25 36 49 64 81
0 1 2 3 4 5 6 7 8 9
I tried the following but only got the error
Error: Function pointers and function template parameters are not supported in sm_1x.
float f1(float x) {
return x;
}
__global__ void tabulate(float lower, float upper, float p_function(float), float* result) {
for (lower; lower < upper; lower++) {
*result = *result + p_function(lower);
}
}
int main() {
float res;
float* dev_res;
cudaMalloc( (void**)&dev_res, sizeof(float) ) ;
tabulate<<<1,1>>>(0.0, 5.0, f1, dev_res);
cudaMemcpy(&res, dev_res, sizeof(float), cudaMemcpyDeviceToHost);
printf("%f\n", res);
/************************************************************************/
scanf("%s");
return 0;
}
To get rid of your compile error, you'll have to use -gencode arch=compute_20,code=sm_20 as a compiler argument when compiling your code. But then you'll likely have some runtime problems:
Taken from the CUDA Programming Guide http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#functions
Function pointers to __global__ functions are supported in host code, but not in device code.
Function pointers to __device__ functions are only supported in device code compiled for devices of compute capability 2.x and higher.
It is not allowed to take the address of a __device__ function in host code.
so you can have something like this (adapted from the "FunctionPointers" sample):
//your function pointer type - returns unsigned char, takes parameters of type unsigned char and float
typedef unsigned char(*pointFunction_t)(unsigned char, float);
//some device function to be pointed to
__device__ unsigned char
Threshold(unsigned char in, float thresh)
{
...
}
//pComputeThreshold is a device-side function pointer to your __device__ function
__device__ pointFunction_t pComputeThreshold = Threshold;
//the host-side function pointer to your __device__ function
pointFunction_t h_pointFunction;
//in host code: copy the function pointers to their host equivalent
cudaMemcpyFromSymbol(&h_pointFunction, pComputeThreshold, sizeof(pointFunction_t))
You can then pass the h_pointFunction as a parameter to your kernel, which can use it to call your __device__ function.
//your kernel taking your __device__ function pointer as a parameter
__global__ void kernel(pointFunction_t pPointOperation)
{
unsigned char tmp;
...
tmp = (*pPointOperation)(tmp, 150.0)
...
}
//invoke the kernel in host code, passing in your host-side __device__ function pointer
kernel<<<...>>>(h_pointFunction);
Hopefully that made some sense. In all, it looks like you would have to change your f1 function to be a __device__ function and follow a similar procedure (the typedefs aren't necessary, but they do make the code nicer) to get it as a valid function pointer on the host-side to pass to your kernel. I'd also advise giving the FunctionPointers CUDA sample a look over
Even though you may be able to compile this code (see #Robert Crovella's answer) this code will not work. You cannot pass function pointers from host code as the host compiler has no way of figuring out the function address.
Here is a simple class for function pointers that are callable from within a kernel I wrote based on this question:
template <typename T>
struct cudaCallableFunctionPointer
{
public:
cudaCallableFunctionPointer(T* f_)
{
T* host_ptr = (T*)malloc(sizeof(T));
cudaMalloc((void**)&ptr, sizeof(T));
cudaMemcpyFromSymbol(host_ptr, *f_, sizeof(T));
cudaMemcpy(ptr, host_ptr, sizeof(T), cudaMemcpyHostToDevice);
cudaFree(host_ptr)
}
~cudaCallableFunctionPointer()
{
cudaFree(ptr);
}
T* ptr;
};
you could use it like this:
__device__ double func1(double x)
{
return x + 1.0f;
}
typedef double (*func)(double x);
__device__ func f_ = func1;
__global__ void test_kernel(func* f)
{
double x = (*f)(2.0);
printf("%g\n", x);
}
int main()
{
cudaCallableFunctionPointer<func> f(&f_);
test_kernel << < 1, 1 >> > (f.ptr);
}
output:
3

use host function on device

How can I use a host function in a device one ?
For example in below function ,I want to return a value
__device__ float magnitude2( void ) {
return r * r + i * i;
}
But this function is a device function and I received this error :
calling a host function from a __device__/__global__ function is not allowed
What's the best approach for this problem ?
for extra comment on the code :
I want to define this struct :
struct cuComplex {
float r;
float i;
cuComplex( float a, float b ) : r(a), i(b) {}
__device__ float magnitude2( void ) {
return r * r + i * i;
}
__device__ cuComplex operator*(const cuComplex& a) {
return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
}
__device__ cuComplex operator+(const cuComplex& a) {
return cuComplex(r+a.r, i+a.i);
}
};
Now that we know the question involves a C++ structure, the answer is obvious - the constructor of the class must also be available as a __device__ function in order to be able to instantiate the class inside a kernel. In your example, the structure should be defined like this:
struct cuComplex {
float r;
float i;
__device__ __host__
cuComplex( float a, float b ) : r(a), i(b) {}
__device__
float magnitude2( void ) {
return r * r + i * i;
}
__device__
cuComplex operator*(const cuComplex& a) {
return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
}
__device__
cuComplex operator+(const cuComplex& a) {
return cuComplex(r+a.r, i+a.i);
}
};
The error you are seeing arises because the constructor needs to be called whenever the class is instantiated. In your original code, the constructor is a declared only as a host function, leading to a compilation error.