use host function on device - cuda

How can I use a host function in a device one?
For example, in the function below, I want to return a value:
__device__ float magnitude2( void ) {
    return r * r + i * i;
}
But this function is a device function, and I received this error:
calling a host function from a __device__/__global__ function is not allowed
What's the best approach for this problem?
For extra context on the code: I want to define this struct:
struct cuComplex {
    float r;
    float i;
    cuComplex( float a, float b ) : r(a), i(b) {}
    __device__ float magnitude2( void ) {
        return r * r + i * i;
    }
    __device__ cuComplex operator*(const cuComplex& a) {
        return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
    }
    __device__ cuComplex operator+(const cuComplex& a) {
        return cuComplex(r+a.r, i+a.i);
    }
};

Now that we know the question involves a C++ structure, the answer is clear: the constructor of the class must also be available as a __device__ function in order to be able to instantiate the class inside a kernel. In your example, the structure should be defined like this:
struct cuComplex {
    float r;
    float i;
    __device__ __host__
    cuComplex( float a, float b ) : r(a), i(b) {}
    __device__
    float magnitude2( void ) {
        return r * r + i * i;
    }
    __device__
    cuComplex operator*(const cuComplex& a) {
        return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
    }
    __device__
    cuComplex operator+(const cuComplex& a) {
        return cuComplex(r+a.r, i+a.i);
    }
};
The error you are seeing arises because the constructor must be called whenever the class is instantiated. In your original code, the constructor is declared only as a host function, leading to a compilation error when it is used in device code.
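For illustration, here is a minimal sketch (the kernel name and launch configuration are mine, not from the question) showing that the decorated constructor now works in both contexts:

// a minimal kernel that instantiates cuComplex on the device
__global__ void magKernel(float* out) {
    cuComplex c(3.0f, 4.0f);   // OK: constructor is __device__ __host__
    *out = c.magnitude2();     // OK: magnitude2 is __device__
}

int main() {
    cuComplex h(1.0f, 2.0f);   // still constructible on the host
    float result, *d_out;
    cudaMalloc((void**)&d_out, sizeof(float));
    magKernel<<<1, 1>>>(d_out);
    cudaMemcpy(&result, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    // result == 25.0f
    return 0;
}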

Related

CUDA thrust device pointer with transform copy crash

In CUDA 9.2 I have something like this:
#ifdef __CUDA_ARCH__
struct Context { float n[4]; } context;
#else
typedef __m128 Context;
#endif

struct A { float k[2]; };
struct B { float q[4]; };

struct FTransform : thrust::unary_function<A, B>
{
    const Context context;
    FTransform(Context context) : context(context){}
    __device__ __host__ B operator()(const A& a) const
    {
        B b{{a.k[0], a.k[1], a.k[0]*context.n[0], a.k[1]*context.n[1]}};
        return b;
    }
};

void DoThrust(B* _bs, const Context& context, A* _as, uint32_t count)
{
    thrust::device_ptr<B> bs = thrust::device_pointer_cast(_bs);
    thrust::device_ptr<A> as = thrust::device_pointer_cast(_as);
    FTransform fTransform(context);
    auto first = thrust::make_transform_iterator(as, fTransform);
    auto last = thrust::make_transform_iterator(as + count, fTransform);
    thrust::copy(first, last, bs);
}

int main(int c, char **argv)
{
    const uint32_t Count = 4;
    Context context;
    A* as;
    B* bs;
    cudaMalloc(&as, Count*sizeof(A));
    cudaMalloc(&bs, Count*sizeof(B));
    A hostAs[Count];
    cudaMemcpy(as, hostAs, Count * sizeof(A), cudaMemcpyHostToDevice);
    DoThrust(bs, context, as, Count);
    B hostBs[Count];
    cudaMemcpy(hostBs, bs, Count * sizeof(B), cudaMemcpyDeviceToHost); // crash
    return 0;
}
Then when I later call a standard cudaMemcpy() on the results, I get the error "an illegal memory access was encountered".
If I replace the Thrust code with a non-Thrust equivalent, there is no error and everything works fine. With various combinations of trying to copy to device_vectors etc., I get different crashes that seem to be Thrust trying to release the device_ptrs for some reason; so maybe the problem lies there?
== UPDATE ==
OK, that was confusing: it appears it's due to the FTransform functor's context member variable in my actual, more complicated case. Specifically this:
struct FTransform : thrust::unary_function<A, B>
{
#ifdef __CUDA_ARCH__
    struct Context { float v[4]; } context;
#else
    __m128 context;
#endif
    ...
};
So I guessed it's an alignment problem somehow, and in fact it is, as this works:
#ifdef __CUDA_ARCH__
    struct __align__(16) Context { float v[4]; } context;
#else
    __m128 context;
#endif
The solution is to ensure that if you use aligned types as Thrust functor members (such as __m128 SSE types) that are copied to the GPU, they are declared as aligned during both NVCC's CPU and GPU code build passes. Do not assume that a type which seems to naturally align with its equivalent in the other pass will be OK, as otherwise bad, hard-to-understand things may happen.
So, for example, the __align__(16) is necessary in code like this:
struct FTransform : thrust::unary_function<A, B>
{
#ifdef __CUDA_ARCH__
    struct __align__(16) Context { float v[4]; } context;
#else
    __m128 context;
#endif
    FTransform(Context context) : context(context){}
    __device__ __host__ B operator()(const A& a) const; // function makes use of context
};

calling Thrust device_vector from a device function

I have a struct Cap inside which I have a thrust::device_vector of another structure. When I compile the code, I get an error complaining about calling a host function (the thrust::device_vector<FloatIntPair> constructor) from a device function, SphericalFaceManager::makeCaps. When I add __host__ __device__ instead of only __device__ to the member functions and constructors, the code then compiles, but I receive a warning with the same message as the aforementioned error, and I think it copies data between host and device. My question is: how can I access device vectors in my classes while avoiding any data transfer between CPU and GPU?
Here is the code:
struct ParticleID {
    Int solver;
    Int ngb;
    Int oldNgb;
    LLInt no;
    LLInt masterNo;
    __device__ ParticleID() {
        solver = -8;
        ngb = 0;
        oldNgb = 0;
        no = 0;
        masterNo = -1;
    }
};

struct BaseParticle {
    Float h;
    Float3 pos;
    ParticleID id;
    __device__ BaseParticle(const Float3& _pos, const Float& _h, const ParticleID& _id) :
        h(_h), pos(_pos), id(_id) { }
};

struct FloatIntPair{
    Float first;
    Int second;
    __device__ FloatIntPair(const Float& _first, Int _second) : first(_first), second(_second) { }
    __device__ FloatIntPair(const FloatIntPair& sample) : first(sample.first), second(sample.second) { }
    static struct {
        __device__ bool operator()(const FloatIntPair& a, const FloatIntPair& b) { return a.first < b.first; }
    } LessOp;
};

struct Cap {
    Float3 eX;
    Float3 eY;
    Float radius;
    Float height;
    Float3 center;
    Float3 normal;
    BaseParticle* aP;
    BaseParticle* bP;
    thrust::device_vector<FloatIntPair> vertices; // the ordered list of vertices generated from intersections by other circles
    __device__ inline Float findAngle(const Float3& vertex) const {
        Float result;
        Float3 r = (vertex - center);
        result = atan2(r|eY,r|eX);
        return result += (result < 0.0) * (2.0 * _PI);
    }
    __device__ void insertVertex(const Float3& vertex, Int id) {
        Float theta;
        if (!vertices.empty())
            theta = findAngle(vertex);
        else {
            eX = normalVec(vertex - center);
            eY = normal ^ eX;
            theta = 0.0;
        }
        vertices.push_back(FloatIntPair(theta,id));
    }
    __device__ Cap(BaseParticle* _aP, BaseParticle* _bP) : aP(_aP), bP(_bP) {
        // compute normal, center, radius
        Float d = mag(bP->pos - aP->pos);
        if (d == 0.0) {
            normal = Vector1(0.0);
            center = aP->pos;
            radius = height = 0.0;
        } else {
            normal = (bP->pos - aP->pos) / d;
            Float x = (d * d - bP->h * bP->h + aP->h * aP->h) / (2.0 * d);
            center = aP->pos + normal * x;
            if (x >= aP->h) {
                radius = height = 0.0;
                return;
            }
            radius = sqrt(aP->h * aP->h - x * x);
            height = min(2.0 * aP->h, aP->h - x);
            Float3 vec001 = Vector(0.0,0.0,1.0);
            Float3 vec011 = Vector(0.0,1.0,1.0);
            eX = normalVec(vec001 ^ normal);
            if (mag2(eX) < geoEps()) {
                eX = normalVec(vec011 ^ normal);
            }
            eY = normal ^ eX;
        }
    }
};

class SphericalFaceManager {
    BaseParticle* particle;
    Int baseSigma;
public:
    thrust::device_vector<Cap> caps;
    thrust::device_vector<Float3> vertexPool;
    __device__ void makeCaps();
};

__device__ void SphericalFaceManager::makeCaps() {
    BaseParticle* aP;
    BaseParticle* bP;
    Cap aCap(aP,bP);
}
You cannot use thrust vectors (or std::vector) directly in device code. This is mentioned in various other SO questions such as here
If you want to use the data in a thrust::device_vector in device code, you should pass a pointer to the data as a functor initializing parameter. Various other SO questions give examples of this, such as here
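As a minimal sketch of that pattern (the functor name VertexOp and its body are illustrative, not from the question):

// functor that receives a raw pointer into the device_vector's storage
struct VertexOp {
    Float3* verts;                       // points at vertexPool's device data
    VertexOp(Float3* v) : verts(v) {}
    __device__ void operator()(int i) {
        /* read/write verts[i] here */
    }
};

// host code: extract the raw device pointer and hand it to the functor
thrust::device_vector<Float3> vertexPool(1024);
VertexOp op(thrust::raw_pointer_cast(vertexPool.data()));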
Likewise, you cannot use vector methods, e.g. .empty() or .push_back() in device code.
You will need to replace these with ordinary C-style allocators and C-style indexed data access.
For a multi-threaded implementation of push_back in device code, I would recommend something like this. That is a fully worked example that demonstrates how to allocate space for the vector and how each thread can use it for insertVertex for example.
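The core idea is a sketch like the following (names such as pool and poolCount are illustrative; the storage must be preallocated with cudaMalloc from the host):

__device__ FloatIntPair* pool;   // preallocated device storage
__device__ int poolCount;        // number of elements currently in use

__device__ void devicePushBack(const FloatIntPair& v) {
    int idx = atomicAdd(&poolCount, 1); // each thread reserves a unique slot
    pool[idx] = v;                      // race-free write into the reserved slot
}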

CUDA function pointers

I was trying to make something like this (actually, I need to write some integration functions) in CUDA:
#include <iostream>
using namespace std;

float f1(float x) {
    return x * x;
}

float f2(float x) {
    return x;
}

void tabulate(float p_f(float)) {
    for (int i = 0; i != 10; ++i) {
        std::cout << p_f(i) << ' ';
    }
    std::cout << std::endl;
}

int main() {
    tabulate(f1);
    tabulate(f2);
    return 0;
}
output:
0 1 4 9 16 25 36 49 64 81
0 1 2 3 4 5 6 7 8 9
I tried the following but only got this error:
Error: Function pointers and function template parameters are not supported in sm_1x.
float f1(float x) {
    return x;
}

__global__ void tabulate(float lower, float upper, float p_function(float), float* result) {
    for (lower; lower < upper; lower++) {
        *result = *result + p_function(lower);
    }
}

int main() {
    float res;
    float* dev_res;
    cudaMalloc( (void**)&dev_res, sizeof(float) );
    tabulate<<<1,1>>>(0.0, 5.0, f1, dev_res);
    cudaMemcpy(&res, dev_res, sizeof(float), cudaMemcpyDeviceToHost);
    printf("%f\n", res);
    scanf("%s");
    return 0;
}
To get rid of your compile error, you'll have to use -gencode arch=compute_20,code=sm_20 as a compiler argument when compiling your code. But then you'll likely have some runtime problems:
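For example, the compile command might look like this (the file name is illustrative):

nvcc -gencode arch=compute_20,code=sm_20 -o tabulate tabulate.cu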
Taken from the CUDA Programming Guide http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#functions
Function pointers to __global__ functions are supported in host code, but not in device code.
Function pointers to __device__ functions are only supported in device code compiled for devices of compute capability 2.x and higher.
It is not allowed to take the address of a __device__ function in host code.
So you can have something like this (adapted from the "FunctionPointers" sample):
// your function pointer type - returns unsigned char, takes parameters of type unsigned char and float
typedef unsigned char (*pointFunction_t)(unsigned char, float);

// some device function to be pointed to
__device__ unsigned char Threshold(unsigned char in, float thresh)
{
    ...
}

// pComputeThreshold is a device-side function pointer to your __device__ function
__device__ pointFunction_t pComputeThreshold = Threshold;

// the host-side function pointer to your __device__ function
pointFunction_t h_pointFunction;

// in host code: copy the function pointers to their host equivalent
cudaMemcpyFromSymbol(&h_pointFunction, pComputeThreshold, sizeof(pointFunction_t));
You can then pass the h_pointFunction as a parameter to your kernel, which can use it to call your __device__ function.
// your kernel taking your __device__ function pointer as a parameter
__global__ void kernel(pointFunction_t pPointOperation)
{
    unsigned char tmp;
    ...
    tmp = (*pPointOperation)(tmp, 150.0);
    ...
}

// invoke the kernel in host code, passing in your host-side __device__ function pointer
kernel<<<...>>>(h_pointFunction);
Hopefully that made some sense. In all, it looks like you would have to change your f1 function to be a __device__ function and follow a similar procedure (the typedefs aren't necessary, but they do make the code nicer) to get a valid function pointer on the host side to pass to your kernel. I'd also advise giving the FunctionPointers CUDA sample a look over.
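Applied to the question's f1/tabulate, the procedure might look like this sketch (assuming compilation for compute capability 2.0 or later; error checking omitted):

#include <cstdio>

typedef float (*func_t)(float);

__device__ float f1(float x) { return x * x; }

// device-side pointer to the __device__ function, initialized at compile time
__device__ func_t d_f1 = f1;

__global__ void tabulate(float lower, float upper, func_t f, float* result) {
    for (float x = lower; x < upper; x++)
        *result += f(x);
}

int main() {
    float res = 0.0f, *dev_res;
    cudaMalloc((void**)&dev_res, sizeof(float));
    cudaMemcpy(dev_res, &res, sizeof(float), cudaMemcpyHostToDevice);

    func_t h_f1;  // host-side copy of the device function pointer
    cudaMemcpyFromSymbol(&h_f1, d_f1, sizeof(func_t));

    tabulate<<<1, 1>>>(0.0f, 5.0f, h_f1, dev_res);
    cudaMemcpy(&res, dev_res, sizeof(float), cudaMemcpyDeviceToHost);
    printf("%f\n", res);  // 0 + 1 + 4 + 9 + 16 = 30
    return 0;
}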
Even though you may be able to compile this code (see @Robert Crovella's answer), it will not work. You cannot pass function pointers from host code, as the host compiler has no way of figuring out the device function's address.
Here is a simple class I wrote, based on this question, for function pointers that are callable from within a kernel:
template <typename T>
struct cudaCallableFunctionPointer
{
public:
    cudaCallableFunctionPointer(T* f_)
    {
        T* host_ptr = (T*)malloc(sizeof(T));
        cudaMalloc((void**)&ptr, sizeof(T));
        cudaMemcpyFromSymbol(host_ptr, *f_, sizeof(T));
        cudaMemcpy(ptr, host_ptr, sizeof(T), cudaMemcpyHostToDevice);
        free(host_ptr); // host_ptr came from malloc, so free it (not cudaFree)
    }

    ~cudaCallableFunctionPointer()
    {
        cudaFree(ptr);
    }

    T* ptr;
};
You could use it like this:
__device__ double func1(double x)
{
    return x + 1.0;
}

typedef double (*func)(double x);
__device__ func f_ = func1;

__global__ void test_kernel(func* f)
{
    double x = (*f)(2.0);
    printf("%g\n", x);
}

int main()
{
    cudaCallableFunctionPointer<func> f(&f_);
    test_kernel<<<1, 1>>>(f.ptr);
    cudaDeviceSynchronize(); // flush device-side printf output before exiting
}
output:
3

Elementwise power operation using CUDA Thrust

Is there a way of transforming a Thrust vector with a pow function? In other words, I want to transform each element x of a vector to pow(x,a), where a is a constant.
Please refer to the Transformations section in the Thrust Quick Start Guide for how to write a functor with initialized parameters.
struct saxpy_functor
{
    const float a;

    saxpy_functor(float _a) : a(_a) {}

    __host__ __device__
    float operator()(const float& x, const float& y) const {
        return a * x + y;
    }
};
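For reference, applying such a two-argument functor with thrust::transform looks like this (a sketch; X and Y are assumed to be thrust::device_vector<float> of equal length):

// computes Y <- a * X + Y elementwise
thrust::transform(X.begin(), X.end(), Y.begin(), Y.begin(), saxpy_functor(a));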
Here is a full example. As @Eric has mentioned, all that is needed is defining your own power functor and using thrust::transform.
#include <stdio.h>
#include <thrust/sequence.h>
#include <thrust/device_vector.h>

class power_functor {
    double a;
public:
    power_functor(double a_) { a = a_; }
    __host__ __device__ double operator()(double x) const
    {
        return pow(x,a);
    }
};

int main() {
    int N = 20;
    thrust::device_vector<double> d_n(N);
    thrust::sequence(d_n.begin(), d_n.end());
    thrust::transform(d_n.begin(),d_n.end(),d_n.begin(),power_functor(2.));
    for (int i=0; i<N; i++) {
        double val = d_n[i];
        printf("Device vector element number %i equal to %f\n",i,val);
    }
    getchar();
    return 0;
}

CUDA errors: identifier "(global/device)" is undefined, no suitable conversion

I'm doing a project on photon mapping. I coded the raytracer part and it ran successfully on the CPU. Now I'm doing the same on the GPU (through ssh).
I'm getting the following errors:
nvcc -c -lSDL -lGL -lGLU AntTweakBar.a gpuRayTracer.cu
gpuRayTracer.cu(44): error: identifier "raytracer" is undefined
gpuRayTracer.cu(53): error: no suitable conversion function from "Float3" to "void *" exists
gpuRayTracer.cu(55): error: no suitable conversion function from "Float3" to "void *" exists
gpuRayTracer.cu(76): error: identifier "GPUsub" is undefined
gpuRayTracer.cu(77): error: identifier "GPUnormalize" is undefined
gpuRayTracer.cu(78): error: identifier "GPUcross" is undefined
gpuRayTracer.cu(80): error: calling a host function from a __device__/__global__ function is not allowed
gpuRayTracer.cu(90): error: identifier "GPUmul" is undefined
gpuRayTracer.cu(95): error: calling a host function from a __device__/__global__ function is not allowed
gpuRayTracer.cu(95): error: identifier "GPUadd" is undefined
gpuRayTracer.cu(192): error: calling a host function from a __device__/__global__ function is not allowed
15 errors detected in the compilation of "/tmp/tmpxft_0000432c_00000000-4_gpuRayTracer.cpp1.ii".
make: *** [gpuRayTracer.o] Error 2
gpuRayTracer.cu
Lines 44, 53, and 55 (the errors) are marked in the code below.
Float3, used below, is a structure containing three float variables (x, y, z coordinates).
void Scene::GPUrayTracer(){
    Object *d_objectList[OBJ_MAX];
    GLubyte * d_pixels;
    int *d_Width, *d_Height;
    Float3 *d_eye,*d_lookAt;
    int *d_objectCount;
    size_t size1=sizeof(Float3);
    size_t size2=sizeof(int);
    size_t size3=sizeof(GLubyte);
    //size_t size4=sizeof(Object);
    cudaMalloc(&d_eye,size1);
    cudaMalloc(&d_lookAt,size1);
    cudaMemcpy(d_eye,&this->eye,size1,cudaMemcpyHostToDevice);
    cudaMemcpy(d_lookAt,&this->lookAt,size1,cudaMemcpyHostToDevice);
    cudaMalloc(&d_objectCount,size2);
    cudaMemcpy(d_objectCount,&this->objectCount,size2,cudaMemcpyHostToDevice);
    cudaMalloc(&d_Width,size2);
    cudaMalloc(&d_Height,size2);
    cudaMemcpy(d_Width,&this->screenWidth,size2,cudaMemcpyHostToDevice);
    cudaMemcpy(d_Height,&this->screenHeight,size2,cudaMemcpyHostToDevice);
    cudaMalloc(&d_pixels,size3);
    cudaMemcpy(d_pixels,&this->pixels,size3,cudaMemcpyHostToDevice);
    cudaMalloc((void **)&d_objectList, (sizeof(this->objectList)));
    cudaMemcpy(d_objectList, &this->objectList, sizeof(this->objectList),cudaMemcpyHostToDevice);
    // line 44:
    raytracer<<<1,500>>>(d_pixels,d_Width,d_Height,d_objectList,d_eye,d_lookAt);
    cudaMemcpy((this->objectList),&d_objectList,sizeof(this->objectList),cudaMemcpyDeviceToHost);
    cudaMemcpy(this->pixels,&d_pixels,size3,cudaMemcpyDeviceToHost);
    cudaMemcpy((int *)this->screenWidth,&d_Width,size2,cudaMemcpyDeviceToHost);
    cudaMemcpy((int *)this->screenHeight,&d_Height,size2,cudaMemcpyDeviceToHost);
    cudaMemcpy((int *)this->objectCount,&d_objectCount,size2,cudaMemcpyDeviceToHost);
    // line 53:
    cudaMemcpy((void *)this->eye, (void *)&d_eye,sizeof(d_eye),cudaMemcpyDeviceToHost);
    // line 55:
    cudaMemcpy(this->lookAt,(void *)&d_lookAt,sizeof(d_lookAt),cudaMemcpyDeviceToHost);
}

__global__ void raytracer( unsigned char *out_data,const int screenWidth,const int screenHeight,Object * objectList,Float3 eye,Float3 lookAt,int objectCount)
{
    int x = blockDim.x * BLOCK_SIZE + threadIdx.x;
    int y = blockDim.y * BLOCK_SIZE + threadIdx.y;
    // code goes here
}

__device__ float GPUffminf(float a, float b){
    if(a<b)
        return a;
    return b;
}

__device__ float GPUffmaxf(float a, float b){
    if(a>b)
        return a;
    return b;
}

__device__ float GPUmag(Float3 a){
    float res;
    res=a.x*a.x+a.y*a.y+a.z*a.z;
    res=sqrt(res);
    return res;
}

__device__ Float3 GPUnormalize(Float3 a){
    Float3 res;
    float magn=mag(a);
    if(magn!=0){
        magn=(float)1.0/magn;
        res.x=a.x*magn;
        res.y=a.y*magn;
        res.z=a.z*magn;
        return res;
    }
    return a;
}

__device__ Float3 GPUcross(Float3 a ,Float3 b){
    Float3 res;
    res.x=a.y*b.z-a.z*b.y;
    res.y=a.z*b.x-a.x*b.z;
    res.z=a.x*b.y-a.y*b.x;
    return res;
}

__device__ float GPUdot(Float3 a,Float3 b){
    return (float)(a.x*b.x + a.y*b.y + a.z*b.z);
}

__device__ Float3 GPUsub(Float3 a,Float3 b){
    Float3 res;
    res.x=a.x-b.x;
    res.y=a.y-b.y;
    res.z=a.z-b.z;
    return res;
}

__device__ Float3 GPUadd(Float3 a,Float3 b){
    Float3 res;
    res.x=a.x+b.x;
    res.y=a.y+b.y;
    res.z=a.z+b.z;
    return res;
}

__device__ Float3 GPUmul(Float3 a,float b){
    Float3 res;
    res.x=a.x*b;
    res.y=a.y*b;
    res.z=a.z*b;
    return res;
}
What's wrong in the code?
Apart from this, I have a few questions:
* Does the order in which .cu/.cpp files are compiled matter?
* Should the kernel be invoked only from main.cpp?
* If so, should a .cu file consist of only __global__/__device__ functions?
Okay, first of all, you can put any C/C++ function in a .cu file, not just __global__/__device__ functions. Nor does the order of compilation matter.
For this error: no suitable conversion function from "Float3" to "void *" exists, you need to use a (void**) cast instead of (void*).
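In Scene::GPUrayTracer that looks something like this (a sketch; note that the copies at lines 53 and 55 also need the address-of operator, since a Float3 itself cannot be cast to a pointer):

cudaMalloc((void**)&d_eye, size1);
cudaMalloc((void**)&d_lookAt, size1);
cudaMemcpy(&this->eye, d_eye, size1, cudaMemcpyDeviceToHost);       // line 53, fixed
cudaMemcpy(&this->lookAt, d_lookAt, size1, cudaMemcpyDeviceToHost); // line 55, fixed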
For errors like gpuRayTracer.cu(76): error: identifier "GPUsub" is undefined, you need to define the GPUsub function before the functions that call it in the .cu file. Just move the function definitions to the top of the file.
For errors like calling a host function from a __device__/__global__ function is not allowed: you can't call any function that executes on the CPU (one without a __device__ or __global__ qualifier) from a __device__ or __global__ function.
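For example, GPUnormalize above calls mag(a), which is presumably a CPU-side function; it should call the __device__ helper instead (a sketch of the likely fix):

// inside GPUnormalize: call the __device__ helper, not the CPU-side mag()
float magn = GPUmag(a);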
Here's what you need to do to make life easy: define each function in a separate .cu file and use header files for their declarations. You should have one HOST function that executes the whole pipeline; it can call GPU as well as CPU functions.
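For instance, a small header (a hypothetical gpuMath.cuh) declaring the device helpers removes the definition-order problem; note that splitting __device__ functions across .cu files requires separate compilation (nvcc -rdc=true) on newer toolchains:

// gpuMath.cuh - forward declarations of the __device__ helpers
__device__ float  GPUmag(Float3 a);
__device__ float  GPUdot(Float3 a, Float3 b);
__device__ Float3 GPUadd(Float3 a, Float3 b);
__device__ Float3 GPUsub(Float3 a, Float3 b);
__device__ Float3 GPUmul(Float3 a, float b);
__device__ Float3 GPUcross(Float3 a, Float3 b);
__device__ Float3 GPUnormalize(Float3 a);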