calling Thrust device_vector from a device function - cuda

I have a struct Cap which inside I have a thrust::device_vector of another structure. When I compile the code, I get an error which complains about calling a host function (thrust::device_vector<FloatIntPair>) from a device function SphericalFaceManager::makeCaps. When I add __host__ __device__ instead of only __device__ to the member functions and constructors the code then compiles but I receive a warning same as aforementioned error and I think it copies data between host and device. My question is how can I access to device vectors in my classes avoiding any data transfer between CPU and GPU?
Hereafter you can find the code:
// Identification / bookkeeping record attached to every particle.
struct ParticleID {
Int solver;
Int ngb;
Int oldNgb;
LLInt no;
LLInt masterNo;
// Device-side default constructor; same defaults as before, expressed as a
// member-initializer list instead of body assignments.
__device__ ParticleID() : solver(-8), ngb(0), oldNgb(0), no(0), masterNo(-1) { }
};
// Minimal particle: interaction radius, position, and identification record.
struct BaseParticle {
Float h;        // interaction radius / smoothing length -- TODO confirm
Float3 pos;     // particle position
ParticleID id;  // identity record (see ParticleID)
// Device-only constructor; all members are copy-initialized.
__device__ BaseParticle(const Float3& _pos, const Float& _h, const ParticleID& _id) :
h(_h), pos(_pos), id(_id) { }
};
// (value, id) pair; used below to store a vertex angle together with the id
// of the circle that produced it. LessOp orders pairs by `first`.
struct FloatIntPair{
Float first;
Int second;
__device__ FloatIntPair(const Float& _first, Int _second) : first(_first), second(_second) { }
// NOTE(review): identical to the compiler-generated copy constructor, but
// being __device__-only it also blocks copying in host code -- confirm intended.
__device__ FloatIntPair(const FloatIntPair& sample) : first(sample.first), second(sample.second) { }
// NOTE(review): a static data member of an unnamed struct type cannot be
// given the required out-of-class definition; a named comparator type would
// be more robust.
static struct {
__device__ bool operator()(const FloatIntPair& a, const FloatIntPair& b) { return a.first < b.first; }
} LessOp;
};
// Spherical cap formed by the intersection of two particles' spheres.
// NOTE(review): this is the code the question asks about -- the
// thrust::device_vector member cannot be constructed or used (.empty(),
// .push_back()) inside __device__ functions; device_vector is a host-side
// container. See the accepted advice below: replace it with a raw device
// pointer plus C-style allocation/indexing.
struct Cap {
Float3 eX;      // in-plane basis vector of the cap circle
Float3 eY;      // second in-plane basis vector (normal ^ eX)
Float radius;
Float height;
Float3 center;
Float3 normal;  // unit vector from aP toward bP (zero-distance case aside)
BaseParticle* aP;
BaseParticle* bP;
thrust::device_vector<FloatIntPair> vertices; // The ordered list of vertices generated from intersections by other circles
// Angle of `vertex` around the cap circle, measured from eX in [0, 2*PI).
// `|` appears to be a dot product and `^` a cross product -- TODO confirm
// against the (unseen) vector-type definitions.
__device__ inline Float findAngle(const Float3& vertex) const {
Float result;
Float3 r = (vertex - center);
result = atan2(r|eY,r|eX);
// Branchless wrap: adds 2*PI exactly when atan2 returned a negative angle.
return result += (result < 0.0) * (2.0 * _PI);
}
// Appends a vertex; the first vertex also fixes the in-plane basis (eX, eY)
// and is assigned angle 0 by construction.
__device__ void insertVertex(const Float3& vertex, Int id) {
Float theta;
if (!vertices.empty())
theta = findAngle(vertex);
else {
eX = normalVec(vertex - center);
eY = normal ^ eX;
theta = 0.0;
}
vertices.push_back(FloatIntPair(theta,id));
}
// Computes the cap's normal, center, radius and height from the two spheres.
__device__ Cap(BaseParticle* _aP, BaseParticle* _bP) : aP(_aP), bP(_bP) {
//Compute normal, center, radius
Float d = mag(bP->pos - aP->pos);
if(d == 0.0){
// Coincident centers: degenerate cap.
normal = Vector1(0.0);
center = aP->pos;
radius = height = 0.0;
} else {
normal = (bP->pos - aP->pos) / d;
// Signed distance from aP to the plane of intersection of the two spheres.
Float x = (d * d - bP->h * bP->h + aP->h * aP->h) / (2.0 * d);
center = aP->pos + normal * x;
if (x >= aP->h) {
// Plane beyond aP's sphere: no intersection circle.
radius = height = 0.0;
return;
}
radius = sqrt(aP->h * aP->h - x * x);
height = min(2.0 * aP->h, aP->h - x);
// Pick an in-plane basis; fall back to a second seed vector when the
// first is (nearly) parallel to the normal.
Float3 vec001 = Vector(0.0,0.0,1.0);
Float3 vec011 = Vector(0.0,1.0,1.0);
eX = normalVec(vec001 ^ normal);
if (mag2(eX) < geoEps()) {
// NOTE(review): `eX = eX =` is a redundant double assignment (typo);
// a single `eX =` is intended.
eX = eX = normalVec(vec011 ^ normal);
}
eY = normal ^ eX;
}
}
};
// Owns the collection of caps and their vertex pool.
// NOTE(review): thrust::device_vector members are host-side objects; they
// cannot be used from the __device__ makeCaps() below -- this is the source
// of the compile error discussed in the question.
class SphericalFaceManager {
BaseParticle* particle;
Int baseSigma;
public:
thrust::device_vector<Cap> caps;
thrust::device_vector<Float3> vertexPool;
__device__ void makeCaps();
};
// Minimal repro of the compile error: constructing a Cap (which owns a
// thrust::device_vector) inside a __device__ function.
// NOTE(review): aP and bP are uninitialized pointers -- acceptable for a
// compile-only repro, undefined behavior if actually executed.
__device__ void SphericalFaceManager::makeCaps() {
BaseParticle* aP;
BaseParticle* bP;
Cap aCap(aP,bP);
}

You cannot use thrust vectors (or std::vector) directly in device code. This is mentioned in various other SO questions such as here
If you want to use the data in a thrust::device_vector in device code, you should pass a pointer to the data as a functor initializing parameter. Various other SO questions give examples of this, such as here
Likewise, you cannot use vector methods, e.g. .empty() or .push_back() in device code.
You will need to replace these with ordinary C-style allocators and C-style indexed data access.
For a multi-threaded implementation of push_back in device code, I would recommend something like this. That is a fully worked example that demonstrates how to allocate space for the vector and how each thread can use it for insertVertex for example.

Related

CUDA (from C++) Hyperbolic Trig Functions Calculate Different Results in Different Locations

I'm running into an odd issue with a simulation I wrote. I recently restructured my code to make things cleaner and more organized. Basically (among other things) I moved (basically copy-pasted) the CUDA function in question to another file. This function uses asinh to compute something, as well as sinh and cosh. What I've noticed is that before the move, the function produced expected results consistent with hand calculated values (in excel). After the move, the hyperbolic functions are fed the same inputs, yet the results are significantly different (up to 10% in asinh, 0.5% in sinh). This effectively breaks my simulation. I am confident in the rest of the function.
EDIT:
Upon further testing, I've found hard-coding values for the angle (lambdaDegrees) in question - namely double x{ asinh(sqrt(3.0) * sin(lambdaDegrees * 3.1415927 / 180.0)) }; - produces the (good) expected results. Measuring the angle before and after the equation is executed, the angle is unchanged, yet without hard-coding the value, it produces the wrong results. The weirdest part is simply adding another diagnostic printf function caused the function to produce yet another (wrong) result. I'm wondering if it has anything to do with the way I've set up a callback function on the GPU...maybe multiple threads using the function at the same time leading to some (consistent) undefined behavior?
After a bit of screwing around with the code, I reproduced the error. Expected value of x within getSAtLambda (the printf statement) is 1.268... Result is 1.768... Let me know what you think.
main.cu
//CUDA includes
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_profiler_api.h"
typedef double(*callbackFcn)(double*, int, double, double, int);
//on GPU global variables
extern __device__ double* fieldConstArray_GPU;
extern __device__ int arraySize_GPU;
extern __device__ callbackFcn callback_GPU;
__host__ __device__ double BFieldAtS(double* consts, int arrayLength, double s, double simtime, int thdInd);
__host__ __device__ double gradBAtS(double* consts, int arrayLength, double s, double simtime, int thdInd);
__global__ void setupEnvironmentGPU(double* constArrayPtr);
// Repro kernel: every thread invokes the field callback through the
// __device__ function pointer callback_GPU (the indirect call is part of the
// conditions under which the wrong asinh results appeared; see summary below).
__global__ void execute()
{
int thdInd{ blockIdx.x * blockDim.x + threadIdx.x };
// Thread 31487 gets a hard-coded s value; all others get a ramp.
callback_GPU(fieldConstArray_GPU, arraySize_GPU, (thdInd == 31487) ? 1233005.097 : ((115200 - thdInd) / 50000.0 * 6.371e6), 0.0, thdInd ); //3rd argument are example values
}
// Copies the field-constant table to the device, then runs a single-thread
// setup kernel that points the device-side globals at it.
// consts layout: [ B0, ILATDeg, L, L_norm, s_max ]
void setupEnvironment()
{
    const double hostConsts[]{ 3.12e-5, 72.0, 66717978.17, 10.47213595, 85670894.1 };
    double* devConsts{ nullptr };
    cudaMalloc((void **)&devConsts, sizeof(hostConsts));
    cudaMemcpy(devConsts, hostConsts, sizeof(hostConsts), cudaMemcpyHostToDevice);
    setupEnvironmentGPU<<<1, 1>>>(devConsts);
}
// Runs three rounds of the callback-driven kernel after device setup.
int main()
{
    setupEnvironment();
    for (int loops = 0; loops < 3; ++loops)
    {
        execute<<<115200 / 256, 256>>>();
        cudaDeviceSynchronize();
    }
    return 0;
}
otherfunctions.cu
#include <cmath>
#include <iostream>
//CUDA includes
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_profiler_api.h"
// Signature shared by all field-evaluation callbacks:
// (consts, arrayLength, s, simtime, threadIndex) -> field value.
typedef double(*callbackFcn)(double*, int, double, double, int);
// Device-side globals; filled in by setupEnvironmentGPU before use.
__device__ double* fieldConstArray_GPU{ nullptr };
__device__ int arraySize_GPU{ 7 };
__device__ callbackFcn callback_GPU{ nullptr };
// Arc length s along a field line at latitude lambdaDegrees, in units of L
// -- TODO confirm physical interpretation against the model's derivation.
// consts: [ B0, ILATDeg, L, L_norm, s_max ]; only consts[2] (L) is read here.
// NOTE(review): this is the function whose asinh result was wrong on CUDA 8.0
// with -O2 when reached through a __device__ function pointer; CUDA 9.1
// fixed it (see the findings summary below).
__host__ __device__ double getSAtLambda(double* consts, int arrayLength, double lambdaDegrees, double simtime, int thdInd)
{//returns s in units of L
double x{ asinh(sqrt(3.0) * sin(lambdaDegrees * 3.1415927 / 180.0)) };
// Diagnostic print for a single thread at simtime 0 only.
if (simtime == 0.0 && thdInd == 31487) { printf("\n\ngetSAtLambda: %f, %f\n\n", lambdaDegrees, x); }
return (0.5 * consts[2] / sqrt(3.0)) * (x + sinh(x) * cosh(x));
}
// Inverts getSAtLambda numerically: finds the latitude lambda whose arc
// length matches s, by stepping lambda with a shrinking step dlambda
// (oscillate across the target, then refine the step by /5).
// consts: [ B0, ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
__host__ __device__ double getLambdaAtS(double* consts, int arrayLength, double s, double simtime, int thdInd)
{// consts: [ B0, ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
// Linear first guess: lambda falls from ILAT to 0 as s goes 0..s_max.
double lambda_tmp{ (-consts[1] / consts[4]) * s + consts[1] }; //-ILAT / s_max * s + ILAT
double s_tmp{ consts[4] - getSAtLambda(consts, arrayLength, lambda_tmp, simtime, thdInd) };
double dlambda{ 1.0 };
bool over{ 0 };
// Outer loop: refine until relative error in s is within tolerance.
while (abs((s_tmp - s) / s) > 1e-4) //errorTolerance
{
// Inner loop: march lambda in fixed steps until s_tmp crosses s.
while (1)
{
over = (s_tmp >= s);
if (over)
{
lambda_tmp += dlambda;
s_tmp = consts[4] - getSAtLambda(consts, arrayLength, lambda_tmp, simtime, 0);
if (s_tmp < s)
break;
}
else
{
lambda_tmp -= dlambda;
s_tmp = consts[4] - getSAtLambda(consts, arrayLength, lambda_tmp, simtime, 0);
if (s_tmp >= s)
break;
}
}
// Stop once the step itself is below the tolerance floor.
if (dlambda < 1e-4 / 100.0) //errorTolerance
break;
dlambda /= 5.0; //through trial and error, this reduces the number of calculations usually (compared with 2, 2.5, 3, 4, 10)
}
return lambda_tmp;
}
// Field magnitude at arc length s: converts s to latitude via the numeric
// inverse, then evaluates a dipole-like formula -- TODO confirm model
// details against the source derivation.
// consts: [ B0, ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
__host__ __device__ double BFieldAtS(double* consts, int arrayLength, double s, double simtime, int thdInd)
{// consts: [ B0, ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
double lambda_deg{ getLambdaAtS(consts, arrayLength, s, simtime, thdInd) };
double lambda_rad{ lambda_deg * 3.1415927 / 180.0 };
double rnorm{ consts[3] * pow(cos(lambda_rad), 2) };
return -consts[0] / pow(rnorm, 3) * sqrt(1.0 + 3 * pow(sin(lambda_rad), 2));
}
// Central-difference estimate of dB/ds with step size consts[5] (ds).
__host__ __device__ double gradBAtS(double* consts, int arrayLength, double s, double simtime, int thdInd)
{
return (BFieldAtS(consts, arrayLength, s + consts[5], simtime, thdInd) - BFieldAtS(consts, arrayLength, s - consts[5], simtime, thdInd)) / (2 * consts[5]);
}
// One-thread setup kernel: taking the address of a __device__ function is
// only valid in device code, so the callback pointer is assigned here.
__global__ void setupEnvironmentGPU(double* constArrayPtr)
{
callback_GPU = gradBAtS; //sets pointer to callback function
arraySize_GPU = 7;
fieldConstArray_GPU = constArrayPtr;
}
A summary of my findings:
On Cuda 8.0:
Correct results are produced when the code above:
Is compiled as debug instead of release (except for -O1)
When a trig identity of asinh is used instead of the actual asinh function
When the argument for asinh is hardcoded
With -O1 instead of -O2 for both release and debug
(Paradoxically) When the function getSAtLambda is called directly instead of through a function pointer
Incorrect results are produced for asinh(x) when:
Compiled as release with -O2 with a non-hardcoded value through a function pointer
Updating to CUDA 9.1 fixed the issue.

Passing Host Function as a function pointer in __global__ OR __device__ function in CUDA

I am currently developing a GPU version of a CPU function
(e.g. function Calc(int a, int b, double* c, double* d, CalcInvFunction GetInv)), in which a host function is passed as a function pointer (e.g. in the above example, GetInv is a host function of type CalcInvFunction). My question is: if I have to put the Calc() function entirely on the GPU, I have to pass the GetInv function as a function pointer argument to a device/kernel function — is that possible?
Yes, for a GPU implementation of Calc, you should pass the GetInv as a __device__ function pointer.
It is possible, here are some worked examples:
Ex. 1
Ex. 2
Ex. 3
Most of the above examples demonstrate bringing the device function pointer all the way back to the host code. This may not be necessary for your particular case. But it should be fairly obvious from above how to grab a __device__ function pointer (in device code) and use it in a kernel.
Finally, I have been able to pass a host function as a function pointer to a CUDA kernel function (__global__ function). Thanks to Robert Crovella and njuffa for the answer. I have been able to pass a class member function (CPU function) as a function pointer to a CUDA kernel. But the main problem is that I can only pass static class member functions; I am not able to pass a function that is not declared static.
For Example:
/**/
__host__ __device__
static int
CellfunPtr(
void*ptr, int a
);
/**/
The above function work because this member function is declared as static member function. If i do not declare this member function as a static member as ,
/**/
__host__ __device__
int
CellfunPtr(
void*ptr, int a
);
/**/
then it doesn't work.
The complete code has four files.
First file
/*start of fundef.h file*/
// Callback signature: (object pointer, integer argument) -> int. A plain
// function-pointer type, which is why only static member functions fit it.
typedef int (*pFunc_t)(void* ptr, int N);
/*end of fundef.h file*/
Second file
/*start of solver.h file*/
// Per-mesh solver state. CellfunPtr is declared static so its address has
// plain function-pointer type (pFunc_t) and can be passed to a kernel; a
// non-static member function pointer has a different, incompatible type
// (int (CalcVars::*)(void*, int)) -- this is the limitation discussed above.
class CalcVars {
int eqnCount;
int numCell;
int numTri;
int numTet;
public:
double* cellVel;  // allocated in the constructor, freed in the destructor
double* cellPre;  // allocated in the constructor, freed in the destructor
/** Constructor */
CalcVars(
const int eqnCount_,
const int numCell_,
const int numTri_,
const int numTet_
);
/** Destructor */
~CalcVars(void);
public:
void
CalcAdv();
__host__ __device__
static int
CellfunPtr(
void*ptr, int a
);
};
/*end of solver.h file*/
Third file
/*start of solver.cu file*/
#include "solver.h"
// Device-resident pointer to the static member function, initialized at
// module load; copied back to the host below via cudaMemcpyFromSymbol.
__device__ pFunc_t pF1_d = CalcVars::CellfunPtr;
pFunc_t pF1_h ;
// Each thread applies the callback to its own element of a.
__global__ void kernel(int*a, pFunc_t func, void* thisPtr_){
int tid = threadIdx.x;
a[tid] = (*func)(thisPtr_, a[tid]);
};
/* Constructor */
// Stores the mesh sizes and zero-allocates the per-equation arrays.
CalcVars::CalcVars(
const int eqnCount_,
const int numCell_,
const int numTri_,
const int numTet_
)
{
this->eqnCount = eqnCount_;
this->numCell = numCell_;
this->numTri = numTri_;
this->numTet = numTet_; // BUG FIX: was never assigned, leaving numTet uninitialized
this->cellVel = (double*) calloc((size_t) eqnCount, sizeof(double));
this->cellPre = (double*) calloc((size_t) eqnCount, sizeof(double));
}
/* Destructor */
// Releases the host arrays calloc'd in the constructor.
CalcVars::~CalcVars(void)
{
free(this->cellVel);
free(this->cellPre);
}
// Demonstrates calling the static member function through a function pointer
// from a kernel: fills a1 with 0..Num-1, runs the callback on each element
// on the device, and copies the results back.
void
CalcVars::CalcAdv(
){
/*int b1 = 0;
b1 = CellfunPtr(this, 1);*/
int Num = 50;
int *a1, *a1_dev;
a1 = (int *)malloc(Num*sizeof(int));
cudaMalloc((void**)&a1_dev, Num*sizeof(int));
for(int i = 0; i <Num; i++){
a1[i] = i;
}
cudaMemcpy(a1_dev, a1, Num*sizeof(int), cudaMemcpyHostToDevice);
//copy addresses of device functions to host
cudaMemcpyFromSymbol(&pF1_h, pF1_d, sizeof(pFunc_t));
// BUG FIX: launch one thread per element (was <<<1,42>>>, which left
// elements 42..49 of the 50-element array unprocessed).
kernel<<<1,Num>>>(a1_dev, pF1_h, this);
cudaDeviceSynchronize();
cudaMemcpy(a1, a1_dev, Num*sizeof(int), cudaMemcpyDeviceToHost);
// BUG FIX: release both buffers (previously leaked on every call).
cudaFree(a1_dev);
free(a1);
};
// Definition of the static callback; `ptr` lets it recover the owning
// CalcVars instance (see the commented-out cast). Returns a-1.
// NOTE(review): the printf text says "CPU function", but this same code runs
// on the GPU when reached through the kernel's function pointer.
int
CalcVars::CellfunPtr(
void* ptr, int a
){
//CalcVars* ClsPtr = (CalcVars*)ptr;
printf("Printing from CPU function\n");
//int eqn_size = ClsPtr->eqnCount;
//printf("The number is %d",eqn_size);
return a-1;
};
/*end of solver.cu file*/
Fourth file
/*start of main.cpp file*/
#include "solver.h"
// Driver: builds one CalcVars and runs the kernel demo.
int main(){
int n_Eqn, n_cell, n_tri, n_tetra;
n_Eqn = 100;
n_cell = 200;
n_tri = 300;
n_tetra = 400;
CalcVars* calcvars;
calcvars = new CalcVars(n_Eqn, n_cell, n_tri, n_tetra );
calcvars->CalcAdv();
delete calcvars;  // BUG FIX: instance was leaked
system("pause");  // NOTE: Windows-only pause; remove for portable builds
return 0;         // BUG FIX: explicit return (main only implicitly returns 0 in conforming C++, but be explicit)
}
/*end of main.cpp file*/

Elementwise power operation using CUDA Thrust

Is there a way of transforming a thrust vector with a pow function? In other words, I want to transform each element x of a vector to pow(x,a), with a a constant.
Please refer to the Transformations section of the Thrust Quick Start Guide for how to write a functor with initialized parameters.
// Binary functor computing a*x + y (SAXPY); the scale factor `a` is captured
// at construction time -- the standard way to give a Thrust functor
// initialized parameters (this is the Quick Start Guide example).
struct saxpy_functor
{
const float a;
saxpy_functor(float _a) : a(_a) {}
__host__ __device__
float operator()(const float& x, const float& y) const {
return a * x + y;
}
};
Here is a full example. As #Eric has mentioned, all what is needed is defining your own power functor and using thrust::transform.
#include <cstdio>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
// Unary functor raising its argument to the fixed power `a`, which is set
// once at construction; callable from host and device code.
class power_functor {
double a;
public:
power_functor(double a_) : a(a_) {}
__host__ __device__ double operator()(double x) const
{
return pow(x, a);
}
};
// Demonstrates element-wise pow on a device vector via thrust::transform.
int main() {  // BUG FIX: `void main()` is not valid standard C++
int N = 20;
thrust::device_vector<double> d_n(N);
thrust::sequence(d_n.begin(), d_n.end());  // d_n = {0, 1, ..., N-1}
thrust::transform(d_n.begin(),d_n.end(),d_n.begin(),power_functor(2.));
for (int i=0; i<N; i++) {
// Each element read triggers a device-to-host copy (fine for a demo).
double val = d_n[i];
printf("Device vector element number %i equal to %f\n",i,val);
}
getchar();
return 0;
}

use host function on device

How can I use a host function in a device one ?
For example in below function ,I want to return a value
__device__ float magnitude2( void ) {
return r * r + i * i;
}
But this function is a device function and I received this error :
calling a host function from a __device__/__global__ function is not allowed
What's the best approach for this problem ?
for extra comment on the code :
I want to define this struct :
// Complex number type intended for use inside kernels.
// NOTE(review): the constructor below has no execution-space qualifier, so
// it is host-only; operator* and operator+ (both __device__) construct
// cuComplex temporaries, which triggers the "calling a host function from a
// __device__/__global__ function" error -- exactly the bug the answer that
// follows fixes by adding __device__ __host__ to the constructor.
struct cuComplex {
float r;  // real part
float i;  // imaginary part
cuComplex( float a, float b ) : r(a), i(b) {}
// Squared magnitude r^2 + i^2.
__device__ float magnitude2( void ) {
return r * r + i * i;
}
__device__ cuComplex operator*(const cuComplex& a) {
return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
}
__device__ cuComplex operator+(const cuComplex& a) {
return cuComplex(r+a.r, i+a.i);
}
};
Now that we know the question involves a C++ structure, the answer is obvious - the constructor of the class must also be available as a __device__ function in order to be able to instantiate the class inside a kernel. In your example, the structure should be defined like this:
// Corrected version: the constructor is __device__ __host__ so the class can
// be instantiated both in kernels (where the __device__ operators build
// temporaries) and in host code.
struct cuComplex {
float r;  // real part
float i;  // imaginary part
__device__ __host__
cuComplex( float a, float b ) : r(a), i(b) {}
// Squared magnitude r^2 + i^2 (no sqrt needed for escape tests).
__device__
float magnitude2( void ) {
return r * r + i * i;
}
// Complex multiplication; returns a new value, operands unchanged.
__device__
cuComplex operator*(const cuComplex& a) {
return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
}
// Complex addition.
__device__
cuComplex operator+(const cuComplex& a) {
return cuComplex(r+a.r, i+a.i);
}
};
The error you are seeing arises because the constructor needs to be called whenever the class is instantiated. In your original code, the constructor is a declared only as a host function, leading to a compilation error.

opengl -- how to call a function and draw it from the beginning (from menu option)

I have the following code, which draws the Mandelbrot set. I created a menu with an option "black&white", with which I want to draw the Mandelbrot set in black and white. I haven't figured out how to do this (if it can be done this way). mandelbrot is called through the display function, but how can I call mandelbrot_black?
Also, if someone knows hot to make "zoom" in my code...here...http://stackoverflow.com/questions/5705554/how-to-do-zoom-in-my-code-mandelbrot
void mandelbrot();
void mandelbrot_black();
GLsizei width = 600;    // current window size in pixels
GLsizei height = 600;
GLfloat AspectRatio;
int max = 500;          // iteration cap -- presumably used inside mandelbrot(); body not shown
double xpos=0,ypos=0;   // last clicked point, in complex-plane coordinates
int CLEARFLAG=1;        // cleared on first click; use not visible in the code shown
double xmax = 2.0;      // visible region of the complex plane
double xmin = -2.0;
double ymax = 2.0;
double ymin = -2.0;
using namespace std;
// GLUT display callback: sets an orthographic projection, clears the
// buffers, draws the set, and swaps the double buffer.
// NOTE(review): always draws the color version; to support the menu's
// "black&white" option, branch here on a global flag rather than calling
// mandelbrot_black() from the menu handler (see the answer at the end).
void display()
{
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
gluOrtho2D(-2, width, -2, height);
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
glClear(GL_COLOR_BUFFER_BIT| GL_DEPTH_BUFFER_BIT );
mandelbrot();
glutSwapBuffers();
}
// GLUT reshape callback: records the new window size, updates the viewport,
// and requests a redraw.
void reshape(GLsizei w, GLsizei h) {
    width = w;
    height = h;
    glViewport(0, 0, width, height);
    glutPostRedisplay();
}
// Maps a window pixel (px, py) to complex-plane coordinates (xpos, ypos).
// The y mapping is inverted because screen y grows downward while plane y
// grows upward.
void setXYpos(int px, int py)
{
xpos=xmin+(xmax-xmin)*px/width;
ypos=ymax-(ymax-ymin)*py/height;
}
// GLUT mouse callback: on left-button press, record the clicked point in
// plane coordinates, then request a redraw.
void mouse(int button, int state, int x, int y)
{
if(button==GLUT_LEFT_BUTTON && state==GLUT_DOWN) {CLEARFLAG=0; setXYpos(x,y);}
glutPostRedisplay();
}
void mandelbrot()
{
...}
void mandelbrot_black(){
...}
// Right-click menu dispatcher.
// NOTE(review): case 3 calls mandelbrot_black() directly, outside the GL
// display callback, so the result is not drawn persistently -- set a flag
// and let display() pick the renderer instead (this is the question's core
// problem; see the answer at the end).
// NOTE(review): zoom_in()/zoom_out() are not defined in the code shown.
void mymenu(int n)
{
switch(n) {
case 1: zoom_in();break;
case 2: zoom_out();break;
case 3: mandelbrot_black();break;
case 4: exit(0); // no break needed: exit() does not return
}
glutPostRedisplay();
}
// Builds the popup menu and attaches it to the right mouse button.
// NOTE(review): ensure this is invoked (e.g. from main()) before
// glutMainLoop(), otherwise the menu is never created.
void SetupMenu()
{
glutCreateMenu(mymenu);
glutAddMenuEntry("zoom in",1);
glutAddMenuEntry("zoom out",2);
glutAddMenuEntry("black&white",3);
glutAddMenuEntry("exit",4);
glutAttachMenu(GLUT_RIGHT_BUTTON);
}
int main(int argc, char *argv[])
{
glutInit(&argc, argv);
glutInitWindowSize(600, 600);
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB);
glutCreateWindow("Mandelbrot");
glutDisplayFunc(display);
glutReshapeFunc(reshape);
glutMainLoop();
return 0;
}
Your display function needs to draw either mandelbrot() or mandelbrot_black() depending on the current state (which can/should be a global variable).
//in global scope
static bool black = false;
...
//in display()
if(black)
mandelbrot_black();
else
mandelbrot();
Change black accordingly in mymenu(). You still need to attach your menu to a mouse button and call SetupMenu().