CUDA Thrust device pointer with transform copy crash

In CUDA 9.2 I have something like this:
#ifdef __CUDA_ARCH__
struct Context { float n[4]; } context;
#else
typedef __m128 Context;
#endif

struct A { float k[2]; };
struct B { float q[4]; };

struct FTransform : thrust::unary_function<A, B>
{
    const Context context;

    FTransform(Context context) : context(context) {}

    __device__ __host__ B operator()(const A& a) const
    {
        B b{{a.k[0], a.k[1], a.k[0] * context.n[0], a.k[1] * context.n[1]}};
        return b;
    }
};

void DoThrust(B* _bs, const Context& context, A* _as, uint32_t count)
{
    thrust::device_ptr<B> bs = thrust::device_pointer_cast(_bs);
    thrust::device_ptr<A> as = thrust::device_pointer_cast(_as);

    FTransform fTransform(context);
    auto first = thrust::make_transform_iterator(as, fTransform);
    auto last = thrust::make_transform_iterator(as + count, fTransform);
    thrust::copy(first, last, bs);
}

int main(int c, char **argv)
{
    const uint32_t Count = 4;
    Context context;

    A* as;
    B* bs;
    cudaMalloc(&as, Count * sizeof(A));
    cudaMalloc(&bs, Count * sizeof(B));

    A hostAs[Count];
    cudaMemcpy(as, hostAs, Count * sizeof(A), cudaMemcpyHostToDevice);

    DoThrust(bs, context, as, Count);

    B hostBs[Count];
    cudaMemcpy(hostBs, bs, Count * sizeof(B), cudaMemcpyDeviceToHost); // crash
    return 0;
}
When I then call a standard cudaMemcpy() on the results, I get the error "an illegal memory access was encountered".
If I replace the thrust code with a non-thrust equivalent, there is no error and everything works fine. With various combinations of copying to device_vectors etc. I get different crashes that seem to come from thrust trying to release the device_ptrs, so maybe the problem is there?
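(For reference, a minimal sketch of what such a non-thrust equivalent could look like; the kernel and its launch below are illustrative, not the original code, and they sidestep the Context layout question by passing the two needed floats directly:)

__global__ void TransformKernel(B* bs, const A* as, float n0, float n1, uint32_t count)
{
    // One thread per element; same math as FTransform::operator().
    uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < count)
    {
        B b{{as[i].k[0], as[i].k[1], as[i].k[0] * n0, as[i].k[1] * n1}};
        bs[i] = b;
    }
}

// Usage: TransformKernel<<<(count + 255) / 256, 256>>>(_bs, _as, n0, n1, count);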
== UPDATE ==
OK, that was confusing: it appears to be due to the functor FTransform's context member variable in my actual, more complicated case. Specifically this:
struct FTransform : thrust::unary_function<A, B>
{
#ifdef __CUDA_ARCH__
    struct Context { float v[4]; } context;
#else
    __m128 context;
#endif
    ...
};
So I guessed it was somehow an alignment problem, and in fact it is, as this works:
#ifdef __CUDA_ARCH__
    struct __align__(16) Context { float v[4]; } context;
#else
    __m128 context;
#endif

The solution is to ensure that any aligned types used as thrust functor members (such as __m128 SSE types) that get copied to the GPU are declared with the same alignment in both of NVCC's compilation passes, CPU and GPU. Don't assume that a type which seems to naturally match the alignment of its counterpart in the other pass will be fine; otherwise bad, hard-to-understand things may happen.
So, for example, the __align__(16) is necessary in code like this:
struct FTransform : thrust::unary_function<A, B>
{
#ifdef __CUDA_ARCH__
    struct __align__(16) Context { float v[4]; } context;
#else
    __m128 context;
#endif
    FTransform(Context context) : context(context) {}
    __device__ __host__ B operator()(const A& a) const; // function makes use of context
};
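As an extra safeguard (my addition, not part of the original post): since NVCC compiles the same translation unit once for the CPU and once for the GPU, a static_assert on the functor's size and alignment turns this kind of silent layout mismatch into a compile error in whichever pass disagrees. A minimal sketch, assuming C++11 and the 16-byte layout above:

// Both passes must agree that FTransform is 16 bytes and 16-byte
// aligned (the layout of __m128); otherwise the build fails here,
// rather than corrupting memory at runtime.
static_assert(sizeof(FTransform) == 16, "FTransform size differs between passes");
static_assert(alignof(FTransform) == 16, "FTransform alignment differs between passes");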

Related

cudaOccupancyMaxPotentialBlockSize API template usage

I'm trying to understand the CUDA occupancy API cudaOccupancyMaxPotentialBlockSize.
The templated version is defined as below:
template<class T>
__inline__ __host__ CUDART_DEVICE cudaError_t
cudaOccupancyMaxPotentialBlockSize(
    int    *minGridSize,
    int    *blockSize,
    T       func,
    size_t  dynamicSMemSize = 0,
    int     blockSizeLimit = 0)
{
    return cudaOccupancyMaxPotentialBlockSizeVariableSMem(
        minGridSize, blockSize, func,
        __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit);
}
I haven't found many examples of how to use the API when the kernel is a template. I found one in our code, shown in the example below.
template <typename T> __global__ void fn(T *a) { *a = 10; }

cudaOccupancyMaxPotentialBlockSize<void(*)(int *)>(&gridSize, &blockSize, fn, 0, 0);
In this case, void(*)(int*) represents the function-pointer type of the kernel: returning void and taking an int* argument. Is my understanding correct?
If so, since the return type of a kernel is always void, is the return part of the template argument void(*)(...) always void, with the parameter types like int* following according to the kernel's argument list?
Can anyone explain better (with examples) how to use this API when the kernel is a template?
void(*)(int*) is regular C(++) function-pointer syntax, so yes, it will always be void(*)(Args...). Alternatively, you can put the template arguments on the function pointer itself and let argument deduction take over: cudaOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, &fn<float>, 0, 0)
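For what it's worth, here is a minimal self-contained sketch of both styles (the template kernel fn is from the question; the variable names and printf output are just illustrative):

#include <cstdio>

template <typename T> __global__ void fn(T *a) { *a = 10; }

int main()
{
    int minGridSize = 0, blockSize = 0;

    // Style 1: explicit function-pointer template argument;
    // fn here resolves to the fn<int> instantiation.
    cudaOccupancyMaxPotentialBlockSize<void(*)(int *)>(&minGridSize, &blockSize, fn, 0, 0);
    printf("fn<int>:   grid %d, block %d\n", minGridSize, blockSize);

    // Style 2: name the instantiation and let argument deduction work.
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, &fn<float>, 0, 0);
    printf("fn<float>: grid %d, block %d\n", minGridSize, blockSize);
    return 0;
}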

Why am I getting dynamic initialization not supported for __device__, __constant__, __shared__?

I don't understand why I am getting the error dynamic initialization is not supported for __device__, __constant__, __shared__ variables when compiling my code.
My code looks like this:
wrapper.cu
#include "../Params.hpp"
__constant__ Params cparams;
void wrapperFunction(uint& a)
{
Params ab;
a = 20;
}
Params.hpp
#include "Utils.hpp"
typedef struct Params
{
vectorTypef a;
} Params;
Utils.hpp
#include "Vec2.hpp"
typedef unsigned int uint;
typedef Vec2<float> vectorTypef;
Vec2.hpp
template <typename T>
class Vec2
{
public:
    Vec2() { x = 0.0; y = 0.0; }
    T x, y;
};
Building with CMake using the command:
CUDA_ADD_EXECUTABLE(test main.cpp cudasrc/wrapper.cu)
Your Params struct is used in the __constant__ memory definition of cparams. It contains an element a of type vectorTypef, which is a typedef for the Vec2 class instantiated for float. This class has a default constructor that assigns to its members, and so ultimately to elements of the Params struct. This method of assigning data to a __constant__ region is not legal in either device or host code.
In device code it's not legal to modify a __constant__ value at all. In host code (which is what is in view here), __constant__ values should be assigned using the appropriate API, i.e. cudaMemcpyToSymbol. I would recommend that you assign these in your host code explicitly, rather than via a constructor.
So, one possible approach to fix this would be to change your default constructor to an empty one:
public:
    __host__ __device__ Vec2() {} // change this line
    T x, y;
(you could also just delete the empty default constructor line)
And, in wrapper.cu (perhaps in wrapperFunction), initialize your Params __constant__ struct:
Params hparams;
hparams.a.x = 0.0;
hparams.a.y = 0.0;
cudaMemcpyToSymbol(cparams, &hparams, sizeof(Params));
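Putting it together, wrapper.cu could then look something like this (a sketch assuming the empty __host__ __device__ constructor from above; error checking omitted):

#include "../Params.hpp"

__constant__ Params cparams;

void wrapperFunction(uint& a)
{
    // Build the value on the host, then copy it into the __constant__
    // symbol through the runtime API instead of via a constructor.
    Params hparams;
    hparams.a.x = 0.0f;
    hparams.a.y = 0.0f;
    cudaMemcpyToSymbol(cparams, &hparams, sizeof(Params));
    a = 20;
}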
I ran into the same problem and found two ways to solve it.
1. Define your struct in plain C style, like this:
typedef struct {} ClassName;
2. Define both the constructor and the destructor as __device__ functions, like this:
struct ClassName {
public:
    __device__ ClassName() {...}
    __device__ ~ClassName() {...}
};

How to implement device side CUDA virtual functions?

I see that CUDA doesn't allow for classes with virtual functions to be passed into kernel functions. Are there any work-arounds to this limitation?
I would really like to be able to use polymorphism within a kernel function.
Thanks!
The most important part of Robert Crovella's comment is:
The objects simply need to be created on the device.
So keeping that in mind, I was dealing with a situation where I had an abstract class Function and then some implementations of it, each encapsulating a different function and its evaluation. This is a simplified version of how I achieved polymorphism in my situation; I am not saying it cannot be done better, but it will hopefully help you get the idea:
class Function
{
public:
    __device__ Function() {}
    __device__ virtual ~Function() {}
    __device__ virtual void Evaluate(const real* __restrict__ positions,
                                     real* fitnesses,
                                     const SIZE_TYPE particlesCount) const = 0;
};

class FunctionRsj : public Function
{
private:
    SIZE_TYPE m_DimensionsCount;
    SIZE_TYPE m_PointsCount;
    real* m_Y;
    real* m_X;

public:
    __device__ FunctionRsj(const SIZE_TYPE dimensionsCount,
                           const SIZE_TYPE pointsCount,
                           real* configFileData)
        : m_DimensionsCount(dimensionsCount),
          m_PointsCount(pointsCount),
          m_Y(configFileData),
          m_X(configFileData + pointsCount) {}

    __device__ ~FunctionRsj()
    {
        // m_Y points to the beginning of the config
        // file data; use it for destruction as this
        // object took ownership of configFileData.
        delete[] m_Y;
    }

    __device__ void Evaluate(const real* __restrict__ positions,
                             real* fitnesses,
                             const SIZE_TYPE particlesCount) const
    {
        // Implement evaluation of FunctionRsj here.
    }
};
__global__ void evaluate_fitnesses(
    const real* __restrict__ positions,
    real* fitnesses,
    Function const* const* __restrict__ function,
    const SIZE_TYPE particlesCount)
{
    // This whole kernel is just a proxy, as kernels
    // cannot be member functions.
    (*function)->Evaluate(positions, fitnesses, particlesCount);
}

__global__ void create_function(
    Function** function,
    SIZE_TYPE dimensionsCount,
    SIZE_TYPE pointsCount,
    real* configFileData)
{
    // It is necessary to create the object representing the function
    // directly in the global memory of the GPU device for virtual
    // functions to work correctly, i.e. the virtual function table
    // HAS to be on the GPU as well.
    if (threadIdx.x == 0 && blockIdx.x == 0)
    {
        (*function) = new FunctionRsj(dimensionsCount, pointsCount, configFileData);
    }
}

__global__ void delete_function(Function** function)
{
    delete *function;
}
int main()
{
    // Let's just assume d_FunctionConfigData, d_Positions,
    // d_Fitnesses are arrays already allocated on the GPU...

    // Create the function object on the device.
    Function** d_Function;
    cudaMalloc(&d_Function, sizeof(Function*));
    create_function<<<1, 1>>>(d_Function, 10, 10, d_FunctionConfigData);

    // Evaluate using the proxy kernel.
    evaluate_fitnesses<<<
        m_Configuration.GetEvaluationGridSize(),
        m_Configuration.GetEvaluationBlockSize(),
        m_Configuration.GetEvaluationSharedMemorySize()>>>(
            d_Positions,
            d_Fitnesses,
            d_Function,
            m_Configuration.GetParticlesCount());

    // Delete the function object on the GPU.
    delete_function<<<1, 1>>>(d_Function);
}
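One practical addition (not part of the original answer): all three launches are asynchronous, so it can help to synchronize and check for errors at the end, and to free the host-side allocation once the device object has been deleted:

// Wait for the kernels to finish and surface any launch/runtime error.
cudaError_t err = cudaDeviceSynchronize();
if (err != cudaSuccess)
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));

// d_Function itself was allocated with cudaMalloc on the host; the
// FunctionRsj object it pointed to was already freed by delete_function.
cudaFree(d_Function);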

Function pointers in CUDA __constant__ memory

I have found some strange runtime behaviour while experimenting with function pointers in CUDA.
Goal
My goal is to make my function pointers choose which function to apply to two objects according to an internal property of the latter.
In short, I want to emulate C++ templates with a CUDA kernel, without actually using template arguments or switch clauses, but with function pointers and class/struct members instead.
Approach
Define my custom objects struct customObj with one property (int type) that will emulate the arguments of a template.
Define a bunch of dummy functions (Sum(), Subtract(), etc) to choose from.
Keep the list of functions to apply (functionsList) and respective type members to look up (first_types, second_types) in __constant__ memory, such that function functionsList[i](obj1,obj2) is applied to objects with obj1.type == first_types[i] and obj2.type == second_types[i].
Working code
The following code has been compiled for Linux x86_64 with CUDA 5.0, on a GPU with compute capability 3.0 (GeForce GTX 670), and works.
#include <stdio.h>
#include <iostream>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

struct customObj
{
    int type;
    double d;

    // Constructors
    __device__ __host__ customObj() {}
    __device__ __host__ customObj(const int& _type, const double& _d) : type(_type), d(_d) {}
};

typedef void (*function_t)(customObj&, customObj&);

// Define a bunch of functions to choose from
__host__ __device__ void Sum(customObj& obj1, customObj& obj2) { printf("Sum chosen! d1 + d2 = %f\n", obj1.d + obj2.d); }
__host__ __device__ void Subtract(customObj& obj1, customObj& obj2) { printf("Subtract chosen! d1 - d2 = %f\n", obj1.d - obj2.d); }
__host__ __device__ void Multiply(customObj& obj1, customObj& obj2) { printf("Multiply chosen! d1 * d2 = %f\n", obj1.d * obj2.d); }

#define ARRAYLENGTH 3
__constant__ int first_type[ARRAYLENGTH] = {1, 2, 3};
__constant__ int second_type[ARRAYLENGTH] = {1, 1, 2};
__constant__ function_t functionsList[ARRAYLENGTH] = {Sum, Sum, Subtract};

// Kernel to loop through the functions list
__global__ void choosefunction(customObj obj1, customObj obj2) {
    int i = 0;
    function_t f = NULL;
    do {
        if ((obj1.type == first_type[i]) && (obj2.type == second_type[i])) {
            f = functionsList[i];
            break;
        }
        i++;
    } while (i < ARRAYLENGTH);

    if (f == NULL) printf("No possible interaction!\n");
    else f(obj1, obj2);
}

int main() {
    customObj obj1(1, 5.2), obj2(1, 2.6);
    choosefunction<<<1,1>>>(obj1, obj2);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    return 0;
}
The issue
The problem that I have found is that as soon as I replace the datatype of the member int type and of the related variables and functions (__constant__ int first_type[...] and so on), the code compiles but stops working!
If I change the datatype from int to char or int8_t, the memory checker throws error 4 on my call to cudaDeviceSynchronize().
If I change the datatype to unsigned short int, I get a hardware stack overflow.
So, has anybody had similar issues when working with __constant__ memory? I really have no clue about what is going on. As far as I know, char and int8_t are 1-byte built-in types, while int is 4 bytes, so maybe it is about data alignment, but I'm just guessing here. Besides, CUDA is supposed to support function pointers on the GPU since compute capability 2.0. Are there any special constraints for function pointers in __constant__ memory that I'm missing?
I was able to reproduce the problem (error 4, unspecified launch failure) with CUDA 5.0 on 64-bit RHEL 5.5, but not with CUDA 6.0.
Please update/upgrade to CUDA 6.

use host function on device

How can I use a host function in a device function?
For example, in the function below I want to return a value:
__device__ float magnitude2( void ) {
    return r * r + i * i;
}
But this function is a device function, and I received this error:
calling a host function from a __device__/__global__ function is not allowed
What's the best approach for this problem?
For extra context on the code, this is the struct I want to define:
struct cuComplex {
    float r;
    float i;
    cuComplex( float a, float b ) : r(a), i(b) {}
    __device__ float magnitude2( void ) {
        return r * r + i * i;
    }
    __device__ cuComplex operator*(const cuComplex& a) {
        return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
    }
    __device__ cuComplex operator+(const cuComplex& a) {
        return cuComplex(r+a.r, i+a.i);
    }
};
Now that we know the question involves a C++ structure, the answer is obvious: the constructor of the class must also be available as a __device__ function in order to be able to instantiate the class inside a kernel. In your example, the structure should be defined like this:
struct cuComplex {
    float r;
    float i;

    __device__ __host__
    cuComplex( float a, float b ) : r(a), i(b) {}

    __device__
    float magnitude2( void ) {
        return r * r + i * i;
    }

    __device__
    cuComplex operator*(const cuComplex& a) {
        return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
    }

    __device__
    cuComplex operator+(const cuComplex& a) {
        return cuComplex(r+a.r, i+a.i);
    }
};
The error you are seeing arises because the constructor needs to be called whenever the class is instantiated. In your original code, the constructor is declared only as a host function, leading to the compilation error.
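With that change, the struct can be constructed and used directly inside a kernel. A minimal sketch (the kernel and its launch are illustrative, not from the question):

__global__ void magnitudes(float *out)
{
    // Constructing on the device is legal now that the
    // constructor is also a __device__ function.
    cuComplex c(3.0f, 4.0f);
    out[0] = c.magnitude2(); // 3*3 + 4*4 = 25
}

// Usage: magnitudes<<<1, 1>>>(d_out); where d_out is a device float array.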