Why am I getting dynamic initialization not supported for __device__, __constant__, __shared__? - cuda

I don't understand why I am getting the error dynamic initialization is not supported for __device__, __constant__, __shared__ variables when compiling my code.
My code looks like
wrapper.cu
#include "../Params.hpp"
__constant__ Params cparams;
void wrapperFunction(uint& a)
{
Params ab;
a = 20;
}
Params.hpp
#include "Utils.hpp"
typedef struct Params
{
vectorTypef a;
} Params;
Utils.hpp
#include "Vec2.hpp"
typedef unsigned int uint;
typedef Vec2<float> vectorTypef;
Vec2.hpp
template <typename T>
class Vec2
{
public:
Vec2(){ x = 0.0; y = 0.0;}
T x, y;
};
Building with cmake with the command
CUDA_ADD_EXECUTABLE(test main.cpp cudasrc/wrapper.cu)

Your Params struct is used in the __constant__ memory definition of cparams.
Your Params struct contains an element a of type vectorTypef, which is a typedef for the Vec2 class instantiated for float. This class has a default constructor that ultimately assigns to the elements of the Params struct. This method of assigning data to a __constant__ region is not legal in either device code or host code.
In device code it's not legal to modify a __constant__ value at all. In host code (which is what is in view here), __constant__ values should be assigned using the appropriate API, i.e. cudaMemcpyToSymbol. I would recommend that you assign these in your host code explicitly, rather than via a constructor.
So, one possible approach to fix this would be to change your default constructor to an empty one:
public:
__host__ __device__ Vec2(){ }; // change this line
T x, y;
(you could also just delete the empty default constructor line)
And, in wrapper.cu (perhaps in wrapperFunction), initialize your Params __constant__ struct:
Params hparams;
hparams.a.x = 0.0;
hparams.a.y = 0.0;
cudaMemcpyToSymbol(cparams, &hparams, sizeof(Params));

I got the same problems as you and I found two ways to solve it.
Define your struct as a plain C-style struct (with no constructor or destructor), like this:
typedef struct {} ClassName;
Or define both the constructor and the destructor as __device__ functions, like this:
struct ClassName{
public:
__device__ ClassName(){...}
__device__ ~ClassName(){...}
};

Related

cudaOccupancyMaxPotentialBlockSize API template usage

I m trying to understand the CUDA occupancy API cudaOccupancyMaxPotentialBlockSize.
The templated version is defined as below.
template<class T>
__inline__ __host__ CUDART_DEVICE cudaError_t
cudaOccupancyMaxPotentialBlockSize(
int *minGridSize,
int *blockSize,
T func,
size_t dynamicSMemSize = 0,
int blockSizeLimit = 0)
{
return cudaOccupancyMaxPotentialBlockSizeVariableSMem(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit);
}
I haven't found much examples on how to use the API if the kernel is a template.
I have found one in our code as seen in below example.
template <typename T> __global__ void fn(T *a) { *a = 10;}
cudaOccupancyMaxPotentialBlockSize<void(*)(int *)>(&gridSize, &blockSize, fn, 0, 0);
In this case, void(*)(int *) represents a pointer to a function that takes an int* argument and returns void. Is my understanding correct?
If so, since the return type of a kernel is always void, is the template argument <void(*)(int *)> always a function-pointer type returning void, with the parameter types (like int*) following according to the kernel's argument list?
Can anyone explain better(with examples) on how to use this API with kernel being template.
void(*)(int) is regular C(++) function-pointer syntax, so yes, it will always be void(*)(Args...). Alternatively, you can put the template arguments at the function pointer itself and let the argument deduction take over: cudaOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, &fn<float>, 0, 0)

CUDA thrust device pointer with transform copy crash

In CUDA 9.2 I have something like this:
#ifdef __CUDA_ARCH__
struct Context { float n[4]; } context;
#else
typedef __m128 Context;
#endif
struct A { float k[2]; };
struct B { float q[4]; };
struct FTransform : thrust::unary_function<A, B>
{
const Context context;
FTransform(Context context) : context(context){}
__device__ __host__ B operator()(const A& a) const
{
B b{{a.k[0], a.k[1], a.k[0]*context.n[0], a.k[1]*context.n[1]}};
return b;
}
};
void DoThrust(B* _bs, const Context& context, A* _as, uint32_t count)
{
thrust::device_ptr<B> bs = thrust::device_pointer_cast(_bs);
thrust::device_ptr<A> as = thrust::device_pointer_cast(_as);
FTransform fTransform(context);
auto first = thrust::make_transform_iterator(as, fTransform);
auto last = thrust::make_transform_iterator(as + count, fTransform);
thrust::copy(first, last, bs);
}
int main(int c, char **argv)
{
const uint32_t Count = 4;
Context context;
A* as;
B* bs;
cudaMalloc(&as, Count*sizeof(A));
cudaMalloc(&bs, Count*sizeof(B));
A hostAs[Count];
cudaMemcpy(as, hostAs, Count * sizeof(A), cudaMemcpyHostToDevice);
DoThrust(bs, context, as, Count);
B hostBs[Count];
cudaMemcpy(hostBs, bs, Count * sizeof(B), cudaMemcpyDeviceToHost);//crash
return 0;
}
Then, when I later call a standard cudaMemcpy() on the results, I get the error "an illegal memory access was encountered".
If I replace the thrust code with a non-thrust equivalent, there is no error and everything works fine. With various combinations of trying to copy to device_vectors etc., I get different crashes that seem to be thrust trying to release the device_ptrs for some reason — so maybe the problem is there?
== UPDATE ==
Ok that was confusing it appears it's due to the functor FTransform context member variable in my actual more complicated case. This specifically:
struct FTransform : thrust::unary_function<A, B>
{
#ifdef __CUDA_ARCH__
struct Context { float v[4]; } context;
#else
__m128 context;
#endif
...
};
So I guess it's an alignment problem somehow => in fact it is, as this works:
#ifdef __CUDA_ARCH__
struct __align__(16) Context { float v[4]; } context;
#else
__m128 context;
#endif
The solution is to ensure that if you use aligned types in thrust functor members (such as __m128 SSE types) that are copied to the GPU, they are defined as aligned during both NVCC's CPU and GPU code build passes. Do not assume that just because a type seems to naturally align to its equivalent in the other pass, it will be OK — otherwise bad, hard-to-understand things may happen.
So, for example, the __align__(16) is necessary in code like this:
struct FTransform : thrust::unary_function<A, B>
{
#ifdef __CUDA_ARCH__
struct __align__(16) Context { float v[4]; } context;
#else
__m128 context;
#endif
FTransform(Context context) : context(context){}
__device__ __host__ B operator()(const A& a) const; // function makes use of context
};

Function pointers in CUDA __constant__ memory

I have found some strange runtime behaviour while experimenting with function pointers in CUDA.
Goal
My goal is to make my function pointers choose which function to apply to two objects according to an internal property of the latter.
In short, I want to emulate C++ templates with a CUDA kernel - without actually using template arguments or switch clauses, but function pointers and class/struct members instead.
Approach
Define my custom objects struct customObj with one property (int type) that will emulate the arguments of a template.
Define a bunch of dummy functions (Sum(), Subtract(), etc) to choose from.
Keep the list of functions to apply (functionsList) and respective type members to look up (first_types, second_types) in __constant__ memory, such that function functionsList[i](obj1,obj2) is applied to objects with obj1.type == first_types[i] and obj2.type == second_types[i].
Working code
The following code has been compiled for Linux x86_64 with CUDA 5.0, on a GPU with compute capability 3.0 (GeForce GTX 670), and works.
#include <stdio.h>
#include <iostream>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
struct customObj
{
int type;
double d;
// Constructors
__device__ __host__ customObj() {}
__device__ __host__ customObj(const int& _type, const double& _d) : type(_type), d(_d) {}
};
typedef void (*function_t)(customObj&, customObj&);
// Define a bunch of functions
__host__ __device__ void Sum(customObj& obj1, customObj& obj2) {printf("Sum chosen! d1 + d2 = %f\n", obj1.d + obj2.d);}
__host__ __device__ void Subtract(customObj& obj1, customObj& obj2) {printf("Subtract chosen! d1 - d2 = %f\n", obj1.d - obj2.d);}
__host__ __device__ void Multiply(customObj& obj1, customObj& obj2) {printf("Multiply chosen! d1 * d2 = %f\n", obj1.d * obj2.d);}
#define ARRAYLENGTH 3
__constant__ int first_type[ARRAYLENGTH] = {1, 2, 3};
__constant__ int second_type[ARRAYLENGTH] = {1, 1, 2};
__constant__ function_t functionsList[ARRAYLENGTH] = {Sum, Sum, Subtract};
// Kernel to loop through functions list
__global__ void choosefunction(customObj obj1, customObj obj2) {
int i = 0;
function_t f = NULL;
do {
if ((obj1.type == first_type[i]) && (obj2.type == second_type[i])) {
f = functionsList[i];
break;
}
i++;
} while (i < ARRAYLENGTH);
if (f == NULL) printf("No possible interaction!\n");
else f(obj1,obj2);
}
int main() {
customObj obj1(1, 5.2), obj2(1, 2.6);
choosefunction<<<1,1>>>(obj1, obj2);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
return 0;
}
The issue
The problem that I have found is that, as soon as I replace the datatype of member int type and related variables and functions (__constant__ int first_types[...] and so on)... the code compiles but stops working!
If I change the datatype from int to char or int8_t, the memory checker throws error 4 on my call to cudaDeviceSynchronize().
If I change the datatype to unsigned short int, I get a hardware stack overflow.
So, is anybody having similar issues when working with __constant__ memory? I really have no clue about what is going on. As far as I know, char and int8_t are built-in types of 1 byte length, while the size of int is 4 bytes, so maybe it is about data alignment, but I'm just guessing here. Besides, CUDA is supposed to support function pointers on the GPU since compute capability 2.0. Are there any special constraints for function pointers in __constant__ memory that I'm missing?
I was able to reproduce the problem (error 4, unspecified launch failure) on CUDA 5.0 on 64bit RHEL 5.5, but not on CUDA 6.0.
Please update/upgrade to CUDA 6.

thrust::device_vector in constant memory

I have a float array that needs to be referenced many times on the device, so I believe the best place to store it is in __constant__ memory (using this reference). The array (or vector) will need to be written once at run-time when initializing, but read by multiple different functions many millions of times, so constant copying to the kernel on each function call seems like A Bad Idea.
const int n = 32;
__constant__ float dev_x[n]; //the array in question
struct struct_max : public thrust::unary_function<float,float> {
float C;
struct_max(float _C) : C(_C) {}
__host__ __device__ float operator()(const float& x) const { return fmax(x,C);}
};
void foo(const thrust::host_vector<float> &, const float &);
int main() {
thrust::host_vector<float> x(n);
//magic happens populate x
cudaMemcpyToSymbol(dev_x,x.data(),n*sizeof(float));
foo(x,0.0);
return(0);
}
void foo(const thrust::host_vector<float> &input_host_x, const float &x0) {
thrust::device_vector<float> dev_sol(n);
thrust::host_vector<float> host_sol(n);
//this method works fine, but the memory transfer is unacceptable
thrust::device_vector<float> input_dev_vec(n);
input_dev_vec = input_host_x; //I want to avoid this
thrust::transform(input_dev_vec.begin(),input_dev_vec.end(),dev_sol.begin(),struct_max(x0));
host_sol = dev_sol; //this memory transfer for debugging
//this method compiles fine, but crashes at runtime
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(dev_x);
thrust::transform(dev_ptr,dev_ptr+n,dev_sol.begin(),struct_max(x0));
host_sol = dev_sol; //this line crashes
}
I tried adding a global thrust::device_vector dev_x(n), but that also crashed at run-time, and would be in global memory rather than __constant__ memory.
This can all be made to work if I just discard the thrust library, but is there a way to use the thrust library with globals and device constant memory?
Good question! You can't cast a __constant__ array as if it's a regular device pointer.
I will answer your question (after the line below), but first: this is a bad use of __constant__, and it isn't really what you want. The constant cache in CUDA is optimized for uniform access across threads in a warp. That means all threads in the warp access the same location at the same time. If each thread of the warp accesses a different constant memory location, then the accesses get serialized. So your access pattern, where consecutive threads access consecutive memory locations, will be 32 times slower than a uniform access. You should really just use device memory. If you need to write the data once, but read it many times, then just use a device_vector: initialize it once, and then read it many times.
To do what you asked, you can use a thrust::counting_iterator as the input to thrust::transform to generate a range of indices into your __constant__ array. Then your functor's operator() takes an int index operand rather than a float value operand, and does the lookup into constant memory.
(Note that this means your functor is now __device__ code only. You could easily overload the operator to take a float and call it differently on host data if you need portability.)
I modified your example to initialize the data and print the result to verify that it is correct.
#include <stdio.h>
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
const int n = 32;
__constant__ float dev_x[n]; //the array in question
struct struct_max : public thrust::unary_function<float,float> {
float C;
struct_max(float _C) : C(_C) {}
// only works as a device function
__device__ float operator()(const int& i) const {
// use index into constant array
return fmax(dev_x[i],C);
}
};
void foo(const thrust::host_vector<float> &input_host_x, const float &x0) {
thrust::device_vector<float> dev_sol(n);
thrust::host_vector<float> host_sol(n);
thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(dev_x);
thrust::transform(thrust::make_counting_iterator(0),
thrust::make_counting_iterator(n),
dev_sol.begin(),
struct_max(x0));
host_sol = dev_sol; //this line crashes
for (int i = 0; i < n; i++)
printf("%f\n", host_sol[i]);
}
int main() {
thrust::host_vector<float> x(n);
//magic happens populate x
for (int i = 0; i < n; i++) x[i] = rand() / (float)RAND_MAX;
cudaMemcpyToSymbol(dev_x,x.data(),n*sizeof(float));
foo(x, 0.5);
return(0);
}

How to advance iterator in thrust function

I'm doing some study on thrust. But I didn't understand how to get the value an iterator points to.
An example code is like:
#include <thrust/for_each.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <vector>
using namespace std;
class ADD
{
private:
typedef typename thrust::device_vector<int>::iterator PTR;
public:
ADD(){}
~ADD(){}
void setPtr(PTR &ptr)
{this->ptr=ptr;}
__host__ __device__
void operator()(int &x)
{
// note that using printf in a __device__ function requires
// code compiled for a GPU with compute capability 2.0 or
// higher (nvcc --arch=sm_20)
x+=add();
}
__host__ __device__
int add()
{return *ptr++;}
private:
PTR ptr;
};
int main()
{
thrust::device_vector<int> d_vec(3);
d_vec[0] = 0; d_vec[1] = 1; d_vec[2] = 2;
thrust::device_vector<int>::iterator itr=d_vec.begin();
ADD *addtest=new ADD();
addtest->setPtr(itr);
thrust::for_each(d_vec.begin(), d_vec.end(), *addtest);
for(int i=0;i<3;i++)
cout<<d_vec[i]<<endl;
return 0;
}
When I compile this using nvcc -arch=sm_20 test.cu
I got the following warning:
test.cu(28): warning: calling a host function("thrust::experimental::iterator_facade<thrust::detail::normal_iterator<thrust::device_ptr<int> > , thrust::device_ptr<int> , int, thrust::detail::cuda_device_space_tag, thrust::random_access_traversal_tag, thrust::device_reference<int> , long> ::operator *") from a __device__/__global__ function("printf_functor::add") is not allowed
test.cu(28): warning: calling a host function("thrust::experimental::iterator_facade<thrust::detail::normal_iterator<thrust::device_ptr<int> > , thrust::device_ptr<int> , int, thrust::detail::cuda_device_space_tag, thrust::random_access_traversal_tag, thrust::device_reference<int> , long> ::operator *") from a __device__/__global__ function("printf_functor::add") is not allowed
I cannot get this to compile. How can I solve this problem?
#Gang.Wang: I think you're just mixing up 2 different things: all STL-like functionality, including for_each, device_vector iterators, etc., is just a "facade" which exists on the host only.
While operator() contains the actual GPU code which is compiled to CUDA kernel and applied to each element of your vector in parallel. Hence, device_vector::iterators are not accessible from your functor.