Sorting using thrust library on device raw double pointers [closed] - cuda

Note: please refer to the following code -
#include <iostream>
#include <curand_kernel.h>
#include "cuda_common.cuh"
#include "constants.cuh"
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/functional.h>
class parent_int {
public:
    __device__ __host__ virtual parent_int* Clone() = 0;
    __device__ __host__ virtual int get_int() const = 0;
    __device__ __host__ virtual void increment_int() = 0;
};

class a_int : public parent_int {
public:
    int a;
    __device__ __host__ a_int(int _a = 0) : a(_a) {}
    __device__ __host__ virtual parent_int* Clone() override {
        return new a_int(*this);
    }
    __device__ __host__ virtual int get_int() const override { return a; }
    __device__ __host__ virtual void increment_int() override { a += 1; }
};

class b_int : public parent_int {
public:
    int b;
    __device__ __host__ b_int(int _b = 0) : b(_b) {}
    __device__ __host__ virtual parent_int* Clone() override {
        return new b_int(*this);
    }
    __device__ __host__ virtual int get_int() const override { return b; }
    __device__ __host__ virtual void increment_int() override { b += 2; }
};

__device__ bool comparator(parent_int *a, parent_int *b)
{
    return a->get_int() < b->get_int();
}
__global__ void print_values(parent_int **d_a, int size, curandState *rand_state)
{
    if (threadIdx.x == 0 && blockIdx.x == 0)
    {
        curand_init(1984, 0, 0, rand_state);
        for (int i = 0; i < size; i++)
        {
            if (curand_uniform(rand_state) <= 0.5f)
            {
                d_a[i] = new a_int(curand_uniform(rand_state) * 10);
            }
            else
            {
                d_a[i] = new b_int(curand_uniform(rand_state) * 10);
            }
        }
        printf("\nOriginal array - \n");
        for (int i = 0; i < size; i++)
        {
            printf("%d \n", d_a[i]->get_int());
        }
        auto dev_data = thrust::device_pointer_cast(thrust::device_pointer_cast(d_a[0]));
        thrust::sort(dev_data, dev_data + size, comparator);
        printf("\nSorted array - \n");
        for (int i = 0; i < size; i++)
        {
            printf("%d \n", d_a[i]->get_int());
        }
    }
}
int main() {
    parent_int **d_a;
    int size = 10;
    curandState *rand_state;
    // allocate a full curandState object (sizeof(rand_state) would only be the size of a pointer)
    gpuErrchk(cudaMalloc(&rand_state, sizeof(curandState)));
    gpuErrchk(cudaMalloc(&d_a, sizeof(parent_int*) * size));
    print_values<<<1, 1>>>(d_a, size, rand_state);
    gpuErrchk(cudaDeviceSynchronize());
}
Question:
I am trying to use the Thrust library to sort my class object pointers using a custom comparator, but I am getting several errors like -
error: no suitable conversion function from "parent_int" to "parent_int *"
error: array of abstract class "parent_int" is not allowed:
I have just started working with this library and I am not sure whether it is the right option for my case; please suggest some workarounds or alternatives - my main aim is simply to sort my objects.
Constraints:
This code snippet is actually a reproduction of a problem that I am facing in my project, so I cannot consider changing the layout of the classes or the design of my class structure.
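One possible workaround (a minimal sketch, not verified against the full project): the quoted errors are consistent with the device_pointer_cast(d_a[0]) line producing a range of abstract parent_int objects instead of a range of parent_int* pointers. The sketch below sorts the pointer array d_a itself, using a functor with a __device__ operator() as the comparator; since the objects are created with device-side new, their virtual functions should only be invoked from device code.

struct compare_parent_int {
    __device__ bool operator()(const parent_int *lhs, const parent_int *rhs) const {
        return lhs->get_int() < rhs->get_int();
    }
};

// Inside the kernel, replacing the device_pointer_cast/thrust::sort lines
// (requires #include <thrust/execution_policy.h>):
//     thrust::sort(thrust::seq, d_a, d_a + size, compare_parent_int());
//
// Or from host code, after a kernel has populated d_a, sorting the pointer
// array on the GPU:
//     thrust::device_ptr<parent_int*> dev_data(d_a);
//     thrust::sort(dev_data, dev_data + size, compare_parent_int());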

Related

__CUDA_ARCH__ flag with Thrust execution policy

I have a __host__ __device__ function which is a wrapper that calls into the "sort" function of the Thrust library. Inside this wrapper, I am using the __CUDA_ARCH__ macro to set the execution policy to "thrust::device" when called from the host and "thrust::seq" when called from the device. The following piece of code generates a runtime error -
#ifndef __CUDA_ARCH__
    thrust::stable_sort(thrust::device, data, data + num, customGreater<T>());
#else
    thrust::stable_sort(thrust::seq, data, data + num, customGreater<T>());
#endif
The error is-
Unexpected Standard exception:
What() is:merge_sort: failed on 2nd step: invalid device function
As per my understanding, __CUDA_ARCH__ can be used for conditional compilation. I would appreciate help in understanding why this error is thrown.
It seems you are stepping on this issue. In a nutshell, thrust uses CUB functionality under the hood for certain algorithms (including sort). Your use of __CUDA_ARCH__ macro in your code, which wraps around thrust algorithm calls that use CUB, is interfering with CUB code that expects to be able to use this macro for all paths.
A possible workaround is to do "your own dispatch":
$ cat t142.cu
#include <iostream>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>

template <typename T>
struct customGreater {
    __host__ __device__ bool operator()(T &t1, T &t2){
        return (t1 > t2);}
};

template <typename T>
__host__ __device__
void my_sort_wrapper(T *data, size_t num){
    int hostdev = 0;  // 0=device code
#ifndef __CUDA_ARCH__
    hostdev = 1;      // 1=host code
#endif
    if (hostdev == 0) thrust::stable_sort(thrust::seq, data, data + num, customGreater<T>());
    else thrust::stable_sort(thrust::device, data, data + num, customGreater<T>());
}

template <typename T>
__global__ void my_dev_sort(T *data, size_t num){
    my_sort_wrapper(data, num);
}

typedef int mytype;
const size_t sz = 10;

int main(){
    mytype *d_data;
    cudaMalloc(&d_data, sz*sizeof(mytype));
    cudaMemset(d_data, 0, sz*sizeof(mytype));
    my_sort_wrapper(d_data, sz);
    my_dev_sort<<<1,1>>>(d_data, sz);
    cudaDeviceSynchronize();
}
$ nvcc t142.cu -o t142
$ cuda-memcheck ./t142
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
With this realization, the use of the __CUDA_ARCH__ macro does not perturb the compilation of the thrust algorithms.
Another possible workaround is simply to use thrust::device policy for both cases (no dispatch - just the thrust algorithm call). Except in the case of CUDA Dynamic Parallelism, thrust::device will "decay" to thrust::seq when used in device code.
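For illustration, a minimal sketch of that second variant (it mirrors the wrapper and functor from the listing above, with the dispatch removed; untested here):

#include <thrust/sort.h>
#include <thrust/execution_policy.h>

template <typename T>
struct customGreater {
    __host__ __device__ bool operator()(T &t1, T &t2){
        return (t1 > t2);}
};

// One call path for both host and device callers: when compiled as device code
// (outside of CUDA Dynamic Parallelism), thrust::device decays to a sequential
// execution, so no __CUDA_ARCH__ dispatch is needed.
template <typename T>
__host__ __device__
void my_sort_wrapper(T *data, size_t num){
    thrust::stable_sort(thrust::device, data, data + num, customGreater<T>());
}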
I would expect that these suggestions would only be necessary/relevant when the thrust algorithm uses CUB functionality in the underlying implementation.
If you don't like this behavior, you could file a thrust issue.
Unfortunately, we can't fix this in Thrust. The trouble here is that the NVCC compiler needs to see all __global__ function template instantiations during host compilation (e.g. when __CUDA_ARCH__ is not defined), otherwise the kernels will be treated as unused and discarded. See this CUB GitHub issue for more details.
As Robert suggested, a workaround such as this should be fine:
#include <iostream>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>

template <typename T>
struct customGreater {
    __host__ __device__ bool operator()(T &t1, T &t2){
        return (t1 > t2);}
};

#if defined(__CUDA_ARCH__)
#define DEVICE_COMPILATION 1
#else
#define DEVICE_COMPILATION 0
#endif

template <typename T>
__host__ __device__
void my_sort(T *data, size_t num){
    if (DEVICE_COMPILATION)
        thrust::stable_sort(thrust::device, data, data + num, customGreater<T>());
    else
        thrust::stable_sort(thrust::seq, data, data + num, customGreater<T>());
}

template <typename T>
__global__ void my_dev_sort(T *data, size_t num){
    my_sort(data, num);
}

typedef int mytype;
const size_t sz = 10;

int main(){
    mytype *d_data;
    cudaMallocManaged(&d_data, sz*sizeof(mytype));
    cudaMemset(d_data, 0, sz*sizeof(mytype));
    my_sort(d_data, sz);
    my_dev_sort<<<1,1>>>(d_data, sz);
    cudaFree(d_data);
    cudaDeviceSynchronize();
}

Template __host__ __device__ calling host defined functions

During the implementation of CUDA code I often need some utility functions which will be called from device and also from host code, so I declare these functions as __host__ __device__. This is fine, and possible device/host incompatibilities can be handled with #ifdef __CUDA_ARCH__.
Problems arise when the utility function is templated, e.g. by some functor type. If the template instance calls a __host__ function I get this warning:
calling a __host__ function from a __host__ __device__ function is not allowed
detected during instantiation of "int foo(const T &) [with T=HostObject]"
The only solution I know of is to define the function twice - once for device and once for host code, with a different name (I cannot overload on __host__ __device__). But this means code duplication, and all other __host__ __device__ functions which call it must also be defined twice (even more code duplication).
Simplified example:
#include <cuda.h>
#include <iostream>

struct HostObject {
    __host__
    int value() const { return 42; }
};

struct DeviceObject {
    __device__
    int value() const { return 3; }
};

template <typename T>
__host__ __device__
int foo(const T &obj) {
    return obj.value();
}

/*
template <typename T>
__host__
int foo_host(const T &obj) {
    return obj.value();
}

template <typename T>
__device__
int foo_device(const T &obj) {
    return obj.value();
}
*/

__global__ void kernel(int *data) {
    data[threadIdx.x] = foo(DeviceObject());
}

int main() {
    foo(HostObject());
    int *data;
    cudaMalloc((void**)&data, sizeof(int) * 64);
    kernel<<<1, 64>>>(data);
    cudaThreadSynchronize();
    cudaFree(data);
}
Warning is caused by the foo(HostObject()); call inside the main() function.
foo_host<> and foo_device<> are possible replacements for the problematic foo<>.
Is there a better solution? Can I prevent instantiation of foo() on the device side?
You cannot prevent instantiation of either half of a __host__ __device__ function template instantiation. If you instantiate the function by calling it on the host (device), the compiler will also instantiate the device (host) half.
The best you can do for your use case as of CUDA 7.0 is to suppress the warning using #pragma hd_warning_disable as in the following example and ensure that the function is not called incorrectly.
#include <iostream>
#include <cstdio>

#pragma hd_warning_disable
template<class Function>
__host__ __device__
void invoke(Function f)
{
    f();
}

struct host_only
{
    __host__
    void operator()()
    {
        std::cout << "host_only()" << std::endl;
    }
};

struct device_only
{
    __device__
    void operator()()
    {
        printf("device_only(): thread %d\n", threadIdx.x);
    }
};

__global__
void kernel()
{
    // use from device with device functor
    invoke(device_only());
    // XXX error
    // invoke(host_only());
}

int main()
{
    // use from host with host functor
    invoke(host_only());
    kernel<<<1,1>>>();
    cudaDeviceSynchronize();
    // XXX error
    // invoke(device_only());
    return 0;
}
I was struggling with the same problem and found half of a solution: one can overload the host and device function by adding dummy template parameters to them.
In device code, the __device__ "overload" of f is called; in host code, the __host__ "overload" of f is called.
Unfortunately, this turns f into a template function. In particular, for constructors this can cause big problems (which I am still struggling with).
#include <type_traits>
#include <cstdio>

#ifndef __CUDA_ARCH__
static constexpr bool in_cuda_code = false;
#else
static constexpr bool in_cuda_code = true;
#endif

__device__ void g_device() { printf( "device\n" ); }
__host__ void g_host() { printf( "host\n" ); }

template< bool b = in_cuda_code > void f();
template<> __device__ void f<true>() { g_device(); }
template<> __host__ void f<false>() { g_host(); }

__global__ void kernel () {
    f();
}

int main() {
    f();
    kernel<<<1,1>>>();
    cudaDeviceSynchronize();
}

Function pointers in CUDA __constant__ memory

I have found some strange runtime behaviour while experimenting with function pointers in CUDA.
Goal
My goal is to make my function pointers choose which function to apply to two objects according to an internal property of the latter.
In short, I want to emulate C++ templates with a CUDA kernel - without actually using template arguments or switch clauses, but function pointers and class/struct members instead.
Approach
Define my custom objects struct customObj with one property (int type) that will emulate the arguments of a template.
Define a bunch of dummy functions (Sum(), Subtract(), etc) to choose from.
Keep the list of functions to apply (functionsList) and respective type members to look up (first_types, second_types) in __constant__ memory, such that function functionsList[i](obj1,obj2) is applied to objects with obj1.type == first_types[i] and obj2.type == second_types[i].
Working code
The following code has been compiled for Linux x86_64 with CUDA 5.0, on a GPU with compute capability 3.0 (GeForce GTX 670), and works.
#include <stdio.h>
#include <iostream>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

struct customObj
{
    int type;
    double d;
    // Constructors
    __device__ __host__ customObj() {}
    __device__ __host__ customObj(const int& _type, const double& _d) : type(_type), d(_d) {}
};

typedef void (*function_t)(customObj&, customObj&);

// Define a bunch of functions
__host__ __device__ void Sum(customObj& obj1, customObj& obj2) {printf("Sum chosen! d1 + d2 = %f\n", obj1.d + obj2.d);}
__host__ __device__ void Subtract(customObj& obj1, customObj& obj2) {printf("Subtract chosen! d1 - d2 = %f\n", obj1.d - obj2.d);}
__host__ __device__ void Multiply(customObj& obj1, customObj& obj2) {printf("Multiply chosen! d1 * d2 = %f\n", obj1.d * obj2.d);}

#define ARRAYLENGTH 3
__constant__ int first_type[ARRAYLENGTH] = {1, 2, 3};
__constant__ int second_type[ARRAYLENGTH] = {1, 1, 2};
__constant__ function_t functionsList[ARRAYLENGTH] = {Sum, Sum, Subtract};

// Kernel to loop through functions list
__global__ void choosefunction(customObj obj1, customObj obj2) {
    int i = 0;
    function_t f = NULL;
    do {
        if ((obj1.type == first_type[i]) && (obj2.type == second_type[i])) {
            f = functionsList[i];
            break;
        }
        i++;
    } while (i < ARRAYLENGTH);
    if (f == NULL) printf("No possible interaction!\n");
    else f(obj1,obj2);
}

int main() {
    customObj obj1(1, 5.2), obj2(1, 2.6);
    choosefunction<<<1,1>>>(obj1, obj2);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    return 0;
}
The issue
The problem I have found is that, as soon as I change the datatype of the member int type and of the related variables and arrays (__constant__ int first_type[...] and so on), the code still compiles but stops working!
If I change the datatype from int to char or int8_t, the memory checker throws error 4 on my call to cudaDeviceSynchronize().
If I change the datatype to unsigned short int, I get a hardware stack overflow.
So, is anybody having similar issues when working with __constant__ memory? I really have no clue about what is going on. As far as I know, char and int8_t are built-in types of 1 byte length, while the size of int is 4 bytes, so maybe it is about data alignment, but I'm just guessing here. Besides, CUDA is supposed to support function pointers on the GPU since compute capability 2.0. Are there any special constraints for function pointers in __constant__ memory that I'm missing?
I was able to reproduce the problem (error 4, unspecified launch failure) on CUDA 5.0 on 64bit RHEL 5.5, but not on CUDA 6.0.
Please update/upgrade to CUDA 6.

How to use make_transform_iterator() with counting_iterator<> and execution_policy in Thrust?

I am trying to compile this code with MSVS 2012, CUDA 5.5, Thrust 1.7:
#include <iostream>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/find.h>
#include <thrust/execution_policy.h>

struct is_odd {
    __host__ __device__ bool operator()(uint64_t &x) {
        return x & 1;
    }
};

int main() {
    thrust::counting_iterator<uint64_t> first(0);
    thrust::counting_iterator<uint64_t> last = first + 100;
    auto iter = thrust::find(thrust::device,
                             thrust::make_transform_iterator(first, is_odd()),
                             thrust::make_transform_iterator(last, is_odd()),
                             true);
    int bbb; std::cin >> bbb;
    return 0;
}
and get an error:
Error 1 error : incomplete type is not allowed C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include\thrust\detail\type_traits.h 413 1 HostDevice
If I use host_vector/device_vector instead of counting_iterator then everything is OK. What's wrong?
I changed your functor definition slightly, from this:
struct is_odd {
__host__ __device__ bool operator()(uint64_t &x) {
to this:
struct is_odd : public thrust::unary_function<uint64_t, bool> {
__host__ __device__ bool operator()(const uint64_t &x) {
and it compiled for me.
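Put together, a sketch of the full program with that change applied (assembled from the question's code; the std::cin pause is replaced with a printout, and it has not been re-tested on the exact MSVS 2012/CUDA 5.5 toolchain):

#include <iostream>
#include <cstdint>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/find.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>

struct is_odd : public thrust::unary_function<uint64_t, bool> {
    __host__ __device__ bool operator()(const uint64_t &x) const {
        return x & 1;
    }
};

int main() {
    thrust::counting_iterator<uint64_t> first(0);
    thrust::counting_iterator<uint64_t> last = first + 100;
    // find the first transformed element that compares equal to true
    auto iter = thrust::find(thrust::device,
                             thrust::make_transform_iterator(first, is_odd()),
                             thrust::make_transform_iterator(last, is_odd()),
                             true);
    std::cout << "first odd value at position "
              << (iter - thrust::make_transform_iterator(first, is_odd()))
              << std::endl;
    return 0;
}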

How to advance iterator in thrust function

I'm doing some study on Thrust, but I don't understand how to get the value an iterator points to.
An example code is like:
#include <thrust/for_each.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <vector>
using namespace std;

class ADD
{
private:
    typedef typename thrust::device_vector<int>::iterator PTR;
public:
    ADD(){}
    ~ADD(){}

    void setPtr(PTR &ptr)
    { this->ptr = ptr; }

    __host__ __device__
    void operator()(int &x)
    {
        // note that using printf in a __device__ function requires
        // code compiled for a GPU with compute capability 2.0 or
        // higher (nvcc --arch=sm_20)
        x += add();
    }

    __host__ __device__
    int add()
    { return *ptr++; }

private:
    PTR ptr;
};

int main()
{
    thrust::device_vector<int> d_vec(3);
    d_vec[0] = 0; d_vec[1] = 1; d_vec[2] = 2;
    thrust::device_vector<int>::iterator itr = d_vec.begin();

    ADD *addtest = new ADD();
    addtest->setPtr(itr);

    thrust::for_each(d_vec.begin(), d_vec.end(), *addtest);
    for(int i = 0; i < 3; i++)
        cout << d_vec[i] << endl;
    return 0;
}
When I compile this using nvcc -arch=sm_20 test.cu, I get the following warning:
test.cu(28): warning: calling a host function("thrust::experimental::iterator_facade<thrust::detail::normal_iterator<thrust::device_ptr<int> > , thrust::device_ptr<int> , int, thrust::detail::cuda_device_space_tag, thrust::random_access_traversal_tag, thrust::device_reference<int> , long> ::operator *") from a __device__/__global__ function("printf_functor::add") is not allowed
test.cu(28): warning: calling a host function("thrust::experimental::iterator_facade<thrust::detail::normal_iterator<thrust::device_ptr<int> > , thrust::device_ptr<int> , int, thrust::detail::cuda_device_space_tag, thrust::random_access_traversal_tag, thrust::device_reference<int> , long> ::operator *") from a __device__/__global__ function("printf_functor::add") is not allowed
I cannot get this to compile. How can I solve this problem?
@Gang.Wang: I think you are just mixing up two different things: all the STL-like functionality, including for_each, device_vector iterators, etc., is just a "facade" which exists on the host only, while operator() contains the actual GPU code which is compiled into a CUDA kernel and applied to each element of your vector in parallel. Hence, device_vector iterators are not accessible from your functor.
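For what it's worth, here is a sketch of one idiomatic alternative, assuming the example's intent is to add each element of the vector to itself (i.e. double every element): instead of carrying a device_vector iterator inside the functor, let thrust::transform pair the ranges up.

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <iostream>

int main()
{
    thrust::device_vector<int> d_vec(3);
    d_vec[0] = 0; d_vec[1] = 1; d_vec[2] = 2;

    // d_vec[i] = d_vec[i] + d_vec[i], computed on the device;
    // thrust::transform allows the output range to coincide with an input range
    thrust::transform(d_vec.begin(), d_vec.end(),
                      d_vec.begin(), d_vec.begin(),
                      thrust::plus<int>());

    for (int i = 0; i < 3; i++)
        std::cout << d_vec[i] << std::endl;   // prints 0, 2, 4
    return 0;
}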