How to use make_transform_iterator() with counting_iterator<> and execution_policy in Thrust?

I am trying to compile this code with MSVS 2012, CUDA 5.5, and Thrust 1.7:
#include <iostream>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/find.h>
#include <thrust/execution_policy.h>

struct is_odd {
    __host__ __device__ bool operator()(uint64_t &x) {
        return x & 1;
    }
};

int main() {
    thrust::counting_iterator<uint64_t> first(0);
    thrust::counting_iterator<uint64_t> last = first + 100;
    auto iter = thrust::find(thrust::device,
                             thrust::make_transform_iterator(first, is_odd()),
                             thrust::make_transform_iterator(last, is_odd()),
                             true);
    int bbb; std::cin >> bbb;
    return 0;
}
and get an error:
Error 1 error : incomplete type is not allowed C:\Program Files\NVIDIA
GPU Computing Toolkit\CUDA\v5.5\include\thrust\detail\type_traits.h
413 1 HostDevice
If I use a host_vector/device_vector instead of the counting_iterator, everything works. What's wrong?

I changed your functor definition slightly, from this:

struct is_odd {
    __host__ __device__ bool operator()(uint64_t &x) {

to this:

struct is_odd : public thrust::unary_function<uint64_t, bool> {
    __host__ __device__ bool operator()(const uint64_t &x) {

and it compiled for me. Deriving from thrust::unary_function supplies the nested result_type typedef that transform_iterator uses to work out its value type (hence the "incomplete type" error without it), and taking the argument by const reference lets the operator bind to the temporary values a counting_iterator produces.
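For reference, here is the complete program with those two changes applied (a sketch; I also added <cstdint> for uint64_t and <thrust/functional.h> for thrust::unary_function):

#include <iostream>
#include <cstdint>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/find.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>  // thrust::unary_function

// unary_function provides the nested result_type that transform_iterator
// inspects; the const reference binds to counting_iterator's temporaries.
struct is_odd : public thrust::unary_function<uint64_t, bool> {
    __host__ __device__ bool operator()(const uint64_t &x) {
        return x & 1;
    }
};

int main() {
    thrust::counting_iterator<uint64_t> first(0);
    thrust::counting_iterator<uint64_t> last = first + 100;
    auto iter = thrust::find(thrust::device,
                             thrust::make_transform_iterator(first, is_odd()),
                             thrust::make_transform_iterator(last, is_odd()),
                             true);
    std::cout << "first odd value at offset "
              << iter - thrust::make_transform_iterator(first, is_odd())
              << std::endl;
    return 0;
}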

Related

calling a __host__ function from a __host__ __device__ function

When compiling the MWE
#include <iostream>
#include "cuda.h"

struct Foo {
    ///*
    Foo( ) {
        std::cout << "Construct" << std::endl;
    }
    Foo( const Foo & that ) {
        std::cout << "Copy construct" << std::endl;
    }
    //*/
    __host__ __device__
    int bar( ) const {
        return 0;
    }
};

template <typename CopyBody>
__global__
void kernel( CopyBody cBody ) {
    cBody( );
}

template <typename CopyBody>
void wrapper( CopyBody && cBody ) {
    std::cout << "enquing kernel" << std::endl;
    kernel<<<1,32>>>( cBody );
    std::cout << "kernel enqued" << std::endl;
}

int main(int argc, char** argv) {
    Foo foo;
    std::cout << "enquing kernel" << std::endl;
    kernel<<<1,32>>>( [=] __device__ ( ) { foo.bar( ); } );
    std::cout << "kernel enqued" << std::endl;
    cudaDeviceSynchronize( );
    wrapper( [=] __device__ ( ) { foo.bar( ); } );
    cudaDeviceSynchronize( );
    return 0;
}
with CUDA 10.1 (nvcc --expt-extended-lambda test.cu -o test), the compiler emits the warning

test.cu(16): warning: calling a __host__ function("Foo::Foo") from a __host__ __device__ function("") is not allowed

However, the copy constructor is never actually called on the device. CUDA 9.1 does not produce this warning.
What is the difference between the direct call to kernel (not producing the warning) and the wrapper version?
Is it safe to ignore this warning?
Where to put #pragma hd_warning_disable or #pragma nv_exec_check_disable to get rid of it?
The given MWE is based on a larger project, where the wrapper decides whether to use a __device__ or a __host__ lambda. The constructors/destructors cannot be marked __host__ __device__ since they must run on the CPU only (they (de)allocate CUDA memory) - otherwise that, or deleting the constructors/destructor (and letting the compiler create the default __host__ __device__ versions), would help.
With the following modifications I don't get the mentioned warnings (I used CUDA 10.1 on Windows 10):
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

struct Baz {
    Baz() {
        printf("%s: Construct\n", __FUNCTION__);
    }
    Baz(const Baz & that) {
        printf("%s: Copy Construct\n", __FUNCTION__);
    }
};

struct Foo : public Baz {
    __host__ __device__
    int bar() const {
        return 0;
    }
};

template <typename CopyBody>
__global__
void kernel(CopyBody cBody) {
    cBody();
}

template <typename CopyBody>
void wrapper(CopyBody && cBody) {
    printf("%s: enquing kernel\n", __FUNCTION__);
    kernel<<<1, 32>>>(cBody);
    printf("%s: kernel enqued\n", __FUNCTION__);
}

int main(int argc, char** argv) {
    Foo foo;
    printf("%s: enquing kernel\n", __FUNCTION__);
    kernel<<<1, 32>>>([=] __device__() { foo.bar(); });
    printf("%s: kernel enqued\n", __FUNCTION__);
    cudaDeviceSynchronize();
    wrapper([=] __device__() { foo.bar(); });
    cudaDeviceSynchronize();
    return 0;
}
The above code produces the following output:
Foo::Foo: Construct
main: enquing kernel
Foo::Foo: Copy Construct
Foo::Foo: Copy Construct
main: kernel enqued
Foo::Foo: Copy Construct
Foo::Foo: Copy Construct
wrapper: enquing kernel
Foo::Foo: Copy Construct
wrapper: kernel enqued
I replaced <iostream> with <stdio.h> for convenience. printf() works from the kernel.
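Regarding where to put the pragma: as far as I know, #pragma nv_exec_check_disable applies to the function definition that immediately follows it (this is how it is used inside the Thrust sources), so one option is to suppress the check for just the wrapper template. A sketch based on the question's code; whether this silences the warning in this exact case is worth verifying:

// Suppress the host/device execution-space check for the next function only.
#pragma nv_exec_check_disable
template <typename CopyBody>
void wrapper(CopyBody && cBody) {
    std::cout << "enquing kernel" << std::endl;
    kernel<<<1,32>>>( cBody );
    std::cout << "kernel enqued" << std::endl;
}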

warning: calling a __host__ function from a __host__ __device__ function is not allowed

I referenced almost all similar questions but did not find an answer. Error checking is recommended by many people, so I tried to use a CHECKED_CALL()-style macro to make the program robust, but my code ran into two problems:
As the title says, I got a warning message, and before I added #pragma hd_warning_disable I got this error message:

cuEntityIDBuffer.cu(9): error: identifier "stderr" is undefined in device code

When I compiled maintest.cpp (EDIT: with g++ -c maintest.cpp -std=c++11), I got another error:

cuEntityIDBuffer.h:1:27: fatal error: thrust/reduce.h: No such file or directory

However, it works fine when compiling cuEntityIDBuffer.cu, which also includes cuEntityIDBuffer.h:

nvcc -arch=sm_35 -Xcompiler '-fPIC' -dc cuEntityIDBuffer.cu

Both cuEntityIDBuffer.cu and maintest.cpp #include "cuEntityIDBuffer.h", but only maintest.cpp throws the error; I have no idea why.
The code is below:
cuEntityIDBuffer.h
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>

#ifdef __CUDACC__
#define CUDA_CALLABLE_MEMBER __host__ __device__
#else
#define CUDA_CALLABLE_MEMBER
#endif

class cuEntityIDBuffer
{
public:
    CUDA_CALLABLE_MEMBER cuEntityIDBuffer();
    CUDA_CALLABLE_MEMBER cuEntityIDBuffer(unsigned int* buffer);
    CUDA_CALLABLE_MEMBER void cuCallBackEntityIDBuffer(unsigned int* buffer);
    CUDA_CALLABLE_MEMBER ~cuEntityIDBuffer();
    CUDA_CALLABLE_MEMBER void cuTest();
private:
    size_t buffersize;
    unsigned int* cuBuffer;
};
cuEntityIDBuffer.cu
#include "cuEntityIDBuffer.h"
#include <stdio.h>
#pragma hd_warning_disable
#define nTPB 256
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void mykernel(unsigned int* buffer)
{
int idx = threadIdx.x + (blockDim.x * blockIdx.x);
buffer[idx]++;
//other things.
}
cuEntityIDBuffer::cuEntityIDBuffer()
{
buffersize=1024;
gpuErrchk(cudaMalloc(&cuBuffer, buffersize * sizeof(unsigned int)));
}
cuEntityIDBuffer::cuEntityIDBuffer(unsigned int* buffer)
{
buffersize=1024;
gpuErrchk(cudaMalloc(&cuBuffer, buffersize * sizeof(unsigned int)));
gpuErrchk(cudaMemcpy(cuBuffer,buffer,buffersize*sizeof(unsigned int),cudaMemcpyHostToDevice));
}
void cuEntityIDBuffer::cuCallBackEntityIDBuffer(unsigned int* buffer)
{
gpuErrchk(cudaMemcpy(buffer,cuBuffer,buffersize*sizeof(unsigned int),cudaMemcpyDeviceToHost));
}
cuEntityIDBuffer::~cuEntityIDBuffer()
{
gpuErrchk(cudaFree((cuBuffer)));
}
void cuEntityIDBuffer::cuTest()
{
mykernel<<<((buffersize+nTPB-1)/nTPB),nTPB>>>(cuBuffer);
gpuErrchk(cudaPeekAtLastError());
}
maintest.cpp
#include "cuEntityIDBuffer.h"
#include <iostream>
int main(int argc, char const *argv[])
{
unsigned int *h_buf;
h_buf=malloc(1024*sizeof(unsigned int));
cuEntityIDBuffer d_buf(h_buf);
d_buf.cuTest();
d_buf.cuCallBackEntityIDBuffer(h_buf);
return 0;
}
Am I using the CHECKED_CALL()-style macro the wrong way, or is there a problem with my code organization? Any suggestion is appreciated.
Your methods are declared __host__ __device__, which means they will be compiled once for the CPU and once for the device. I don't see any big issue with the CPU version. However, the device version has two problems:

cuEntityIDBuffer.cu(9): error: identifier "stderr" is undefined in device code is very clear: you're trying to use the CPU variable stderr in device code.

warning: calling a __host__ function from a __host__ __device__ function is not allowed is the same kind of problem: without any __host__, __device__ or __global__ attribute, a symbol is implicitly __host__, which means in your case that the device version of your methods is trying to call gpuAssert, which exists only on the CPU side.

For cuEntityIDBuffer.h:1:27: fatal error: thrust/reduce.h: No such file or directory, as @talonmies pointed out, any code that includes Thrust headers has to be built with nvcc.
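For what it's worth, a minimal restructuring that sidesteps both the device-side problems and the g++ include failure, assuming these methods only ever run on the host (they allocate, copy, and launch, which are host-side operations anyway): drop the CUDA_CALLABLE_MEMBER decoration so no device versions are generated, and keep the Thrust/CUDA includes out of the header so a plain C++ compiler can parse it. A sketch of the header:

// cuEntityIDBuffer.h -- sketch: no Thrust or CUDA headers needed here;
// the .cu implementation file (compiled by nvcc) includes them instead.
#include <cstddef>

class cuEntityIDBuffer
{
public:
    cuEntityIDBuffer();                                   // host-only
    cuEntityIDBuffer(unsigned int* buffer);               // host-only
    void cuCallBackEntityIDBuffer(unsigned int* buffer);  // host-only
    ~cuEntityIDBuffer();
    void cuTest();
private:
    std::size_t buffersize;
    unsigned int* cuBuffer;
};

The object files are then linked as usual: g++ compiles maintest.cpp against this header, and nvcc compiles and device-links the .cu files.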

thrust iterator mix usage

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/execution_policy.h>
#include <iostream>
#include <thrust/transform.h>

struct text_functor {
    text_functor() {}
    __host__ __device__ int operator()(const char t) const {
        if (t == '\n') return 0;
        return 1;
    }
};

void CountPosition1(const char *text, int *pos, int text_size)
{
    thrust::transform(text, text + text_size, pos, text_functor());
}

int main() {
    char s[4] = {'a', 'a', '\n', 'a'};
    int r[4] = {0};
    int *k;
    cudaMalloc((void**) &k, sizeof(int) * 4);
    CountPosition1(s, k, 4);
}
In thrust::transform, I mix the host pointer s and the device pointer k. This results in a segmentation fault. If I pass r instead of k to CountPosition1, the program runs correctly.
Should all iterators in a thrust algorithm come from the same source (both host or both device), or is there something wrong in this code?
Yes, either all iterators should come from host containers, or all iterators should come from device containers.
Upon algorithm dispatch, thrust inspects the iterators and dispatches either the host path or the device path; all iterators must be consistent with that dispatch. Note that a raw pointer is treated as a host iterator, so this call dispatches the host path and then dereferences the device pointer k on the host, which is what produces the segmentation fault.
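For illustration, a sketch of one way to fix CountPosition1 under that rule: copy the host input to the device, and wrap the raw device pointer so Thrust recognizes it as a device iterator and dispatches the device path.

#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>

void CountPosition1(const char *text, int *pos, int text_size)
{
    // Copy the host input into a device container.
    thrust::device_vector<char> d_text(text, text + text_size);
    // Wrap the raw device pointer so it carries the device tag.
    thrust::device_ptr<int> d_pos = thrust::device_pointer_cast(pos);
    // Now every argument is a device iterator: device path is dispatched.
    thrust::transform(d_text.begin(), d_text.end(), d_pos, text_functor());
}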

Template __host__ __device__ calling host defined functions

During implementation of CUDA code I often need utility functions that are called from device as well as host code, so I declare them __host__ __device__. This is fine, and any device/host incompatibilities can be handled with #ifdef __CUDA_ARCH__, as sketched below.
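For example, a minimal sketch of such a utility (clamp01 is a made-up name for illustration):

#include <algorithm>
#include <cmath>

__host__ __device__ inline double clamp01(double x) {
#ifdef __CUDA_ARCH__
    // device compilation pass: use CUDA's built-in fmin/fmax
    return fmin(fmax(x, 0.0), 1.0);
#else
    // host compilation pass: use the standard library
    return std::min(std::max(x, 0.0), 1.0);
#endif
}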
Problems arise when the utility function is templated, i.e. by some functor type. If the template instance calls a __host__ function I get this warning:
calling a __host__ function from a __host__ __device__ function is not allowed
detected during instantiation of "int foo(const T &) [with T=HostObject]"
The only solution I know is to define the function twice - once for device and once for host code, with different names (I cannot overload on __host__ __device__). But this means code duplication, and every other __host__ __device__ function that calls it must also be defined twice (even more code duplication).
Simplified example:
#include <cuda.h>
#include <iostream>

struct HostObject {
    __host__
    int value() const { return 42; }
};

struct DeviceObject {
    __device__
    int value() const { return 3; }
};

template <typename T>
__host__ __device__
int foo(const T &obj) {
    return obj.value();
}

/*
template <typename T>
__host__
int foo_host(const T &obj) {
    return obj.value();
}

template <typename T>
__device__
int foo_device(const T &obj) {
    return obj.value();
}
*/

__global__ void kernel(int *data) {
    data[threadIdx.x] = foo(DeviceObject());
}

int main() {
    foo(HostObject());
    int *data;
    cudaMalloc((void**)&data, sizeof(int) * 64);
    kernel<<<1, 64>>>(data);
    cudaDeviceSynchronize();
    cudaFree(data);
}
The warning is caused by the foo(HostObject()); call inside the main() function.
foo_host<> and foo_device<> are possible replacements for the problematic foo<>.
Is there a better solution? Can I prevent instantiation of foo() on the device side?
You cannot prevent instantiation of either half of a __host__ __device__ function template instantiation. If you instantiate the function by calling it on the host (device), the compiler will also instantiate the device (host) half.
The best you can do for your use case as of CUDA 7.0 is to suppress the warning using #pragma hd_warning_disable as in the following example and ensure that the function is not called incorrectly.
#include <iostream>
#include <cstdio>

#pragma hd_warning_disable

template <class Function>
__host__ __device__
void invoke(Function f)
{
    f();
}

struct host_only
{
    __host__
    void operator()()
    {
        std::cout << "host_only()" << std::endl;
    }
};

struct device_only
{
    __device__
    void operator()()
    {
        printf("device_only(): thread %d\n", threadIdx.x);
    }
};

__global__
void kernel()
{
    // use from device with device functor
    invoke(device_only());

    // XXX error
    // invoke(host_only());
}

int main()
{
    // use from host with host functor
    invoke(host_only());

    kernel<<<1,1>>>();
    cudaDeviceSynchronize();

    // XXX error
    // invoke(device_only());

    return 0;
}
I was struggling with the same problem and found half of a solution. One can "overload" the host and device function by adding a dummy template parameter to it.
In device code the __device__ "overload" of f is called; in host code the __host__ "overload" of f is called.
Unfortunately, this turns f into a template function. In particular, this causes big problems for constructors (which I am still struggling with).
#include <type_traits>
#include <cstdio>

#ifndef __CUDA_ARCH__
static constexpr bool in_cuda_code = false;
#else
static constexpr bool in_cuda_code = true;
#endif

__device__ void g_device() { printf("device\n"); }
__host__ void g_host() { printf("host\n"); }

template <bool b = in_cuda_code> void f();
template <> __device__ void f<true>() { g_device(); }
template <> __host__ void f<false>() { g_host(); }

__global__ void kernel() {
    f();
}

int main() {
    f();
    kernel<<<1,1>>>();
    cudaDeviceSynchronize();
}
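As a side note: newer toolkits (CUDA 11.4 or so, if I remember the versions correctly) ship libcu++'s <nv/target> header, which packages essentially this host/device dispatch behind a macro, so no hand-rolled template trick is needed. A sketch, under the assumption that I am recalling the macro names correctly:

#include <nv/target>
#include <cstdio>

__host__ __device__ void f() {
    // Expands to the first block when compiling for device,
    // to the second when compiling for host.
    NV_IF_ELSE_TARGET(NV_IS_DEVICE,
                      (printf("device\n");),
                      (printf("host\n");));
}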

CUDA 5.0 namespaces for constant memory variable usage

In my program I want to use a structure containing constant variables and keep it on the device for as long as the program executes.
I have several header files containing the declarations of __global__ functions and their respective .cu files with the definitions. I kept this scheme because it helps me keep similar code in one place, e.g. all the __device__ functions required by KERNEL_1 are separated from the __device__ functions required by KERNEL_2, along with the kernel definitions.
I had no problems with this scheme during compilation and linking until I encountered constant variables. I want to use the same constant variable in all kernels and device functions, but it doesn't seem to work.
##########################################################################
CODE EXAMPLE
###########################################################################
filename: 'common.h'
--------------------------------------------------------------------------
typedef struct {
    double height;
    double weight;
    int age;
} __CONSTANTS;

__constant__ __CONSTANTS d_const;
---------------------------------------------------------------------------
filename: main.cu
---------------------------------------------------------------------------
#include "common.h"
#include "gpukernels.h"
int main(int argc, char **argv) {
__CONSTANTS T;
T.height = 1.79;
T.weight = 73.2;
T.age = 26;
cudaMemcpyToSymbol(d_const, &T, sizeof(__CONSTANTS));
test_kernel <<< 1, 16 >>>();
cudaDeviceSynchronize();
}
---------------------------------------------------------------------------
filename: gpukernels.h
---------------------------------------------------------------------------
__global__ void test_kernel();
---------------------------------------------------------------------------
filename: gpukernels.cu
---------------------------------------------------------------------------
#include <stdio.h>
#include "gpukernels.h"
#include "common.h"

__global__ void test_kernel() {
    printf("Id: %d, height: %f, weight: %f\n", threadIdx.x, d_const.height, d_const.weight);
}
When I execute this code, the kernel runs and displays the thread IDs, but the constant values are displayed as zeros. How can I fix this?
MODIFICATIONS AS SUGGESTED
filename: gpukernels.h
----------------------------------------------------------------------
__global__ void test_kernel();
----------------------------------------------------------------------
filename: gpukernels.cu
----------------------------------------------------------------------
#include <stdio.h>
#include "common.h"
#include "gpukernels.h"

extern "C" __constant__ __CONSTANTS d_const;

__global__ void test_kernel() {
    printf("Id: %d, Height: %f, Weight: %f\n", threadIdx.x, d_const.height, d_const.weight);
}
----------------------------------------------------------------------
filename: common.h
----------------------------------------------------------------------
typedef struct {
    double height;
    double weight;
    int age;
} __CONSTANTS;
----------------------------------------------------------------------
filename: main.cu
----------------------------------------------------------------------
#include "common.h"
#include "gpukernels.h"
__constant__ __CONSTANTS d_const;
int main(int argc, char **argv) {
__CONSTANTS T;
T.height = 1.79;
T.weight = 73.2;
T.age = 26;
cudaMemcpyToSymbol(d_const, &T, sizeof(__CONSTANTS));
test_kernel <<< 1, 16 >>> ();
cudaDeviceSynchronize();
return 0;
}
As suggested, I tried the code; it still doesn't work. Did I miss something here?
Below I report the solution that is working for me. Remember that you are using separate compilation, so do not forget to generate relocatable device code (the -rdc=true option).
FILE main.cu
#include <cuda.h>
#include <cuda_runtime.h>
#include <conio.h>

typedef struct {
    double height;
    double weight;
    int age;
} __CONSTANTS;

__constant__ __CONSTANTS d_const;

__global__ void test_kernel();

int main(int argc, char **argv) {
    __CONSTANTS T;
    T.height = 1.79;
    T.weight = 73.2;
    T.age = 26;
    cudaMemcpyToSymbol(d_const, &T, sizeof(__CONSTANTS));
    test_kernel<<<1, 16>>>();
    cudaDeviceSynchronize();
    getch();
    return 0;
}
FILE kernel.cu
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>

typedef struct {
    double height;
    double weight;
    int age;
} __CONSTANTS;

extern __constant__ __CONSTANTS d_const;

__global__ void test_kernel() {
    printf("Id: %d, height: %f, weight: %f\n", threadIdx.x, d_const.height, d_const.weight);
}
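For completeness, the two files can then be built with a command along these lines (a sketch using the file names above; -rdc=true enables relocatable device code so the extern __constant__ symbol resolves at device link time):

nvcc -rdc=true main.cu kernel.cu -o test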