Calling a __host__ function from a __host__ __device__ function - CUDA

When compiling the MWE
#include <iostream>
#include "cuda.h"
struct Foo{
    ///*
    Foo( ){
        std::cout << "Construct" << std::endl;
    }
    Foo( const Foo & that ){
        std::cout << "Copy construct" << std::endl;
    }
    //*/
    __host__ __device__
    int bar( ) const {
        return 0;
    }
};
template<typename CopyBody>
__global__
void kernel( CopyBody cBody ){
    cBody( );
}
template <typename CopyBody>
void wrapper( CopyBody && cBody ){
    std::cout << "enquing kernel" << std::endl;
    kernel<<<1,32>>>( cBody );
    std::cout << "kernel enqued" << std::endl;
}
int main(int argc, char** argv) {
    Foo foo;
    std::cout << "enquing kernel" << std::endl;
    kernel<<<1,32>>>( [=] __device__ ( ) { foo.bar( ); } );
    std::cout << "kernel enqued" << std::endl;
    cudaDeviceSynchronize( );
    wrapper( [=] __device__ ( ) { foo.bar( ); } );
    cudaDeviceSynchronize( );
    return 0;
}
with CUDA 10.1 (nvcc --expt-extended-lambda test.cu -o test), the compiler warns: test.cu(16): warning: calling a __host__ function("Foo::Foo") from a __host__ __device__ function("") is not allowed. However, the copy constructor is never actually called on the device. CUDA 9.1 does not produce this warning.
What is the difference between the direct call to kernel (not producing the warning) and the wrapper version?
Is it safe to ignore this warning?
Where should #pragma hd_warning_disable or #pragma nv_exec_check_disable be placed to get rid of it?
The given MWE is based on a larger project, where the wrapper decides whether to use a __device__ or a __host__ lambda. The constructors/destructor cannot be marked __host__ __device__ since they must only run on the CPU (they (de)allocate CUDA memory); otherwise, doing that, or deleting the constructors/destructor (and letting the compiler create the default __host__ __device__ versions), would help.

With the following modifications I don't get the mentioned warning (I used CUDA 10.1 on Windows 10):
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
struct Baz {
    Baz() {
        printf("%s: Construct\n", __FUNCTION__);
    }
    Baz(const Baz & that) {
        printf("%s: Copy Construct\n", __FUNCTION__);
    }
};
struct Foo : public Baz {
    __host__ __device__
    int bar() const {
        return 0;
    }
};
template<typename CopyBody>
__global__
void kernel(CopyBody cBody) {
    cBody();
}
template <typename CopyBody>
void wrapper(CopyBody && cBody) {
    printf("%s: enquing kernel\n", __FUNCTION__);
    kernel<<<1, 32>>>(cBody);
    printf("%s: kernel enqued\n", __FUNCTION__);
}
int main(int argc, char** argv) {
    Foo foo;
    printf("%s: enquing kernel\n", __FUNCTION__);
    kernel<<<1, 32>>>([=] __device__() { foo.bar(); });
    printf("%s: kernel enqued\n", __FUNCTION__);
    cudaDeviceSynchronize();
    wrapper([=] __device__() { foo.bar(); });
    cudaDeviceSynchronize();
    return 0;
}
The above code produces the following output:
Foo::Foo: Construct
main: enquing kernel
Foo::Foo: Copy Construct
Foo::Foo: Copy Construct
main: kernel enqued
Foo::Foo: Copy Construct
Foo::Foo: Copy Construct
wrapper: enquing kernel
Foo::Foo: Copy Construct
wrapper: kernel enqued
I replaced <iostream> with <stdio.h> for convenience. printf() works from the kernel.

Related

Thrust: why is host code always executed in spite of __CUDA_ARCH__?

I am trying to define two branches in my code: one for CUDA execution and one without it (with future OpenMP in mind). But when I use the __CUDA_ARCH__ macro, it looks as if the host code is always executed. I assumed that Thrust uses CUDA by default (and takes the device branch). What's wrong with my code?
Here it is:
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <stdio.h>
#include <iostream>
struct my_op
{
    my_op(int init_const) : constanta(init_const) {}
    __host__ __device__ int operator()(const int &x) const
    {
#if defined(__CUDA_ARCH__)
        return 2 * x * constanta; // never executed - why?
#else
        return x * constanta; // always executed
#endif
    }
private:
    int constanta;
};
int main()
{
    int data[7] = { 0, 0, 0, 0, 0, 0, 0 };
    thrust::counting_iterator<int> first(10);
    thrust::counting_iterator<int> last = first + 7;
    int init_value = 1;
    my_op op(init_value);
    thrust::transform(first, last, data, op);
    for (int el : data)
        std::cout << el << " ";
    std::cout << std::endl;
}
I expect transform to fill the vector with values multiplied by 2 * constanta, but I see that the host code is used: the output is "10 11 12 13 14 15 16", not "20 22 24 26 28 30 32" as expected.
Why?
Thrust is choosing the host path because one of your data items supplied to the thrust transform operation is in host memory:
thrust::transform(first, last, data, op);
                               ^^^^
If you want a thrust algorithm to operate on the device, generally speaking all the container data you pass to/from must also reside in device memory.
Here's a modification to your code that demonstrates that thrust will follow the device path if we replace data with a device-resident container:
$ cat t13.cu
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <stdio.h>
struct my_op
{
    my_op(int init_const) : constanta(init_const) {}
    __host__ __device__ int operator()(const int &x) const
    {
#if defined(__CUDA_ARCH__)
        return 2 * x * constanta; // now executed: the algorithm is dispatched to the device
#else
        return x * constanta; // not executed in this version
#endif
    }
private:
    int constanta;
};
int main()
{
    // int data[7] = { 0, 0, 0, 0, 0, 0, 0 };
    thrust::counting_iterator<int> first(10);
    thrust::counting_iterator<int> last = first + 7;
    thrust::device_vector<int> d_data(7);
    int init_value = 1;
    my_op op(init_value);
    thrust::transform(first, last, d_data.begin(), op);
    for (int el = 0; el < 7; el++) {
        int dat = d_data[el];
        std::cout << dat << " ";
    }
    std::cout << std::endl;
}
$ nvcc -arch=sm_61 -o t13 t13.cu
$ ./t13
20 22 24 26 28 30 32
$
You may want to read the thrust quick start guide to learn about thrust algorithm dispatch.
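As an additional note (not from the original answer): dispatch can also be steered explicitly by passing an execution policy such as thrust::host or thrust::device as the first argument, as long as the data the iterators reference lives in memory the chosen backend can access. A minimal sketch, using thrust::negate simply as a stand-in functor:
#include <thrust/transform.h>
#include <thrust/execution_policy.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/functional.h>
#include <iostream>
int main()
{
    thrust::counting_iterator<int> first(10);
    thrust::counting_iterator<int> last = first + 7;
    // Explicitly request the device backend; the output container is device-resident.
    thrust::device_vector<int> d_out(7);
    thrust::transform(thrust::device, first, last, d_out.begin(), thrust::negate<int>());
    // Explicitly request the host backend; the output is plain host memory.
    int h_out[7];
    thrust::transform(thrust::host, first, last, h_out, thrust::negate<int>());
    for (int i = 0; i < 7; i++)
        std::cout << d_out[i] << " " << h_out[i] << std::endl;
    return 0;
}
With thrust::device the functor runs on the GPU (so the __CUDA_ARCH__ branch is taken); with thrust::host it runs on the CPU.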

CUDA Thrust copy transformed result only if it satisfies a predicate

I want to perform a transformation on an input thrust::device_vector and only copy the result to the output vector if the result satisfies a predicate. So the number of results could be less than the size of the input device_vector (similar to the output vector of thrust::copy_if). I have not found a way to do this with thrust::transform_if. Currently I can do this with thrust::transform and thrust::remove_if as shown in the example below:
#include <thrust/random.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/remove.h>
#include <iostream>
__host__ __device__ unsigned int hash(unsigned int a) {
    a = (a+0x7ed55d16) + (a<<12);
    a = (a^0xc761c23c) ^ (a>>19);
    a = (a+0x165667b1) + (a<<5);
    a = (a+0xd3a2646c) ^ (a<<9);
    a = (a+0xfd7046c5) + (a<<3);
    a = (a^0xb55a4f09) ^ (a>>16);
    return a;
};
struct add_random {
    __host__ __device__ add_random() {}
    __device__ int operator()(const int n, const int x) const {
        thrust::default_random_engine rng(hash(n));
        thrust::uniform_int_distribution<int> uniform(0, 11);
        return uniform(rng)+x;
    }
};
struct is_greater {
    __host__ __device__ bool operator()(const int x) {
        return x > 6;
    }
};
int main(void) {
    int x[5] = {10, 2, 5, 3, 0};
    thrust::device_vector<int> d_x(x, x+5);
    thrust::transform(
        thrust::counting_iterator<int>(0),
        thrust::counting_iterator<int>(5),
        d_x.begin(),
        d_x.begin(),
        add_random());
    std::cout << "after adding random number:" << std::endl;
    std::ostream_iterator<int> o(std::cout, " ");
    thrust::copy(d_x.begin(), d_x.end(), o);
    std::cout << std::endl;
    thrust::device_vector<int>::iterator new_end(thrust::remove_if(d_x.begin(), d_x.end(), is_greater()));
    std::cout << "after removing values greater than 6:" << std::endl;
    thrust::copy(d_x.begin(), new_end, o);
    std::cout << std::endl;
    return 0;
}
Which gives the output:
after adding random number:
18 4 8 7 11
after removing values greater than 6:
4
I would like to avoid copying the results to memory twice, first by thrust::transform and then by thrust::remove_if in the above example. Is it possible to get the above output with a single transformation function? How can I do this? My biggest concern is the computational cost, so any optimized solution, even one that doesn't use the Thrust library, would be great.
Welcome to the world of thrust fancy iterators. You can get a quick overview of some fancy iterator types by looking at the thrust quick start guide. In particular, a thrust transform iterator can frequently be used to replace a thrust transform operation that is applied to the input of another thrust algorithm, "fusing" the two algorithms into a single operation.
Here's a worked example applied to your case:
$ cat t1254.cu
#include <thrust/random.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/remove.h>
#include <iostream>
__host__ __device__ unsigned int hash(unsigned int a) {
    a = (a+0x7ed55d16) + (a<<12);
    a = (a^0xc761c23c) ^ (a>>19);
    a = (a+0x165667b1) + (a<<5);
    a = (a+0xd3a2646c) ^ (a<<9);
    a = (a+0xfd7046c5) + (a<<3);
    a = (a^0xb55a4f09) ^ (a>>16);
    return a;
};
struct add_random : public thrust::unary_function<thrust::tuple<int, int>, int> {
    __host__ __device__ int operator()(thrust::tuple<int, int> t) const {
        int n = thrust::get<0>(t);
        int x = thrust::get<1>(t);
        thrust::default_random_engine rng(hash(n));
        thrust::uniform_int_distribution<int> uniform(0, 11);
        return uniform(rng)+x;
    }
};
struct is_greater {
    __host__ __device__ bool operator()(const int x) {
        return x < 6;
    }
};
int main(void) {
    int x[5] = {10, 2, 5, 3, 0};
    thrust::device_vector<int> d_x(x, x+5);
    thrust::device_vector<int> d_r(5);
    int rsize = thrust::copy_if(
        thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<int>(0), d_x.begin())), add_random()),
        thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<int>(5), d_x.end())), add_random()),
        d_r.begin(),
        is_greater()) - d_r.begin();
    std::cout << "after removing values greater than 6:" << std::endl;
    thrust::copy_n(d_r.begin(), rsize, std::ostream_iterator<int>(std::cout, " "));
    std::cout << std::endl;
    return 0;
}
$ nvcc -o t1254 t1254.cu
$ ./t1254
after removing values greater than 6:
4
$
We've replaced your transform operation with a transform iterator applied to the same two inputs. Since you have two inputs to your transform operation, we're using a zip iterator to combine these, and the transform functor has also been reworked slightly to accept that tuple as its input.
We also converted your remove_if to a copy_if so that it works with the transform iterator as input. This requires a slight change in the logic of the copy predicate: it now selects the elements to keep rather than the ones to remove.

How to use make_transform_iterator() with counting_iterator<> and execution_policy in Thrust?

I am trying to compile this code with MSVS 2012, CUDA 5.5, and Thrust 1.7:
#include <iostream>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/find.h>
#include <thrust/execution_policy.h>
struct is_odd {
    __host__ __device__ bool operator()(uint64_t &x) {
        return x & 1;
    }
};
int main() {
    thrust::counting_iterator<uint64_t> first(0);
    thrust::counting_iterator<uint64_t> last = first + 100;
    auto iter = thrust::find(thrust::device,
                             thrust::make_transform_iterator(first, is_odd()),
                             thrust::make_transform_iterator(last, is_odd()),
                             true);
    int bbb; std::cin >> bbb;
    return 0;
}
and get an error:
Error 1 error : incomplete type is not allowed C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\include\thrust\detail\type_traits.h 413 1 HostDevice
If I use a host_vector or device_vector instead of counting_iterator, everything is fine. What's wrong?
I changed your functor definition slightly, from this:
struct is_odd {
    __host__ __device__ bool operator()(uint64_t &x) {
to this:
struct is_odd : public thrust::unary_function<uint64_t, bool> {
    __host__ __device__ bool operator()(const uint64_t &x) {
and it compiled for me.
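For completeness, a sketch of the whole program with that change applied (assembled here for illustration; not re-tested against the exact MSVS2012/CUDA 5.5/Thrust 1.7 combination from the question):
#include <iostream>
#include <cstdint>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/find.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
// Deriving from thrust::unary_function supplies the nested typedefs that
// transform_iterator needs to deduce its value type.
struct is_odd : public thrust::unary_function<uint64_t, bool> {
    __host__ __device__ bool operator()(const uint64_t &x) {
        return x & 1;
    }
};
int main() {
    thrust::counting_iterator<uint64_t> first(0);
    thrust::counting_iterator<uint64_t> last = first + 100;
    auto iter = thrust::find(thrust::device,
                             thrust::make_transform_iterator(first, is_odd()),
                             thrust::make_transform_iterator(last, is_odd()),
                             true);
    // Offset of the first element whose transformed value is true.
    std::cout << "first odd value at offset " << (iter.base() - first) << std::endl;
    return 0;
}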

Thrust Gathering/Filtering

What I am trying to do is create a filter on a vector so that it removes elements that do not pass a predicate test, but I am not sure how to go about it.
I evaluate each element in my input vector against the predicate (in my code, the is_even functor) and store the result in a device_vector: true if it passes the test and false if it does not.
Now I am stuck: I have this bool vector and I want to gather the elements that passed the predicate test. I store the result in a bool vector because I want to reuse it to filter other vectors.
#include ...
template<typename T>
struct is_even : thrust::unary_function<T, bool>
{
    __host__ __device__
    bool operator()(const T &x)
    {
        return (x%2)==0;
    }
};
int main(void)
{
    std::cout << "Loading test!" << std::endl;
    const int N = 1000000;
    thrust::device_vector<int> col1(N);
    thrust::device_vector<float> col2(N, 1);
    thrust::sequence(col1.begin(), col1.end());
    thrust::device_vector<bool> filter(N);
    thrust::transform(col1.begin(), col1.end(), filter.begin(), is_even<int>());
    // filter col1 and col2 based on filter
    return 0;
}
Within the stream compaction group, you may be interested in thrust::copy_if.
We can select the even elements into a new vector directly using your defined predicate without making an intermediate filter vector:
thrust::copy_if(col1.begin(), col1.end(), result.begin(), is_even<int>());
(result should be a vector of identical type to col1, and already defined to be a length equal to or greater than col1, since it's unknown how many elements will pass the predicate test.)
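A minimal sketch of that direct approach, continuing from the main() in the question (result and new_end are illustrative names, not from the original answer): size result for the worst case, run copy_if, then shrink it to the number of elements that actually passed:
    thrust::device_vector<int> result(col1.size()); // worst case: every element passes
    thrust::device_vector<int>::iterator new_end =
        thrust::copy_if(col1.begin(), col1.end(), result.begin(), is_even<int>());
    result.resize(new_end - result.begin()); // keep only the copied elements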
If you want to work off of the filter vector you have created, use the stencil version of copy_if instead.
Here's a worked example using the stencil method based on your comments:
$ cat t267.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
template<typename T>
struct is_even : thrust::unary_function<T, bool>
{
    __host__ __device__
    bool operator()(const T &x)
    {
        return (x%2)==0;
    }
};
struct is_true : thrust::unary_function<bool, bool>
{
    __host__ __device__
    bool operator()(const bool &x)
    {
        return x;
    }
};
int main(void)
{
    std::cout << "Loading test!" << std::endl;
    const int N = 1000000;
    thrust::device_vector<int> col1(N);
    thrust::device_vector<float> col2(N, 1);
    thrust::sequence(col1.begin(), col1.end());
    thrust::device_vector<bool> filter(N);
    thrust::device_vector<int> result(N);
    thrust::transform(col1.begin(), col1.end(), filter.begin(), is_even<int>());
    // filter col1 based on filter
    thrust::device_vector<int>::iterator end = thrust::copy_if(col1.begin(), col1.end(), filter.begin(), result.begin(), is_true());
    int len = end - result.begin();
    thrust::host_vector<int> h_result(len);
    thrust::copy_n(result.begin(), len, h_result.begin());
    thrust::copy_n(h_result.begin(), 10, std::ostream_iterator<int>(std::cout, "\n"));
    return 0;
}
$ nvcc -arch=sm_20 -o t267 t267.cu
$ ./t267
Loading test!
0
2
4
6
8
10
12
14
16
18
$

CUDA host and device using same __constant__ memory

I have a device/host function that uses constant memory. It runs fine on the device, but on the host the memory seems to remain uninitialized.
#include <iostream>
#include <stdio.h>
const __constant__ double vals[2] = { 0.0, 1000.0 };
__device__ __host__ double f(size_t i)
{
    return vals[i];
}
__global__ void kern()
{
    printf("vals[%d] = %lf\n", threadIdx.x, vals[threadIdx.x]);
}
int main() {
    std::cerr << f(0) << " " << f(1) << std::endl;
    kern<<<1, 2>>>();
    cudaThreadSynchronize();
}
This prints (device-side printf requires CC 2.0 or above):
0 0
vals[0] = 0.000000
vals[1] = 1000.000000
What is the problem and how can I get both device and host memory constants initialized simultaneously?
Since CygnusX1 misunderstood what I meant in my comment on MurphEngineer's answer, maybe I should post my own answer. What I meant was this:
__constant__ double dc_vals[2] = { 0.0, 1000.0 };
const double hc_vals[2] = { 0.0, 1000.0 };
__device__ __host__ double f(size_t i)
{
#ifdef __CUDA_ARCH__
    return dc_vals[i];
#else
    return hc_vals[i];
#endif
}
This has the same result as Cygnus', but it is more flexible in the face of real code: it lets you have runtime-defined values in your constant arrays, for example, and it allows you to use CUDA API functions like cudaMemcpyToSymbol/cudaMemcpyFromSymbol on the __constant__ array.
A more realistic complete example:
#include <iostream>
#include <stdio.h>
__constant__ double dc_vals[2];
const double hc_vals[2];
__device__ __host__ double f(size_t i)
{
#ifdef __CUDA_ARCH__
    return dc_vals[i];
#else
    return hc_vals[i];
#endif
}
__global__ void kern()
{
    printf("vals[%d] = %lf\n", threadIdx.x, vals[threadIdx.x]);
}
int main() {
    hc_vals[0] = 0.0;
    hc_vals[1] = 1000.0;
    cudaMemcpyToSymbol(dc_vals, hc_vals, 2 * sizeof(double), 0, cudaMemcpyHostToDevice);
    std::cerr << f(0) << " " << f(1) << std::endl;
    kern<<<1, 2>>>();
    cudaThreadSynchronize();
}
I think MurphEngineer explained well why it does not work.
To quickly fix this problem, you can follow harrism's idea, something like this:
#ifdef __CUDA_ARCH__
#define CONSTANT __constant__
#else
#define CONSTANT
#endif
const CONSTANT double vals[2] = { 0.0, 1000.0 };
This way the host compilation will create a normal host const array, while the device compilation will create a device __constant__ array.
Do note that with this trick it might be harder to use the CUDA API to access that device array with functions like cudaMemcpyToSymbol(), if you ever decide to do so.
Using the __constant__ qualifier explicitly allocates that memory on the device. There is no way to access that memory from the host -- not even with the new CUDA Unified Addressing stuff (that only works for memory allocated with cudaMalloc() and its friends). Qualifying the variable with const just says "this is a constant pointer to (...)".
The correct way to do this is, indeed, to have two arrays: one on the host, and one on the device. Initialize your host array, then use cudaMemcpyToSymbol() to copy data to the device array at runtime. For more information on how to do this, see this thread: http://forums.nvidia.com/index.php?showtopic=69724
Absolutely great. I was struggling with the same issue and this provides a solution. However, the code suggested by harrism gives errors on compilation. Here is the fixed code which compiles correctly with nvcc:
#include <iostream>
#include <stdio.h>
__constant__ double dc_vals[2];
const double hc_vals[2] = {0.0, 1000.0};
__device__ __host__ double f(size_t i)
{
#ifdef __CUDA_ARCH__
    return dc_vals[i];
#else
    return hc_vals[i];
#endif
}
__global__ void kern()
{
    printf("Device: vals[%d] = %lf\n", threadIdx.x, f(threadIdx.x));
}
int main() {
    cudaMemcpyToSymbol(dc_vals, hc_vals, 2 * sizeof(double), 0, cudaMemcpyHostToDevice);
    std::cerr << "Host: " << f(0) << " " << f(1) << std::endl;
    kern<<<1, 2>>>();
    cudaThreadSynchronize();
}