Thrust: why always host code is executed in spite of __CUDA_ARCH__ - cuda

I try to define two branches in code: one for CUDA execution and the other - without it (with future OMP in mind). But when I use macro __CUDA_ARCH__ it looks as if always the host code is executed. But I supposed that Thrust by default use CUDA (and branch for device code). What's wrong with my code?
Here it is:
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <stdio.h>
struct my_op
{
my_op(int init_const) : constanta(init_const) {}
__host__ __device__ int operator()(const int &x) const
{
#if defined(__CUDA_ARCH__)
return 2 * x * constanta; // never executed - why?
#else
return x * constanta; // always executed
#endif
}
private:
int constanta;
};
int main()
{
int data[7] = { 0, 0, 0, 0, 0, 0, 0 };
thrust::counting_iterator<int> first(10);
thrust::counting_iterator<int> last = first + 7;
int init_value = 1;
my_op op(init_value);
thrust::transform(first, last, data, op);
for each (int el in data)
std::cout << el << " ";
std::cout << std::endl;
}
I expect that "transform" will define vector as multiplied by 2*constanta but I see that host code is used - the output is "10 11 12 13 14 15 16", not "20 22 24 26 28 30 32" (as expected).
Why?

Thrust is choosing the host path because one of your data items supplied to the thrust transform operation is in host memory:
thrust::transform(first, last, data, op);
^^^^
If you want a thrust algorithm to operate on the device, generally speaking all the container data you pass to/from must also reside in device memory.
Here's a modification to your code that demonstrates that thrust will follow the device path if we replace data with a device-resident container:
$ cat t13.cu
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <stdio.h>
struct my_op
{
my_op(int init_const) : constanta(init_const) {}
__host__ __device__ int operator()(const int &x) const
{
#if defined(__CUDA_ARCH__)
return 2 * x * constanta; // never executed - why?
#else
return x * constanta; // always executed
#endif
}
private:
int constanta;
};
int main()
{
// int data[7] = { 0, 0, 0, 0, 0, 0, 0 };
thrust::counting_iterator<int> first(10);
thrust::counting_iterator<int> last = first + 7;
thrust::device_vector<int> d_data(7);
int init_value = 1;
my_op op(init_value);
thrust::transform(first, last, d_data.begin(), op);
for (int el = 0; el < 7; el++) {
int dat = d_data[el];
std::cout << dat << " "; }
std::cout << std::endl;
}
$ nvcc -arch=sm_61 -o t13 t13.cu
$ ./t13
20 22 24 26 28 30 32
$
You may want to read the thrust quick start guide to learn about thrust algorithm dispatch.

Related

How to use thrust::remove_if to check and remove blocks 2×i and 2×i+1 together

I am using Cuda C++, and I have a big array Arr including 64-bit unsigned integers in a form like the below:
Arr = {a1, b1, a2, b2, ..., an, bn}
The number of items in Arr is 2n which is an even number. Now, given a boolean function f(int a, int b), I wonder if I can use thrust::remove_if to check f(a1,b1), f(a2, b2), ..., f(an, bn) and remove both consecutive numbers (ai, bi) together if needed?
Rather than zip_iterator, I think a simpler approach is just to reinterpret the array of 64-bit integers as an array of thrust::pair. here is an example:
$ cat t2157.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/remove.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
struct my_remove
{
template <typename T>
__host__ __device__
bool operator()(T t){
return (thrust::get<0>(t) > thrust::get<1>(t));
}
};
const size_t n = 32768;
const size_t s = 2*n;
using mt=unsigned long long;
using dt=thrust::pair<mt,mt>;
int main(){
thrust::device_vector<mt> A(s);
thrust::sequence(A.begin(), A.end());
A[0] = 2; // expecting removal of the first pair
thrust::copy_n(A.begin(), 6, std::ostream_iterator<mt>(std::cout, ","));
std::cout << std::endl;
auto D = thrust::device_pointer_cast<dt>(reinterpret_cast<dt *>(thrust::raw_pointer_cast(A.data())));
thrust::remove_if(D, D+n, my_remove());
thrust::copy_n(A.begin(), 6, std::ostream_iterator<mt>(std::cout, ","));
std::cout << std::endl;
}
$ nvcc -o t2157 t2157.cu
$ compute-sanitizer ./t2157
========= COMPUTE-SANITIZER
2,1,2,3,4,5,
2,3,4,5,6,7,
========= ERROR SUMMARY: 0 errors
$

CUDA Thrust copy transformed result only if it satisfies a predicate

I want to perform a transformation on a input thrust::device_vector and only copy the result to the output vector if the result satisfies a predicate. So the number of results could be less than the size of the input device_vector (similar to the output vector of thrust::copy_if). I have not found a way to do this with thrust::transform_if. Currently I can do this with thrust::transform and thrust::remove_if as shown in the example below:
#include <thrust/random.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/remove.h>
#include <iostream>
__host__ __device__ unsigned int hash(unsigned int a) {
a = (a+0x7ed55d16) + (a<<12);
a = (a^0xc761c23c) ^ (a>>19);
a = (a+0x165667b1) + (a<<5);
a = (a+0xd3a2646c) ^ (a<<9);
a = (a+0xfd7046c5) + (a<<3);
a = (a^0xb55a4f09) ^ (a>>16);
return a;
};
struct add_random {
__host__ __device__ add_random() {}
__device__ int operator()(const int n, const int x) const {
thrust::default_random_engine rng(hash(n));
thrust::uniform_int_distribution<int> uniform(0, 11);
return uniform(rng)+x;
}
};
struct is_greater {
__host__ __device__ bool operator()(const int x) {
return x > 6 ;
}
};
int main(void) {
int x[5] = {10, 2, 5, 3, 0};
thrust::device_vector<int> d_x(x, x+5);
thrust::transform(
thrust::counting_iterator<int>(0),
thrust::counting_iterator<int>(5),
d_x.begin(),
d_x.begin(),
add_random());
std::cout << "after adding random number:" << std::endl;
std::ostream_iterator<int> o(std::cout, " ");
thrust::copy(d_x.begin(), d_x.end(), o);
std::cout << std::endl;
thrust::device_vector<int>::iterator new_end(thrust::remove_if(d_x.begin(), d_x.end(), is_greater()));
std::cout << "after removing values greater than 6:" << std::endl;
thrust::copy(d_x.begin(), new_end, o);
std::cout << std::endl;
return 0;
}
Which gives the output:
after adding random number:
18 4 8 7 11
after removing values greater than 6:
4
I would like to avoid copying the results to memory twice, first by thrust::transform and then by thrust::remove_if in the above example. Is it possible to get the above output with a single transformation function? How can I do this? My biggest concern is the computational cost, so any optimized solution, even if it doesn't use the Thrust library would be great.
Welcome to the world of thrust fancy iterators. You can get a quick overview of some fancy iterator types by looking at the thrust quick start guide. In particular, a thrust transform iterator can frequently be used to replace a thrust transform operation that is applied to the input of another thrust algorithm, "fusing" the two algorithms into a single operation.
Here's a worked example applied to your case:
$ cat t1254.cu
#include <thrust/random.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/remove.h>
#include <iostream>
__host__ __device__ unsigned int hash(unsigned int a) {
a = (a+0x7ed55d16) + (a<<12);
a = (a^0xc761c23c) ^ (a>>19);
a = (a+0x165667b1) + (a<<5);
a = (a+0xd3a2646c) ^ (a<<9);
a = (a+0xfd7046c5) + (a<<3);
a = (a^0xb55a4f09) ^ (a>>16);
return a;
};
struct add_random : public thrust::unary_function<thrust::tuple<int, int>, int> {
__host__ __device__ int operator()(thrust::tuple<int, int> t) const {
int n = thrust::get<0>(t);
int x = thrust::get<1>(t);
thrust::default_random_engine rng(hash(n));
thrust::uniform_int_distribution<int> uniform(0, 11);
return uniform(rng)+x;
}
};
struct is_greater {
__host__ __device__ bool operator()(const int x) {
return x < 6 ;
}
};
int main(void) {
int x[5] = {10, 2, 5, 3, 0};
thrust::device_vector<int> d_x(x, x+5);
thrust::device_vector<int> d_r(5);
int rsize = thrust::copy_if(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<int>(0), d_x.begin())), add_random()), thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<int>(5), d_x.end())), add_random()), d_r.begin(), is_greater())- d_r.begin();
std::cout << "after removing values greater than 6:" << std::endl;
thrust::copy_n(d_r.begin(), rsize, std::ostream_iterator<int>(std::cout, " "));
std::cout << std::endl;
return 0;
}
$ nvcc -o t1254 t1254.cu
$ ./t1254
after removing values greater than 6:
4
$
We've replaced your transform operation with a transform iterator applied to the same two inputs. Since you have two inputs to your transform operation, we're using a zip iterator to combine these, and the transform functor has also been reworked slightly to accept that tuple as its input.
Converted your remove_if to a copy_if, to work with the transform iterator as input. This requires a slight change in the logic of the copy predicate.

cuda function application elementwise in cuda

After multiplying a matrix A and a vector x obtaining the result y, I want to apply a function h elementwise to y.
I want to obtain z = h(Ax), where h is applied elementwise to the vector Ax.
I know how to make the matrix/vector multiplication on the GPU (with cublas). Now I want h (which is my own function, coded in C++) to be applied to the resultant vector also in GPU, how can I do that?
Two possible approaches are:
Write your own CUDA kernel to perform the operation
Use thrust (e.g. thrust::for_each() ).
Here is a worked example of both approaches:
$ cat t934.cu
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/for_each.h>
#define DSIZE 4
#define nTPB 256
template <typename T>
__host__ __device__ T myfunc(T &d){
return d + 5; // define your own function here
}
struct mytfunc
{
template <typename T>
__host__ __device__
void operator()(T &d){
d = myfunc(d);
}
};
template <typename T>
__global__ void mykernel(T *dvec, size_t dsize){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < dsize) dvec[idx] = myfunc(dvec[idx]);
}
int main(){
// first using kernel
float *h_data, *d_data;
h_data = new float[DSIZE];
cudaMalloc(&d_data, DSIZE*sizeof(float));
for (int i = 0; i < DSIZE; i++) h_data[i] = i;
cudaMemcpy(d_data, h_data, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
mykernel<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_data, DSIZE);
cudaMemcpy(h_data, d_data, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < DSIZE; i++) std::cout << h_data[i] << ",";
std::cout << std::endl;
// then using thrust
thrust::host_vector<float> hvec(h_data, h_data+DSIZE);
thrust::device_vector<float> dvec = hvec;
thrust::for_each(dvec.begin(), dvec.end(), mytfunc());
thrust::copy_n(dvec.begin(), DSIZE, std::ostream_iterator<float>(std::cout, ","));
std::cout << std::endl;
}
$ nvcc -o t934 t934.cu
$ ./t934
5,6,7,8,
10,11,12,13,
$
Note that in order to provide a complete example, I'm starting with a vector definition in host memory. If you already have the vector in device memory (perhaps as a result of computing y=Ax) then you can work directly on that, by passing that vector to the CUDA kernel, or using it directly in the thrust function, using a thrust::device_ptr wrapper (this method is covered in the thrust quick start guide previously linked.)
The assumption I've made here is you want to use an arbitrary function of one variable. This should handle pretty much arbitrary functions defined in myfunc. However, for some categories of functions that you may be interested in, you may be able to realize it one or more CUBLAS calls as well.

Thrust Gathering/Filtering

What I am trying to do is create a filter on a vector so it removes elements that do not pass a predicate test; but not too sure how I go about it.
I evaluate each element in my inputer vector against the predicate, for example in my code the is_even functor, in a device_vector vector. It is true if it passes the test and false if it's not.
Now I am stuck because I now have this bool vector and I want to gather the elements that passed this predicate test. I store it in a bool vector because I want to keep the result to filter other vectors.
#include ...
template<typename T>
struct is_even : thrust::unary_function<T, bool>
{
__host__ __device__
bool operator()(const T &x)
{
return (x%2)==0;
}
};
int main(void)
{
std::cout << "Loading test!" << std::endl;
const int N = 1000000;
thrust::device_vector<int> col1(N);
thrust::device_vector<float> col2(N, 1);
thrust::sequence(col1.begin(), col1.end());
thrust::device_vector<bool> filter(N);
thrust::transform(col1.begin(), col1.end(), filter.begin(), is_even<int>());
// filter col1 and col2 based on filter
return 0;
}
Within the stream compaction group you may be interested in thrust::copy_if
We can select the even elements into a new vector directly using your defined predicate without making an intermediate filter vector:
thrust::copy_if(col1.begin(), col1.end(), result.begin(), is_even<int>());
(result should be a vector of identical type to col1, and already defined to be a length equal to or greater than col1, since it's unknown how many elements will pass the predicate test.)
If you want to work off of the filter vector you have created, use the stencil version of copy_if instead.
Here's a worked example using the stencil method based on your comments:
$ cat t267.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
template<typename T>
struct is_even : thrust::unary_function<T, bool>
{
__host__ __device__
bool operator()(const T &x)
{
return (x%2)==0;
}
};
struct is_true : thrust::unary_function<bool, bool>
{
__host__ __device__
bool operator()(const bool &x)
{
return x;
}
};
int main(void)
{
std::cout << "Loading test!" << std::endl;
const int N = 1000000;
thrust::device_vector<int> col1(N);
thrust::device_vector<float> col2(N, 1);
thrust::sequence(col1.begin(), col1.end());
thrust::device_vector<bool> filter(N);
thrust::device_vector<int> result(N);
thrust::transform(col1.begin(), col1.end(), filter.begin(), is_even<int>());
// filter col1 based on filter
thrust::device_vector<int>::iterator end = thrust::copy_if(col1.begin(), col1.end(), filter.begin(), result.begin(), is_true());
int len = end - result.begin();
thrust::host_vector<int> h_result(len);
thrust::copy_n(result.begin(), len, h_result.begin());
thrust::copy_n(h_result.begin(), 10, std::ostream_iterator<int>(std::cout, "\n"));
return 0;
}
$ nvcc -arch=sm_20 -o t267 t267.cu
$ ./t267
Loading test!
0
2
4
6
8
10
12
14
16
18
$

CUDA host and device using same __constant__ memory

I have device/host function that uses constant memory. It runs OK on device, but on host it seems like this memory remains uninitialized.
#include <iostream>
#include <stdio.h>
const __constant__ double vals[2] = { 0.0, 1000.0 };
__device__ __host__ double f(size_t i)
{
return vals[i];
}
__global__ void kern()
{
printf("vals[%d] = %lf\n", threadIdx.x, vals[threadIdx.x]);
}
int main() {
std::cerr << f(0) << " " << f(1) << std::endl;
kern<<<1, 2>>>();
cudaThreadSynchronize();
}
This prints (requires CC 2.0 or above)
0 0
vals[0] = 0.000000
vals[1] = 1000.000000
What is the problem and how can I get both device and host memory constants initialized simultaneously?
Since CygnusX1 misunderstood what I meant in my comment on MurphEngineer's answer, maybe I should post my own answer. What I meant was this:
__constant__ double dc_vals[2] = { 0.0, 1000.0 };
const double hc_vals[2] = { 0.0, 1000.0 };
__device__ __host__ double f(size_t i)
{
#ifdef __CUDA_ARCH__
return dc_vals[i];
#else
return hc_vals[i];
#endif
}
This has the same result as Cygnus', but it is more flexible in the face of real code: it lets you have runtime-defined values in your constant arrays, for example, and allows you to use CUDA API functions like cudaMemcpyToSymbol/cudsaMemcpyFromSymbol on the __constant__ array.
A more realistic complete example:
#include <iostream>
#include <stdio.h>
__constant__ double dc_vals[2];
const double hc_vals[2];
__device__ __host__ double f(size_t i)
{
#ifdef __CUDA_ARCH__
return dc_vals[i];
#else
return hc_vals[i];
#endif
}
__global__ void kern()
{
printf("vals[%d] = %lf\n", threadIdx.x, vals[threadIdx.x]);
}
int main() {
hc_vals[0] = 0.0;
hc_vals[1] = 1000.0;
cudaMemcpyToSymbol(dc_vals, hc_vals, 2 * sizeof(double), 0, cudaMemcpyHostToDevice);
std::cerr << f(0) << " " << f(1) << std::endl;
kern<<<1, 2>>>();
cudaThreadSynchronize();
}
I think MurphEngineer explained well why it does not work.
To quickly fix this problem, you can follow harrism's idea, something like this:
#ifdef __CUDA_ARCH__
#define CONSTANT __constant__
#else
#define CONSTANT
#endif
const CONSTANT double vals[2] = { 0.0, 1000.0 };
This way the host compilation will create a normal host const array, while device compilation will create a device __constant__ compilation.
Do note that with this trick it might be harder to use CUDA API to access that device array with functions like cudaMemcpyToSymbol() if you ever decide to do so.
Using the __constant__ qualifier explicitly allocates that memory on the device. There is no way to access that memory from the host -- not even with the new CUDA Unified Addressing stuff (that only works for memory allocated with cudaMalloc() and its friends). Qualifying the variable with const just says "this is a constant pointer to (...)".
The correct way to do this is, indeed, to have two arrays: one on the host, and one on the device. Initialize your host array, then use cudaMemcpyToSymbol() to copy data to the device array at runtime. For more information on how to do this, see this thread: http://forums.nvidia.com/index.php?showtopic=69724
Absolutely great. I was struggling with the same issue and this provides a solution. However, the code suggested by harrism gives errors on compilation. Here is the fixed code which compiles correctly with nvcc:
#include <iostream>
#include <stdio.h>
__constant__ double dc_vals[2];
const double hc_vals[2] = {0.0, 1000.0};
__device__ __host__ double f(size_t i)
{
#ifdef __CUDA_ARCH__
return dc_vals[i];
#else
return hc_vals[i];
#endif
}
__global__ void kern()
{
printf("Device: vals[%d] = %lf\n", threadIdx.x, f(threadIdx.x));
}
int main() {
cudaMemcpyToSymbol(dc_vals, hc_vals, 2 * sizeof(double), 0, cudaMemcpyHostToDevice);
std::cerr << "Host: " << f(0) << " " << f(1) << std::endl;
kern<<<1, 2>>>();
cudaThreadSynchronize();
}