How to use thrust::remove_if to check and remove blocks 2×i and 2×i+1 together - cuda

I am using Cuda C++, and I have a big array Arr including 64-bit unsigned integers in a form like the below:
Arr = {a1, b1, a2, b2, ..., an, bn}
The number of items in Arr is 2n which is an even number. Now, given a boolean function f(int a, int b), I wonder if I can use thrust::remove_if to check f(a1,b1), f(a2, b2), ..., f(an, bn) and remove both consecutive numbers (ai, bi) together if needed?

Rather than using a zip_iterator, I think a simpler approach is to reinterpret the array of 64-bit integers as an array of thrust::pair. Here is an example:
$ cat t2157.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/remove.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
// Removal predicate for thrust::remove_if over pair-like elements.
// Returns true (meaning "remove this element") when component 0 is
// strictly greater than component 1.
struct my_remove
{
    template <typename PairLike>
    __host__ __device__
    bool operator()(PairLike element){
        const auto first  = thrust::get<0>(element);
        const auto second = thrust::get<1>(element);
        return first > second;
    }
};
const size_t n = 32768;   // number of (a, b) pairs
const size_t s = 2*n;     // total number of 64-bit values in the array
using mt=unsigned long long;
using dt=thrust::pair<mt,mt>;
// The array of mt is reinterpreted below as an array of dt. That is only
// meaningful if one pair occupies exactly the storage of two adjacent mt
// values, so verify the assumption at compile time.
static_assert(sizeof(dt) == 2*sizeof(mt),
              "thrust::pair<mt,mt> must be laid out as two packed mt values");
// Demo: remove (a_i, b_i) pairs with a_i > b_i from an interleaved array and
// print the first six values before and after the removal.
int main(){
    thrust::device_vector<mt> A(s);
    thrust::sequence(A.begin(), A.end());
    A[0] = 2; // expecting removal of the first pair
    thrust::copy_n(A.begin(), 6, std::ostream_iterator<mt>(std::cout, ","));
    std::cout << std::endl;
    // View the same device storage as n pairs instead of 2n scalars.
    auto D = thrust::device_pointer_cast<dt>(reinterpret_cast<dt *>(thrust::raw_pointer_cast(A.data())));
    // remove_if compacts the kept pairs to the front and returns the new
    // logical end; A itself is not resized by this call.
    thrust::remove_if(D, D+n, my_remove());
    thrust::copy_n(A.begin(), 6, std::ostream_iterator<mt>(std::cout, ","));
    std::cout << std::endl;
}
$ nvcc -o t2157 t2157.cu
$ compute-sanitizer ./t2157
========= COMPUTE-SANITIZER
2,1,2,3,4,5,
2,3,4,5,6,7,
========= ERROR SUMMARY: 0 errors
$

Related

Thrust: why always host code is executed in spite of __CUDA_ARCH__

I am trying to define two branches in my code: one for CUDA execution and one without it (with a future OMP backend in mind). But when I use the macro __CUDA_ARCH__, it looks as if the host code is always executed. I assumed that Thrust uses CUDA by default (and would therefore take the device-code branch). What's wrong with my code?
Here it is:
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <stdio.h>
// Unary functor that scales its argument by a constant fixed at construction.
// The device compilation pass additionally doubles the result, so the output
// reveals which pass produced the code that actually ran.
struct my_op
{
    my_op(int init_const) : constanta(init_const) {}
    __host__ __device__ int operator()(const int &x) const
    {
#ifndef __CUDA_ARCH__
        // Host compilation pass.
        return x * constanta;
#else
        // Device compilation pass.
        const int doubled = 2 * x;
        return doubled * constanta;
#endif
    }
private:
    int constanta;
};
// Applies my_op to the sequence 10..16 and prints the seven results.
int main()
{
    int data[7] = { 0, 0, 0, 0, 0, 0, 0 };
    thrust::counting_iterator<int> first(10);
    thrust::counting_iterator<int> last = first + 7;
    int init_value = 1;
    my_op op(init_value);
    // The output range is a plain host array.
    thrust::transform(first, last, data, op);
    // Standard loop instead of the compiler-specific `for each (... in ...)`
    // extension, so the program builds with any conforming compiler.
    for (int i = 0; i < 7; ++i)
        std::cout << data[i] << " ";
    std::cout << std::endl;
}
I expect that "transform" will define vector as multiplied by 2*constanta but I see that host code is used - the output is "10 11 12 13 14 15 16", not "20 22 24 26 28 30 32" (as expected).
Why?
Thrust is choosing the host path because one of your data items supplied to the thrust transform operation is in host memory:
thrust::transform(first, last, data, op);
^^^^
If you want a thrust algorithm to operate on the device, generally speaking all the container data you pass to/from must also reside in device memory.
Here's a modification to your code that demonstrates that thrust will follow the device path if we replace data with a device-resident container:
$ cat t13.cu
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <stdio.h>
// Unary functor that scales its argument by a constant fixed at construction.
// Code generated by the device compilation pass returns twice the host
// result, which makes the execution path visible in the output.
struct my_op
{
    my_op(int init_const) : constanta(init_const) {}
    __host__ __device__ int operator()(const int &x) const
    {
#ifdef __CUDA_ARCH__
        // Device path: selected when the functor runs on the GPU.
        const int twice = 2 * x;
        return twice * constanta;
#else
        // Host path: selected when the functor runs on the CPU.
        return x * constanta;
#endif
    }
private:
    int constanta;
};
// Same experiment as the question, but writing into a device_vector so the
// transform has a device-resident destination.
int main()
{
    const int count = 7;
    thrust::counting_iterator<int> first(10);
    thrust::counting_iterator<int> last = first + count;
    // Device-resident output container (replaces the host array `data`).
    thrust::device_vector<int> d_data(count);
    my_op op(1);
    thrust::transform(first, last, d_data.begin(), op);
    // Read each result back into a host int before printing it.
    for (int idx = 0; idx < count; ++idx) {
        int value = d_data[idx];
        std::cout << value << " ";
    }
    std::cout << std::endl;
}
$ nvcc -arch=sm_61 -o t13 t13.cu
$ ./t13
20 22 24 26 28 30 32
$
You may want to read the thrust quick start guide to learn about thrust algorithm dispatch.

CUDA Thrust copy transformed result only if it satisfies a predicate

I want to perform a transformation on a input thrust::device_vector and only copy the result to the output vector if the result satisfies a predicate. So the number of results could be less than the size of the input device_vector (similar to the output vector of thrust::copy_if). I have not found a way to do this with thrust::transform_if. Currently I can do this with thrust::transform and thrust::remove_if as shown in the example below:
#include <thrust/random.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/remove.h>
#include <iostream>
// Integer mixing function: scrambles a 32-bit value through six
// add/xor/shift rounds. Used here to derive an RNG seed from an index.
__host__ __device__ unsigned int hash(unsigned int a) {
    unsigned int v = a;
    v = (v + 0x7ed55d16) + (v << 12);
    v = (v ^ 0xc761c23c) ^ (v >> 19);
    v = (v + 0x165667b1) + (v << 5);
    v = (v + 0xd3a2646c) ^ (v << 9);
    v = (v + 0xfd7046c5) + (v << 3);
    v = (v ^ 0xb55a4f09) ^ (v >> 16);
    return v;
}
// Binary functor: adds a pseudo-random integer in [0, 11] to x. The random
// engine is seeded from hash(n), so the result depends only on (n, x).
struct add_random {
    __host__ __device__ add_random() {}
    __device__ int operator()(const int n, const int x) const {
        thrust::default_random_engine engine(hash(n));
        thrust::uniform_int_distribution<int> dist(0, 11);
        const int offset = dist(engine);
        return offset + x;
    }
};
// Predicate: true when the value is strictly greater than 6.
struct is_greater {
    __host__ __device__ bool operator()(const int x) {
        const int threshold = 6;
        return threshold < x;
    }
};
// Adds a random offset to each input value on the device, prints the result,
// then compacts away every value greater than 6 and prints what is left.
int main(void) {
    const int host_values[5] = {10, 2, 5, 3, 0};
    thrust::device_vector<int> d_x(host_values, host_values + 5);
    std::ostream_iterator<int> o(std::cout, " ");

    // In-place transform: element i becomes d_x[i] + random(i).
    thrust::transform(thrust::counting_iterator<int>(0),
                      thrust::counting_iterator<int>(5),
                      d_x.begin(),
                      d_x.begin(),
                      add_random());
    std::cout << "after adding random number:" << std::endl;
    thrust::copy(d_x.begin(), d_x.end(), o);
    std::cout << std::endl;

    // remove_if returns the end of the range of kept elements.
    thrust::device_vector<int>::iterator new_end =
        thrust::remove_if(d_x.begin(), d_x.end(), is_greater());
    std::cout << "after removing values greater than 6:" << std::endl;
    thrust::copy(d_x.begin(), new_end, o);
    std::cout << std::endl;
    return 0;
}
Which gives the output:
after adding random number:
18 4 8 7 11
after removing values greater than 6:
4
I would like to avoid copying the results to memory twice, first by thrust::transform and then by thrust::remove_if in the above example. Is it possible to get the above output with a single transformation function? How can I do this? My biggest concern is the computational cost, so any optimized solution, even if it doesn't use the Thrust library would be great.
Welcome to the world of thrust fancy iterators. You can get a quick overview of some fancy iterator types by looking at the thrust quick start guide. In particular, a thrust transform iterator can frequently be used to replace a thrust transform operation that is applied to the input of another thrust algorithm, "fusing" the two algorithms into a single operation.
Here's a worked example applied to your case:
$ cat t1254.cu
#include <thrust/random.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/remove.h>
#include <iostream>
// Integer mixing function: six add/xor/shift rounds over a 32-bit value.
// Its result is used to seed the per-element random engine.
__host__ __device__ unsigned int hash(unsigned int a) {
    unsigned int mixed = a;
    mixed = (mixed + 0x7ed55d16) + (mixed << 12);
    mixed = (mixed ^ 0xc761c23c) ^ (mixed >> 19);
    mixed = (mixed + 0x165667b1) + (mixed << 5);
    mixed = (mixed + 0xd3a2646c) ^ (mixed << 9);
    mixed = (mixed + 0xfd7046c5) + (mixed << 3);
    mixed = (mixed ^ 0xb55a4f09) ^ (mixed >> 16);
    return mixed;
}
// Unary functor over a (index, value) tuple: returns value plus a
// pseudo-random integer in [0, 11] drawn from an engine seeded with
// hash(index).
struct add_random : public thrust::unary_function<thrust::tuple<int, int>, int> {
    __host__ __device__ int operator()(thrust::tuple<int, int> t) const {
        const int index = thrust::get<0>(t);
        const int value = thrust::get<1>(t);
        thrust::default_random_engine engine(hash(index));
        thrust::uniform_int_distribution<int> dist(0, 11);
        const int offset = dist(engine);
        return offset + value;
    }
};
// Keep-predicate for thrust::copy_if: true for values that are NOT greater
// than 6, i.e. the values that remove_if(x > 6) would have kept.
// (The struct name is retained from the original remove_if version for
// compatibility with existing call sites.)
struct is_greater {
    __host__ __device__ bool operator()(const int x) {
        // Exact complement of "x > 6"; using "x < 6" would wrongly drop 6.
        return x <= 6;
    }
};
// Fuses the "add random offset" transform with the filtering step: a
// transform iterator over zip(counting index, d_x) feeds copy_if directly,
// so the transformed values are written only once, into d_r.
int main(void) {
    int input[5] = {10, 2, 5, 3, 0};
    thrust::device_vector<int> d_x(input, input + 5);
    thrust::device_vector<int> d_r(5);
    // copy_if returns the end of the written output range; subtracting the
    // output begin yields the number of elements copied.
    int kept = thrust::copy_if(
                   thrust::make_transform_iterator(
                       thrust::make_zip_iterator(
                           thrust::make_tuple(thrust::counting_iterator<int>(0), d_x.begin())),
                       add_random()),
                   thrust::make_transform_iterator(
                       thrust::make_zip_iterator(
                           thrust::make_tuple(thrust::counting_iterator<int>(5), d_x.end())),
                       add_random()),
                   d_r.begin(),
                   is_greater())
               - d_r.begin();
    std::cout << "after removing values greater than 6:" << std::endl;
    thrust::copy_n(d_r.begin(), kept, std::ostream_iterator<int>(std::cout, " "));
    std::cout << std::endl;
    return 0;
}
$ nvcc -o t1254 t1254.cu
$ ./t1254
after removing values greater than 6:
4
$
We've replaced your transform operation with a transform iterator applied to the same two inputs. Since you have two inputs to your transform operation, we're using a zip iterator to combine these, and the transform functor has also been reworked slightly to accept that tuple as its input.
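We've also converted your remove_if to a copy_if, so that it can take the transform iterator as its input. Because copy_if keeps the elements for which the predicate is true, whereas remove_if discards them, the logic of the predicate has to be inverted.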

How to use Thrust to implement reduce by key when keys are strings or char array

input:
BC
BD
BC
BC
BD
CD
output:
BC 3
BD 2
CD 1
If I use the char type as the key, this works. But it seems that Thrust does not support a string as a key.
#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/reduce.h>
#include <string>
// Run-length counts the characters of a string on the device: each run of
// equal adjacent chars is reduced to (char, run length).
int main(void)
{
    const std::string data = "aaabbbbbcddeeeeeeeeeff";
    const size_t N = data.size();
    thrust::device_vector<char> input(data.begin(), data.end());
    thrust::device_vector<char> output(N);
    thrust::device_vector<int> lengths(N);
    // Every key contributes a 1, so the reduced value is the run length.
    // `.first` is the end of the written key output.
    thrust::device_vector<char>::iterator keys_end =
        thrust::reduce_by_key(input.begin(), input.end(),
                              thrust::constant_iterator<int>(1),
                              output.begin(),
                              lengths.begin()).first;
    size_t num_runs = keys_end - output.begin();
    return 0;
}
How to implement it using Thrust?
With apologies to @AngryLettuce, here are 2 possible approaches:
Method 1:
create a struct to hold your keys. The struct will include one char item for each character in your key.
sort the keys to bring like keys together. It appears that what you want is really just a count of each key type, regardless of where it appears in the sequence. To facilitate this with reduce_by_key, it's necessary to first group like keys together. Otherwise, reduce_by_key will treat like keys that are separated by different intervening keys as distinct key sequences. It's evident from your desired input and output that this is not what you want.
Now use reduce_by_key on the sorted keys, to count like keys.
Step 2 requires (for this method) a functor to sort keys, and step 3 requires a functor to identify the meaning of "equal" keys, which reduce_by_key needs.
Method 2:
create two separate char device_vector(s), one to hold the first letter of each key, the other to hold the second letter of each key. We will then use zip_iterator throughout the remainder of the code to treat these two vectors as a unified "key" vector.
sort the zipped key vector. In this situation, thrust knows how to sort a zipped vector of basic types, and requires no separate sorting functor
perform reduce_by_key on the zipped (and sorted) key vector. This once again requires no separate equality functor. Thrust knows how to determine equality of zipped vectors of basic types.
This second method, in addition to not requiring any functor definitions, probably would also be faster, as zip_iterator tends to improve data access as compared to the AoS (array of structures) present in the first method.
Here's a worked example demonstrating both methods:
$ cat t1004.cu
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/iterator/constant_iterator.h>
#include <iostream>
#include <thrust/iterator/zip_iterator.h>
// Two-character key stored as a single struct (one char member per
// character of the key); used by "method 1".
struct key {
char k1; // first character of the key
char k2; // second character of the key
};
// Strict-weak-ordering comparator for `key`: orders lexicographically, first
// by k1 and then by k2. Returns true when the first key sorts before the
// second. Arguments are taken by const reference and the call operator is
// const, so the functor also accepts const or temporary keys.
struct sort_functor{
    __host__ __device__ bool operator()(const key &k1, const key &k2) const {
        // The first characters decide unless they tie.
        if (k1.k1 != k2.k1) return k1.k1 < k2.k1;
        return k1.k2 < k2.k2;
    }
};
// Equality predicate for `key`: two keys are equal when both of their
// characters match. Passed to reduce_by_key to delimit runs of equal keys.
struct equal_key{
    __host__ __device__ bool operator()(key k1, key k2){
        const bool same_first  = (k1.k1 == k2.k1);
        const bool same_second = (k1.k2 == k2.k2);
        return same_first && same_second;
    }
};
// Counts occurrences of each two-character key in two ways:
//   method 1 - array of `key` structs with explicit sort/equality functors;
//   method 2 - one char vector per key character, combined via zip_iterator.
int main(){
    key data[] = {{'B','C'},{'B','D'},{'B','C'},{'B','C'},{'B','D'},{'C','D'}};
    size_t dsize = sizeof(data)/sizeof(key);

    // ---- method 1 ----
    thrust::device_vector<key> keys(data, data+dsize);
    thrust::device_vector<key> keys_out(dsize);
    thrust::device_vector<int> lengths(dsize);
    // Sort first so that equal keys are adjacent.
    thrust::sort(keys.begin(), keys.end(), sort_functor());
    int rsize = thrust::reduce_by_key(keys.begin(), keys.end(),
                                      thrust::constant_iterator<int>(1),
                                      keys_out.begin(),
                                      lengths.begin(),
                                      equal_key()).first
                - keys_out.begin();
    std::cout << "Method1:" << std::endl;
    for (int run = 0; run < rsize; ++run){
        key unique_key = keys_out[run];
        int count = lengths[run];
        std::cout << " " << unique_key.k1 << unique_key.k2 << " " << count << std::endl;
    }

    // ---- method 2 ----
    // Split the key data into 2 separate vectors, one per key character.
    // There are more efficient ways to do this, but that is not the crux of
    // the question.
    thrust::device_vector<char> k1;
    thrust::device_vector<char> k2;
    for (size_t idx = 0; idx < dsize; ++idx){
        k1.push_back(data[idx].k1);
        k2.push_back(data[idx].k2);
    }
    thrust::sort(
        thrust::make_zip_iterator(thrust::make_tuple(k1.begin(), k2.begin())),
        thrust::make_zip_iterator(thrust::make_tuple(k1.end(), k2.end())));
    thrust::device_vector<char> k1r(dsize);
    thrust::device_vector<char> k2r(dsize);
    rsize = thrust::reduce_by_key(
                thrust::make_zip_iterator(thrust::make_tuple(k1.begin(), k2.begin())),
                thrust::make_zip_iterator(thrust::make_tuple(k1.end(), k2.end())),
                thrust::constant_iterator<int>(1),
                thrust::make_zip_iterator(thrust::make_tuple(k1r.begin(), k2r.begin())),
                lengths.begin()).first
            - thrust::make_zip_iterator(thrust::make_tuple(k1r.begin(),k2r.begin()));
    std::cout << "Method2:" << std::endl;
    for (int run = 0; run < rsize; ++run){
        char first_char = k1r[run];
        char second_char = k2r[run];
        int count = lengths[run];
        std::cout << " " << first_char << second_char << " " << count << std::endl;
    }
    return 0;
}
$ nvcc -o t1004 t1004.cu
$ ./t1004
Method1:
BC 3
BD 2
CD 1
Method2:
BC 3
BD 2
CD 1
$
Here's an improved version of method 2. You should be able to use a string/char array directly, and this version can also be fairly easily modified to accommodate a key length from 2 to 10 characters. This method uses a strided range iterator to pull the individual key characters directly from the data array:
$ cat t1004.cu
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/iterator/constant_iterator.h>
#include <iostream>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
// View over every `stride`-th element of [first, last). Built from a
// permutation iterator whose index sequence is 0, stride, 2*stride, ...
template <typename Iterator>
class strided_range
{
public:
    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    // Maps a logical position i to the underlying offset stride * i.
    struct stride_functor : public thrust::unary_function<difference_type,difference_type>
    {
        difference_type stride;

        stride_functor(difference_type step)
            : stride(step) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        {
            return stride * i;
        }
    };

    typedef typename thrust::counting_iterator<difference_type> CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_range(Iterator first, Iterator last, difference_type stride)
        : first(first), last(last), stride(stride) {}

    // Iterator to the first strided element.
    iterator begin(void) const
    {
        CountingIterator zero(0);
        TransformIterator offsets(zero, stride_functor(stride));
        return PermutationIterator(first, offsets);
    }

    // Iterator one past the last strided element; the element count is
    // ceil((last - first) / stride).
    iterator end(void) const
    {
        const difference_type length = last - first;
        const difference_type count = (length + (stride - 1)) / stride;
        return begin() + count;
    }

protected:
    Iterator first;
    Iterator last;
    difference_type stride;
};
typedef thrust::device_vector<char>::iterator cIterator;
// Counts occurrences of each KEYLEN-character key stored back to back in a
// char array, using one strided view per key character.
int main(){
    //method 2
    //get the key data into separate vectors, one per character in key.
#define KEYLEN 2
    const char data[] = "BCBDBCBCBDCD";
    // sizeof(data) includes the string literal's terminating '\0'; exclude it
    // so that only real key characters are copied to the device and counted.
    size_t dsize = sizeof(data)/sizeof(char) - 1;
    size_t numkeys = dsize/KEYLEN;
    thrust::device_vector<char> keys(data, data+dsize);
    // str_k[i] views character i of every key (stride KEYLEN, offset i).
    strided_range<cIterator> *str_k[KEYLEN];
    for (int i = 0; i < KEYLEN; i++)
        str_k[i] = new strided_range<cIterator>(keys.begin()+i, keys.end(), KEYLEN);
    //modify this line also if KEYLEN changes (max 10)
    auto my_z = thrust::make_zip_iterator(thrust::make_tuple((*str_k[0]).begin(), (*str_k[1]).begin()));
    // Sort the zipped keys so that equal keys become adjacent.
    thrust::sort(my_z, my_z+numkeys);
    thrust::device_vector<char> kr[KEYLEN];
    for (int i = 0; i < KEYLEN; i++)
        kr[i].resize(numkeys);
    //modify this line also if KEYLEN changes (max 10)
    auto my_zr = thrust::make_zip_iterator(thrust::make_tuple(kr[0].begin(), kr[1].begin()));
    thrust::device_vector<int> lengths(numkeys);
    size_t rsize = thrust::reduce_by_key(my_z, my_z + numkeys, thrust::constant_iterator<int>(1), my_zr, lengths.begin()).first - my_zr;
    std::cout << "Method2:" << std::endl;
    for (size_t i = 0; i < rsize; i++){
        std::cout << " ";
        for (int j = 0; j < KEYLEN; j++){
            char c = kr[j][i];
            std::cout << c; }
        int len = lengths[i];
        std::cout <<" " << len << std::endl;}
    // Release the heap-allocated strided views (previously leaked).
    for (int i = 0; i < KEYLEN; i++)
        delete str_k[i];
    return 0;
}
$ nvcc -std=c++11 t1004.cu -o t1004
$ ./t1004
Method2:
BC 3
BD 2
CD 1
$

cuda function application elementwise in cuda

After multiplying a matrix A and a vector x obtaining the result y, I want to apply a function h elementwise to y.
I want to obtain z = h(Ax), where h is applied elementwise to the vector Ax.
I know how to make the matrix/vector multiplication on the GPU (with cublas). Now I want h (which is my own function, coded in C++) to be applied to the resultant vector also in GPU, how can I do that?
Two possible approaches are:
Write your own CUDA kernel to perform the operation
Use thrust (e.g. thrust::for_each() ).
Here is a worked example of both approaches:
$ cat t934.cu
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/for_each.h>
#define DSIZE 4
#define nTPB 256
// Element-wise function applied to each vector entry: returns the input
// plus 5. Replace the body with your own function.
template <typename T>
__host__ __device__ T myfunc(T &d){
    T result = d + 5;
    return result;
}
// Functor for thrust::for_each: overwrites each element in place with
// myfunc(element).
struct mytfunc
{
    template <typename T>
    __host__ __device__
    void operator()(T &d){
        const T updated = myfunc(d);
        d = updated;
    }
};
// Applies myfunc in place to each of the dsize elements of dvec.
// Expects a 1D launch with at least dsize threads in total (one element per
// thread); surplus threads do nothing. Uses no shared memory.
template <typename T>
__global__ void mykernel(T *dvec, size_t dsize){
    // Compute the global index in size_t: this avoids 32-bit overflow for
    // large grids and the signed/unsigned comparison against dsize.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < dsize) dvec[idx] = myfunc(dvec[idx]);
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t err, const char *what){
    if (err == cudaSuccess) return true;
    std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
    return false;
}
// Applies myfunc element-wise to a small vector twice: first with a
// hand-written kernel, then with thrust::for_each, printing after each step.
int main(){
    // first using kernel
    float *h_data = new float[DSIZE];
    float *d_data = 0;
    for (int i = 0; i < DSIZE; i++) h_data[i] = i;
    bool ok = cuda_ok(cudaMalloc(&d_data, DSIZE*sizeof(float)), "cudaMalloc");
    ok = ok && cuda_ok(cudaMemcpy(d_data, h_data, DSIZE*sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy (host to device)");
    if (ok) {
        mykernel<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(d_data, DSIZE);
        // Kernel launches return no status; query it explicitly.
        ok = cuda_ok(cudaGetLastError(), "mykernel launch");
    }
    // This blocking copy also surfaces errors raised while the kernel ran.
    ok = ok && cuda_ok(cudaMemcpy(h_data, d_data, DSIZE*sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");
    // The raw device buffer is not used again; cudaFree on a null pointer is a no-op.
    cudaFree(d_data);
    if (!ok) {
        delete[] h_data;
        return 1;
    }
    for (int i = 0; i < DSIZE; i++) std::cout << h_data[i] << ",";
    std::cout << std::endl;
    // then using thrust
    thrust::host_vector<float> hvec(h_data, h_data+DSIZE);
    thrust::device_vector<float> dvec = hvec;
    thrust::for_each(dvec.begin(), dvec.end(), mytfunc());
    thrust::copy_n(dvec.begin(), DSIZE, std::ostream_iterator<float>(std::cout, ","));
    std::cout << std::endl;
    delete[] h_data;
    return 0;
}
$ nvcc -o t934 t934.cu
$ ./t934
5,6,7,8,
10,11,12,13,
$
Note that in order to provide a complete example, I'm starting with a vector definition in host memory. If you already have the vector in device memory (perhaps as a result of computing y=Ax) then you can work directly on that, by passing that vector to the CUDA kernel, or using it directly in the thrust function, using a thrust::device_ptr wrapper (this method is covered in the thrust quick start guide previously linked.)
The assumption I've made here is that you want to use an arbitrary function of one variable. This should handle pretty much arbitrary functions defined in myfunc. However, for some categories of functions that you may be interested in, you may be able to realize it with one or more CUBLAS calls as well.

Thrust Gathering/Filtering

What I am trying to do is create a filter on a vector so it removes elements that do not pass a predicate test; but not too sure how I go about it.
I evaluate each element of my input vector against the predicate (for example, the is_even functor in my code) and store the result in a device_vector. An entry is true if the element passes the test and false if it does not.
Now I am stuck because I now have this bool vector and I want to gather the elements that passed this predicate test. I store it in a bool vector because I want to keep the result to filter other vectors.
#include ...
// Unary predicate: true when x is evenly divisible by 2.
template<typename T>
struct is_even : thrust::unary_function<T, bool>
{
    __host__ __device__
    bool operator()(const T &x)
    {
        const bool no_remainder = ((x % 2) == 0);
        return no_remainder;
    }
};
// Builds two columns on the device and a boolean filter marking which
// entries of col1 are even. The actual filtering step is still missing.
int main(void)
{
    std::cout << "Loading test!" << std::endl;
    const int N = 1000000;

    // col1 holds 0, 1, 2, ...; col2 is filled with ones.
    thrust::device_vector<int> col1(N);
    thrust::sequence(col1.begin(), col1.end());
    thrust::device_vector<float> col2(N, 1);

    // filter[i] is true when col1[i] passes the predicate.
    thrust::device_vector<bool> filter(N);
    thrust::transform(col1.begin(), col1.end(), filter.begin(), is_even<int>());

    // filter col1 and col2 based on filter
    return 0;
}
Within the stream compaction group you may be interested in thrust::copy_if
We can select the even elements into a new vector directly using your defined predicate without making an intermediate filter vector:
thrust::copy_if(col1.begin(), col1.end(), result.begin(), is_even<int>());
(result should be a vector of identical type to col1, and already defined to be a length equal to or greater than col1, since it's unknown how many elements will pass the predicate test.)
If you want to work off of the filter vector you have created, use the stencil version of copy_if instead.
Here's a worked example using the stencil method based on your comments:
$ cat t267.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
// Unary predicate: true when x leaves no remainder on division by 2.
template<typename T>
struct is_even : thrust::unary_function<T, bool>
{
    __host__ __device__
    bool operator()(const T &x)
    {
        const bool divisible_by_two = ((x % 2) == 0);
        return divisible_by_two;
    }
};
// Identity predicate on bool: lets a precomputed bool stencil drive the
// stencil form of copy_if.
struct is_true : thrust::unary_function<bool, bool>
{
    __host__ __device__
    bool operator()(const bool &x)
    {
        const bool flag = x;
        return flag;
    }
};
// Builds a bool filter marking the even entries of col1, uses it as the
// stencil of copy_if to gather those entries, and prints the first ten.
int main(void)
{
    std::cout << "Loading test!" << std::endl;
    const int N = 1000000;

    // col1 holds 0, 1, 2, ...; col2 is filled with ones.
    thrust::device_vector<int> col1(N);
    thrust::sequence(col1.begin(), col1.end());
    thrust::device_vector<float> col2(N, 1);

    // filter[i] is true when col1[i] is even.
    thrust::device_vector<bool> filter(N);
    thrust::transform(col1.begin(), col1.end(), filter.begin(), is_even<int>());

    // filter col1 based on filter: result must be able to hold up to N
    // elements because the number of matches is not known in advance.
    thrust::device_vector<int> result(N);
    thrust::device_vector<int>::iterator end =
        thrust::copy_if(col1.begin(), col1.end(), filter.begin(), result.begin(), is_true());
    int len = end - result.begin();

    // Bring the selected elements to the host and print the first ten.
    thrust::host_vector<int> h_result(len);
    thrust::copy_n(result.begin(), len, h_result.begin());
    thrust::copy_n(h_result.begin(), 10, std::ostream_iterator<int>(std::cout, "\n"));
    return 0;
}
$ nvcc -arch=sm_20 -o t267 t267.cu
$ ./t267
Loading test!
0
2
4
6
8
10
12
14
16
18
$