Finding the first index of every distinct value in CUDA array - cuda

Assume we have an array like this:
0, 0, 0, 1, 2, 2, 2, 3, 3, 4, ...
I would like to have the index of every first occurrence of every value, so in this example [0, 3, 4, 7, 9]. The array is sorted and all possible values are known and consecutive.
One possible solution is to launch a kernel with one thread per element of this array and use atomicMin to record the lowest index seen for each value. But I assume a better approach is possible.

You can do this with a single call to thrust::unique_by_key() if you provide a vector of indices e.g. via thrust::sequence(). Here's a worked example:
$ cat t3.cu
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/unique.h>
#include <thrust/sequence.h>
#include <iostream>
// Prints the index of the first occurrence of every distinct value in a
// sorted key array, as a comma-separated list.
int main()
{
    const int h_keys[] = {0, 0, 0, 1, 2, 2, 2, 3, 3, 4};
    const int num_keys = sizeof(h_keys) / sizeof(h_keys[0]);

    thrust::device_vector<int> d_keys(h_keys, h_keys + num_keys);

    // d_result starts as 0, 1, 2, ... so every key is paired with its own position.
    thrust::device_vector<int> d_result(num_keys);
    thrust::sequence(d_result.begin(), d_result.end());

    // unique_by_key compacts both ranges in place; .first of the returned pair
    // is the new end of the key range, which gives the number of distinct keys.
    thrust::device_vector<int>::iterator keys_end =
        thrust::unique_by_key(d_keys.begin(), d_keys.end(), d_result.begin()).first;
    const int num_unique = keys_end - d_keys.begin();

    std::ostream_iterator<int> out(std::cout, ",");
    thrust::copy_n(d_result.begin(), num_unique, out);
    std::cout << std::endl;
}
$ nvcc -arch=sm_35 -o t3 t3.cu
$ ./t3
0,3,4,7,9,
$
The important activity occurring here is stream compaction and thrust provides a nice set of routines for various use-cases. For example this operation could also be done with thrust::unique_copy() and in that case, with some additional code complexity, you could eliminate the need for the thrust::sequence() call (it would be replaced by a thrust::counting_iterator zipped together with your data, and an appropriate selection functor), but it still requires an output vector of the same length.

As @tera pointed out, you can compare each number with the previous one to determine whether it is the first occurrence in a run of equal values. You can write a kernel that generates a mask for this criterion: the mask array holds the element's own index where the element is a first occurrence, and a negative number (such as -1, which cannot be an index) otherwise. After that, use thrust to count the values that are not -1 by using a predicate. Then copy those values out of the mask using the same predicate. Finally, copy the results back to the host.
Here is a sample implementation of the above mentioned approach.
#include <iostream>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/count.h>
#include <thrust/copy.h>
using namespace std;
// Builds a "first occurrence" mask, one thread per element.
//
//   input : array of `count` values to scan
//   is    : output array of `count` entries; is[i] receives i when input[i]
//           differs from input[i-1] (or when i == 0), and -1 otherwise
//   count : number of elements in both arrays
//
// Expects a 1D launch with at least `count` threads in total; surplus
// threads exit without touching memory.
__global__ void is_first_occurence(int* input, int* is, int count)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= count)
    {
        return;
    }

    // Element 0 has no predecessor, so it always starts a run.
    const bool starts_run = (idx == 0) || (input[idx] != input[idx - 1]);
    is[idx] = starts_run ? idx : -1;
}
// Predicate over mask entries: true for every value except the -1 sentinel.
struct isFirst
{
    __host__ __device__ bool operator()(const int mask_value)
    {
        const bool is_sentinel = (mask_value == -1);
        return !is_sentinel;
    }
};
// Host driver: marks the first element of each run with is_first_occurence,
// compacts the marked indices with thrust::copy_if and prints them one per line.
// Returns 0 on success, 1 if the kernel launch or execution reports an error.
int main(int argc, char** argv)
{
    std::vector<int> arr = { 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 4 ,4 };
    // Take the element count from the data so the two can never disagree.
    const int count = static_cast<int>(arr.size());

    thrust::device_vector<int> arr_d = arr;
    thrust::device_vector<int> mask_d(arr_d.size());
    // Separate output buffer: the input array is left intact.
    thrust::device_vector<int> indices_d(arr_d.size());

    int* pArr = thrust::raw_pointer_cast(arr_d.data());
    int* pMask = thrust::raw_pointer_cast(mask_d.data());

    // A grid of zero blocks is an invalid launch configuration, so skip the
    // launch entirely for empty input.
    if (count > 0)
    {
        dim3 block(16);
        dim3 grid((count + block.x - 1) / block.x);
        is_first_occurence<<<grid, block>>>(pArr, pMask, count);

        // Launch-configuration errors show up in cudaGetLastError();
        // faults during execution show up at the synchronizing call.
        cudaError_t err = cudaGetLastError();
        if (err == cudaSuccess)
        {
            err = cudaDeviceSynchronize();
        }
        if (err != cudaSuccess)
        {
            std::cerr << "is_first_occurence failed: " << cudaGetErrorString(err) << std::endl;
            return 1;
        }
    }

    // copy_if returns the end of the compacted output, so the number of
    // first occurrences falls out without a separate count_if pass.
    thrust::device_vector<int>::iterator indices_end =
        thrust::copy_if(mask_d.begin(), mask_d.end(), indices_d.begin(), isFirst());
    const int num_unique = static_cast<int>(indices_end - indices_d.begin());

    std::vector<int> unique_indices(num_unique);
    thrust::copy(indices_d.begin(), indices_end, unique_indices.begin());
    for (auto i : unique_indices)
    {
        std::cout << i << std::endl;
    }
    return 0;
}
Compiled and tested using the following command:
nvcc -o get_unique get_unique.cu -std=c++11 -arch=sm_61

Related

How to use thrust::remove_if to check and remove blocks 2×i and 2×i+1 together

I am using Cuda C++, and I have a big array Arr including 64-bit unsigned integers in a form like the below:
Arr = {a1, b1, a2, b2, ..., an, bn}
The number of items in Arr is 2n which is an even number. Now, given a boolean function f(int a, int b), I wonder if I can use thrust::remove_if to check f(a1,b1), f(a2, b2), ..., f(an, bn) and remove both consecutive numbers (ai, bi) together if needed?
Rather than zip_iterator, I think a simpler approach is just to reinterpret the array of 64-bit integers as an array of thrust::pair. here is an example:
$ cat t2157.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/remove.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
// Removal predicate for two-member elements (anything thrust::get<0>/<1>
// works on). Returns true when member 0 is strictly greater than member 1.
struct my_remove
{
    template <typename Pair>
    __host__ __device__
    bool operator()(Pair p){
        const auto& lhs = thrust::get<0>(p);
        const auto& rhs = thrust::get<1>(p);
        return lhs > rhs;
    }
};
const size_t n = 32768;   // number of (a, b) pairs
const size_t s = 2*n;     // number of 64-bit elements in the flat array
using mt=unsigned long long;
using dt=thrust::pair<mt,mt>;
// The flat array is viewed as an array of pairs below; that is only valid if
// a pair occupies exactly two consecutive elements.
static_assert(sizeof(dt) == 2 * sizeof(mt), "thrust::pair<mt,mt> must be exactly two mt wide");

// Fills a flat array {a1, b1, a2, b2, ...}, removes every pair for which
// my_remove returns true, and prints the first six elements before and after.
int main(){
    thrust::device_vector<mt> A(s);
    thrust::sequence(A.begin(), A.end());
    A[0] = 2; // expecting removal of the first pair
    thrust::copy_n(A.begin(), 6, std::ostream_iterator<mt>(std::cout, ","));
    std::cout << std::endl;

    // View the same storage as n pairs so remove_if moves a_i and b_i together.
    auto D = thrust::device_pointer_cast<dt>(reinterpret_cast<dt *>(thrust::raw_pointer_cast(A.data())));

    // remove_if only compacts the survivors to the front and returns their new
    // end; the vector keeps its old size unless it is shrunk explicitly.
    auto D_end = thrust::remove_if(D, D+n, my_remove());
    A.resize(2 * static_cast<size_t>(D_end - D));

    thrust::copy_n(A.begin(), 6, std::ostream_iterator<mt>(std::cout, ","));
    std::cout << std::endl;
}
$ nvcc -o t2157 t2157.cu
$ compute-sanitizer ./t2157
========= COMPUTE-SANITIZER
2,1,2,3,4,5,
2,3,4,5,6,7,
========= ERROR SUMMARY: 0 errors
$

Thrust: why always host code is executed in spite of __CUDA_ARCH__

I try to define two branches in code: one for CUDA execution and the other - without it (with future OMP in mind). But when I use macro __CUDA_ARCH__ it looks as if always the host code is executed. But I supposed that Thrust by default use CUDA (and branch for device code). What's wrong with my code?
Here it is:
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <stdio.h>
// Unary functor that multiplies its argument by a constant fixed at
// construction. The result is additionally doubled in code produced by the
// device compilation pass (where __CUDA_ARCH__ is defined), which makes it
// observable whether the host or the device version was executed.
struct my_op
{
    my_op(int init_const) : constanta(init_const) {}

    __host__ __device__ int operator()(const int &x) const
    {
#ifdef __CUDA_ARCH__
        const int factor = 2; // device code path
#else
        const int factor = 1; // host code path
#endif
        return factor * x * constanta;
    }

private:
    int constanta;
};
// Applies my_op to the values 10..16 and prints the results.
// The destination is a plain host array, so this call runs my_op's host branch.
int main()
{
    int data[7] = { 0, 0, 0, 0, 0, 0, 0 };
    const int num_items = sizeof(data) / sizeof(data[0]);

    thrust::counting_iterator<int> first(10);
    thrust::counting_iterator<int> last = first + num_items;

    int init_value = 1;
    my_op op(init_value);
    thrust::transform(first, last, data, op);

    // Standard loop instead of the MSVC-only `for each (... in ...)` extension,
    // so the program also builds with other host compilers.
    for (int i = 0; i < num_items; ++i)
        std::cout << data[i] << " ";
    std::cout << std::endl;
}
I expect that "transform" will define vector as multiplied by 2*constanta but I see that host code is used - the output is "10 11 12 13 14 15 16", not "20 22 24 26 28 30 32" (as expected).
Why?
Thrust is choosing the host path because one of your data items supplied to the thrust transform operation is in host memory:
thrust::transform(first, last, data, op);
^^^^
If you want a thrust algorithm to operate on the device, generally speaking all the container data you pass to/from must also reside in device memory.
Here's a modification to your code that demonstrates that thrust will follow the device path if we replace data with a device-resident container:
$ cat t13.cu
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <stdio.h>
// Unary functor that multiplies its argument by a constant fixed at
// construction; the device build of operator() additionally doubles the
// result, so the output reveals which path was executed.
struct my_op
{
    my_op(int init_const) : constanta(init_const) {}

    __host__ __device__ int operator()(const int &x) const
    {
#ifdef __CUDA_ARCH__
        // Device compilation pass.
        const int factor = 2;
#else
        // Host compilation pass.
        const int factor = 1;
#endif
        return factor * x * constanta;
    }

private:
    int constanta;
};
// Same experiment as the host-array version, but the output range is a
// device_vector instead of a host array.
int main()
{
    // Input is generated on the fly: the seven values 10, 11, ..., 16.
    thrust::counting_iterator<int> first(10);
    thrust::counting_iterator<int> last = first + 7;

    thrust::device_vector<int> d_data(7);

    int init_value = 1;
    my_op op(init_value);
    thrust::transform(first, last, d_data.begin(), op);

    // Each dereference reads one element back from the device_vector.
    for (thrust::device_vector<int>::iterator it = d_data.begin(); it != d_data.end(); ++it) {
        int dat = *it;
        std::cout << dat << " ";
    }
    std::cout << std::endl;
}
$ nvcc -arch=sm_61 -o t13 t13.cu
$ ./t13
20 22 24 26 28 30 32
$
You may want to read the thrust quick start guide to learn about thrust algorithm dispatch.

CUDA Thrust copy transformed result only if it satisfies a predicate

I want to perform a transformation on a input thrust::device_vector and only copy the result to the output vector if the result satisfies a predicate. So the number of results could be less than the size of the input device_vector (similar to the output vector of thrust::copy_if). I have not found a way to do this with thrust::transform_if. Currently I can do this with thrust::transform and thrust::remove_if as shown in the example below:
#include <thrust/random.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/remove.h>
#include <iostream>
// Integer hash: scrambles `a` through six rounds, each combining the current
// value (plus or xor a fixed constant) with a shifted copy of itself.
// Callable from both host and device code. All arithmetic is on unsigned int,
// so overflow wraps instead of being undefined.
// In this file add_random passes the result to thrust::default_random_engine
// as its seed.
__host__ __device__ unsigned int hash(unsigned int a) {
a = (a+0x7ed55d16) + (a<<12);
a = (a^0xc761c23c) ^ (a>>19);
a = (a+0x165667b1) + (a<<5);
a = (a+0xd3a2646c) ^ (a<<9);
a = (a+0xfd7046c5) + (a<<3);
a = (a^0xb55a4f09) ^ (a>>16);
return a;
// The ';' after the closing brace is an empty declaration, not part of the function.
};
// Binary functor: adds a pseudo-random integer drawn from [0, 11] to x.
// The generator is seeded with hash(n), so the same n always yields the
// same offset. operator() is device-only.
struct add_random {
    __host__ __device__ add_random() {}

    __device__ int operator()(const int n, const int x) const {
        thrust::default_random_engine engine(hash(n));
        thrust::uniform_int_distribution<int> dist(0, 11);
        const int offset = dist(engine);
        return offset + x;
    }
};
// Predicate: true when the value is strictly greater than 6.
struct is_greater {
    __host__ __device__ bool operator()(const int x) {
        const int threshold = 6;
        return threshold < x;
    }
};
// Two-pass version: first transform d_x in place by adding a pseudo-random
// offset to each element, then compact away every value greater than 6.
int main(void) {
    const int num_items = 5;
    int x[num_items] = {10, 2, 5, 3, 0};
    thrust::device_vector<int> d_x(x, x + num_items);

    // The counting iterator supplies the per-element index used as RNG seed input.
    thrust::counting_iterator<int> index_first(0);
    thrust::counting_iterator<int> index_last(num_items);
    thrust::transform(index_first, index_last, d_x.begin(), d_x.begin(), add_random());

    std::ostream_iterator<int> o(std::cout, " ");

    std::cout << "after adding random number:" << std::endl;
    thrust::copy(d_x.begin(), d_x.end(), o);
    std::cout << std::endl;

    // remove_if returns the end of the surviving elements; d_x keeps its size.
    thrust::device_vector<int>::iterator new_end =
        thrust::remove_if(d_x.begin(), d_x.end(), is_greater());

    std::cout << "after removing values greater than 6:" << std::endl;
    thrust::copy(d_x.begin(), new_end, o);
    std::cout << std::endl;
    return 0;
}
Which gives the output:
after adding random number:
18 4 8 7 11
after removing values greater than 6:
4
I would like to avoid copying the results to memory twice, first by thrust::transform and then by thrust::remove_if in the above example. Is it possible to get the above output with a single transformation function? How can I do this? My biggest concern is the computational cost, so any optimized solution, even if it doesn't use the Thrust library would be great.
Welcome to the world of thrust fancy iterators. You can get a quick overview of some fancy iterator types by looking at the thrust quick start guide. In particular, a thrust transform iterator can frequently be used to replace a thrust transform operation that is applied to the input of another thrust algorithm, "fusing" the two algorithms into a single operation.
Here's a worked example applied to your case:
$ cat t1254.cu
#include <thrust/random.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/remove.h>
#include <iostream>
// Integer hash: scrambles `a` through six rounds, each combining the current
// value (plus or xor a fixed constant) with a shifted copy of itself.
// Callable from both host and device code. All arithmetic is on unsigned int,
// so overflow wraps instead of being undefined.
// In this file add_random passes the result to thrust::default_random_engine
// as its seed.
__host__ __device__ unsigned int hash(unsigned int a) {
a = (a+0x7ed55d16) + (a<<12);
a = (a^0xc761c23c) ^ (a>>19);
a = (a+0x165667b1) + (a<<5);
a = (a+0xd3a2646c) ^ (a<<9);
a = (a+0xfd7046c5) + (a<<3);
a = (a^0xb55a4f09) ^ (a>>16);
return a;
// The ';' after the closing brace is an empty declaration, not part of the function.
};
// Unary functor over (n, x) tuples: returns x plus a pseudo-random integer
// drawn from [0, 11], with the generator seeded by hash(n).
struct add_random : public thrust::unary_function<thrust::tuple<int, int>, int> {
    __host__ __device__ int operator()(thrust::tuple<int, int> t) const {
        // Element 0 of the tuple is the seed input, element 1 the value to offset.
        thrust::default_random_engine engine(hash(thrust::get<0>(t)));
        thrust::uniform_int_distribution<int> dist(0, 11);
        const int offset = dist(engine);
        return offset + thrust::get<1>(t);
    }
};
// Selection predicate for thrust::copy_if: returns true for the values to KEEP,
// i.e. those that are NOT greater than 6. (The name is retained from the
// remove_if version of this code, where the predicate selected values to drop.)
// The comparison is inclusive so that the value 6 itself is kept, exactly as
// remove_if with `x > 6` would keep it.
struct is_greater {
    __host__ __device__ bool operator()(const int x) const {
        return x <= 6;
    }
};
// Fused version: the add_random transform is applied lazily through a
// transform iterator while copy_if writes only the selected results to d_r.
int main(void) {
    int x[5] = {10, 2, 5, 3, 0};
    thrust::device_vector<int> d_x(x, x+5);
    thrust::device_vector<int> d_r(5);

    // Pair every element of d_x with its index (0..4).
    auto zipped_first = thrust::make_zip_iterator(
        thrust::make_tuple(thrust::counting_iterator<int>(0), d_x.begin()));
    auto zipped_last = thrust::make_zip_iterator(
        thrust::make_tuple(thrust::counting_iterator<int>(5), d_x.end()));

    // Dereferencing these applies add_random to the (index, value) tuple.
    auto transformed_first = thrust::make_transform_iterator(zipped_first, add_random());
    auto transformed_last = thrust::make_transform_iterator(zipped_last, add_random());

    // copy_if returns the end of the written output; its distance from
    // d_r.begin() is the number of results that passed the predicate.
    int rsize = thrust::copy_if(transformed_first, transformed_last, d_r.begin(), is_greater()) - d_r.begin();

    std::cout << "after removing values greater than 6:" << std::endl;
    thrust::copy_n(d_r.begin(), rsize, std::ostream_iterator<int>(std::cout, " "));
    std::cout << std::endl;
    return 0;
}
$ nvcc -o t1254 t1254.cu
$ ./t1254
after removing values greater than 6:
4
$
We've replaced your transform operation with a transform iterator applied to the same two inputs. Since you have two inputs to your transform operation, we're using a zip iterator to combine these, and the transform functor has also been reworked slightly to accept that tuple as its input.
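We've also converted your remove_if to a copy_if, so that it works with the transform iterator as input. This requires a change in the logic of the predicate: remove_if discards the elements for which the predicate returns true, whereas copy_if keeps them.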

CUDA Thrust: reduce_by_key on only some values in an array, based off values in a "key" array

Let's say I have two device_vector<byte> arrays, d_keys and d_data.
If d_data is, for example, a flattened 2D 3x5 array ( e.g. { 1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 6, 5, 4, 3 } ) and d_keys is a 1D array of size 5 ( e.g. { 1, 0, 0, 1, 1 } ), how can I do a reduction such that I'd end up only adding values on a per-row basis if the corresponding d_keys value is one ( e.g. ending up with a result of { 10, 23, 14 } )?
The sum_rows.cu example allows me to add every value in d_data, but that's not quite right.
Alternatively, I can, on a per-row basis, use a zip_iterator and combine d_keys with one row of d_data at a time, and do a transform_reduce, adding only if the key value is one, but then I'd have to loop through the d_data array.
What I really need is some sort of transform_reduce_by_key functionality that isn't built-in, but surely there must be a way to make it!
Based on the additional comment that instead of 3 rows there are thousands of rows, we can write a transform functor that sums an entire row. Based on the fact that there are thousands of rows, this should keep the machine pretty busy:
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/fill.h>
#define ROW 20
#define COL 10

// Device-side pointers to the flattened value array and the key row.
// They are plain global symbols, so the host has to set them before any
// kernel that uses test_functor runs.
__device__ int *vals;
__device__ int *keys;

// Computes the key-weighted sum of one row of `vals`.
//   a : number of elements per row (fixed at construction)
//   x : not read
//   y : index of the row to sum
// Element i of row y is read from vals[i + y*a] and multiplied by keys[i].
struct test_functor
{
    const int a;

    test_functor(int _a) : a(_a) {}

    __device__
    int operator()(int& x, int& y ) {
        int row_sum = 0;
        int col = 0;
        while (col < a) {
            row_sum += vals[col + (y*a)] * keys[col];
            ++col;
        }
        return row_sum;
    }
};
// Builds a ROW x COL matrix (values 0, 1, 2, ...) and a key row of ones with
// keys[0] cleared, computes one key-weighted sum per row on the device with
// test_functor, and prints the ROW results.
// Returns 0 on success, 1 if the device symbols could not be initialised.
int main(){
    thrust::host_vector<int> h_vals(ROW*COL);
    thrust::host_vector<int> h_keys(COL);
    thrust::sequence(h_vals.begin(), h_vals.end());
    thrust::fill(h_keys.begin(), h_keys.end(), 1);
    h_keys[0] = 0;

    thrust::device_vector<int> d_vals = h_vals;
    thrust::device_vector<int> d_keys = h_keys;
    thrust::device_vector<int> d_sums(ROW);
    thrust::fill(d_sums.begin(), d_sums.end(), 0);

    // Publish the raw device addresses through the __device__ symbols that
    // test_functor reads.
    int *s_vals = thrust::raw_pointer_cast(d_vals.data());
    int *s_keys = thrust::raw_pointer_cast(d_keys.data());
    cudaError_t err = cudaMemcpyToSymbol(vals, &s_vals, sizeof(int *));
    if (err == cudaSuccess) {
        err = cudaMemcpyToSymbol(keys, &s_keys, sizeof(int *));
    }
    if (err != cudaSuccess) {
        // Without this check a failed copy would leave the symbols unset and
        // the functor would dereference invalid pointers on the device.
        std::cerr << "cudaMemcpyToSymbol failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    // d_idx holds the row numbers 0..ROW-1; the functor receives one per output.
    thrust::device_vector<int> d_idx(ROW);
    thrust::sequence(d_idx.begin(), d_idx.end());
    thrust::transform(d_sums.begin(), d_sums.end(), d_idx.begin(), d_sums.begin(), test_functor(COL));

    thrust::host_vector<int> h_sums = d_sums;
    std::cout << "Results :" << std::endl;
    for (unsigned i = 0; i<ROW; i++)
        std::cout<<"h_sums["<<i<<"] = " << h_sums[i] << std::endl;
    return 0;
}
This approach has the drawback that in general accesses to the vals array will not be coalesced. However for a few thousand rows the cache may offer significant relief. We can fix this problem by re-ordering the data to be stored in column-major form in the flattened array, and change our indexing method in the loop in the functor to be like this:
for (int i=0; i<a; i++)
temp += vals[(i*ROW)+y]*keys[i];
If preferred, you can pass ROW as an additional parameter to the functor.
Here is some sample code that does something like what you are after, using the approach I outlined in my comment below your question. In fact we want to use 4-tuples, to pick up your key value. Reproducing the suitably modified comment here:
You could make a zip iterator that zips your 3 rows together plus the key "row" and passes a 4-tuple to a special functor. Your special functor would then do a reduction on the array of 3-tuples (using the key also) and return a result that is a 4-tuple. The thrust dot product example may give you some ideas.
This is one possible approach:
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <thrust/fill.h>
#include <thrust/transform_reduce.h>
#include <thrust/tuple.h>
#define N 30 // total number of values; must be evenly divisible by 3 for this example

// One "column" of the problem: three row values plus the key for that column.
using tpl4int = thrust::tuple<int, int, int, int>;
// Iterator types used to walk the three rows and the key row in lock-step.
using intiter = thrust::host_vector<int>::iterator;
using tpl4intiter = thrust::tuple<intiter, intiter, intiter, intiter>;
using int4zip = thrust::zip_iterator<tpl4intiter>;
// Applies the key to one column: multiplies each of the three row values
// (elements 0..2) by the key (element 3) and sets element 3 of the result
// to 1, marking the key as already applied.
// Elements are accessed with the free function thrust::get<N>() rather than
// a member get<N>(), which is not available on every Thrust tuple
// implementation.
struct r3key_unary_op : public thrust::unary_function<tpl4int, tpl4int>
{
    __host__ __device__
    tpl4int operator()(const tpl4int& x) const
    {
        const int key = thrust::get<3>(x);
        return thrust::make_tuple(thrust::get<0>(x) * key,
                                  thrust::get<1>(x) * key,
                                  thrust::get<2>(x) * key,
                                  1);
    }
};
// Combines two columns: for each of the three row slots, adds the two values
// after weighting each by its own tuple's element 3. Element 3 of the result
// is set to 1.
// Elements are accessed with the free function thrust::get<N>() rather than
// a member get<N>(), which is not available on every Thrust tuple
// implementation.
struct r3key_binary_op : public thrust::binary_function<tpl4int, tpl4int, tpl4int>
{
    __host__ __device__
    tpl4int operator()(const tpl4int& x, const tpl4int& y) const
    {
        const int wx = thrust::get<3>(x);
        const int wy = thrust::get<3>(y);
        return thrust::make_tuple(thrust::get<0>(x) * wx + thrust::get<0>(y) * wy,
                                  thrust::get<1>(x) * wx + thrust::get<1>(y) * wy,
                                  thrust::get<2>(x) * wx + thrust::get<2>(y) * wy,
                                  1);
    }
};
// Sums three "rows" stored back to back in A, counting only the columns whose
// key in K is 1, and prints one total per row.
int main() {
    thrust::host_vector<int> A(N); // values, in 3 "rows" flattened
    thrust::sequence(A.begin(), A.end());
    thrust::host_vector<int> K(N/3); // keys in one row
    thrust::fill(K.begin(), K.end(), 1); // set some keys to 1
    K[9] = 0; // set some keys to zero

    // Each zip element is (row0[i], row1[i], row2[i], key[i]).
    int4zip first = thrust::make_zip_iterator(thrust::make_tuple(A.begin(), A.begin() + N/3, A.begin() + 2*N/3, K.begin()));
    int4zip last = thrust::make_zip_iterator(thrust::make_tuple(A.begin() + N/3, A.begin() + 2*N/3, A.end(), K.end()));

    r3key_unary_op my_unary_op;
    r3key_binary_op my_binary_op;

    // Neutral starting value: zeros in the three data slots. The reduction
    // already visits *first, so seeding init with my_unary_op(*first) would
    // count the first column twice.
    tpl4int init = thrust::make_tuple((int) 0, (int) 0, (int) 0, (int) 1);
    tpl4int result = thrust::transform_reduce(first, last, my_unary_op, init, my_binary_op);

    std::cout << "row 0 = " << thrust::get<0>(result) << std::endl;
    std::cout << "row 1 = " << thrust::get<1>(result) << std::endl;
    std::cout << "row 2 = " << thrust::get<2>(result) << std::endl;
    return 0;
}
Notes:
This is just using host_vector. Extending it to work with device_vector, or templatizing it to work with something other than int should be straightforward.
The init value passed to transform_reduce is combined with every element in [first, last). Seeding it with my_unary_op(*first) therefore counts the first column twice; for a plain per-row sum, the init value should be neutral (a tuple with zeros in the three data slots).

Type of return value of thrust::remove_if

I have two arrays of integers dmap and dflag on the device of
the same length
and I have wrapped them with thrust device pointers, dmapt and
dflagt
There are some elements in the dmap array with value -1. I want to
remove these -1's and the corresponding values from
the dflag array.
I am using the remove_if function to do this, but I cannot figure out
what the return value of this call is or how I should use this
returned value to get the shortened arrays.
( I want to pass these reduced arrays to the reduce_by_key function
where dflagt will be used as the keys. )
I am using the following call for doing the reduction. Please let me
know how I can store the returned value in a variable and
use it to address the individual arrays dflag and dmap
thrust::remove_if(
thrust::make_zip_iterator(thrust::make_tuple(dmapt, dflagt)),
thrust::make_zip_iterator(thrust::make_tuple(dmapt+numindices, dflagt+numindices)),
minus_one_equality_test()
);
where the predicate functor used above is defined as
// Predicate over (map, flag) tuples: true when the map value (element 0) is -1.
// Intended for thrust::remove_if over a zip_iterator of the two arrays, so a
// tuple is removed from both arrays whenever its map entry is -1.
struct minus_one_equality_test
{
    // No `typename` here: this is not a dependent name inside a template.
    typedef thrust::tuple<int,int> Tuple;

    __host__ __device__
    bool operator()(const Tuple& a ) const
    {
        return thrust::get<0>(a) == (-1);
    }
};  // a struct definition must be terminated by ';'
The return value is a zip_iterator which marks the new end of the sequence of tuples that were kept, i.e. those for which your functor returned false during the remove_if call (tuples for which it returned true are removed). To access the new end iterator of each underlying array you will need to retrieve the iterator tuple from the zip_iterator; the contents of that tuple are then the new end iterators of the original arrays you used to build the zip_iterator. It is a lot more convoluted in words than in code:
#include <thrust/tuple.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/remove.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/copy.h>
#include <iostream>
// Predicate over (map, flag) tuples: true when the map value (element 0) is -1.
struct minus_one_equality_test
{
    typedef thrust::tuple<int,int> Tuple;

    __host__ __device__
    bool operator()(const Tuple& a )
    {
        const int map_value = thrust::get<0>(a);
        return map_value == -1;
    }
};
// Removes every (map, flag) pair whose map value is -1 from two device arrays
// addressed through thrust::device_ptr, then prints the surviving map values.
int main(void)
{
    const int numindices = 10;
    int mapt[numindices] = { 1, 2, -1, 4, 5, -1, 7, 8, -1, 10 };
    int flagt[numindices] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };

    // Size the device storage from numindices so it always matches the host arrays.
    thrust::device_vector<int> vmapt(mapt, mapt + numindices);
    thrust::device_vector<int> vflagt(flagt, flagt + numindices);
    thrust::device_ptr<int> dmapt = vmapt.data();
    thrust::device_ptr<int> dflagt = vflagt.data();

    // The zip iterator is built from device_ptr, so the iterator typedefs use
    // device_ptr as well; the returned iterator then has exactly this type and
    // no implicit iterator conversion is involved.
    typedef thrust::device_ptr< int > PtrIt;
    typedef thrust::tuple< PtrIt, PtrIt > TupleIt;
    typedef thrust::zip_iterator< TupleIt > ZipIt;

    ZipIt Zend = thrust::remove_if(
        thrust::make_zip_iterator(thrust::make_tuple(dmapt, dflagt)),
        thrust::make_zip_iterator(thrust::make_tuple(dmapt+numindices, dflagt+numindices)),
        minus_one_equality_test()
    );

    // Unpack the zip iterator: element 0 is the new end of the map array,
    // element 1 would be the new end of the flag array.
    TupleIt Tend = Zend.get_iterator_tuple();
    PtrIt dmapt_end = thrust::get<0>(Tend);

    for(PtrIt x = dmapt; x != dmapt_end; x++) {
        std::cout << *x << std::endl;
    }
    return 0;
}
If you compile this and run it, you should see something like this:
$ nvcc -arch=sm_12 remove_if.cu
$ ./a.out
1
2
4
5
7
8
10
In this example I only "retrieve" the shortened contents of the first element of the tuple; the second is accessed in the same way, i.e. the iterator marking the new end of the second vector is thrust::get<1>(Tend).