Related
I am using Cuda C++, and I have a big array Arr including 64-bit unsigned integers in a form like the below:
Arr = {a1, b1, a2, b2, ..., an, bn}
The number of items in Arr is 2n which is an even number. Now, given a boolean function f(int a, int b), I wonder if I can use thrust::remove_if to check f(a1,b1), f(a2, b2), ..., f(an, bn) and remove both consecutive numbers (ai, bi) together if needed?
Rather than zip_iterator, I think a simpler approach is just to reinterpret the array of 64-bit integers as an array of thrust::pair. here is an example:
$ cat t2157.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/remove.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
struct my_remove
{
template <typename T>
__host__ __device__
bool operator()(T t){
return (thrust::get<0>(t) > thrust::get<1>(t));
}
};
const size_t n = 32768;
const size_t s = 2*n;
using mt=unsigned long long;
using dt=thrust::pair<mt,mt>;
int main(){
thrust::device_vector<mt> A(s);
thrust::sequence(A.begin(), A.end());
A[0] = 2; // expecting removal of the first pair
thrust::copy_n(A.begin(), 6, std::ostream_iterator<mt>(std::cout, ","));
std::cout << std::endl;
auto D = thrust::device_pointer_cast<dt>(reinterpret_cast<dt *>(thrust::raw_pointer_cast(A.data())));
thrust::remove_if(D, D+n, my_remove());
thrust::copy_n(A.begin(), 6, std::ostream_iterator<mt>(std::cout, ","));
std::cout << std::endl;
}
$ nvcc -o t2157 t2157.cu
$ compute-sanitizer ./t2157
========= COMPUTE-SANITIZER
2,1,2,3,4,5,
2,3,4,5,6,7,
========= ERROR SUMMARY: 0 errors
$
Assume we have an array like this:
0, 0, 0, 1, 2, 2, 2, 3, 3, 4, ...
I would like to have the index of every first occurrence of every value, so in this example [0, 3, 4, 7, 9]. The array is sorted and all possible values are known and consecutive.
Possible solutions I have is using a kernel for every element in this array and use an atomicmin to save the lowest index. But I assume a better approach is possible.
You can do this with a single call to thrust::unique_by_key() if you provide a vector of indices e.g. via thrust::sequence(). Here's a worked example:
$ cat t3.cu
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/unique.h>
#include <thrust/sequence.h>
#include <iostream>
int main(){
int keys[] = {0, 0, 0, 1, 2, 2, 2, 3, 3, 4};
int ks = sizeof(keys)/sizeof(keys[0]);
thrust::device_vector<int> d_keys(keys, keys+ks);
thrust::device_vector<int> d_result(ks);
thrust::sequence(d_result.begin(), d_result.end());
int rs = (thrust::unique_by_key(d_keys.begin(), d_keys.end(), d_result.begin())).first - d_keys.begin();
thrust::copy_n(d_result.begin(), rs, std::ostream_iterator<int>(std::cout, ","));
std::cout << std::endl;
}
$ nvcc -arch=sm_35 -o t3 t3.cu
$ ./t3
0,3,4,7,9,
$
The important activity occurring here is stream compaction and thrust provides a nice set of routines for various use-cases. For example this operation could also be done with thrust::unique_copy() and in that case, with some additional code complexity, you could eliminate the need for the thrust::sequence() call (it would be replaced by a thrust::counting_iterator zipped together with your data, and an appropriate selection functor), but it still requires an output vector of the same length.
As #tera pointed out, you can compare a number with the previous number to determine whether it is the first occurrence in a sequence of unique number. You can write a kernel to generate a mask for this criteria such that the mask array contains the index for a number which is a first occurrence and a negative number (like -1, as it cannot be an index) otherwise. After that, use thrust to count the non -1 values by using a predicate. Then copy those values from the mask using the same predicate as above. Finally, copy back the results to host.
Here is a sample implementation of the above mentioned approach.
#include <iostream>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/count.h>
#include <thrust/copy.h>
using namespace std;
//Copy index
__global__ void is_first_occurence(int* input, int* is, int count)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid<count)
{
if(tid == 0)
{
is[0] = 0;
}
else if(input[tid] != input[tid-1])
{
is[tid] = tid;
}
else
{
is[tid] = -1;
}
}
}
struct isFirst
{
__host__ __device__ bool operator()(const int x)
{
return (x != -1);
}
};
int main(int argc, char** argv)
{
const int count = 13;
std::vector<int> arr = { 0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 4 ,4 };
thrust::device_vector<int> arr_d = arr;
thrust::device_vector<int> mask_d(arr_d.size());
int* pArr = thrust::raw_pointer_cast(arr_d.data() );
int* pMask = thrust::raw_pointer_cast(mask_d.data() );
dim3 block(16);
dim3 grid((count + block.x -1)/block.x);
is_first_occurence<<<grid,block>>>(pArr, pMask, count);
cudaDeviceSynchronize();
int num_unique = thrust::count_if(mask_d.begin(), mask_d.end(), isFirst());
thrust::copy_if(mask_d.begin(), mask_d.end(), arr_d.begin(), isFirst());
std::vector<int> unique_indices(num_unique);
thrust::copy(arr_d.begin(), arr_d.begin() + num_unique, unique_indices.begin());
for(auto i:unique_indices)
{
cout<<i<<endl;
}
return 0;
}
Compiled and tested using the following command:
nvcc -o get_unique get_unique.cu -std=c++11 -arch=sm_61
I want to perform a transformation on a input thrust::device_vector and only copy the result to the output vector if the result satisfies a predicate. So the number of results could be less than the size of the input device_vector (similar to the output vector of thrust::copy_if). I have not found a way to do this with thrust::transform_if. Currently I can do this with thrust::transform and thrust::remove_if as shown in the example below:
#include <thrust/random.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/remove.h>
#include <iostream>
__host__ __device__ unsigned int hash(unsigned int a) {
a = (a+0x7ed55d16) + (a<<12);
a = (a^0xc761c23c) ^ (a>>19);
a = (a+0x165667b1) + (a<<5);
a = (a+0xd3a2646c) ^ (a<<9);
a = (a+0xfd7046c5) + (a<<3);
a = (a^0xb55a4f09) ^ (a>>16);
return a;
};
struct add_random {
__host__ __device__ add_random() {}
__device__ int operator()(const int n, const int x) const {
thrust::default_random_engine rng(hash(n));
thrust::uniform_int_distribution<int> uniform(0, 11);
return uniform(rng)+x;
}
};
struct is_greater {
__host__ __device__ bool operator()(const int x) {
return x > 6 ;
}
};
int main(void) {
int x[5] = {10, 2, 5, 3, 0};
thrust::device_vector<int> d_x(x, x+5);
thrust::transform(
thrust::counting_iterator<int>(0),
thrust::counting_iterator<int>(5),
d_x.begin(),
d_x.begin(),
add_random());
std::cout << "after adding random number:" << std::endl;
std::ostream_iterator<int> o(std::cout, " ");
thrust::copy(d_x.begin(), d_x.end(), o);
std::cout << std::endl;
thrust::device_vector<int>::iterator new_end(thrust::remove_if(d_x.begin(), d_x.end(), is_greater()));
std::cout << "after removing values greater than 6:" << std::endl;
thrust::copy(d_x.begin(), new_end, o);
std::cout << std::endl;
return 0;
}
Which gives the output:
after adding random number:
18 4 8 7 11
after removing values greater than 6:
4
I would like to avoid copying the results to memory twice, first by thrust::transform and then by thrust::remove_if in the above example. Is it possible to get the above output with a single transformation function? How can I do this? My biggest concern is the computational cost, so any optimized solution, even if it doesn't use the Thrust library would be great.
Welcome to the world of thrust fancy iterators. You can get a quick overview of some fancy iterator types by looking at the thrust quick start guide. In particular, a thrust transform iterator can frequently be used to replace a thrust transform operation that is applied to the input of another thrust algorithm, "fusing" the two algorithms into a single operation.
Here's a worked example applied to your case:
$ cat t1254.cu
#include <thrust/random.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/remove.h>
#include <iostream>
__host__ __device__ unsigned int hash(unsigned int a) {
a = (a+0x7ed55d16) + (a<<12);
a = (a^0xc761c23c) ^ (a>>19);
a = (a+0x165667b1) + (a<<5);
a = (a+0xd3a2646c) ^ (a<<9);
a = (a+0xfd7046c5) + (a<<3);
a = (a^0xb55a4f09) ^ (a>>16);
return a;
};
struct add_random : public thrust::unary_function<thrust::tuple<int, int>, int> {
__host__ __device__ int operator()(thrust::tuple<int, int> t) const {
int n = thrust::get<0>(t);
int x = thrust::get<1>(t);
thrust::default_random_engine rng(hash(n));
thrust::uniform_int_distribution<int> uniform(0, 11);
return uniform(rng)+x;
}
};
struct is_greater {
__host__ __device__ bool operator()(const int x) {
return x < 6 ;
}
};
int main(void) {
int x[5] = {10, 2, 5, 3, 0};
thrust::device_vector<int> d_x(x, x+5);
thrust::device_vector<int> d_r(5);
int rsize = thrust::copy_if(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<int>(0), d_x.begin())), add_random()), thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<int>(5), d_x.end())), add_random()), d_r.begin(), is_greater())- d_r.begin();
std::cout << "after removing values greater than 6:" << std::endl;
thrust::copy_n(d_r.begin(), rsize, std::ostream_iterator<int>(std::cout, " "));
std::cout << std::endl;
return 0;
}
$ nvcc -o t1254 t1254.cu
$ ./t1254
after removing values greater than 6:
4
$
We've replaced your transform operation with a transform iterator applied to the same two inputs. Since you have two inputs to your transform operation, we're using a zip iterator to combine these, and the transform functor has also been reworked slightly to accept that tuple as its input.
Converted your remove_if to a copy_if, to work with the transform iterator as input. This requires a slight change in the logic of the copy predicate.
In the thrust example, there is code expand.cu. But when i want to get the value, has shown error like this
error: expression must be a modifiable lvalue in the line
SX = d_output;
Where is the wrong part? the code is like this
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/gather.h>
#include <thrust/scan.h>
#include <thrust/fill.h>
#include <thrust/copy.h>
#include <iterator>
#include <iostream>
template <typename InputIterator1,
typename InputIterator2,
typename OutputIterator>
OutputIterator expand(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
OutputIterator output)
{
typedef typename thrust::iterator_difference<InputIterator1>::type difference_type;
difference_type input_size = thrust::distance(first1, last1);
difference_type output_size = thrust::reduce(first1, last1);
// scan the counts to obtain output offsets for each input element
thrust::device_vector<difference_type> output_offsets(input_size, 0);
thrust::exclusive_scan(first1, last1, output_offsets.begin());
// scatter the nonzero counts into their corresponding output positions
thrust::device_vector<difference_type> output_indices(output_size, 0);
thrust::scatter_if
(thrust::counting_iterator<difference_type>(0),
thrust::counting_iterator<difference_type>(input_size),
output_offsets.begin(),
first1,
output_indices.begin());
// compute max-scan over the output indices, filling in the holes
thrust::inclusive_scan
(output_indices.begin(),
output_indices.end(),
output_indices.begin(),
thrust::maximum<difference_type>());
// gather input values according to index array (output = first2[output_indices])
OutputIterator output_end = output; thrust::advance(output_end, output_size);
thrust::gather(output_indices.begin(),
output_indices.end(),
first2,
output);
// return output + output_size
thrust::advance(output, output_size);
return output;
}
int main(void)
{
int counts[] = {3,5,2,0,1,3,4,2,4};
int values[] = {1,2,3,4,5,6,7,8,9};
size_t input_size = sizeof(counts) / sizeof(int);
size_t output_size = thrust::reduce(counts, counts + input_size);
// copy inputs to device
thrust::device_vector<int> d_counts(counts, counts + input_size);
thrust::device_vector<int> d_values(values, values + input_size);
thrust::device_vector<int> d_output(output_size);
// expand values according to counts
expand(d_counts.begin(), d_counts.end(),
d_values.begin(),
d_output.begin());
std::cout << "d_output: " ;
for(int i=0; i<output_size; i++)
std::cout << d_output[i] << " , ";
//d::cout << d_output[i] << " , ";
thrust::device_vector<int> SX[output_size];
//SX = d_output;
thrust::copy(d_output.begin() , d_output.end(), SX.begin());
return 0;
}
It should be a device vector of the length output_size as
thrust::device_vector<int> SX(output_size);
Using [] means array of device vectors.
Let's say I have two device_vector<byte> arrays, d_keys and d_data.
If d_data is, for example, a flattened 2D 3x5 array ( e.g. { 1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 6, 5, 4, 3 } ) and d_keys is a 1D array of size 5 ( e.g. { 1, 0, 0, 1, 1 } ), how can I do a reduction such that I'd end up only adding values on a per-row basis if the corresponding d_keys value is one ( e.g. ending up with a result of { 10, 23, 14 } )?
The sum_rows.cu example allows me to add every value in d_data, but that's not quite right.
Alternatively, I can, on a per-row basis, use a zip_iterator and combine d_keys with one row of d_data at a time, and do a transform_reduce, adding only if the key value is one, but then I'd have to loop through the d_data array.
What I really need is some sort of transform_reduce_by_key functionality that isn't built-in, but surely there must be a way to make it!
Based on the additional comment that instead of 3 rows there are thousands of rows, we can write a transform functor that sums an entire row. Based on the fact that there are thousands of rows, this should keep the machine pretty busy:
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/fill.h>
#define ROW 20
#define COL 10
__device__ int *vals;
__device__ int *keys;
struct test_functor
{
const int a;
test_functor(int _a) : a(_a) {}
__device__
int operator()(int& x, int& y ) {
int temp = 0;
for (int i = 0; i<a; i++)
temp += vals[i + (y*a)] * keys[i];
return temp;
}
};
int main(){
int *s_vals, *s_keys;
thrust::host_vector<int> h_vals(ROW*COL);
thrust::host_vector<int> h_keys(COL);
thrust::sequence(h_vals.begin(), h_vals.end());
thrust::fill(h_keys.begin(), h_keys.end(), 1);
h_keys[0] = 0;
thrust::device_vector<int> d_vals = h_vals;
thrust::device_vector<int> d_keys = h_keys;
thrust::device_vector<int> d_sums(ROW);
thrust::fill(d_sums.begin(), d_sums.end(), 0);
s_vals = thrust::raw_pointer_cast(&d_vals[0]);
s_keys = thrust::raw_pointer_cast(&d_keys[0]);
cudaMemcpyToSymbol(vals, &s_vals, sizeof(int *));
cudaMemcpyToSymbol(keys, &s_keys, sizeof(int *));
thrust::device_vector<int> d_idx(ROW);
thrust::sequence(d_idx.begin(), d_idx.end());
thrust::transform(d_sums.begin(), d_sums.end(), d_idx.begin(), d_sums.begin(), test_functor(COL));
thrust::host_vector<int> h_sums = d_sums;
std::cout << "Results :" << std::endl;
for (unsigned i = 0; i<ROW; i++)
std::cout<<"h_sums["<<i<<"] = " << h_sums[i] << std::endl;
return 0;
}
This approach has the drawback that in general accesses to the vals array will not be coalesced. However for a few thousand rows the cache may offer significant relief. We can fix this problem by re-ordering the data to be stored in column-major form in the flattened array, and change our indexing method in the loop in the functor to be like this:
for (int i=0; i<a; i++)
temp += vals[(i*ROW)+y]*keys[i];
If preferred, you can pass ROW as an additional parameter to the functor.
Here is some sample code that does something like what you are after, using the approach I outlined in my comment below your question. In fact we want to use 4-tuples, to pick up your key value. Reproducing the suitably modified comment here:
You could make a zip iterator that zips your 3 rows together plus the key "row" and passes a 4-tuple to a special functor. Your special functor would then do a reduction on the array of 3-tuples (using the key also) and return a result that is a 4-tuple. The thrust dot product example may give you some ideas.
This is one possible approach:
#include <thrust/host_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <thrust/fill.h>
#include <thrust/tuple.h>
#define N 30 // make this evenly divisible by 3 for this example
typedef thrust::tuple<int, int, int, int> tpl4int;
typedef thrust::host_vector<int>::iterator intiter;
typedef thrust::tuple<intiter, intiter, intiter, intiter> tpl4intiter;
typedef thrust::zip_iterator<tpl4intiter> int4zip;
struct r3key_unary_op : public thrust::unary_function<tpl4int, tpl4int>
{
__host__ __device__
tpl4int operator()(const tpl4int& x) const
{
tpl4int result;
thrust::get<0>(result) = x.get<0>()*x.get<3>();
thrust::get<1>(result) = x.get<1>()*x.get<3>();
thrust::get<2>(result) = x.get<2>()*x.get<3>();
thrust::get<3>(result) = 1;
return result;
}
};
struct r3key_binary_op : public thrust::binary_function<tpl4int, tpl4int, tpl4int>
{
__host__ __device__
tpl4int operator()(const tpl4int& x, const tpl4int& y) const
{
tpl4int result;
thrust::get<0>(result) = x.get<0>()*x.get<3>() + y.get<0>()*y.get<3>();
thrust::get<1>(result) = x.get<1>()*x.get<3>() + y.get<1>()*y.get<3>();
thrust::get<2>(result) = x.get<2>()*x.get<3>() + y.get<2>()*y.get<3>();
thrust::get<3>(result) = 1;
return result;
}
};
int main() {
thrust::host_vector<int> A(N); // values, in 3 "rows" flattened
thrust::sequence(A.begin(), A.end());
thrust::host_vector<int> K(N/3); // keys in one row
thrust::fill(K.begin(), K.end(), 1); // set some keys to 1
K[9] = 0; // set some keys to zero
int4zip first = thrust::make_zip_iterator(thrust::make_tuple(A.begin(), A.begin() + N/3, A.begin() + 2*N/3, K.begin()));
int4zip last = thrust::make_zip_iterator(thrust::make_tuple(A.begin() + N/3, A.begin() + 2*N/3, A.end(), K.end()));
r3key_unary_op my_unary_op;
r3key_binary_op my_binary_op;
tpl4int init = my_unary_op(*first);
// init = thrust::make_tuple((int) 0, (int) 0, (int) 0, (int) 0);
tpl4int result = thrust::transform_reduce(first, last, my_unary_op, init, my_binary_op);
std::cout << "row 0 = " << result.get<0>() << std::endl;
std::cout << "row 1 = " << result.get<1>() << std::endl;
std::cout << "row 2 = " << result.get<2>() << std::endl;
return 0;
}
Notes:
This is just using host_vector. Extending it to work with device_vector, or templatizing it to work with something other than int should be straightforward.
For completeness, I am using the unary functor to provide an init value other than zero for the sum reduction of each row. You might want to change the init value to zero (a 4-tuple of zeros).