I'm quite new to Thrust (CUDA) and am finding something challenging.
(I've edited the question to simplify it.) I have an input vector and a map:
vector = [8, 23, 46, 500, 2, 7, 91, 91]
map    = [1,  0,  4,   3, 1, 0,  5,  3]
I want to expand this and increment the values to become:
new_vec = [8,46,47,48,49,500,501,502,2,91,92,93,94,95,91,92,93]
I realise the thrust/examples/expand.cu example already does most of this, but I don't know how to efficiently increment each expanded value by its offset within its run.
It would be helpful if someone could explain how to modify this example to achieve this.
Adapt the Thrust expand example to use exclusive_scan_by_key to rank each output element within its subsequence and then increment by that rank:
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/gather.h>
#include <thrust/scan.h>
#include <thrust/scatter.h>
#include <thrust/transform.h>
#include <thrust/fill.h>
#include <thrust/copy.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/functional.h>
#include <string>
#include <iterator>
#include <iostream>
template <typename Vector>
void print(const std::string& s, const Vector& v)
{
  typedef typename Vector::value_type T;
  std::cout << s;
  thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, " "));
  std::cout << std::endl;
}
template <typename InputIterator1,
          typename InputIterator2,
          typename OutputIterator>
void expand_and_increment(InputIterator1 first1,
                          InputIterator1 last1,
                          InputIterator2 first2,
                          OutputIterator output)
{
  typedef typename thrust::iterator_difference<InputIterator1>::type difference_type;

  difference_type input_size  = thrust::distance(first1, last1);
  difference_type output_size = thrust::reduce(first1, last1);

  // scan the counts to obtain output offsets for each input element
  thrust::device_vector<difference_type> output_offsets(input_size);
  thrust::exclusive_scan(first1, last1, output_offsets.begin());
  print("output_offsets ", output_offsets);

  // scatter the nonzero counts into their corresponding output positions
  thrust::device_vector<difference_type> output_indices(output_size);
  thrust::scatter_if(thrust::counting_iterator<difference_type>(0),
                     thrust::counting_iterator<difference_type>(input_size),
                     output_offsets.begin(),
                     first1,
                     output_indices.begin());

  // compute max-scan over the output indices, filling in the holes
  thrust::inclusive_scan(output_indices.begin(),
                         output_indices.end(),
                         output_indices.begin(),
                         thrust::maximum<difference_type>());
  print("output_indices ", output_indices);

  // gather input values according to index array (output = first2[output_indices])
  thrust::gather(output_indices.begin(),
                 output_indices.end(),
                 first2,
                 output);

  // rank each output element within its run of equal output_indices
  thrust::device_vector<difference_type> ranks(output_size);
  thrust::exclusive_scan_by_key(output_indices.begin(), output_indices.end(),
                                thrust::make_constant_iterator<difference_type>(1),
                                ranks.begin());
  print("ranks ", ranks);

  // increment each gathered value by its rank
  thrust::transform(output, output + output_size,
                    ranks.begin(),
                    output,
                    thrust::placeholders::_1 + thrust::placeholders::_2);
}
int main(void)
{
  int values[] = {8, 23, 46, 500, 2, 7, 91, 91};
  int counts[] = {1,  0,  4,   3, 1, 0,  5,  3};

  size_t input_size  = sizeof(counts) / sizeof(int);
  size_t output_size = thrust::reduce(counts, counts + input_size);

  // copy inputs to device
  thrust::device_vector<int> d_counts(counts, counts + input_size);
  thrust::device_vector<int> d_values(values, values + input_size);
  thrust::device_vector<int> d_output(output_size);

  // expand values according to counts
  expand_and_increment(d_counts.begin(), d_counts.end(),
                       d_values.begin(),
                       d_output.begin());

  std::cout << "Expanding and incrementing values according to counts" << std::endl;
  print(" counts ", d_counts);
  print(" values ", d_values);
  print(" output ", d_output);

  return 0;
}
The output:
$ nvcc expand_and_increment.cu -run
output_offsets 0 1 1 5 8 9 9 14
output_indices 0 2 2 2 2 3 3 3 4 6 6 6 6 6 7 7 7
ranks 0 0 1 2 3 0 1 2 0 0 1 2 3 4 0 1 2
Expanding and incrementing values according to counts
counts 1 0 4 3 1 0 5 3
values 8 23 46 500 2 7 91 91
output 8 46 47 48 49 500 501 502 2 91 92 93 94 95 91 92 93
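As a side note, the ranks can also be obtained without the extra exclusive_scan_by_key: each output position minus the starting offset of its run gives the same rank. A minimal sketch (my own variant, not part of the original expand example), used inside expand_and_increment in place of the scan_by_key step; it reuses output_offsets, output_indices and ranks from above and additionally needs <thrust/iterator/permutation_iterator.h>:
// rank[i] = i - output_offsets[output_indices[i]]
// i.e. the output position minus the start of the run it belongs to
thrust::transform(thrust::counting_iterator<difference_type>(0),
                  thrust::counting_iterator<difference_type>(output_size),
                  thrust::make_permutation_iterator(output_offsets.begin(),
                                                    output_indices.begin()),
                  ranks.begin(),
                  thrust::minus<difference_type>());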
Related
I'm using CUDA and Thrust to perform paired set operations. However, I would like to retain duplicates. For example:
int keys[7] = {1, 1, 1, 3, 4, 5, 5};
int vals[7] = {1, 2, 3, 4, 5, 6, 7};
int comp[2] = {1, 5};
thrust::set_intersection_by_key(keys, keys + 7, comp, comp + 2, vals, rk, rv);
Desired result
rk[1, 1, 1, 5, 5]
rv[1, 2, 3, 6, 7]
Actual Result
rk[1, 5]
rv[5, 7]
I want all of the vals where the corresponding key is contained in comp.
Is there any way to achieve this using thrust, or do I have to write my own kernel or thrust function?
I'm using this function: set_intersection_by_key.
Quoting from the thrust documentation:
The generalization is that if an element appears m times in [keys_first1, keys_last1) and n times in [keys_first2, keys_last2) (where m may be zero), then it appears min(m,n) times in the keys output range
Since comp contains each key only once, n = 1 and therefore min(m,1) = 1.
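To make the min(m,n) rule concrete, here is a minimal illustration (my own sketch, not from the original post): listing key 1 twice in comp makes two copies of it survive the intersection.
#include <thrust/device_vector.h>
#include <thrust/set_operations.h>
#include <thrust/pair.h>
#include <thrust/copy.h>
#include <iostream>
#include <iterator>

int main()
{
  int keys[] = {1, 1, 1, 3, 4, 5, 5};
  int vals[] = {1, 2, 3, 4, 5, 6, 7};
  int comp[] = {1, 1, 5};                 // key 1 listed twice this time

  thrust::device_vector<int> d_keys(keys, keys + 7);
  thrust::device_vector<int> d_vals(vals, vals + 7);
  thrust::device_vector<int> d_comp(comp, comp + 3);
  thrust::device_vector<int> rk(7), rv(7);

  // key 1: min(3, 2) = 2 copies survive; key 5: min(2, 1) = 1 copy survives
  thrust::pair<thrust::device_vector<int>::iterator,
               thrust::device_vector<int>::iterator> ends =
    thrust::set_intersection_by_key(d_keys.begin(), d_keys.end(),
                                    d_comp.begin(), d_comp.end(),
                                    d_vals.begin(),
                                    rk.begin(), rv.begin());

  rk.resize(ends.first  - rk.begin());
  rv.resize(ends.second - rv.begin());

  thrust::copy(rk.begin(), rk.end(), std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;  // expected: 1 1 5
  thrust::copy(rv.begin(), rv.end(), std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;  // expected: 1 2 6 (values taken from the first range)
  return 0;
}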
In order to get "all of the vals where the corresponding key is contained in comp", you can use the approach of my answer to a similar problem.
In short, the example code performs the following steps:
1. Get the largest element of d_comp. This assumes that d_comp is already sorted.
2. Create a vector d_map of size largest_element+1 and scatter a 1 into d_map at every position listed in d_comp.
3. Copy all entries from d_vals for which there is a 1 entry in d_map into d_result.
#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/functional.h>
#include <thrust/copy.h>
#include <thrust/scatter.h>
#include <iterator>
#include <iostream>
#define PRINTER(name) print(#name, (name))
void print(const char* name, const thrust::device_vector<int>& v)
{
  std::cout << name << ":\t";
  thrust::copy(v.begin(), v.end(), std::ostream_iterator<int>(std::cout, "\t"));
  std::cout << std::endl;
}
int main()
{
  int keys[] = {1, 1, 1, 3, 4, 5, 5};
  int vals[] = {1, 2, 3, 4, 5, 6, 7};
  int comp[] = {1, 5};
  const int size_data = sizeof(keys) / sizeof(keys[0]);
  const int size_comp = sizeof(comp) / sizeof(comp[0]);

  // copy data to GPU
  thrust::device_vector<int> d_keys(keys, keys + size_data);
  thrust::device_vector<int> d_vals(vals, vals + size_data);
  thrust::device_vector<int> d_comp(comp, comp + size_comp);

  PRINTER(d_keys);
  PRINTER(d_vals);
  PRINTER(d_comp);

  int largest_element = d_comp.back();
  thrust::device_vector<int> d_map(largest_element + 1);
  thrust::constant_iterator<int> one(1);
  thrust::scatter(one, one + size_comp, d_comp.begin(), d_map.begin());
  PRINTER(d_map);

  thrust::device_vector<int> d_result(size_data);
  using namespace thrust::placeholders;
  int final_size = thrust::copy_if(d_vals.begin(),
                                   d_vals.end(),
                                   thrust::make_permutation_iterator(d_map.begin(), d_keys.begin()),
                                   d_result.begin(),
                                   _1) - d_result.begin();
  d_result.resize(final_size);
  PRINTER(d_result);

  return 0;
}
output:
d_keys: 1 1 1 3 4 5 5
d_vals: 1 2 3 4 5 6 7
d_comp: 1 5
d_map: 0 1 0 0 0 1
d_result: 1 2 3 6 7
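One caveat of this approach is that d_map has largest_element+1 entries, which can be wasteful when the keys are large or sparse. As an alternative sketch (my own addition, reusing d_keys, d_vals and the sorted d_comp from the example above), a vectorized thrust::binary_search can produce the stencil directly, without the dense map:
#include <thrust/binary_search.h>

// look each key up in the sorted d_comp; d_stencil[i] is true if d_keys[i] is present
thrust::device_vector<bool> d_stencil(d_keys.size());
thrust::binary_search(d_comp.begin(), d_comp.end(),
                      d_keys.begin(), d_keys.end(),
                      d_stencil.begin());

// keep the values whose key was found
thrust::device_vector<int> d_result2(d_keys.size());
int result_size = thrust::copy_if(d_vals.begin(), d_vals.end(),
                                  d_stencil.begin(),
                                  d_result2.begin(),
                                  thrust::identity<bool>()) - d_result2.begin();
d_result2.resize(result_size);   // 1 2 3 6 7, as before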
I have an M*N matrix in host memory, and upon copying it into device memory, I need it transposed into an N*M matrix. Is there any CUDA (cuBLAS, ...) API that does that? I am using CUDA 4. Thanks!
To answer your question on efficiency, I have compared two ways to perform matrix transposition, one using the Thrust library and one using cublas<t>geam, as suggested by Robert Crovella. The result of the comparison is the following on a Kepler K20c card:
| Matrix size | Thrust [ms] | cuBLAS [ms] |
|-------------|-------------|-------------|
| 32x32       | 0.015       | 0.016       |
| 64x64       | 0.015       | 0.017       |
| 128x128     | 0.019       | 0.017       |
| 256x256     | 0.028       | 0.017       |
| 512x512     | 0.088       | 0.042       |
| 1024x1024   | 0.34        | 0.13        |
| 2048x2048   | 1.24        | 0.48        |
| 4096x4096   | 11.02       | 1.98        |
As can be seen, cublas<t>geam outperforms the Thrust version. Below is the code used for the comparison.
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/gather.h>
#include <thrust/scan.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <iostream>
#include <iomanip>
#include <stdio.h>
#include <cublas_v2.h>
#include <conio.h>   // Windows-only; provides getch()
#include <assert.h>
/**********************/
/* cuBLAS ERROR CHECK */
/**********************/
#ifndef cublasSafeCall
#define cublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
#endif

inline void __cublasSafeCall(cublasStatus_t err, const char *file, const int line)
{
  if (CUBLAS_STATUS_SUCCESS != err) {
    fprintf(stderr, "CUBLAS error in file '%s', line %d\nerror %d\nterminating!\n", file, line, err);
    getch(); cudaDeviceReset(); assert(0);
  }
}
// convert a linear index to a linear index in the transpose
struct transpose_index : public thrust::unary_function<size_t, size_t>
{
  size_t m, n;

  __host__ __device__
  transpose_index(size_t _m, size_t _n) : m(_m), n(_n) {}

  __host__ __device__
  size_t operator()(size_t linear_index)
  {
    size_t i = linear_index / n;
    size_t j = linear_index % n;
    return m * j + i;
  }
};

// convert a linear index to a row index
struct row_index : public thrust::unary_function<size_t, size_t>
{
  size_t n;

  __host__ __device__
  row_index(size_t _n) : n(_n) {}

  __host__ __device__
  size_t operator()(size_t i)
  {
    return i / n;
  }
};
// transpose an M-by-N array
template <typename T>
void transpose(size_t m, size_t n, thrust::device_vector<T>& src, thrust::device_vector<T>& dst)
{
  thrust::counting_iterator<size_t> indices(0);

  thrust::gather(thrust::make_transform_iterator(indices, transpose_index(n, m)),
                 thrust::make_transform_iterator(indices, transpose_index(n, m)) + dst.size(),
                 src.begin(), dst.begin());
}

// print an M-by-N array
template <typename T>
void print(size_t m, size_t n, thrust::device_vector<T>& d_data)
{
  thrust::host_vector<T> h_data = d_data;

  for (size_t i = 0; i < m; i++)
  {
    for (size_t j = 0; j < n; j++)
      std::cout << std::setw(8) << h_data[i * n + j] << " ";
    std::cout << "\n";
  }
}
int main(void)
{
  size_t m = 5;   // number of rows
  size_t n = 4;   // number of columns

  // 2d array stored in row-major order [(0,0), (0,1), (0,2) ... ]
  thrust::device_vector<double> data(m * n, 1.);
  data[1] = 2.;
  data[3] = 3.;

  std::cout << "Initial array" << std::endl;
  print(m, n, data);

  std::cout << "Transpose array - Thrust" << std::endl;
  thrust::device_vector<double> transposed_thrust(m * n);
  transpose(m, n, data, transposed_thrust);
  print(n, m, transposed_thrust);

  std::cout << "Transpose array - cuBLAS" << std::endl;
  thrust::device_vector<double> transposed_cuBLAS(m * n);
  double* dv_ptr_in  = thrust::raw_pointer_cast(data.data());
  double* dv_ptr_out = thrust::raw_pointer_cast(transposed_cuBLAS.data());
  double alpha = 1.;
  double beta  = 0.;
  cublasHandle_t handle;
  cublasSafeCall(cublasCreate(&handle));
  cublasSafeCall(cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha, dv_ptr_in, n, &beta, dv_ptr_in, n, dv_ptr_out, m));
  print(n, m, transposed_cuBLAS);

  getch();
  return 0;
}
In the cuBLAS API, cublas<t>geam() performs matrix-matrix addition/transposition: the user can transpose matrix A by setting *alpha = 1 and *beta = 0 (and specifying the transa operator as CUBLAS_OP_T for transpose).
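For reference, a minimal sketch of using geam this way (my own example, single precision, requires the cuBLAS shipped with CUDA 5.0 or later; it assumes column-major storage as cuBLAS expects and that the caller checks the returned status):
#include <cublas_v2.h>

// Transpose a rows x cols column-major matrix d_A into the cols x rows matrix d_AT.
// With beta == 0 the B operand is not referenced, so d_A is passed as a placeholder.
void transpose_with_geam(cublasHandle_t handle,
                         const float* d_A, float* d_AT,
                         int rows, int cols)
{
  const float alpha = 1.0f;
  const float beta  = 0.0f;
  cublasSgeam(handle,
              CUBLAS_OP_T, CUBLAS_OP_N,   // C = alpha * A^T + beta * B
              cols, rows,                 // dimensions of C (= A^T)
              &alpha, d_A, rows,          // A is rows x cols, lda = rows
              &beta,  d_A, cols,          // placeholder for B, ldb = cols
              d_AT, cols);                // C, ldc = cols
}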
CULA has auxiliary routines to compute the transpose (culaDevice?geTranspose). For a square matrix you could also use in-place transposition (culaDevice?geTransposeInplace).
Note: CULA has a free license available, if you meet certain conditions.
I'm having trouble parallelizing an operation on an array of numbers with CUDA.
For example, if we have an array M containing the numbers (1, 2, 3, 4, 5), and I remove the number 2 from the array and shift everything to the left, the resulting array would be (1, 3, 4, 5, 5), where M[1] = M[2], M[2] = M[3], M[3] = M[4].
My question is: how can we do this in parallel in CUDA? When this is parallelized, there can be a race condition where the number 2 (M[1]) might not act first; if M[2] were the first one to shift, the resulting array would become (1, 4, 4, 5, 5). Is there any method to handle this? I'm fairly new to CUDA so I'm not sure what to do...
My current code is as follows:
__global__ void gpu_shiftSeam(int *MCEnergyMat, int *seam, int width, int height, int currRow)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int j = blockIdx.y * blockDim.y + threadIdx.y;
  int index = i + width * j;

  if (i < width && j < height)
  {
    // shift values of -1 to the side of the image
    if (MCEnergyMat[i + width * j] == -1)
    {
      if (i + 1 != width)
        MCEnergyMat[index] = MCEnergyMat[index + 1];
    }
    if (seam[j] < i)
    {
      if (i + 1 != width)
        MCEnergyMat[index] = MCEnergyMat[index + 1];
    }
  }
}
Here seam[j] contains the index I would like to remove in row j, and MCEnergyMat is just a 1D array converted from a 2D array... However, my code does not work, and I believe a race condition is the problem.
Thanks!
As talonmies notes in his comment, this sort of thing is called "stream compaction". Here's how you would do it with Thrust:
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/remove.h>
#include <thrust/copy.h>
#include <iterator>
#include <iostream>
int main()
{
  int data[5] = {1, 2, 3, 4, 5};
  thrust::device_vector<int> d_vec(data, data + 5);

  // new_end points to the end of the sequence after 2 has been thrown out
  thrust::device_vector<int>::iterator new_end =
    thrust::remove(d_vec.begin(), d_vec.end(), 2);

  // erase everything after the new end
  d_vec.erase(new_end, d_vec.end());

  // prove that it worked
  thrust::host_vector<int> h_vec = d_vec;
  std::cout << "result: ";
  thrust::copy(h_vec.begin(), h_vec.end(), std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;
  return 0;
}
Here's the result:
$ nvcc test.cu -run
result: 1 3 4 5
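If, as in your kernel, you want to remove the element at a given index (the seam position) rather than by value, one option is thrust::remove_if with a counting_iterator stencil. A minimal sketch (my own helper, operating on a single row/vector):
#include <thrust/device_vector.h>
#include <thrust/remove.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/functional.h>

// compacts d_vec around the element at position index_to_remove
void remove_at(thrust::device_vector<int>& d_vec, int index_to_remove)
{
  using namespace thrust::placeholders;
  thrust::device_vector<int>::iterator new_end =
    thrust::remove_if(d_vec.begin(), d_vec.end(),
                      thrust::make_counting_iterator(0),   // stencil = element index
                      _1 == index_to_remove);
  d_vec.erase(new_end, d_vec.end());
}
For the full image you could run this once per row, or build a single stencil marking position seam[j] + width * j in every row and do one remove_if over the whole matrix.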
I am trying to code the following problem in CUDA/Thrust. I am given a list of keys and three values associated with each key. I have managed to sort them in lexicographic order. The input now needs to be reduced: if two entries have the same key and each value of the first is less than or equal to the corresponding value of the second, the second entry is removed. In the example below, V1(a) <= V1(c), V2(a) <= V2(c) and V3(a) <= V3(c), which implies Input a < Input c, and hence Input c is removed from the output.
Example Input:
Key V1 V2 V3
a. 1 2 5 3
b. 1 2 6 2
c. 1 2 7 4
d. 1 3 6 5
e. 2 8 8 8
f. 3 1 2 4
Example Output:
Key V1 V2 V3
a. 1 2 5 3
b. 1 2 6 2
e. 2 8 8 8
f. 3 1 2 4
Input a < Input c ==> c removed
Input a < Input d ==> d removed
I've been able to solve the above problem using for-loops and if-statements. I am currently trying to solve it using GPU-based CUDA/Thrust. Can this be done on the GPU (preferably with Thrust), or does an individual kernel have to be written in CUDA?
I have not been able to formulate this problem using unique as discussed in Thrust: Removing duplicates in key-value arrays.
Edited to include an STL/C++ program that generates the above scenario; the section "Reducing myMap" is my implementation using for-loops and if-statements.
#include <iostream>
#include <tr1/array>
#include <vector>
#include <algorithm>
struct mapItem {
  mapItem(int k, int v1, int v2, int v3) {
    key = k;
    std::tr1::array<int, 3> v = {v1, v2, v3};
    values = v;
  }
  int key;
  std::tr1::array<int, 3> values;
};

struct sortLexiObj {
  bool operator()(const mapItem& lhs, const mapItem& rhs) {
    return lhs.values < rhs.values;
  }
};

struct sortKey {
  bool operator()(const mapItem& lhs, const mapItem& rhs) {
    return lhs.key < rhs.key;
  }
};
int main(int argc, char** argv) {
  std::vector<mapItem> myMap;

  // Set up initial matrix:
  myMap.push_back(mapItem(3, 1, 2, 4));
  myMap.push_back(mapItem(1, 2, 6, 2));
  myMap.push_back(mapItem(1, 2, 5, 3));
  myMap.push_back(mapItem(1, 3, 6, 5));
  myMap.push_back(mapItem(2, 8, 8, 8));
  myMap.push_back(mapItem(1, 2, 7, 4));

  std::sort(myMap.begin(), myMap.end(), sortLexiObj());
  std::stable_sort(myMap.begin(), myMap.end(), sortKey());

  std::cout << "\r\nOriginal sorted Map" << std::endl;
  for (std::vector<mapItem>::iterator mt = myMap.begin(); mt != myMap.end(); ++mt) {
    std::cout << mt->key << "\t";
    for (std::tr1::array<int, 3>::iterator it = (mt->values).begin(); it != (mt->values).end(); ++it) {
      std::cout << *it << " ";
    }
    std::cout << std::endl;
  }

  /////////////////////////
  // Reducing myMap
  for (std::vector<mapItem>::iterator it = myMap.begin(); it != myMap.end(); ++it) {
    std::vector<mapItem>::iterator jt = it; ++jt;
    for (; jt != myMap.end();) {
      if (it->key == jt->key) {
        if (it->values.at(0) <= jt->values.at(0) &&
            it->values.at(1) <= jt->values.at(1) &&
            it->values.at(2) <= jt->values.at(2)) {
          jt = myMap.erase(jt);
        }
        else ++jt;
      }
      else break;
    }
  }

  std::cout << "\r\nReduced Map" << std::endl;
  for (std::vector<mapItem>::iterator mt = myMap.begin(); mt != myMap.end(); ++mt) {
    std::cout << mt->key << "\t";
    for (std::tr1::array<int, 3>::iterator it = (mt->values).begin(); it != (mt->values).end(); ++it) {
      std::cout << *it << " ";
    }
    std::cout << std::endl;
  }
  return 0;
}
I think you can use thrust::unique with a predicate, as shown in Thrust: Removing duplicates in key-value arrays.
We can do this because of the following characteristic of unique:
For each group of consecutive elements in the range [first, last) with the same value, unique removes all but the first element of the group.
So you should define a predicate that tests for pseudo-equality: it returns true for tuples that have the same key and whose values in the first tuple are all less than or equal to those in the second:
typedef thrust::tuple<int, int, int, int> tuple_t;

// a functor which defines your *uniqueness* condition
struct tupleEqual
{
  __host__ __device__
  bool operator()(tuple_t x, tuple_t y)
  {
    return ( (thrust::get<0>(x) == thrust::get<0>(y))    // same key
          && (thrust::get<1>(x) <= thrust::get<1>(y))    // every value of x is no larger
          && (thrust::get<2>(x) <= thrust::get<2>(y))
          && (thrust::get<3>(x) <= thrust::get<3>(y)) );
  }
};
You have to apply it to a sorted collection; this way, only the first tuple of each group (the smallest) is kept.
A later tuple with the same key but at least one smaller value in V1, V2 or V3 makes the predicate yield false, so it won't be removed.
typedef thrust::device_vector<int> IntVector;
typedef IntVector::iterator IntIterator;
typedef thrust::tuple<IntIterator, IntIterator, IntIterator, IntIterator> IntIteratorTuple;
typedef thrust::zip_iterator<IntIteratorTuple> ZipIterator;

IntVector keyVector;
IntVector valVector1, valVector2, valVector3;

tupleEqual predicate;
ZipIterator newEnd = thrust::unique(
    thrust::make_zip_iterator(
        thrust::make_tuple(keyVector.begin(),
                           valVector1.begin(),
                           valVector2.begin(),
                           valVector3.begin())),
    thrust::make_zip_iterator(
        thrust::make_tuple(keyVector.end(),
                           valVector1.end(),
                           valVector2.end(),
                           valVector3.end())),
    predicate);

IntIteratorTuple endTuple = newEnd.get_iterator_tuple();

keyVector.erase (thrust::get<0>(endTuple), keyVector.end());
valVector1.erase(thrust::get<1>(endTuple), valVector1.end());
valVector2.erase(thrust::get<2>(endTuple), valVector2.end());
valVector3.erase(thrust::get<3>(endTuple), valVector3.end());
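A usage sketch for completeness (my own addition, not part of the original answer): wrap the calls above in a main() and fill the vectors with the example data, already sorted by key and then lexicographically by values. It assumes the typedefs and the tupleEqual functor above are in scope, plus <thrust/unique.h>, <thrust/iterator/zip_iterator.h> and <thrust/tuple.h>.
int main()
{
  int k [] = {1, 1, 1, 1, 2, 3};
  int v1[] = {2, 2, 2, 3, 8, 1};
  int v2[] = {5, 6, 7, 6, 8, 2};
  int v3[] = {3, 2, 4, 5, 8, 4};

  IntVector keyVector (k,  k  + 6);
  IntVector valVector1(v1, v1 + 6);
  IntVector valVector2(v2, v2 + 6);
  IntVector valVector3(v3, v3 + 6);

  tupleEqual predicate;
  ZipIterator newEnd = thrust::unique(
      thrust::make_zip_iterator(thrust::make_tuple(keyVector.begin(),  valVector1.begin(),
                                                   valVector2.begin(), valVector3.begin())),
      thrust::make_zip_iterator(thrust::make_tuple(keyVector.end(),  valVector1.end(),
                                                   valVector2.end(), valVector3.end())),
      predicate);

  IntIteratorTuple endTuple = newEnd.get_iterator_tuple();
  keyVector.erase (thrust::get<0>(endTuple), keyVector.end());
  valVector1.erase(thrust::get<1>(endTuple), valVector1.end());
  valVector2.erase(thrust::get<2>(endTuple), valVector2.end());
  valVector3.erase(thrust::get<3>(endTuple), valVector3.end());

  // expected result (matches the desired output in the question):
  // keys 1 1 2 3
  // V1   2 2 8 1
  // V2   5 6 8 2
  // V3   3 2 8 4
  return 0;
}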