Can thrust::gather be used "in-place"? - cuda

Consider the following code:
#include <time.h> // --- time
#include <stdlib.h> // --- srand, rand
#include<fstream>
#include <thrust\host_vector.h>
#include <thrust\device_vector.h>
#include <thrust\sort.h>
#include <thrust\iterator\zip_iterator.h>
#include "TimingGPU.cuh"
/********/
/* MAIN */
/********/
int main() {
const int N = 16384;
std::ifstream h_indices_File, h_x_File;
h_indices_File.open("h_indices.txt");
h_x_File.open("h_x.txt");
std::ofstream h_x_result_File;
h_x_result_File.open("h_x_result.txt");
thrust::host_vector<int> h_indices(N);
thrust::host_vector<double> h_x(N);
thrust::host_vector<double> h_sorted(N);
for (int k = 0; k < N; k++) {
h_indices_File >> h_indices[k];
h_x_File >> h_x[k];
}
thrust::device_vector<int> d_indices(h_indices);
thrust::device_vector<double> d_x(h_x);
thrust::gather(d_indices.begin(), d_indices.end(), d_x.begin(), d_x.begin());
h_x = d_x;
for (int k = 0; k < N; k++) h_x_result_File << h_x[k] << "\n";
//thrust::device_vector<double> d_x_sorted(N);
//thrust::gather(d_indices.begin(), d_indices.end(), d_x.begin(), d_x_sorted.begin());
//h_x = d_x_sorted;
//for (int k = 0; k < N; k++) h_x_result_File << h_x[k] << "\n";
}
The code loads from file an array of indices h_indices.txt and a double array h_x.txt. Then, it transfers those arrays to the GPU to d_indices and d_x and uses thrust::gather to achieve Matlab's equivalent
d_x(d_indices)
The two txt files can be downloaded from h_indices.txt and h_x.txt. The code creates an output result file h_x_result.txt.
If I use the "in-place" version of thrust::gather (the last uncommented three lines of the code), then I obtain that the result is different from d_x(d_indices), while if I use the not "in-place" version (the last commented three lines of the code), then the result is correct.
In Matlab, I'm using
load h_indices.txt; load h_x.txt; load h_x_result.txt
plot(h_x(h_indices + 1)); hold on; plot(h_x_result, 'r'); hold off
The "in-place" case returns the following comparison
On the other side, the "in-place" case returns
I'm using Windows 10, CUDA 8.0, Visual Studio 2013, compiling in Release Mode and running on an NVIDIA GTX 960 cc. 5.2.

Thrust gather can't be used in place.
But I would go as far as to suggest that no "naïve" gather operation can be safely performed in-place, and that the Matlab snippet you presented as in-place (presumably d_x = d_x(d_indices)) isn't an in-place operation at all.

Related

Non-intuitive difference in time for copying Thrust device_vector from host_vector vs device_vector

I have one cuda thrust host vector and one device vector initialized. I have a thrust device matrix. The time taken for the the copying of host vector and device vector to the device matrix is different (as expected).
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <chrono>
using namespace std;
using namespace std::chrono;
typedef thrust::host_vector<double> th_vec;
typedef thrust::device_vector<double> td_vec;
int main(void) {
th_vec h_vec(3,1.0);
td_vec d_vec(3,1.0);
auto start1 = high_resolution_clock::now();
td_vec d1_mat[3];
for(size_t i = 0; i < 3; i++) {
d1_mat[i] = h_vec;
}
auto stop1 = high_resolution_clock::now();
auto duration1 = duration_cast<microseconds>(stop1 - start1);
cout << "Host-Device mat assignment time " << duration1.count() << endl;
auto start2 = high_resolution_clock::now();
td_vec d2_mat[3];
for(size_t i = 0; i < 3; i++) {
d2_mat[i] = d_vec;
}
auto stop2 = high_resolution_clock::now();
auto duration2 = duration_cast<microseconds>(stop2 - start2);
cout << "Device-Device mat assignment time " << duration2.count() << endl;
return 0;
}
For 10 runs of the above code, the average result is as follow
Host-Device mat assignment time 32
Device-Device mat assignment time 85
I am running on Ubuntu 22.04 with CUDA Version 11.7, Driver Version: 515.43.04 and GPU GTX 1650
Why are the timings non-intuitive. Shouldn't the time for data transfer between device to device be less than host to device?

How to prevent thrust::copy_if illegal memory access

In a CUDA C++ code, I am using thrust::copy_if to copy those integer values that are not -1 from the array x to y:
thrust::copy_if(x_ptr, x_ptr + N, y_ptr, is_not_minus_one());
I put the code in a try/catch and I update the array x regularly, and again push numbers that are not -1 to the end of the y until it accessed to the out of the array and returns
CUDA Runtime Error: an illegal memory access was encountered
As I don't know how many values are not -1 in each iteration, I should keep generating and copying numbers in the main array until it becomes full. However, I wanna it to stop once it reaches the end of the array. How can I manage it?
A naive idea would be using another array and then copying if the number of new values does not exceed the size. but it might be inefficient. Any better idea?
Here is one possible method:
use remove_if() instead, (with the opposite condition - remove if -1) on x. Then use (x start and) the returned iterator to copy to your final array (y). Every time you do a copy to y, you will know how much space is left in y, and you can adjust the copy quantity if needed, before doing the copy.
Example (removing 1 values):
$ cat t2088.cu
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/complex.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
#include <thrust/functional.h>
#include <iostream>
using mt = int;
using namespace thrust::placeholders;
int main(){
thrust::device_vector<mt> a(5);
thrust::device_vector<mt> c(25);
bool done = false;
int empty_spaces = c.size();
int copy_start_index = 0;
while (!done){
thrust::sequence(a.begin(), a.end());
int num_to_copy = (thrust::remove_if(a.begin(), a.end(), _1 == 1) - a.begin());
int actual_num_to_copy = std::min(num_to_copy, empty_spaces);
if (actual_num_to_copy != num_to_copy) done = true;
thrust::copy_n(a.begin(), actual_num_to_copy, c.begin()+copy_start_index);
copy_start_index += actual_num_to_copy;
empty_spaces -= actual_num_to_copy;
}
std::cout << "output array is full!" << std::endl;
thrust::host_vector<mt> h_c = c;
thrust::copy(h_c.begin(), h_c.end(), std::ostream_iterator<mt>(std::cout, ","));
std::cout << std::endl;
}
$ nvcc -o t2088 t2088.cu
$ compute-sanitizer ./t2088
========= COMPUTE-SANITIZER
output array is full!
0,2,3,4,0,2,3,4,0,2,3,4,0,2,3,4,0,2,3,4,0,2,3,4,0,
========= ERROR SUMMARY: 0 errors
$

-ta=tesla:managed:cuda8 but cuMemAllocManaged returned error 2: Out of memory

I'm new to OpenACC. I like it very much so far as I'm familiar with OpenMP.
I have 2 1080Ti cards each with 9GB and I've 128GB of RAM. I'm trying a very basic test to allocate an array, initialize it, then sum it up in parallel. This works for 8 GB but when I increase to 10 GB I get out-of-memory error. My understanding was that with unified memory of Pascal (which these card are) and CUDA 8, I could allocate an array larger than the GPU's memory and the hardware will page in and page out on demand.
Here's my full C code test :
$ cat firstAcc.c
#include <stdio.h>
#include <openacc.h>
#include <stdlib.h>
#define GB 10
int main()
{
float *a;
size_t n = GB*1024*1024*1024/sizeof(float);
size_t s = n * sizeof(float);
a = (float *)malloc(s);
if (!a) { printf("Failed to malloc.\n"); return 1; }
printf("Initializing ... ");
for (int i = 0; i < n; ++i) {
a[i] = 0.1f;
}
printf("done\n");
float sum=0.0;
#pragma acc loop reduction (+:sum)
for (int i = 0; i < n; ++i) {
sum+=a[i];
}
printf("Sum is %f\n", sum);
free(a);
return 0;
}
As per the "Enable Unified Memory" section of this article I compile it with :
$ pgcc -acc -fast -ta=tesla:managed:cuda8 -Minfo firstAcc.c
main:
20, Loop not fused: function call before adjacent loop
Generated vector simd code for the loop
28, Loop not fused: function call before adjacent loop
Generated vector simd code for the loop containing reductions
Generated a prefetch instruction for the loop
I need to understand those messages but for now I don't think they are relevant. Then I run it :
$ ./a.out
malloc: call to cuMemAllocManaged returned error 2: Out of memory
Aborted (core dumped)
This works fine if I change GB to 8. I expected 10GB to work (despite the GPU card having 9GB) thanks to Pascal 1080Ti and CUDA 8.
Have I misunderstand, or what am I doing wrong? Thanks in advance.
$ pgcc -V
pgcc 17.4-0 64-bit target on x86-64 Linux -tp haswell
PGI Compilers and Tools
Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
$ cat /usr/local/cuda-8.0/version.txt
CUDA Version 8.0.61
Besides what Bob mentioned, I made a few more fixes.
First, you're not actually generating an OpenACC compute region since you only have a "#pragma acc loop" directive. This should be "#pragma acc parallel loop". You can see this in the compiler feedback messages where it's only showing host code optimizations.
Second, the "i" index should be declared as a "long". Otherwise, you'll overflow the index.
Finally, you need to add "cc60" to your target accelerator options to tell the compiler to target a Pascal based GPU.
% cat mi.c
#include <stdio.h>
#include <openacc.h>
#include <stdlib.h>
#define GB 20ULL
int main()
{
float *a;
size_t n = GB*1024ULL*1024ULL*1024ULL/sizeof(float);
size_t s = n * sizeof(float);
printf("n = %lu, s = %lu\n", n, s);
a = (float *)malloc(s);
if (!a) { printf("Failed to malloc.\n"); return 1; }
printf("Initializing ... ");
for (int i = 0; i < n; ++i) {
a[i] = 0.1f;
}
printf("done\n");
double sum=0.0;
#pragma acc parallel loop reduction (+:sum)
for (long i = 0; i < n; ++i) {
sum+=a[i];
}
printf("Sum is %f\n", sum);
free(a);
return 0;
}
% pgcc -fast -acc -ta=tesla:managed,cuda8.0,cc60 -Minfo=accel mi.c
main:
21, Accelerator kernel generated
Generating Tesla code
21, Generating reduction(+:sum)
22, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
21, Generating implicit copyin(a[:5368709120])
% ./a.out
n = 5368709120, s = 21474836480
Initializing ... done
Sum is 536870920.000000
I believe a problem is here:
size_t n = GB*1024*1024*1024/sizeof(float);
when I compile that line of code with g++, I get a warning about integer overflow. For some reason the PGI compiler is not warning, but the same badness is occurring under the hood. After the declarations of s, and n, if I add a printout like this:
size_t n = GB*1024*1024*1024/sizeof(float);
size_t s = n * sizeof(float);
printf("n = %lu, s = %lu\n", n, s); // add this line
and compile with PGI 17.04, and run (on a P100, with 16GB) I get output like this:
$ pgcc -acc -fast -ta=tesla:managed:cuda8 -Minfo m1.c
main:
16, Loop not fused: function call before adjacent loop
Generated vector simd code for the loop
22, Loop not fused: function call before adjacent loop
Generated vector simd code for the loop containing reductions
Generated a prefetch instruction for the loop
$ ./a.out
n = 4611686017890516992, s = 18446744071562067968
malloc: call to cuMemAllocManaged returned error 2: Out of memory
Aborted
$
so it's evident that n and s are not what you intended.
We can fix this by marking all of those constants with ULL, and then things seem to work correctly for me:
$ cat m1.c
#include <stdio.h>
#include <openacc.h>
#include <stdlib.h>
#define GB 20ULL
int main()
{
float *a;
size_t n = GB*1024ULL*1024ULL*1024ULL/sizeof(float);
size_t s = n * sizeof(float);
printf("n = %lu, s = %lu\n", n, s);
a = (float *)malloc(s);
if (!a) { printf("Failed to malloc.\n"); return 1; }
printf("Initializing ... ");
for (int i = 0; i < n; ++i) {
a[i] = 0.1f;
}
printf("done\n");
double sum=0.0;
#pragma acc loop reduction (+:sum)
for (int i = 0; i < n; ++i) {
sum+=a[i];
}
printf("Sum is %f\n", sum);
free(a);
return 0;
}
$ pgcc -acc -fast -ta=tesla:managed:cuda8 -Minfo m1.c
main:
16, Loop not fused: function call before adjacent loop
Generated vector simd code for the loop
22, Loop not fused: function call before adjacent loop
Generated vector simd code for the loop containing reductions
Generated a prefetch instruction for the loop
$ ./a.out
n = 5368709120, s = 21474836480
Initializing ... done
Sum is 536870920.000000
$
Note that I've made another change above as well. I changed the sum accumulation variable from float to double. This is necessary to preserve somewhat "sensible" results when doing a very large reduction across very small quantities.
And, as #MatColgrove pointed out in his answer, I missed a few other things as well.

Thrust Histogram with weights

I want to compute the density of particles over a grid. Therefore, I have a vector that contains the cellID of each particle, as well as a vector with the given mass which does not have to be uniform.
I have taken the non-sparse example from Thrust to compute a histogram of my particles.
However, to compute the density, I need to include the weight of each particle, instead of simply summing the number of particles per cell, i.e. I'm interested in rho[i] = sum W[j] for all j that satify cellID[j]=i (probably unnecessary to explain, since everybody knows that).
Implementing this with Thrust has not worked for me. I also tried to use a CUDA kernel and thrust_raw_pointer_cast, but I did not succeed with that either.
EDIT:
Here is a minimal working example which should compile via nvcc file.cu under CUDA 6.5 and with Thrust installed.
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <thrust/binary_search.h>
#include <thrust/adjacent_difference.h>
// Predicate
struct is_out_of_bounds {
__host__ __device__ bool operator()(int i) {
return (i < 0); // out of bounds elements have negative id;
}
};
// cf.: https://code.google.com/p/thrust/source/browse/examples/histogram.cu, but modified
template<typename T1, typename T2>
void computeHistogram(const T1& input, T2& histogram) {
typedef typename T1::value_type ValueType; // input value type
typedef typename T2::value_type IndexType; // histogram index type
// copy input data (could be skipped if input is allowed to be modified)
thrust::device_vector<ValueType> data(input);
// sort data to bring equal elements together
thrust::sort(data.begin(), data.end());
// there are elements that we don't want to count, those have ID -1;
data.erase(thrust::remove_if(data.begin(), data.end(), is_out_of_bounds()),data.end());
// number of histogram bins is equal to the maximum value plus one
IndexType num_bins = histogram.size();
// find the end of each bin of values
thrust::counting_iterator<IndexType> search_begin(0);
thrust::upper_bound(data.begin(), data.end(), search_begin,
search_begin + num_bins, histogram.begin());
// compute the histogram by taking differences of the cumulative histogram
thrust::adjacent_difference(histogram.begin(), histogram.end(),
histogram.begin());
}
int main(void) {
thrust::device_vector<int> cellID(5);
cellID[0] = -1; cellID[1] = 1; cellID[2] = 0; cellID[3] = 2; cellID[4]=1;
thrust::device_vector<float> mass(5);
mass[0] = .5; mass[1] = 1.0; mass[2] = 2.0; mass[3] = 3.0; mass[4] = 4.0;
thrust::device_vector<int> histogram(3);
thrust::device_vector<float> density(3);
computeHistogram(cellID,histogram);
std::cout<<"\nHistogram:\n";
thrust::copy(histogram.begin(), histogram.end(),
std::ostream_iterator<int>(std::cout, " "));
std::cout << std::endl;
// this will print: " Histogram 1 2 1 "
// meaning one element with ID 0, two elements with ID 1
// and one element with ID 2
/* here is what I am unable to implement:
*
*
* computeDensity(cellID,mass,density);
*
* print(density): 2.0 5.0 3.0
*
*
*/
}
I hope the comment at the end of the file also makes clear what I mean by computing the density. If there is any question open, please feel free to ask. Thanks!
There still seems to be a problem in understanding my problem, which I am sorry for! Therefore I added some pictures.
Consider the first picture. For my understanding, a histogram would simply be the count of particles per grid cell. In this case a histogram would be an array of size 36, since there are 36 cells. Also, there would be a lot of zero entries in the vector, since for example in the upper left corner almost no cell contains a particle. This is what I already have in my code.
Now consider the slightly more complicated case. Here each particle has a different mass, indicated by the different size in the plot. To compute the density I can't just add the number of particles per cell, but I have to add the mass of all particles per cell. This is what I'm unable to implement.
What you described in your example does not look like a histogram but rather like a segmented reduction.
The following example code uses thrust::reduce_by_key to sum up the masses of particles within the same cell:
density.cu
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/copy.h>
#include <thrust/scatter.h>
#include <iostream>
#define PRINTER(name) print(#name, (name))
template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
std::cout << name << ":\t\t";
thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, "\t"));
std::cout << std::endl << std::endl;
}
int main()
{
const int particle_count = 5;
const int cell_count = 10;
thrust::device_vector<int> cellID(particle_count);
cellID[0] = -1; cellID[1] = 1; cellID[2] = 0; cellID[3] = 2; cellID[4]=1;
thrust::device_vector<float> mass(particle_count);
mass[0] = .5; mass[1] = 1.0; mass[2] = 2.0; mass[3] = 3.0; mass[4] = 4.0;
std::cout << "input data" << std::endl;
PRINTER(cellID);
PRINTER(mass);
thrust::sort_by_key(cellID. begin(), cellID.end(), mass.begin());
std::cout << "after sort_by_key" << std::endl;
PRINTER(cellID);
PRINTER(mass);
thrust::device_vector<int> reduced_cellID(particle_count);
thrust::device_vector<float> density(particle_count);
int new_size = thrust::reduce_by_key(cellID. begin(), cellID.end(),
mass.begin(),
reduced_cellID.begin(),
density.begin()
).second - density.begin();
if (reduced_cellID[0] == -1)
{
density.erase(density.begin());
reduced_cellID.erase(reduced_cellID.begin());
new_size--;
}
density.resize(new_size);
reduced_cellID.resize(new_size);
std::cout << "after reduce_by_key" << std::endl;
PRINTER(density);
PRINTER(reduced_cellID);
thrust::device_vector<float> final_density(cell_count);
thrust::scatter(density.begin(), density.end(), reduced_cellID.begin(), final_density.begin());
PRINTER(final_density);
}
compile using
nvcc -std=c++11 density.cu -o density
output
input data
cellID: -1 1 0 2 1
mass: 0.5 1 2 3 4
after sort_by_key
cellID: -1 0 1 1 2
mass: 0.5 2 1 4 3
after reduce_by_key
density: 2 5 3
reduced_cellID: 0 1 2
final_density: 2 5 3 0 0 0 0 0 0 0

Multi-gpu CUDA Thrust

I have a Cuda C++ code that uses Thrust currently working properly on a single GPU. I'd now like to modify it for multi-gpu. I have a host function that includes a number of Thrust calls that sort, copy, calculate differences etc on device arrays. I want to use each GPU to run this sequence of Thrust calls on it's own (independent) set of arrays at the same time. I've read that Thrust functions that return values are synchronous but can I use OpenMP to have each host thread call up a function (with Thrust calls) that runs on a separate GPU?
For example (coded in browser):
#pragma omp parallel for
for (int dev=0; dev<Ndev; dev++){
cudaSetDevice(dev);
runthrustfunctions(dev);
}
void runthrustfunctions(int dev){
/*lots of Thrust functions running on device arrays stored on corresponding GPU*/
//for example this is just a few of the lines"
thrust::device_ptr<double> pos_ptr = thrust::device_pointer_cast(particle[dev].pos);
thrust::device_ptr<int> list_ptr = thrust::device_pointer_cast(particle[dev].list);
thrust::sequence(list_ptr,list_ptr+length);
thrust::sort_by_key(pos_ptr, pos_ptr+length,list_ptr);
thrust::device_vector<double> temp(length);
thrust::gather(list_ptr,list_ptr+length,pos_ptr,temp.begin());
thrust::copy(temp.begin(), temp.end(), pos_ptr);
}`
I think I also need the structure "particle[0]" to be stored on GPU 0, particle[1] on GPU 1 etc and I my guess is this not possible. An option might be to use "switch" with separate code for each GPU case.
I'd like to know if this is a correct approach or if there is a better way?
Thanks
Yes, you can combine thrust and OpenMP.
Here's a complete worked example with results:
$ cat t340.cu
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <time.h>
#include <sys/time.h>
#define DSIZE 200000000
using namespace std;
int main(int argc, char *argv[])
{
timeval t1, t2;
int num_gpus = 0; // number of CUDA GPUs
printf("%s Starting...\n\n", argv[0]);
// determine the number of CUDA capable GPUs
cudaGetDeviceCount(&num_gpus);
if (num_gpus < 1)
{
printf("no CUDA capable devices were detected\n");
return 1;
}
// display CPU and GPU configuration
printf("number of host CPUs:\t%d\n", omp_get_num_procs());
printf("number of CUDA devices:\t%d\n", num_gpus);
for (int i = 0; i < num_gpus; i++)
{
cudaDeviceProp dprop;
cudaGetDeviceProperties(&dprop, i);
printf(" %d: %s\n", i, dprop.name);
}
printf("initialize data\n");
// initialize data
typedef thrust::device_vector<int> dvec;
typedef dvec *p_dvec;
std::vector<p_dvec> dvecs;
for(unsigned int i = 0; i < num_gpus; i++) {
cudaSetDevice(i);
p_dvec temp = new dvec(DSIZE);
dvecs.push_back(temp);
}
thrust::host_vector<int> data(DSIZE);
thrust::generate(data.begin(), data.end(), rand);
// copy data
for (unsigned int i = 0; i < num_gpus; i++) {
cudaSetDevice(i);
thrust::copy(data.begin(), data.end(), (*(dvecs[i])).begin());
}
printf("start sort\n");
gettimeofday(&t1,NULL);
// run as many CPU threads as there are CUDA devices
omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices
#pragma omp parallel
{
unsigned int cpu_thread_id = omp_get_thread_num();
cudaSetDevice(cpu_thread_id);
thrust::sort((*(dvecs[cpu_thread_id])).begin(), (*(dvecs[cpu_thread_id])).end());
cudaDeviceSynchronize();
}
gettimeofday(&t2,NULL);
printf("finished\n");
unsigned long et = ((t2.tv_sec * 1000000)+t2.tv_usec) - ((t1.tv_sec * 1000000) + t1.tv_usec);
if (cudaSuccess != cudaGetLastError())
printf("%s\n", cudaGetErrorString(cudaGetLastError()));
printf("sort time = %fs\n", (float)et/(float)(1000000));
// check results
thrust::host_vector<int> result(DSIZE);
thrust::sort(data.begin(), data.end());
for (int i = 0; i < num_gpus; i++)
{
cudaSetDevice(i);
thrust::copy((*(dvecs[i])).begin(), (*(dvecs[i])).end(), result.begin());
for (int j = 0; j < DSIZE; j++)
if (data[j] != result[j]) { printf("mismatch on device %d at index %d, host: %d, device: %d\n", i, j, data[j], result[j]); return 1;}
}
printf("Success\n");
return 0;
}
$ nvcc -Xcompiler -fopenmp -O3 -arch=sm_20 -o t340 t340.cu -lgomp
$ CUDA_VISIBLE_DEVICES="0" ./t340
./t340 Starting...
number of host CPUs: 12
number of CUDA devices: 1
0: Tesla M2050
initialize data
start sort
finished
sort time = 0.398922s
Success
$ ./t340
./t340 Starting...
number of host CPUs: 12
number of CUDA devices: 4
0: Tesla M2050
1: Tesla M2070
2: Tesla M2050
3: Tesla M2070
initialize data
start sort
finished
sort time = 0.460058s
Success
$
We can see that when I restrict the program to using a single device, the sort operation takes about 0.4 seconds. Then when I allow it to use all 4 devices (repeating the same sort on all 4 devices) the overall operation only take 0.46 seconds, even though we're doing 4 times as much work.
For this particular case I happened to be using CUDA 5.0 with thrust v1.7, and gcc 4.4.6 (RHEL 6.2)