Non-intuitive difference in time for copying a Thrust device_vector from a host_vector vs. a device_vector

I have one Thrust host vector and one device vector initialized, and a Thrust device matrix (an array of device vectors). The time taken to copy the host vector and the device vector into the device matrix differs (as expected).
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <chrono>
using namespace std;
using namespace std::chrono;
typedef thrust::host_vector<double> th_vec;
typedef thrust::device_vector<double> td_vec;
int main(void) {
    th_vec h_vec(3, 1.0);
    td_vec d_vec(3, 1.0);

    auto start1 = high_resolution_clock::now();
    td_vec d1_mat[3];
    for (size_t i = 0; i < 3; i++) {
        d1_mat[i] = h_vec;
    }
    auto stop1 = high_resolution_clock::now();
    auto duration1 = duration_cast<microseconds>(stop1 - start1);
    cout << "Host-Device mat assignment time " << duration1.count() << endl;

    auto start2 = high_resolution_clock::now();
    td_vec d2_mat[3];
    for (size_t i = 0; i < 3; i++) {
        d2_mat[i] = d_vec;
    }
    auto stop2 = high_resolution_clock::now();
    auto duration2 = duration_cast<microseconds>(stop2 - start2);
    cout << "Device-Device mat assignment time " << duration2.count() << endl;
    return 0;
}
For 10 runs of the above code, the average results are as follows:
Host-Device mat assignment time 32
Device-Device mat assignment time 85
I am running Ubuntu 22.04 with CUDA 11.7, driver version 515.43.04, and a GTX 1650 GPU.
Why are the timings non-intuitive? Shouldn't a device-to-device transfer take less time than a host-to-device transfer?
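One thing worth keeping in mind: for three-element vectors, both loops mostly measure fixed per-call overhead (allocating each d1_mat[i]/d2_mat[i], API launch latency, and whatever code path Thrust picks for each copy) rather than transfer bandwidth, so the ordering can easily come out backwards. A sketch of a fairer comparison, with sizes and names of my own choosing, scales the problem up and synchronizes the device before reading the clock:

// Hypothetical re-timing sketch: large vectors so bandwidth dominates,
// and explicit synchronization so each copy has finished when we stop timing.
const size_t n = 1 << 24;          // ~16M doubles instead of 3
th_vec h_big(n, 1.0);
td_vec d_big(n, 1.0);
td_vec d_dst(n);                   // pre-allocated destination

cudaDeviceSynchronize();           // settle pending work before timing
auto t0 = high_resolution_clock::now();
d_dst = h_big;                     // host -> device assignment
cudaDeviceSynchronize();
auto t1 = high_resolution_clock::now();
d_dst = d_big;                     // device -> device assignment
cudaDeviceSynchronize();
auto t2 = high_resolution_clock::now();
cout << "H2D " << duration_cast<microseconds>(t1 - t0).count() << " us, "
     << "D2D " << duration_cast<microseconds>(t2 - t1).count() << " us" << endl;

At this size the device-to-device assignment should come out ahead, as intuition suggests.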

Conditional copying in CUDA, where data vector is longer than stencil

I would like to conditionally copy data from a vector, based on a stencil vector that is N times shorter. Each element in the stencil is responsible for N elements in the data vector.
Suppose that the vectors look as follows (N=3)
data = {1,2,3,4,5,6,7,8,9}
stencil = {1,0,1}
What I would like to get in result:
result = {1,2,3,7,8,9}
Is there a way to achieve this using functions from Thrust library?
I know, that there is:
thrust::copy_if (InputIterator1 first, InputIterator1 last, InputIterator2 stencil, OutputIterator result, Predicate pred)
but this doesn't allow me to copy N values from the data vector based on one element of the stencil.
As is often the case, I imagine there are many possible ways to do this.
The approach that occurs to me (using copy_if) is to wrap the stencil vector in a thrust::permutation_iterator whose index is generated by a thrust::transform_iterator. If we imagine a copying index that runs 0..8 for this example, we can index into the "source" (i.e. stencil) vector with a "map" index computed from a thrust::counting_iterator via integer division by N (using Thrust placeholders). The copying predicate simply tests whether the stencil value == 1.
The thrust quick start guide gives a concise description of how to use these fancy iterators.
Here is a worked example:
$ cat t471.cu
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>
#include <iterator> // std::ostream_iterator
using namespace thrust::placeholders;
int main(){
  int data[] = {1,2,3,4,5,6,7,8,9};
  int stencil[] = {1,0,1};
  int ds = sizeof(data)/sizeof(data[0]);
  int ss = sizeof(stencil)/sizeof(stencil[0]);
  int N = ds/ss; // assumes ds is evenly divisible by ss
  thrust::device_vector<int> d_data(data, data+ds);
  thrust::device_vector<int> d_stencil(stencil, stencil+ss);
  thrust::device_vector<int> d_result(ds);
  // copy element i when stencil[i/N] == 1
  int rs = thrust::copy_if(d_data.begin(), d_data.end(),
                           thrust::make_permutation_iterator(d_stencil.begin(),
                             thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1 / N)),
                           d_result.begin(),
                           _1 == 1) - d_result.begin();
  thrust::copy_n(d_result.begin(), rs, std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl;
  return 0;
}
$ nvcc -o t471 t471.cu
$ ./t471
1,2,3,7,8,9,
$
With the assumptions about stencil organization made here, we could also pre-compute the result size rs with thrust::reduce, and use that to allocate the result vector size:
$ cat t471.cu
#include <thrust/copy.h>
#include <thrust/reduce.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>
#include <iterator> // std::ostream_iterator
using namespace thrust::placeholders;
int main(){
  int data[] = {1,2,3,4,5,6,7,8,9};
  int stencil[] = {1,0,1};
  int ds = sizeof(data)/sizeof(data[0]);
  int ss = sizeof(stencil)/sizeof(stencil[0]);
  int N = ds/ss; // assumes ds is evenly divisible by ss
  thrust::device_vector<int> d_data(data, data+ds);
  thrust::device_vector<int> d_stencil(stencil, stencil+ss);
  // result size is known up front: N data elements per set stencil element
  int rs = thrust::reduce(d_stencil.begin(), d_stencil.end())*N;
  thrust::device_vector<int> d_result(rs);
  thrust::copy_if(d_data.begin(), d_data.end(),
                  thrust::make_permutation_iterator(d_stencil.begin(),
                    thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1 / N)),
                  d_result.begin(),
                  _1 == 1);
  thrust::copy_n(d_result.begin(), rs, std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl;
  return 0;
}
$ nvcc -o t471 t471.cu
$ ./t471
1,2,3,7,8,9,
$

Can thrust::gather be used "in-place"?

Consider the following code:
#include <time.h>    // --- time
#include <stdlib.h>  // --- srand, rand
#include <fstream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/iterator/zip_iterator.h>
#include "TimingGPU.cuh"
/********/
/* MAIN */
/********/
int main() {
    const int N = 16384;

    std::ifstream h_indices_File, h_x_File;
    h_indices_File.open("h_indices.txt");
    h_x_File.open("h_x.txt");
    std::ofstream h_x_result_File;
    h_x_result_File.open("h_x_result.txt");

    thrust::host_vector<int> h_indices(N);
    thrust::host_vector<double> h_x(N);
    thrust::host_vector<double> h_sorted(N);
    for (int k = 0; k < N; k++) {
        h_indices_File >> h_indices[k];
        h_x_File >> h_x[k];
    }

    thrust::device_vector<int> d_indices(h_indices);
    thrust::device_vector<double> d_x(h_x);

    // "in-place" gather: the input and output ranges overlap
    thrust::gather(d_indices.begin(), d_indices.end(), d_x.begin(), d_x.begin());
    h_x = d_x;
    for (int k = 0; k < N; k++) h_x_result_File << h_x[k] << "\n";

    //thrust::device_vector<double> d_x_sorted(N);
    //thrust::gather(d_indices.begin(), d_indices.end(), d_x.begin(), d_x_sorted.begin());
    //h_x = d_x_sorted;
    //for (int k = 0; k < N; k++) h_x_result_File << h_x[k] << "\n";
}
The code loads an array of indices from h_indices.txt and an array of doubles from h_x.txt. It then transfers those arrays to the GPU, to d_indices and d_x, and uses thrust::gather to achieve the equivalent of Matlab's
d_x(d_indices)
The two txt files can be downloaded from h_indices.txt and h_x.txt. The code creates an output result file h_x_result.txt.
If I use the "in-place" version of thrust::gather (the last three uncommented lines of the code), the result differs from d_x(d_indices); if instead I use the non-"in-place" version (the commented-out lines), the result is correct.
In Matlab, I'm using
load h_indices.txt; load h_x.txt; load h_x_result.txt
plot(h_x(h_indices + 1)); hold on; plot(h_x_result, 'r'); hold off
The "in-place" case returns the following comparison
On the other side, the "in-place" case returns
I'm using Windows 10, CUDA 8.0, Visual Studio 2013, compiling in Release mode, and running on an NVIDIA GTX 960 (compute capability 5.2).
Thrust gather can't be used in place.
But I would go as far as to suggest that no "naïve" gather operation can be safely performed in-place, and that the Matlab snippet you presented as in-place (presumably d_x = d_x(d_indices)) isn't an in-place operation at all.
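If the permuted result must end up back in d_x, a safe pattern (a minimal sketch using the names from the question) is to gather into scratch storage and then swap the vectors:

// Gather into a temporary, then swap storage so d_x holds the result.
thrust::device_vector<double> d_tmp(d_x.size());
thrust::gather(d_indices.begin(), d_indices.end(), d_x.begin(), d_tmp.begin());
d_x.swap(d_tmp); // constant-time pointer swap, no extra device-to-device copy

This costs one temporary vector, but it avoids the race between threads reading d_x and threads writing it.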

Thrust Histogram with weights

I want to compute the density of particles over a grid. To that end, I have a vector containing the cellID of each particle, as well as a vector of particle masses, which need not be uniform.
I have taken the non-sparse example from Thrust to compute a histogram of my particles.
However, to compute the density, I need to include the weight of each particle instead of simply counting the particles per cell, i.e. I'm interested in rho[i] = sum of W[j] for all j that satisfy cellID[j] = i (probably unnecessary to explain, since everybody knows that).
Implementing this with Thrust has not worked for me. I also tried to use a CUDA kernel and thrust::raw_pointer_cast, but I did not succeed with that either.
EDIT:
Here is a minimal working example which should compile via nvcc file.cu under CUDA 6.5 and with Thrust installed.
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <thrust/remove.h>                     // thrust::remove_if
#include <thrust/binary_search.h>
#include <thrust/adjacent_difference.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>                            // std::cout
#include <iterator>                            // std::ostream_iterator
// Predicate
struct is_out_of_bounds {
    __host__ __device__ bool operator()(int i) {
        return (i < 0); // out-of-bounds elements have a negative id
    }
};

// cf.: https://code.google.com/p/thrust/source/browse/examples/histogram.cu, but modified
template<typename T1, typename T2>
void computeHistogram(const T1& input, T2& histogram) {
    typedef typename T1::value_type ValueType; // input value type
    typedef typename T2::value_type IndexType; // histogram index type

    // copy input data (could be skipped if input is allowed to be modified)
    thrust::device_vector<ValueType> data(input);

    // sort data to bring equal elements together
    thrust::sort(data.begin(), data.end());

    // there are elements that we don't want to count; those have ID -1
    data.erase(thrust::remove_if(data.begin(), data.end(), is_out_of_bounds()), data.end());

    // number of histogram bins is equal to the maximum value plus one
    IndexType num_bins = histogram.size();

    // find the end of each bin of values
    thrust::counting_iterator<IndexType> search_begin(0);
    thrust::upper_bound(data.begin(), data.end(), search_begin,
                        search_begin + num_bins, histogram.begin());

    // compute the histogram by taking differences of the cumulative histogram
    thrust::adjacent_difference(histogram.begin(), histogram.end(),
                                histogram.begin());
}
int main(void) {
    thrust::device_vector<int> cellID(5);
    cellID[0] = -1; cellID[1] = 1; cellID[2] = 0; cellID[3] = 2; cellID[4] = 1;
    thrust::device_vector<float> mass(5);
    mass[0] = .5; mass[1] = 1.0; mass[2] = 2.0; mass[3] = 3.0; mass[4] = 4.0;

    thrust::device_vector<int> histogram(3);
    thrust::device_vector<float> density(3);
    computeHistogram(cellID, histogram);

    std::cout << "\nHistogram:\n";
    thrust::copy(histogram.begin(), histogram.end(),
                 std::ostream_iterator<int>(std::cout, " "));
    std::cout << std::endl;
    // this will print: " Histogram 1 2 1 "
    // meaning one element with ID 0, two elements with ID 1
    // and one element with ID 2

    /* here is what I am unable to implement:
     *
     * computeDensity(cellID, mass, density);
     *
     * print(density): 2.0 5.0 3.0
     *
     */
}
I hope the comment at the end of the file also makes clear what I mean by computing the density. If there is any question open, please feel free to ask. Thanks!
There still seems to be some misunderstanding of my problem, for which I am sorry! I have therefore added some pictures, described below.
Consider the first picture. To my understanding, a histogram is simply the count of particles per grid cell. In this case the histogram would be an array of size 36, since there are 36 cells. There would also be a lot of zero entries in the vector, since, for example, in the upper-left corner almost no cell contains a particle. This is what I already have in my code.
Now consider the slightly more complicated case. Here each particle has a different mass, indicated by the different sizes in the plot. To compute the density I can't just count the number of particles per cell; I have to sum the mass of all particles in each cell. This is what I'm unable to implement.
What you described in your example does not look like a histogram but rather like a segmented reduction.
The following example code uses thrust::reduce_by_key to sum up the masses of particles within the same cell:
density.cu
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/reduce.h>
#include <thrust/copy.h>
#include <thrust/scatter.h>
#include <iostream>
#include <iterator> // std::ostream_iterator
#define PRINTER(name) print(#name, (name))
template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
    std::cout << name << ":\t\t";
    thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, "\t"));
    std::cout << std::endl << std::endl;
}
int main()
{
    const int particle_count = 5;
    const int cell_count = 10;

    thrust::device_vector<int> cellID(particle_count);
    cellID[0] = -1; cellID[1] = 1; cellID[2] = 0; cellID[3] = 2; cellID[4] = 1;
    thrust::device_vector<float> mass(particle_count);
    mass[0] = .5; mass[1] = 1.0; mass[2] = 2.0; mass[3] = 3.0; mass[4] = 4.0;

    std::cout << "input data" << std::endl;
    PRINTER(cellID);
    PRINTER(mass);

    // sort the masses by cell id so equal keys are adjacent
    thrust::sort_by_key(cellID.begin(), cellID.end(), mass.begin());

    std::cout << "after sort_by_key" << std::endl;
    PRINTER(cellID);
    PRINTER(mass);

    thrust::device_vector<int> reduced_cellID(particle_count);
    thrust::device_vector<float> density(particle_count);

    // sum the masses within each run of equal cell ids
    int new_size = thrust::reduce_by_key(cellID.begin(), cellID.end(),
                                         mass.begin(),
                                         reduced_cellID.begin(),
                                         density.begin()
                                        ).second - density.begin();

    // drop the out-of-bounds bucket (cell id -1), if present
    if (reduced_cellID[0] == -1)
    {
        density.erase(density.begin());
        reduced_cellID.erase(reduced_cellID.begin());
        new_size--;
    }
    density.resize(new_size);
    reduced_cellID.resize(new_size);

    std::cout << "after reduce_by_key" << std::endl;
    PRINTER(density);
    PRINTER(reduced_cellID);

    // scatter the per-cell sums into a dense, cell_count-sized array
    thrust::device_vector<float> final_density(cell_count);
    thrust::scatter(density.begin(), density.end(), reduced_cellID.begin(), final_density.begin());
    PRINTER(final_density);
}
compile using
nvcc -std=c++11 density.cu -o density
output
input data
cellID: -1 1 0 2 1
mass: 0.5 1 2 3 4
after sort_by_key
cellID: -1 0 1 1 2
mass: 0.5 2 1 4 3
after reduce_by_key
density: 2 5 3
reduced_cellID: 0 1 2
final_density: 2 5 3 0 0 0 0 0 0 0
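As an aside, the plain (unweighted) histogram from the question falls out of the same pattern: keep the sorted cellID vector, but feed reduce_by_key a constant 1 per particle instead of the mass. A sketch, assuming <thrust/iterator/constant_iterator.h> is also included (the count_keys/counts names are my own):

// Counting instead of weighting: each particle contributes 1 to its cell.
thrust::device_vector<int> count_keys(particle_count);
thrust::device_vector<int> counts(particle_count);
thrust::reduce_by_key(cellID.begin(), cellID.end(),
                      thrust::make_constant_iterator(1),
                      count_keys.begin(),
                      counts.begin());
// as with the masses, a leading -1 bucket (out-of-bounds particles)
// would still need to be dropped before scattering.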

Matrix not copied back from device to host successfully in CUDA

I am new to CUDA. I wrote a kernel (GPUsetIdentity) to create an identity matrix of dimension size x size, and I call it from a function GPUfunctioncall. The identity matrix should be stored in dDataInv, but when I copy it back to the size x size host array dataOut, all the values are zero. I know I am doing something very stupid somewhere but couldn't find it; if anyone can point out my mistake, thanks.
#include <stdio.h>
#include <malloc.h>
#include <memory.h>
#include <math.h>
#include <stdlib.h>
#include <iostream>
#include <string>
#include <fstream>
#include <iterator>
#include <sstream>
#include <vector>
#include <cstring>
#include <cstdlib>
#include <ctime>
#include <cuda_runtime.h>
#include "cuda.h"
#define BLOCKSIZE 16
using namespace std;
__global__ void GPUsetIdentity (float* matrix, int width)
{
    int tx = threadIdx.x;
    int bx = blockIdx.x;
    int offset = bx * BLOCKSIZE + tx;
    matrix[offset + width * offset] = 1;
}
void print_matrix_host(float* A, int nr_rows_A, int nr_cols_A) {
    for (int i = 0; i < nr_rows_A; ++i) {
        for (int j = 0; j < nr_cols_A; ++j) {
            std::cout << A[i * nr_rows_A + j] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
int GPUfunctioncall (float* hDataOut, int size) {
    float *dDataInv;
    cudaMalloc ((void **) &dDataInv, size);
    cudaMemset ((void *) dDataInv, 0, size);

    dim3 idyThreads (BLOCKSIZE);
    dim3 idyBlocks (size / BLOCKSIZE);
    GPUsetIdentity <<< idyBlocks, idyThreads >>> (dDataInv, size);
    cudaThreadSynchronize ();

    cudaMemcpy ((void *) hDataOut, (void *) dDataInv, size, cudaMemcpyDeviceToHost);
    cudaFree (dDataInv);
    return 0;
}
int main()
{
    int size = 4;
    float* dataOut;
    dataOut = new float[size*size];
    GPUfunctioncall(dataOut, size);
    print_matrix_host(dataOut, size, size);
}
Any time you are having trouble with CUDA code, it's good practice to use proper CUDA error checking. You can also run your code with cuda-memcheck to get a quick read on whether there are any errors.
Using either of these methods, you would have discovered an "invalid configuration error" on your kernel launch. This usually means that the parameters in the <<< >>> syntax are incorrect. When you run into this type of error, simply printing out those values may indicate the problem.
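A minimal checking macro along these lines (my own sketch, not part of the code in question) wraps each runtime API call and reports where a failure occurred:

// Hypothetical helper: abort with file/line info when a CUDA call fails.
#define cudaCheck(call) do { \
    cudaError_t err__ = (call); \
    if (err__ != cudaSuccess) { \
        fprintf(stderr, "CUDA error '%s' at %s:%d\n", \
                cudaGetErrorString(err__), __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

// usage: cudaCheck(cudaMalloc((void **)&dDataInv, size*size*sizeof(float)));

For kernel launches, which return no status directly, the usual practice is to call cudaCheck(cudaGetLastError()) immediately after the launch.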
In your case, this line of code:
dim3 idyBlocks (size / BLOCKSIZE);
results in a value of 0 for idyBlocks when size is 4 and BLOCKSIZE is 16. So you are requesting a kernel launch of 0 blocks which is illegal. Therefore your kernel is not running and your results are not what you expect.
There are a variety of ways to solve this, many of them involving detecting this condition and adding an "extra block" when size is not evenly divisible by BLOCKSIZE. Using this approach, we may be launching "extra threads", so we must include a "thread check" in the kernel to prevent those extra threads from doing anything (such as accessing arrays out of bounds). For this, we often need to know the intended size in the kernel, and we can pass this value as an extra kernel parameter.
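The standard idiom for this (equivalent in effect to the ternary expression used in the corrected code below) is a rounded-up integer division for the block count, paired with a bounds check in the kernel:

// Ceiling division: one extra block whenever size % BLOCKSIZE != 0.
int num_blocks = (size + BLOCKSIZE - 1) / BLOCKSIZE;

// ...and in the kernel, the extra threads skip the write:
// if (offset < size) matrix[offset + width * offset] = 1;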
You've also made some errors in your handling of device variables. The following code:
dataOut = new float[size*size];
allocates enough space for a square matrix of dimension size. But the following code:
cudaMalloc ((void **) &dDataInv, size);
only allocates enough space for size bytes. You want size*size*sizeof(float) instead of size here, and likewise in the following cudaMemset and cudaMemcpy operations: cudaMalloc, cudaMemset and cudaMemcpy take a size parameter in bytes, just like malloc, memset, and memcpy.
The following code has those modifications, and seems to work correctly for me:
$ cat t580.cu
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#define BLOCKSIZE 16
using namespace std;
__global__ void GPUsetIdentity (float* matrix, int width, int size)
{
    int tx = threadIdx.x;
    int bx = blockIdx.x;
    int offset = bx * BLOCKSIZE + tx;
    if (offset < size)   // thread check: extra threads do nothing
        matrix[offset + width * offset] = 1;
}
void print_matrix_host(float* A, int nr_rows_A, int nr_cols_A) {
    for (int i = 0; i < nr_rows_A; ++i) {
        for (int j = 0; j < nr_cols_A; ++j) {
            std::cout << A[i * nr_rows_A + j] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
int GPUfunctioncall (float* hDataOut, int size) {
    float *dDataInv;
    cudaMalloc ((void **) &dDataInv, size*size*sizeof(float));
    cudaMemset ((void *) dDataInv, 0, size*size*sizeof(float));

    dim3 idyThreads (BLOCKSIZE);
    // parentheses matter here: ?: binds more loosely than +
    int num_blocks = size/BLOCKSIZE + ((size%BLOCKSIZE) ? 1 : 0);
    dim3 idyBlocks (num_blocks);
    GPUsetIdentity <<< idyBlocks, idyThreads >>> (dDataInv, size, size);
    cudaThreadSynchronize ();

    cudaMemcpy ((void *) hDataOut, (void *) dDataInv, size*size*sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree (dDataInv);
    return 0;
}
int main()
{
    int size = 4;
    float* dataOut;
    dataOut = new float[size*size];
    GPUfunctioncall(dataOut, size);
    print_matrix_host(dataOut, size, size);
}
$ nvcc -arch=sm_20 -o t580 t580.cu
$ cuda-memcheck ./t580
========= CUDA-MEMCHECK
1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1
========= ERROR SUMMARY: 0 errors
$
Note that it may be redundant to pass size twice to the kernel. For this particular example, we could have easily used the width parameter to do our kernel "thread check". But for educational purposes, I chose to call it out as a separate parameter, because in the general case you will often pass it as a separate parameter to other kernels that you write.
Finally, note that cudaThreadSynchronize() is deprecated and should be replaced with cudaDeviceSynchronize() instead. In this particular example, neither is actually necessary, as the subsequent cudaMemcpy operation will force the same kind of synchronization, but you may use it if you decide to add CUDA error checking to your code (recommended).

Multi-GPU CUDA Thrust

I have CUDA C++ code using Thrust that currently works properly on a single GPU. I'd now like to modify it for multi-GPU. I have a host function that includes a number of Thrust calls that sort, copy, calculate differences etc. on device arrays. I want each GPU to run this sequence of Thrust calls on its own (independent) set of arrays at the same time. I've read that Thrust functions that return values are synchronous, but can I use OpenMP to have each host thread call a function (with Thrust calls) that runs on a separate GPU?
For example (coded in browser):
#pragma omp parallel for
for (int dev = 0; dev < Ndev; dev++) {
    cudaSetDevice(dev);
    runthrustfunctions(dev);
}

void runthrustfunctions(int dev) {
    /* lots of Thrust functions running on device arrays stored on the corresponding GPU */
    // for example, this is just a few of the lines:
    thrust::device_ptr<double> pos_ptr = thrust::device_pointer_cast(particle[dev].pos);
    thrust::device_ptr<int> list_ptr = thrust::device_pointer_cast(particle[dev].list);
    thrust::sequence(list_ptr, list_ptr + length);
    thrust::sort_by_key(pos_ptr, pos_ptr + length, list_ptr);
    thrust::device_vector<double> temp(length);
    thrust::gather(list_ptr, list_ptr + length, pos_ptr, temp.begin());
    thrust::copy(temp.begin(), temp.end(), pos_ptr);
}
I think I also need the structure "particle[0]" to be stored on GPU 0, particle[1] on GPU 1, etc., and my guess is that this is not possible. An option might be to use a "switch" with separate code for each GPU case.
I'd like to know whether this is a correct approach, or if there is a better way.
Thanks
Yes, you can combine thrust and OpenMP.
Here's a complete worked example with results:
$ cat t340.cu
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <time.h>
#include <sys/time.h>
#include <thrust/generate.h>  // thrust::generate
#include <vector>             // std::vector
#define DSIZE 200000000
using namespace std;
int main(int argc, char *argv[])
{
    timeval t1, t2;
    int num_gpus = 0;   // number of CUDA GPUs
    printf("%s Starting...\n\n", argv[0]);

    // determine the number of CUDA capable GPUs
    cudaGetDeviceCount(&num_gpus);
    if (num_gpus < 1)
    {
        printf("no CUDA capable devices were detected\n");
        return 1;
    }

    // display CPU and GPU configuration
    printf("number of host CPUs:\t%d\n", omp_get_num_procs());
    printf("number of CUDA devices:\t%d\n", num_gpus);
    for (int i = 0; i < num_gpus; i++)
    {
        cudaDeviceProp dprop;
        cudaGetDeviceProperties(&dprop, i);
        printf(" %d: %s\n", i, dprop.name);
    }
    printf("initialize data\n");

    // initialize data: one device vector per GPU
    typedef thrust::device_vector<int> dvec;
    typedef dvec *p_dvec;
    std::vector<p_dvec> dvecs;
    for (int i = 0; i < num_gpus; i++) {
        cudaSetDevice(i);
        p_dvec temp = new dvec(DSIZE);
        dvecs.push_back(temp);
    }
    thrust::host_vector<int> data(DSIZE);
    thrust::generate(data.begin(), data.end(), rand);

    // copy the same data to every device
    for (int i = 0; i < num_gpus; i++) {
        cudaSetDevice(i);
        thrust::copy(data.begin(), data.end(), (*(dvecs[i])).begin());
    }

    printf("start sort\n");
    gettimeofday(&t1, NULL);

    // run as many CPU threads as there are CUDA devices
    omp_set_num_threads(num_gpus);
    #pragma omp parallel
    {
        unsigned int cpu_thread_id = omp_get_thread_num();
        cudaSetDevice(cpu_thread_id);
        thrust::sort((*(dvecs[cpu_thread_id])).begin(), (*(dvecs[cpu_thread_id])).end());
        cudaDeviceSynchronize();
    }
    gettimeofday(&t2, NULL);
    printf("finished\n");

    unsigned long et = ((t2.tv_sec * 1000000) + t2.tv_usec) - ((t1.tv_sec * 1000000) + t1.tv_usec);
    cudaError_t err = cudaGetLastError();  // store the status: a second call would return cudaSuccess
    if (err != cudaSuccess)
        printf("%s\n", cudaGetErrorString(err));
    printf("sort time = %fs\n", (float)et/(float)(1000000));

    // check results against a host-side sort
    thrust::host_vector<int> result(DSIZE);
    thrust::sort(data.begin(), data.end());
    for (int i = 0; i < num_gpus; i++)
    {
        cudaSetDevice(i);
        thrust::copy((*(dvecs[i])).begin(), (*(dvecs[i])).end(), result.begin());
        for (int j = 0; j < DSIZE; j++)
            if (data[j] != result[j]) { printf("mismatch on device %d at index %d, host: %d, device: %d\n", i, j, data[j], result[j]); return 1; }
    }
    printf("Success\n");
    return 0;
}
$ nvcc -Xcompiler -fopenmp -O3 -arch=sm_20 -o t340 t340.cu -lgomp
$ CUDA_VISIBLE_DEVICES="0" ./t340
./t340 Starting...
number of host CPUs: 12
number of CUDA devices: 1
0: Tesla M2050
initialize data
start sort
finished
sort time = 0.398922s
Success
$ ./t340
./t340 Starting...
number of host CPUs: 12
number of CUDA devices: 4
0: Tesla M2050
1: Tesla M2070
2: Tesla M2050
3: Tesla M2070
initialize data
start sort
finished
sort time = 0.460058s
Success
$
We can see that when I restrict the program to using a single device, the sort operation takes about 0.4 seconds. When I allow it to use all 4 devices (repeating the same sort on all 4 devices), the overall operation takes only 0.46 seconds, even though we're doing 4 times as much work.
For this particular case I happened to be using CUDA 5.0 with Thrust v1.7 and gcc 4.4.6 (RHEL 6.2).
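As a follow-up to the particle[dev] concern in the question: cudaMalloc allocates on whichever device is current, so per-device storage falls out naturally if the device is selected before each allocation; no switch statement is needed. A sketch (the struct layout here is my assumption, not from the question):

// Hypothetical layout: each entry's arrays live on the matching GPU.
struct ParticleArrays { double *pos; int *list; };
std::vector<ParticleArrays> particle(Ndev);
for (int dev = 0; dev < Ndev; dev++) {
    cudaSetDevice(dev);   // select GPU 'dev' before allocating on it
    cudaMalloc((void **)&particle[dev].pos,  length * sizeof(double));
    cudaMalloc((void **)&particle[dev].list, length * sizeof(int));
}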