Search Minimum/Maximum from n Arrays parallel in CUDA (Reduction Problem) - cuda

Is there a performant way in CUDA to get out of multiple arrays (which exist in different structures)
to find the maximum/minimum in parallel? The structures are structured according to the Structure of Arrays format.
A simple idea would be to assign each array to a thread block, which is used to calculate the maximum/minimum using the parallel reduction approach. The problem here is the size of the shared memory, which is why I regard this approach as critical.
An other approach is to calculate every Miminum/Maximum separetly for each Array. I think this is to slow.
struct Cube {
int* x;
int* y;
int* z;
int size;
};
int main() {
Cube* c1 = new Cube(); //c1 includes 100 Cubes (because of SOA)
c1-> x = new int[100];
c1-> y = new int[100];
c1 -> z = new int[100];
Cube* c2 = new Cube();
c2-> x = new int[1047];
c2-> y = new int[1047];
c2 -> z = new int[1047];
Cube* c3 = new Cube();
c3-> x = new int[5000];
c3-> y = new int[5000];
c3 -> z = new int[5000];
//My goal now is to find the smallest/largest x dimension of all cubes in c1, c2, ..., and cn,
//with one Kernel launch.
//So the smallest/largest x in c1, the smallest/largest x in c2 etc..
}
Does anyone know an efficient approach? Thanks.

A simple idea would be to assign each array to a thread block, which is used to calculate the maximum/minimum using the parallel reduction approach. The problem here is the size of the shared memory, which is why I regard this approach as critical.
There is no problem with shared memory size. You may wish to review Mark Harris' canonical parallel reduction tutorial and look at the later methods to understand how we can use a loop to populate shared memory, reducing values into shared memory as we go. Once the input loop is completed, then we begin the block-sweep phase of the reduction. This doesn't impose any special requirements on the shared memory per block.
Here's a worked example demonstrating both a thrust::reduce_by_key method (single call) and a CUDA block-segmented method (single kernel call):
$ cat t1535.cu
#include <iostream>
#include <thrust/reduce.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/functional.h>
#include <cstdlib>
#define IMAX(x,y) (x>y)?x:y
#define IMIN(x,y) (x<y)?x:y
typedef int dtype;
const int ncubes = 3;
struct Cube {
dtype* x;
dtype* y;
dtype* z;
int size;
};
struct my_f
{
template <typename T1, typename T2>
__host__ __device__
thrust::tuple<dtype,dtype> operator()(T1 t1, T2 t2){
thrust::tuple<dtype,dtype> r;
thrust::get<0>(r) = IMAX(thrust::get<0>(t1),thrust::get<0>(t2));
thrust::get<1>(r) = IMIN(thrust::get<1>(t1),thrust::get<1>(t2));
return r;
}
};
const int MIN = -1;
const int MAX = 0x7FFFFFFF;
const int BS = 512;
template <typename T>
__global__ void block_segmented_minmax_reduce(const T * __restrict__ in, T * __restrict__ max, T * __restrict__ min, const size_t * __restrict__ slen){
__shared__ T smax[BS];
__shared__ T smin[BS];
size_t my_seg_start = slen[blockIdx.x];
size_t my_seg_size = slen[blockIdx.x+1] - my_seg_start;
smax[threadIdx.x] = MIN;
smin[threadIdx.x] = MAX;
for (size_t idx = my_seg_start+threadIdx.x; idx < my_seg_size; idx += BS){
T my_val = in[idx];
smax[threadIdx.x] = IMAX(my_val, smax[threadIdx.x]);
smin[threadIdx.x] = IMIN(my_val, smin[threadIdx.x]);}
for (int s = BS>>1; s > 0; s>>=1){
__syncthreads();
if (threadIdx.x < s){
smax[threadIdx.x] = IMAX(smax[threadIdx.x], smax[threadIdx.x+s]);
smin[threadIdx.x] = IMIN(smin[threadIdx.x], smin[threadIdx.x+s]);}
}
if (!threadIdx.x){
max[blockIdx.x] = smax[0];
min[blockIdx.x] = smin[0];}
}
int main() {
// data setup
Cube *c = new Cube[ncubes];
thrust::host_vector<size_t> csize(ncubes+1);
csize[0] = 100;
csize[1] = 1047;
csize[2] = 5000;
csize[3] = 0;
c[0].x = new dtype[csize[0]];
c[1].x = new dtype[csize[1]];
c[2].x = new dtype[csize[2]];
size_t ctot = 0;
for (int i = 0; i < ncubes; i++) ctot+=csize[i];
// method 1: thrust
// concatenate
thrust::host_vector<dtype> h_d(ctot);
size_t start = 0;
for (int i = 0; i < ncubes; i++) {thrust::copy_n(c[i].x, csize[i], h_d.begin()+start); start += csize[i];}
for (size_t i = 0; i < ctot; i++) h_d[i] = rand();
thrust::device_vector<dtype> d_d = h_d;
// build flag vector
thrust::device_vector<int> d_f(d_d.size());
thrust::host_vector<size_t> coff(csize.size());
thrust::exclusive_scan(csize.begin(), csize.end(), coff.begin());
thrust::device_vector<size_t> d_coff = coff;
thrust::scatter(thrust::constant_iterator<int>(1), thrust::constant_iterator<int>(1)+ncubes, d_coff.begin(), d_f.begin());
thrust::inclusive_scan(d_f.begin(), d_f.end(), d_f.begin());
// min/max reduction
thrust::device_vector<dtype> d_max(ncubes);
thrust::device_vector<dtype> d_min(ncubes);
thrust::reduce_by_key(d_f.begin(), d_f.end(), thrust::make_zip_iterator(thrust::make_tuple(d_d.begin(), d_d.begin())), thrust::make_discard_iterator(), thrust::make_zip_iterator(thrust::make_tuple(d_max.begin(), d_min.begin())), thrust::equal_to<int>(), my_f());
thrust::host_vector<dtype> h_max = d_max;
thrust::host_vector<dtype> h_min = d_min;
std::cout << "Thrust Maxima: " <<std::endl;
thrust::copy_n(h_max.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ","));
std::cout << std::endl << "Thrust Minima: " << std::endl;
thrust::copy_n(h_min.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ","));
std::cout << std::endl;
// method 2: CUDA kernel (block reduce)
block_segmented_minmax_reduce<<<ncubes, BS>>>(thrust::raw_pointer_cast(d_d.data()), thrust::raw_pointer_cast(d_max.data()), thrust::raw_pointer_cast(d_min.data()), thrust::raw_pointer_cast(d_coff.data()));
thrust::copy_n(d_max.begin(), ncubes, h_max.begin());
thrust::copy_n(d_min.begin(), ncubes, h_min.begin());
std::cout << "CUDA Maxima: " <<std::endl;
thrust::copy_n(h_max.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ","));
std::cout << std::endl << "CUDA Minima: " << std::endl;
thrust::copy_n(h_min.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ","));
std::cout << std::endl;
return 0;
}
$ nvcc -o t1535 t1535.cu
$ ./t1535
Thrust Maxima:
2145174067,2147469841,2146753918,
Thrust Minima:
35005211,2416949,100669,
CUDA Maxima:
2145174067,2147469841,2146753918,
CUDA Minima:
35005211,2416949,100669,
$
For a small number of Cube objects, the thrust method is likely to be faster. It will tend to make better use of medium to large GPUs than the block method will. For a large number of Cube objects, the block method should also be fairly efficient.

Related

In cuda, is it possible to write dense array from sparse array with expected sequence?

There is array1 that represent 0 or 1 (for each thread block):
bool array1[]: [1, 1, 0, 0, 1, 1]
Each thread in thread block accesses array1 by using threadIdx.x.
And, I need to make shared dense array2 (each value represents thread ID with '1' value from array1:
__shared__ bool array2[] (thread ID) : [0, 1, 4, 5]
It seems that, at least, I need atomicAdd() operation to index array2.
Even with atomicAdd(), I think that it is hard to make array2 like above sequence
(0, 1, 4, 5).
Is it possible to make array2 from array1 in cuda (for each thread block)?
you can coalesced groups:
suppose the read Boolean is threasIsIN:
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
uint32_t tid = threadIdx.x;
const uint32_t warpLength = 32;
uint32_t warpIdx = tid / warpLength;
if (threadIsIn){
auto active = cg::coalesced_threads();
uint32_t idx = active.thread_rank() + warpIdx * warpLength;
array2[idx] = tid;
}
Edit
solution with multiple warps in a block:
the first warp of the block will prepare the shared array for the rest of warps in the block, this makes the other warps to wait for the first warp to finish.
thread_block block = this_thread_block();
uint32_t tid = threadIdx.x;
const uint32_t warpLength = 32;
uint32_t warpIdx = tid / warpLength;
uint32_t startIdx = 0;
uint32_t tidToWrite = tid;
uint32_t maxItr = blockSize / warpLength;
uint32_t itr = 0;
while (warpIdx == 0 && itr < maxItr){
auto warp = cg::coalesced_threads();
auto warpMask = warp.ballot(threadIsIn); // the tid'th bit is set to 1 if threadIsIn is true for tid
uint32_t trueThreadsSize = __popc(warpMask); // counts the number of bits that are set to 1
if(threadIsIn){
auto active = cg::coalesced_threads();
// active.size() has the same value as trueThreadsSize
array2[startIdx + active.thread_rank()] = tidToWrite;
}
startIdx += trueThreadsSize;
tidToWrite += warpLength;
++itr;
arr1Idx += warpLength;
threadIsIn = arr1[arr1Idx];
}
block.sync();
This is in a general category of problems called stream compaction. The canonical approach is to perform a prefix sum (scan operation) on a processed version of your data (converting the kept values to 1, the discarded values to 0), then use that prefix sum as the index to write to, in the output array.
CUB provides a convenient block-level scan operation, so we don't have to write our own. Thereafter, the indexed copy is trivial:
$ cat t1465.cu
#include <cub/cub.cuh>
#include <iostream>
#include <cstdlib>
const int nTPB = 1024;
const int ds = nTPB;
__global__ void BlockCompactKernel(bool *data, int *result, int *data_size)
{
// Specialize BlockScan for a 1D block of nTPB threads on type int
typedef cub::BlockScan<int, nTPB> BlockScan;
// Allocate shared memory for BlockScan
__shared__ typename BlockScan::TempStorage temp_storage;
// Obtain a segment of consecutive items that are blocked across threads
int scan_data[1];
// load data
bool tmp = data[threadIdx.x];
// process data
scan_data[0] = (tmp)?1:0;
// scan data
// Collectively compute the block-wide exclusive prefix sum
BlockScan(temp_storage).ExclusiveSum(scan_data, scan_data);
// indexed copy
if (tmp) result[scan_data[0]] = threadIdx.x;
// optional: return result size
if (threadIdx.x == nTPB-1) *data_size = scan_data[0] + ((tmp)?1:0);
}
int main(){
bool *d_data, *data = new bool[ds];
int data_size, *d_data_size, *d_result, *result = new int[ds];
cudaMalloc(&d_data_size, sizeof(d_data_size[0]));
cudaMalloc(&d_result, ds*sizeof(d_result[0]));
for (int i = 0; i < ds; i++) data[i] = (rand() > (RAND_MAX/2))?true:false;
std::cout << "Original data:" << std::endl;
for (int i=0; i < ds; i++) std::cout << (int)data[i] << ",";
cudaMalloc(&d_data, ds*sizeof(d_data[0]));
cudaMemcpy(d_data, data, ds*sizeof(d_data[0]), cudaMemcpyHostToDevice);
BlockCompactKernel<<<1,nTPB>>>(d_data, d_result, d_data_size);
cudaMemcpy(&data_size, d_data_size, sizeof(d_data_size[0]), cudaMemcpyDeviceToHost);
cudaMemcpy(result, d_result, data_size*sizeof(d_result[0]), cudaMemcpyDeviceToHost);
std::cout << std::endl << "Compacted data:" << std::endl;
for (int i=0; i < data_size; i++) std::cout << result[i] << ",";
std::cout << std::endl;
}
$ nvcc -o t1465 t1465.cu
$ cuda-memcheck ./t1465
========= CUDA-MEMCHECK
Original data:
1,0,1,1,1,0,0,1,0,1,0,1,0,1,1,1,1,1,0,1,0,0,0,1,0,0,0,0,1,0,1,1,1,0,1,1,0,1,0,1,1,1,0,1,0,0,1,1,0,1,1,0,0,1,1,0,0,0,0,0,0,1,1,1,0,1,0,1,1,1,1,0,0,1,1,1,0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,1,1,1,0,1,0,1,1,1,0,0,1,1,0,1,1,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,1,1,1,0,1,0,0,0,1,0,0,1,0,1,0,1,0,1,0,1,0,0,1,1,1,1,1,1,0,0,1,0,0,0,0,1,1,1,0,0,1,0,1,0,1,0,1,0,0,1,1,0,0,0,1,1,0,1,1,0,1,0,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,1,0,1,1,1,0,1,1,0,0,1,1,0,0,1,0,0,0,1,1,0,1,0,0,0,1,0,1,0,0,1,1,1,0,0,1,1,1,0,1,0,1,1,1,0,1,0,0,1,1,0,0,0,1,1,1,0,1,0,0,0,1,0,1,0,0,0,1,1,1,1,0,1,0,1,1,1,1,0,1,1,0,1,1,1,0,1,0,0,1,0,0,1,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,1,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,1,1,0,0,1,0,0,1,1,1,1,0,1,0,0,1,0,0,0,1,0,0,1,0,1,1,0,0,1,1,1,0,0,0,1,0,1,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,0,1,0,1,1,1,1,1,0,0,1,1,1,0,0,0,1,0,1,1,1,0,0,0,0,1,1,0,0,1,1,1,0,1,0,0,1,1,1,1,0,0,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,0,0,1,1,1,1,0,1,1,1,1,0,1,0,1,1,1,1,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,1,1,0,0,1,0,0,1,1,0,0,1,1,1,0,0,1,1,1,1,1,0,1,0,0,1,0,1,1,0,0,1,1,1,0,1,1,0,0,1,1,1,0,0,0,1,1,0,1,1,0,0,0,1,1,1,1,0,1,1,0,0,0,0,1,1,1,1,1,0,0,0,1,1,1,0,0,1,0,1,1,0,0,1,0,1,1,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,1,0,0,0,1,1,0,1,0,1,0,1,1,0,1,1,0,0,1,1,1,1,1,0,1,0,1,0,1,0,0,0,0,0,1,0,1,1,0,1,0,0,1,0,1,0,1,1,1,1,1,0,0,1,1,0,1,0,0,1,0,0,1,1,0,0,1,0,0,1,0,1,0,1,1,1,0,1,1,1,0,0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,0,1,1,0,1,1,0,0,1,1,0,1,1,1,1,0,1,1,0,0,0,1,0,1,0,0,1,1,0,1,1,0,1,0,0,1,0,1,0,0,0,0,1,0,1,1,0,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,1,1,1,0,0,1,0,1,0,0,1,1,1,1,0,1,1,1,0,1,0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,1,1,1,0,1,0,0,1,0,0,0,0,0,1,1,0,1,1,1,0,0,1,1,1,0,1,1,1,1,1,0,1,1,1,1,0,0,1,0,0,0,0,0,1,0,0,1,1,0,1,1,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,1,1,0,0,1,1,0,1,0,1,1,1,0,1,1,1,0,0,1,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,1,1,1,0,
Compacted data:
0,2,3,4,7,9,11,13,14,15,16,17,19,23,28,30,31,32,34,35,37,39,40,41,43,46,47,49,50,53,54,61,62,63,65,67,68,69,70,73,74,75,77,78,80,83,84,87,89,90,91,92,93,95,97,98,99,102,103,105,106,108,110,116,119,123,124,125,126,128,132,135,137,139,141,143,146,147,148,149,150,151,154,159,160,161,164,166,168,170,173,174,178,179,181,182,184,186,187,189,190,191,192,193,195,196,197,198,199,200,201,202,203,204,207,208,210,212,214,219,221,222,223,225,226,229,230,233,237,238,240,244,246,249,250,251,254,255,256,258,260,261,262,264,267,268,272,273,274,276,280,282,286,287,288,289,291,293,294,295,296,298,299,301,302,303,305,308,311,315,316,318,320,321,329,330,331,332,333,337,338,343,349,350,352,353,356,357,358,360,362,366,367,368,370,374,375,378,379,382,383,386,391,392,397,398,401,402,403,404,407,410,411,412,413,415,418,422,425,427,428,431,432,433,437,439,440,441,448,450,455,457,458,459,460,461,462,464,466,467,468,469,470,473,474,475,479,481,482,483,488,489,492,493,494,496,499,500,501,502,505,506,507,508,509,511,512,513,515,516,517,518,519,520,521,522,524,525,526,527,528,529,531,534,535,536,537,539,540,541,542,544,546,547,548,549,552,554,556,563,564,565,566,569,572,573,576,577,578,581,582,583,584,585,587,590,592,593,596,597,598,600,601,604,605,606,610,611,613,614,618,619,620,621,623,624,629,630,631,632,633,637,638,639,642,644,645,648,650,651,652,653,658,662,667,668,670,677,678,682,683,685,687,689,690,692,693,696,697,698,699,700,702,704,706,712,714,715,717,720,722,724,725,726,727,728,731,732,734,737,740,741,744,747,749,751,752,753,755,756,757,761,762,763,764,765,766,767,775,776,777,782,786,787,789,790,793,794,796,797,798,799,801,802,806,808,811,812,814,815,817,820,822,827,829,830,832,833,835,836,839,847,851,852,853,854,855,858,860,863,864,865,866,868,869,870,872,876,878,879,880,881,882,883,884,885,886,887,888,890,891,895,896,897,899,902,908,909,911,912,913,916,917,918,920,921,922,923,924,926,927,928,929,932,938,941,942,944,945,950,952,954,955,961,964,968,973,975,976,977,980,981,983,985,986,987,989,990,991,994,996,999,1001,1002,1004,1008,1011,1014,1019,1020,1021,1022,
========= ERROR SUMMARY: 0 errors
$

CUDA Kernel not returning values

I am working with a server that has multiple GPUs. I am using openMP to launch a kernel over multiple GPUs at once. The problem I am seeing is that the Kernel I am running does not seem to update the values in the thrust device vectors it is passed. The code below should output a value of 1 for all elements in the device vectors but instead outputs a value of 0. The code compiles and runs and shows me that the kernel executes successfully.
I do not understand why this code is not behaving as expected.
#include <iostream>
#include <cmath>
#include <omp.h>
#include <vector>
#include <thrust/host_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/device_vector.h>
using namespace::std;
const long N_R1 = 100;
const long N_R2 = 100;
__global__ void kernel(long* ND, long* NR1,
float* a, float* b, float* c, float* d)
{
// Calculate Global index (Generic 3D block, 3D thread)
long idx = ( blockIdx.x + blockIdx.y * gridDim.x * gridDim.y * blockIdx.z )
* ( threadIdx.z * ( blockDim.x*blockDim.y ) ) + threadIdx.y
* blockDim.x + threadIdx.x;
//Values correspond to 2D array limits
long idxR1 = idx / ND[0];
long idxR2 = idx % ND[0];
if(idxR1 >= NR1[0] || idxR2 >= ND[0])
{
return;
}
a[idx] =1.0;
b[idx] =1.0;
c[idx] =1.0;
d[idx] =1.0;
}
void kernel_wrapper()
{
// GPU Count
int num_gpus = 0;
cudaGetDeviceCount(&num_gpus);
omp_set_num_threads(num_gpus);
//Calculate Dimensioning
long D_total = N_R1 * N_R2;
//Region 1 coordinates are loaded on to each GPU
//Region 2 coordinates are divided up onto GPUs
long R2_stride = ceil(float(N_R2)/float(num_gpus));
//Distance arrays need to be split longo whole sections of region 1.
//(Distances size = N_R1 * N_R2) subset of distance size needs to be N_R1
long D_stride = R2_stride * N_R1;
#pragma omp parallel
{
// Get CPU thread number
long cpu_thread_id = omp_get_thread_num();
cudaSetDevice(cpu_thread_id);
// Set up Local Arrays for distance and potential
// Step 1: Calculate rough Array Limits
// If array spaces divide evenly between threads then beginnings and endings can be calculated below
long R2_begin = cpu_thread_id * R2_stride;
long D_begin = cpu_thread_id * D_stride;
long R2_end = R2_begin + R2_stride;
long D_end = D_begin + D_stride;
// Step 2: Check Ends are not out of bounds
// The last thread in the calculation is likely to have array sizings that are out of bounds
// if this is the case then the ends need to be clipped:
if(R2_end >= N_R2)
{
R2_end = N_R2;
}
if(D_end >= D_total)
{
D_end = D_total;
}
// Local aray sizes are (end - begin)
long l_R2 = R2_end - R2_begin;
long l_D = D_end - D_begin;
float zero = 0.0;
// Create Region 2 potential components
thrust::host_vector<float > a(l_D,zero);
thrust::host_vector<float > b(l_D,zero);
thrust::host_vector<float > c(l_D,zero);
thrust::host_vector<float > d(l_D,zero);
long* p_NR1;
long nr1 = N_R1;
cudaMalloc( (void**)&p_NR1, sizeof(long) );
cudaMemcpy( p_NR1, &nr1, sizeof(long), cudaMemcpyHostToDevice);
long* p_NR2;
cudaMalloc( (void**)&p_NR2, sizeof(long) );
cudaMemcpy( p_NR2, &l_D, sizeof(long), cudaMemcpyHostToDevice);
//Generate Device Side Data for region 2 potential components
thrust::device_vector< float > d_a = a;
thrust::device_vector< float > d_b = b;
thrust::device_vector< float > d_c = c;
thrust::device_vector< float > d_d = d;
// Generate pointers to Device Side Data for region 2 potential components
float* p_a = thrust::raw_pointer_cast(d_a.data());
float* p_b = thrust::raw_pointer_cast(d_b.data());
float* p_c = thrust::raw_pointer_cast(d_c.data());
float* p_d = thrust::raw_pointer_cast(d_d.data());
dim3 blocks = N_R1;
dim3 threads = l_R2;
kernel<<<blocks,threads>>>(p_NR2, p_NR1,
p_a, p_b, p_c, p_d);
cudaDeviceSynchronize();
if(cudaGetLastError() == cudaSuccess)
{
cout << "Kernel Successful!" << cudaGetErrorString(cudaGetLastError()) << endl;
cin.ignore(1);
}
a = d_a;
b = d_b;
c = d_c;
d = d_d;
for(long j = 0; j != a.size(); j++)
{
cout << "a[" << j << "] = " << a[j] << endl;
}
for(long j = 0; j != b.size(); j++)
{
cout << "b[" << j << "] = " << b[j] << endl;
}
for(long j = 0; j != c.size(); j++)
{
cout << "c[" << j << "] = " << c[j] << endl;
}
for(long j = 0; j != c.size(); j++)
{
cout << "c[" << j << "] = " << c[j] << endl;
}
}
cin.ignore(1);
}
int main()
{
kernel_wrapper();
return 0;
}
Any help would be greatly appreciated.
Some of the output values are getting set to 1, some are not. The problem is due to this statement:
// Calculate Global index (Generic 3D block, 3D thread)
long idx = ( blockIdx.x + blockIdx.y * gridDim.x * gridDim.y * blockIdx.z )
* ( threadIdx.z * ( blockDim.x*blockDim.y ) ) + threadIdx.y
* blockDim.x + threadIdx.x;
That isn't what I would call a proper generic conversion of 3D grid/block to globally unique 1D index, which I assume is your intent. Let's just pick one example to prove that it is broken. Suppose you are launching a 1D grid of 1D blocks (which is what you are doing). Then all of the (block,thread)Idx.y and .z variables will all be zero. Only blockIdx.x and threadIdx.x can take on non-zero values in that launch configuration.
In that case your expression reduces to:
// Calculate Global index (Generic 3D block, 3D thread)
long idx = ( blockIdx.x + 0 * gridDim.x * gridDim.y * 0 )
* ( 0 * ( blockDim.x*blockDim.y ) ) + 0
* blockDim.x + threadIdx.x;
i.e. it reduces to:
long idx = threadIdx.x;
So the first (block-size) elements of your arrays (a,b,c,d) are getting set properly, the rest are not. Since threadIdx.x is not unique from one block to the next, this is not a proper globally-unique thread ID, and therefore each block is writing the same output locations, rather than each taking care of a separate part of the array.
So what is a possible (correct) generic 3D-to-1D index conversion?
That is answered here (and probably other places). This answer actually only converts a 3D grid plus 1D block configuration to a globally-unique ID, but it is sufficient for demonstration purposes of what is wrong in this code.
When I replace your in-kernel calculation of idx with that code, your kernel populates all array entries with 1.0 according to my testing.

CUB reduction using 2D grid of blocks

I'm trying to make a sum using the CUB reduction method.
The big problem is:
I'm not sure how to return the values of each block to the Host when using 2-dimensional grids.
#include <iostream>
#include <math.h>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <iomanip>
#define nat 1024
#define BLOCK_SIZE 32
#define GRID_SIZE 32
struct frame
{
int natm;
char title[100];
float conf[nat][3];
};
using namespace std;
using namespace cub;
__global__
void add(frame* s, float L, float rc, float* blocksum)
{
int i = blockDim.x*blockIdx.x + threadIdx.x;
int j = blockDim.y*blockIdx.y + threadIdx.y;
float E=0.0, rij, dx, dy, dz;
// Your calculations first so that each thread holds its result
dx = fabs(s->conf[j][0] - s->conf[i][0]);
dy = fabs(s->conf[j][1] - s->conf[i][1]);
dz = fabs(s->conf[j][2] - s->conf[i][2]);
dx = dx - round(dx/L)*L;
dy = dy - round(dy/L)*L;
dz = dz - round(dz/L)*L;
rij = sqrt(dx*dx + dy*dy + dz*dz);
if ((rij <= rc) && (rij > 0.0))
{E = (4*((1/pow(rij,12))-(1/pow(rij,6))));}
// E = 1.0;
__syncthreads();
// Block wise reduction so that one thread in each block holds sum of thread results
typedef cub::BlockReduce<float, BLOCK_SIZE, BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
float aggregate = BlockReduce(temp_storage).Sum(E);
if (threadIdx.x == 0 && threadIdx.y == 0)
blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;
}
int main(void)
{
frame * state = (frame*)malloc(sizeof(frame));
float *blocksum = (float*)malloc(GRID_SIZE*GRID_SIZE*sizeof(float));
state->natm = nat; //inicializando o numero de atomos;
char name[] = "estado1";
strcpy(state->title,name);
for (int i = 0; i < nat; i++) {
state->conf[i][0] = i;
state->conf[i][1] = i;
state->conf[i][2] = i;
}
frame * d_state;
float *d_blocksum;
cudaMalloc((void**)&d_state, sizeof(frame));
cudaMalloc((void**)&d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)));
cudaMemcpy(d_state, state, sizeof(frame),cudaMemcpyHostToDevice);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 gridBlock(GRID_SIZE,GRID_SIZE);
add<<<gridBlock,dimBlock>>>(d_state, 3000, 15, d_blocksum);
cudaError_t status = cudaMemcpy(blocksum, d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);
float Etotal = 0.0;
for (int k = 0; k < GRID_SIZE*GRID_SIZE; k++){
Etotal += blocksum[k];
}
cout << endl << "energy: " << Etotal << endl;
if (cudaSuccess != status)
{
cout << cudaGetErrorString(status) << endl;
}
// Free memory
cudaFree(d_state);
cudaFree(d_blocksum);
return cudaThreadExit();
}
What is happening is that if the value of GRID_SIZE is the same asBLOCK_SIZE, as written above. The calculation is correct. But if I change the value of GRID_SIZE, the result goes wrong. Which leads me to think that the error is in this code:
blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;
The idea here is to return a 1D array, which contains the sum of each block.
I do not intend to change the BLOCK_SIZE value, but the value of GRID_SIZE depends on the system I'm looking at, I intend to use values greater than 32 (always multiples of that).
I looked for some example that use 2D grid with CUB, but did not find.
I really new in CUDA program, maybe I'm making a mistake.
edit: I put the complete code.
For comparison, when I calculate these exact values for a serial program, it gives me energy: -297,121
Probably the main issue is that your output indexing is not correct. Here's a reduced version of your code demonstrating correct results for arbitrary GRID_SIZE:
$ cat t1360.cu
#include <stdio.h>
#include <cub/cub.cuh>
#define BLOCK_SIZE 32
#define GRID_SIZE 25
__global__
void add(float* blocksum)
{
float E = 1.0;
// Block wise reduction so that one thread in each block holds sum of thread results
typedef cub::BlockReduce<float, BLOCK_SIZE, cub::BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
float aggregate = BlockReduce(temp_storage).Sum(E);
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0)
blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate;
}
int main(){
float *d_result, *h_result;
h_result = (float *)malloc(GRID_SIZE*GRID_SIZE*sizeof(float));
cudaMalloc(&d_result, GRID_SIZE*GRID_SIZE*sizeof(float));
dim3 grid = dim3(GRID_SIZE,GRID_SIZE);
dim3 block = dim3(BLOCK_SIZE, BLOCK_SIZE);
add<<<grid, block>>>(d_result);
cudaMemcpy(h_result, d_result, GRID_SIZE*GRID_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {printf("cuda error: %s\n", cudaGetErrorString(err)); return -1;}
float result = 0;
for (int i = 0; i < GRID_SIZE*GRID_SIZE; i++) result += h_result[i];
if (result != (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE)) printf("mismatch, should be: %f, was: %f\n", (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE), result);
else printf("Success\n");
return 0;
}
$ nvcc -o t1360 t1360.cu
$ ./t1360
Success
$
The important change I made to your kernel code was in the output indexing:
blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate;
We want a simulated 2D index into an array that has width and height of GRID_SIZE consisting of one float quantity per point. Therefore the width of this array is given by gridDim.x (not blockDim). The gridDim variable gives the dimensions of the grid in terms of blocks - and this lines up exactly with how our results array is set up.
Your posted code will fail if GRID_SIZE and BLOCK_SIZE are different (for example, if GRID_SIZE were smaller than BLOCK_SIZE, cuda-memcheck will show illegal accesses, and if GRID_SIZE is larger than BLOCK_SIZE then this indexing error will result in blocks overwriting each other's values in the output array) because of this mixup between blockDim and gridDim.
Also note that float operations typically only have around 5 decimal digits of precision. So small differences in the 5th or 6th decimal place may be attributable to order of operations differences when doing floating-point arithmetic. You can prove this to yourself by switching to double arithmetic.

reduction example using cuda and CUB

I'm trying to get my head around CUB, and having a bit of trouble following the (rather incomplete) worked examples. CUB looks like it is a fantastic tool, I just can't make sense of the example code.
I've built a simple proto-warp reduce example:
#include <cub/cub.cuh>
#include <cuda.h>
#include <vector>
using std::vector;
#include <iostream>
using std::cout;
using std::endl;
const int N = 128;
__global__ void sum(float *indata, float *outdata) {
typedef cub::WarpReduce<float,4> WarpReduce;
__shared__ typename WarpReduce::TempStorage temp_storage;
int id = blockIdx.x*blockDim.x+threadIdx.x;
if( id < 128 ) {
outdata[id] = WarpReduce(temp_storage).Sum(indata[id]);
}
}
int main() {
vector<float> y(N), sol(N);
float *dev_y, *dev_sol;
cudaMalloc((void**)&dev_y,N*sizeof(float));
cudaMalloc((void**)&dev_sol,N*sizeof(float));
for( int i = 0; i < N; i++ ) {
y[i] = (float)i;
}
cout << "input: ";
for( int i = 0; i < N; i++ ) cout << y[i] << " ";
cout << endl;
cudaMemcpy(&y[0],dev_y,N*sizeof(float),cudaMemcpyHostToDevice);
sum<<<1,32>>>(dev_y,dev_sol);
cudaMemcpy(dev_sol,&sol[0],N*sizeof(float),cudaMemcpyDeviceToHost);
cout << "output: ";
for( int i = 0; i < N; i++ ) cout << sol[i] << " ";
cout << endl;
cudaFree(dev_y);
cudaFree(dev_sol);
return 0;
}
which returns all zeros.
I'm aware that this code would return a reduction that was banded with every 32nd element being the sum of a warp and the other elements being undefined - I just want to get a feel for how CUB works. Can someone point out what I'm doing wrong?
(also, does CUB deserve its own tag yet?)
Your cudaMemcpy arguments are back to front, the destination comes first (to be consistent with memcpy).
cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind )
See the API reference for more info.

How do I copy the row_indices from a COO matrix to a thrust vector in CUSP

I have been trying to copy the row indices, column indices and values of a COO matrix into separate thrust vectors, but I find that I am unable to do so.
Below is the code
cusp::coo_matrix <unsigned int, float, cusp::device_memory> *sim_mat;
sim_mat = new cusp::coo_matrix <unsigned int, float, cusp::device_memory>;
/* Code to fill up sim_mat: runs fine
...
*/
{
thrust::device_ptr <unsigned int> d_rows = &((sim_mat->row_indices));
thrust::device_ptr <unsigned int> d_cols = &((sim_mat->column_indices)[0]);
thrust::device_ptr <float> d_vals = &((sim_mat->values)[0]);
unsigned int size_nn = (sim_mat->row_indices).size();
thrust::device_vector <unsigned int> d_Rows;
thrust::device_vector <float> d_Vals;
thrust::device_vector <unsigned int> reduced_Rows;
// Code fails below this point
thrust::copy_n (d_rows, size_nn, d_Rows.begin());
thrust::copy_n (d_vals, size_nn, d_Vals.begin());
cout << size_nn << std::endl;
if (!(sim_mat->is_sorted_by_row()))
thrust::sort_by_key(d_Rows.begin(), d_Rows.end(), d_Vals.begin());
thrust::reduce_by_key(d_Rows.begin(), d_Rows.end(), d_Vals.begin(), reduced_Rows.begin(), sim_row->begin());
}
Ithe sim_row is a thrust vector pointer that has been allocated memory in some previous code and not relevant here.
The code compiles, but fails at run time with the error:
terminate called after throwing an instance of 'thrust::system::system_error'
what(): invalid argument
Aborted (core dumped)
Could someone tell me what am I doing wrong?
Thanks
Akshay
There are several errors in your coding. As already pointed out, your method of accessing the row indices, column indices and values of the coo matrix won't work. Furthermore, you can't create thrust vectors like d_Rows, d_Vals of zero size, and then copy other vectors to them.
the following code works for me and illustrates one way of extracting the row indices, column indices, and values into separate thrust vectors:
#include <stdio.h>
#include <cusp/verify.h>
#include <cusp/array2d.h>
#include <cusp/coo_matrix.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
int main()
{
typedef cusp::device_memory MemorySpace;
// initial matrix
cusp::array2d<float, MemorySpace> E(4, 3);
E(0,0) = 1.000e+00; E(0,1) = 0.000e+00; E(0,2) = 0.000e+00;
E(1,0) = 0.000e+00; E(1,1) = 1.050e+01; E(1,2) = 0.000e+00;
E(2,0) = 0.000e+00; E(2,1) = 0.000e+00; E(2,2) = 2.500e-01;
E(3,0) = 0.000e+00; E(3,1) = 2.505e+02; E(3,2) = 0.000e+00;
cusp::coo_matrix<int, float, MemorySpace> coo(E);
if (!cusp::is_valid_matrix(coo)) {printf("Invalid COO\n"); return 1;}
thrust::device_vector<int> row_ind(coo.row_indices.size());
thrust::device_vector<int> col_ind(coo.column_indices.size());
thrust::device_vector<float> values(coo.values.size());
thrust::copy(coo.row_indices.begin(), coo.row_indices.end(), row_ind.begin());
thrust::copy(coo.column_indices.begin(), coo.column_indices.end(), col_ind.begin());
thrust::copy(coo.values.begin(), coo.values.end(), values.begin());
thrust::host_vector<int> h_row_ind = row_ind;
thrust::host_vector<int> h_col_ind = col_ind;
thrust::host_vector<float> h_values = values;
printf("COO row indices: \n");
for (int i = 0; i < h_row_ind.size(); i++)
printf("%d \n", h_row_ind[i]);
printf("COO column indices: \n");
for (int i = 0; i < h_col_ind.size(); i++)
printf("%d \n", h_col_ind[i]);
printf("COO values: \n");
for (int i = 0; i < h_values.size(); i++)
printf("%f \n", h_values[i]);
return 0;
}