In cuda, is it possible to write dense array from sparse array with expected sequence? - cuda

There is array1 that represent 0 or 1 (for each thread block):
bool array1[]: [1, 1, 0, 0, 1, 1]
Each thread in thread block accesses array1 by using threadIdx.x.
And I need to build a shared dense array2 in which each value is the ID of a thread whose entry in array1 is 1:
__shared__ bool array2[] (thread ID) : [0, 1, 4, 5]
It seems that, at least, I need atomicAdd() operation to index array2.
Even with atomicAdd(), I think that it is hard to make array2 like above sequence
(0, 1, 4, 5).
Is it possible to make array2 from array1 in cuda (for each thread block)?

You can use coalesced groups.
Suppose the Boolean each thread read from array1 is threadIsIn:
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// Per-warp compaction: every thread whose flag is set stores its thread ID in
// the next free slot of its own warp's 32-entry segment of array2.
// NOTE(review): threadIsIn and array2 are not declared in this fragment —
// presumably provided by the enclosing kernel; confirm.
uint32_t tid = threadIdx.x;
const uint32_t warpLength = 32;
uint32_t warpIdx = tid / warpLength;
if (threadIsIn){
// group made of the threads that are active together inside this branch
auto active = cg::coalesced_threads();
// rank within that group plus the start of this warp's segment; because each
// warp starts at warpIdx * warpLength, the output of different warps is not
// contiguous in array2
uint32_t idx = active.thread_rank() + warpIdx * warpLength;
array2[idx] = tid;
}
Edit
solution with multiple warps in a block:
the first warp of the block will prepare the shared array for the rest of warps in the block, this makes the other warps to wait for the first warp to finish.
// Whole-block compaction carried out by warp 0 only: per iteration it handles
// one warp-sized slice of the block's flags and appends the matching thread IDs
// to array2, keeping the running output position in startIdx.
// NOTE(review): threadIsIn, arr1, arr1Idx, array2 and blockSize are not declared
// in this fragment — presumably defined by the enclosing kernel; confirm.
// NOTE(review): thread_block / this_thread_block appear without the cg:: prefix
// used elsewhere in this answer — confirm a matching using-declaration exists.
thread_block block = this_thread_block();
uint32_t tid = threadIdx.x;
const uint32_t warpLength = 32;
uint32_t warpIdx = tid / warpLength;
// next free position in array2
uint32_t startIdx = 0;
// thread ID represented by this lane in the current slice
uint32_t tidToWrite = tid;
// number of warp-sized slices in the block
uint32_t maxItr = blockSize / warpLength;
uint32_t itr = 0;
while (warpIdx == 0 && itr < maxItr){
auto warp = cg::coalesced_threads();
auto warpMask = warp.ballot(threadIsIn); // one bit per group member, set where threadIsIn is true
uint32_t trueThreadsSize = __popc(warpMask); // counts the number of bits that are set to 1
if(threadIsIn){
auto active = cg::coalesced_threads();
// active.size() has the same value as trueThreadsSize
array2[startIdx + active.thread_rank()] = tidToWrite;
}
// advance the output position past the entries written for this slice
startIdx += trueThreadsSize;
tidToWrite += warpLength;
++itr;
// load the flag of the next slice
// NOTE(review): these two statements also run after the final slice, so arr1
// is read one slice beyond the last one processed — confirm arr1 is large
// enough or guard the read.
arr1Idx += warpLength;
threadIsIn = arr1[arr1Idx];
}
// all warps wait here until warp 0 has finished filling array2
block.sync();

This is in a general category of problems called stream compaction. The canonical approach is to perform a prefix sum (scan operation) on a processed version of your data (converting the kept values to 1, the discarded values to 0), then use that prefix sum as the index to write to, in the output array.
CUB provides a convenient block-level scan operation, so we don't have to write our own. Thereafter, the indexed copy is trivial:
$ cat t1465.cu
#include <cub/cub.cuh>
#include <iostream>
#include <cstdlib>
const int nTPB = 1024;
const int ds = nTPB;
// Block-level stream compaction.
// Writes the index of every thread whose data[] flag is true into result[],
// densely packed and in ascending thread order, and stores the number of
// indices written in *data_size.
// Launch layout: a 1D block of exactly nTPB threads; data must hold nTPB flags
// and result must have room for nTPB ints.
__global__ void BlockCompactKernel(bool *data, int *result, int *data_size)
{
  // CUB block-wide scan on int, specialised for nTPB threads
  typedef cub::BlockScan<int, nTPB> BlockScan;
  // scratch space the scan needs in shared memory
  __shared__ typename BlockScan::TempStorage temp_storage;
  // this thread's flag
  const bool keep = data[threadIdx.x];
  // one scan item per thread: 1 for a kept element, 0 otherwise
  int slot[1] = { keep ? 1 : 0 };
  // the exclusive prefix sum turns the 0/1 flags into output positions
  BlockScan(temp_storage).ExclusiveSum(slot, slot);
  if (keep) {
    result[slot[0]] = threadIdx.x;
  }
  // the last thread's exclusive sum plus its own flag is the total count
  if (threadIdx.x == nTPB-1) {
    *data_size = slot[0] + (keep ? 1 : 0);
  }
}
// Prints a message for a failed CUDA runtime call and terminates the program.
static void checkCuda(cudaError_t err, const char *what){
  if (err != cudaSuccess){
    std::cerr << what << ": " << cudaGetErrorString(err) << std::endl;
    std::exit(EXIT_FAILURE);}
}
// Host driver: generates ds random flags, compacts them on the GPU with
// BlockCompactKernel (one block of nTPB threads) and prints both the flags and
// the resulting list of indices.
int main(){
  bool *d_data = 0, *data = new bool[ds];
  int data_size = 0, *d_data_size = 0, *d_result = 0, *result = new int[ds];
  checkCuda(cudaMalloc(&d_data_size, sizeof(d_data_size[0])), "cudaMalloc(d_data_size)");
  checkCuda(cudaMalloc(&d_result, ds*sizeof(d_result[0])), "cudaMalloc(d_result)");
  for (int i = 0; i < ds; i++) data[i] = (rand() > (RAND_MAX/2))?true:false;
  std::cout << "Original data:" << std::endl;
  for (int i=0; i < ds; i++) std::cout << (int)data[i] << ",";
  checkCuda(cudaMalloc(&d_data, ds*sizeof(d_data[0])), "cudaMalloc(d_data)");
  checkCuda(cudaMemcpy(d_data, data, ds*sizeof(d_data[0]), cudaMemcpyHostToDevice), "copy flags to device");
  BlockCompactKernel<<<1,nTPB>>>(d_data, d_result, d_data_size);
  // launch-configuration errors are only visible through the last-error state
  checkCuda(cudaGetLastError(), "BlockCompactKernel launch");
  // the blocking copies below also surface errors raised while the kernel ran
  checkCuda(cudaMemcpy(&data_size, d_data_size, sizeof(d_data_size[0]), cudaMemcpyDeviceToHost), "copy result size to host");
  checkCuda(cudaMemcpy(result, d_result, data_size*sizeof(d_result[0]), cudaMemcpyDeviceToHost), "copy result to host");
  std::cout << std::endl << "Compacted data:" << std::endl;
  for (int i=0; i < data_size; i++) std::cout << result[i] << ",";
  std::cout << std::endl;
  // release device and host buffers
  cudaFree(d_data);
  cudaFree(d_result);
  cudaFree(d_data_size);
  delete [] data;
  delete [] result;
  return 0;
}
$ nvcc -o t1465 t1465.cu
$ cuda-memcheck ./t1465
========= CUDA-MEMCHECK
Original data:
1,0,1,1,1,0,0,1,0,1,0,1,0,1,1,1,1,1,0,1,0,0,0,1,0,0,0,0,1,0,1,1,1,0,1,1,0,1,0,1,1,1,0,1,0,0,1,1,0,1,1,0,0,1,1,0,0,0,0,0,0,1,1,1,0,1,0,1,1,1,1,0,0,1,1,1,0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,1,1,1,0,1,0,1,1,1,0,0,1,1,0,1,1,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,1,1,1,0,1,0,0,0,1,0,0,1,0,1,0,1,0,1,0,1,0,0,1,1,1,1,1,1,0,0,1,0,0,0,0,1,1,1,0,0,1,0,1,0,1,0,1,0,0,1,1,0,0,0,1,1,0,1,1,0,1,0,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,1,0,1,1,1,0,1,1,0,0,1,1,0,0,1,0,0,0,1,1,0,1,0,0,0,1,0,1,0,0,1,1,1,0,0,1,1,1,0,1,0,1,1,1,0,1,0,0,1,1,0,0,0,1,1,1,0,1,0,0,0,1,0,1,0,0,0,1,1,1,1,0,1,0,1,1,1,1,0,1,1,0,1,1,1,0,1,0,0,1,0,0,1,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,1,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,1,1,0,0,1,0,0,1,1,1,1,0,1,0,0,1,0,0,0,1,0,0,1,0,1,1,0,0,1,1,1,0,0,0,1,0,1,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,0,1,0,1,1,1,1,1,0,0,1,1,1,0,0,0,1,0,1,1,1,0,0,0,0,1,1,0,0,1,1,1,0,1,0,0,1,1,1,1,0,0,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,0,0,1,1,1,1,0,1,1,1,1,0,1,0,1,1,1,1,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,1,1,0,0,1,0,0,1,1,0,0,1,1,1,0,0,1,1,1,1,1,0,1,0,0,1,0,1,1,0,0,1,1,1,0,1,1,0,0,1,1,1,0,0,0,1,1,0,1,1,0,0,0,1,1,1,1,0,1,1,0,0,0,0,1,1,1,1,1,0,0,0,1,1,1,0,0,1,0,1,1,0,0,1,0,1,1,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,1,0,0,0,1,1,0,1,0,1,0,1,1,0,1,1,0,0,1,1,1,1,1,0,1,0,1,0,1,0,0,0,0,0,1,0,1,1,0,1,0,0,1,0,1,0,1,1,1,1,1,0,0,1,1,0,1,0,0,1,0,0,1,1,0,0,1,0,0,1,0,1,0,1,1,1,0,1,1,1,0,0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,0,1,1,0,1,1,0,0,1,1,0,1,1,1,1,0,1,1,0,0,0,1,0,1,0,0,1,1,0,1,1,0,1,0,0,1,0,1,0,0,0,0,1,0,1,1,0,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,1,1,1,0,0,1,0,1,0,0,1,1,1,1,0,1,1,1,0,1,0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,1,1,1,0,1,0,0,1,0,0,0,0,0,1,1,0,1,1,1,0,0,1,1,1,0,1,1,1,1,1,0,1,1,1,1,0,0,1,0,0,0,0,0,1,0,0,1,1,0,1,1,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,1,1,0,0,1,1,0,1,0,1,1,1,0,1,1,1,0,0,1,0,1,0,0,1,
0,1,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,1,1,1,0,
Compacted data:
0,2,3,4,7,9,11,13,14,15,16,17,19,23,28,30,31,32,34,35,37,39,40,41,43,46,47,49,50,53,54,61,62,63,65,67,68,69,70,73,74,75,77,78,80,83,84,87,89,90,91,92,93,95,97,98,99,102,103,105,106,108,110,116,119,123,124,125,126,128,132,135,137,139,141,143,146,147,148,149,150,151,154,159,160,161,164,166,168,170,173,174,178,179,181,182,184,186,187,189,190,191,192,193,195,196,197,198,199,200,201,202,203,204,207,208,210,212,214,219,221,222,223,225,226,229,230,233,237,238,240,244,246,249,250,251,254,255,256,258,260,261,262,264,267,268,272,273,274,276,280,282,286,287,288,289,291,293,294,295,296,298,299,301,302,303,305,308,311,315,316,318,320,321,329,330,331,332,333,337,338,343,349,350,352,353,356,357,358,360,362,366,367,368,370,374,375,378,379,382,383,386,391,392,397,398,401,402,403,404,407,410,411,412,413,415,418,422,425,427,428,431,432,433,437,439,440,441,448,450,455,457,458,459,460,461,462,464,466,467,468,469,470,473,474,475,479,481,482,483,488,489,492,493,494,496,499,500,501,502,505,506,507,508,509,511,512,513,515,516,517,518,519,520,521,522,524,525,526,527,528,529,531,534,535,536,537,539,540,541,542,544,546,547,548,549,552,554,556,563,564,565,566,569,572,573,576,577,578,581,582,583,584,585,587,590,592,593,596,597,598,600,601,604,605,606,610,611,613,614,618,619,620,621,623,624,629,630,631,632,633,637,638,639,642,644,645,648,650,651,652,653,658,662,667,668,670,677,678,682,683,685,687,689,690,692,693,696,697,698,699,700,702,704,706,712,714,715,717,720,722,724,725,726,727,728,731,732,734,737,740,741,744,747,749,751,752,753,755,756,757,761,762,763,764,765,766,767,775,776,777,782,786,787,789,790,793,794,796,797,798,799,801,802,806,808,811,812,814,815,817,820,822,827,829,830,832,833,835,836,839,847,851,852,853,854,855,858,860,863,864,865,866,868,869,870,872,876,878,879,880,881,882,883,884,885,886,887,888,890,891,895,896,897,899,902,908,909,911,912,913,916,917,918,920,921,922,923,924,926,927,928,929,932,938,941,942,944,945,950,952,954,955,961,964,968,973,975,976,977,980,981,983,985,986,987
,989,990,991,994,996,999,1001,1002,1004,1008,1011,1014,1019,1020,1021,1022,
========= ERROR SUMMARY: 0 errors
$

Related

Search Minimum/Maximum from n Arrays parallel in CUDA (Reduction Problem)

Is there a performant way in CUDA to get out of multiple arrays (which exist in different structures)
to find the maximum/minimum in parallel? The structures are structured according to the Structure of Arrays format.
A simple idea would be to assign each array to a thread block, which is used to calculate the maximum/minimum using the parallel reduction approach. The problem here is the size of the shared memory, which is why I regard this approach as critical.
Another approach is to calculate the minimum/maximum separately for each array, but I think this is too slow.
// Structure-of-arrays container for a group of cubes: x, y and z each point to
// an array holding one value per cube.
struct Cube {
int* x;
int* y;
int* z;
// presumably the number of entries in each of the arrays — it is not assigned
// anywhere in this example; confirm
int size;
};
// Host-side illustration of the data layout: three Cube objects whose x/y/z
// arrays hold 100, 1047 and 5000 entries respectively.
// NOTE(review): the size member is never assigned and none of the allocations
// is released in this example.
int main() {
Cube* c1 = new Cube(); //c1 includes 100 Cubes (because of SOA)
c1-> x = new int[100];
c1-> y = new int[100];
c1 -> z = new int[100];
Cube* c2 = new Cube();
c2-> x = new int[1047];
c2-> y = new int[1047];
c2 -> z = new int[1047];
Cube* c3 = new Cube();
c3-> x = new int[5000];
c3-> y = new int[5000];
c3 -> z = new int[5000];
//My goal now is to find the smallest/largest x dimension of all cubes in c1, c2, ..., and cn,
//with one Kernel launch.
//So the smallest/largest x in c1, the smallest/largest x in c2 etc..
}
Does anyone know an efficient approach? Thanks.
A simple idea would be to assign each array to a thread block, which is used to calculate the maximum/minimum using the parallel reduction approach. The problem here is the size of the shared memory, which is why I regard this approach as critical.
There is no problem with shared memory size. You may wish to review Mark Harris' canonical parallel reduction tutorial and look at the later methods to understand how we can use a loop to populate shared memory, reducing values into shared memory as we go. Once the input loop is completed, then we begin the block-sweep phase of the reduction. This doesn't impose any special requirements on the shared memory per block.
Here's a worked example demonstrating both a thrust::reduce_by_key method (single call) and a CUDA block-segmented method (single kernel call):
$ cat t1535.cu
#include <iostream>
#include <thrust/reduce.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/functional.h>
#include <cstdlib>
// Two-argument max/min helpers. Both the arguments and the whole expansion are
// parenthesised so the macros stay correct inside larger expressions
// (e.g. 2 * IMAX(a,b) or IMAX(a+1, b)). Each argument may be evaluated twice,
// so do not pass expressions with side effects.
#define IMAX(x,y) (((x)>(y))?(x):(y))
#define IMIN(x,y) (((x)<(y))?(x):(y))
typedef int dtype;
const int ncubes = 3;
// Structure-of-arrays container: x, y and z each point to an array of dtype
// values, one entry per cube.
struct Cube {
dtype* x;
dtype* y;
dtype* z;
// presumably the number of entries in each array — confirm against the code
// that fills the structure
int size;
};
// Binary reduction functor for (max, min) pairs: merges two tuples into one
// whose first element is the larger of the two first elements and whose second
// element is the smaller of the two second elements.
struct my_f
{
  template <typename T1, typename T2>
  __host__ __device__
  thrust::tuple<dtype,dtype> operator()(T1 t1, T2 t2){
    // running maximum lives in slot 0, running minimum in slot 1
    const dtype hi = IMAX(thrust::get<0>(t1),thrust::get<0>(t2));
    const dtype lo = IMIN(thrust::get<1>(t1),thrust::get<1>(t2));
    return thrust::tuple<dtype,dtype>(hi, lo);
  }
};
// Identity elements of the reductions: MIN seeds the running maximum and MAX
// seeds the running minimum, so they are the lowest and highest int values
// (a seed of -1 would win against any all-negative segment).
const int MIN = -0x7FFFFFFF - 1;
const int MAX = 0x7FFFFFFF;
// Threads per block; also the length of the shared-memory scratch arrays.
const int BS = 512;
// Per-segment min/max reduction.
// Block b reduces the elements in[slen[b]] .. in[slen[b+1]-1] and writes the
// largest to max[b] and the smallest to min[b].
//   in   : concatenated input segments
//   max  : one output per block (segment maximum)
//   min  : one output per block (segment minimum)
//   slen : gridDim.x+1 ascending segment start offsets into in
// Launch layout: one block per segment and exactly BS threads per block (the
// tree reduction below halves BS, so BS must be a power of two). Uses
// 2*BS*sizeof(T) bytes of static shared memory.
// An empty segment yields max = MIN and min = MAX.
template <typename T>
__global__ void block_segmented_minmax_reduce(const T * __restrict__ in, T * __restrict__ max, T * __restrict__ min, const size_t * __restrict__ slen){
  __shared__ T smax[BS];
  __shared__ T smin[BS];
  const size_t my_seg_start = slen[blockIdx.x];
  const size_t my_seg_end = slen[blockIdx.x+1];
  // every thread first folds a strided subset of the segment into private values
  T my_max = MIN;
  T my_min = MAX;
  // idx is an absolute offset into in, so it is bounded by the segment END
  // offset; bounding it by the segment length would truncate every segment
  // that does not start at offset 0
  for (size_t idx = my_seg_start+threadIdx.x; idx < my_seg_end; idx += BS){
    T my_val = in[idx];
    my_max = IMAX(my_val, my_max);
    my_min = IMIN(my_val, my_min);}
  smax[threadIdx.x] = my_max;
  smin[threadIdx.x] = my_min;
  // shared-memory tree reduction; the barrier at the top of each step makes the
  // values written in the previous step visible
  for (int s = BS>>1; s > 0; s>>=1){
    __syncthreads();
    if (threadIdx.x < s){
      smax[threadIdx.x] = IMAX(smax[threadIdx.x], smax[threadIdx.x+s]);
      smin[threadIdx.x] = IMIN(smin[threadIdx.x], smin[threadIdx.x+s]);}
  }
  // thread 0 performed the final combine itself, so it can publish directly
  if (!threadIdx.x){
    max[blockIdx.x] = smax[0];
    min[blockIdx.x] = smin[0];}
}
// Host driver: builds the x arrays of ncubes cubes with random contents,
// concatenates them, and computes the per-cube maximum and minimum twice —
// once with thrust::reduce_by_key and once with the block-segmented kernel —
// printing both sets of results.
int main() {
  // data setup
  Cube *c = new Cube[ncubes];
  // the ncubes segment lengths followed by a 0, so that the exclusive scan
  // further down yields ncubes+1 offsets (the last one is the total length)
  thrust::host_vector<size_t> csize(ncubes+1);
  csize[0] = 100;
  csize[1] = 1047;
  csize[2] = 5000;
  csize[3] = 0;
  size_t ctot = 0;
  for (int i = 0; i < ncubes; i++) {
    c[i].size = (int)csize[i];
    c[i].x = new dtype[csize[i]];
    c[i].y = 0;
    c[i].z = 0;
    // fill with test data up front so no uninitialised memory is ever copied
    for (size_t j = 0; j < csize[i]; j++) c[i].x[j] = rand();
    ctot+=csize[i];}
  // method 1: thrust
  // concatenate
  thrust::host_vector<dtype> h_d(ctot);
  size_t start = 0;
  for (int i = 0; i < ncubes; i++) {thrust::copy_n(c[i].x, csize[i], h_d.begin()+start); start += csize[i];}
  // the per-cube host arrays are no longer needed once concatenated
  for (int i = 0; i < ncubes; i++) delete [] c[i].x;
  delete [] c;
  thrust::device_vector<dtype> d_d = h_d;
  // build flag vector: a 1 at every segment start, turned into a segment id
  // per element by the inclusive scan
  thrust::device_vector<int> d_f(d_d.size());
  thrust::host_vector<size_t> coff(csize.size());
  thrust::exclusive_scan(csize.begin(), csize.end(), coff.begin());
  thrust::device_vector<size_t> d_coff = coff;
  thrust::scatter(thrust::constant_iterator<int>(1), thrust::constant_iterator<int>(1)+ncubes, d_coff.begin(), d_f.begin());
  thrust::inclusive_scan(d_f.begin(), d_f.end(), d_f.begin());
  // min/max reduction
  thrust::device_vector<dtype> d_max(ncubes);
  thrust::device_vector<dtype> d_min(ncubes);
  thrust::reduce_by_key(d_f.begin(), d_f.end(), thrust::make_zip_iterator(thrust::make_tuple(d_d.begin(), d_d.begin())), thrust::make_discard_iterator(), thrust::make_zip_iterator(thrust::make_tuple(d_max.begin(), d_min.begin())), thrust::equal_to<int>(), my_f());
  thrust::host_vector<dtype> h_max = d_max;
  thrust::host_vector<dtype> h_min = d_min;
  std::cout << "Thrust Maxima: " <<std::endl;
  thrust::copy_n(h_max.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ","));
  std::cout << std::endl << "Thrust Minima: " << std::endl;
  thrust::copy_n(h_min.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ","));
  std::cout << std::endl;
  // method 2: CUDA kernel (block reduce), one block per cube
  block_segmented_minmax_reduce<<<ncubes, BS>>>(thrust::raw_pointer_cast(d_d.data()), thrust::raw_pointer_cast(d_max.data()), thrust::raw_pointer_cast(d_min.data()), thrust::raw_pointer_cast(d_coff.data()));
  // a kernel launch returns nothing: check the launch, then the execution
  cudaError_t err = cudaGetLastError();
  if (err == cudaSuccess) err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {std::cout << "CUDA error: " << cudaGetErrorString(err) << std::endl; return 1;}
  thrust::copy_n(d_max.begin(), ncubes, h_max.begin());
  thrust::copy_n(d_min.begin(), ncubes, h_min.begin());
  std::cout << "CUDA Maxima: " <<std::endl;
  thrust::copy_n(h_max.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ","));
  std::cout << std::endl << "CUDA Minima: " << std::endl;
  thrust::copy_n(h_min.begin(), ncubes, std::ostream_iterator<dtype>(std::cout, ","));
  std::cout << std::endl;
  return 0;
}
$ nvcc -o t1535 t1535.cu
$ ./t1535
Thrust Maxima:
2145174067,2147469841,2146753918,
Thrust Minima:
35005211,2416949,100669,
CUDA Maxima:
2145174067,2147469841,2146753918,
CUDA Minima:
35005211,2416949,100669,
$
For a small number of Cube objects, the thrust method is likely to be faster. It will tend to make better use of medium to large GPUs than the block method will. For a large number of Cube objects, the block method should also be fairly efficient.

CUDA Kernel not returning values

I am working with a server that has multiple GPUs. I am using openMP to launch a kernel over multiple GPUs at once. The problem I am seeing is that the Kernel I am running does not seem to update the values in the thrust device vectors it is passed. The code below should output a value of 1 for all elements in the device vectors but instead outputs a value of 0. The code compiles and runs and shows me that the kernel executes successfully.
I do not understand why this code is not behaving as expected.
#include <iostream>
#include <cmath>
#include <omp.h>
#include <vector>
#include <thrust/host_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/device_vector.h>
using namespace::std;
const long N_R1 = 100;
const long N_R2 = 100;
// Stores 1.0 in a, b, c and d at the computed flat index idx, unless idx / ND[0]
// reaches NR1[0] (or idx % ND[0] reaches ND[0]), in which case the thread exits
// without writing.
__global__ void kernel(long* ND, long* NR1,
float* a, float* b, float* c, float* d)
{
// Calculate Global index (Generic 3D block, 3D thread)
// NOTE(review): with a 1D grid of 1D blocks every .y/.z index is 0, so this
// expression collapses to threadIdx.x and is not unique across blocks.
long idx = ( blockIdx.x + blockIdx.y * gridDim.x * gridDim.y * blockIdx.z )
* ( threadIdx.z * ( blockDim.x*blockDim.y ) ) + threadIdx.y
* blockDim.x + threadIdx.x;
//Values correspond to 2D array limits
long idxR1 = idx / ND[0];
long idxR2 = idx % ND[0];
// threads that fall outside the limits do nothing
if(idxR1 >= NR1[0] || idxR2 >= ND[0])
{
return;
}
a[idx] =1.0;
b[idx] =1.0;
c[idx] =1.0;
d[idx] =1.0;
}
// Runs `kernel` on every CUDA device in parallel, one OpenMP thread per device.
// The N_R1 x N_R2 problem is split along region 2; each thread allocates its own
// slice of the four output arrays, launches the kernel on its device, copies
// the results back and prints them.
void kernel_wrapper()
{
  // GPU Count
  int num_gpus = 0;
  cudaGetDeviceCount(&num_gpus);
  // without a device there is nothing to run, and the stride computation below
  // would divide by zero
  if(num_gpus < 1)
  {
    cout << "No CUDA devices found" << endl;
    return;
  }
  omp_set_num_threads(num_gpus);
  //Calculate Dimensioning
  long D_total = N_R1 * N_R2;
  //Region 1 coordinates are loaded on to each GPU
  //Region 2 coordinates are divided up onto GPUs
  long R2_stride = ceil(float(N_R2)/float(num_gpus));
  //Distance arrays need to be split into whole sections of region 1.
  //(Distances size = N_R1 * N_R2) subset of distance size needs to be N_R1
  long D_stride = R2_stride * N_R1;
  #pragma omp parallel
  {
    // Get CPU thread number
    long cpu_thread_id = omp_get_thread_num();
    cudaSetDevice(cpu_thread_id);
    // Set up Local Arrays for distance and potential
    // Step 1: Calculate rough Array Limits
    // If array spaces divide evenly between threads then beginnings and endings can be calculated below
    long R2_begin = cpu_thread_id * R2_stride;
    long D_begin = cpu_thread_id * D_stride;
    long R2_end = R2_begin + R2_stride;
    long D_end = D_begin + D_stride;
    // Step 2: Check Ends are not out of bounds
    // The last thread in the calculation is likely to have array sizings that are out of bounds
    // if this is the case then the ends need to be clipped:
    if(R2_end >= N_R2)
    {
      R2_end = N_R2;
    }
    if(D_end >= D_total)
    {
      D_end = D_total;
    }
    // Local array sizes are (end - begin)
    long l_R2 = R2_end - R2_begin;
    long l_D = D_end - D_begin;
    float zero = 0.0f;
    // Create Region 2 potential components
    thrust::host_vector<float > a(l_D,zero);
    thrust::host_vector<float > b(l_D,zero);
    thrust::host_vector<float > c(l_D,zero);
    thrust::host_vector<float > d(l_D,zero);
    // single-element device copies of the limits read by the kernel
    long* p_NR1;
    long nr1 = N_R1;
    cudaMalloc( (void**)&p_NR1, sizeof(long) );
    cudaMemcpy( p_NR1, &nr1, sizeof(long), cudaMemcpyHostToDevice);
    long* p_NR2;
    cudaMalloc( (void**)&p_NR2, sizeof(long) );
    cudaMemcpy( p_NR2, &l_D, sizeof(long), cudaMemcpyHostToDevice);
    //Generate Device Side Data for region 2 potential components
    thrust::device_vector< float > d_a = a;
    thrust::device_vector< float > d_b = b;
    thrust::device_vector< float > d_c = c;
    thrust::device_vector< float > d_d = d;
    // Generate pointers to Device Side Data for region 2 potential components
    float* p_a = thrust::raw_pointer_cast(d_a.data());
    float* p_b = thrust::raw_pointer_cast(d_b.data());
    float* p_c = thrust::raw_pointer_cast(d_c.data());
    float* p_d = thrust::raw_pointer_cast(d_d.data());
    dim3 blocks = N_R1;
    dim3 threads = l_R2;
    kernel<<<blocks,threads>>>(p_NR2, p_NR1,
                               p_a, p_b, p_c, p_d);
    cudaDeviceSynchronize();
    // cudaGetLastError() clears the error state, so read it exactly once
    cudaError_t err = cudaGetLastError();
    if(err == cudaSuccess)
    {
      cout << "Kernel Successful!" << cudaGetErrorString(err) << endl;
      cin.ignore(1);
    }
    else
    {
      cout << "Kernel failed: " << cudaGetErrorString(err) << endl;
    }
    a = d_a;
    b = d_b;
    c = d_c;
    d = d_d;
    // the device scalars are not needed once the results are back on the host
    cudaFree(p_NR1);
    cudaFree(p_NR2);
    for(long j = 0; j != a.size(); j++)
    {
      cout << "a[" << j << "] = " << a[j] << endl;
    }
    for(long j = 0; j != b.size(); j++)
    {
      cout << "b[" << j << "] = " << b[j] << endl;
    }
    for(long j = 0; j != c.size(); j++)
    {
      cout << "c[" << j << "] = " << c[j] << endl;
    }
    // fourth array: d
    for(long j = 0; j != d.size(); j++)
    {
      cout << "d[" << j << "] = " << d[j] << endl;
    }
  }
  cin.ignore(1);
}
// Program entry point: runs the multi-GPU wrapper once and exits with status 0.
int main()
{
kernel_wrapper();
return 0;
}
Any help would be greatly appreciated.
Some of the output values are getting set to 1, some are not. The problem is due to this statement:
// Calculate Global index (Generic 3D block, 3D thread)
long idx = ( blockIdx.x + blockIdx.y * gridDim.x * gridDim.y * blockIdx.z )
* ( threadIdx.z * ( blockDim.x*blockDim.y ) ) + threadIdx.y
* blockDim.x + threadIdx.x;
That isn't what I would call a proper generic conversion of 3D grid/block to globally unique 1D index, which I assume is your intent. Let's just pick one example to prove that it is broken. Suppose you are launching a 1D grid of 1D blocks (which is what you are doing). Then all of the (block,thread)Idx.y and .z variables will all be zero. Only blockIdx.x and threadIdx.x can take on non-zero values in that launch configuration.
In that case your expression reduces to:
// Calculate Global index (Generic 3D block, 3D thread)
long idx = ( blockIdx.x + 0 * gridDim.x * gridDim.y * 0 )
* ( 0 * ( blockDim.x*blockDim.y ) ) + 0
* blockDim.x + threadIdx.x;
i.e. it reduces to:
long idx = threadIdx.x;
So the first (block-size) elements of your arrays (a,b,c,d) are getting set properly, the rest are not. Since threadIdx.x is not unique from one block to the next, this is not a proper globally-unique thread ID, and therefore each block is writing the same output locations, rather than each taking care of a separate part of the array.
So what is a possible (correct) generic 3D-to-1D index conversion?
That is answered here (and probably other places). This answer actually only converts a 3D grid plus 1D block configuration to a globally-unique ID, but it is sufficient for demonstration purposes of what is wrong in this code.
When I replace your in-kernel calculation of idx with that code, your kernel populates all array entries with 1.0 according to my testing.

sum vectors values with cuda C++

I am trying to sum the values of many vectors using CUDA C++. I found a solution for two vectors (below). As you can see, it can only add two vectors, but I want to generate many vectors of the same length dynamically.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
// Element-wise vector addition: c[k] = a[k] + b[k] for k in [0, n).
// One thread handles one element; launch with at least n threads in a 1D grid
// of 1D blocks. Surplus threads exit without touching memory.
__global__ void vecAdd(double *a, double *b, double *c, int n)
{
  // flat global index of this thread
  const int gid = blockIdx.x*blockDim.x+threadIdx.x;
  // threads beyond the end of the vectors have nothing to do
  if (gid >= n)
    return;
  c[gid] = a[gid] + b[gid];
}
// Host driver: fills two vectors with sin^2(i) and cos^2(i), adds them on the
// GPU with vecAdd and prints the mean of the result, which should equal 1
// within rounding error. Returns 0 on success and 1 on any allocation or CUDA
// failure.
int main( int argc, char* argv[] )
{
  // Size of vectors
  int n = 100000;
  // Size, in bytes, of each vector
  size_t bytes = n*sizeof(double);
  // Host input vectors and host output vector
  double *h_a = (double*)malloc(bytes);
  double *h_b = (double*)malloc(bytes);
  double *h_c = (double*)malloc(bytes);
  // Device input vectors and device output vector
  double *d_a = NULL;
  double *d_b = NULL;
  double *d_c = NULL;
  int rc = 0;
  if (h_a == NULL || h_b == NULL || h_c == NULL) {
    fprintf(stderr, "host allocation failed\n");
    rc = 1;
  }
  if (rc == 0) {
    // Allocate memory for each vector on GPU
    cudaError_t err = cudaMalloc(&d_a, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&d_b, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&d_c, bytes);
    int i;
    // Initialize vectors on host
    for( i = 0; i < n; i++ ) {
      h_a[i] = sin(i)*sin(i);
      h_b[i] = cos(i)*cos(i);
    }
    // Copy host vectors to device
    if (err == cudaSuccess) err = cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);
    // Number of threads in each thread block
    int blockSize = 1024;
    // Number of thread blocks in grid (integer ceiling division)
    int gridSize = (n + blockSize - 1) / blockSize;
    if (err == cudaSuccess) {
      // Execute the kernel; a launch returns nothing, so query the error state
      vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
      err = cudaGetLastError();
    }
    // Copy array back to host (blocking, so it also reports execution errors)
    if (err == cudaSuccess) err = cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost );
    if (err == cudaSuccess) {
      // Sum up vector c and print the result divided by n; this should equal 1
      // within error
      double sum = 0;
      for(i=0; i<n; i++)
        sum += h_c[i];
      printf("final result: %f\n", sum/n);
    } else {
      fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
      rc = 1;
    }
  }
  // Release device memory (freeing a null pointer is a no-op)
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  // Release host memory
  free(h_a);
  free(h_b);
  free(h_c);
  return rc;
}
Is there a way to do this for many vectors? My vectors size are:
#vector length
N = 1000
#number of vectors
i = 300000
v[i] = [1,2,..., N]
As result i need to get:
out[i]= [sum(v[1]), sum(v[2]),..., sum(v[i])]
Thanks for any advice.
Summing multiple vectors together in a fashion similar to the code you have shown (i.e. generating elementwise sums) is equivalent to summing the columns of a matrix. And this idea represents a sensible way to realize the solution.
We will treat your vectors as a matrix, where each vector is a row in the matrix. The CUDA kernel will assign one thread to each column, and will sum the elements of that column, producing a single number result. That single number result will become one element of the vector result of the entire problem.
Here is a fully worked example demonstrating one possible approach:
$ cat t2.cu
#include <iostream>
typedef double mt;
const int nTPB = 64;
// Column sums of a row-major matrix whose rows are the input vectors.
//   matrix        : n_vectors rows of vector_length elements each
//   sums          : vector_length outputs; sums[j] receives the sum of column j
// One thread handles one column; launch a 1D grid with at least vector_length
// threads. Surplus threads do nothing.
template <typename T>
__global__ void column_sum(T *matrix, T *sums, unsigned n_vectors, unsigned vector_length){
  unsigned idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < vector_length){
    T temp = 0;
    for (unsigned i = 0; i < n_vectors; i++)
      // widen before multiplying so the element offset cannot wrap at 2^32
      temp += matrix[(size_t)i*vector_length+idx];
    sums[idx] = temp;}
}
// Host driver: builds nvec vectors of length vlen as one row-major matrix
// (element j of every vector is j), sums the columns on the GPU and verifies
// that column j sums to nvec*j. Prints "no error" and returns 0 on success,
// returns -1 on a mismatch and 1 on a CUDA failure.
int main(){
  const unsigned vlen = 1000;
  const unsigned nvec = 300000;
  // element count computed in size_t so larger problem sizes cannot wrap
  const size_t total = (size_t)vlen*nvec;
  mt *h_matrix, *d_matrix = 0, *h_sums, *d_sums = 0;
  // create the desired number of vectors as a single matrix
  h_sums = new mt[vlen];
  h_matrix = new mt[total];
  cudaError_t err = cudaMalloc(&d_matrix, total*sizeof(mt));
  if (err == cudaSuccess) err = cudaMalloc(&d_sums, vlen*sizeof(mt));
  size_t count = 0;
  for (unsigned i = 0; i < nvec; i++)
    for (unsigned j = 0; j < vlen; j++)
      h_matrix[count++] = j;
  if (err == cudaSuccess) err = cudaMemcpy(d_matrix, h_matrix, total*sizeof(mt), cudaMemcpyHostToDevice);
  if (err == cudaSuccess){
    column_sum<<<(vlen+nTPB-1)/nTPB,nTPB>>>(d_matrix, d_sums, nvec, vlen);
    // a launch returns nothing; the error state carries launch failures
    err = cudaGetLastError();}
  if (err == cudaSuccess) err = cudaMemcpy(h_sums, d_sums, vlen*sizeof(mt), cudaMemcpyDeviceToHost);
  int rc = 0;
  if (err != cudaSuccess){
    std::cout << cudaGetErrorString(err) << std::endl;
    rc = 1;}
  else {
    for (unsigned i = 0; i < vlen; i++) if (h_sums[i] != ((mt)nvec)*i) {std::cout << " mismatch at " << i << " was: " << h_sums[i] << " should be: " << ((mt)nvec)*i << std::endl; rc = -1; break;}
    if (rc == 0) std::cout << cudaGetErrorString(err) << std::endl;}
  // release device and host buffers on every path
  cudaFree(d_matrix);
  cudaFree(d_sums);
  delete [] h_matrix;
  delete [] h_sums;
  return rc;
}
$ nvcc -o t2 t2.cu
$ cuda-memcheck ./t2
========= CUDA-MEMCHECK
no error
========= ERROR SUMMARY: 0 errors
$
Note that this methodology only creates as many threads on the GPU as there are vector elements (1000 in the above example). 1000 threads would be enough to keep only the smallest GPUs busy. However this algorithm will be efficient on most GPUs if your vector length is 10,000 or longer. If you'd like to explore creating more efficient algorithms for small problem sizes, you can study the idea of a classical parallel reduction.

cuda addvectors memory intuitive explanation

I have the following code:
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <ctime>
#include <vector>
#include <numeric>
// Returns a pseudo-random float in [0, 1], taken from the C library generator
// rand() and scaled by RAND_MAX.
float random_float(void)
{
  const float raw = static_cast<float>(rand());
  return raw / RAND_MAX;
}
// Host reference for the GPU computation: returns alpha*v1 + v2 element-wise.
// The result has v1.size() elements. v2 must be at least as long as v1;
// otherwise std::out_of_range is thrown (by the bounds-checked access below)
// instead of reading past the end of v2.
std::vector<float> add(float alpha, std::vector<float>& v1, std::vector<float>& v2 )
{
  std::vector<float> result(v1.size());
  for (unsigned int i = 0; i < result.size(); ++i)
  {
    // v1 is in range by construction of result; at() checks v2
    result[i]=alpha*v1[i]+v2.at(i);
  }
  return result;
}
// In-place update y[k] = alpha*x[k] + y[k] for k in [0, N).
// Each thread starts at its own flat index and then advances by the total
// number of launched threads (blockDim.x*gridDim.x), so any launch size covers
// all N elements.
__global__ void Addloop( int N, float alpha, float* x, float* y ) {
  const int first = blockIdx.x*blockDim.x + threadIdx.x;
  int pos = first;
  while ( pos < N ) {
    y[pos] = alpha*x[pos] + y[pos];
    pos += blockDim.x*gridDim.x;
  }
  /* one-element-per-thread alternative, kept disabled:
  if ( first < N )
  y[first] = alpha*x[first] + y[first];
  */
}
// Prints every element of values preceded by a single space, then a newline.
static void print_row( std::vector<float>& values )
{
  for (std::vector<float>::size_type k = 0; k < values.size(); ++k)
    std::cout << ' ' << values[k];
  std::cout << '\n';
}
// Host driver: creates two random input vectors, computes alpha*h1 + h2 on the
// host and on the GPU (Addloop launched with 1 block of 3 threads) and prints
// the inputs and both results, one vector per line.
int main( int argc, char** argv ) {
  float alpha = 0.3;
  // number of elements (10 here; 1<<18 is the larger alternative)
  int num_elements = 10;//1<<18;
  // generate random input on the host
  std::vector<float> h1_input(num_elements);
  std::vector<float> h2_input(num_elements);
  for(int k = 0; k < num_elements; ++k)
  {
    h1_input[k] = random_float();
    h2_input[k] = random_float();
  }
  print_row(h1_input);
  print_row(h2_input);
  // reference result computed on the host
  std::vector<float> host_result = add( alpha, h1_input, h2_input );
  print_row(host_result);
  // move input to device memory
  const size_t nbytes = sizeof(float) * num_elements;
  float *d1_input = 0;
  float *d2_input = 0;
  cudaMalloc((void**)&d1_input, nbytes);
  cudaMemcpy(d1_input, &h1_input[0], nbytes, cudaMemcpyHostToDevice);
  cudaMalloc((void**)&d2_input, nbytes);
  cudaMemcpy(d2_input, &h2_input[0], nbytes, cudaMemcpyHostToDevice);
  Addloop<<<1,3>>>( num_elements, alpha, d1_input, d2_input );
  // copy the result back to the host (the kernel updates its second vector in place)
  std::vector<float> device_result(num_elements);
  cudaMemcpy(&device_result[0], d2_input, nbytes, cudaMemcpyDeviceToHost);
  print_row(device_result);
  cudaFree(d1_input);
  cudaFree(d2_input);
  h1_input.clear();
  h2_input.clear();
  device_result.clear();
  std::cout << "DONE! \n";
  getchar();
  return 0;
}
I am trying to understand the gpu memory access. The kernel, for reasons of simplicity, is launched as Addloop<<<1,3>>>. I am trying to understand how this code is working by imagining the for loops working on the gpu as instances. More specifically, I imagine the following instances but they do not help.
Instance 1:
for( i = 0; i < N; i += 3*1 ) // ( i += 0*1 --> i += 3*1 after Eric's comment)
y[i] = alpha*x[i] + y[i];
Instance 2:
for( i = 1; i < N; i += 3*1 )
y[i] = alpha*x[i] + y[i];
Instance 3:
for( i = 2; i < N; i += 3*1 )
y[i] = alpha*x[i] + y[i];
Looking inside of every loop it does not make any sense in the logic of adding two vectors. Can some one help?
The reason I am adopting this logic of instances is because it is working well in the case of the code inside the kernel which is in comments.
If these thoughts are correct what would be the instances in case we have multiple blocks inside the grid? In other words what would be the i values and the update rates (+=updaterate) in some examples?
PS: The kernel code borrowed from here.
UPDATE:
After Eric's answer, I think the execution for N = 15 (i.e. the number of elements) goes like this (correct me if I am wrong):
For the instance 1 above i = 0 , 3, 6, 9, 12 which computes the corresponding y[i] values.
For the instance 2 above i = 1 , 4, 7, 10, 13 which computes the corresponding remaining y[i] values.
For the instance 3 above i = 2 , 5, 8, 11, 14 which computes the rest y[i] values.
Your blockDim.x is 3 and gridDim.x is 1 according to your setup <<<1,3>>>. So in each thread (you call it instance), it should be i+=3*1
update
With the for loop you can compute 15 elements using only 3 threads. In general, a limited number of threads can do an arbitrary ("infinite") amount of work. More work per thread can also improve performance by amortizing the launch overhead and hiding instruction stalls.
Another advantage is you could use fixed number of threads/blocks to do work of various sizes, thus requires less tuning.

Cuda call won't allocate more than 8 threads per block, regardless of specification

I am creating a parallel version of the Sieve of Eratosthenes in c++. The problem is my kernel call (reduce0) seems to only ever assign 8 threads per block instead of the 256 I specify. Since even the first CUDA version allows 512 threads per block, there must be some error in my code for it. Any help would be appreciated.
#include <iostream>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cutil.h>
//#include <sieve_kernel.cu>
using namespace std;
////////////////////////////////////////////////////
int psum(int arg[], double n);
int call_kernel(int primes[], int n);
int findsmallest(int arg[], int f, double n);
int sieve(int n);
__global__ void reduce0(int *g_idata, int *g_odata);
////////////////////////////////////////////////////
// Program entry point: counts the primes below 2^8 with sieve() and prints the
// count under a "# primes" heading.
int main(){
  const int limit = static_cast<int>(pow(2.0, 8));
  const int total = sieve(limit);
  cout << "# primes" << endl;
  cout << total << endl;
  return 0;
}
///////////////////////////////////////////////////
// Block-level sum reduction with interleaved addressing.
// Every block loads blockDim.x consecutive ints of g_idata into shared memory,
// adds them up and writes the block total to g_odata[blockIdx.x].
// Launch requirements:
//   - 1D blocks;
//   - blockDim.x * sizeof(int) bytes of dynamic shared memory passed as the
//     third launch parameter (sdata is an extern __shared__ array);
//   - g_idata holds at least gridDim.x * blockDim.x elements — there is no
//     length parameter, so the load cannot be bounds-checked here.
__global__ void reduce0(int *g_idata, int *g_odata) {
  extern __shared__ int sdata[];
  // each thread loads one element from global to shared mem
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
  sdata[tid] = g_idata[i];
  __syncthreads();
  // do reduction in shared mem
  for (unsigned int s = 1; s < blockDim.x; s *= 2) { // step = s x 2
    // only thread IDs divisible by the step participate; the partner check
    // keeps the read inside the block when blockDim.x is not a power of two
    if (tid % (s*2) == 0 && tid + s < blockDim.x) {
      sdata[tid] += sdata[tid + s];
    }
    // reached by every thread of the block, outside the conditional
    __syncthreads();
  }
  // write result for this block to global mem
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////////////////////////////////
// Copies the n flags in primes to the device, runs reduce0 on them with one
// block of 256 threads, copies the output buffer back over primes and returns
// its first element.
// NOTE(review): reduce0 declares an extern __shared__ array, but this launch
// passes no dynamic shared memory size (third launch parameter).
// NOTE(review): none of the CUDA calls is checked for errors.
int call_kernel(int *primes, int n){
// Allocate and copy device arrays
int *g_idevice;
int *g_odevice;
int size = n * sizeof(int);
cudaMalloc(&g_idevice, size);
cudaMemcpy(g_idevice, primes, size, cudaMemcpyHostToDevice);
cudaMalloc(&g_odevice, size);
// Specify grid/block dimensions and invoke the kernel
dim3 dimGrid(1,1);
dim3 dimBlock(256,1);
reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
// Copy device data back to primes
// (this overwrites the caller's array; the kernel only writes one output
// element per block, and a single block is launched here)
cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
//for (int i = 0; i < n; i++) {
// cout << i << " " << primes[i] << endl;
//}
int total = primes[0];
cudaFree(g_idevice);
cudaFree(g_odevice);
return total;
}
/////////////////////////////////////////////////////////////////////
// Returns the smallest index i in [f, n) with arg[i] == 1, or the first index
// that is not below n when no such element exists (f itself if f >= n).
// The bound is tested before the element is read, so arg[n] is never accessed.
int findsmallest(int arg[], int f, double n){
  int i = f;
  while(i < n && arg[i] != 1) {
    i++;
  }
  return i;
}
//////////////////////////////////////////////////////////////////////
// Counts the entries equal to 1 in arg[2] .. arg[n-1]; indices 0 and 1 are
// never inspected.
int psum(int arg[], double n){
  int count = 0;
  for (int idx = 2; idx < n; ++idx) {
    count += (arg[idx] == 1) ? 1 : 0;
  }
  return count;
}
/////////////////////////////////////////////////////////////////////////
// Sieve of Eratosthenes over [0, n): marks composites on the host, then lets
// call_kernel() sum the remaining prime flags on the GPU and returns that sum.
// Returns 0 for n < 2. The flag array is sized by n and released before
// returning.
int sieve(int n){
  // nothing below 2 is prime; this also keeps the two writes below in bounds
  if (n < 2) return 0;
  int* primes = new int[n];
  for(int i = 0; i < n; i++){
    primes[i] = 1;
  }
  primes[0] = primes[1] = 0;
  int k = 2;
  while (k * k < n){
    // strike out the multiples of k, starting at k*k
    for (int mult = k * k; mult < n; mult += k) {
      primes[mult] = 0;
    }
    // continue with the next number still marked as prime
    k = findsmallest(primes,k+1, n);
  }
  int total = call_kernel(primes, n);
  delete [] primes;
  return total;
}
Your kernel is using dynamically allocated shared memory, but the kernel launch does not include any allocation, so the result is the kernel will be aborting because of illegal memory operations on that shared memory buffer. You should find it works if you modify this part of call_kernel as follows:
// Specify grid/block dimensions and invoke the kernel
dim3 dimGrid(1,1);
dim3 dimBlock(256,1);
// dynamic shared memory size: one int per thread of the block
size_t shmsize = size_t(dimBlock.x * dimBlock.y * dimBlock.z) * sizeof(int);
// the third launch parameter is the number of bytes of dynamic shared memory
reduce0<<<dimGrid, dimBlock, shmsize>>>(g_idevice, g_odevice);
If you had included some basic error checking around the kernel launch, perhaps like this:
// Error-checking illustration around the launch and the copy back to the host.
reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
// a launch has no return value; peek at the error state without clearing it,
// then read (and clear) it for the message
if (cudaPeekAtLastError() != cudaSuccess) {
cout << "kernel launch error: " << cudaGetErrorString(cudaGetLastError()) << endl;
}
// Copy device data back to primes
// the return value of the copy is checked as well
cudaError_t err = cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
cout << "CUDA error: " << cudaGetErrorString(err) << endl;
}
it would have been immediately obvious that the kernel launch or execution was failing with an error.