cuda addvectors memory intuitive explanation - cuda

I have the following code and
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <ctime>
#include <vector>
#include <numeric>
float random_float(void)
{
return static_cast<float>(rand()) / RAND_MAX;
}
std::vector<float> add(float alpha, std::vector<float>& v1, std::vector<float>& v2 )
{ /*Do quick size check on vectors before proceeding*/
std::vector<float> result(v1.size());
for (unsigned int i = 0; i < result.size(); ++i)
{
result[i]=alpha*v1[i]+v2[i];
}
return result;
}
__global__ void Addloop( int N, float alpha, float* x, float* y ) {
int i;
int i0 = blockIdx.x*blockDim.x + threadIdx.x;
for( i = i0; i < N; i += blockDim.x*gridDim.x )
y[i] = alpha*x[i] + y[i];
/*
if ( i0 < N )
y[i0] = alpha*x[i0] + y[i0];
*/
}
int main( int argc, char** argv ) {
float alpha = 0.3;
// create array of 256k elements
int num_elements = 10;//1<<18;
// generate random input on the host
std::vector<float> h1_input(num_elements);
std::vector<float> h2_input(num_elements);
for(int i = 0; i < num_elements; ++i)
{
h1_input[i] = random_float();
h2_input[i] = random_float();
}
for (std::vector<float>::iterator it = h1_input.begin() ; it != h1_input.end(); ++it)
std::cout << ' ' << *it;
std::cout << '\n';
for (std::vector<float>::iterator it = h2_input.begin() ; it != h2_input.end(); ++it)
std::cout << ' ' << *it;
std::cout << '\n';
std::vector<float> host_result;//(std::vector<float> h1_input, std::vector<float> h2_input );
host_result = add( alpha, h1_input, h2_input );
for (std::vector<float>::iterator it = host_result.begin() ; it != host_result.end(); ++it)
std::cout << ' ' << *it;
std::cout << '\n';
// move input to device memory
float *d1_input = 0;
cudaMalloc((void**)&d1_input, sizeof(float) * num_elements);
cudaMemcpy(d1_input, &h1_input[0], sizeof(float) * num_elements, cudaMemcpyHostToDevice);
float *d2_input = 0;
cudaMalloc((void**)&d2_input, sizeof(float) * num_elements);
cudaMemcpy(d2_input, &h2_input[0], sizeof(float) * num_elements, cudaMemcpyHostToDevice);
Addloop<<<1,3>>>( num_elements, alpha, d1_input, d2_input );
// copy the result back to the host
std::vector<float> device_result(num_elements);
cudaMemcpy(&device_result[0], d2_input, sizeof(float) * num_elements, cudaMemcpyDeviceToHost);
for (std::vector<float>::iterator it = device_result.begin() ; it != device_result.end(); ++it)
std::cout << ' ' << *it;
std::cout << '\n';
cudaFree(d1_input);
cudaFree(d2_input);
h1_input.clear();
h2_input.clear();
device_result.clear();
std::cout << "DONE! \n";
getchar();
return 0;
}
I am trying to understand the gpu memory access. The kernel, for reasons of simplicity, is launched as Addloop<<<1,3>>>. I am trying to understand how this code is working by imagining the for loops working on the gpu as instances. More specifically, I imagine the following instances but they do not help.
Instance 1:
for( i = 0; i < N; i += 3*1 ) // ( i += 0*1 --> i += 3*1 after Eric's comment)
y[i] = alpha*x[i] + y[i];
Instance 2:
for( i = 1; i < N; i += 3*1 )
y[i] = alpha*x[i] + y[i];
Instance 3:
for( i = 3; i < N; i += 3*1 )
y[i] = alpha*x[i] + y[i];
Looking inside of every loop it does not make any sense in the logic of adding two vectors. Can some one help?
The reason I am adopting this logic of instances is because it is working well in the case of the code inside the kernel which is in comments.
If these thoughts are correct what would be the instances in case we have multiple blocks inside the grid? In other words what would be the i values and the update rates (+=updaterate) in some examples?
PS: The kernel code borrowed from here.
UPDATE:
After Eric's answer I think the execution for N = 15, e.i the number of elements, goes like this (correct me if I am wrong):
For the instance 1 above i = 0 , 3, 6, 9, 12 which computes the corresponding y[i] values.
For the instance 2 above i = 1 , 4, 7, 10, 13 which computes the corresponding remaining y[i] values.
For the instance 3 above i = 2 , 5, 8, 11, 14 which computes the rest y[i] values.

Your blockDim.x is 3 and gridDim.x is 1 according to your setup <<<1,3>>>. So in each thread (you call it instance), it should be i+=3*1
update
With the for loop you can compute 15 element using only 3 threads. Generally you can use limited number of threads to do "infinit" work. And more work per threads can improve the performance by reducing the launch overhead and hiding the instruction stalls.
Another advantage is you could use fixed number of threads/blocks to do work of various sizes, thus requires less tuning.

Related

In cuda, is it possible to write dense array from sparse array with expected sequence?

There is array1 that represent 0 or 1 (for each thread block):
bool array1[]: [1, 1, 0, 0, 1, 1]
Each thread in thread block accesses array1 by using threadIdx.x.
And, I need to make shared dense array2 (each value represents thread ID with '1' value from array1:
__shared__ bool array2[] (thread ID) : [0, 1, 4, 5]
It seems that, at least, I need atomicAdd() operation to index array2.
Even with atomicAdd(), I think that it is hard to make array2 like above sequence
(0, 1, 4, 5).
Is it possible to make array2 from array1 in cuda (for each thread block)?
you can coalesced groups:
suppose the read Boolean is threasIsIN:
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
uint32_t tid = threadIdx.x;
const uint32_t warpLength = 32;
uint32_t warpIdx = tid / warpLength;
if (threadIsIn){
auto active = cg::coalesced_threads();
uint32_t idx = active.thread_rank() + warpIdx * warpLength;
array2[idx] = tid;
}
Edit
solution with multiple warps in a block:
the first warp of the block will prepare the shared array for the rest of warps in the block, this makes the other warps to wait for the first warp to finish.
thread_block block = this_thread_block();
uint32_t tid = threadIdx.x;
const uint32_t warpLength = 32;
uint32_t warpIdx = tid / warpLength;
uint32_t startIdx = 0;
uint32_t tidToWrite = tid;
uint32_t maxItr = blockSize / warpLength;
uint32_t itr = 0;
while (warpIdx == 0 && itr < maxItr){
auto warp = cg::coalesced_threads();
auto warpMask = warp.ballot(threadIsIn); // the tid'th bit is set to 1 if threadIsIn is true for tid
uint32_t trueThreadsSize = __popc(warpMask); // counts the number of bits that are set to 1
if(threadIsIn){
auto active = cg::coalesced_threads();
// active.size() has the same value as trueThreadsSize
array2[startIdx + active.thread_rank()] = tidToWrite;
}
startIdx += trueThreadsSize;
tidToWrite += warpLength;
++itr;
arr1Idx += warpLength;
threadIsIn = arr1[arr1Idx];
}
block.sync();
This is in a general category of problems called stream compaction. The canonical approach is to perform a prefix sum (scan operation) on a processed version of your data (converting the kept values to 1, the discarded values to 0), then use that prefix sum as the index to write to, in the output array.
CUB provides a convenient block-level scan operation, so we don't have to write our own. Thereafter, the indexed copy is trivial:
$ cat t1465.cu
#include <cub/cub.cuh>
#include <iostream>
#include <cstdlib>
const int nTPB = 1024;
const int ds = nTPB;
__global__ void BlockCompactKernel(bool *data, int *result, int *data_size)
{
// Specialize BlockScan for a 1D block of nTPB threads on type int
typedef cub::BlockScan<int, nTPB> BlockScan;
// Allocate shared memory for BlockScan
__shared__ typename BlockScan::TempStorage temp_storage;
// Obtain a segment of consecutive items that are blocked across threads
int scan_data[1];
// load data
bool tmp = data[threadIdx.x];
// process data
scan_data[0] = (tmp)?1:0;
// scan data
// Collectively compute the block-wide exclusive prefix sum
BlockScan(temp_storage).ExclusiveSum(scan_data, scan_data);
// indexed copy
if (tmp) result[scan_data[0]] = threadIdx.x;
// optional: return result size
if (threadIdx.x == nTPB-1) *data_size = scan_data[0] + ((tmp)?1:0);
}
int main(){
bool *d_data, *data = new bool[ds];
int data_size, *d_data_size, *d_result, *result = new int[ds];
cudaMalloc(&d_data_size, sizeof(d_data_size[0]));
cudaMalloc(&d_result, ds*sizeof(d_result[0]));
for (int i = 0; i < ds; i++) data[i] = (rand() > (RAND_MAX/2))?true:false;
std::cout << "Original data:" << std::endl;
for (int i=0; i < ds; i++) std::cout << (int)data[i] << ",";
cudaMalloc(&d_data, ds*sizeof(d_data[0]));
cudaMemcpy(d_data, data, ds*sizeof(d_data[0]), cudaMemcpyHostToDevice);
BlockCompactKernel<<<1,nTPB>>>(d_data, d_result, d_data_size);
cudaMemcpy(&data_size, d_data_size, sizeof(d_data_size[0]), cudaMemcpyDeviceToHost);
cudaMemcpy(result, d_result, data_size*sizeof(d_result[0]), cudaMemcpyDeviceToHost);
std::cout << std::endl << "Compacted data:" << std::endl;
for (int i=0; i < data_size; i++) std::cout << result[i] << ",";
std::cout << std::endl;
}
$ nvcc -o t1465 t1465.cu
$ cuda-memcheck ./t1465
========= CUDA-MEMCHECK
Original data:
1,0,1,1,1,0,0,1,0,1,0,1,0,1,1,1,1,1,0,1,0,0,0,1,0,0,0,0,1,0,1,1,1,0,1,1,0,1,0,1,1,1,0,1,0,0,1,1,0,1,1,0,0,1,1,0,0,0,0,0,0,1,1,1,0,1,0,1,1,1,1,0,0,1,1,1,0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,1,1,1,0,1,0,1,1,1,0,0,1,1,0,1,1,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,1,1,1,0,1,0,0,0,1,0,0,1,0,1,0,1,0,1,0,1,0,0,1,1,1,1,1,1,0,0,1,0,0,0,0,1,1,1,0,0,1,0,1,0,1,0,1,0,0,1,1,0,0,0,1,1,0,1,1,0,1,0,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,1,0,1,1,1,0,1,1,0,0,1,1,0,0,1,0,0,0,1,1,0,1,0,0,0,1,0,1,0,0,1,1,1,0,0,1,1,1,0,1,0,1,1,1,0,1,0,0,1,1,0,0,0,1,1,1,0,1,0,0,0,1,0,1,0,0,0,1,1,1,1,0,1,0,1,1,1,1,0,1,1,0,1,1,1,0,1,0,0,1,0,0,1,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,1,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,1,1,0,0,1,0,0,1,1,1,1,0,1,0,0,1,0,0,0,1,0,0,1,0,1,1,0,0,1,1,1,0,0,0,1,0,1,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,0,1,0,1,1,1,1,1,0,0,1,1,1,0,0,0,1,0,1,1,1,0,0,0,0,1,1,0,0,1,1,1,0,1,0,0,1,1,1,1,0,0,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,0,0,1,1,1,1,0,1,1,1,1,0,1,0,1,1,1,1,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,1,1,0,0,1,0,0,1,1,0,0,1,1,1,0,0,1,1,1,1,1,0,1,0,0,1,0,1,1,0,0,1,1,1,0,1,1,0,0,1,1,1,0,0,0,1,1,0,1,1,0,0,0,1,1,1,1,0,1,1,0,0,0,0,1,1,1,1,1,0,0,0,1,1,1,0,0,1,0,1,1,0,0,1,0,1,1,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,1,0,0,0,1,1,0,1,0,1,0,1,1,0,1,1,0,0,1,1,1,1,1,0,1,0,1,0,1,0,0,0,0,0,1,0,1,1,0,1,0,0,1,0,1,0,1,1,1,1,1,0,0,1,1,0,1,0,0,1,0,0,1,1,0,0,1,0,0,1,0,1,0,1,1,1,0,1,1,1,0,0,0,1,1,1,1,1,1,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,0,1,1,0,1,1,0,0,1,1,0,1,1,1,1,0,1,1,0,0,0,1,0,1,0,0,1,1,0,1,1,0,1,0,0,1,0,1,0,0,0,0,1,0,1,1,0,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,1,1,1,0,0,1,0,1,0,0,1,1,1,1,0,1,1,1,0,1,0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,1,1,1,0,1,0,0,1,0,0,0,0,0,1,1,0,1,1,1,0,0,1,1,1,0,1,1,1,1,1,0,1,1,1,1,0,0,1,0,0,0,0,0,1,0,0,1,1,0,1,1,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,1,1,0,0,1,1,0,1,0,1,1,1,0,1,1,1,0,0,1,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,1,1,1,0,
Compacted data:
0,2,3,4,7,9,11,13,14,15,16,17,19,23,28,30,31,32,34,35,37,39,40,41,43,46,47,49,50,53,54,61,62,63,65,67,68,69,70,73,74,75,77,78,80,83,84,87,89,90,91,92,93,95,97,98,99,102,103,105,106,108,110,116,119,123,124,125,126,128,132,135,137,139,141,143,146,147,148,149,150,151,154,159,160,161,164,166,168,170,173,174,178,179,181,182,184,186,187,189,190,191,192,193,195,196,197,198,199,200,201,202,203,204,207,208,210,212,214,219,221,222,223,225,226,229,230,233,237,238,240,244,246,249,250,251,254,255,256,258,260,261,262,264,267,268,272,273,274,276,280,282,286,287,288,289,291,293,294,295,296,298,299,301,302,303,305,308,311,315,316,318,320,321,329,330,331,332,333,337,338,343,349,350,352,353,356,357,358,360,362,366,367,368,370,374,375,378,379,382,383,386,391,392,397,398,401,402,403,404,407,410,411,412,413,415,418,422,425,427,428,431,432,433,437,439,440,441,448,450,455,457,458,459,460,461,462,464,466,467,468,469,470,473,474,475,479,481,482,483,488,489,492,493,494,496,499,500,501,502,505,506,507,508,509,511,512,513,515,516,517,518,519,520,521,522,524,525,526,527,528,529,531,534,535,536,537,539,540,541,542,544,546,547,548,549,552,554,556,563,564,565,566,569,572,573,576,577,578,581,582,583,584,585,587,590,592,593,596,597,598,600,601,604,605,606,610,611,613,614,618,619,620,621,623,624,629,630,631,632,633,637,638,639,642,644,645,648,650,651,652,653,658,662,667,668,670,677,678,682,683,685,687,689,690,692,693,696,697,698,699,700,702,704,706,712,714,715,717,720,722,724,725,726,727,728,731,732,734,737,740,741,744,747,749,751,752,753,755,756,757,761,762,763,764,765,766,767,775,776,777,782,786,787,789,790,793,794,796,797,798,799,801,802,806,808,811,812,814,815,817,820,822,827,829,830,832,833,835,836,839,847,851,852,853,854,855,858,860,863,864,865,866,868,869,870,872,876,878,879,880,881,882,883,884,885,886,887,888,890,891,895,896,897,899,902,908,909,911,912,913,916,917,918,920,921,922,923,924,926,927,928,929,932,938,941,942,944,945,950,952,954,955,961,964,968,973,975,976,977,980,981,983,985,986,987,989,990,991,994,996,999,1001,1002,1004,1008,1011,1014,1019,1020,1021,1022,
========= ERROR SUMMARY: 0 errors
$

CUDA Kernel not returning values

I am working with a server that has multiple GPUs. I am using openMP to launch a kernel over multiple GPUs at once. The problem I am seeing is that the Kernel I am running does not seem to update the values in the thrust device vectors it is passed. The code below should output a value of 1 for all elements in the device vectors but instead outputs a value of 0. The code compiles and runs and shows me that the kernel executes successfully.
I do not understand why this code is not behaving as expected.
#include <iostream>
#include <cmath>
#include <omp.h>
#include <vector>
#include <thrust/host_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/device_vector.h>
using namespace::std;
const long N_R1 = 100;
const long N_R2 = 100;
__global__ void kernel(long* ND, long* NR1,
float* a, float* b, float* c, float* d)
{
// Calculate Global index (Generic 3D block, 3D thread)
long idx = ( blockIdx.x + blockIdx.y * gridDim.x * gridDim.y * blockIdx.z )
* ( threadIdx.z * ( blockDim.x*blockDim.y ) ) + threadIdx.y
* blockDim.x + threadIdx.x;
//Values correspond to 2D array limits
long idxR1 = idx / ND[0];
long idxR2 = idx % ND[0];
if(idxR1 >= NR1[0] || idxR2 >= ND[0])
{
return;
}
a[idx] =1.0;
b[idx] =1.0;
c[idx] =1.0;
d[idx] =1.0;
}
void kernel_wrapper()
{
// GPU Count
int num_gpus = 0;
cudaGetDeviceCount(&num_gpus);
omp_set_num_threads(num_gpus);
//Calculate Dimensioning
long D_total = N_R1 * N_R2;
//Region 1 coordinates are loaded on to each GPU
//Region 2 coordinates are divided up onto GPUs
long R2_stride = ceil(float(N_R2)/float(num_gpus));
//Distance arrays need to be split longo whole sections of region 1.
//(Distances size = N_R1 * N_R2) subset of distance size needs to be N_R1
long D_stride = R2_stride * N_R1;
#pragma omp parallel
{
// Get CPU thread number
long cpu_thread_id = omp_get_thread_num();
cudaSetDevice(cpu_thread_id);
// Set up Local Arrays for distance and potential
// Step 1: Calculate rough Array Limits
// If array spaces divide evenly between threads then beginnings and endings can be calculated below
long R2_begin = cpu_thread_id * R2_stride;
long D_begin = cpu_thread_id * D_stride;
long R2_end = R2_begin + R2_stride;
long D_end = D_begin + D_stride;
// Step 2: Check Ends are not out of bounds
// The last thread in the calculation is likely to have array sizings that are out of bounds
// if this is the case then the ends need to be clipped:
if(R2_end >= N_R2)
{
R2_end = N_R2;
}
if(D_end >= D_total)
{
D_end = D_total;
}
// Local aray sizes are (end - begin)
long l_R2 = R2_end - R2_begin;
long l_D = D_end - D_begin;
float zero = 0.0;
// Create Region 2 potential components
thrust::host_vector<float > a(l_D,zero);
thrust::host_vector<float > b(l_D,zero);
thrust::host_vector<float > c(l_D,zero);
thrust::host_vector<float > d(l_D,zero);
long* p_NR1;
long nr1 = N_R1;
cudaMalloc( (void**)&p_NR1, sizeof(long) );
cudaMemcpy( p_NR1, &nr1, sizeof(long), cudaMemcpyHostToDevice);
long* p_NR2;
cudaMalloc( (void**)&p_NR2, sizeof(long) );
cudaMemcpy( p_NR2, &l_D, sizeof(long), cudaMemcpyHostToDevice);
//Generate Device Side Data for region 2 potential components
thrust::device_vector< float > d_a = a;
thrust::device_vector< float > d_b = b;
thrust::device_vector< float > d_c = c;
thrust::device_vector< float > d_d = d;
// Generate pointers to Device Side Data for region 2 potential components
float* p_a = thrust::raw_pointer_cast(d_a.data());
float* p_b = thrust::raw_pointer_cast(d_b.data());
float* p_c = thrust::raw_pointer_cast(d_c.data());
float* p_d = thrust::raw_pointer_cast(d_d.data());
dim3 blocks = N_R1;
dim3 threads = l_R2;
kernel<<<blocks,threads>>>(p_NR2, p_NR1,
p_a, p_b, p_c, p_d);
cudaDeviceSynchronize();
if(cudaGetLastError() == cudaSuccess)
{
cout << "Kernel Successful!" << cudaGetErrorString(cudaGetLastError()) << endl;
cin.ignore(1);
}
a = d_a;
b = d_b;
c = d_c;
d = d_d;
for(long j = 0; j != a.size(); j++)
{
cout << "a[" << j << "] = " << a[j] << endl;
}
for(long j = 0; j != b.size(); j++)
{
cout << "b[" << j << "] = " << b[j] << endl;
}
for(long j = 0; j != c.size(); j++)
{
cout << "c[" << j << "] = " << c[j] << endl;
}
for(long j = 0; j != c.size(); j++)
{
cout << "c[" << j << "] = " << c[j] << endl;
}
}
cin.ignore(1);
}
int main()
{
kernel_wrapper();
return 0;
}
Any help would be greatly appreciated.
Some of the output values are getting set to 1, some are not. The problem is due to this statement:
// Calculate Global index (Generic 3D block, 3D thread)
long idx = ( blockIdx.x + blockIdx.y * gridDim.x * gridDim.y * blockIdx.z )
* ( threadIdx.z * ( blockDim.x*blockDim.y ) ) + threadIdx.y
* blockDim.x + threadIdx.x;
That isn't what I would call a proper generic conversion of 3D grid/block to globally unique 1D index, which I assume is your intent. Let's just pick one example to prove that it is broken. Suppose you are launching a 1D grid of 1D blocks (which is what you are doing). Then all of the (block,thread)Idx.y and .z variables will all be zero. Only blockIdx.x and threadIdx.x can take on non-zero values in that launch configuration.
In that case your expression reduces to:
// Calculate Global index (Generic 3D block, 3D thread)
long idx = ( blockIdx.x + 0 * gridDim.x * gridDim.y * 0 )
* ( 0 * ( blockDim.x*blockDim.y ) ) + 0
* blockDim.x + threadIdx.x;
i.e. it reduces to:
long idx = threadIdx.x;
So the first (block-size) elements of your arrays (a,b,c,d) are getting set properly, the rest are not. Since threadIdx.x is not unique from one block to the next, this is not a proper globally-unique thread ID, and therefore each block is writing the same output locations, rather than each taking care of a separate part of the array.
So what is a possible (correct) generic 3D-to-1D index conversion?
That is answered here (and probably other places). This answer actually only converts a 3D grid plus 1D block configuration to a globally-unique ID, but it is sufficient for demonstration purposes of what is wrong in this code.
When I replace your in-kernel calculation of idx with that code, your kernel populates all array entries with 1.0 according to my testing.

cufftSetStream causes garbage output. Am I doing something wrong?

According to the docs, the cufftSetStream() function
Associates a CUDA stream with a cuFFT plan. All kernel launches made during plan execution are now done through the associated stream [...until...] the stream is changed with another call to cufftSetStream().
Unfortunately, the results are turned into garbage. Here is an example that demonstrates this by performing a bunch of transforms two ways: once with each stream having its own dedicated plan, and once with a single plan being reused as the documentation above indicates. The former behaves as expected, the reused/cufftSetStream approach has errors in most of the transforms. This was observed on the two cards I've tried (GTX 750 ti, Titan X) on CentOS 7 linux with
Cuda compilation tools, release 7.0, V7.0.27; and release 7.5, V7.5.17.
EDIT :see the "FIX" comments below for one way to fix things.
#include <cufft.h>
#include <stdexcept>
#include <iostream>
#include <numeric>
#include <vector>
#define ck(cmd) if ( cmd) { std::cerr << "error at line " << __LINE__ << std::endl;exit(1);}
__global__
void fill_input(cufftComplex * buf, int batch,int nbins,int stride,int seed)
{
for (int i = blockDim.y * blockIdx.y + threadIdx.y; i< batch;i += gridDim.y*blockDim.y)
for (int j = blockDim.x * blockIdx.x + threadIdx.x; j< nbins;j += gridDim.x*blockDim.x)
buf[i*stride + j] = make_cuFloatComplex( (i+seed)%101 - 50,(j+seed)%41-20);
}
__global__
void check_output(const float * buf1,const float * buf2,int batch, int nfft, int stride, int * errors)
{
for (int i = blockDim.y * blockIdx.y + threadIdx.y; i< batch;i += gridDim.y*blockDim.y) {
for (int j = blockDim.x * blockIdx.x + threadIdx.x; j< nfft;j += gridDim.x*blockDim.x) {
float e=buf1[i*stride+j] - buf2[i*stride+j];
if (e*e > 1) // gross error
atomicAdd(errors,1);
}
}
}
void demo(bool reuse_plan)
{
if (reuse_plan)
std::cout << "Reusing the same fft plan with multiple stream via cufftSetStream ... ";
else
std::cout << "Giving each stream its own dedicated fft plan ... ";
int nfft = 1024;
int batch = 1024;
int nstreams = 8;
int nbins = nfft/2+1;
int nit=100;
size_t inpitch,outpitch;
std::vector<cufftComplex*> inbufs(nstreams);
std::vector<float*> outbufs(nstreams);
std::vector<float*> checkbufs(nstreams);
std::vector<cudaStream_t> streams(nstreams);
std::vector<cufftHandle> plans(nstreams);
for (int i=0;i<nstreams;++i) {
ck( cudaStreamCreate(&streams[i]));
ck( cudaMallocPitch((void**)&inbufs[i],&inpitch,nbins*sizeof(cufftComplex),batch) );
ck( cudaMallocPitch((void**)&outbufs[i],&outpitch,nfft*sizeof(float),batch));
ck( cudaMallocPitch((void**)&checkbufs[i],&outpitch,nfft*sizeof(float),batch) );
if (i==0 || reuse_plan==false)
ck ( cufftPlanMany(&plans[i],1,&nfft,&nbins,1,inpitch/sizeof(cufftComplex),&nfft,1,outpitch/sizeof(float),CUFFT_C2R,batch) );
}
// fill the input buffers and FFT them to get a baseline for comparison
for (int i=0;i<nstreams;++i) {
fill_input<<<20,dim3(32,32)>>>(inbufs[i],batch,nbins,inpitch/sizeof(cufftComplex),i);
ck (cudaGetLastError());
if (reuse_plan) {
ck (cufftExecC2R(plans[0],inbufs[i],checkbufs[i]));
}else{
ck (cufftExecC2R(plans[i],inbufs[i],checkbufs[i]));
ck( cufftSetStream(plans[i],streams[i]) ); // only need to set the stream once
}
ck( cudaDeviceSynchronize());
}
// allocate a buffer for the error count
int * errors;
cudaMallocHost((void**)&errors,sizeof(int)*nit);
memset(errors,0,sizeof(int)*nit);
/* FIX: an event can protect the plan internal buffers
by serializing access to the plan
cudaEvent_t ev;
cudaEventCreateWithFlags(&ev,cudaEventDisableTiming);
*/
// perform the FFTs and check the outputs on streams
for (int it=0;it<nit;++it) {
int k = it % nstreams;
ck( cudaStreamSynchronize(streams[k]) ); // make sure any prior kernels have completed
if (reuse_plan) {
// FIX: ck(cudaStreamWaitEvent(streams[k],ev,0 ) );
ck(cufftSetStream(plans[0],streams[k]));
ck(cufftExecC2R(plans[0],inbufs[k],outbufs[k]));
// FIX: ck(cudaEventRecord(ev,streams[k] ) );
}else{
ck(cufftExecC2R(plans[k],inbufs[k],outbufs[k]));
}
check_output<<<100,dim3(32,32),0,streams[k]>>>(outbufs[k],checkbufs[k],batch,nfft,outpitch/sizeof(float),&errors[it]);
ck (cudaGetLastError());
}
ck(cudaDeviceSynchronize());
// report number of errors
int errcount=0;
for (int it=0;it<nit;++it)
if (errors[it])
++errcount;
std::cout << errcount << " of " << nit << " transforms had errors\n";
for (int i=0;i<nstreams;++i) {
cudaFree(inbufs[i]);
cudaFree(outbufs[i]);
cudaStreamDestroy(streams[i]);
if (i==0 || reuse_plan==false)
cufftDestroy(plans[i]);
}
}
int main(int argc,char ** argv)
{
demo(false);
demo(true);
return 0;
}
Typical output
Giving each stream its own dedicated fft plan ... 0 of 100 transforms had errors
Reusing the same fft plan with multiple stream via cufftSetStream ... 87 of 100 transforms had errors
In order to reuse plans the way you want you need to manage cuFFT work area manually.
Each plan has a space for intermediate calculation results. If you want to use plan handle at the same time for two or more different plan executions you need to provide temporary buffer for each concurrent cufftExec* call.
You can do this by using cufftSetWorkArea - please have a look at section 3.7 in cuFFT documentation. Section 2.2 also would help to understand how it works.
Here's a worked example showing the changes to your code for this:
$ cat t1241.cu
#include <cufft.h>
#include <stdexcept>
#include <iostream>
#include <numeric>
#include <vector>
#define ck(cmd) if ( cmd) { std::cerr << "error at line " << __LINE__ << std::endl;exit(1);}
__global__
void fill_input(cufftComplex * buf, int batch,int nbins,int stride,int seed)
{
for (int i = blockDim.y * blockIdx.y + threadIdx.y; i< batch;i += gridDim.y*blockDim.y)
for (int j = blockDim.x * blockIdx.x + threadIdx.x; j< nbins;j += gridDim.x*blockDim.x)
buf[i*stride + j] = make_cuFloatComplex( (i+seed)%101 - 50,(j+seed)%41-20);
}
__global__
void check_output(const float * buf1,const float * buf2,int batch, int nfft, int stride, int * errors)
{
for (int i = blockDim.y * blockIdx.y + threadIdx.y; i< batch;i += gridDim.y*blockDim.y) {
for (int j = blockDim.x * blockIdx.x + threadIdx.x; j< nfft;j += gridDim.x*blockDim.x) {
float e=buf1[i*stride+j] - buf2[i*stride+j];
if (e*e > 1) // gross error
atomicAdd(errors,1);
}
}
}
void demo(bool reuse_plan)
{
if (reuse_plan)
std::cout << "Reusing the same fft plan with multiple stream via cufftSetStream ... ";
else
std::cout << "Giving each stream its own dedicated fft plan ... ";
int nfft = 1024;
int batch = 1024;
int nstreams = 8;
int nbins = nfft/2+1;
int nit=100;
size_t inpitch,outpitch;
std::vector<cufftComplex*> inbufs(nstreams);
std::vector<float*> outbufs(nstreams);
std::vector<float*> checkbufs(nstreams);
std::vector<cudaStream_t> streams(nstreams);
std::vector<cufftHandle> plans(nstreams);
// if plan reuse, set up independent work areas
std::vector<char *> wk_areas(nstreams);
for (int i=0;i<nstreams;++i) {
ck( cudaStreamCreate(&streams[i]));
ck( cudaMallocPitch((void**)&inbufs[i],&inpitch,nbins*sizeof(cufftComplex),batch) );
ck( cudaMallocPitch((void**)&outbufs[i],&outpitch,nfft*sizeof(float),batch));
ck( cudaMallocPitch((void**)&checkbufs[i],&outpitch,nfft*sizeof(float),batch) );
if (i==0 || reuse_plan==false)
ck ( cufftPlanMany(&plans[i],1,&nfft,&nbins,1,inpitch/sizeof(cufftComplex),&nfft,1,outpitch/sizeof(float),CUFFT_C2R,batch) );
}
if (reuse_plan){
size_t ws;
ck(cufftGetSize(plans[0], &ws));
for (int i = 0; i < nstreams; i++)
ck(cudaMalloc(&(wk_areas[i]), ws));
ck(cufftSetAutoAllocation(plans[0], 0));
ck(cufftSetWorkArea(plans[0], wk_areas[0]));
}
// fill the input buffers and FFT them to get a baseline for comparison
for (int i=0;i<nstreams;++i) {
fill_input<<<20,dim3(32,32)>>>(inbufs[i],batch,nbins,inpitch/sizeof(cufftComplex),i);
ck (cudaGetLastError());
if (reuse_plan) {
ck (cufftExecC2R(plans[0],inbufs[i],checkbufs[i]));
}else{
ck (cufftExecC2R(plans[i],inbufs[i],checkbufs[i]));
ck( cufftSetStream(plans[i],streams[i]) ); // only need to set the stream once
}
ck( cudaDeviceSynchronize());
}
// allocate a buffer for the error count
int * errors;
cudaMallocHost((void**)&errors,sizeof(int)*nit);
memset(errors,0,sizeof(int)*nit);
// perform the FFTs and check the outputs on streams
for (int it=0;it<nit;++it) {
int k = it % nstreams;
ck( cudaStreamSynchronize(streams[k]) ); // make sure any prior kernels have completed
if (reuse_plan) {
ck(cufftSetStream(plans[0],streams[k]));
ck(cufftSetWorkArea(plans[0], wk_areas[k])); // update work area pointer in plan
ck(cufftExecC2R(plans[0],inbufs[k],outbufs[k]));
}else{
ck(cufftExecC2R(plans[k],inbufs[k],outbufs[k]));
}
check_output<<<100,dim3(32,32),0,streams[k]>>>(outbufs[k],checkbufs[k],batch,nfft,outpitch/sizeof(float),&errors[it]);
ck (cudaGetLastError());
}
ck(cudaDeviceSynchronize());
// report number of errors
int errcount=0;
for (int it=0;it<nit;++it)
if (errors[it])
++errcount;
std::cout << errcount << " of " << nit << " transforms had errors\n";
for (int i=0;i<nstreams;++i) {
cudaFree(inbufs[i]);
cudaFree(outbufs[i]);
cudaFree(wk_areas[i]);
cudaStreamDestroy(streams[i]);
if (i==0 || reuse_plan==false)
cufftDestroy(plans[i]);
}
}
int main(int argc,char ** argv)
{
demo(false);
demo(true);
return 0;
}
$ nvcc -o t1241 t1241.cu -lcufft
$ ./t1241
Giving each stream its own dedicated fft plan ... 0 of 100 transforms had errors
Reusing the same fft plan with multiple stream via cufftSetStream ... 0 of 100 transforms had errors
$

Optimize vector matrix multiplication in cuda with large number of zeros

I am using the following kernel to optimize vector-matrix multiplication for the case where both the vector and the matrix have a large number of zeros. The use of this kernel may reduce the time taken for such a multiplication by up to half of the time taken by cublasSgemv, for the case where there are more than 90% zeros. But, it is still much longer than an equivalent blas gemm host call on Ubuntu 14.04
vec = 1 x m, mat = m x m and prod = 1 x m; all are in row-major order
m >= 5000
__global__ void calc_v_m(float *vec, float *mat, float *prod, int m)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
if(x < m)
{
prod[x] = 0;
for(int i = 0; i < m; i++)
{
int offset = i*m + x;
if( mat[offset] != 0 && vec[i] != 0 )
prod[x] += vec[i] * mat[i*m+x];
}
}
}
What can be done to further enhance the performance of this kernel apart from libraries like cuSparse?
Would be nice if this optimization was compatible with Compute Capability of 1.2
Thanks
EDIT
Corrected: prod = 1 x m
GPU = Quadro FX 1800M, Cuda v.5.0 on Ubuntu 14.04
EDIT
Complete code that performs multiplication using i. blas, ii. cublas, iii. above kernel for m = 6000. Please enter 0, when asked to enter a value
#include <iostream>
#include <stdio.h>
#include <time.h>
#include <cblas.h>
#include <cublas_v2.h>
#include <math.h>
using namespace std;
const int m = 6000;
const int BS = 512; // threads per block
const int NB = ceil((float) m / BS); // number of blocks
__global__ void calc_v_m(float *vec, float *mat, float *prod, int m)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
if(x < m)
{
prod[x] = 0;
for(int i = 0; i < m; i++)
{
int offset = i*m + x;
if( mat[offset] != 0 && vec[i] != 0 )
prod[x] += vec[i] * mat[i*m+x];
}
}
}
int main()
{
timespec blas_start, blas_end, cublas_start, cublas_end, opt_start, opt_end;
long totalnsec; //total nano sec
double totalsec, totaltime;
int i, j;
float *A = new float[m]; // 1 x m
float *B = new float[m*m]; // m x m
float *C = new float[m]; // 1 x m
float input;
cout<<"Enter a value to populate the vector (0 to make it sparse) ";
cin>>input;
// input martix A: every 600th element is non-zero i.e 90% zero
for(i = 0; i < m; i++)
{
A[i] = input;
if( i % 600 == 0) //adjust for sparsity
A[i] = i;
}
// input matrix B: identity matrix
for(i = 0; i < m; i++)
for(j = 0; j < m; j++)
B[j*m + i] = (i==j);
//blas on host
clock_gettime(CLOCK_REALTIME, &blas_start);
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 1, m, m, 1.0f, A, m, B, m, 0.0f, C, m);
//cblas_sgemv(CblasRowMajor, CblasTrans, m, m, 1.0f, B, m, A, 1, 0.0f, C, 1);
clock_gettime(CLOCK_REALTIME, &blas_end);
/* for(i = 0; i < m; i++) printf("%f ", C[i]); */
//cublas section
cudaError_t cudaStat;
cublasHandle_t handle;
cublasCreate(&handle);
float *A_d, *B_d, *C_d;
cudaStat = cudaMalloc(&A_d, sizeof(float)*m);
if(cudaStat != cudaSuccess) printf("Error Allocating Memory for A_d\n");
cudaStat = cudaMalloc(&B_d, sizeof(float)*m*m);
if(cudaStat != cudaSuccess) printf("Error Allocating Memory for B_d\n");
cudaStat = cudaMalloc(&C_d, sizeof(float)*m);
if(cudaStat != cudaSuccess) printf("Error Allocating Memory for C_d\n");
cudaMemcpy(A_d, A, sizeof(float)*m, cudaMemcpyHostToDevice);
cudaMemcpy(B_d, B, sizeof(float)*m*m, cudaMemcpyHostToDevice);
float alpha = 1.0f, beta = 0.0f;
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &cublas_start);
cublasSgemv(handle, CUBLAS_OP_N, m, m, &alpha, B_d, m, A_d, 1, &beta, C_d, 1);
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &cublas_end);
cudaMemcpy(C, C_d, sizeof(float)*m, cudaMemcpyDeviceToHost);
/* for(i = 0; i < m; i++) printf("%f ", C[i]); */
// Call kernel having Optimization for Zeros
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &opt_start);
/////////////////// call kernel //////////////////
calc_v_m<<<NB, BS>>>(A_d, B_d, C_d, m);
//////////////////////////////////////////////////
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &opt_end);
cudaMemcpy(C, C_d, sizeof(float)*m, cudaMemcpyDeviceToHost);
/*for(i = 0; i < m; i++) printf("%f ", C[i]); */
// Print times
// blas time
totalsec = (double)blas_end.tv_sec - (double)blas_start.tv_sec;
totalnsec = blas_end.tv_nsec - blas_start.tv_nsec;
if(totalnsec < 0)
{
totalnsec += 1e9;
totalsec -= 1;
}
totaltime = totalsec + (double)totalnsec*1e-9;
cout<<"blas Time = "<< totaltime << "\n";
//cublas time
totalsec = (double)cublas_end.tv_sec - (double)cublas_start.tv_sec;
totalnsec = cublas_end.tv_nsec - cublas_start.tv_nsec;
if(totalnsec < 0)
{
totalnsec += 1e9;
totalsec -= 1;
}
totaltime = totalsec + (double)totalnsec*1e-9;
cout<<"cublas Time = "<< totaltime << "\n";
//Optimized Kernel Time
totalsec = (double)opt_end.tv_sec - (double)opt_start.tv_sec;
totalnsec = opt_end.tv_nsec - opt_start.tv_nsec;
if(totalnsec < 0)
{
totalnsec += 1e9;
totalsec -= 1;
}
totaltime = totalsec + (double)totalnsec*1e-9;
cout<<"Opt Kernel Time = "<< totaltime << "\n";
return 0;
}
Results
$ nvcc -arch=sm_12 blascomp.cu -o blascomp.o -lblas -lcublas
$ ./blascomp.o
Enter a value to populate the vector (0 to make it sparse) 0
blas Time = 0.000105207
cublas Time = 0.0070294
Opt Kernel Time = 0.00642797
At least on my system blas is still the fastest for such a scenario
Things get even more interesting if every '1200th' element instead of '600th' is set to 0
Enter a value to populate the vector (0 to make it sparse) 0
blas Time = 7.84e-05
cublas Time = 0.00698783
Opt Kernel Time = 0.00643042
The important thing to recognise here is that the gemv operation you are concerned with is fundamentally memory throughput limited on GPUs, rather than compute throughput limited. This implies that an "optimisation" as you have shown in your kernel:
__global__ void calc_v_m(float *vec, float *mat, float *prod, int m)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
if(x < m)
{
prod[x] = 0;
for(int i = 0; i < m; i++)
{
int offset = i*m + x;
if( mat[offset] != 0 && vec[i] != 0 )
prod[x] += vec[i] * mat[i*m+x];
}
}
}
isn't really an optmisation at all, simply because the memory transactions are the performance bottleneck in the kernel, not the floating point arithmetic, and your code must perform most of the memory transactions irrespective of whether the multiply add operation will be performed because of zero detection or not.
Consider the following, instrumented version of roughly the same code:
__constant__ float cvec1[2];
__global__ void
__launch_bounds__(512,4)
calc_v_m1(const float* __restrict__ vec,
const float* __restrict__ mat,
float* __restrict__ prod,
int m,
int do_reads = 1,
int do_write = 1)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
if(x < m)
{
float res = 0;
float mval = cvec1[0], vval = cvec1[1];
#pragma unroll 8
for(int i = 0; i < m; i++)
{
int offset = i*m + x;
if (do_reads) {
mval = mat[offset];
vval = vec[i];
}
res += mval * vval;
}
if (do_write) prod[x] = res;
}
}
Here I have added two optional arguments which control whether the kernel will load from global memory, and whether the kernel will store to global memory. This allows me to quantify the performance impact of the memory loads, computation, and memory stores independently. The results using your test code are instructive:
Function nvprof time
-----------------------------------------------
cublasSgemv 942.75us
calc_v_m 2798.4us
calc_v_m1(do_reads=1, do_write=1) 962.40us
calc_v_m1(do_reads=1, do_write=0) 970.40us
calc_v_m1(do_reads=0, do_write=1) 55.166us
calc_v_m1(do_reads=0, do_write=0) 55.102us
[All benchmarking done on a GTX970 using the CUDA 7.5 release toolchain and CUBLAS 7.5 library]
In no particular order:
The full instrumented kernel runtime is within a few percent of the equivalent CUBLAS call
The memory fetches from global memory are the bottleneck
The actual computations in the kernel only constitute 5% of the kernel running time
The "fire-and-forget" nature of write operations in CUDA means that the latency of the write has no significant effect on throughput.
Your "optimised" kernel is considerably slower than either CUBLAS or the instrumented kernel, probably because all you are introducing is branch divergence without addressing the source of the kernel bottleneck (the latency of the memory loads).
The only times conditionally executing the FMAD operation makes sense would be in an architecture where memory has near zero latency and floating point throughput was severely constrained. The GPU definitely doesn't fall into that category.
The only other option for optimising this would be to exploit a priori information about the sparsity patterns in the LHS matrix to remove the need to read zero entries. Which is precisely what sparse matrix formats and linear algebra codes are designed to accommodate.

Cuda call won't allocate more than 8 threads per block, regardless of specification

I am creating a parallel version of the Sieve of Eratosthenes in c++. The problem is my kernel call (reduce0) seems to only ever assign 8 threads per block instead of the 256 I specify. Since even the first CUDA version allows 512 threads per block, there must be some error in my code for it. Any help would be appreciated.
#include <iostream>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cutil.h>
//#include <sieve_kernel.cu>
using namespace std;
////////////////////////////////////////////////////
int psum(int arg[], double n);
int call_kernel(int primes[], int n);
int findsmallest(int arg[], int f, double n);
int sieve(int n);
__global__ void reduce0(int *g_idata, int *g_odata);
////////////////////////////////////////////////////
int main(){
int n = pow((double) 2, 8);
int total = sieve(n);
cout << "# primes" << endl << total << endl;
return 0;
}
///////////////////////////////////////////////////
__global__ void reduce0(int *g_idata, int *g_odata) {
extern __shared__ int sdata[];
// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
// do reduction in shared mem
for (int s = 1; s < blockDim.x; s *= 2) { // step = s x 2
if (tid % (s*2) == 0) { // only threadIDs divisible by the step participate
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////////////////////////////////
int call_kernel(int *primes, int n){
// Allocate and copy device arrays
int *g_idevice;
int *g_odevice;
int size = n * sizeof(int);
cudaMalloc(&g_idevice, size);
cudaMemcpy(g_idevice, primes, size, cudaMemcpyHostToDevice);
cudaMalloc(&g_odevice, size);
// Specify grid/block dimenstions and invoke the kernel
dim3 dimGrid(1,1);
dim3 dimBlock(256,1);
reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
// Copy device data back to primes
cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
//for (int i = 0; i < n; i++) {
// cout << i << " " << primes[i] << endl;
//}
int total = primes[0];
cudaFree(g_idevice);
cudaFree(g_odevice);
return total;
}
/////////////////////////////////////////////////////////////////////
int findsmallest(int arg[], int f, double n){
int i = f;
while(arg[i]!= 1 && i < n) {
i++;
}
return i;
}
//////////////////////////////////////////////////////////////////////
int psum(int arg[], double n){
int total = 0;
int i = 2;
while(i < n){
if(arg[i] == 1){
total = total + 1;
}
i++;
}
return total;
}
/////////////////////////////////////////////////////////////////////////
int sieve(int n){
int* primes = NULL;
int mult = 0;
int k = 2;
int i; int total;
//primes = new int[n];
primes = new int[256];
for(i = 0; i < n; i++){
primes[i] = 1;
}
primes[0] = primes[1] = 0;
while (k * k < n){
mult = k * k;
while (mult < n) {
primes[mult] = 0;
mult = mult + k;
}
k = findsmallest(primes,k+1, n);
}
total = call_kernel(primes, n);
//delete [] primes;
//primes = NULL;
return total;
}
Your kernel is using dynamically allocated shared memory, but the kernel launch does not include any allocation, so the result is the kernel will be aborting because of illegal memory operations on that shared memory buffer. You should find it works if you modify this part of call_kernel as follows:
// Specify grid/block dimenstions and invoke the kernel
dim3 dimGrid(1,1);
dim3 dimBlock(256,1);
size_t shmsize = size_t(dimBlock.x * dimBlock.y * dimBlock.z) * sizeof(int);
reduce0<<<dimGrid, dimBlock, shmsize>>>(g_idevice, g_odevice);
If you had of included some basic error checking around the function call, perhaps like this:
reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
if (cudaPeekAtLastError() != cudaSuccess) {
cout << "kernel launch error: " << cudaGetErrorString(cudaGetLastError()) << endl;
}
// Copy device data back to primes
cudaError_t err = cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
cout << "CUDA error: " << cudaGetErrorString(err) << endl;
}
it would have been immediately obvious that the kernel launch or execution was failing with an error.