Shared memory, branching performance and register count - cuda

I came across some peculiar performance behaviour when trying out the CUDA shuffle instruction. The test kernel below is based on an image processing algorithm which adds input-dependent values to all neighbouring pixels within a square of side rad. The output for each block is added in shared memory. If only one thread per warp adds its result to shared memory, the performance is poor (Option 1), whereas on the other hand, if all threads add to shared memory (one thread adds the desired value, the rest just add 0), the execution time drops by 2-3 times (Option 2).
#include <iostream>
#include "cuda_runtime.h"
#define warpSz 32
#define tileY 32
#define rad 32
// Benchmark kernel from the question. Each block clears a shared-memory tile of
// (warpSz + 2*rad) x (tileY + 2*rad) floats, accumulates per-thread values into
// it, and finally adds the whole tile into `out` with atomicAdd.
//   out   - global output buffer; it is only accumulated into, never cleared here
//   pitch - row stride of `out`, counted in float elements
// The host code below launches it with blockDim = (warpSz, 8).
// NOTE(review): the shuffle uses threadIdx.x as the lane index, so this
// presumably requires blockDim.x == warpSz - confirm.
__global__ void test(float *out, int pitch)
{
// Set shared mem to 0
__shared__ float tile[(warpSz + 2*rad) * (tileY + 2*rad)];
// Block-stride loop over the flattened tile: all threads of the block clear it together.
for (int i = threadIdx.y*blockDim.x+threadIdx.x; i<(tileY+2*rad)*(warpSz+2*rad); i+=blockDim.x*blockDim.y) {
tile[i] = 0.0f;
}
// The tile must be fully cleared before any thread starts accumulating into it.
__syncthreads();
// Each threadIdx.y handles rows threadIdx.y, threadIdx.y + blockDim.y, ... below tileY.
for (int row=threadIdx.y; row<tileY; row += blockDim.y) {
// Loop over pixels in neighbourhood
for (int i=0; i<2*rad+1; ++i) {
float res = 0.0f;
// Offset of tile row (row + i) inside the flattened tile.
int rowStartIdx = (row+i)*(warpSz+2*rad);
for (int j=0; j<2*rad+1; ++j) {
res += float(threadIdx.x+row); // Substitute for real calculation
// Option 1: one thread writes to shared mem
// Only threadIdx.x == 0 stores its running value, then restarts from zero.
if (threadIdx.x == 0) {
tile[rowStartIdx + j] += res;
res = 0.0f;
}
//// Option 2: all threads write to shared mem
//float tmp = 0.0f;
//if (threadIdx.x == 0) {
// tmp = res;
// res = 0.0f;
//}
//tile[rowStartIdx + threadIdx.x+j] += tmp;
// Every thread takes `res` from lane (threadIdx.x+1) % warpSz.
// NOTE(review): this is the legacy mask-less __shfl; toolchains targeting Volta
// or newer require __shfl_sync with an explicit participant mask.
res = __shfl(res, (threadIdx.x+1) % warpSz);
}
// After the inner loop every thread adds one more term and stores its value
// at column threadIdx.x + 2*rad of the same tile row.
res += float(threadIdx.x+row);
tile[rowStartIdx + threadIdx.x+2*rad] += res;
// Block-wide barrier inside the row loop: every thread must execute the same
// number of row iterations, which holds when tileY is a multiple of blockDim.y.
__syncthreads();
}
}
// Add result back to global mem
// Tiles of neighbouring blocks overlap by 2*rad rows/columns in `out`, hence atomicAdd.
for (int row=threadIdx.y; row<tileY+2*rad; row+=blockDim.y) {
for (int col=threadIdx.x; col<warpSz+2*rad; col+=warpSz) {
int idx = (blockIdx.y*tileY + row)*pitch + blockIdx.x*warpSz + col;
atomicAdd(out+idx, tile[row*(warpSz+2*rad) + col]);
}
}
}
// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool cudaCallSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << "\n";
    return false;
}

// Times one launch of the `test` kernel on a 512x512 image with CUDA events and
// prints the elapsed time. Returns 0 on success and 1 if any CUDA call failed.
int main(void)
{
    int2 dim = make_int2(512, 512);
    // Output rows are padded by 2*rad and rounded up to a multiple of warpSz.
    int pitchOut = (((dim.x+2*rad)+warpSz-1) / warpSz) * warpSz;
    int sizeOut = pitchOut*(dim.y+2*rad);
    size_t bytesOut = (size_t)sizeOut * sizeof(float);
    dim3 gridDim((dim.x+warpSz-1)/warpSz, (dim.y+tileY-1)/tileY, 1);

    float *devOut = 0;
    cudaEvent_t start = 0;
    cudaEvent_t stop = 0;
    float elapsedTime = 0.0f;

    // cudaFree(0) forces context creation so it is not attributed to later calls.
    // The && chain stops at the first failing step.
    bool ok = cudaCallSucceeded(cudaFree(0), "CUDA context initialisation")
           && cudaCallSucceeded(cudaMalloc((void**)&devOut, bytesOut), "cudaMalloc")
           // The kernel only accumulates into the buffer (atomicAdd), so it must start zeroed.
           && cudaCallSucceeded(cudaMemset(devOut, 0, bytesOut), "cudaMemset")
           && cudaCallSucceeded(cudaEventCreate(&start), "cudaEventCreate(start)")
           && cudaCallSucceeded(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    if (ok) {
        cudaEventRecord(start, 0);
        test<<<gridDim, dim3(warpSz, 8)>>>(devOut, pitchOut);
        // Launch-configuration errors are only visible through cudaGetLastError().
        ok = cudaCallSucceeded(cudaGetLastError(), "test kernel launch");
        cudaEventRecord(stop, 0);
        // Waiting on the stop event also surfaces faults raised while the kernel ran.
        ok = cudaCallSucceeded(cudaEventSynchronize(stop), "test kernel execution") && ok;
        ok = ok && cudaCallSucceeded(cudaEventElapsedTime(&elapsedTime, start, stop),
                                     "cudaEventElapsedTime");
    }

    if (start) {
        cudaEventDestroy(start);
    }
    if (stop) {
        cudaEventDestroy(stop);
    }
    cudaFree(devOut);
    cudaDeviceReset();

    if (ok) {
        std::cout << "Elapsed time: " << elapsedTime << " ms.\n";
    }
    std::cin.ignore();
    return ok ? 0 : 1;
}
Is this expected behaviour/can anyone explain why this happens?
One thing I have noted is that Option 1 uses only 15 registers, whereas Option 2 uses 37, which seems a big difference to me.
Another is that the if-statement in the innermost loop is converted to explicit bra instructions in the PTX code for Option 1, whereas for Option 2 it is converted to two selp instructions. Could it be that the explicit branching is behind the 2-3 times slow down similar to what's suspected in this question?
There are two reasons why I am reluctant to go for Option 2. First, when profiling the original application it seems to be limited by shared memory bandwidth, which indicates that there is potential to increase the performance by having fewer threads accessing it. Second, unless we use the volatile keyword, writes to shared memory can be optimised to registers. Since we are only interested in the contribution from the last thread to access each memory location (threadIdx.x == 0), and all others add 0, this is not a problem as long as all changes temporarily located in registers are guaranteed to be written back to shared memory in the same order they were issued. Is this the case though? (Thus far, both options have produced the exact same result.)
Any thoughts or ideas are much appreciated!
PS. I compile for compute capability 3.0. (However, the shuffle instruction is not necessary to demonstrate the behaviour and can be commented out.)

Related

Threads of a CUDA kernel execute sequentially

I have two kernels that process some data sequentially (launched with only one thread). I want to combine the two so that I can have one kernel to launch with two threads. After doing so, I was expecting to get an exec time of max(kernel1, kernel2) but what I got was the sum of the two exec times. I narrowed down the problem to something like the code below.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<iostream>
#include<string>
#include<vector>
#include<random>
#include<functional>
#include<algorithm>
#include<iterator>
// Demo kernel with a deliberately long loop in each of two branches selected by
// threadIdx.x: thread 0 fills d_out_Re, thread 1 fills d_out_Im, and any other
// thread does nothing. Threads 0 and 1 of the same block therefore take
// different paths through the if/else.
//   d_data_Re, d_data_Im - input arrays, read at index j % dataLen
//   d_out_Re, d_out_Im   - output arrays, written at index j % dataLen
//   dataLen              - modulus applied to the loop counter to form the index
__global__ void dummyKernel(const float *d_data_Re, const float *d_data_Im,
float *d_out_Re, float *d_out_Im, const int dataLen) {
int i{ threadIdx.x };
if (i == 0) {
printf("Thread zero started \n");
// 1,000,000 iterations; the input term is added only when j % 4 == 1
// (the comparison yields 0 or 1 and is used as a multiplier).
for (int j{}; j < 1000000; j++)
d_out_Re[j%dataLen] = sqrtf(2) + d_data_Re[j%dataLen] * (j % 4 == 1);
printf("Thread zero finished \n");
}
else if (i == 1) {
printf("Thread one started \n");
// Same loop as above, operating on the Im arrays.
for (int j{}; j < 1000000; j++)
d_out_Im[j%dataLen] = sqrtf(2) + d_data_Im[j%dataLen] * (j % 4 == 1);
printf("Thread one finished \n");
}
}
// Variant of dummyKernel with the threadIdx.x branch commented out: no test
// guards the loop, so every launched thread prints the "Thread zero" messages
// and runs the d_out_Re loop, all writing the same elements.
// d_data_Im and d_out_Im are accepted but not used by the active code.
__global__ void dummyKernel2(const float *d_data_Re, const float *d_data_Im,
float *d_out_Re, float *d_out_Im, const int dataLen) {
// `i` is only referenced by the commented-out branch below.
int i{ threadIdx.x };
//if (i == 0) {
printf("Thread zero started \n");
// 1,000,000 iterations; the input term is added only when j % 4 == 1.
for (int j{}; j < 1000000; j++)
d_out_Re[j%dataLen] = sqrtf(2) + d_data_Re[j%dataLen] * (j % 4 == 1);
printf("Thread zero finished \n");
//}
//else if (i == 1) {
// printf("Thread one started \n");
// for (int j{}; j < 1000000; j++)
// d_out_Im[j%dataLen] = sqrtf(2) + d_data_Im[j%dataLen] * (j % 4 == 1);
// printf("Thread one finished \n");
//}
}
// Prints a diagnostic and returns false when a CUDA runtime call did not succeed.
static bool cudaStepOk(cudaError_t status, const char *step)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", step, cudaGetErrorString(status));
    return false;
}

// Fills two host frames with random floats in [0, 2), uploads them, runs
// dummyKernel with one block of two threads and waits for it to finish.
// Returns 0 on success and 1 if any CUDA call failed.
int main()
{
    cudaError_t cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        return 1;
    }

    const int sizeOfFrame = 2 * 1024 * 1024;
    const size_t frameBytes = sizeOfFrame * sizeof(float);
    std::vector<float> data_re(sizeOfFrame), data_im;

    //random number generator
    std::uniform_real_distribution<float> distribution(0.0f, 2.0f); //Values between 0 and 2
    std::mt19937 engine; // Mersenne twister MT19937 (default seed, so runs are reproducible)
    auto generator = std::bind(distribution, engine);
    std::generate_n(data_re.begin(), sizeOfFrame, generator);
    // The "imaginary" frame is a plain copy of the "real" one.
    std::copy(data_re.begin(), data_re.end(), std::back_inserter(data_im));

    float *d_data_re = nullptr, *d_data_im = nullptr;
    float *d_pll_out_re = nullptr, *d_pll_out_im = nullptr;

    // The && chain stops at the first failing step, so later calls never see
    // a pointer whose allocation failed.
    bool ok = cudaStepOk(cudaMalloc(&d_data_re, frameBytes), "cudaMalloc(d_data_re)")
           && cudaStepOk(cudaMalloc(&d_data_im, frameBytes), "cudaMalloc(d_data_im)")
           && cudaStepOk(cudaMemcpy(d_data_re, data_re.data(), frameBytes, cudaMemcpyHostToDevice),
                         "cudaMemcpy(d_data_re)")
           && cudaStepOk(cudaMemcpy(d_data_im, data_im.data(), frameBytes, cudaMemcpyHostToDevice),
                         "cudaMemcpy(d_data_im)")
           && cudaStepOk(cudaMalloc(&d_pll_out_re, frameBytes), "cudaMalloc(d_pll_out_re)")
           && cudaStepOk(cudaMalloc(&d_pll_out_im, frameBytes), "cudaMalloc(d_pll_out_im)");

    if (ok) {
        dummyKernel << <1, 2 >> >(d_data_re, d_data_im,
            d_pll_out_re, d_pll_out_im, sizeOfFrame);
        // A launch does not return an error itself: check the launch first, then
        // synchronise to catch faults raised while the kernel ran.
        ok = cudaStepOk(cudaGetLastError(), "dummyKernel launch")
          && cudaStepOk(cudaDeviceSynchronize(), "dummyKernel execution");
    }

    // cudaFree accepts null pointers, so this is safe after a partial failure.
    cudaFree(d_pll_out_im);
    cudaFree(d_pll_out_re);
    cudaFree(d_data_im);
    cudaFree(d_data_re);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return ok ? 0 : 1;
}
btw I got the code for random number generator from an answer to this question. So, the dummyKernel doesn't do anything useful, I just wanted to have a kernel that took relatively long to finish. If you launch dummyKernel, the order of the output will be "Thread zero started", "Thread zero finished", "Thread one started", "Thread one finished". Sequential. But if you launch dummyKernel2, the order of the output will be "Thread zero started", "Thread zero started", "Thread zero finished", "Thread zero finished" and the exec time is almost half as dummyKernel. I don't understand this behavior and the effect of the if-else I used.
OS: Windows 10, GTX 1050 Ti, CUDA Driver/Runtime version: 11.1/10.1.
Each Cuda multiprocessor has execution units (several each for int, float, special functions, ...). Those work as pipelines, which take several cycles to complete a calculation, but in each cycle a new calculation can be inserted (=scheduled) and several calculations are processed at the same time at different stages of the pipeline.
Groups of 32 threads (warps) within a block are scheduled the same instruction at the same time (same cycle or often two cycles depending on how many execution and datapath resources are available on the architecture and needed for this instruction), together with a bitfield, stating, for which threads this instruction should be actively executed. If some threads of a warp evaluated an if clause as false, they are temporarily deactivated. Or some threads may have already exited the kernel.
The effect is that if the 32 threads of a warp diverge (branch differently), each execution path has to be run through for each of the 32 threads (with some threads deactivated for each path). That should be avoided for performance reasons, as the computation resources are reserved nevertheless. Threads from different warps don't have this interdependency. The algorithm should be structured in a way to consider this.
With Volta, Independent Thread Scheduling was introduced. Each thread has its own instruction counter (and manages a separate function callstack). But the scheduler still will schedule groups of 32 threads (warps) with bitfields for active threads. What changed is that the scheduler can interleave the diverging paths. Instead of executing CCCIIIEEECCC pre-Volta (instructions: C=common, I=if branch, e=else branch), it could execute CCCIEEIIECCC, if the available execution units or the memory latency better fits. As programmer, one has to be careful, as it can be no longer assumed that the threads have not diverged, even when executing the same instruction. That is why __syncwarp was introduced and all kind of cooperation functions (e.g. the shuffle instructions) got a sync variant. Nevertheless (although we cannot know for sure, if the threads diverged) one still has to program in a way that all 32 threads can work together, if executed synchronously, especially for coalesced memory accesses. Putting __syncwarp after each possibly diverging instruction can help to ensure convergence. (But do performance profiling).
The Independent Thread Scheduling is also the reason, why __syncthreads must definitely be called correctly on the RTX 3080 - with each thread participating. A typical correcting solution for the deadlock case you mentioned in the comment is to close the if clause, sync all the threads and open a new if clause with the same condition as the previous one.

Summing up elements in array using managedCuda

Problem Description
I try to get a kernel summing up all elements of an array to work. The kernel is intended to be launched with 256 threads per block and an arbitrary number of blocks. The length of the array passed in as a is always a multiple of 512, in fact it is #blocks * 512. One block of the kernel should sum up 'its' 512 elements (256 threads can sum up 512 elements using this algorithm), storing the result in out[blockIdx.x]. The final summation over the values in out, and therefore the results of the blocks, will be done on the host.
This kernel works fine for up to 6 blocks, meaning up to 3072 elements. But launching it with more than 6 blocks results in the first block calculating a strictly greater, wrong result than the other blocks (i.e. out = {572, 512, 512, 512, 512, 512, 512}). This wrong result is reproducible: the wrong value is the same for multiple executions.
I guess this means there is a structural error somewhere in my code, which has something to do with blockIdx.x, but the only use of this is to calculate blockStart, and this seems to be a correct calculation, also for the first block.
I verified if my host code computes the correct number of blocks for the kernel and passes in an array of correct size. That's not the problem.
Of course I read a lot of similar questions here on stackoverflow, but none seems to describe my problem (See i. e. here or here)
The kernel is called via managedCuda (C#), I don't know if this might be a problem.
Hardware
I use a MX150 with the following specifications:
Revision Number: 6.1
Total global memory: 2147483648
Total shared memory per block: 49152
Total registers per block: 65536
Warp size: 32
Max Threads per block: 1024
Max Blocks: 2147483648
Number of multiprocessors: 3
Code
Kernel
// Block-level sum as posted in the question: each block works on the
// 2 * blockDim.x elements of `a` starting at blockStart, repeatedly adding
// adjacent pairs in place, and thread 0 writes the block's total to
// out[blockIdx.x]. The guards (tid < 128 ... tid < 2) are hard-coded for a
// 256-thread block. `a` is overwritten with intermediate sums.
//   out - one result slot per block
//   a   - input array, used as both source and destination of every step
// NOTE(review): within a single step, thread `tid` writes a[blockStart + tid]
// while other threads read a[blockStart + 2*tid] and a[blockStart + 2*tid + 1];
// for example thread 2 writes the element thread 1 reads. No barrier separates
// those reads and writes inside a step, so the result depends on execution order.
__global__ void Vector_Reduce_As_Sum_Kernel(float* out, float* a)
{
int tid = threadIdx.x;
// First element of this block's 2 * blockDim.x slice.
int blockStart = blockDim.x * blockIdx.x * 2;
// Destination index of this thread's partial sum.
int i = tid + blockStart;
// Left element of the adjacent pair this thread adds.
int leftSumElementIdx = blockStart + tid * 2;
// Step 1: all threads participate.
a[i] = a[leftSumElementIdx] + a[leftSumElementIdx + 1];
// The barriers only separate consecutive steps; they do not order the reads
// and writes that happen inside one step.
__syncthreads();
// Remaining steps: the number of active threads halves each time.
if (tid < 128)
{
a[i] = a[leftSumElementIdx] + a[leftSumElementIdx + 1];
}
__syncthreads();
if(tid < 64)
{
a[i] = a[leftSumElementIdx] + a[leftSumElementIdx + 1];
}
__syncthreads();
if (tid < 32)
{
a[i] = a[leftSumElementIdx] + a[leftSumElementIdx + 1];
}
__syncthreads();
if (tid < 16)
{
a[i] = a[leftSumElementIdx] + a[leftSumElementIdx + 1];
}
__syncthreads();
if (tid < 8)
{
a[i] = a[leftSumElementIdx] + a[leftSumElementIdx + 1];
}
__syncthreads();
if (tid < 4)
{
a[i] = a[leftSumElementIdx] + a[leftSumElementIdx + 1];
}
__syncthreads();
if (tid < 2)
{
a[i] = a[leftSumElementIdx] + a[leftSumElementIdx + 1];
}
__syncthreads();
// Final step: thread 0 adds the last two partial sums and publishes the block total.
if (tid == 0)
{
out[blockIdx.x] = a[blockStart] + a[blockStart + 1];
}
}
Kernel Invocation
// Load the compiled PTX module and look up the reduction kernel.
// "pathToPtx" and "MangledKernelName" are placeholders that must be replaced.
CudaContext context = new CudaContext();
CUmodule ptxModule = context.LoadModule("pathToPtx");
CudaKernel reduceKernel = new CudaKernel("MangledKernelName", ptxModule, context);

// Host input: 4096 ones, so the expected grand total is 4096.
float[] hostInput = new float[4096];
int fillIndex = 0;
while (fillIndex < hostInput.Length)
{
    hostInput[fillIndex] = 1;
    fillIndex++;
}

// Launch geometry: every block of 256 threads reduces 2 * 256 = 512 elements.
int blockSize = 256;
int elementsPerBlock = blockSize * 2;
int blockCount = hostInput.Length / elementsPerBlock;

// Device buffers: the input (uploaded by the implicit conversion) and one
// partial sum per block.
CudaDeviceVariable<float> deviceInput = hostInput;
CudaDeviceVariable<float> devicePartialSums = new CudaDeviceVariable<float>(blockCount);

// Configure and run the kernel.
reduceKernel.BlockDimensions = blockSize;
reduceKernel.GridDimensions = blockCount;
reduceKernel.Run(devicePartialSums.DevicePointer, deviceInput.DevicePointer);

// Download the per-block results (implicit conversion back to a host array).
float[] hostPartialSums = devicePartialSums;

// Finish the reduction on the CPU.
float total = 0;
foreach (float partial in hostPartialSums)
{
    total += partial;
}

// Verify against the known answer.
if (total != 4096)
{
    throw new Exception("Thats the wrong result!");
}
Update:
The very helpful and only answer did address all my problems. Thank you! The problem was an unforeseen race condition.
Important Hint:
In the comments the author of managedCuda pointed out that all NPPs methods are indeed already implemented in managedCuda (using ManagedCuda.NPP.NPPsExtensions;). I wasn't aware of that, and I guess neither are many people reading this question.
You are not correctly incorporating into your code the idea that each block will process 512 elements out of your total array. According to my testing, you need to make at least 2 changes to fix this:
In the kernel, you have incorrectly calculated the starting point for each block:
int blockStart = blockDim.x * blockIdx.x;
since blockDim.x is 256, but each block processes 512 elements, you must multiply this by 2. (the multiplication by 2 in your calculation of leftSumElementIdx doesn't take care of this -- since it is only multiplying tid).
In your host code, your number of blocks calculation is incorrect:
vectorReduceAsSumKernel.GridDimensions = array.Length / threadsPerBlock;
for a value of 2048 for array.Length and a value of 256 for threadsPerBlock, this creates 8 blocks. But as you already indicate, your intention is to launch four blocks (2048/512). So you need to multiply the denominator by 2:
vectorReduceAsSumKernel.GridDimensions = array.Length / (2*threadsPerBlock);
In addition, your reduction sweep pattern is broken. It is warp-execution-order dependent, to give the proper result, and CUDA does not specify a warp execution order.
To see why, let's take a simple example. Let's consider just a single threadblock, with a starting point of the array being all 1, just as you have initialized it.
Now, warp 0 consists of threads 0-31. Your reduction sweep operation is like this:
a[i] = a[leftSumElementIdx] + a[leftSumElementIdx + 1];
So each thread in warp 0 will collect two other values and add them, and store them. Thread 31 will take the values a[62] and a[63] and add them together. If the values of a[62] and a[63] are still 1, as initialized, then this will work as expected. But the values of a[62] and a[63] are written to by warp 1, consisting of threads 32-63. So if warp 1 executes before warp 0 (perfectly legal), then you will get a different result. This is a global memory race condition. It is arising due to the fact that your input array is both the source and destination of your intermediate results, and __syncthreads() will not sort this out for you. It doesn't force warps to execute in any particular order.
One possible solution is to fix your sweep pattern. On any given reduction cycle, let's have a sweep pattern where each thread writes and reads values that are not touched by any other thread during that cycle. The following adaptation of your kernel code accomplishes that:
// In-place block sum: each block folds its 2 * blockDim.x slice of `a` onto
// itself with a halving stride, then thread 0 stores the total in
// out[blockIdx.x]. In any one step a thread only touches its own element and
// one element `stride` positions further on, which no other active thread
// accesses during that step. `a` is overwritten with intermediate sums.
//   out - one result slot per block
//   a   - input array of gridDim.x * blockDim.x * 2 elements, reduced in place
__global__ void Vector_Reduce_As_Sum_Kernel(float* out, float* a)
{
    const int lane = threadIdx.x;
    // First element of this block's slice.
    const int sliceBegin = blockDim.x * blockIdx.x * 2;
    const int self = sliceBegin + lane;

    int stride = blockDim.x;
    while (stride > 0)
    {
        if (lane < stride)
        {
            a[self] += a[self + stride];
        }
        // Every thread reaches the barrier, including those idle in this step.
        __syncthreads();
        stride >>= 1;
    }

    if (lane != 0)
    {
        return;
    }
    // For lane 0, `self` is the first element of the slice, which now holds the total.
    out[blockIdx.x] = a[self];
}
For general purpose reductions, this is still a very slow method. This tutorial covers how to write faster reductions. And, as already pointed out, managedCuda may have methods to avoid writing a kernel at all.

CUDA Reduction: Warp Unrolling (School)

I am currently working on a project in which I am unrolling the last warp of a reduction. I have finished the code above; however, some modifications were done by guessing and I'd like an explanation why. The code I have written is only the function kernel4
// in is input array, out is where to store result, n is number of elements from in
// T is a float (32bit)
__global__ void kernel4(T *in, T *out, unsigned int n)
which is a reduction algorithm, the rest of the code was already provided.
Code:
#include <stdlib.h>
#include <stdio.h>
#include "timer.h"
#include "cuda_utils.h"
typedef float T;
#define N_ (8 * 1024 * 1024)
#define MAX_THREADS 256
#define MAX_BLOCKS 64
#define MIN(x,y) ((x < y) ? x : y)
#define tid threadIdx.x
#define bid blockIdx.x
#define bdim blockDim.x
#define warp_size 32
// Rounds x up to the nearest power of two by smearing the highest set bit of
// (x - 1) into every lower position and then adding one. Values that already
// are powers of two are returned unchanged; 0 and inputs above 2^31 wrap to 0
// (for a 32-bit unsigned int).
unsigned int nextPow2( unsigned int x ) {
    unsigned int smeared = x - 1;
    // Shifts 1, 2, 4, 8, 16 cover all 32 bit positions.
    for (unsigned int shift = 1; shift <= 16; shift <<= 1) {
        smeared |= smeared >> shift;
    }
    return smeared + 1;
}
// Chooses a launch configuration for reducing n elements.
//   whichKernel - kernels 0-2 consume one element per thread, kernels 3 and up
//                 consume two; kernel 5 additionally caps the block count
//   n           - number of elements to reduce
//   maxBlocks   - upper bound on blocks (applied for kernel 5 only)
//   maxThreads  - upper bound on threads per block
//   blocks      - [out] number of blocks to launch
//   threads     - [out] threads per block (a power of two when n is small)
void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
{
    const bool oneElementPerThread = (whichKernel < 3);

    if (oneElementPerThread) {
        threads = maxThreads;
        if (n < maxThreads) {
            threads = nextPow2(n);
        }
        blocks = (n + threads - 1) / threads;
    } else {
        threads = maxThreads;
        if (n < maxThreads*2) {
            threads = nextPow2((n + 1)/ 2);
        }
        // Each block covers two elements per thread.
        const int elementsPerBlock = threads * 2;
        blocks = (n + (elementsPerBlock - 1)) / elementsPerBlock;
    }

    if (whichKernel == 5 && maxBlocks < blocks) {
        blocks = maxBlocks;
    }
}
// Host reference sum of data[0..n) using compensated (Kahan) summation: `c`
// carries the rounding error lost by each addition and feeds it back into the
// next term.
//   data - host array of n values
//   n    - number of elements
// Returns the sum, or 0 when there is nothing to add (data is null or n <= 0).
T reduce_cpu(T *data, int n) {
    // Guard the data[0] read below against an empty or missing input.
    if (!data || n <= 0)
        return (T) 0.0;
    T sum = data[0];
    T c = (T) 0.0;
    for (int i = 1; i < n; i++)
    {
        T y = data[i] - c;      // next term, corrected by the error carried so far
        T t = sum + y;          // low-order bits of y may be lost here
        c = (t - sum) - y;      // recover what was lost (negated)
        sum = t;
    }
    return sum;
}
// Block-level sum reduction with the last warp unrolled.
// Each thread first adds one pair in[i] + in[i + n/2] into the shared array d,
// the block then tree-reduces d, and thread 0 writes the block total to out[bid].
//   in  - input array of n elements
//   out - one partial sum per block
//   n   - number of input elements (halved on entry, see below)
// tid, bid, bdim and warp_size are the macros defined near the top of the listing.
__global__ void
kernel4(T *in, T *out, unsigned int n)
{
// volatile: every access in the barrier-free final stage really goes to shared memory.
__shared__ volatile T d[MAX_THREADS];
unsigned int i = bid * bdim + tid;
// From here on n is half the element count: thread i pairs element i with element i + n.
n >>= 1;
d[tid] = (i < n) ? in[i] + in[i+n] : 0;
__syncthreads ();
// Barrier-synchronised tree reduction while the stride is larger than one warp.
for(unsigned int s = bdim >> 1; s > warp_size; s >>= 1) {
if(tid < s)
d[tid] += d[tid + s];
__syncthreads ();
}
// Final strides 32..1 run in the first warp without barriers.
// NOTE(review): this relies on implicit warp-synchronous execution; on Volta and
// newer that no longer holds without __syncwarp() between the steps.
if (tid < warp_size) {
// NOTE(review): these guards test n (the halved element count), not bdim. Only
// d[0..bdim) is written above, so the unguarded steps below read entries that
// were never written whenever bdim < 16 - confirm the intended launch sizes.
if (n > 64) d[tid] += d[tid + 32];
if (n > 32) d[tid] += d[tid + 16];
d[tid] += d[tid + 8];
d[tid] += d[tid + 4];
d[tid] += d[tid + 2];
d[tid] += d[tid + 1];
}
// d[0] now holds the block's total.
if(tid == 0)
out[bid] = d[0];
}
// Driver for kernel4: fills N random values, runs the multi-pass GPU reduction
// (one warm-up launch, then a timed run), repeats the sum on the CPU and
// compares the two results.
//   argv[1] - optional element count N (defaults to N_)
// Returns 0 after printing SUCCESS/FAILURE, or 1 when the input is unusable.
int main(int argc, char** argv)
{
    T *h_idata, h_odata, h_cpu;
    T *d_idata, *d_odata, *d_scratch;
    struct stopwatch_t* timer = NULL;
    long double t_kernel_4, t_cpu;
    int whichKernel = 4, threads, blocks, N, i;

    N = (argc > 1) ? atoi (argv[1]) : N_;
    printf("N: %d\n", N);
    // A non-positive count would make the launch-configuration helper divide by zero.
    if (N <= 0) {
        fprintf (stderr, "N must be a positive element count\n");
        return 1;
    }

    getNumBlocksAndThreads (whichKernel, N, MAX_BLOCKS, MAX_THREADS, blocks, threads);

    stopwatch_init ();
    timer = stopwatch_create ();

    h_idata = (T*) malloc ((size_t) N * sizeof (T));
    if (h_idata == NULL) {
        fprintf (stderr, "Host allocation of %d elements failed\n", N);
        return 1;
    }
    CUDA_CHECK_ERROR (cudaMalloc (&d_idata, (size_t) N * sizeof (T)));
    CUDA_CHECK_ERROR (cudaMalloc (&d_odata, blocks * sizeof (T)));
    // Second partial-sum buffer so later passes never read and write the same array.
    CUDA_CHECK_ERROR (cudaMalloc (&d_scratch, blocks * sizeof (T)));

    srand48(time(NULL));
    for(i = 0; i < N; i++)
        h_idata[i] = drand48() / 100000;
    CUDA_CHECK_ERROR (cudaMemcpy (d_idata, h_idata, (size_t) N * sizeof (T), cudaMemcpyHostToDevice));

    dim3 gb(blocks, 1, 1);
    dim3 tb(threads, 1, 1);

    // Warm-up launch, excluded from the timing.
    kernel4 <<<gb, tb>>> (d_idata, d_odata, N);
    CUDA_CHECK_ERROR (cudaGetLastError ());
    CUDA_CHECK_ERROR (cudaDeviceSynchronize ());

    stopwatch_start (timer);

    // First pass: N elements -> `blocks` partial sums in d_odata.
    kernel4 <<<gb, tb>>> (d_idata, d_odata, N);

    // Later passes ping-pong between the two partial-sum buffers. Reducing
    // d_odata in place would let one block overwrite out[bid] while another
    // block of the same launch is still reading that element as input.
    T *d_src = d_odata;
    T *d_dst = d_scratch;
    int s = blocks;
    while(s > 1) {
        threads = 0;
        blocks = 0;
        getNumBlocksAndThreads (whichKernel, s, MAX_BLOCKS, MAX_THREADS, blocks, threads);
        dim3 passGrid(blocks, 1, 1);
        dim3 passBlock(threads, 1, 1);
        kernel4 <<<passGrid, passBlock>>> (d_src, d_dst, s);
        s = (s + threads * 2 - 1) / (threads * 2);
        // The output of this pass is the input of the next one.
        T *swapTmp = d_src;
        d_src = d_dst;
        d_dst = swapTmp;
    }
    CUDA_CHECK_ERROR (cudaGetLastError ());
    CUDA_CHECK_ERROR (cudaDeviceSynchronize ());
    t_kernel_4 = stopwatch_stop (timer);

    fprintf (stdout, "Time to execute unrolled GPU reduction kernel: %Lg secs\n", t_kernel_4);
    double bw = (N * sizeof(T)) / (t_kernel_4 * 1e9); // total bytes / time, in GB/s
    fprintf (stdout, "Effective bandwidth: %.2lf GB/s\n", bw);

    // After the final swap d_src points at the buffer holding the result in element 0.
    CUDA_CHECK_ERROR (cudaMemcpy (&h_odata, d_src, sizeof (T), cudaMemcpyDeviceToHost));

    stopwatch_start (timer);
    h_cpu = reduce_cpu (h_idata, N);
    t_cpu = stopwatch_stop (timer);
    fprintf (stdout, "Time to execute naive CPU reduction: %Lg secs\n", t_cpu);

    // Explicit floating-point absolute value, so the comparison cannot be
    // routed through the integer abs().
    T diff = h_odata - h_cpu;
    if (diff < 0)
        diff = -diff;
    if(diff > 1e-5)
        fprintf(stderr, "FAILURE: GPU: %f CPU: %f\n", h_odata, h_cpu);
    else
        printf("SUCCESS: GPU: %f CPU: %f\n", h_odata, h_cpu);

    CUDA_CHECK_ERROR (cudaFree (d_scratch));
    CUDA_CHECK_ERROR (cudaFree (d_odata));
    CUDA_CHECK_ERROR (cudaFree (d_idata));
    free (h_idata);
    return 0;
}
My first question is: when declaring
__shared__ volatile T d[MAX_THREADS];
I would like to verify my understanding of volatile. Volatile prevents compilers from incorrectly optimizing my code and promises that load/stores are completed through the cache and not just registers (please correct me if wrong). For reduction, if partial reduction sums are still stored in registers, why is this a problem?
My second question is: when doing the actual warp reduction
if (tid < warp_size) { // Final log2(32) = 5 strides
if (n > 64) d[tid] += d[tid + 32];
if (n > 32) d[tid] += d[tid + 16];
d[tid] += d[tid + 8];
d[tid] += d[tid + 4];
d[tid] += d[tid + 2];
d[tid] += d[tid + 1];
}
The reduction sum will yield incorrect results without (n > 64) and (n > 32) conditions. The results I get are:
FAILURE: GPU: 41.966557 CPU: 41.946209
With 5 trials, the GPU reduction consistently yields an error of 0.0204. I am wary to think this is a floating point operation error.
To be honest as well, my teacher's assistant suggested this change to add the (n > 64) and (n > 32) conditions but did not explain why it would fix the code.
Since n in my trials are over 64, why does this conditional change the results. I am having difficulty tracing back the problem because I cannot use print functions like I would in a CPU.
Let's start with a few preface comments before we tackle your two questions:
I encourage you to read NVIDIA's canonical reduction tutorial
Reductions written like this make several assumptions, one of which is that the block size is a power-of-2 (for "correctness").
Your code is using warp-synchronous programming at the final reduction stage. You appear to know what you are doing, so I won't provide a detailed description of that, but it is certainly relevant for understanding here. You can google it and get descriptions if needed. It is relevant to the discussion below, but I'm not going to call out its relevance in each situation.
OK, now your questions:
I would like to verify my understanding of volatile. Volatile prevents compilers from incorrectly optimizing my code and promises that load/stores are completed through the cache and not just registers (please correct me if wrong). For reduction, if partial reduction sums are still stored in registers, why is this a problem?
Regarding a definition of volatile, I would refer you to the CUDA programming guide. I have seen summary descriptions referring to this as preventing a register optimization or preventing reordering of loads and stores. I prefer the former and will use that as a working definition.
The basic idea is that volatile forces any reference (read or write) to that variable to actually go to the memory subsystem. By this I mean it will perform a read or write, and will not attempt to use a value previously loaded into a register. Without this qualifier, the compiler is free to load a value once (for example) from the actual memory location, and then maintain that value (and any updates to it) in a register, for as long as it deems appropriate. Compilers do this with an eye toward performance. (As an aside, note that you used the word "cache" here. I would avoid that usage here. Shared memory has no cache interposed between it and the processor load/store mechanism.)
Without volatile in this type of warp-synchronous coding, we will run into a problem if we allow the compiler to "optimize" (i.e. maintain) intermediate values into registers. This primarily comes about due to inter-thread communication. To see clearly why, let's look at the last 2 steps in your final reduction:
d[tid] += d[tid + 2];
d[tid] += d[tid + 1];
Let's consider just threads whose tid values are 0-1. In the second-last step, thread 0 will pick up the d[2] value and add it to the d[0] value, while thread 1 will pick up the d[3] value and add it to the d[1] value. At this point, if we don't use volatile, the compiler is not obligated to write the d[1] value accumulated by thread 1 back out to shared memory. It is allowed to maintain that in a register. So the d[1] value as seen in shared memory is not "up-to-date".
Now lets go to the last step. In this step, thread 0 reads the d[1] value from shared memory and adds it to the d[0] value. But without volatile, we saw in the previous step that the shared memory contents of d[1] are no longer accurate. OTOH, if we use volatile, then the write to shared memory in the previous step will actually take place, and in the final step, thread 0 will pick up the correct value when it reads d[1]. A CUDA thread is a standalone model. By that, I mean that one thread cannot directly access values contained in registers belonging to another thread. So inter-thread communication at the warp level will normally be accomplished either through shared memory, or via warp-shuffle operations.
__syncthreads() has a similar behavior: it forces all register-optimized values like this to be written out to memory, so that they are "visible" to other threads in the block. Therefore, a more sophisticated optimization would be to only switch to a volatile qualified pointer when the reduction switches from the loop-driven __syncthreads() based reduction to the final warp-synchronous reduction. You can see an example in the tutorial slides I linked at the beginning of this answer.
As another aside, warp-synchronous programming of this kind is (more officially) deprecated in CUDA 9. Instead, you should use cooperative groups.
The reduction sum will yield incorrect results without (n > 64) and (n > 32) conditions.
These conditionals are primarily used because the code is designed to be "correct" for any block configuration that has a power-of-2 size. If we assume that the block size (number of threads per block) is a power of 2, and greater than 64, it must be 128 or larger for example. Your n variable starts out as the block size, but then gets divided by 2 (a right shift by one):
n >>= 1;
Therefore, if we want to ensure the correctness of this line of code:
d[tid] += d[tid + 32];
then we should only apply that operation when the thread block size is 64 (at least) which is like saying that n is greater than 64:
if (n > 64) d[tid] += d[tid + 32];
regarding this question, the claim is made that the posted code behaves differently if the if (n > 64) is included or not. The reason for this is that the posted code includes a loop which recalculates thread count and block count as the reduction proceeds:
int s = blocks;
while(s > 1) {
threads = 0;
blocks = 0;
getNumBlocksAndThreads (whichKernel, s, MAX_BLOCKS, MAX_THREADS, blocks, threads);
This loop eventually results in a block size that is smaller than 128, meaning the omission of the if conditions leads to breakage. (simply print out the threads variable, during this loop).
regarding this:
I am having difficulty tracing back the problem because I cannot use print functions like I would in a CPU.
I'm not sure what the problem is there. printf should work from within kernel code.
shared variables cannot have an initialization as part of their declaration according to this answer.
So if n < 64 we add some random shared memory array data to the sum, which causes the error.

What is the general way to launch appropriate amount of reduction kernels?

As I have read from NVIDIA's instruction in this link http://www.cuvilib.com/Reduction.pdf, for arrays bigger than blockSize, I should launch multiple reduction kernels to achieve global synchronization. What is the general way to determine how many times I should launch the reduction kernel? I tried as below but I need to Malloc 2 additional pointers, which takes a lot of processing times.
My job is to Reduce the array d_logLuminance into one minimum value min_logLum
// Reduces d_logLuminance (numRows * numCols floats on the device) to its minimum
// by launching reduceMin level by level, and prints the result.
//   d_logLuminance - device input array; it is not modified
//   min_logLum     - [out] receives the minimum value
//   numRows/Cols   - image dimensions; their product is the element count
// reduceMin has no length parameter and always consumes a full block of input,
// so both working buffers are padded to a whole number of blocks and pre-filled
// with a huge positive float that can never win a min().
// Precondition: all inputs are below about 3.39e38; an empty input yields that
// sentinel value.
void your_histogram_and_prefixsum(const float* const d_logLuminance,
                                  float &min_logLum,
                                  const size_t numRows,
                                  const size_t numCols)
{
    const unsigned int threadsPerBlock = 512;
    const size_t pixelCount = numRows * numCols;

    // Size both buffers for the first (largest) level, rounded up to whole blocks.
    size_t firstLevelBlocks = (pixelCount + threadsPerBlock - 1) / threadsPerBlock;
    if (firstLevelBlocks == 0) {
        firstLevelBlocks = 1;
    }
    const size_t paddedBytes = firstLevelBlocks * threadsPerBlock * sizeof(float);

    float *d_in = 0;
    float *d_out = 0;
    checkCudaErrors(cudaMalloc((void**) &d_in, paddedBytes));
    checkCudaErrors(cudaMalloc((void**) &d_out, paddedBytes));

    // cudaMemset writes bytes: 0x7f in every byte gives the float bit pattern
    // 0x7f7f7f7f (about 3.39e38), a finite value that acts as "+infinity" here.
    checkCudaErrors(cudaMemset(d_in, 0x7f, paddedBytes));
    checkCudaErrors(cudaMemset(d_out, 0x7f, paddedBytes));
    checkCudaErrors(cudaMemcpy(d_in, d_logLuminance, sizeof(float)*pixelCount, cudaMemcpyDeviceToDevice));

    // Each level turns `remaining` candidates into one partial minimum per block.
    // The buffers are swapped instead of copied. Entries beyond the partials of a
    // level hold older data (original values, earlier partial minima or the
    // sentinel), all of which are >= the true minimum, so reading them is harmless.
    size_t remaining = pixelCount;
    while (remaining > 1) {
        const size_t blocks = (remaining + threadsPerBlock - 1) / threadsPerBlock;
        reduceMin<<<dim3((unsigned int) blocks), dim3(threadsPerBlock), threadsPerBlock*sizeof(float)>>>(d_in, d_out);
        checkCudaErrors(cudaGetLastError());

        float *swapTmp = d_in;
        d_in = d_out;
        d_out = swapTmp;
        remaining = blocks;
    }
    // Surface any fault raised while the kernels were running.
    checkCudaErrors(cudaDeviceSynchronize());

    // After the last swap the result sits in element 0 of d_in.
    checkCudaErrors(cudaMemcpy(&min_logLum, d_in, sizeof(float), cudaMemcpyDeviceToHost));
    std::cout<< "Min value = " << min_logLum << std::endl;

    checkCudaErrors(cudaFree(d_out));
    checkCudaErrors(cudaFree(d_in));
}
And if you are curious, here is my reduction kernel:
// Per-block minimum: every thread loads one element into dynamic shared memory,
// the block halves the active range until a single value remains, and thread 0
// stores that value at g_outputRange[blockIdx.x].
//   g_inputRange  - input; element blockDim.x * blockIdx.x + threadIdx.x is read
//                   without a bounds check, so the input must cover the whole grid
//   g_outputRange - one minimum per block
// Launch with blockDim.x * sizeof(float) bytes of dynamic shared memory.
__global__
void reduceMin(const float* const g_inputRange,
               float* g_outputRange)
{
    extern __shared__ float sdata[];

    const unsigned int lane = threadIdx.x;
    const unsigned int globalIdx = blockDim.x * blockIdx.x + lane;

    sdata[lane] = g_inputRange[globalIdx];
    __syncthreads();

    unsigned int half = blockDim.x / 2;
    while (half > 0) {
        if (lane < half) {
            const float mine = sdata[lane];
            const float partner = sdata[lane + half];
            sdata[lane] = min(mine, partner);
        }
        // All threads reach the barrier, including those idle in this step.
        __syncthreads();
        half >>= 1;
    }

    if (lane == 0) {
        g_outputRange[blockIdx.x] = sdata[0];
    }
}
There are many ways to skin the cat, but if you want to minimize kernel launches, it can always be done with at most two kernel launches.
The first kernel launch is composed of up to however many blocks correspond to the number of threads per block that your device supports. Newer devices will support 1024, older devices, 512.
Each of these (at most 512 or 1024) blocks in the first kernel will participate in a grid-looping sum of all the data elements in global memory.
Each of these blocks will then do a partial reduction and write a partial result to global memory. There will be 512 or 1024 of these partial results.
The second kernel launch will be composed of 512 or 1024 threads in a single block. Each thread will pick up one of the partial results from global memory, and then the threads in that single block will cooperatively reduce the partial results to a single final result, and write it back to global memory.
The "grid-looping sum" is described in reduction #7 here as "multiple add/thread". All of the reductions described in this document are available in the NVIDIA reduction sample code

CUDA-GDB crashes in Kernel

I've been trying to debug my code, as I know something is going wrong in the Kernel, and I've been trying to figure out what specifically. If I try to step into the kernel it seems to completely step over the kernel functions, and will eventually cause an error on quitting:
Single stepping until exit from function dyld_stub_cudaSetupArgument,
which has no line number information.
[Launch of CUDA Kernel 0 (incrementArrayOnDevice<<<(3,1,1),(4,1,1)>>>) on
Device 0]
[Termination of CUDA Kernel 0 (incrementArrayOnDevice<<<(3,1,1),
(4,1,1)>>>) on Device 0]
[Launch of CUDA Kernel 1 (fillinBoth<<<(40,1,1),(1,1,1)>>>) on Device 0]
[Termination of CUDA Kernel 1 (fillinBoth<<<(40,1,1),(1,1,1)>>>) on Device 0]
add (below=0x124400, newtip=0x124430, newfork=0x125ac0) at test.cu:1223
And if I try to break in the Kernel my entire computer crashes and I have to restart it.
I figure there must be something wrong with the way I'm calling the kernel, but I can't figure out what.
The code is rather long, so I'm only including an excerpt of it:
// Per-site kernel: block idx copies qset[idx] (three longs) into the shared
// array qs, then for i = 0..1 and every aa from ala through stop, if bit aa is
// set in qs[i] it ORs cudaTranslate[aa - ala][j - i] into qs[j] for j = i+1..2.
//   qset - array of per-site entries, indexed by block
//   max  - number of valid entries; blocks with idx >= max do nothing
// seqptr, aas, ala, stop and cudaTranslate are declared outside this excerpt.
// NOTE(review): nothing in the visible body copies qs back to qset or to any
// other output - confirm whether a write-back was left out of the excerpt.
__global__ void fillinOne(seqptr qset, long max) {
int i, j;
aas aa;
// One site per block; threadIdx is not used.
int idx = blockIdx.x;
// NOTE(review): qs is shared by the whole block and not indexed by thread, so
// with more than one thread per block all threads would update it concurrently.
__shared__ long qs[3];
if(idx < max)
{
memcpy(qs, qset[idx], sizeof(long[3]));
for (i = 0; i <= 1; i++)
{
// Walk every value from ala through stop via its integer representation.
for (aa = ala; (long)aa <= (long)stop; aa = (aas)((long)aa + 1))
{
// Is bit number aa set in qs[i]?
if (((1L << ((long)aa)) & qs[i]) != 0)
{
for (j = i + 1; j <= 2; j++)
qs[j] |= cudaTranslate[(long)aa - (long)ala][j - i];
}
}
}
}
}
//Kernel for left!= NULL and rt != NULL
void fillin(node *p, node *left, node *rt)
{
cudaError_t err = cudaGetLastError();
size_t stepsize = chars * sizeof(long);
size_t sitesize = chars * sizeof(sitearray);
//int i, j;
if (left == NULL)
{
//copy rt->numsteps into p->numsteps--doesn't actually require CUDA, because no computation to do
memcpy(p->numsteps, rt->numsteps, stepsize);
checkCUDAError("memcpy");
//allocate siteset (array of sitearrays) on device
seqptr qsites; //as in array of qs's
cudaMalloc((void **) &qsites, sitesize);
checkCUDAError("malloc");
//copy rt->siteset into device array (equivalent to memcpy(qs, rs) but for whole array)
cudaMemcpy(qsites, rt->siteset, sitesize, cudaMemcpyHostToDevice);
checkCUDAError("memcpy");
//do loop in device
int block_size = 1; //each site operated on independently
int n_blocks = chars;
fillinOne <<< n_blocks, block_size>>> (qsites, chars);
cudaThreadSynchronize();
//put qset in p->siteset--equivalent to memcpy(p->siteset[m], qs)
cudaMemcpy(p->siteset, qsites, sitesize, cudaMemcpyDeviceToHost);
checkCUDAError("memcpy");
//Cleanup
cudaFree(qsites);
}
If anyone has any ideas at all, please respond! Thanks in advance!
I suppose you have a single card configuration. When you are debugging a cuda kernel and you break inside it you effectively put the display driver in pause. That causes what you think is a crash. If you want to use the cuda-gdb with only one graphics card you must use it in command line mode (don't start X or press ctrl-alt-fn from X).
If you have two cards you must run the code on the card that is not running the display. Use cudaSetDevice(n).