What is the general way to launch the appropriate number of reduction kernels? - cuda

As I have read in NVIDIA's reduction slides at http://www.cuvilib.com/Reduction.pdf, for arrays bigger than blockSize I should launch multiple reduction kernels to achieve global synchronization. What is the general way to determine how many times I should launch the reduction kernel? I tried the approach below, but I need to cudaMalloc two additional pointers, which takes a lot of processing time.
My job is to reduce the array d_logLuminance to one minimum value, min_logLum:
void your_histogram_and_prefixsum(const float* const d_logLuminance,
                                  float &min_logLum,
                                  const size_t numRows,
                                  const size_t numCols)
{
    const dim3 blockSize(512);
    unsigned int pixel = numRows*numCols;
    const dim3 gridSize(pixel/blockSize.x+1);

    // Reduction kernels to find max and min value
    float *d_tempMin, *d_min;
    checkCudaErrors(cudaMalloc((void**) &d_tempMin, sizeof(float)*pixel));
    checkCudaErrors(cudaMalloc((void**) &d_min, sizeof(float)*pixel));
    checkCudaErrors(cudaMemcpy(d_min, d_logLuminance, sizeof(float)*pixel, cudaMemcpyDeviceToDevice));

    dim3 subGrid = gridSize;
    for (int reduceLevel = pixel; reduceLevel > 0; reduceLevel /= blockSize.x) {
        checkCudaErrors(cudaMemcpy(d_tempMin, d_min, sizeof(float)*pixel, cudaMemcpyDeviceToDevice));
        reduceMin<<<subGrid, blockSize, blockSize.x*sizeof(float)>>>(d_tempMin, d_min);
        cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
        subGrid.x = subGrid.x / blockSize.x + 1;
    }

    checkCudaErrors(cudaMemcpy(&min_logLum, d_min, sizeof(float), cudaMemcpyDeviceToHost));
    std::cout << "Min value = " << min_logLum << std::endl;

    checkCudaErrors(cudaFree(d_tempMin));
    checkCudaErrors(cudaFree(d_min));
}
And if you are curious, here is my reduction kernel:
__global__
void reduceMin(const float* const g_inputRange,
               float* g_outputRange)
{
    extern __shared__ float sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
    sdata[tid] = g_inputRange[i];
    __syncthreads();

    for (unsigned int s = blockDim.x/2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] = min(sdata[tid], sdata[tid+s]);
        }
        __syncthreads();
    }

    if (tid == 0) {
        g_outputRange[blockIdx.x] = sdata[0];
    }
}

There are many ways to skin the cat, but if you want to minimize kernel launches, it can always be done with at most two kernel launches.
The first kernel launch is composed of up to as many blocks as the number of threads per block that your device supports: newer devices support 1024 threads per block, older devices 512.
Each of these (at most 512 or 1024) blocks in the first kernel will participate in a grid-looping sum of all the data elements in global memory.
Each of these blocks will then do a partial reduction and write a partial result to global memory. There will be 512 or 1024 of these partial results.
The second kernel launch will be composed of 512 or 1024 threads in a single block. Each thread will pick up one of the partial results from global memory, and then the threads in that single block will cooperatively reduce the partial results to a single final result, and write it back to global memory.
The "grid-looping sum" is described in reduction #7 here as "multiple add/thread". All of the reductions described in this document are available in the NVIDIA reduction sample code

Related

How to make use of swap space on disk when running out of modern GPU memory?

Post-Pascal unified memory (UM) can allocate more memory than the GPU has, swapping pages in and out between GPU memory and host memory automatically.
So what happens if I run out of both GPU memory and host memory? How can I use the swap space on disk? Virtual-memory swap space does not seem to work in the cudaMallocManaged case. Here is how I did the experiment:
create swap space: dd if=/dev/zero of=./swapfile bs=1G count=16, then mkswap and swapon
create a host-memory occupier that burns ~99% of host memory:
for (int i = 0; i < 8000; i++)
    malloc(1 << 20);
create GPU memory occupier with cudaMalloc, leaving 1G of GPU memory
run the official UVM demo, but with a 12GB workload, which will exhaust all available GPU memory and host memory:
#include <iostream>
#include <math.h>

// CUDA kernel to add elements of two arrays
__global__ void add(int n, float *x, float *y) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
        y[i] = x[i] + y[i];
}

int main(void) {
    long long N = 6LL * (1 << 30) / sizeof(float); // <<<<<<<<<<< 6GB per array, 12GB total
    float *x, *y;

    // Allocate Unified Memory -- accessible from CPU or GPU
    cudaMallocManaged(&x, N * sizeof(float));
    cudaMallocManaged(&y, N * sizeof(float));

    // initialize x and y arrays on the host
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    // Launch kernel on the GPU
    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;
    add<<<numBlocks, blockSize>>>(N, x, y);

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();

    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(y[i] - 3.0f));
    std::cout << "Max error: " << maxError << std::endl;

    // Free memory
    cudaFree(x);
    cudaFree(y);
    return 0;
}
It just gets killed by the oom-killer and leaves the swap space alone.
If it is a pure libc malloc with a 100GB workload, you can see the swap-space usage growing.
I expected UM/UVA to be able to use GPU memory + host memory + swap space, just like ordinary virtual memory.
There are 3 types of memory allocations that are accessible to GPU device code:
ordinary (e.g. cudaMalloc)
pinned (e.g. cudaHostAlloc)
managed (e.g. cudaMallocManaged)
None of these will make use of, or have any bearing on, traditional Linux swap space (or the equivalent on Windows). The first is limited by available device memory, and the second two are limited by available host memory (or some other lower limit). All host-based allocations accessible to GPU device code must be resident in non-swappable memory, using "swappable" here to refer to the ordinary host virtual memory management system that may swap pages out to disk.
The only space that benefits from this form of swapping is host pageable memory allocations, and these are not directly accessible from CUDA device code.
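For reference, a minimal sketch of the three allocation flavours listed above (the size and variable names are purely illustrative), with comments restating the swap behaviour described in this answer:

#include <cuda_runtime.h>

int main()
{
    const size_t bytes = 1 << 20;   // 1 MiB, illustrative only
    float *d_ordinary, *h_pinned, *managed;

    // Ordinary device allocation: limited by available device memory.
    cudaMalloc((void**)&d_ordinary, bytes);

    // Pinned host allocation: page-locked, so it can never be swapped out to disk.
    cudaHostAlloc((void**)&h_pinned, bytes, cudaHostAllocDefault);

    // Managed allocation: migrates between host and device on demand,
    // but its host backing is likewise not eligible for disk swap.
    cudaMallocManaged(&managed, bytes);

    cudaFree(d_ordinary);
    cudaFreeHost(h_pinned);
    cudaFree(managed);
    return 0;
}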

dram_write_bytes result on P100

I used nvprof to profile a simple vecadd example (n=1024) on a P100, but I observed that dram_write_bytes is only 256 (rather than the 1024*4 I expected). Can someone explain why this number is so small? What other metrics do I need to add to account for the global memory writes? Thanks. The flop_count_sp number is correct (1024).
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

__global__ void vecAdd(float* a, float* b, float* c, int n){
    int id = blockIdx.x*blockDim.x + threadIdx.x;
    if (id < n) c[id] = a[id] + b[id];
}

int main(int argc, char* argv[]){
    int n = 1024;
    float *h_a, *d_a;
    float *h_b, *d_b;
    float *h_c, *d_c;
    size_t bytes = n*sizeof(float);

    h_a = (float*)malloc(bytes);
    h_b = (float*)malloc(bytes);
    h_c = (float*)malloc(bytes);
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);

    int i;
    for (i = 0; i < n; i++){
        h_a[i] = sin(i)*sin(i);
        h_b[i] = cos(i)*cos(i+1);
    }

    cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);

    vecAdd<<<1, 1024>>>(d_a, d_b, d_c, n);

    cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);

    float sum = 0;
    for (i = 0; i < n; i++)
        sum += h_c[i] - h_a[i] - h_b[i];
    printf("final diff: %f\n", sum/n);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(h_a);
    free(h_b);
    free(h_c);
    return 0;
}
Is it related to the sampling of nvprof? One time I got 384 bytes; sometimes I even got 0 bytes. The weird thing is: if I change n to 1024*1024, I get more bytes than expected (4688032), and 4688032/1024/1024/4 = 1.11.
There are several reasons why your expectations are not being observed and the data is changing:
The GPU memory system is shared by all engines. The primary engine is the graphics/compute engine, but other engines such as copy engines, display, etc. also access device memory, and the memory controller (FB = framebuffer) counters have no way to track the requester.
NVPROF injection does not attempt to evict all context memory from the L2 cache. The cudaMemcpys prior to the launch and the kernel replay code in nvprof will leave the L2 cache in an inconsistent state.
The initial size of 4KB is simply too small to track accurately. The full data set could be in L2 from either the cudaMemcpy or the replay. Furthermore, the bytes you see can come from other clients such as the constant caches.
It is highly recommended that you scale the buffer size to something reasonable. On newer GPUs the Nsight Compute profiler has an improved L2-level breakdown of the various clients to help detect unexpected traffic. In addition, the Nsight Compute replay logic clears the L2 cache so that each replay has a consistent start state.
If you have a monitor attached, it is recommended to move the monitor to a different GPU when looking at DRAM counters. nvprof L2 counters generally filter the count by traffic from the SMs, so traffic from copy engines, the display controller, the MMU, constant caches, etc. will not show up in the L2 counters.
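As a hedged illustration of the "scale the buffer size" advice, the test could be enlarged along these lines (the 64M-element size and the multi-block launch are my own choices; with 256 MiB per array the working set is far larger than the P100's 4 MiB L2, so the write traffic has to reach DRAM):

#include <cuda_runtime.h>
#include <cstdio>

__global__ void vecAdd(const float* a, const float* b, float* c, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n) c[id] = a[id] + b[id];
}

int main() {
    const int n = 64 * 1024 * 1024;              // 64M floats = 256 MiB per array (assumed size)
    const size_t bytes = (size_t)n * sizeof(float);
    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);
    cudaMemset(d_a, 0, bytes);                   // contents do not matter for the traffic measurement
    cudaMemset(d_b, 0, bytes);

    const int blockSize = 256;
    const int gridSize = (n + blockSize - 1) / blockSize;
    vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
    cudaDeviceSynchronize();

    // Profiling this kernel (e.g. the dram_write_bytes metric) should now report
    // roughly `bytes` written, plus noise from the other clients mentioned above.
    printf("expected write volume: about %zu bytes\n", bytes);

    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}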

CUDA kernels are not overlapping

I have a simple vector multiplication kernel which I am executing in 2 streams. But when I profile it in NVVP, the kernels do not seem to overlap. Is it because each kernel execution utilizes 100% of the GPU, and if not, what can be the cause?
Source code :
#include "common.h"
#include <cstdlib>
#include <stdio.h>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_profiler_api.h"
#include <string.h>
const int N = 1 << 20;
__global__ void kernel(int n, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n) y[i] = x[i] * y[i];
}
int main()
{
float *x, *y, *d_x, *d_y, *d_1, *d_2;
x = (float*)malloc(N*sizeof(float));
y = (float*)malloc(N*sizeof(float));
cudaMalloc(&d_x, N*sizeof(float));
cudaMalloc(&d_y, N*sizeof(float));
cudaMalloc(&d_1, N*sizeof(float));
cudaMalloc(&d_2, N*sizeof(float));
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_1, x, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_2, y, N*sizeof(float), cudaMemcpyHostToDevice);
const int num_streams = 8;
cudaStream_t stream1;
cudaStream_t stream2;
cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);
cudaEvent_t start, stop;
float elapsedTime;
cudaEventCreate(&start);
cudaEventRecord(start, 0);
for (int i = 0; i < 300; i++) {
kernel << <512, 512, 0, stream1 >> >(N, d_x, d_y);
kernel << <512, 512, 0, stream2 >> >(N, d_1, d_2);
}
cudaStreamSynchronize(stream1);
cudaStreamSynchronize(stream2);
// cudaDeviceSynchronize();
cudaEventCreate(&stop);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Elapsed time : %f ms\n", elapsedTime);
cudaDeviceReset();
cudaProfilerStop();
return 0;
}
EDIT: From the comments I understand that each kernel is utilizing the GPU fully, so what is the best approach for achieving this 262144-sized vector multiplication across multiple streams?
My device information :
CUDA Device Query...
There are 1 CUDA devices.
CUDA Device #0
Major revision number: 5
Minor revision number: 0
Name: GeForce GTX 850M
Total global memory: 0
Total shared memory per block: 49152
Total registers per block: 65536
Warp size: 32
Maximum memory pitch: 2147483647
Maximum threads per block: 1024
Maximum dimension 0 of block: 1024
Maximum dimension 1 of block: 1024
Maximum dimension 2 of block: 64
Maximum dimension 0 of grid: 2147483647
Maximum dimension 1 of grid: 65535
Maximum dimension 2 of grid: 65535
Clock rate: 901500
Total constant memory: 65536
Texture alignment: 512
Concurrent copy and execution: Yes
Number of multiprocessors: 5
Kernel execution timeout: Yes
The reason why your kernels don't overlap is that your GPU is 'filled' with execution threads, as @Robert Crovella mentions. Checking the Compute Capabilities chapter of the CUDA Programming Guide, there is a limit of 2048 threads per SM for your compute capability (5.0). You have 5 SMs, so this gives a maximum of 10240 threads that can run simultaneously on your device. You are launching 512x512 = 262144 threads with just a single kernel call, and that leaves practically no room at all for the other kernel call.
You need to launch small enough kernels so that 2 can run concurrently on your device.
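As a quick sanity check, that resident-thread limit can be queried at runtime; here is a minimal sketch (device 0 assumed):

#include <cuda_runtime.h>
#include <cstdio>

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    // For a GTX 850M (CC 5.0): 2048 threads/SM * 5 SMs = 10240 resident threads,
    // while a single <<<512, 512>>> launch asks for 262144 threads.
    int maxResident = prop.maxThreadsPerMultiProcessor * prop.multiProcessorCount;
    printf("Max concurrently resident threads: %d\n", maxResident);
    return 0;
}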
I'm not an expert on streams, but from what I've understood, if you want to run your program using streams you need to split it up into chunks, and you have to calculate a proper offset mechanism so that each stream can access its own portion of the data. In your current code, each stream that you are launching does exactly the same calculation over exactly the same data. You have to split the data among the streams.
Other than that, if you want to get the maximum performance you need to overlap the kernel execution with asynchronous data transfers. The easiest way to do this is to assign a scheme like the following to each of your streams, as presented here:
for (int i = 0; i < nStreams; ++i) {
    int offset = i * streamSize;
    cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[i]);
    kernel<<<streamSize/blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
    cudaMemcpyAsync(&a[offset], &d_a[offset], streamBytes, cudaMemcpyDeviceToHost, stream[i]);
}
This configuration simply tells each stream to do a memcpy, then to execute the kernel on some data, then to copy the data back. After the async calls, the streams will work simultaneously, completing their tasks.
PS: I would also recommend revising your kernel. Using one thread to compute just one multiplication is overkill; I would have each thread process more data (see the sketch below).
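To make that concrete, here is a minimal sketch of my own (illustrative sizes and names, no error checking) that combines the chunking, pinned host memory so the async copies can actually overlap, and a grid-stride kernel so each thread handles several elements:

#include <cuda_runtime.h>

__global__ void multiply(int n, const float* x, float* y) {
    // Grid-stride loop: each thread processes several elements instead of just one.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
        y[i] = x[i] * y[i];
}

int main() {
    const int N = 1 << 20, nStreams = 2, chunk = N / nStreams;
    const size_t chunkBytes = chunk * sizeof(float);
    float *h_x, *h_y, *d_x, *d_y;

    cudaMallocHost((void**)&h_x, N * sizeof(float));   // pinned host memory
    cudaMallocHost((void**)&h_y, N * sizeof(float));
    cudaMalloc((void**)&d_x, N * sizeof(float));
    cudaMalloc((void**)&d_y, N * sizeof(float));
    for (int i = 0; i < N; i++) { h_x[i] = 1.0f; h_y[i] = 2.0f; }

    cudaStream_t stream[nStreams];
    for (int s = 0; s < nStreams; s++) cudaStreamCreate(&stream[s]);

    const int blockSize = 256;
    const int gridSize = 16;   // 16 * 256 = 4096 threads per launch, so two launches fit in 10240

    for (int s = 0; s < nStreams; s++) {
        const int off = s * chunk;
        cudaMemcpyAsync(d_x + off, h_x + off, chunkBytes, cudaMemcpyHostToDevice, stream[s]);
        cudaMemcpyAsync(d_y + off, h_y + off, chunkBytes, cudaMemcpyHostToDevice, stream[s]);
        multiply<<<gridSize, blockSize, 0, stream[s]>>>(chunk, d_x + off, d_y + off);
        cudaMemcpyAsync(h_y + off, d_y + off, chunkBytes, cudaMemcpyDeviceToHost, stream[s]);
    }
    for (int s = 0; s < nStreams; s++) cudaStreamSynchronize(stream[s]);

    for (int s = 0; s < nStreams; s++) cudaStreamDestroy(stream[s]);
    cudaFreeHost(h_x); cudaFreeHost(h_y); cudaFree(d_x); cudaFree(d_y);
    return 0;
}

Whether the kernels themselves actually overlap still depends on how much of the GPU each launch occupies; the main win in this scheme is overlapping the copies with the computation.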

Shared memory, branching performance and register count

I came across some peculiar performance behaviour when trying out the CUDA shuffle instruction. The test kernel below is based on an image processing algorithm which adds input-dependent values to all neighbouring pixels within a square of side rad. The output for each block is accumulated in shared memory. If only one thread per warp adds its result to shared memory, the performance is poor (Option 1), whereas if all threads add to shared memory (one thread adds the desired value, the rest just add 0), the execution time drops by a factor of 2-3 (Option 2).
#include <iostream>
#include "cuda_runtime.h"

#define warpSz 32
#define tileY 32
#define rad 32

__global__ void test(float *out, int pitch)
{
    // Set shared mem to 0
    __shared__ float tile[(warpSz + 2*rad) * (tileY + 2*rad)];
    for (int i = threadIdx.y*blockDim.x+threadIdx.x; i < (tileY+2*rad)*(warpSz+2*rad); i += blockDim.x*blockDim.y) {
        tile[i] = 0.0f;
    }
    __syncthreads();

    for (int row = threadIdx.y; row < tileY; row += blockDim.y) {
        // Loop over pixels in neighbourhood
        for (int i = 0; i < 2*rad+1; ++i) {
            float res = 0.0f;
            int rowStartIdx = (row+i)*(warpSz+2*rad);
            for (int j = 0; j < 2*rad+1; ++j) {
                res += float(threadIdx.x+row); // Substitute for real calculation

                // Option 1: one thread writes to shared mem
                if (threadIdx.x == 0) {
                    tile[rowStartIdx + j] += res;
                    res = 0.0f;
                }

                //// Option 2: all threads write to shared mem
                //float tmp = 0.0f;
                //if (threadIdx.x == 0) {
                //    tmp = res;
                //    res = 0.0f;
                //}
                //tile[rowStartIdx + threadIdx.x+j] += tmp;

                res = __shfl(res, (threadIdx.x+1) % warpSz);
            }
            res += float(threadIdx.x+row);
            tile[rowStartIdx + threadIdx.x+2*rad] += res;
            __syncthreads();
        }
    }

    // Add result back to global mem
    for (int row = threadIdx.y; row < tileY+2*rad; row += blockDim.y) {
        for (int col = threadIdx.x; col < warpSz+2*rad; col += warpSz) {
            int idx = (blockIdx.y*tileY + row)*pitch + blockIdx.x*warpSz + col;
            atomicAdd(out+idx, tile[row*(warpSz+2*rad) + col]);
        }
    }
}

int main(void)
{
    int2 dim = make_int2(512, 512);
    int pitchOut = (((dim.x+2*rad)+warpSz-1) / warpSz) * warpSz;
    int sizeOut = pitchOut*(dim.y+2*rad);
    dim3 gridDim((dim.x+warpSz-1)/warpSz, (dim.y+tileY-1)/tileY, 1);

    float *devOut;
    cudaMalloc((void**)&devOut, sizeOut*sizeof(float));

    cudaEvent_t start, stop;
    float elapsedTime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaFree(0);

    cudaEventRecord(start, 0);
    test<<<gridDim, dim3(warpSz, 8)>>>(devOut, pitchOut);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    cudaFree(devOut);
    cudaDeviceReset();
    std::cout << "Elapsed time: " << elapsedTime << " ms.\n";
    std::cin.ignore();
}
Is this expected behaviour/can anyone explain why this happens?
One thing I have noted is that Option 1 uses only 15 registers, whereas Option 2 uses 37, which seems a big difference to me.
Another is that the if-statement in the innermost loop is converted to explicit bra instructions in the PTX code for Option 1, whereas for Option 2 it is converted to two selp instructions. Could it be that the explicit branching is behind the 2-3x slowdown, similar to what is suspected in this question?
There are two reasons why I am reluctant to go for Option 2. First, when profiling the original application it seems to be limited by shared memory bandwidth, which indicates that there is potential to increase the performance by having fewer threads access it. Second, unless we use the volatile keyword, writes to shared memory can be optimised into registers. Since we are only interested in the contribution from the last thread to access each memory location (threadIdx.x == 0), and all others add 0, this is not a problem as long as all changes temporarily held in registers are guaranteed to be written back to shared memory in the same order they were issued. Is that the case, though? (So far, both options have produced exactly the same result.)
Any thoughts or ideas are much appreciated!
PS. I compile for compute capability 3.0. (However, the shuffle instruction is not necessary to demonstrate the behaviour and can be commented out.)

Latency of shuffle instructions in CUDA

About the latency of __shfl() instruction:
Does the following instruction
c=__shfl(c, indi);
/*
where indi is any integer number (possibly random, < 32)
and is different for different lane IDs.
*/
have the same latency compared to, let's say:
c=__shfl_down(c,1);
All warp-shuffle instructions have the same performance.
To provide a "quantitative" follow-up answer to Robert's answer, let us consider Mark Harris' reduction approach using CUDA shuffle operations detailed at Faster Parallel Reductions on Kepler.
In this approach, warp reduction is performed by using __shfl_down. An alternative approach to warp reduction is to use __shfl_xor, according to Lecture 4: warp shuffles, and reduction / scan operations. Below, I'm reporting the full code implementing both approaches. If tested on a Kepler K20c, both take 0.044 ms to reduce an array of N=200000 float elements. Notably, both approaches outperform Thrust reduce by more than a factor of 20, since the execution time for the Thrust case is 1.06 ms for the same test.
Here is the full code:
#include <thrust/device_vector.h>
#include <cstdio>
#include <algorithm>

#define warpSize 32

/***********************************************/
/* warpReduceSum PERFORMING REDUCTION PER WARP */
/***********************************************/
__forceinline__ __device__ float warpReduceSum(float val) {
    for (int offset = warpSize/2; offset > 0; offset /= 2) val += __shfl_down(val, offset);
    //for (int i=1; i<warpSize; i*=2) val += __shfl_xor(val, i);
    return val;
}

/*************************************************/
/* blockReduceSum PERFORMING REDUCTION PER BLOCK */
/*************************************************/
__forceinline__ __device__ float blockReduceSum(float val) {
    // --- The shared memory is appointed to contain the warp reduction results. It is understood that the maximum
    //     number of threads per block will be 1024, so that there will be at most 32 warps per block.
    static __shared__ float shared[32];

    int lane = threadIdx.x % warpSize;  // Thread index within the warp
    int wid  = threadIdx.x / warpSize;  // Warp ID

    // --- Performing warp reduction. Only the threads with 0 index within the warp have the "val" value set with the warp reduction result
    val = warpReduceSum(val);

    // --- Only the threads with 0 index within the warp write the warp result to shared memory
    if (lane == 0) shared[wid] = val;   // Write reduced value to shared memory

    // --- Wait for all warp reductions
    __syncthreads();

    // --- There will be at most 1024 threads within a block and at most 1024 blocks within a grid. The partial sum is read
    //     from shared memory only if the corresponding warp existed, otherwise the partial sum is set to zero.
    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;

    // --- The first warp performs the final partial warp summation.
    if (wid == 0) val = warpReduceSum(val);

    return val;
}

/********************/
/* REDUCTION KERNEL */
/********************/
__global__ void deviceReduceKernel(float *in, float *out, int N) {
    float sum = 0.f;
    // --- Reduce multiple elements per thread.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) sum += in[i];
    sum = blockReduceSum(sum);
    if (threadIdx.x == 0) out[blockIdx.x] = sum;
}

/********/
/* MAIN */
/********/
int main() {
    const int N = 200000;

    thrust::host_vector<float>   h_out(N, 0.f);
    thrust::device_vector<float> d_in(N, 3.f);
    thrust::device_vector<float> d_out(N);

    int threads = 512;
    int blocks  = std::min((N + threads - 1) / threads, 1024);

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // --- Performs the block reduction. It returns an output vector containing the block reductions as elements
    cudaEventRecord(start, 0);
    deviceReduceKernel<<<blocks, threads>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
    // --- Performs a second block reduction with only one block. The input is an array of all 0's, except the first elements,
    //     which are the block reduction results of the previous step.
    deviceReduceKernel<<<1, 1024>>>(thrust::raw_pointer_cast(d_out.data()), thrust::raw_pointer_cast(d_out.data()), blocks);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("CUDA Shuffle - elapsed time: %3.5f ms \n", time);
    h_out = d_out;

    cudaEventRecord(start, 0);
    float sum = thrust::reduce(d_in.begin(), d_in.end(), 0.f, thrust::plus<float>());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("CUDA Thrust - elapsed time: %3.5f ms \n", time);

    printf("Shuffle result = %f\n", h_out[0]);
    printf("Thrust result  = %f\n", sum);

    getchar();

    return 0;
}