Latency of shuffle instructions in CUDA - cuda

About the latency of __shfl() instruction:
Does the following instruction
c=__shfl(c, indi);
/*
where indi is any integer number(may be random (<32)),
and is different for different LaneID.
*/
have the same latency as, let's say:
c=__shfl_down(c,1);

All warp-shuffle instructions have the same performance.

To provide a "quantitative" follow-up answer to Robert's answer, let us consider Mark Harris' reduction approach using CUDA shuffle operations detailed at Faster Parallel Reductions on Kepler.
In this approach, warp reduction is performed by using __shfl_down. An alternative approach to warp reduction is using __shfl_xor according to Lecture 4: warp shuffles, and reduction / scan operations. Below, I'm reporting the full code implementing both approaches. When tested on a Kepler K20c, both take 0.044ms to reduce an array of N=200000 float elements. Notably, both approaches outperform Thrust reduce by more than an order of magnitude (roughly 24x), since the execution time for the Thrust case is 1.06ms for the same test.
Here is the full code:
#include <thrust\device_vector.h>
#define warpSize 32
/***********************************************/
/* warpReduceSum PERFORMING REDUCTION PER WARP */
/***********************************************/
/**
 * Sums `val` over the lanes of the calling warp with a shuffle-down tree.
 *
 * After the loop only lane 0 is guaranteed to hold the complete warp sum; the
 * other lanes hold partial sums.
 *
 * Preconditions: all 32 lanes of the warp must call this function together,
 * because the synchronising shuffle is issued with a full participation mask.
 *
 * @param val  this lane's contribution
 * @return     the warp sum in lane 0 (partial sums elsewhere)
 */
__forceinline__ __device__ float warpReduceSum(float val) {
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
        // The mask-less __shfl_down is not available when targeting Volta and
        // newer GPUs, so use the _sync variant with an explicit full-warp mask.
        val += __shfl_down_sync(0xffffffffu, val, offset);
#else
        // Toolkits older than CUDA 9.0 only provide the legacy intrinsic.
        val += __shfl_down(val, offset);
#endif
    }
    // Butterfly alternative:
    //for (int i=1; i<warpSize; i*=2) val += __shfl_xor_sync(0xffffffffu, val, i);
    return val;
}
/*************************************************/
/* blockReduceSum PERFORMING REDUCTION PER BLOCK */
/*************************************************/
/**
 * Sums `val` over all threads of the calling block.
 *
 * Stage 1: every warp reduces its own values with warpReduceSum and lane 0 of
 *          each warp stores the warp total in shared memory.
 * Stage 2: after a block-wide barrier, the first warp loads the per-warp totals
 *          and reduces them with a second warpReduceSum.
 *
 * The scratch array has 32 slots, i.e. room for the 32 warps of a 1024-thread
 * block. The number of warp totals read back is blockDim.x / warpSize with
 * truncating division, so a trailing partial warp is not included.
 *
 * @param val  this thread's contribution
 * @return     the block total in thread 0; other threads receive partial values
 */
__forceinline__ __device__ float blockReduceSum(float val) {
    // One slot per warp of the block.
    static __shared__ float shared[32];

    const int wid  = threadIdx.x / warpSize;   // index of this thread's warp
    const int lane = threadIdx.x % warpSize;   // position of the thread inside its warp

    // Stage 1: per-warp reduction; the warp total ends up in lane 0.
    val = warpReduceSum(val);
    if (lane == 0) {
        shared[wid] = val;
    }

    // Every warp total must be visible before any of them is read back.
    __syncthreads();

    // Threads whose index is below the warp count pick up one warp total each;
    // all remaining threads contribute zero to the final pass.
    if (threadIdx.x < blockDim.x / warpSize) {
        val = shared[lane];
    } else {
        val = 0;
    }

    // Stage 2: only the first warp combines the warp totals.
    if (wid == 0) {
        val = warpReduceSum(val);
    }
    return val;
}
/********************/
/* REDUCTION KERNEL */
/********************/
/**
 * Writes one partial sum per block: out[blockIdx.x] = sum of the elements of
 * `in` visited by this block.
 *
 * Each thread starts at its global index and advances by the total number of
 * launched threads until it passes N, so any 1D grid size covers the input.
 * `in` and `out` may refer to the same buffer when a single block is launched
 * (the second launch in main does exactly that), which is why the pointers are
 * not declared __restrict__.
 *
 * @param in   input values, N elements
 * @param out  per-block results, at least gridDim.x elements
 * @param N    number of input elements
 */
__global__ void deviceReduceKernel(float *in, float* out, int N) {
    const unsigned int step = blockDim.x * gridDim.x;   // total threads in the grid

    // Accumulate several elements per thread.
    float partial = 0.f;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    while (idx < N) {
        partial += in[idx];
        idx += step;
    }

    // Combine the per-thread values of this block.
    const float blockTotal = blockReduceSum(partial);
    if (threadIdx.x == 0) {
        out[blockIdx.x] = blockTotal;
    }
}
/********/
/* MAIN */
/********/
/**
 * Benchmarks the two-pass shuffle-based reduction against thrust::reduce on an
 * array of N floats that are all 3.0f, and prints both timings and results.
 *
 * @return 0 on success, 1 if a CUDA error was reported for the kernel launches
 */
int main() {
    const int N = 200000;
    const int threads = 512;
    // At most 1024 blocks so the second pass can consume every partial sum with
    // a single 1024-thread block.
    const int blocks = min((N + threads - 1) / threads, 1024);

    thrust::device_vector<float> d_in(N, 3.f);
    // Only one partial sum per block is ever produced, so `blocks` slots suffice.
    thrust::device_vector<float> d_out(blocks);
    float *in_ptr  = thrust::raw_pointer_cast(d_in.data());
    float *out_ptr = thrust::raw_pointer_cast(d_out.data());

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    // --- First pass: one partial sum per block is written to d_out.
    deviceReduceKernel<<<blocks, threads>>>(in_ptr, out_ptr, N);
    // --- Second pass: a single block reduces the `blocks` partial sums in place;
    //     the final result is stored in d_out[0].
    deviceReduceKernel<<<1, 1024>>>(out_ptr, out_ptr, blocks);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    // Kernel launches do not return a status themselves; query it explicitly.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("CUDA failure: %s\n", cudaGetErrorString(err));
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        return 1;
    }

    cudaEventElapsedTime(&time, start, stop);
    printf("CUDA Shuffle - elapsed time:  %3.5f ms \n", time);
    thrust::host_vector<float> h_out = d_out;

    cudaEventRecord(start, 0);
    float sum = thrust::reduce(d_in.begin(), d_in.end(), 0.f, thrust::plus<float>());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("CUDA Thrust - elapsed time:  %3.5f ms \n", time);

    printf("Shuffle result = %f\n", h_out[0]);
    printf("Thrust result = %f\n", sum);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    getchar();
    return 0;
}

Related

Can CUDA branch divergence help me in this case?

This is little more than a thought experiment right now, but I want to check my understanding of the CUDA execution model. Consider the following case:
I am running on a GPU with poor double-precision performance (a non-Tesla card).
I have a kernel that needs to calculate a value using double precision. That value is a constant for the rest of the runtime of the kernel, and it is also constant across a warp.
Is something like the following pseudocode advantageous?
// value that we use later in the kernel; this is constant across all threads
// in a warp
int constant_value;
// check to see if this is the first thread in a warp
enum { warp_size = 32 };
if (!(threadIdx.x & (warp_size - 1)))
{
    // only do the double-precision math in one thread
    constant_value = (int) round(double_precision_calculation());
}
// broadcast lane 0's constant_value to all threads in the warp; every lane of
// the warp must reach this call (CUDA 9+; older toolkits spell it
// __shfl(constant_value, 0))
constant_value = __shfl_sync(0xffffffff, constant_value, 0);
// go on to use constant_value as needed later in the kernel
The reason why I considered doing this is my (possibly wrong) understanding of how double-precision resources are made available on each multiprocessor. From what I understand, there are simply 1/32 as many double-precision ALUs as single-precision ones on recent Geforce cards. Does this mean that if the other threads in a warp diverge, I can work around this lack of resources, and still get decent performance, as long as the double-precision values that I want can be broadcast to all threads in a warp?
Does this mean that if the other threads in a warp diverge, I can work around this lack of resources, and still get decent performance, as long as the double-precision values that I want can be broadcast to all threads in a warp?
No, you can't.
An instruction issue always occurs at the warp level, even in a warp-diverged scenario. Since it is issued at the warp level, it will require/use/schedule enough execution resources for the warp, even for inactive threads.
Therefore a computation done on only one thread will still use the same resources/scheduling slot as a computation done on all 32 threads in the warp.
For example, a floating point multiply will require 32 instances of usage of a floating point ALU. The exact scheduling of this will vary based on the specific GPU, but you cannot reduce the 32 instance usage to a lower number through warp divergence or any other mechanism.
Based on a question in the comments, here's a worked example on CUDA 7.5, Fedora 20, GT640 (GK208 - has 1/24 ratio of DP to SP units):
$ cat t1241.cu
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
// Number of microseconds in one second.
#define USECPSEC 1000000ULL
// Returns the current gettimeofday() wall-clock time in microseconds minus
// `start`. Call with 0 to obtain a timestamp, then pass that timestamp back in
// to obtain the elapsed microseconds.
unsigned long long dtime_usec(unsigned long long start){
  timeval now;
  gettimeofday(&now, 0);
  const unsigned long long stamp = (now.tv_sec*USECPSEC) + now.tv_usec;
  return stamp - start;
}
// Threads per block used for every launch (a single warp).
const int nTPB = 32;
// Number of blocks used for every launch.
const int nBLK = 1;
// Trip count of the multiply-accumulate loop in mpy_k; also a factor in the
// size of the input allocation made in main.
const int rows = 1048576;
// Number of nTPB-wide rows in the shared array of mpy_k; used there as a
// mask (i & (nSD-1)).
const int nSD = 128;
// Element type under test.
typedef double mytype;
// Micro-benchmark kernel: performs `rows` dependent multiply-accumulate steps
// on a per-thread accumulator.
//   use_warp == true  : every thread executes the multiply-accumulate.
//   use_warp == false : only thread 0 of the block executes it.
// The shared array is read without ever being written, so the numeric result
// is meaningless; only the execution time is of interest.
template <bool use_warp>
__global__ void mpy_k(const mytype * in, mytype * out){
  __shared__ mytype sdata[nTPB*nSD];
  const int gid = threadIdx.x + blockDim.x*blockIdx.x;
  mytype acc = in[gid];
  #pragma unroll 128
  for (int i = 0; i < rows; i++) {
    if (use_warp) {
      acc += acc*sdata[threadIdx.x+(i&(nSD-1))*nTPB];
    } else if (threadIdx.x == 0) {
      acc += acc*sdata[threadIdx.x+(i&(nSD-1))*nTPB];
    }
  }
  out[gid] = acc;
}
// Times mpy_k<true> (whole warp active) against mpy_k<false> (one thread
// active). Each variant is run once as a warm-up and once timed.
// Returns 0 on success and 1 if any CUDA call reported an error.
int main(){
  mytype *din = NULL, *dout = NULL;
  const size_t inBytes  = nTPB*nBLK*rows*sizeof(mytype);
  const size_t outBytes = nTPB*nBLK*sizeof(mytype);

  // Allocation failures would otherwise only show up as a kernel fault later.
  cudaError_t res = cudaMalloc(&din, inBytes);
  if (res == cudaSuccess) res = cudaMalloc(&dout, outBytes);
  if (res != cudaSuccess) {
    printf("CUDA runtime failure %s\n", cudaGetErrorString(res));
    cudaFree(din);
    cudaFree(dout);
    return 1;
  }
  cudaMemset(din, 0, inBytes);
  cudaMemset(dout, 0, outBytes);

  mpy_k<true><<<nBLK, nTPB>>>(din, dout); // warm-up
  cudaDeviceSynchronize();
  unsigned long long dt = dtime_usec(0);
  mpy_k<true><<<nBLK, nTPB>>>(din, dout);
  cudaDeviceSynchronize();   // the launch is asynchronous; wait before stopping the clock
  dt = dtime_usec(dt);
  printf("full warp elapsed time: %f\n", dt/(float)USECPSEC);

  mpy_k<false><<<nBLK, nTPB>>>(din, dout); //warm up
  cudaDeviceSynchronize();
  dt = dtime_usec(0);
  mpy_k<false><<<nBLK, nTPB>>>(din, dout);
  cudaDeviceSynchronize();
  dt = dtime_usec(dt);
  printf("one thread elapsed time: %f\n", dt/(float)USECPSEC);

  res = cudaGetLastError();
  if (res != cudaSuccess) printf("CUDA runtime failure %s\n", cudaGetErrorString(res));

  // Release the device buffers (the original leaked them).
  cudaFree(din);
  cudaFree(dout);
  return (res == cudaSuccess) ? 0 : 1;
}
$ nvcc -arch=sm_35 -o t1241 t1241.cu
$ CUDA_VISIBLE_DEVICES="1" ./t1241
full warp elapsed time: 0.034346
one thread elapsed time: 0.049174
$
It is not faster to use just one thread in the warp for a floating-point multiply.

Strange Cudamemcpy execution time

I'm currently working on a Cuda code which computes a simple difference pixel by pixel of two images (size: 2560x1706 px) in order to compare execution time of CPU and GPU.
I realize a "for" loop of 1000 iterations of my kernel to have a more significant execution time, and I perform the cudaMemcpy (from device to host) straight after the loop to retrieve the data computed.
Nevertheless, the execution time of this cudaMemcpy took 2800 ms which is higher than expected. I just was asking myself why I obtain such a result.
Here is my Kernel Code :
// Per-element signed difference of two byte buffers:
//   data_res[k] = data2[k] - data1[k]   for every k < N.
// One thread handles one element; threads whose index is N or larger do nothing.
__global__ void diff (unsigned char *data1 ,unsigned char *data2, int *data_res)
{
    const int pixel = blockIdx.x*blockDim.x + threadIdx.x;
    if (pixel >= N) {
        return;
    }
    data_res[pixel] = (int) data2[pixel] - (int) data1[pixel];
}
Here is the kernel calls :
// Host-side driver: uploads both images, launches the diff kernel nb_loops
// times, then times the device-to-host copy of the result.
cudaProfilerStart();
// Cuda allocation
cudaMalloc((void**)&dev_data1, N*sizeof(unsigned char));
cudaMalloc((void**)&dev_data2, N*sizeof(unsigned char));
cudaMalloc((void**)&dev_data_res, N*sizeof(int));
// Cuda memory copy
cudaMemcpy(dev_data1, img1->data, N*sizeof(unsigned char), cudaMemcpyHostToDevice);
cudaMemcpy(dev_data2, img2->data, N*sizeof(unsigned char), cudaMemcpyHostToDevice);
cudaMemcpy(dev_data_res, imgresult->data, N*sizeof(int), cudaMemcpyHostToDevice);
//Simulate nb_loops images
// Kernel launches return immediately; the loop only queues work on the device.
for(int m = 0; m < nb_loops ; m++)
{
diff<<<blck_nb, thrd_nb>>>(dev_data1, dev_data2, dev_data_res);
//printf("%4d", m);
}
printf("WAITING FOR MEMCPY...\n");
// From here on the local variable `diff` hides the kernel of the same name.
clock_t begin = clock(), diff;
// cudaMemcpy blocks until the queued kernels have finished, so the interval
// measured below covers kernel execution plus the copy, not the copy alone.
// NOTE(review): the destination is spelled imgresult_data here but
// imgresult->data in the upload above - confirm which one is intended.
cudaMemcpy(imgresult_data, dev_data_res, N*sizeof(int), cudaMemcpyDeviceToHost);
diff = clock() - begin;
// NOTE(review): if clock_t is an integer type this division truncates to whole
// milliseconds before the conversion to float - confirm.
float msec = diff*1000/CLOCKS_PER_SEC;
printf("\t \nTime of the MEMCPY : %2.3f ms\n", msec);
printf("MEMCPY DEVICE TO HOST OK!\n");
cudaProfilerStop();
And here is the screenshot of the execution time results :
CUDA kernel launches are asynchronous, and cudaMemcpy is a blocking call. So what you are calling memcpy time is really kernel execution time plus memcpy time. Change your code like this:
...
// Queue all kernel launches (each launch returns immediately).
for(int m = 0; m < nb_loops ; m++)
{
diff<<<blck_nb, thrd_nb>>>(dev_data1, dev_data2, dev_data_res);
//printf("%4d", m);
}
// Wait for every queued kernel to finish so that the timer started afterwards
// measures only the memcpy.
cudaDeviceSynchronize();
printf("WAITING FOR MEMCPY...\n");
....
and this timing should be correct.

CUDA kernels are not overlapping

I have a simple vector multiplication kernel, which I am executing on 2 streams. But when I profile in NVVP, the kernels do not seem to overlap. Is it because each kernel execution utilizes 100% of the GPU? If not, what can be the cause?
Source code :
#include "common.h"
#include <cstdlib>
#include <stdio.h>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_profiler_api.h"
#include <string.h>
const int N = 1 << 20;
// Element-wise in-place product: y[k] = x[k] * y[k] for every k < n.
// One thread handles one element; threads with an index of n or more exit.
__global__ void kernel(int n, float *x, float *y)
{
    const int idx = blockDim.x*blockIdx.x + threadIdx.x;
    if (idx >= n) {
        return;
    }
    y[idx] = x[idx] * y[idx];
}
// Launches the multiplication kernel 300 times on each of two non-blocking
// streams (each stream works on its own pair of buffers) and prints the
// elapsed time between the first launch and the completion of both streams.
// Returns 0 on success, 1 on host allocation failure or CUDA launch error.
int main()
{
    const size_t bytes = N*sizeof(float);

    float *x = (float*)malloc(bytes);
    float *y = (float*)malloc(bytes);
    if (x == NULL || y == NULL) {
        printf("Host allocation failed\n");
        free(x);
        free(y);
        return 1;
    }
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    float *d_x, *d_y, *d_1, *d_2;
    cudaMalloc(&d_x, bytes);
    cudaMalloc(&d_y, bytes);
    cudaMalloc(&d_1, bytes);
    cudaMalloc(&d_2, bytes);
    cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_1, x, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_2, y, bytes, cudaMemcpyHostToDevice);

    cudaStream_t stream1;
    cudaStream_t stream2;
    cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);
    cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);

    cudaEvent_t start, stop;
    float elapsedTime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    // 512 blocks x 512 threads = 262144 threads per launch; the kernel handles
    // one element per thread, so only the first 262144 of the N elements are
    // touched by each launch.
    for (int i = 0; i < 300; i++) {
        kernel<<<512, 512, 0, stream1>>>(N, d_x, d_y);
        kernel<<<512, 512, 0, stream2>>>(N, d_1, d_2);
    }
    // Launch failures are not reported by the launch itself.
    cudaError_t err = cudaGetLastError();
    cudaStreamSynchronize(stream1);
    cudaStreamSynchronize(stream2);
    // The stop event is recorded only after the host has waited for both
    // streams, so the interval covers all queued kernels.
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf("Elapsed time : %f ms\n", elapsedTime);
    if (err != cudaSuccess) {
        printf("CUDA failure: %s\n", cudaGetErrorString(err));
    }

    // Release everything that was created above (the original leaked it all).
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_1);
    cudaFree(d_2);
    free(x);
    free(y);

    // Stop the profiler while the device context still exists, then reset.
    cudaProfilerStop();
    cudaDeviceReset();
    return (err == cudaSuccess) ? 0 : 1;
}
EDIT: From comments I understand each kernel is utilizing GPU fully, so what is the best approach for achieving 262144-sized vector multiplication (for multiple streams) ?
My device information :
CUDA Device Query...
There are 1 CUDA devices.
CUDA Device #0
Major revision number: 5
Minor revision number: 0
Name: GeForce GTX 850M
Total global memory: 0
Total shared memory per block: 49152
Total registers per block: 65536
Warp size: 32
Maximum memory pitch: 2147483647
Maximum threads per block: 1024
Maximum dimension 0 of block: 1024
Maximum dimension 1 of block: 1024
Maximum dimension 2 of block: 64
Maximum dimension 0 of grid: 2147483647
Maximum dimension 1 of grid: 65535
Maximum dimension 2 of grid: 65535
Clock rate: 901500
Total constant memory: 65536
Texture alignment: 512
Concurrent copy and execution: Yes
Number of multiprocessors: 5
Kernel execution timeout: Yes
The reason why your kernels don't overlap is that your GPU is 'filled' with execution threads, as @Robert Crovella mentions. Checking the Compute Capabilities chapter of the CUDA Programming Guide, there is a limit of 2048 threads per SM for your CC (5.0). You have 5 SMs, so this makes it
a maximum of 10240 threads that can run simultaneously on your device. You are calling 512x512=262144 threads, with just a single kernel call, and that pretty much leaves no space at all for the other kernel call.
You need to launch small enough kernels so that 2 can run concurrently on your device.
I'm not an expert on streams, but from what i've understood, if you want to run your program using streams, you need to split it up in chunks and you have to calculate a proper offset mechanism in order for your streams to be able to access their proper data. On your current code, each stream that you are launching does exactly the same calculation over exactly the same data. You have to split the data among the streams.
Other than that if you want to get the max performance you need to overlap the kernel execution with asynchronous data transfers. The easiest way to do this is to assign a scheme like the following to each of your streams like presented here
// Queue, for each stream, an independent copy-in / kernel / copy-out sequence
// on its own chunk of the data.
for (int i = 0; i < nStreams; ++i) {
// Index of the first element of the chunk handled by stream i
int offset = i * streamSize;
// Asynchronous host-to-device copy of this chunk on stream i
cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[i]);
// Kernel on the same stream; it receives the base pointer and the chunk offset
kernel<<<streamSize/blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
// Asynchronous device-to-host copy of the chunk back to the host array
cudaMemcpyAsync(&a[offset], &d_a[offset], streamBytes, cudaMemcpyDeviceToHost, stream[i]);
}
This configuration simply tells each stream to do a memcpy then to execute the kernel on some data then to copy the data back. After the async calls, the streams will work simultaneously completing their tasks.
PS: I would also recommend to revise your kernel as well. Using one thread to compute just one multiplication is an overkill. I would use the thread to process some more data.

Shared memory, branching performance and register count

I came across some peculiar performance behaviour when trying out the CUDA shuffle instruction. The test kernel below is based on an image processing algorithm which adds input-dependent values to all neighbouring pixels within a square of side rad. The output for each block is added in shared memory. If only one thread per warp adds its result to shared memory, the performance is poor (Option 1), whereas on the other hand, if all threads add to shared memory (one thread adds the desired value, the rest just add 0), the execution time drops by 2-3 times (Option 2).
#include <iostream>
#include "cuda_runtime.h"
#define warpSz 32
#define tileY 32
#define rad 32
// Test kernel: each block accumulates values into a shared-memory tile of
// (tileY + 2*rad) rows by (warpSz + 2*rad) columns and then adds the tile to
// the global output with atomicAdd.
//   out   : global output buffer, accumulated into (never overwritten)
//   pitch : row stride of `out`, in elements
// main launches this with blockDim = (warpSz, 8), so threadIdx.x is the lane
// index within a warp and threadIdx.y selects the warp.
// The mask-less __shfl used below is the legacy intrinsic (unavailable when
// targeting Volta and newer GPUs).
__global__ void test(float *out, int pitch)
{
// Set shared mem to 0
__shared__ float tile[(warpSz + 2*rad) * (tileY + 2*rad)];
// All threads of the block cooperate, striding by the block size
for (int i = threadIdx.y*blockDim.x+threadIdx.x; i<(tileY+2*rad)*(warpSz+2*rad); i+=blockDim.x*blockDim.y) {
tile[i] = 0.0f;
}
// The tile must be fully zeroed before any thread accumulates into it
__syncthreads();
// Each warp (threadIdx.y) handles rows threadIdx.y, threadIdx.y+blockDim.y, ...
for (int row=threadIdx.y; row<tileY; row += blockDim.y) {
// Loop over pixels in neighbourhood
for (int i=0; i<2*rad+1; ++i) {
float res = 0.0f;
// Offset of tile row (row+i)
int rowStartIdx = (row+i)*(warpSz+2*rad);
for (int j=0; j<2*rad+1; ++j) {
res += float(threadIdx.x+row); // Substitute for real calculation
// Option 1: one thread writes to shared mem
// Lane 0 flushes its running value into column j and starts again from zero
if (threadIdx.x == 0) {
tile[rowStartIdx + j] += res;
res = 0.0f;
}
//// Option 2: all threads write to shared mem
//float tmp = 0.0f;
//if (threadIdx.x == 0) {
// tmp = res;
// res = 0.0f;
//}
//tile[rowStartIdx + threadIdx.x+j] += tmp;
// Rotate the running values by one lane: each lane takes the value held by
// lane (threadIdx.x+1) % warpSz
res = __shfl(res, (threadIdx.x+1) % warpSz);
}
// After the last column, every lane adds what it still holds to the columns
// starting at 2*rad
res += float(threadIdx.x+row);
tile[rowStartIdx + threadIdx.x+2*rad] += res;
// Block-wide barrier at the end of every neighbourhood-row iteration
__syncthreads();
}
}
// Add result back to global mem
for (int row=threadIdx.y; row<tileY+2*rad; row+=blockDim.y) {
for (int col=threadIdx.x; col<warpSz+2*rad; col+=warpSz) {
// Tiles of neighbouring blocks overlap by 2*rad in both directions, hence
// the atomic accumulation
int idx = (blockIdx.y*tileY + row)*pitch + blockIdx.x*warpSz + col;
atomicAdd(out+idx, tile[row*(warpSz+2*rad) + col]);
}
}
}
// Allocates an output image large enough for a 512x512 input plus a border of
// `rad` pixels on every side, runs the test kernel once and prints the time
// measured with CUDA events.
int main(void)
{
    int2 dim = make_int2(512, 512);
    // Row pitch: padded width rounded up to a multiple of warpSz.
    int pitchOut = (((dim.x+2*rad)+warpSz-1) / warpSz) * warpSz;
    int sizeOut = pitchOut*(dim.y+2*rad);
    dim3 gridDim((dim.x+warpSz-1)/warpSz, (dim.y+tileY-1)/tileY, 1);

    float *devOut;
    cudaMalloc((void**)&devOut, sizeOut*sizeof(float));
    // The kernel only ever accumulates into the output with atomicAdd, so the
    // buffer has to start from zero (cudaMalloc does not clear memory).
    cudaMemset(devOut, 0, sizeOut*sizeof(float));

    cudaEvent_t start, stop;
    float elapsedTime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaFree(0);   // kept from the original; presumably forces context creation

    cudaEventRecord(start, 0);
    test<<<gridDim, dim3(warpSz, 8)>>>(devOut, pitchOut);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    // A failed launch would otherwise go unnoticed and yield a bogus timing.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cout << "CUDA failure: " << cudaGetErrorString(err) << "\n";
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(devOut);
    cudaDeviceReset();
    std::cout << "Elapsed time: " << elapsedTime << " ms.\n";
    std::cin.ignore();
    return 0;
}
Is this expected behaviour/can anyone explain why this happens?
One thing I have noted is that Option 1 uses only 15 registers, whereas Option 2 uses 37, which seems a big difference to me.
Another is that the if-statement in the innermost loop is converted to explicit bra instructions in the PTX code for Option 1, whereas for Option 2 it is converted to two selp instructions. Could it be that the explicit branching is behind the 2-3 times slow down similar to what's suspected in this question?
There are two reasons why I am reluctant to go for Option 2. First, when profiling the original application it seems to be limited by shared memory bandwidth, which indicates that there is potential to increase the performance by having fewer threads accessing it. Second, unless we use the volatile keyword, writes to shared memory can be optimised to registers. Since we are only interested in the contribution from the last thread to access each memory location (threadIdx.x == 0), and all others add 0, this is not a problem as long as all changes temporarily located in registers are guaranteed to be written back to shared memory in the same order they were issued. Is this the case though? (Thus far, both options have produced the exact same result.)
Any thoughts or ideas are much appreciated!
PS. I compile for compute capability 3.0. (However, the shuffle instruction is not necessary to demonstrate the behaviour and can be commented out.)

What is the general way to launch appropriate amount of reduction kernels?

As I have read in NVIDIA's instructions at this link http://www.cuvilib.com/Reduction.pdf, for arrays bigger than blockSize, I should launch multiple reduction kernels to achieve global synchronization. What is the general way to determine how many times I should launch the reduction kernel? I tried as below, but I need to cudaMalloc 2 additional pointers, which takes a lot of processing time.
My job is to Reduce the array d_logLuminance into one minimum value min_logLum
/**
 * Finds the minimum of the numRows*numCols floats in d_logLuminance by
 * launching reduceMin repeatedly until a single value is left.
 *
 * General launch scheme: a level with n remaining values needs
 * ceil(n / blockSize) blocks and leaves that many partial minima; repeat until
 * one value remains.
 *
 * reduceMin has no bounds check: a launch with g blocks reads g*blockSize
 * inputs. Both working buffers are therefore sized to a multiple of blockSize
 * and every slot is filled with real input values (duplicates do not change a
 * minimum), so each level only reads allocated, meaningful data. The two
 * buffers are swapped between levels instead of being copied.
 *
 * @param d_logLuminance  device pointer to numRows*numCols input values
 * @param min_logLum      receives the minimum (left untouched if the input is empty)
 * @param numRows         number of rows
 * @param numCols         number of columns
 */
void your_histogram_and_prefixsum(const float* const d_logLuminance,
                                  float &min_logLum,
                                  const size_t numRows,
                                  const size_t numCols)
{
    const size_t pixel = numRows*numCols;
    if (pixel == 0) {
        return;   // nothing to reduce
    }

    const dim3 blockSize(512);
    const size_t threads = blockSize.x;

    // Blocks needed for the first level, and the buffer sizes rounded up to
    // whole blocks for the first and second level respectively.
    const size_t firstGrid     = (pixel + threads - 1) / threads;
    const size_t paddedIn      = firstGrid * threads;
    const size_t paddedPartial = ((firstGrid + threads - 1) / threads) * threads;

    float *d_min, *d_tempMin;
    checkCudaErrors(cudaMalloc((void**) &d_min, sizeof(float)*paddedIn));
    checkCudaErrors(cudaMalloc((void**) &d_tempMin, sizeof(float)*paddedPartial));

    // Working copy of the input, followed by padding made of repeated input
    // values. Each copy takes its source from the already filled prefix, so
    // source and destination never overlap.
    checkCudaErrors(cudaMemcpy(d_min, d_logLuminance, sizeof(float)*pixel, cudaMemcpyDeviceToDevice));
    for (size_t filled = pixel; filled < paddedIn; ) {
        const size_t missing = paddedIn - filled;
        const size_t chunk = (filled < missing) ? filled : missing;
        checkCudaErrors(cudaMemcpy(d_min + filled, d_min, sizeof(float)*chunk, cudaMemcpyDeviceToDevice));
        filled += chunk;
    }
    // Seed the second buffer with input values as well, so the slots behind
    // the partial results of a level are harmless when the next level reads them.
    checkCudaErrors(cudaMemcpy(d_tempMin, d_min, sizeof(float)*paddedPartial, cudaMemcpyDeviceToDevice));

    float *src = d_min;
    float *dst = d_tempMin;
    size_t remaining = pixel;
    do {
        const size_t grid = (remaining + threads - 1) / threads;
        reduceMin<<<dim3(static_cast<unsigned int>(grid)), blockSize, threads*sizeof(float)>>>(src, dst);
        checkCudaErrors(cudaGetLastError());
        remaining = grid;            // one partial minimum per block
        float *swapTmp = src;        // this level's output is the next level's input
        src = dst;
        dst = swapTmp;
    } while (remaining > 1);

    // Launches on the same stream run in order, and this blocking copy waits
    // for the last one; `src` now points at the buffer holding the result.
    checkCudaErrors(cudaMemcpy(&min_logLum, src, sizeof(float), cudaMemcpyDeviceToHost));
    std::cout<< "Min value = " << min_logLum << std::endl;
    checkCudaErrors(cudaFree(d_tempMin));
    checkCudaErrors(cudaFree(d_min));
}
And if you are curious, here is my reduction kernel:
// Block-wise minimum: block b writes the minimum of its blockDim.x inputs,
// g_inputRange[b*blockDim.x .. b*blockDim.x + blockDim.x - 1], to
// g_outputRange[b].
//
// Launch requirements visible in the code:
//  - dynamic shared memory of blockDim.x * sizeof(float) bytes;
//  - the input must hold gridDim.x * blockDim.x readable elements, since
//    there is no bounds check on the load;
//  - the halving loop pairs element t with element t + s, which covers every
//    element only when blockDim.x is a power of two.
__global__
void reduceMin(const float* const g_inputRange,
               float* g_outputRange)
{
    extern __shared__ float sdata[];

    const unsigned int local  = threadIdx.x;
    const unsigned int global = blockIdx.x * blockDim.x + local;

    // Stage this block's inputs in shared memory.
    sdata[local] = g_inputRange[global];
    __syncthreads();

    // Tree reduction: the active half shrinks by a factor of two per step.
    unsigned int half = blockDim.x >> 1;
    while (half != 0) {
        if (local < half) {
            const float partner = sdata[local + half];
            sdata[local] = min(sdata[local], partner);
        }
        // Reached by every thread of the block on every step.
        __syncthreads();
        half >>= 1;
    }

    // Thread 0 publishes the block minimum.
    if (local == 0) {
        g_outputRange[blockIdx.x] = sdata[0];
    }
}
There are many ways to skin the cat, but if you want to minimize kernel launches, it can always be done with at most two kernel launches.
The first kernel launch is composed of up to however many blocks correspond to the number of threads per block that your device supports. Newer devices will support 1024, older devices, 512.
Each of these (at most 512 or 1024) blocks in the first kernel will participate in a grid-looping sum of all the data elements in global memory.
Each of these blocks will then do a partial reduction and write a partial result to global memory. There will be 512 or 1024 of these partial results.
The second kernel launch will be composed of 512 or 1024 threads in a single block. Each thread will pick up one of the partial results from global memory, and then the threads in that single block will cooperatively reduce the partial results to a single final result, and write it back to global memory.
The "grid-looping sum" is described in reduction #7 here as "multiple add/thread". All of the reductions described in this document are available in the NVIDIA reduction sample code