Why do these two GPU kernels have a massive performance difference? - cuda

Hi, I am trying to understand some behavior of CUDA kernels. These are the two CUDA kernels I have. I found that gpuReduce takes about twice as long as gpuReduceOpt. Is this caused by divergence?
#include <cuda_runtime.h>
#include <stdio.h>
#include <chrono>
void initData_int(int *p, int size){
for (int t=0; t<size; t++){
p[t] = (int)(rand()&0xff);
}
}
__global__ void gpuReduce(int *in, int *out, int size)
{
int tid = threadIdx.x;
int* data = in + blockIdx.x*blockDim.x;
if (tid >= size)
return;
for (int stride = 1; stride < blockDim.x; stride*=2)
{
if((tid%(2*stride)) == 0){
data[tid] += data[tid+stride];
}
__syncthreads();
}
if (tid == 0){
out[blockIdx.x] = data[0];
}
}
__global__ void gpuReduceOpt(int *in, int *out, int size)
{
int tid = threadIdx.x;
int* data = in + blockIdx.x*blockDim.x;
if (tid >= size)
return;
for (int stride = 1; stride < blockDim.x; stride*=2)
{
int index = 2*stride*tid;
if(index < blockDim.x){
data[index] += data[index+stride];
}
__syncthreads();
}
if (tid == 0){
out[blockIdx.x] = data[0];
}
}
int main(int agrc, char **argv)
{
int size = 1<<24;
int blocksize = 1024;
dim3 block(blocksize, 1);
dim3 grid((size-1)/block.x+1, 1);
int nBytes = sizeof(int)*size;
int *a_h = (int*)malloc(nBytes);
int *tmp = (int*)malloc(sizeof(int)*grid.x);
int *tmp1 = (int*)malloc(sizeof(int)*grid.x);
initData_int(a_h, size);
int *a_d, *output;
cudaMalloc((int**)&a_d, nBytes);
cudaMalloc((int**)&output, grid.x*sizeof(int));
int *a_d1, *output1;
cudaMalloc((int**)&a_d1, nBytes);
cudaMalloc((int**)&output1, grid.x*sizeof(int));
cudaMemcpy(a_d1, a_h, nBytes, cudaMemcpyHostToDevice);
cudaMemcpy(a_d, a_h, nBytes, cudaMemcpyHostToDevice);
auto start2 = std::chrono::system_clock::now();
gpuReduce<<<grid, block>>>(a_d, output, size);
cudaMemcpy(tmp, output, grid.x*sizeof(int), cudaMemcpyDeviceToHost);
int gpu_result;
for (int i =0; i < grid.x; i++)
{
gpu_result += tmp[i];
}
cudaDeviceSynchronize();
auto end2 = std::chrono::system_clock::now();
std::chrono::duration<double>diff2 = end2 - start2;
printf("Gpu reduce take:%2f s\n", diff2.count());
auto start3 = std::chrono::system_clock::now();
gpuReduceOpt<<<grid, block>>>(a_d1, output1, size);
cudaMemcpy(tmp1, output1, grid.x*sizeof(int), cudaMemcpyDeviceToHost);
int gpu_result1;
for (int i =0; i < grid.x; i++)
{
gpu_result1 += tmp1[i];
}
cudaDeviceSynchronize();
auto end3 = std::chrono::system_clock::now();
std::chrono::duration<double>diff3 = end3 - start3;
printf("Gpu reduce opt take:%2f s\n", diff3.count());
printf("Result from gpuReduce and gpuReduceOpt are %6d and %6d\n", gpu_result, gpu_result1);
cudaFree(a_d);
cudaFree(output);
free(a_h);
free(tmp);
cudaDeviceReset();
return 0;
}
This is the perf data I got:
Gpu reduce take:0.004238 s
Gpu reduce opt take:0.002606 s
Result from gpuReduce and gpuReduceOpt are 2139353471 and 2139353471

In the code that you have now posted, there is still a bug in the host code. This construct is not correct:
int gpu_result; // not initialized
for (int i =0; i < grid.x; i++)
{
gpu_result += tmp[i];
}
That is undefined behavior. There is no guarantee that the variable gpu_result above will start out at zero. The same problem exists for gpu_result1.
When we fix that issue, the difference in kernel execution time mostly comes down to the use of the modulo operator in the first kernel, as suggested by talonmies in the first comment. If you profile each kernel, say with nvprof, and ask for metrics like gld_efficiency, gst_efficiency, gld_transactions, and gst_transactions, you will find that they are all basically identical between the two kernels.
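For example, those metrics can be requested on the command line like this (using the executable name from the listing below):
$ nvprof --metrics gld_efficiency,gst_efficiency,gld_transactions,gst_transactions ./t1878a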
However if you replace the modulo operator with equivalent but less costly arithmetic, the kernel durations become almost the same (to within about 10%):
$ cat t1878a.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <chrono>
void initData_int(int *p, int size){
for (int t=0; t<size; t++){
p[t] = (int)(rand()&0xff);
}
}
__global__ void gpuReduce(int *in, int *out, int size)
{
int tid = threadIdx.x;
int* data = in + blockIdx.x*blockDim.x;
if (tid >= size)
return;
for (int stride = 1; stride < blockDim.x; stride*=2)
{
#ifdef USE_FAST
if((tid&(2*stride-1)) == 0){
#else
if((tid%(2*stride)) == 0){
#endif
data[tid] += data[tid+stride];
}
__syncthreads();
}
if (tid == 0){
out[blockIdx.x] = data[0];
}
}
__global__ void gpuReduceOpt(int *in, int *out, int size)
{
int tid = threadIdx.x;
int* data = in + blockIdx.x*blockDim.x;
if (tid >= size)
return;
for (int stride = 1; stride < blockDim.x; stride*=2)
{
int index = 2*stride*tid;
if(index < blockDim.x){
data[index] += data[index+stride];
}
__syncthreads();
}
if (tid == 0){
out[blockIdx.x] = data[0];
}
}
int main(int agrc, char **argv)
{
int size = 1<<24;
int blocksize = 1024;
dim3 block(blocksize, 1);
dim3 grid((size-1)/block.x+1, 1);
int nBytes = sizeof(int)*size;
int *a_h = (int*)malloc(nBytes);
int *tmp = (int*)malloc(sizeof(int)*grid.x);
int *tmp1 = (int*)malloc(sizeof(int)*grid.x);
initData_int(a_h, size);
int *a_d, *output;
cudaMalloc((int**)&a_d, nBytes);
cudaMalloc((int**)&output, grid.x*sizeof(int));
int *a_d1, *output1;
cudaMalloc((int**)&a_d1, nBytes);
cudaMalloc((int**)&output1, grid.x*sizeof(int));
cudaMemcpy(a_d1, a_h, nBytes, cudaMemcpyHostToDevice);
cudaMemcpy(a_d, a_h, nBytes, cudaMemcpyHostToDevice);
auto start2 = std::chrono::system_clock::now();
gpuReduce<<<grid, block>>>(a_d, output, size);
cudaMemcpy(tmp, output, grid.x*sizeof(int), cudaMemcpyDeviceToHost);
int gpu_result = 0;
for (int i =0; i < grid.x; i++)
{
gpu_result += tmp[i];
}
cudaDeviceSynchronize();
auto end2 = std::chrono::system_clock::now();
std::chrono::duration<double>diff2 = end2 - start2;
printf("Gpu reduce take:%2f s\n", diff2.count());
auto start3 = std::chrono::system_clock::now();
gpuReduceOpt<<<grid, block>>>(a_d1, output1, size);
cudaMemcpy(tmp1, output1, grid.x*sizeof(int), cudaMemcpyDeviceToHost);
int gpu_result1 = 0;
for (int i =0; i < grid.x; i++)
{
gpu_result1 += tmp1[i];
}
cudaDeviceSynchronize();
auto end3 = std::chrono::system_clock::now();
std::chrono::duration<double>diff3 = end3 - start3;
printf("Gpu reduce opt take:%2f s\n", diff3.count());
printf("Result from gpuReduce and gpuReduceOpt are %6d and %6d\n", gpu_result, gpu_result1);
cudaFree(a_d);
cudaFree(output);
free(a_h);
free(tmp);
cudaDeviceReset();
return 0;
}
$ nvcc -o t1878a t1878a.cu -arch=sm_70 -lineinfo
$ nvprof ./t1878a
==14339== NVPROF is profiling process 14339, command: ./t1878a
Gpu reduce take:0.001021 s
Gpu reduce opt take:0.000543 s
Result from gpuReduce and gpuReduceOpt are 2139353471 and 2139353471
==14339== Profiling application: ./t1878a
==14339== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 97.40% 43.743ms 2 21.872ms 21.280ms 22.463ms [CUDA memcpy HtoD]
1.72% 770.61us 1 770.61us 770.61us 770.61us gpuReduce(int*, int*, int)
0.86% 384.30us 1 384.30us 384.30us 384.30us gpuReduceOpt(int*, int*, int)
0.03% 12.960us 2 6.4800us 6.4000us 6.5600us [CUDA memcpy DtoH]
API calls: 69.86% 350.40ms 4 87.601ms 8.0580us 349.79ms cudaMalloc
19.33% 96.969ms 1 96.969ms 96.969ms 96.969ms cudaDeviceReset
9.13% 45.770ms 4 11.442ms 451.76us 22.822ms cudaMemcpy
1.00% 5.0119ms 4 1.2530ms 590.62us 3.2115ms cuDeviceTotalMem
0.50% 2.5242ms 404 6.2470us 427ns 270.20us cuDeviceGetAttribute
0.09% 449.28us 2 224.64us 10.437us 438.85us cudaFree
0.06% 279.02us 4 69.755us 59.853us 94.003us cuDeviceGetName
0.02% 101.11us 2 50.555us 23.936us 77.175us cudaLaunchKernel
0.00% 22.146us 4 5.5360us 3.2730us 10.770us cuDeviceGetPCIBusId
0.00% 14.686us 2 7.3430us 4.1300us 10.556us cudaDeviceSynchronize
0.00% 11.444us 8 1.4300us 506ns 4.8200us cuDeviceGet
0.00% 6.2180us 3 2.0720us 610ns 3.9200us cuDeviceGetCount
0.00% 3.5570us 4 889ns 740ns 1.1270us cuDeviceGetUuid
$ nvcc -o t1878a t1878a.cu -arch=sm_70 -lineinfo -DUSE_FAST
$ nvprof ./t1878a
==14375== NVPROF is profiling process 14375, command: ./t1878a
Gpu reduce take:0.000656 s
Gpu reduce opt take:0.000538 s
Result from gpuReduce and gpuReduceOpt are 2139353471 and 2139353471
==14375== Profiling application: ./t1878a
==14375== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 97.92% 38.947ms 2 19.474ms 19.460ms 19.488ms [CUDA memcpy HtoD]
1.08% 427.79us 1 427.79us 427.79us 427.79us gpuReduce(int*, int*, int)
0.97% 385.99us 1 385.99us 385.99us 385.99us gpuReduceOpt(int*, int*, int)
0.03% 13.216us 2 6.6080us 6.4320us 6.7840us [CUDA memcpy DtoH]
API calls: 67.47% 281.96ms 4 70.491ms 5.5820us 281.49ms cudaMalloc
20.44% 85.428ms 1 85.428ms 85.428ms 85.428ms cudaDeviceReset
9.70% 40.518ms 4 10.129ms 457.52us 19.781ms cudaMemcpy
1.20% 5.0260ms 4 1.2565ms 601.24us 3.2163ms cuDeviceTotalMem
0.94% 3.9413ms 404 9.7550us 270ns 1.7028ms cuDeviceGetAttribute
0.10% 435.98us 2 217.99us 9.5230us 426.46us cudaFree
0.10% 410.88us 4 102.72us 58.347us 225.92us cuDeviceGetName
0.02% 94.871us 2 47.435us 20.952us 73.919us cudaLaunchKernel
0.01% 21.734us 4 5.4330us 3.5080us 8.4130us cuDeviceGetPCIBusId
0.00% 14.504us 2 7.2520us 3.8730us 10.631us cudaDeviceSynchronize
0.00% 12.843us 8 1.6050us 460ns 5.3730us cuDeviceGet
0.00% 9.7040us 3 3.2340us 804ns 6.9430us cuDeviceGetCount
0.00% 2.5870us 4 646ns 517ns 957ns cuDeviceGetUuid
$
Notes:
I'm not suggesting the above is a general replacement for modulo. It works in this case because stride is taking on powers of 2 only.
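As a quick host-side sanity check of that identity (a hypothetical snippet, not part of the original post): for a power-of-two divisor d, (x % d) == (x & (d - 1)), and here d = 2*stride only ever takes the values 2, 4, 8, ...
#include <cstdio>
#include <cassert>
int main() {
    // Check tid % (2*stride) == tid & (2*stride - 1) for the strides the reduction loop uses.
    for (int stride = 1; stride < 1024; stride *= 2)
        for (int tid = 0; tid < 1024; tid++)
            assert((tid % (2*stride)) == (tid & (2*stride - 1)));
    printf("identity holds for power-of-two divisors\n");
    return 0;
}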
I doubt this is doing what you think:
if (tid >= size)
return;
but for the problem size here (a whole-number multiple of the block size) it's not particularly relevant. It's also not an appropriate choice if the remaining kernel code uses __syncthreads(), but again that's irrelevant here for this problem size/choice.
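If the early return ever did take effect for only part of a block, the usual pattern is to guard the work instead of returning, so that every thread still reaches __syncthreads(). A rough sketch of the first kernel written that way (my illustration only, assuming the intended check was against the global index rather than threadIdx.x):
__global__ void gpuReduceGuarded(int *in, int *out, int size)
{
    int tid = threadIdx.x;
    int *data = in + blockIdx.x * blockDim.x;
    // Guard the work instead of returning early, so the whole block reaches the barrier.
    // Like the original, this assumes each block's slice of `in` is fully populated.
    bool active = (blockIdx.x * blockDim.x + tid) < size;
    for (int stride = 1; stride < blockDim.x; stride *= 2)
    {
        if (active && (tid & (2*stride - 1)) == 0)
            data[tid] += data[tid + stride];
        __syncthreads();   // executed unconditionally by every thread in the block
    }
    if (active && tid == 0)
        out[blockIdx.x] = data[0];
}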
Your code on the 2080 Ti is running about 5x slower than on my V100, which doesn't sound right to me. I wonder if you are building a debug project. That doesn't change the observation here, but in case you are building a debug project or compiling with the -G switch, I recommend never doing performance analysis on debug code.
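For instance (my addition, assuming a 2080 Ti, i.e. sm_75): device-side debug compilation with -G disables most device code optimization and can easily account for a slowdown of that magnitude, whereas the -lineinfo switch used in the compile commands above keeps optimizations while still providing source correlation for the profiler:
$ nvcc -G -o t1878a t1878a.cu -arch=sm_75          # debug device code: do not benchmark this
$ nvcc -lineinfo -o t1878a t1878a.cu -arch=sm_75   # optimized, still profilable with source info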

Related

Why is the performance of a CUDA kernel with divergence better than without divergence?

Hi, I am writing CUDA kernels to understand the behavior of warp divergence.
These are 3 kernels I have:
#include <cuda_runtime.h>
#include <stdio.h>
#include "util.h"
#include <chrono>
__global__ void wardUp(float *c)
{
float a = 0.0;
float b = 0.0;
int idx = threadIdx.x + blockIdx.x*blockDim.x;
if ((idx/warpSize)%2 == 0){
a = 100.0f;
}
else{
b = 200.0f;
}
c[idx] = a+b;
}
__global__ void kernel1(float *c)
{
float a = 0.0;
float b = 0.0;
int idx = threadIdx.x + blockIdx.x*blockDim.x;
if ((idx/warpSize)%2 == 0){
a = 100.0f;
}
else{
b = 200.0f;
}
c[idx] = a+b;
}
__global__ void kernel2(float *c)
{
float a = 0.0;
float b = 0.0;
int idx = threadIdx.x + blockIdx.x*blockDim.x;
if (idx%2 == 0){
a = 100.0f;
}
else{
b = 200.0f;
}
c[idx] = a+b;
}
int main(int argc, char **argv)
{
initDevice(0);
int size = 64;
int blocksize = 64;
int nBytes = sizeof(float)*size;
float *a_d;
CHECK(cudaMalloc((float**)&a_d, nBytes));
dim3 block(blocksize, 1);
dim3 grid((blocksize-1)/block.x+1, 1);
wardUp<<<grid, block>>>(a_d);
float elapsed = 0;
cudaEvent_t start1, stop1;
CHECK(cudaEventCreate(&start1));
CHECK(cudaEventCreate(&stop1));
CHECK(cudaEventRecord(start1, 0));
kernel1<<<grid, block>>>(a_d);
CHECK(cudaEventRecord(stop1, 0));
CHECK(cudaEventSynchronize(stop1));
CHECK(cudaEventElapsedTime(&elapsed, start1, stop1));
printf("kernel1 take:%2f ms\n", elapsed);
float elapsed_1 = 0;
cudaEvent_t start2, stop2;
CHECK(cudaEventCreate(&start2));
CHECK(cudaEventCreate(&stop2));
CHECK(cudaEventRecord(start2, 0));
kernel2<<<grid, block>>>(a_d);
CHECK(cudaEventRecord(stop2, 0));
CHECK(cudaEventSynchronize(stop2));
CHECK(cudaEventElapsedTime(&elapsed_1, start2, stop2));
printf("kernel2 take:%2f ms\n", elapsed_1);
cudaFree(a_d);
cudaEventDestroy(start1);
cudaEventDestroy(stop1);
cudaEventDestroy(start2);
cudaEventDestroy(stop2);
return 0;
}
If my understanding is correct, kernel1 does not have a divergence issue, since the branch splits at a warp boundary: threads 0-31 of a warp all take the same path.
kernel2 will have a divergence issue, since odd and even threads within the same warp cannot execute their branches at the same time.
But I observed that kernel1 is slower than kernel2. Why would this happen?
Using device: 0: NVIDIA GeForce RTX 2080 Ti
kernel1 take:0.008864 ms
kernel2 take:0.006752 ms
I switched to using cudaEventRecord to record the duration, but kernel1 still seems slower than kernel2.
There are/were a variety of problems with your approach. I may not list them all:
problem size is too small for benchmarking
compiler optimizations are working against you
code is too simple; compiler use of predication is mitigating the effects of warp-divergence
your kernel duration measurement method was flawed
your creation of the grid based on blocksize is not sensible (although it happens to be sensible when size == blocksize). It should be based on problem size, not blocksize.
The following code has these issues addressed, and shows approximately a 2x increase in kernel duration when going from the kernel that branches on a warp boundary to the kernel that branches on every other thread:
$ cat t1877.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <chrono>
#define CHECK(x) x
__global__ void kernel1(int *c, int y, int z, int l1, int l2)
{
int a = 0;
int b = 0;
int idx = threadIdx.x + blockIdx.x*blockDim.x;
if (idx&32){
for (int i = 0; i < l1; i++){
a = a&y; a = a|z;}
}
else{
for (int i = 0; i < l2; i++){
b = b|y; b = b&z;}
}
c[idx] = a+b;
}
__global__ void kernel2(int *c, int y, int z, int l1, int l2)
{
int a = 0;
int b = 0;
int idx = threadIdx.x + blockIdx.x*blockDim.x;
if (idx&1){
for (int i = 0; i < l1; i++){
a = a&y; a = a|z;}
}
else{
for (int i = 0; i < l2; i++){
b = b|y; b = b&z;}
}
c[idx] = a+b;
}
int main(int argc, char **argv)
{
int blocksize = 64;
int size = blocksize*1048576;
int nBytes = sizeof(int)*size;
int *a_d;
CHECK(cudaMalloc((int**)&a_d, nBytes));
dim3 block(blocksize, 1);
dim3 grid(size/block.x, 1);
kernel1<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
cudaDeviceSynchronize();
auto start1 = std::chrono::system_clock::now();
kernel1<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
cudaDeviceSynchronize();
auto end1 = std::chrono::system_clock::now();
std::chrono::duration<double>diff1 = end1 - start1;
printf("kernel1 take:%2f s\n", diff1.count());
kernel2<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
cudaDeviceSynchronize();
auto start2 = std::chrono::system_clock::now();
kernel2<<<grid, block>>>(a_d, 0, 0, 10000, 10000);
cudaDeviceSynchronize();
auto end2 = std::chrono::system_clock::now();
std::chrono::duration<double>diff2 = end2 - start2;
printf("kernel2 take:%2f s\n", diff2.count());
return 0;
}
$ nvcc -o t1877 t1877.cu -arch=sm_70
$ ./t1877
kernel1 take:0.205650 s
kernel2 take:0.406347 s
$

cudaMallocManaged and cudaDeviceSynchronize()

I have the following two mostly identical example codes. code1.cu uses cudaMalloc and cudaMemcpy to handle device/host data exchange.
code2.cu uses cudaMallocManaged, so cudaMemcpy is not needed. When cudaMallocManaged is used, I have to include cudaDeviceSynchronize() to get the correct results, while for the version with cudaMalloc this is not needed. I would appreciate some hints on why this is happening.
code2.cu
#include <iostream>
#include <math.h>
#include <vector>
//
using namespace std;
// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
if(i < max_x && j<max_y) {
atomicAdd(tot, x[i] + y[j]);
}
}
int main(void)
{
int Nx = 1<<15;
int Ny = 1<<15;
float *d_x = NULL, *d_y = NULL;
float *d_tot = NULL;
cudaMalloc((void **)&d_x, sizeof(float)*Nx);
cudaMalloc((void **)&d_y, sizeof(float)*Ny);
cudaMallocManaged((void **)&d_tot, sizeof(float));
// Allocate Unified Memory – accessible from CPU or GPU
vector<float> vx;
vector<float> vy;
// initialize x and y arrays on the host
for (int i = 0; i < Nx; i++)
vx.push_back(i);
for (int i = 0; i < Ny; i++)
vy.push_back(i*10);
//
float tot = 0;
for(int i = 0; i<vx.size(); i++)
for(int j = 0; j<vy.size(); j++)
tot += vx[i] + vy[j];
cout<<"CPU: tot: "<<tot<<endl;
//
cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);
//
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve the maximum occupancy for a full device launch
cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, Nx+Ny);
//.. bx*by can not go beyond the blockSize, or hardware limit, which is 1024;
//.. bx*bx = blockSize && bx/by=Nx/Ny, solve the equation
int bx = sqrt(blockSize*Nx/(float)Ny);
int by = bx*Ny/(float)Nx;
dim3 blockSize_3D(bx, by);
dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);
cout<<"blockSize: "<<blockSize<<endl;
cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;
// calculate theoretical occupancy
int maxActiveBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);
int device;
cudaDeviceProp props;
cudaGetDevice(&device);
cudaGetDeviceProperties(&props, device);
float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
(float)(props.maxThreadsPerMultiProcessor /
props.warpSize);
printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
blockSize, occupancy);
// Run kernel on 1M elements on the GPU
tot = 0;
add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);
// Wait for GPU to finish before accessing on host
//cudaDeviceSynchronize();
tot =*d_tot;
//
//
cout<<" GPU: tot: "<<tot<<endl;
// Free memory
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_tot);
return 0;
}
code1.cu
#include <iostream>
#include <math.h>
#include <vector>
//
using namespace std;
// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
if(i < max_x && j<max_y) {
atomicAdd(tot, x[i] + y[j]);
}
}
int main(void)
{
int Nx = 1<<15;
int Ny = 1<<15;
float *d_x = NULL, *d_y = NULL;
float *d_tot = NULL;
cudaMalloc((void **)&d_x, sizeof(float)*Nx);
cudaMalloc((void **)&d_y, sizeof(float)*Ny);
cudaMalloc((void **)&d_tot, sizeof(float));
// Allocate Unified Memory – accessible from CPU or GPU
vector<float> vx;
vector<float> vy;
// initialize x and y arrays on the host
for (int i = 0; i < Nx; i++)
vx.push_back(i);
for (int i = 0; i < Ny; i++)
vy.push_back(i*10);
//
float tot = 0;
for(int i = 0; i<vx.size(); i++)
for(int j = 0; j<vy.size(); j++)
tot += vx[i] + vy[j];
cout<<"CPU: tot: "<<tot<<endl;
//
cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);
//
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve the maximum occupancy for a full device launch
cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, Nx+Ny);
//.. bx*by can not go beyond the blockSize, or hardware limit, which is 1024;
//.. bx*bx = blockSize && bx/by=Nx/Ny, solve the equation
int bx = sqrt(blockSize*Nx/(float)Ny);
int by = bx*Ny/(float)Nx;
dim3 blockSize_3D(bx, by);
dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);
cout<<"blockSize: "<<blockSize<<endl;
cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;
// calculate theoretical occupancy
int maxActiveBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);
int device;
cudaDeviceProp props;
cudaGetDevice(&device);
cudaGetDeviceProperties(&props, device);
float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
(float)(props.maxThreadsPerMultiProcessor /
props.warpSize);
printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
blockSize, occupancy);
// Run kernel on 1M elements on the GPU
tot = 0;
add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);
// Wait for GPU to finish before accessing on host
//cudaDeviceSynchronize();
//
cudaMemcpy(&tot, d_tot, sizeof(float), cudaMemcpyDeviceToHost);
//
cout<<" GPU: tot: "<<tot<<endl;
// Free memory
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_tot);
return 0;
}
//Code2.cu has the following output:
//
//CPU: tot: 8.79609e+12
//blockSize: 1024
//bx: 32 by: 32 gx: 1024 gy: 1025
//Launched blocks of size 1024. Theoretical occupancy: 1.000000
//GPU: tot: 0
After removing the comment on cudaDeviceSynchronize(), the output becomes:
GPU: tot: 8.79609e+12
CUDA kernel launches are asynchronous. That means that they execute independently of the CPU thread that launched them.
Because of this asynchronous launch, the CUDA kernel is not guaranteed to be finished (or even started) by the time your CPU thread code begins testing the result.
Therefore it is necessary to wait until the GPU kernel is complete, and cudaDeviceSynchronize() does exactly that. cudaMemcpy also has a synchronizing effect, so when you remove the cudaMemcpy operations, you lose that synchronization, but cudaDeviceSynchronize() restores it.
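As a minimal sketch of the fix in code2.cu (only the tail of main() is shown; everything else stays the same), the managed result is read only after the synchronization:
add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);
// Wait for the GPU kernel to finish before the host reads the managed allocation
cudaDeviceSynchronize();
tot = *d_tot;   // now guaranteed to observe the kernel's atomicAdd results
cout<<" GPU: tot: "<<tot<<endl;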

CUDA: Understanding the behavior of variables in the registers file in a loop with a dot product example

I am very new to CUDA programming. Currently I have difficulty understanding the behavior of the following program, which calculates the dot product of two vectors.
The dot product kernel, dotProd, calculates the product of each pair of elements and reduces the results to a shorter vector of length blockDim.x*gridDim.x. Then the results in the vector *out are copied back to the host for further reduction.
The second version, dotProdWithSharedMem, is copied from the CUDA By Example book, see here.
My questions are:
When the kernel is launched with enough threads (nThreadsPerBlock*nblocks >= vector_length), the result of dotProd matches the one calculated by the CPU, but the result of dotProdWithSharedMem differs from both. What could be the possible causes? A possible output of $ dot_prod.o 17 512:
Number of threads per block : 256
Number of blocks in the grid: 512
Total number of threads : 131072
Length of vectors : 131072
GPU using registers: 9.6904191971, time consummed: 0.56154 ms
GPU using shared : 9.6906833649, time consummed: 0.04473 ms
CPU result : 9.6904191971, time consummed: 0.28504 ms
When the kernel is launched with not enough threads (nThreadsPerBlock*nblocks < vector_length), the GPU results seem to be less accurate. However, the while loop is supposed to handle this case. I guess there might be something happening to the register variable temp in the loop; otherwise the result should remain the same as in question 1. A possible output of $ dot_prod.o 17 256:
Number of threads per block : 256
Number of blocks in the grid: 256
Total number of threads : 65536
Length of vectors : 131072
GPU using registers: 9.6906890869, time consummed: 0.31478 ms
GPU using shared : 9.6906604767, time consummed: 0.03530 ms
CPU result : 9.6904191971, time consummed: 0.28404 ms
I don't quite understand the size of the cache array in dotProdWithSharedMem. Why is it nThreadsPerBlock elements rather than the total number of threads, nThreadsPerBlock * nblocks? I think that should be the right number of temp values; is this correct?
The code:
#include <iostream>
#include <string>
#include <cmath>
#include <chrono>
#include <cuda.h>
#define PI (float) 3.141592653589793
const size_t nThreadsPerBlock = 256;
static void HandleError(cudaError_t err, const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
__global__ void dotProd(int length, float *u, float *v, float *out) {
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned tid_const = threadIdx.x + blockDim.x * blockIdx.x;
float temp = 0;
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
out[tid_const] = temp;
}
__global__ void dotProdWithSharedMem(int length, float *u, float *v, float *out) {
__shared__ float cache[nThreadsPerBlock];
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned cid = threadIdx.x;
float temp = 0;
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
cache[cid] = temp;
__syncthreads();
int i = blockDim.x/2;
while (i != 0) {
if (cid < i) {
cache[cid] += cache[cid + i];
}
__syncthreads();
i /= 2;
}
if (cid == 0) {
out[blockIdx.x] = cache[0];
}
}
int main(int argc, char* argv[]) {
size_t vec_len = 1 << std::stoi(argv[1]);
size_t size = vec_len * sizeof(float);
size_t nblocks = std::stoi(argv[2]);
size_t size_out = nThreadsPerBlock*nblocks*sizeof(float);
size_t size_out_2 = nblocks*sizeof(float);
float *u = (float *)malloc(size);
float *v = (float *)malloc(size);
float *out = (float *)malloc(size_out);
float *out_2 = (float *)malloc(size_out_2);
float *dev_u, *dev_v, *dev_out, *dev_out_2; // Device arrays
float res_gpu = 0;
float res_gpu_2 = 0;
float res_cpu = 0;
dim3 dimGrid(nblocks, 1, 1);
dim3 dimBlocks(nThreadsPerBlock, 1, 1);
// Initiate values
for(size_t i=0; i<vec_len; ++i) {
u[i] = std::sin(i*PI*1E-2);
v[i] = std::cos(i*PI*1E-2);
}
HANDLE_ERROR( cudaMalloc((void**)&dev_u, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_v, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out, size_out) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out_2, size_out_2) );
HANDLE_ERROR( cudaMemcpy(dev_u, u, size, cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy(dev_v, v, size, cudaMemcpyHostToDevice) );
auto t1_gpu = std::chrono::system_clock::now();
dotProd <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out, dev_out, size_out, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nThreadsPerBlock*nblocks; ++i) {
res_gpu += out[i];
}
auto t2_gpu = std::chrono::system_clock::now();
// GPU version with shared memory
dotProdWithSharedMem <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out_2);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out_2, dev_out_2, size_out_2, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nblocks; ++i) {
res_gpu_2 += out_2[i];
}
auto t3_gpu = std::chrono::system_clock::now();
// CPU version for result-check
for(size_t i=0; i<vec_len; ++i) {
res_cpu += u[i] * v[i];
}
auto t2_cpu = std::chrono::system_clock::now();
double t_gpu = std::chrono::duration <double, std::milli> (t2_gpu - t1_gpu).count();
double t_gpu_2 = std::chrono::duration <double, std::milli> (t3_gpu - t2_gpu).count();
double t_cpu = std::chrono::duration <double, std::milli> (t2_cpu - t3_gpu).count();
printf("Number of threads per block : %i \n", nThreadsPerBlock);
printf("Number of blocks in the grid: %i \n", nblocks);
printf("Total number of threads : %i \n", nThreadsPerBlock*nblocks);
printf("Length of vectors : %i \n\n", vec_len);
printf("GPU using registers: %.10f, time consummed: %.5f ms\n", res_gpu, t_gpu);
printf("GPU using shared : %.10f, time consummed: %.5f ms\n", res_gpu_2, t_gpu_2);
printf("CPU result : %.10f, time consummed: %.5f ms\n", res_cpu, t_cpu);
cudaFree(dev_u);
cudaFree(dev_v);
cudaFree(dev_out);
cudaFree(dev_out_2);
free(u);
free(v);
free(out);
free(out_2);
return 0;
}
Thank you for your patience in reading this LONG post! Any help will be deeply appreciated!
Niko
You're exploring the limits of float precision combined with the variation associated with floating point order of operations. The actual "accuracy" here will depend on the exact data and exact order of operations. The different algorithms will have different order of operations, and therefore different results.
You may want to read this paper.
One of the assumptions you seem to be making is that the CPU result is the accurate one without any justification for that assumption.
If we define "accuracy" as the difference (i.e. "closeness") between the result and the numerically correct result, I suspect that the shared memory result is the more accurate one.
If we convert your code to use double type instead of float type, we observe that:
The result of all 3 approaches are much closer (identical in the printout).
The double results don't match any of the float case.
The shared memory result from the float case is actually the result that is closest to the double case results.
Here's a test case demonstrating this:
$ cat t397.cu
#include <iostream>
#include <string>
#include <cmath>
#include <chrono>
#include <cuda.h>
#ifndef USE_DOUBLE
typedef float ft;
#else
typedef double ft;
#endif
#define PI (ft) 3.141592653589793
const size_t nThreadsPerBlock = 256;
static void HandleError(cudaError_t err, const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
__global__ void dotProd(int length, ft *u, ft *v, ft *out) {
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned tid_const = threadIdx.x + blockDim.x * blockIdx.x;
ft temp = 0;
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
out[tid_const] = temp;
}
__global__ void dotProdWithSharedMem(int length, ft *u, ft *v, ft *out) {
__shared__ ft cache[nThreadsPerBlock];
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned cid = threadIdx.x;
ft temp = 0;
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
cache[cid] = temp;
__syncthreads();
int i = blockDim.x/2;
while (i != 0) {
if (cid < i) {
cache[cid] += cache[cid + i];
}
__syncthreads();
i /= 2;
}
if (cid == 0) {
out[blockIdx.x] = cache[0];
}
}
int main(int argc, char* argv[]) {
size_t vec_len = 1 << std::stoi(argv[1]);
size_t size = vec_len * sizeof(ft);
size_t nblocks = std::stoi(argv[2]);
size_t size_out = nThreadsPerBlock*nblocks*sizeof(ft);
size_t size_out_2 = nblocks*sizeof(ft);
ft *u = (ft *)malloc(size);
ft *v = (ft *)malloc(size);
ft *out = (ft *)malloc(size_out);
ft *out_2 = (ft *)malloc(size_out_2);
ft *dev_u, *dev_v, *dev_out, *dev_out_2; // Device arrays
ft res_gpu = 0;
ft res_gpu_2 = 0;
ft res_cpu = 0;
dim3 dimGrid(nblocks, 1, 1);
dim3 dimBlocks(nThreadsPerBlock, 1, 1);
// Initiate values
for(size_t i=0; i<vec_len; ++i) {
u[i] = std::sin(i*PI*1E-2);
v[i] = std::cos(i*PI*1E-2);
}
HANDLE_ERROR( cudaMalloc((void**)&dev_u, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_v, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out, size_out) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out_2, size_out_2) );
HANDLE_ERROR( cudaMemcpy(dev_u, u, size, cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy(dev_v, v, size, cudaMemcpyHostToDevice) );
auto t1_gpu = std::chrono::system_clock::now();
dotProd <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out, dev_out, size_out, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nThreadsPerBlock*nblocks; ++i) {
res_gpu += out[i];
}
auto t2_gpu = std::chrono::system_clock::now();
// GPU version with shared memory
dotProdWithSharedMem <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out_2);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out_2, dev_out_2, size_out_2, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nblocks; ++i) {
res_gpu_2 += out_2[i];
}
auto t3_gpu = std::chrono::system_clock::now();
// CPU version for result-check
for(size_t i=0; i<vec_len; ++i) {
res_cpu += u[i] * v[i];
}
auto t2_cpu = std::chrono::system_clock::now();
double t_gpu = std::chrono::duration <double, std::milli> (t2_gpu - t1_gpu).count();
double t_gpu_2 = std::chrono::duration <double, std::milli> (t3_gpu - t2_gpu).count();
double t_cpu = std::chrono::duration <double, std::milli> (t2_cpu - t3_gpu).count();
printf("Number of threads per block : %i \n", nThreadsPerBlock);
printf("Number of blocks in the grid: %i \n", nblocks);
printf("Total number of threads : %i \n", nThreadsPerBlock*nblocks);
printf("Length of vectors : %i \n\n", vec_len);
printf("GPU using registers: %.10f, time consummed: %.5f ms\n", res_gpu, t_gpu);
printf("GPU using shared : %.10f, time consummed: %.5f ms\n", res_gpu_2, t_gpu_2);
printf("CPU result : %.10f, time consummed: %.5f ms\n", res_cpu, t_cpu);
cudaFree(dev_u);
cudaFree(dev_v);
cudaFree(dev_out);
cudaFree(dev_out_2);
free(u);
free(v);
free(out);
free(out_2);
return 0;
}
$ nvcc -std=c++11 t397.cu -o t397
$ ./t397 17 512
Number of threads per block : 256
Number of blocks in the grid: 512
Total number of threads : 131072
Length of vectors : 131072
GPU using registers: 9.6904191971, time consummed: 0.89290 ms
GPU using shared : 9.6906833649, time consummed: 0.04289 ms
CPU result : 9.6904191971, time consummed: 0.41527 ms
$ nvcc -std=c++11 t397.cu -o t397 -DUSE_DOUBLE
$ ./t397 17 512
Number of threads per block : 256
Number of blocks in the grid: 512
Total number of threads : 131072
Length of vectors : 131072
GPU using registers: 9.6913433287, time consummed: 1.33016 ms
GPU using shared : 9.6913433287, time consummed: 0.05032 ms
CPU result : 9.6913433287, time consummed: 0.41275 ms
$

How to keep track of executed CUDA blocks?

Just for the sake of testing my understanding of things, I decided to modify the vector addition found in the CUDA samples so that the kernel quits after a specific time and is then re-launched to complete. The way I achieve the "timeout" is by having a pinned variable that the host sets to 1 after some time. Within the kernel, this variable is checked to determine whether execution should continue. If the thread continues its execution, it is marked as complete. In order to test that each thread executes just once, I've modified the addition to C[i] = C[i] + B[i]. This all works as expected; the device code looks as follows:
/* Function
* Internal device function used for getting the current thread's global ID
* regardless of the block/grid configuration. It assumes that the
* grid and block are 3 dimensional.
*
* #return: The thread's global ID
*/
static __device__ int get_global_idx()
{
int blockId = blockIdx.x
+ blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x)
+ threadIdx.x;
return threadId;
}
/* Function
* Device function that determines if the current thread should continue execution.
* A check should be used on the return value. If the timeout has not been set
* and the thread has not previously executed the index at the thread's ID in the
* thread_ids array is set to 1 to indicate it was allowed to proceed.
*
* #param thread_ids: A pointer to the array with a size that matches the max number
* of threads that will be spawned
*
* #param time_out: Memory mapped variable used by the host to signal the kernel when
* execution should suspend
*
* #return: A boolean value indicating whether the current thread should continue or not
*/
__device__ bool continue(unsigned int *thread_ids, volatile unsigned int *time_out)
{
if(*time_out == 1){
return false;
}
int tid = get_global_idx();
if(thread_ids[tid] == 1)
{
return false;
}
thread_ids[tid] = 1;
return true;
}
__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, unsigned int *thread_ids, volatile unsigned int *timeout)
{
if(!continue(thread_ids, timeout))
{
return;
}
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
/* C[i] = A[i] + B[i]; */
C[i] = C[i] + B[i]; //Modifed from above
}
}
I considered how this may fail if __syncthreads() was used, so I decided to do block-level suspension. Based on my understanding, I thought this would be simple: keep track of whether a block has started, count how many threads have executed for that block, only suspend when all threads of an already-started block have completed, and deny any threads whose block has not started. So I used a struct and modified the continue function as follows:
typedef struct block_info_t{
int started; /* Initialized to zero before any kernel launch */
unsigned int thread_count;
}block_info;
__device__ bool continue(unsigned int *thread_ids, volatile unsigned int *time_out, block_info *b_info)
{
int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;
if(*time_out == 1 && b_info[bid].started == 0)
{
return false;
}
if(b_info[bid].thread_count == bsize)
{
return false;
}
b_info[bid].started = 1;
atomicInc(&b_info[bid].thread_count, bsize);
return true;
}
This does not work: when I perform verification on the host (h_B[i] - h_C[i]), I don't get a consistent zero result, which means that some threads somehow managed to execute multiple times. Any ideas how/why this is happening with the latter attempt? Thanks.
I don't care about performance at this point; I'm just trying to understand what is really happening.
EDIT
Here is the complete code, compile with nvcc file_name.cu and execute program_name <vector-length>.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef struct block_info_t{
int started; /* Initialized to zero before any kernel launch */
unsigned int thread_count;
}block_info;
__device__ bool continue_execution(volatile unsigned int *time_out, block_info *b_info)
{
int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;
if(*time_out == 1 && b_info[bid].started == 0)
{
return false;
}
if(b_info[bid].thread_count == bsize)
{
return false;
}
b_info[bid].started = 1;
atomicInc(&b_info[bid].thread_count, bsize);
return true;
}
__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, volatile unsigned int *time_out, block_info *b_info)
{
if(!continue_execution(time_out, b_info))
{
return;
}
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
//C[i] = A[i] + B[i];
C[i] = C[i] + B[i]; //Modified from above
}
}
void computation_complete(int *complete, int block_amt, block_info *h_block_info)
{
size_t i;
for(i = 0; i < block_amt; i++)
{
if(h_block_info[i].started == 1)
{
continue;
}
break;
}
*complete = (i == block_amt) ? 1 : 0;
}
int main(int argc, char *argv[])
{
if(argc != 2)
{
fprintf(stderr, "usage: <program-name> <vector-length>\n");
exit(EXIT_FAILURE);
}
// Print the vector length to be used, and compute its size
long numElements = strtol(argv[1], NULL, 10);
size_t size = numElements * sizeof(float);
printf("[Vector addition of %d elements]\n", numElements);
float *h_A = (float *)malloc(size);
float *h_B = (float *)malloc(size);
float *h_C = (float *)malloc(size);
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i)
{
h_A[i] = rand()/(float)RAND_MAX;
h_B[i] = rand()/(float)RAND_MAX;
h_C[i] = 0.0;
}
float *d_A = NULL;
cudaMalloc((void **)&d_A, size);
float *d_B = NULL;
cudaMalloc((void **)&d_B, size);
float *d_C = NULL;
cudaMalloc((void **)&d_C, size);
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice);
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
size_t block_info_bytes = blocksPerGrid * sizeof(struct block_info_t);
block_info *h_block_info = (struct block_info_t *)malloc(block_info_bytes);
for(int i = 0; i < blocksPerGrid; i++)
{
h_block_info[i].started = 0;
h_block_info[i].thread_count = 0;
}
block_info *d_block_info = NULL;
cudaMalloc(&d_block_info, block_info_bytes);
cudaMemcpy(d_block_info, h_block_info, block_info_bytes, cudaMemcpyHostToDevice);
volatile unsigned int *timeout = NULL;
cudaHostAlloc((void **)&timeout, sizeof(volatile unsigned int), cudaHostAllocMapped);
*timeout = 0;
double quantum = 0.0001 * 1000000.0;
double initial_quantum = quantum;
int complete = 0;
/* Here the kernel launch is looped until all blocks are complete */
while(complete == 0)
{
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements, timeout, d_block_info);
usleep(quantum);
*timeout = 1;
cudaDeviceSynchronize();
cudaMemcpy(h_block_info, d_block_info, block_info_bytes, cudaMemcpyDeviceToHost);
computation_complete(&complete, blocksPerGrid, h_block_info);
if(complete == 0)
{
quantum = quantum + initial_quantum;
*timeout = 0;
}
}
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i)
{
if (fabs(h_B[i] - h_C[i]) > 1e-5)
{
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
printf("Test PASSED\n");
// Free device global memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
cudaDeviceReset();
return 0;
}
You have a race condition in your continue_execution routine. Consider the following scenario:
warp0 of a threadblock enters the continue_execution routine. At the moment that it checks the variables *time_out and b_info[bid].started it witnesses those to be 0 and 0 respectively. So it proceeds to the next if test.
warp1 of the same threadblock enters the continue_execution routine (let's say slightly later), and it witnesses the variables to be 1 and 0 respectively. So it returns false and causes the warp1 threads to exit.
warp0 continues on and eventually sets b_info[bid].started to 1, and then updates the thread_count. It then returns true and proceeds with the vector add.
I could continue with this, but I think if you consider the above 3 items carefully you will realize it is a case you did not account for. Your implicit expectation is that every thread would read a coherent (i.e. the same across a given threadblock) value for *time_out. But this is not guaranteed by your code, and if it fails to do so, then we end up with some threadblocks where some threads have completed their work and some have not.
So how could we fix this? The above description should point the way. One possible approach is to guarantee that, for any given threadblock, every thread gets the same value for *time_out, whether it be 1 or 0. One possible solution would be to make the following changes to the beginning of your vectorAdd kernel:
__shared__ volatile unsigned int my_time_out;
if (!threadIdx.x) my_time_out = *time_out;
__syncthreads();
if(!continue_execution(&my_time_out, b_info))
With those changes, we ensure that every thread in a block gets a coherent view of the timeout variable, and according to my testing, the problem is resolved:
$ cat t100.cu
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef struct block_info_t{
int started; /* Initialized to zero before any kernel launch */
unsigned int thread_count;
}block_info;
__device__ bool continue_execution(volatile unsigned int *time_out, block_info *b_info)
{
int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;
if(*time_out == 1 && b_info[bid].started == 0)
{
return false;
}
if(b_info[bid].thread_count == bsize)
{
return false;
}
b_info[bid].started = 1;
atomicInc(&b_info[bid].thread_count, bsize);
return true;
}
__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, volatile unsigned int *time_out, block_info *b_info)
{
#ifdef USE_FIX
__shared__ volatile unsigned int my_time_out;
if (!threadIdx.x) my_time_out = *time_out;
__syncthreads();
if(!continue_execution(&my_time_out, b_info))
#else
if(!continue_execution(time_out, b_info))
#endif
{
return;
}
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
//C[i] = A[i] + B[i];
C[i] = C[i] + B[i]; //Modified from above
}
}
void computation_complete(int *complete, int block_amt, block_info *h_block_info)
{
size_t i;
for(i = 0; i < block_amt; i++)
{
if(h_block_info[i].started == 1)
{
continue;
}
break;
}
*complete = (i == block_amt) ? 1 : 0;
}
int main(int argc, char *argv[])
{
if(argc != 2)
{
fprintf(stderr, "usage: <program-name> <vector-length>\n");
exit(EXIT_FAILURE);
}
// Print the vector length to be used, and compute its size
long numElements = strtol(argv[1], NULL, 10);
size_t size = numElements * sizeof(float);
printf("[Vector addition of %ld elements]\n", numElements);
float *h_A = (float *)malloc(size);
float *h_B = (float *)malloc(size);
float *h_C = (float *)malloc(size);
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i)
{
h_A[i] = rand()/(float)RAND_MAX;
h_B[i] = rand()/(float)RAND_MAX;
h_C[i] = 0.0;
}
float *d_A = NULL;
cudaMalloc((void **)&d_A, size);
float *d_B = NULL;
cudaMalloc((void **)&d_B, size);
float *d_C = NULL;
cudaMalloc((void **)&d_C, size);
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice);
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
size_t block_info_bytes = blocksPerGrid * sizeof(struct block_info_t);
block_info *h_block_info = (struct block_info_t *)malloc(block_info_bytes);
for(int i = 0; i < blocksPerGrid; i++)
{
h_block_info[i].started = 0;
h_block_info[i].thread_count = 0;
}
block_info *d_block_info = NULL;
cudaMalloc(&d_block_info, block_info_bytes);
cudaMemcpy(d_block_info, h_block_info, block_info_bytes, cudaMemcpyHostToDevice);
volatile unsigned int *timeout = NULL;
cudaHostAlloc((void **)&timeout, sizeof(volatile unsigned int), cudaHostAllocMapped);
*timeout = 0;
double quantum = 0.0001 * 1000000.0;
double initial_quantum = quantum;
int complete = 0;
/* Here the kernel launch is looped until all blocks are complete */
while(complete == 0)
{
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements, timeout, d_block_info);
usleep(quantum);
*timeout = 1;
cudaDeviceSynchronize();
cudaMemcpy(h_block_info, d_block_info, block_info_bytes, cudaMemcpyDeviceToHost);
computation_complete(&complete, blocksPerGrid, h_block_info);
if(complete == 0)
{
quantum = quantum + initial_quantum;
*timeout = 0;
}
}
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i)
{
if (fabs(h_B[i] - h_C[i]) > 1e-5)
{
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
printf("Test PASSED\n");
// Free device global memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
cudaDeviceReset();
return 0;
}
$ nvcc -arch=sm_61 -o t100 t100.cu
$ ./t100 327678
[Vector addition of 327678 elements]
Result verification failed at element 0!
$ nvcc -arch=sm_61 -o t100 t100.cu -DUSE_FIX
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$
One other change I made to your code was in this line:
printf("[Vector addition of %d elements]\n", numElements);
This has no bearing on the problem, but your format specifier does not match your variable type. Fix it by changing %d to %ld.

The kernel delay increases when increasing blocksPerGrid and threadsPerBlock in the CUDA VecAdd example; what does it mean?

When I tested the following example, I found that increasing blocksPerGrid and threadsPerBlock increases the kernel delay,
such that if
int threadsPerBlock = 1;
int blocksPerGrid = 1;
i.e. blocksPerGrid and threadsPerBlock both equal 1, the delay of the kernel = 0.0072 ms,
but when I make it the following, the delay becomes higher = 0.049 ms:
int threadsPerBlock = 1024;
int blocksPerGrid = (N+threadsPerBlock-1) / threadsPerBlock;
where
N = 50000; //the no. of array elements
The complete VecAdd example follows; you can test it.
// Includes
#include <stdio.h>
#include <cutil_inline.h>
#include <shrQATest.h>
// Variables
float* h_A;
float* h_B;
float* h_C;
float* d_A;
float* d_B;
float* d_C;
bool noprompt = false;
// Functions
void CleanupResources(void);
void RandomInit(float*, int);
void ParseArguments(int, char**);
// Device code
__global__ void VecAdd(const float* A, const float* B, float* C, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N)
C[i] = A[i] + B[i];
}
// Host code
int main(int argc, char** argv)
{
shrQAStart(argc, argv);
cudaEvent_t event1, event2;
cudaEventCreate(&event1);
cudaEventCreate(&event2);
printf("Vector Addition\n");
int N = 50000;
size_t size = N * sizeof(float);
ParseArguments(argc, argv);
// Allocate input vectors h_A and h_B in host memory
h_A = (float*)malloc(size);
if (h_A == 0) CleanupResources();
h_B = (float*)malloc(size);
if (h_B == 0) CleanupResources();
h_C = (float*)malloc(size);
if (h_C == 0) CleanupResources();
// Initialize input vectors
RandomInit(h_A, N);
RandomInit(h_B, N);
// Allocate vectors in device memory
cutilSafeCall( cudaMalloc((void**)&d_A, size) );
cutilSafeCall( cudaMalloc((void**)&d_B, size) );
cutilSafeCall( cudaMalloc((void**)&d_C, size) );
// Copy vectors from host memory to device memory
cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice) );
// Invoke kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (N+threadsPerBlock-1) / threadsPerBlock;
cudaEventRecord(event1, 0);
VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
cudaEventRecord(event2, 0);
cudaEventSynchronize(event1); //optional
cudaEventSynchronize(event2);
float dt_ms;
cudaEventElapsedTime(&dt_ms, event1, event2);
printf("delay_time = %f\n", dt_ms);
cutilCheckMsg("kernel launch failure");
#ifdef _DEBUG
cutilSafeCall( cutilDeviceSynchronize() );
#endif
// Copy result from device memory to host memory
// h_C contains the result in host memory
cutilSafeCall( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) );
// Verify result
int i;
for (i = 0; i < N; ++i) {
float sum = h_A[i] + h_B[i];
if (fabs(h_C[i] - sum) > 1e-5)
break;
}
CleanupResources();
shrQAFinishExit(argc, (const char **)argv, (i==N) ? QA_PASSED : QA_FAILED);
}
void CleanupResources(void)
{
// Free device memory
if (d_A)
cudaFree(d_A);
if (d_B)
cudaFree(d_B);
if (d_C)
cudaFree(d_C);
// Free host memory
if (h_A)
free(h_A);
if (h_B)
free(h_B);
if (h_C)
free(h_C);
cutilDeviceReset();
}
// Allocates an array with random float entries.
void RandomInit(float* data, int n)
{
for (int i = 0; i < n; ++i)
data[i] = rand() / (float)RAND_MAX;
}
// Parse program arguments
void ParseArguments(int argc, char** argv)
{
for (int i = 0; i < argc; ++i) {
if (strcmp(argv[i], "--noprompt") == 0 ||
strcmp(argv[i], "-noprompt") == 0)
{
noprompt = true;
break;
}
}
}
Can anyone explain to me what this means?
In case 1, a kernel of 1 thread is launched, and it performs 2 read and 1 write operations. In case 2, a kernel of 50176 threads is launched, and it performs 100,000 read and 50,000 write operations. Increasing the workload by a factor of roughly 50,000 increased the execution time by only ~7x. The work done by the two launches is significantly different.
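For reference (my arithmetic, based on the numbers above): with N = 50000 and threadsPerBlock = 1024, blocksPerGrid = (50000 + 1024 - 1) / 1024 = 49, so 49 x 1024 = 50176 threads are launched, of which 50000 pass the i < N check and each do 2 reads and 1 write. The measured ratio is 0.049 ms / 0.0072 ms ≈ 6.8x, far less than the ~50,000x increase in work, because the additional threads run largely in parallel across the GPU.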