I am studying the CUDA architecture.
I wrote some parallel processing code in the following environment:
GPU : GTX580 (CC is 2.0)
Threads Per Block : 16x16 = 256
Registers Per Thread : 16
Shared Memory Per Block : 48 bytes
I obtained the register count and the shared memory size from the compile option --ptxas-options=-v.
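For reference, a typical invocation looks like the following (the file names are illustrative; -arch=sm_20 matches the GTX580's compute capability 2.0):
nvcc -arch=sm_20 --ptxas-options=-v main.cu -o app
With -v, ptxas reports the registers and shared memory used by each kernel at compile time.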
In addition, the grid size is 32x32 = 1024 blocks, and there is no extra shared memory.
So I tried NVIDIA's CUDA Occupancy Calculator, and it reported:
3.) GPU Occupancy Data is displayed here and in the graphs:
Active Threads per Multiprocessor 1536
Active Warps per Multiprocessor 48
Active Thread Blocks per Multiprocessor 6
Occupancy of each Multiprocessor 100%
So I ran the application.
But the results showed that an 8x8 block size is faster than 16x16.
With an 8x8 block size the grid size is 64x64; with a 16x16 block size the grid size is 32x32.
So the total number of threads is the same in both cases.
I don't know why. Please help me.
The following code is part of my program.
void LOAD_VERTEX(){
MEM[0] = 60; //y0
MEM[1] = 50; //x0
MEM[2] = 128; //r0
MEM[3] = 0; //g0
MEM[4] = 70; //b0
MEM[5] = 260;
MEM[6] = 50;
MEM[7] = 135;
MEM[8] = 70;
MEM[9] = 0;
MEM[10] = 260;
MEM[11] = 250;
MEM[12] = 0;
MEM[13] = 200;
MEM[14] = 55;
MEM[15] = 60;
MEM[16] = 250;
MEM[17] = 55;
MEM[18] = 182;
MEM[19] = 100;
MEM[20] = 30;
MEM[21] = 330;
MEM[22] = 72;
MEM[23] = 12;
MEM[24] = 25;
MEM[25] = 30;
MEM[26] = 130;
MEM[27] = 80;
MEM[28] = 255;
MEM[29] = 15;
MEM[30] = 230;
MEM[31] = 330;
MEM[32] = 56;
MEM[33] = 186;
MEM[34] = 201;
}
__global__ void PRINT_POLYGON( unsigned char *IMAGEin, int *MEMin, int dev_ID, int a, int b, int c)
{
int i = blockIdx.x*TILE_WIDTH + threadIdx.x;
int j = blockIdx.y*TILE_HEIGHT + threadIdx.y;
float result_a, result_b;
int temp[15];
int k;
for(k = 0; k < 5; k++){
temp[k] = a*5+k;
temp[k+5] = b*5+k;
temp[k+10] = c*5+k;
}
int result_a_up = ((MEMin[temp[11]]-MEMin[temp[1]])*(i-MEMin[temp[0]]))-((MEMin[temp[10]]-MEMin[temp[0]])*(j-MEMin[temp[1]]));
int result_a_down = ((MEMin[temp[11]]-MEMin[temp[1]])*(MEMin[temp[5]]-MEMin[temp[0]]))-((MEMin[temp[6]]-MEMin[temp[1]])*(MEMin[temp[10]]-MEMin[temp[0]]));
int result_b_up = ((MEMin[temp[6]] -MEMin[temp[1]])*(MEMin[temp[0]]-i))-((MEMin[temp[5]] -MEMin[temp[0]])*(MEMin[temp[1]]-j));
int result_b_down = ((MEMin[temp[11]]-MEMin[temp[1]])*(MEMin[temp[5]]-MEMin[temp[0]]))-((MEMin[temp[6]]-MEMin[temp[1]])*(MEMin[temp[10]]-MEMin[temp[0]]));
result_a = float(result_a_up) / float(result_a_down);
result_b = float(result_b_up) / float(result_b_down);
int isIn = (0 <= result_a && result_a <=1) && ((0 <= result_b && result_b <= 1)) && ((0 <= (result_a+result_b) && (result_a+result_b) <= 1));
IMAGEin[(i*HEIGHTs+j)*CHANNELS] += (int)(float(MEMin[temp[2]]) + (float(MEMin[temp[7]])-float(MEMin[temp[2]]))*result_a + (float(MEMin[temp[12]])-float(MEMin[temp[2]]))*result_b) * isIn; //Red Channel
IMAGEin[(i*HEIGHTs+j)*CHANNELS+1] += (int)(float(MEMin[temp[3]]) + (float(MEMin[temp[8]])-float(MEMin[temp[3]]))*result_a + (float(MEMin[temp[13]])-float(MEMin[temp[3]]))*result_b) * isIn; //Green Channel
IMAGEin[(i*HEIGHTs+j)*CHANNELS+2] += (int)(float(MEMin[temp[4]]) + (float(MEMin[temp[9]])-float(MEMin[temp[4]]))*result_a + (float(MEMin[temp[14]])-float(MEMin[temp[4]]))*result_b) * isIn; //Blue Channel
}
//Information for each device
struct DataStruct {
int deviceID;
unsigned char IMAGE_SEG[WIDTH*HEIGHTs*CHANNELS];
};
void* routine( void *pvoidData ) {
DataStruct *data = (DataStruct*)pvoidData;
unsigned char *dev_IMAGE;
int *dev_MEM;
unsigned char *IMAGE_SEG = data->IMAGE_SEG;
HANDLE_ERROR(cudaSetDevice(data->deviceID));
//initialize array
memset(IMAGE_SEG, 0, WIDTH*HEIGHTs*CHANNELS);
printf("Device %d Starting..\n", data->deviceID);
//Evaluate Time
cudaEvent_t start, stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
HANDLE_ERROR( cudaMalloc( (void **)&dev_MEM, sizeof(int)*35) ); //Creating int array each Block
HANDLE_ERROR( cudaMalloc( (void **)&dev_IMAGE, sizeof(unsigned char)*WIDTH*HEIGHTs*CHANNELS) ); //output array
cudaMemcpy(dev_MEM, MEM, sizeof(int)*35, cudaMemcpyHostToDevice); //copy only the 35 ints allocated above
cudaMemset(dev_IMAGE, 0, sizeof(unsigned char)*WIDTH*HEIGHTs*CHANNELS);
dim3 grid(WIDTH/TILE_WIDTH, HEIGHTs/TILE_HEIGHT); //blocks in a grid
dim3 block(TILE_WIDTH, TILE_HEIGHT); //threads in a block
cudaEventRecord(start, 0);
PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 0, 1, 2); //Start the Kernel
PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 0, 2, 3); //Start the Kernel
PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 0, 3, 4); //Start the Kernel
PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 0, 4, 5); //Start the Kernel
PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 3, 2, 4); //Start the Kernel
PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 2, 6, 4); //Start the Kernel
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
HANDLE_ERROR( cudaMemcpy( IMAGE_SEG, dev_IMAGE, sizeof(unsigned char)*WIDTH*HEIGHTs*CHANNELS, cudaMemcpyDeviceToHost ) );
HANDLE_ERROR( cudaFree( dev_MEM ) );
HANDLE_ERROR( cudaFree( dev_IMAGE ) );
cudaEventElapsedTime( &elapsed_time_ms[data->deviceID], start, stop ); //Calculate elapsed time
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("Algorithm Elapsed Time : %f ms(Device %d)\n", elapsed_time_ms[data->deviceID], data->deviceID);
printf("Device %d Complete!\n", data->deviceID);
return 0;
}
int main( void )
{
int i;
CUTThread thread[7];
printf("Program Start.\n");
LOAD_VERTEX();
DataStruct data[DEVICENUM]; //define device info
for(i = 0; i < DEVICENUM; i++){
data[i].deviceID = i;
thread[i] = start_thread(routine, &(data[i]));
}
for(i = 0; i < DEVICENUM; i++){
end_thread(thread[i]);
}
cudaFreeHost(MEM);
return 0;
}
Since you copied over your question from the Nvidia forum, I'll copy my answer as well:
For your kernel, the finding of reduced performance at higher occupancy is easily explained by the cache overflowing at the higher occupancy.
The local array temp[] at full occupancy requires 1536×15×4 = 92160 bytes of cache per SM, while at 33% occupancy (for the smaller 8×8 block size) only 512×15×4 = 30720 bytes are required per SM. With the larger 48kB cache/SM setting the latter can be fully cached, eliminating off-chip memory accesses for temp[] almost completely; even in the default 16kB cache/SM setting the cache hit probability is substantially higher.
As the temp[] array is not needed anyway, the fastest option (at either occupancy) would be to completely eliminate it. The compiler might already be able to achieve this if you just insert a #pragma unroll before the initialization loop. Otherwise replace all uses of temp[] with a little macro or inline function, or even just substitute the result into the code (which in this case I would even find more readable).
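For illustration, a minimal sketch of that substitution (the VTX helper is a hypothetical name, not part of the original code; it assumes the layout of 5 ints per vertex used by LOAD_VERTEX):
__device__ __forceinline__ int VTX(const int *MEMin, int vertex, int component)
{
    // MEMin holds 5 values per vertex: y, x, r, g, b
    return MEMin[vertex * 5 + component];
}
// e.g. the numerator of result_a would become:
// int result_a_up = (VTX(MEMin,c,1) - VTX(MEMin,a,1)) * (i - VTX(MEMin,a,0))
//                 - (VTX(MEMin,c,0) - VTX(MEMin,a,0)) * (j - VTX(MEMin,a,1));
With the local temp[] array gone, there is nothing left to spill to local memory at any occupancy.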
Related
I have the following two mostly identical example codes. code1.cu uses cudaMalloc and cudaMemcpy to handle device/host data exchange.
code2.cu uses cudaMallocManaged, so cudaMemcpy is not needed. When cudaMallocManaged is used, I have to include cudaDeviceSynchronize() to get the correct results, while for the version with cudaMalloc this is not needed. I would appreciate some hints on why this is happening.
code2.cu
#include <iostream>
#include <math.h>
#include <vector>
//
using namespace std;
// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
if(i < max_x && j<max_y) {
atomicAdd(tot, x[i] + y[j]);
}
}
int main(void)
{
int Nx = 1<<15;
int Ny = 1<<15;
float *d_x = NULL, *d_y = NULL;
float *d_tot = NULL;
cudaMalloc((void **)&d_x, sizeof(float)*Nx);
cudaMalloc((void **)&d_y, sizeof(float)*Ny);
cudaMallocManaged((void **)&d_tot, sizeof(float));
// Allocate Unified Memory – accessible from CPU or GPU
vector<float> vx;
vector<float> vy;
// initialize x and y arrays on the host
for (int i = 0; i < Nx; i++)
vx.push_back(i);
for (int i = 0; i < Ny; i++)
vy.push_back(i*10);
//
float tot = 0;
for(int i = 0; i<vx.size(); i++)
for(int j = 0; j<vy.size(); j++)
tot += vx[i] + vy[j];
cout<<"CPU: tot: "<<tot<<endl;
//
cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);
//
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve maximum occupancy
cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, Nx+Ny);
//.. bx*by can not go beyond the blockSize, or hardware limit, which is 1024;
//.. bx*by = blockSize && bx/by = Nx/Ny, solve the equations
int bx = sqrt(blockSize*Nx/(float)Ny);
int by = bx*Ny/(float)Nx;
dim3 blockSize_3D(bx, by);
dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);
cout<<"blockSize: "<<blockSize<<endl;
cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;
// calculate theoretical occupancy
int maxActiveBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);
int device;
cudaDeviceProp props;
cudaGetDevice(&device);
cudaGetDeviceProperties(&props, device);
float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
(float)(props.maxThreadsPerMultiProcessor /
props.warpSize);
printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
blockSize, occupancy);
// Run the kernel on the GPU
tot = 0;
add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);
// Wait for GPU to finish before accessing on host
//cudaDeviceSynchronize();
tot =*d_tot;
//
//
cout<<" GPU: tot: "<<tot<<endl;
// Free memory
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_tot);
return 0;
}
code1.cu
#include <iostream>
#include <math.h>
#include <vector>
//
using namespace std;
// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
if(i < max_x && j<max_y) {
atomicAdd(tot, x[i] + y[j]);
}
}
int main(void)
{
int Nx = 1<<15;
int Ny = 1<<15;
float *d_x = NULL, *d_y = NULL;
float *d_tot = NULL;
cudaMalloc((void **)&d_x, sizeof(float)*Nx);
cudaMalloc((void **)&d_y, sizeof(float)*Ny);
cudaMalloc((void **)&d_tot, sizeof(float));
// Allocate Unified Memory – accessible from CPU or GPU
vector<float> vx;
vector<float> vy;
// initialize x and y arrays on the host
for (int i = 0; i < Nx; i++)
vx.push_back(i);
for (int i = 0; i < Ny; i++)
vy.push_back(i*10);
//
float tot = 0;
for(int i = 0; i<vx.size(); i++)
for(int j = 0; j<vy.size(); j++)
tot += vx[i] + vy[j];
cout<<"CPU: tot: "<<tot<<endl;
//
cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);
//
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve maximum occupancy
cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, Nx+Ny);
//.. bx*by can not go beyond the blockSize, or hardware limit, which is 1024;
//.. bx*by = blockSize && bx/by = Nx/Ny, solve the equations
int bx = sqrt(blockSize*Nx/(float)Ny);
int by = bx*Ny/(float)Nx;
dim3 blockSize_3D(bx, by);
dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);
cout<<"blockSize: "<<blockSize<<endl;
cout<<"bx: "<<bx<<" by: "<<by<<" gx: "<<gridSize_3D.x<<" gy: "<<gridSize_3D.y<<endl;
// calculate theoretical occupancy
int maxActiveBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);
int device;
cudaDeviceProp props;
cudaGetDevice(&device);
cudaGetDeviceProperties(&props, device);
float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
(float)(props.maxThreadsPerMultiProcessor /
props.warpSize);
printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
blockSize, occupancy);
// Run the kernel on the GPU
tot = 0;
add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);
// Wait for GPU to finish before accessing on host
//cudaDeviceSynchronize();
//
cudaMemcpy(&tot, d_tot, sizeof(float), cudaMemcpyDeviceToHost);
//
cout<<" GPU: tot: "<<tot<<endl;
// Free memory
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_tot);
return 0;
}
//Code2.cu has the following output:
//
//CPU: tot: 8.79609e+12
//blockSize: 1024
//bx: 32 by: 32 gx: 1024 gy: 1025
//Launched blocks of size 1024. Theoretical occupancy: 1.000000
//GPU: tot: 0
After uncommenting cudaDeviceSynchronize(), the output becomes:
GPU: tot: 8.79609e+12
CUDA kernel launches are asynchronous. That means that they execute independently of the CPU thread that launched them.
Because of this asynchronous launch, the CUDA kernel is not guaranteed to be finished (or even started) by the time your CPU thread code begins testing the result.
Therefore it is necessary to wait until the GPU kernel is complete, and cudaDeviceSynchronize() does exactly that. cudaMemcpy also has a synchronizing effect, so when you remove the cudaMemcpy operations, you lose that synchronization, but cudaDeviceSynchronize() restores it.
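For code2.cu the minimal fix is therefore just to restore that synchronization before dereferencing the managed pointer on the host; a sketch using the names from the posted code:
add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);
cudaDeviceSynchronize();  // wait for the kernel to finish; without this, *d_tot may still read 0
tot = *d_tot;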
I am very new to CUDA programming. Currently I have difficulty understanding the behavior of the following program, which calculates the dot product of two vectors.
The dot product kernel dotProd calculates the product of each pair of elements and reduces the results to a shorter vector of length blockDim.x*gridDim.x. Then the results in the vector *out are copied back to the host for further reduction.
The second version, dotProdWithSharedMem, is copied from the CUDA By Example book; see here.
My questions are:
When the kernel is launched with enough threads (nThreadsPerBlock*nblocks >= vector_length), the result of dotProd matches the one calculated by the CPU, but the result of dotProdWithSharedMem differs from both. What are the possible causes? A possible output of $ dot_prod.o 17 512:
Number of threads per block : 256
Number of blocks in the grid: 512
Total number of threads : 131072
Length of vectors : 131072
GPU using registers: 9.6904191971, time consummed: 0.56154 ms
GPU using shared : 9.6906833649, time consummed: 0.04473 ms
CPU result : 9.6904191971, time consummed: 0.28504 ms
When the kernel is launched with too few threads (nThreadsPerBlock*nblocks < vector_length), the GPU results seem to be less accurate. However, the while loop is supposed to handle this case. I guess something might be happening to the register variable temp in the loop; otherwise the result should remain the same as in question 1. A possible output of $ dot_prod.o 17 256:
Number of threads per block : 256
Number of blocks in the grid: 256
Total number of threads : 65536
Length of vectors : 131072
GPU using registers: 9.6906890869, time consummed: 0.31478 ms
GPU using shared : 9.6906604767, time consummed: 0.03530 ms
CPU result : 9.6904191971, time consummed: 0.28404 ms
I don't quite understand the size of the cache array in dotProdWithSharedMem. Why is it nThreadsPerBlock elements rather than the total number of threads nThreadsPerBlock * nblocks? I think the latter should be the right number of temp values; is this correct?
The code:
#include <iostream>
#include <string>
#include <cmath>
#include <chrono>
#include <cuda.h>
#define PI (float) 3.141592653589793
const size_t nThreadsPerBlock = 256;
static void HandleError(cudaError_t err, const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
__global__ void dotProd(int length, float *u, float *v, float *out) {
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned tid_const = threadIdx.x + blockDim.x * blockIdx.x;
float temp = 0;
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
out[tid_const] = temp;
}
__global__ void dotProdWithSharedMem(int length, float *u, float *v, float *out) {
__shared__ float cache[nThreadsPerBlock];
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned cid = threadIdx.x;
float temp = 0;
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
cache[cid] = temp;
__syncthreads();
int i = blockDim.x/2;
while (i != 0) {
if (cid < i) {
cache[cid] += cache[cid + i];
}
__syncthreads();
i /= 2;
}
if (cid == 0) {
out[blockIdx.x] = cache[0];
}
}
int main(int argc, char* argv[]) {
size_t vec_len = 1 << std::stoi(argv[1]);
size_t size = vec_len * sizeof(float);
size_t nblocks = std::stoi(argv[2]);
size_t size_out = nThreadsPerBlock*nblocks*sizeof(float);
size_t size_out_2 = nblocks*sizeof(float);
float *u = (float *)malloc(size);
float *v = (float *)malloc(size);
float *out = (float *)malloc(size_out);
float *out_2 = (float *)malloc(size_out_2);
float *dev_u, *dev_v, *dev_out, *dev_out_2; // Device arrays
float res_gpu = 0;
float res_gpu_2 = 0;
float res_cpu = 0;
dim3 dimGrid(nblocks, 1, 1);
dim3 dimBlocks(nThreadsPerBlock, 1, 1);
// Initiate values
for(size_t i=0; i<vec_len; ++i) {
u[i] = std::sin(i*PI*1E-2);
v[i] = std::cos(i*PI*1E-2);
}
HANDLE_ERROR( cudaMalloc((void**)&dev_u, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_v, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out, size_out) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out_2, size_out_2) );
HANDLE_ERROR( cudaMemcpy(dev_u, u, size, cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy(dev_v, v, size, cudaMemcpyHostToDevice) );
auto t1_gpu = std::chrono::system_clock::now();
dotProd <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out, dev_out, size_out, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nThreadsPerBlock*nblocks; ++i) {
res_gpu += out[i];
}
auto t2_gpu = std::chrono::system_clock::now();
// GPU version with shared memory
dotProdWithSharedMem <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out_2);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out_2, dev_out_2, size_out_2, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nblocks; ++i) {
res_gpu_2 += out_2[i];
}
auto t3_gpu = std::chrono::system_clock::now();
// CPU version for result-check
for(size_t i=0; i<vec_len; ++i) {
res_cpu += u[i] * v[i];
}
auto t2_cpu = std::chrono::system_clock::now();
double t_gpu = std::chrono::duration <double, std::milli> (t2_gpu - t1_gpu).count();
double t_gpu_2 = std::chrono::duration <double, std::milli> (t3_gpu - t2_gpu).count();
double t_cpu = std::chrono::duration <double, std::milli> (t2_cpu - t3_gpu).count();
printf("Number of threads per block : %i \n", nThreadsPerBlock);
printf("Number of blocks in the grid: %i \n", nblocks);
printf("Total number of threads : %i \n", nThreadsPerBlock*nblocks);
printf("Length of vectors : %i \n\n", vec_len);
printf("GPU using registers: %.10f, time consummed: %.5f ms\n", res_gpu, t_gpu);
printf("GPU using shared : %.10f, time consummed: %.5f ms\n", res_gpu_2, t_gpu_2);
printf("CPU result : %.10f, time consummed: %.5f ms\n", res_cpu, t_cpu);
cudaFree(dev_u);
cudaFree(dev_v);
cudaFree(dev_out);
cudaFree(dev_out_2);
free(u);
free(v);
free(out);
free(out_2);
return 0;
}
Thank you for your patience in reading this LONG post! Any help will be deeply appreciated!
Niko
You're exploring the limits of float precision combined with the variation associated with floating point order of operations. The actual "accuracy" here will depend on the exact data and exact order of operations. The different algorithms will have different order of operations, and therefore different results.
You may want to read this paper.
One of the assumptions you seem to be making is that the CPU result is the accurate one without any justification for that assumption.
If we define "accuracy" as the difference (i.e. "closeness") between the result and the numerically correct result, I suspect that the shared memory result is the more accurate one: its tree-like reduction sums values in pairs, which typically accumulates less rounding error than one long serial sum.
If we convert your code to use double type instead of float type, we observe that:
The result of all 3 approaches are much closer (identical in the printout).
The double results don't match any of the float results.
The shared memory result from the float case is actually the result that is closest to the double case results.
Here's a test case demonstrating this:
$ cat t397.cu
#include <iostream>
#include <string>
#include <cmath>
#include <chrono>
#include <cuda.h>
#ifndef USE_DOUBLE
typedef float ft;
#else
typedef double ft;
#endif
#define PI (ft) 3.141592653589793
const size_t nThreadsPerBlock = 256;
static void HandleError(cudaError_t err, const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
__global__ void dotProd(int length, ft *u, ft *v, ft *out) {
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned tid_const = threadIdx.x + blockDim.x * blockIdx.x;
ft temp = 0;
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
out[tid_const] = temp;
}
__global__ void dotProdWithSharedMem(int length, ft *u, ft *v, ft *out) {
__shared__ ft cache[nThreadsPerBlock];
unsigned tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned cid = threadIdx.x;
ft temp = 0;
while (tid < length) {
temp += u[tid] * v[tid];
tid += blockDim.x * gridDim.x;
}
cache[cid] = temp;
__syncthreads();
int i = blockDim.x/2;
while (i != 0) {
if (cid < i) {
cache[cid] += cache[cid + i];
}
__syncthreads();
i /= 2;
}
if (cid == 0) {
out[blockIdx.x] = cache[0];
}
}
int main(int argc, char* argv[]) {
size_t vec_len = 1 << std::stoi(argv[1]);
size_t size = vec_len * sizeof(ft);
size_t nblocks = std::stoi(argv[2]);
size_t size_out = nThreadsPerBlock*nblocks*sizeof(ft);
size_t size_out_2 = nblocks*sizeof(ft);
ft *u = (ft *)malloc(size);
ft *v = (ft *)malloc(size);
ft *out = (ft *)malloc(size_out);
ft *out_2 = (ft *)malloc(size_out_2);
ft *dev_u, *dev_v, *dev_out, *dev_out_2; // Device arrays
ft res_gpu = 0;
ft res_gpu_2 = 0;
ft res_cpu = 0;
dim3 dimGrid(nblocks, 1, 1);
dim3 dimBlocks(nThreadsPerBlock, 1, 1);
// Initiate values
for(size_t i=0; i<vec_len; ++i) {
u[i] = std::sin(i*PI*1E-2);
v[i] = std::cos(i*PI*1E-2);
}
HANDLE_ERROR( cudaMalloc((void**)&dev_u, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_v, size) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out, size_out) );
HANDLE_ERROR( cudaMalloc((void**)&dev_out_2, size_out_2) );
HANDLE_ERROR( cudaMemcpy(dev_u, u, size, cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy(dev_v, v, size, cudaMemcpyHostToDevice) );
auto t1_gpu = std::chrono::system_clock::now();
dotProd <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out, dev_out, size_out, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nThreadsPerBlock*nblocks; ++i) {
res_gpu += out[i];
}
auto t2_gpu = std::chrono::system_clock::now();
// GPU version with shared memory
dotProdWithSharedMem <<<dimGrid, dimBlocks>>> (vec_len, dev_u, dev_v, dev_out_2);
cudaDeviceSynchronize();
HANDLE_ERROR( cudaMemcpy(out_2, dev_out_2, size_out_2, cudaMemcpyDeviceToHost) );
// Reduction
for(size_t i=0; i<nblocks; ++i) {
res_gpu_2 += out_2[i];
}
auto t3_gpu = std::chrono::system_clock::now();
// CPU version for result-check
for(size_t i=0; i<vec_len; ++i) {
res_cpu += u[i] * v[i];
}
auto t2_cpu = std::chrono::system_clock::now();
double t_gpu = std::chrono::duration <double, std::milli> (t2_gpu - t1_gpu).count();
double t_gpu_2 = std::chrono::duration <double, std::milli> (t3_gpu - t2_gpu).count();
double t_cpu = std::chrono::duration <double, std::milli> (t2_cpu - t3_gpu).count();
printf("Number of threads per block : %i \n", nThreadsPerBlock);
printf("Number of blocks in the grid: %i \n", nblocks);
printf("Total number of threads : %i \n", nThreadsPerBlock*nblocks);
printf("Length of vectors : %i \n\n", vec_len);
printf("GPU using registers: %.10f, time consummed: %.5f ms\n", res_gpu, t_gpu);
printf("GPU using shared : %.10f, time consummed: %.5f ms\n", res_gpu_2, t_gpu_2);
printf("CPU result : %.10f, time consummed: %.5f ms\n", res_cpu, t_cpu);
cudaFree(dev_u);
cudaFree(dev_v);
cudaFree(dev_out);
cudaFree(dev_out_2);
free(u);
free(v);
free(out);
free(out_2);
return 0;
}
$ nvcc -std=c++11 t397.cu -o t397
$ ./t397 17 512
Number of threads per block : 256
Number of blocks in the grid: 512
Total number of threads : 131072
Length of vectors : 131072
GPU using registers: 9.6904191971, time consummed: 0.89290 ms
GPU using shared : 9.6906833649, time consummed: 0.04289 ms
CPU result : 9.6904191971, time consummed: 0.41527 ms
$ nvcc -std=c++11 t397.cu -o t397 -DUSE_DOUBLE
$ ./t397 17 512
Number of threads per block : 256
Number of blocks in the grid: 512
Total number of threads : 131072
Length of vectors : 131072
GPU using registers: 9.6913433287, time consummed: 1.33016 ms
GPU using shared : 9.6913433287, time consummed: 0.05032 ms
CPU result : 9.6913433287, time consummed: 0.41275 ms
$
I am using the following kernel to optimize vector-matrix multiplication for the case where both the vector and the matrix contain a large number of zeros. Using this kernel can cut the time for such a multiplication to as little as half of the time taken by cublasSgemv when more than 90% of the entries are zero. But it is still much slower than an equivalent BLAS gemm call on the host under Ubuntu 14.04.
vec = 1 x m, mat = m x m and prod = 1 x m; all are in row-major order
m >= 5000
__global__ void calc_v_m(float *vec, float *mat, float *prod, int m)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
if(x < m)
{
prod[x] = 0;
for(int i = 0; i < m; i++)
{
int offset = i*m + x;
if( mat[offset] != 0 && vec[i] != 0 )
prod[x] += vec[i] * mat[i*m+x];
}
}
}
What can be done to further enhance the performance of this kernel, apart from using libraries like cuSparse?
It would be nice if this optimization were compatible with compute capability 1.2.
Thanks
EDIT
Corrected: prod = 1 x m
GPU = Quadro FX 1800M, Cuda v.5.0 on Ubuntu 14.04
EDIT
Complete code that performs the multiplication using (i) BLAS, (ii) cuBLAS, and (iii) the above kernel, for m = 6000. Please enter 0 when asked to enter a value.
#include <iostream>
#include <stdio.h>
#include <time.h>
#include <cblas.h>
#include <cublas_v2.h>
#include <math.h>
using namespace std;
const int m = 6000;
const int BS = 512; // threads per block
const int NB = ceil((float) m / BS); // number of blocks
__global__ void calc_v_m(float *vec, float *mat, float *prod, int m)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
if(x < m)
{
prod[x] = 0;
for(int i = 0; i < m; i++)
{
int offset = i*m + x;
if( mat[offset] != 0 && vec[i] != 0 )
prod[x] += vec[i] * mat[i*m+x];
}
}
}
int main()
{
timespec blas_start, blas_end, cublas_start, cublas_end, opt_start, opt_end;
long totalnsec; //total nano sec
double totalsec, totaltime;
int i, j;
float *A = new float[m]; // 1 x m
float *B = new float[m*m]; // m x m
float *C = new float[m]; // 1 x m
float input;
cout<<"Enter a value to populate the vector (0 to make it sparse) ";
cin>>input;
// input vector A: every 600th element is non-zero, i.e. more than 90% zeros
for(i = 0; i < m; i++)
{
A[i] = input;
if( i % 600 == 0) //adjust for sparsity
A[i] = i;
}
// input matrix B: identity matrix
for(i = 0; i < m; i++)
for(j = 0; j < m; j++)
B[j*m + i] = (i==j);
//blas on host
clock_gettime(CLOCK_REALTIME, &blas_start);
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 1, m, m, 1.0f, A, m, B, m, 0.0f, C, m);
//cblas_sgemv(CblasRowMajor, CblasTrans, m, m, 1.0f, B, m, A, 1, 0.0f, C, 1);
clock_gettime(CLOCK_REALTIME, &blas_end);
/* for(i = 0; i < m; i++) printf("%f ", C[i]); */
//cublas section
cudaError_t cudaStat;
cublasHandle_t handle;
cublasCreate(&handle);
float *A_d, *B_d, *C_d;
cudaStat = cudaMalloc(&A_d, sizeof(float)*m);
if(cudaStat != cudaSuccess) printf("Error Allocating Memory for A_d\n");
cudaStat = cudaMalloc(&B_d, sizeof(float)*m*m);
if(cudaStat != cudaSuccess) printf("Error Allocating Memory for B_d\n");
cudaStat = cudaMalloc(&C_d, sizeof(float)*m);
if(cudaStat != cudaSuccess) printf("Error Allocating Memory for C_d\n");
cudaMemcpy(A_d, A, sizeof(float)*m, cudaMemcpyHostToDevice);
cudaMemcpy(B_d, B, sizeof(float)*m*m, cudaMemcpyHostToDevice);
float alpha = 1.0f, beta = 0.0f;
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &cublas_start);
cublasSgemv(handle, CUBLAS_OP_N, m, m, &alpha, B_d, m, A_d, 1, &beta, C_d, 1);
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &cublas_end);
cudaMemcpy(C, C_d, sizeof(float)*m, cudaMemcpyDeviceToHost);
/* for(i = 0; i < m; i++) printf("%f ", C[i]); */
// Call kernel having Optimization for Zeros
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &opt_start);
/////////////////// call kernel //////////////////
calc_v_m<<<NB, BS>>>(A_d, B_d, C_d, m);
//////////////////////////////////////////////////
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &opt_end);
cudaMemcpy(C, C_d, sizeof(float)*m, cudaMemcpyDeviceToHost);
/*for(i = 0; i < m; i++) printf("%f ", C[i]); */
// Print times
// blas time
totalsec = (double)blas_end.tv_sec - (double)blas_start.tv_sec;
totalnsec = blas_end.tv_nsec - blas_start.tv_nsec;
if(totalnsec < 0)
{
totalnsec += 1e9;
totalsec -= 1;
}
totaltime = totalsec + (double)totalnsec*1e-9;
cout<<"blas Time = "<< totaltime << "\n";
//cublas time
totalsec = (double)cublas_end.tv_sec - (double)cublas_start.tv_sec;
totalnsec = cublas_end.tv_nsec - cublas_start.tv_nsec;
if(totalnsec < 0)
{
totalnsec += 1e9;
totalsec -= 1;
}
totaltime = totalsec + (double)totalnsec*1e-9;
cout<<"cublas Time = "<< totaltime << "\n";
//Optimized Kernel Time
totalsec = (double)opt_end.tv_sec - (double)opt_start.tv_sec;
totalnsec = opt_end.tv_nsec - opt_start.tv_nsec;
if(totalnsec < 0)
{
totalnsec += 1e9;
totalsec -= 1;
}
totaltime = totalsec + (double)totalnsec*1e-9;
cout<<"Opt Kernel Time = "<< totaltime << "\n";
return 0;
}
Results
$ nvcc -arch=sm_12 blascomp.cu -o blascomp.o -lblas -lcublas
$ ./blascomp.o
Enter a value to populate the vector (0 to make it sparse) 0
blas Time = 0.000105207
cublas Time = 0.0070294
Opt Kernel Time = 0.00642797
At least on my system, BLAS is still the fastest for such a scenario.
Things get even more interesting if every 1200th element, instead of every 600th, is made non-zero:
Enter a value to populate the vector (0 to make it sparse) 0
blas Time = 7.84e-05
cublas Time = 0.00698783
Opt Kernel Time = 0.00643042
The important thing to recognise here is that the gemv operation you are concerned with is fundamentally memory throughput limited on GPUs, rather than compute throughput limited. This implies that an "optimisation" as you have shown in your kernel:
__global__ void calc_v_m(float *vec, float *mat, float *prod, int m)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
if(x < m)
{
prod[x] = 0;
for(int i = 0; i < m; i++)
{
int offset = i*m + x;
if( mat[offset] != 0 && vec[i] != 0 )
prod[x] += vec[i] * mat[i*m+x];
}
}
}
isn't really an optimisation at all, simply because the memory transactions are the performance bottleneck in the kernel, not the floating point arithmetic, and your code must perform most of the memory transactions irrespective of whether the multiply-add operation is skipped by the zero detection or not.
Consider the following, instrumented version of roughly the same code:
__constant__ float cvec1[2];
__global__ void
__launch_bounds__(512,4)
calc_v_m1(const float* __restrict__ vec,
const float* __restrict__ mat,
float* __restrict__ prod,
int m,
int do_reads = 1,
int do_write = 1)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
if(x < m)
{
float res = 0;
float mval = cvec1[0], vval = cvec1[1];
#pragma unroll 8
for(int i = 0; i < m; i++)
{
int offset = i*m + x;
if (do_reads) {
mval = mat[offset];
vval = vec[i];
}
res += mval * vval;
}
if (do_write) prod[x] = res;
}
}
Here I have added two optional arguments which control whether the kernel will load from global memory, and whether the kernel will store to global memory. This allows me to quantify the performance impact of the memory loads, computation, and memory stores independently. The results using your test code are instructive:
Function nvprof time
-----------------------------------------------
cublasSgemv 942.75us
calc_v_m 2798.4us
calc_v_m1(do_reads=1, do_write=1) 962.40us
calc_v_m1(do_reads=1, do_write=0) 970.40us
calc_v_m1(do_reads=0, do_write=1) 55.166us
calc_v_m1(do_reads=0, do_write=0) 55.102us
[All benchmarking done on a GTX970 using the CUDA 7.5 release toolchain and CUBLAS 7.5 library]
In no particular order:
The full instrumented kernel runtime is within a few percent of the equivalent CUBLAS call
The memory fetches from global memory are the bottleneck
The actual computations in the kernel only constitute 5% of the kernel running time
The "fire-and-forget" nature of write operations in CUDA means that the latency of the write has no significant effect on throughput.
Your "optimised" kernel is considerably slower than either CUBLAS or the instrumented kernel, probably because all you are introducing is branch divergence without addressing the source of the kernel bottleneck (the latency of the memory loads).
The only time conditionally executing the FMAD operation makes sense would be on an architecture where memory has near-zero latency and floating point throughput is severely constrained. The GPU definitely doesn't fall into that category.
The only other option for optimising this would be to exploit a priori information about the sparsity patterns in the LHS matrix to remove the need to read zero entries. Which is precisely what sparse matrix formats and linear algebra codes are designed to accommodate.
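As a rough illustration only (this is not the poster's code, and it assumes the matrix - or its transpose, to match the row-vector x matrix orientation above - has already been converted to CSR arrays rowPtr/colIdx/val on the host), a CSR matrix-vector kernel touches only the non-zero entries:
__global__ void csr_mv(int m, const int *rowPtr, const int *colIdx,
                       const float *val, const float *vec, float *prod)
{
    int row = blockDim.x * blockIdx.x + threadIdx.x;   // one thread per output element
    if (row < m)
    {
        float sum = 0.0f;
        for (int k = rowPtr[row]; k < rowPtr[row + 1]; k++)
            sum += val[k] * vec[colIdx[k]];            // only stored (non-zero) entries are read
        prod[row] = sum;
    }
}
The memory traffic then scales with the number of non-zeros rather than with m*m, which is where the real savings for a 90%-sparse matrix would come from.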
I am trying to develop a Sobel filter example using CUDA streams. Here is the program:
void SobelStream(void)
{
cv::Mat imageGrayL2 = cv::imread("/home/xavier/Bureau/Image1.png",0);
u_int8_t *u8_PtImageHost;
u_int8_t *u8_PtImageDevice;
u_int8_t *u8_ptDataOutHost;
u_int8_t *u8_ptDataOutDevice;
u_int8_t u8_Used[NB_STREAM];
u8_ptDataOutHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));
checkCudaErrors(cudaMalloc((void**)&u8_ptDataOutDevice,WIDTH*HEIGHT*sizeof(u_int8_t)));
u8_PtImageHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));
checkCudaErrors(cudaMalloc((void**)&u8_PtImageDevice,WIDTH*HEIGHT*sizeof(u_int8_t)));
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned char>();
checkCudaErrors(cudaMallocArray(&Array_PatchsMaxDevice, &channelDesc,WIDTH,HEIGHT ));
checkCudaErrors(cudaBindTextureToArray(Image,Array_PatchsMaxDevice));
dim3 threads(BLOC_X,BLOC_Y);
dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)HEIGHT/BLOC_Y));
ClearKernel<<<blocks,threads>>>(u8_ptDataOutDevice,WIDTH,HEIGHT);
int blockh = HEIGHT/NB_STREAM;
Stream = (cudaStream_t *) malloc(NB_STREAM * sizeof(cudaStream_t));
for (int i = 0; i < NB_STREAM; i++)
{
checkCudaErrors(cudaStreamCreate(&(Stream[i])));
}
// for(int i=0;i<NB_STREAM;i++)
// {
// cudaSetDevice(0);
// cudaStreamCreate(&Stream[i]);
// }
cudaEvent_t Start;
cudaEvent_t Stop;
cudaEventCreate(&Start);
cudaEventCreate(&Stop);
cudaEventRecord(Start, 0);
//////////////////////////////////////////////////////////
for(int i=0;i<NB_STREAM;i++)
{
if(i == 0)
{
int localHeight = blockh;
checkCudaErrors(cudaMemcpy2DToArrayAsync( Array_PatchsMaxDevice,
0,
0,
imageGrayL2.data,//u8_PtImageDevice,
WIDTH,
WIDTH,
blockh,
cudaMemcpyHostToDevice ,
Stream[i]));
dim3 threads(BLOC_X,BLOC_Y);
dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)localHeight/BLOC_Y));
SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,0,WIDTH,localHeight-1);
checkCudaErrors(cudaGetLastError());
u8_Used[i] = 1;
}else{
int ioffsetImage = WIDTH*(HEIGHT/NB_STREAM );
int hoffset = HEIGHT/NB_STREAM *i;
int hoffsetkernel = HEIGHT/NB_STREAM -1 + HEIGHT/NB_STREAM* (i-1);
int localHeight = min(HEIGHT - (blockh*i),blockh);
//printf("hoffset: %d hoffsetkernel %d localHeight %d rest %d ioffsetImage %d \n",hoffset,hoffsetkernel,localHeight,HEIGHT - (blockh +1 +blockh*(i-1)),ioffsetImage*i/WIDTH);
checkCudaErrors(cudaMemcpy2DToArrayAsync( Array_PatchsMaxDevice,
0,
hoffset,
&imageGrayL2.data[ioffsetImage*i],//&u8_PtImageDevice[ioffset*i],
WIDTH,
WIDTH,
localHeight,
cudaMemcpyHostToDevice ,
Stream[i]));
u8_Used[i] = 1;
if(HEIGHT - (blockh +1 +blockh*(i-1))<=0)
{
break;
}
}
}
///////////////////////////////////////////
for(int i=0;i<NB_STREAM;i++)
{
if(i == 0)
{
int localHeight = blockh;
dim3 threads(BLOC_X,BLOC_Y);
dim3 blocks(1,1);
SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,0,WIDTH,localHeight-1);
checkCudaErrors(cudaGetLastError());
u8_Used[i] = 1;
}else{
int ioffsetImage = WIDTH*(HEIGHT/NB_STREAM );
int hoffset = HEIGHT/NB_STREAM *i;
int hoffsetkernel = HEIGHT/NB_STREAM -1 + HEIGHT/NB_STREAM* (i-1);
int localHeight = min(HEIGHT - (blockh*i),blockh);
dim3 threads(BLOC_X,BLOC_Y);
dim3 blocks(1,1);
SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,hoffsetkernel,WIDTH,localHeight);
checkCudaErrors(cudaGetLastError());
u8_Used[i] = 1;
if(HEIGHT - (blockh +1 +blockh*(i-1))<=0)
{
break;
}
}
}
///////////////////////////////////////////////////////
for(int i=0;i<NB_STREAM;i++)
{
if(i == 0)
{
int localHeight = blockh;
checkCudaErrors(cudaMemcpyAsync(u8_ptDataOutHost,u8_ptDataOutDevice,WIDTH*(localHeight-1)*sizeof(u_int8_t),cudaMemcpyDeviceToHost,Stream[i]));
u8_Used[i] = 1;
}else{
int ioffsetImage = WIDTH*(HEIGHT/NB_STREAM );
int hoffset = HEIGHT/NB_STREAM *i;
int hoffsetkernel = HEIGHT/NB_STREAM -1 + HEIGHT/NB_STREAM* (i-1);
int localHeight = min(HEIGHT - (blockh*i),blockh);
checkCudaErrors(cudaMemcpyAsync(&u8_ptDataOutHost[hoffsetkernel*WIDTH],&u8_ptDataOutDevice[hoffsetkernel*WIDTH],WIDTH*localHeight*sizeof(u_int8_t),cudaMemcpyDeviceToHost,Stream[i]));
u8_Used[i] = 1;
if(HEIGHT - (blockh +1 +blockh*(i-1))<=0)
{
break;
}
}
}
for(int i=0;i<NB_STREAM;i++)
{
cudaStreamSynchronize(Stream[i]);
}
cudaEventRecord(Stop, 0);
cudaEventSynchronize(Start);
cudaEventSynchronize(Stop);
float dt_ms;
cudaEventElapsedTime(&dt_ms, Start, Stop);
printf("dt_ms %f \n",dt_ms);
}
I got really strange performance when executing my program, so I decided to profile my example. The profiler shows that the streams seem to be waiting for each other, which I don't understand.
Can someone help me with that?
First of all, in the future, please provide complete code. I'm also working off of your cross-posting here to fill in some details, such as kernel sizes.
You have two issues to address:
First, any time you wish to use cudaMemcpyAsync, you will most likely want to be working with pinned host allocations. If you use allocations created e.g. with malloc, you will not get the expected behavior from cudaMemcpyAsync as far as asynchronous concurrent execution is concerned. This necessity is covered in the programming guide:
If host memory is involved in the copy, it must be page-locked.
So the first change to make to your code is to convert this:
u8_PtImageHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));
u8_ptDataOutHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));
to this:
checkCudaErrors(cudaHostAlloc(&u8_PtImageHost, WIDTH*HEIGHT*sizeof(u_int8_t), cudaHostAllocDefault));
checkCudaErrors(cudaHostAlloc(&u8_ptDataOutHost, WIDTH*HEIGHT*sizeof(u_int8_t), cudaHostAllocDefault));
With that change alone, your execution duration drops from about 21ms to 7ms according to my testing. The reason for this is that without the change, we get no overlap whatsoever:
With the change, the copy activities (H->D and D->H) can overlap with each other and with kernel execution:
The second issue you face to get to concurrent kernel execution is that your kernels are just too large (too many blocks/threads):
#define WIDTH 6400
#define HEIGHT 4800
#define NB_STREAM 10
#define BLOC_X 32
#define BLOC_Y 32
dim3 threads(BLOC_X,BLOC_Y);
dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)HEIGHT/BLOC_Y));
I would suggest that if these are the sizes of kernels you need to run, there's probably not much benefit in striving for kernel overlap - each kernel launches enough blocks to "fill" the GPU, so you have already exposed enough parallelism to keep the GPU busy. But if you are desperate to witness kernel concurrency, you could make your kernels use a smaller number of blocks while causing each kernel to spend more time executing. We could do this by launching 1 block and having just the threads in that block perform the image filtering, roughly as sketched below.
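A rough sketch of such a launch (the kernel arguments mirror the posted code; the offsets are illustrative only):
for (int i = 0; i < NB_STREAM; i++)
{
    dim3 threads(BLOC_X, BLOC_Y);
    dim3 blocks(1, 1);   // deliberately tiny grid so each kernel runs long enough to overlap
    SobelKernel<<<blocks, threads, 0, Stream[i]>>>(u8_ptDataOutDevice,
                                                   i * (HEIGHT / NB_STREAM),
                                                   WIDTH,
                                                   HEIGHT / NB_STREAM);
}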
Hi all,
I referred to simpleMultiCopy.cu in the CUDA SDK 4.0 and wrote my own version; see the code below.
simpleMultiCopy.cu is an example of overlapping operations in a loop. Mine is similar: each iteration of the loop sends a slice of data to the GPU to compute, and that is where I perform the overlapping.
This is just a test/demo, so don't worry about the logic of the kernel (increment_kernel); it is only used to burn some time. The main logic lies in the processWithStreams function.
i: 0, current_stream: 0, next_stream: 1
i: 1, current_stream: 1, next_stream: 0
Cuda error in file 'ttt.cu' in line 132 : unspecified launch failure.
line 132 is:
CUDA_SAFE_CALL( cudaMemcpyAsync(
d_data_in[next_stream],
h_data_in[next_stream],
memsize,
cudaMemcpyHostToDevice,
stream[next_stream]) ); //this is line 132
I don't have many ideas about how CUDA works, so please help.
Any help will be appreciated.
Code:
#include <stdio.h>
#include <cutil_inline.h>
float processWithStreams(int streams_used);
#define STREAM_COUNT 2
int N = 1 << 24;
int *h_data_source;
int *h_data_sink;
int *h_data_in[STREAM_COUNT];
int *d_data_in[STREAM_COUNT];
int *h_data_out[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];
cudaEvent_t cycleDone[STREAM_COUNT];
cudaStream_t stream[STREAM_COUNT];
cudaEvent_t start, stop;
dim3 block(512);
dim3 grid;
int memsize;
__global__ void increment_kernel(int *g_data, int inc_value)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
//g_data[idx] = g_data[idx] + inc_value;
int i = blockDim.x * gridDim.x;
for(; i > 0; i /= 2)
{
if(idx > i)
g_data[idx]++;
}
}
int main(int argc, char *argv[])
{
if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
cutilDeviceInit(argc, argv);
else
cudaSetDevice( cutGetMaxGflopsDeviceId());
h_data_source = (int *)malloc(sizeof(int) * N);
memset(h_data_source, 0, sizeof(int) * N);
int i;
memsize = 1024 * 1024 * sizeof(int);
for(i = 0; i < STREAM_COUNT; i++)
{
CUDA_SAFE_CALL( cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault) );
CUDA_SAFE_CALL( cudaMalloc(&d_data_in[i], memsize) );
CUDA_SAFE_CALL( cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault) );
CUDA_SAFE_CALL( cudaMalloc(&d_data_out[i], memsize) );
CUDA_SAFE_CALL( cudaStreamCreate(&stream[i]) );
CUDA_SAFE_CALL( cudaEventCreate(&cycleDone[i]) );
cudaEventRecord(cycleDone[i], stream[i]);
}
CUDA_SAFE_CALL( cudaEventCreate(&start) );
CUDA_SAFE_CALL( cudaEventCreate(&stop) );
grid.x = N / block.x;
grid.y = 1;
float time1 = processWithStreams(STREAM_COUNT);
printf("time: %f\n", time1);
free( h_data_source );
free( h_data_sink );
for( i = 0; i < STREAM_COUNT; ++i ) {
cudaFreeHost(h_data_in[i]);
cudaFree(d_data_in[i]);
cudaStreamDestroy(stream[i]);
cudaEventDestroy(cycleDone[i]);
}
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaThreadExit();
cutilExit(argc, argv);
return 0;
}
float processWithStreams(int streams_used) {
int current_stream = 0;
float time;
cudaEventRecord(start, 0);
for( int i=0; i < N / 1024 / 1024; ++i ) {
int next_stream = (current_stream + 1 ) % streams_used;
printf("i: %d, current_stream: %d, next_stream: %d\n", i, current_stream, next_stream);
// Ensure that processing and copying of the last cycle has finished
cudaEventSynchronize(cycleDone[next_stream]);
// Process current frame
increment_kernel<<<grid, block, 0, stream[current_stream]>>>(
d_data_in[current_stream], 1);
// Upload next frame
CUDA_SAFE_CALL( cudaMemcpyAsync(
d_data_in[next_stream],
h_data_in[next_stream],
memsize,
cudaMemcpyHostToDevice,
stream[next_stream]) );
CUDA_SAFE_CALL( cudaEventRecord(
cycleDone[next_stream],
stream[next_stream]) );
// Download current frame
CUDA_SAFE_CALL( cudaMemcpyAsync(
h_data_out[current_stream],
d_data_out[current_stream],
memsize,
cudaMemcpyDeviceToHost,
stream[current_stream]) );
CUDA_SAFE_CALL( cudaEventRecord(
cycleDone[current_stream],
stream[current_stream]) );
current_stream = next_stream;
}
cudaEventRecord(stop, 0);
cudaEventElapsedTime(&time, start, stop);
return time;
}
The problem is in your kernel. One thing that happens when checking errors in CUDA is that errors that occurred previously but were not checked will be reported the next time you check for an error. That line is the first place you check for errors after the kernel launch, which returned the error you are seeing.
The error "unspecified launch failure" is usually associated with out-of-bounds accesses to memory, if I recall correctly.
You are launching your kernel with 32768 blocks and 512 threads per block. Calculating the idx value for the last thread of the last block, we get 32767 * 512 + 511 = 16777215. In the first iteration idx < i, but in the following ones you are trying to read from and write to position 16777215 of g_data, when you only allocated space for 1024 * 1024 integers.
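A sketch of a bounds-checked version (the extra parameter n is an addition for illustration, not part of the original code):
__global__ void increment_kernel(int *g_data, int inc_value, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)                 // never touch memory beyond the allocation
        g_data[idx] += inc_value;
}
The grid should then be sized from the number of elements actually allocated (memsize / sizeof(int)) rather than from N. Running the program under cuda-memcheck would also point at the out-of-bounds accesses directly.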
edit: just noticed, why the tag operator overloading?