CUDA branch divergence doesn't make any differences - cuda

I'm trying to learn CUDA by myself, and I'm now into the issue of branch divergence. As far as I understand, this is the name given to the problem that arises when several threads in a block are said to take a branch (due to if or switch statements, for example), but others in that block don't have to take it.
In order to investigate a little bit further this phenomena and its consequences, I've written a little file with a couple of CUDA functions. One of them is supposed to take lots of time, since the threads are stopped for much more time (9999... iterations) than in the other one (in which they're only stopped for an assignation).
However, when I run the code, I'm getting very similar times. Furthermore, even measuring the time that running both of them takes I get a time similar to running only one. Did I code anything wrong, or is there a logical explanation for this?
Code:
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
#define ITERATIONS 9999999999999999999
#define BLOCK_SIZE 16
unsigned int hTimer;
void checkCUDAError (const char *msg)
{
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg,cudaGetErrorString( err) );
getchar();
exit(EXIT_FAILURE);
}
}
__global__ void divergence(float *A, float *B){
float result = 0;
if(threadIdx.x % 2 == 0)
{
for(int i=0;i<ITERATIONS;i++){
result+=A[threadIdx.x]*A[threadIdx.x];
}
} else
for(int i=0;i<ITERATIONS;i++){
result+=A[threadIdx.x]*B[threadIdx.x];
}
}
__global__ void betterDivergence(float *A, float *B){
float result = 0;
float *aux;
//This structure should not affect performance that much
if(threadIdx.x % 2 == 0)
aux = A;
else
aux = B;
for(int i=0;i<ITERATIONS;i++){
result+=A[threadIdx.x]*aux[threadIdx.x];
}
}
// ------------------------
// MAIN function
// ------------------------
int main(int argc, char ** argv){
float* d_a;
float* d_b;
float* d_result;
float *elementsA;
float *elementsB;
elementsA = (float *)malloc(BLOCK_SIZE*sizeof(float));
elementsB = (float *)malloc(BLOCK_SIZE*sizeof(float));
//"Randomly" filling the arrays
for(int x=0;x<BLOCK_SIZE;x++){
elementsA[x] = (x%2==0)?2:1;
elementsB[x] = (x%2==0)?1:3;
}
cudaMalloc((void**) &d_a, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_b, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_result, sizeof(float));
cudaMemcpy(d_a, elementsA, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, elementsB, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
CUT_SAFE_CALL(cutCreateTimer(&hTimer));
CUT_CHECK_ERROR("cudaCreateTimer\n");
CUT_SAFE_CALL( cutResetTimer(hTimer) );
CUT_CHECK_ERROR("reset timer\n");
CUT_SAFE_CALL( cutStartTimer(hTimer) );
CUT_CHECK_ERROR("start timer\n");
float timerValue;
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid(32/dimBlock.x, 32/dimBlock.y);
divergence<<<dimBlock, dimGrid>>>(d_a, d_b);
betterDivergence<<<dimBlock, dimGrid>>>(d_a, d_b);
checkCUDAError("kernel invocation");
cudaThreadSynchronize();
CUT_SAFE_CALL(cutStopTimer(hTimer));
CUT_CHECK_ERROR("stop timer\n");
timerValue = cutGetTimerValue(hTimer);
printf("kernel execution time (secs): %f s\n", timerValue);
return 0;
}

1) You have no memory writes in your __global__ code except the local variable(result). I'm not sure that cuda compiler does that, but all your code can be safely removed with no side effect(and maybe the compiler had done that).
2) All your reads from device memory in __global__ functions are from one place on each iteration. Cuda will store the value in register memory and the longest operation(memory access) will be done very fast here.
3) May be the compiler had replaced your cycles with single multiplication like `result=ITERATIONS*A[threadIdx.x]*B[threadIdx.x]
4) If all the code in your functions will be executed as you wrote it, your betterDivergence is going to be approximately 2 times faster than your another function because you have the loops in if branches in slower one and no loops in branches in faster one. But there won't be any idle time in threads among the threads that execute same loop because all threads are going to execute the body of the loop each iteration.
I suggest you to write another example where you will store the result in some device memory and then copy that memory back to host and make some more unpredictable calculations to prevent possible optimizations.

Below is shown the final, tested, right example of a code that allows to compare the performance between CUDA code with and without branch divergence:
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
//#define ITERATIONS 9999999999999999999
#define ITERATIONS 999999
#define BLOCK_SIZE 16
#define WARP_SIZE 32
unsigned int hTimer;
void checkCUDAError (const char *msg)
{
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg,cudaGetErrorString( err) );
getchar();
exit(EXIT_FAILURE);
}
}
__global__ void divergence(float *A, float *B){
int a = blockIdx.x*blockDim.x + threadIdx.x;
if (a >= ITERATIONS) return;
if(threadIdx.x > 2)
{
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]+1;
}
} else
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]-1;
}
}
__global__ void noDivergence(float *A, float *B){
int a = blockIdx.x*blockDim.x + threadIdx.x;
if (a >= ITERATIONS) return;
if(threadIdx.x > WARP_SIZE)
{
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]+1;
}
} else
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]-1;
}
}
// ------------------------
// MAIN function
// ------------------------
int main(int argc, char ** argv){
float* d_a;
float* d_b;
float* d_result;
float *elementsA;
float *elementsB;
elementsA = (float *)malloc(BLOCK_SIZE*sizeof(float));
elementsB = (float *)malloc(BLOCK_SIZE*sizeof(float));
//"Randomly" filling the arrays
for(int x=0;x<BLOCK_SIZE;x++){
elementsA[x] = (x%2==0)?2:1;
}
cudaMalloc((void**) &d_a, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_b, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_result, sizeof(float));
cudaMemcpy(d_a, elementsA, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, elementsB, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
CUT_SAFE_CALL(cutCreateTimer(&hTimer));
CUT_CHECK_ERROR("cudaCreateTimer\n");
CUT_SAFE_CALL( cutResetTimer(hTimer) );
CUT_CHECK_ERROR("reset timer\n");
CUT_SAFE_CALL( cutStartTimer(hTimer) );
CUT_CHECK_ERROR("start timer\n");
float timerValue;
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid(128/dimBlock.x, 128/dimBlock.y);
//divergence<<<dimGrid, dimBlock>>>(d_a, d_b);
noDivergence<<<dimGrid, dimBlock>>>(d_a, d_b);
checkCUDAError("kernel invocation");
cudaThreadSynchronize();
CUT_SAFE_CALL(cutStopTimer(hTimer));
CUT_CHECK_ERROR("stop timer\n");
timerValue = cutGetTimerValue(hTimer)/1000;
printf("kernel execution time (secs): %f s\n", timerValue);
cudaMemcpy(elementsB, d_b, BLOCK_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
return 0;
}

Related

CUDA device runtime api cudaMemsetAsync doesn't work

I am trying to call cudaMemsetAsync from kernel (so called "dynamic parallelism"). But no matter what value I use, it always set memory to 0.
Here is my test code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_device_runtime_api.h"
#include <stdio.h>
const int size = 5;
__global__ void kernel(int *c)
{
cudaMemsetAsync(c, 0x7FFFFFFF, size * 4, NULL);
}
int main()
{
cudaError_t cudaStatus;
int c[size] = { 12, 12, 12, 12, 12 };
int *dev_c = 0;
cudaStatus = cudaSetDevice(0);
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
cudaStatus = cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice);
kernel <<< 1, 1 >>>(dev_c);
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(dev_c);
cudaStatus = cudaDeviceReset();
printf("%d\n", cudaStatus);
printf("{%d,%d,%d,%d,%d}\n", c[0], c[1], c[2], c[3], c[4]);
return 0;
}
And if I run it, I got output like this:
>nvcc -run kernel.cu -gencode=arch=compute_35,code=\"sm_35,compute_35\" -rdc=true -lcudadevrt
kernel.cu
Creating library a.lib and object a.exp
0
{0,0,0,0,0}
When I call memory set, I use value 0x7FFFFFFF. I'm expecting non-zero numbers, but it always shows zero.
Is this a bug? or I did something wrong? I'm using CUDA 8.0
I can confirm this appears not to work in CUDA 8 on the systems I tested it with.
If you want a single thread to perform the operation, you can use memset directly in device code (it, like memcpy, has been supported forever). The kernel will emit a byte sized loop inline within your kernel and the operation will be handled by each running thread.
If you want a dynamic parallelism style memset operation, then the easiest thing is to make your own. A trivial (and very, very lightly tested) implementation in the code you posted might look like this:
#include <cstring>
#include <cstdio>
const int size = 5;
__global__ void myMemset_kernel(void* p, unsigned char val, size_t sz)
{
size_t tid = threadIdx.x + blockDim.x * blockIdx.x;
unsigned char* _p = (unsigned char*)p;
for(; tid < sz; tid += blockDim.x * gridDim.x) {
_p[tid] = val;
}
}
__device__ void myMemset(void* p, unsigned int val, size_t sz, cudaStream_t s=NULL)
{
const dim3 blocksz(256,1,1);
size_t nblocks = (sz + blocksz.x -1) / blocksz.x;
unsigned charval = val & 0xff;
myMemset_kernel<<< dim3(nblocks,1,1), blocksz, 0, s >>>(p, charval, sz);
}
__global__ void kernel(int *c)
{
cudaStream_t s;
cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
myMemset(c, 0x7FFFFFFF, size * 4, s);
cudaDeviceSynchronize();
}
int main()
{
int c[size];
int *dev_c;
memset(&c[0], 0xffffff0c, size * sizeof(int));
printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]);
cudaMalloc((void**)&dev_c, size * sizeof(int));
cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice);
kernel <<< 1, 1 >>>(dev_c);
cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(dev_c);
printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]);
return 0;
}
which compiles and does this:
$ nvcc -rdc=true -arch=sm_52 -o memset memset.cu -lcudadevrt
$ ./memset
{0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c}
{ffffffff,ffffffff,ffffffff,ffffffff,ffffffff}
A final point -- note the values above and read this question and answer. In your code, it is not possible to use cudaMemset to apply a value of 0x7FFFFFFF. Although the value argument is an unsigned integer, cudaMemset and its relatives work like regular memset and set byte values. Only the least significant byte of the 32 bit argument is used to set values. If your objective is to set 32 bit values, then you will need to make your own version of memset for that purpose anyway.

Experiment to find out affect of block size on cuda program speed

I want to find out how the number of threads in a block affects the performance and speed of a cuda program. I wrote a simple vector addition code, here is my code:
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void gpuVecAdd(float *a, float *b, float *c, int n) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < n) {
c[id] = a[id] + b[id];
}
}
int main() {
int n = 1000000;
float *h_a, *h_b, *h_c, *t;
srand(time(NULL));
size_t bytes = n* sizeof(float);
h_a = (float*) malloc(bytes);
h_b = (float*) malloc(bytes);
h_c = (float*) malloc(bytes);
for (int i=0; i<n; i++)
{
h_a[i] =rand()%10;
h_b[i] =rand()%10;
}
float *d_a, *d_b, *d_c;
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, bytes);
cudaMalloc(&d_c, bytes);
gpuErrchk( cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
gpuErrchk( cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));
clock_t t1,t2;
t1 = clock();
int block_size = 1024;
gpuVecAdd<<<ceil(float(n/block_size)),block_size>>>(d_a, d_b, d_c, n);
gpuErrchk( cudaPeekAtLastError() );
t2 = clock();
cout<<(float)(t2-t1)/CLOCKS_PER_SEC<<" seconds";
gpuErrchk(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
free(h_a);
free(h_b);
free(h_c);
}
I read this post and Based on the talonmies' answer "The number of threads per block should be a round multiple of the warp size, which is 32 on all current hardware."
I checked the code with a different number of threads per block, for example, 2 and 1024 (which is the multiply of 32 and also the maximum number of thread per block). The average running time for both sizes is almost equal and I don't see a huge difference between them. Why is that? Is my benchmarking incorrect?
GPU kernel launches in CUDA are asynchronous. This means that control will be returned to the CPU thread before the kernel has finished executing.
If we want the CPU thread to time the duration of the kernel, we must cause the CPU thread to wait until the kernel has finished. We can do this by putting a call to cudaDeviceSynchronize() in the timing region. Then the measured time will include the full duration of kernel execution.

Wrong results of a CUDA dynamic parallelism code

I recently bumped in the problem illustrated at Uncorrectable ECC error. Shortly speaking, from time to time I receive an Uncorrectable ECC error and my dynamic parallelism code generates uncorrect results. The most probable hypothesis of the uncorrectable ECC error is a corrupted driver stack, which has also been indirectly confirmed by the experience of another user (see the above post). I would now like to face the second issue, i.e., the algorithmic one. To this end, I'm dealing with the reproducer reported below which, since the original code generating uncorrect results uses dynamic parallelism, uses this CUDA feature too.
I do not see any evindent issue with this code. I think that the synchronization regarding the child kernel launch should be ok: the first __syncthreads() should not be necessary and the cudaDeviceSynchronize() should ensure that all the memory writes of the child kernel are accomplished before the printf.
My question is: is this code wrong or the wrong results are due to a non-programming issue?
My configuration: CUDA 5.0, Windows 7, 4-GPU system equipped with Kepler K20c, driver 327.23.
#include <stdio.h>
#include <conio.h>
#define K 6
#define BLOCK_SIZE 256
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getch(); exit(code); }
}
}
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
__global__ void child_kernel(double* P1)
{
int m = threadIdx.x;
P1[m] = (double)m;
}
__global__ void parent_kernel(double* __restrict__ x, int M)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
if(i<M) {
double* P1 = new double[13];
dim3 dimBlock(2*K+1,1); dim3 dimGrid(1,1);
__syncthreads();
child_kernel<<<dimGrid,dimBlock>>>(P1);
cudaDeviceSynchronize();
for(int m=0; m<2*K+1; m++) printf("%f %f\n",P1[m],(double)m);
}
}
int main() {
const int M = 19000;
//gpuErrchk(cudaSetDevice(0));
double* x = (double*)malloc(M*sizeof(double));
for (int i=0; i<M; i++) x[i] = (double)i;
double* d_x; gpuErrchk(cudaMalloc((void**)&d_x,M*sizeof(double)));
gpuErrchk(cudaMemcpy(d_x,x,M*sizeof(double),cudaMemcpyHostToDevice));
dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(iDivUp(M,BLOCK_SIZE));
parent_kernel<<<dimGrid,dimBlock>>>(d_x,M);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
getch();
return 0;
}
I'm pretty sure you're exceeding the launch pending limit. It's nearly impossible to tell with your code as-is, but I've modified it and added error checking on the child kernel launch.
When I do that, I get launch errors, signified by a printout of !. Skipping the launch error cases, all of my in-kernel checking of P1[m] vs. m passes (I get no * printout at all.)
#include <stdio.h>
#define K 6
#define BLOCK_SIZE 256
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
__global__ void child_kernel(unsigned long long* P1)
{
int m = threadIdx.x;
P1[m] = (unsigned long long)m;
}
__global__ void parent_kernel(double* __restrict__ x, int M)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
if(i<M) {
unsigned long long* P1 = new unsigned long long[13];
dim3 dimBlock(2*K+1,1); dim3 dimGrid(1,1);
__syncthreads();
child_kernel<<<dimGrid,dimBlock>>>(P1);
cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) printf("!");
else for(unsigned long long m=0; m<dimBlock.x; m++) if (P1[m] != m) printf("*");
}
}
int main() {
const int M = 19000;
//gpuErrchk(cudaSetDevice(0));
double* x = (double*)malloc(M*sizeof(double));
for (int i=0; i<M; i++) x[i] = (double)i;
double* d_x; gpuErrchk(cudaMalloc((void**)&d_x,M*sizeof(double)));
gpuErrchk(cudaMemcpy(d_x,x,M*sizeof(double),cudaMemcpyHostToDevice));
dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(iDivUp(M,BLOCK_SIZE));
parent_kernel<<<dimGrid,dimBlock>>>(d_x,M);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
return 0;
}
Feel free to add further decoding of the err variable in the parent kernel to convince yourself that you are exceeding the launch pending limit. As another test, you can set M to 2048 instead of 19000 in your host code, and all the ! printouts go away. (launch pending limit default == 2048)
As I've stated in the comments, I think the uncorrectable ECC error is a separate issue, and I suggest trying the driver 321.01 that I linked in the comments.

How can I check the progress of matrix multiplication?

I'm now only need to show an intermediate progress of matrix multiplication.
for(unsigned int col=0; col<mtxSize; col++) {
unsigned tmp = 0;
for(unsigned int row=0; row<mtxSize; row++) {
for(unsigned int idx=0; idx<mtxSize; idx++) {
tmp += h_A[col*mtxSize+idx] * h_B[idx*mtxSize+row];
}
h_Rs[col*mtxSize+row] = tmp;
tmp = 0;
int rate_tmp = (col*mtxSize + (row+1))*100;
// Maybe like this...
fprintf(stdout, "Progress : %d.%d %%\r", rate_tmp/actMtxSize, rate_tmp%actMtxSize);
fflush(stdout);
}
}
In the case of the host code(use CPU), it is very easy beacause it process sequentially so we can check very easily.
But in the case of the GPU which process in parallel, what should I do?
Once the kernel is running, it does not return until finish the kernel execution.
So I can't check mid-data during the kernel execution time.
I think I need to use asynchronous kernel call, but I do not know well.
And even if the asynchronous kernel call is used, to see all of the data into several blocks over processors, do I have to write atomicAdd() (in other words, global memory access) function which is including some overhead?
Give me some advice or hint.
And I want to know in the case of CUDA.
Here is a code which demonstrates how to check progress from a matrix multiply kernel:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define TIME_INC 100000000
#define INCS 10
#define USE_PROGRESS 1
#define MAT_DIMX 4000
#define MAT_DIMY MAT_DIMX
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void mykernel(volatile int *data){
unsigned long time;
for (int i = 0; i < INCS; i++){
atomicAdd((int *)data,1);
__threadfence_system();
time = clock64();
while((clock64() - time)<TIME_INC) {};
}
printf("progress check finished\n");
}
__global__ void matmult(float *a, float *b, float *c, unsigned int rowA, unsigned int colA, unsigned int colB, volatile int *progress){
unsigned int row = threadIdx.x+blockDim.x*blockIdx.x;
unsigned int col = threadIdx.y+blockDim.y*blockIdx.y;
if ((row < rowA) && (col < colB)){
float temp = 0.0f;
for (unsigned int k = 0; k < colA; k++)
temp += a[(row*colA)+k] * b[(k*colB) + col];
c[(row*colB)+col] = temp;
#if USE_PROGRESS
if (!(threadIdx.x || threadIdx.y)){
atomicAdd((int *)progress, 1);
__threadfence_system();
}
#endif
}
}
int main(){
// simple test to demonstrate reading progress data from kernel
volatile int *d_data, *h_data;
cudaSetDeviceFlags(cudaDeviceMapHost);
cudaCheckErrors("cudaSetDeviceFlags error");
cudaHostAlloc((void **)&h_data, sizeof(int), cudaHostAllocMapped);
cudaCheckErrors("cudaHostAlloc error");
cudaHostGetDevicePointer((int **)&d_data, (int *)h_data, 0);
cudaCheckErrors("cudaHostGetDevicePointer error");
*h_data = 0;
printf("kernel starting\n");
mykernel<<<1,1>>>(d_data);
cudaCheckErrors("kernel fail");
int value = 0;
do{
int value1 = *h_data;
if (value1 > value){
printf("h_data = %d\n", value1);
value = value1;}}
while (value < (INCS-1));
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail 2");
// now try matrix multiply with progress
float *h_c, *d_a, *d_b, *d_c;
h_c = (float *)malloc(MAT_DIMX*MAT_DIMY*sizeof(float));
if (h_c == NULL) {printf("malloc fail\n"); return 1;}
cudaMalloc((void **)&d_a, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc a fail");
cudaMalloc((void **)&d_b, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc b fail");
cudaMalloc((void **)&d_c, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc c fail");
for (int i = 0; i < MAT_DIMX*MAT_DIMY; i++) h_c[i] = rand()/(float)RAND_MAX;
cudaMemcpy(d_a, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy a fail");
cudaMemcpy(d_b, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy b fail");
cudaEvent_t start, stop;
cudaEventCreate(&start); cudaEventCreate(&stop);
*h_data=0;
dim3 block(16,16);
dim3 grid(((MAT_DIMX+block.x-1)/block.x), ((MAT_DIMY+block.y-1)/block.y));
printf("matrix multiply kernel starting\n");
cudaEventRecord(start);
matmult<<<grid,block>>>(d_a, d_b, d_c, MAT_DIMY, MAT_DIMX, MAT_DIMX, d_data);
cudaEventRecord(stop);
#if USE_PROGRESS
unsigned int num_blocks = grid.x*grid.y;
float my_progress = 0.0f;
value = 0;
printf("Progress:\n");
do{
cudaEventQuery(stop); // may help WDDM scenario
int value1 = *h_data;
float kern_progress = (float)value1/(float)num_blocks;
if ((kern_progress - my_progress)> 0.1f) {
printf("percent complete = %2.1f\n", (kern_progress*100));
my_progress = kern_progress;}}
while (my_progress < 0.9f);
printf("\n");
#endif
cudaEventSynchronize(stop);
cudaCheckErrors("event sync fail");
float et;
cudaEventElapsedTime(&et, start, stop);
cudaCheckErrors("event elapsed time fail");
cudaDeviceSynchronize();
cudaCheckErrors("mat mult kernel fail");
printf("matrix multiply finished. elapsed time = %f milliseconds\n", et);
return 0;
}
The code associated with the first kernel call is just to demonstrate the basic idea of having a kernel report it's progress back.
The second part of the code shows a sample, naive matrix multiply on the GPU, with the GPU reporting it's progress back. I have included the ability to remove the progress check code via a preprocessor macro, as well as the ability to time the matrix multiply kernel. For the case I have here, there was no discernible difference in timing with or without the progress code. So while the progress reporting code probably does add some overhead, when compared to the scope of a reasonable sized matrix multiply kernel, it adds no significant time that I can see.
Some other uses of signalling are discussed here

Performance of static versus dynamic CUDA shared memory allocation

I have 2 kernels that do exactly the same thing. One of them allocates shared memory statically while the other allocates the memory dynamically at run time. I am using the shared memory as 2D array. So for the dynamic allocation, I have a macro that computes the memory location. Now, the results generated by the 2 kernels are exactly the same. However, the timing results I got from both kernels are 3 times apart! The static memory allocation is much faster. I am sorry that I can't post any of my code. Can someone give a justification for this?
I have no evidence that static shared memory allocation is faster than dynamic shared memory allocation. As was evidenced in the comments above, it would be impossible to answer your question without a reproducer. In at least the case of the code below, the timings of the same kernel, when run with static or dynamic shared memory allocations, are exactly the same:
#include <cuda.h>
#include <stdio.h>
#define BLOCK_SIZE 512
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/***********************************/
/* SHARED MEMORY STATIC ALLOCATION */
/***********************************/
__global__ void kernel_static_memory_allocation(int *d_inout, int N)
{
__shared__ int s[BLOCK_SIZE];
const int tid = threadIdx.x;
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) {
s[tid] = d_inout[i];
__syncthreads();
s[tid] = s[tid] * s[tid];
__syncthreads();
d_inout[i] = s[tid];
}
}
/************************************/
/* SHARED MEMORY DYNAMIC ALLOCATION */
/************************************/
__global__ void kernel_dynamic_memory_allocation(int *d_inout, int N)
{
extern __shared__ int s[];
const int tid = threadIdx.x;
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) {
s[tid] = d_inout[i];
__syncthreads();
s[tid] = s[tid] * s[tid];
__syncthreads();
d_inout[i] = s[tid];
}
}
/********/
/* MAIN */
/********/
int main(void)
{
int N = 1000000;
int* a = (int*)malloc(N*sizeof(int));
for (int i = 0; i < N; i++) { a[i] = i; }
int *d_inout; gpuErrchk(cudaMalloc(&d_inout, N * sizeof(int)));
int n_blocks = N/BLOCK_SIZE + (N%BLOCK_SIZE == 0 ? 0:1);
gpuErrchk(cudaMemcpy(d_inout, a, N*sizeof(int), cudaMemcpyHostToDevice));
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
kernel_static_memory_allocation<<<n_blocks,BLOCK_SIZE>>>(d_inout, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Static allocation - elapsed time: %3.3f ms \n", time);
cudaEventRecord(start, 0);
kernel_dynamic_memory_allocation<<<n_blocks,BLOCK_SIZE,BLOCK_SIZE*sizeof(int)>>>(d_inout, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Dynamic allocation - elapsed time: %3.3f ms \n", time);
}
The possible reason for that is due to the fact that the disassembled codes for the two kernels are exactly the same and do not change even on replacing int N = 1000000; with int N = rand();.