I recently bumped in the problem illustrated at Uncorrectable ECC error. Shortly speaking, from time to time I receive an Uncorrectable ECC error and my dynamic parallelism code generates uncorrect results. The most probable hypothesis of the uncorrectable ECC error is a corrupted driver stack, which has also been indirectly confirmed by the experience of another user (see the above post). I would now like to face the second issue, i.e., the algorithmic one. To this end, I'm dealing with the reproducer reported below which, since the original code generating uncorrect results uses dynamic parallelism, uses this CUDA feature too.
I do not see any evindent issue with this code. I think that the synchronization regarding the child kernel launch should be ok: the first __syncthreads() should not be necessary and the cudaDeviceSynchronize() should ensure that all the memory writes of the child kernel are accomplished before the printf.
My question is: is this code wrong or the wrong results are due to a non-programming issue?
My configuration: CUDA 5.0, Windows 7, 4-GPU system equipped with Kepler K20c, driver 327.23.
#include <stdio.h>
#include <conio.h>
#define K 6
#define BLOCK_SIZE 256
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getch(); exit(code); }
}
}
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
__global__ void child_kernel(double* P1)
{
int m = threadIdx.x;
P1[m] = (double)m;
}
__global__ void parent_kernel(double* __restrict__ x, int M)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
if(i<M) {
double* P1 = new double[13];
dim3 dimBlock(2*K+1,1); dim3 dimGrid(1,1);
__syncthreads();
child_kernel<<<dimGrid,dimBlock>>>(P1);
cudaDeviceSynchronize();
for(int m=0; m<2*K+1; m++) printf("%f %f\n",P1[m],(double)m);
}
}
int main() {
const int M = 19000;
//gpuErrchk(cudaSetDevice(0));
double* x = (double*)malloc(M*sizeof(double));
for (int i=0; i<M; i++) x[i] = (double)i;
double* d_x; gpuErrchk(cudaMalloc((void**)&d_x,M*sizeof(double)));
gpuErrchk(cudaMemcpy(d_x,x,M*sizeof(double),cudaMemcpyHostToDevice));
dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(iDivUp(M,BLOCK_SIZE));
parent_kernel<<<dimGrid,dimBlock>>>(d_x,M);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
getch();
return 0;
}
I'm pretty sure you're exceeding the launch pending limit. It's nearly impossible to tell with your code as-is, but I've modified it and added error checking on the child kernel launch.
When I do that, I get launch errors, signified by a printout of !. Skipping the launch error cases, all of my in-kernel checking of P1[m] vs. m passes (I get no * printout at all.)
#include <stdio.h>
#define K 6
#define BLOCK_SIZE 256
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
__global__ void child_kernel(unsigned long long* P1)
{
int m = threadIdx.x;
P1[m] = (unsigned long long)m;
}
__global__ void parent_kernel(double* __restrict__ x, int M)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
if(i<M) {
unsigned long long* P1 = new unsigned long long[13];
dim3 dimBlock(2*K+1,1); dim3 dimGrid(1,1);
__syncthreads();
child_kernel<<<dimGrid,dimBlock>>>(P1);
cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) printf("!");
else for(unsigned long long m=0; m<dimBlock.x; m++) if (P1[m] != m) printf("*");
}
}
int main() {
const int M = 19000;
//gpuErrchk(cudaSetDevice(0));
double* x = (double*)malloc(M*sizeof(double));
for (int i=0; i<M; i++) x[i] = (double)i;
double* d_x; gpuErrchk(cudaMalloc((void**)&d_x,M*sizeof(double)));
gpuErrchk(cudaMemcpy(d_x,x,M*sizeof(double),cudaMemcpyHostToDevice));
dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(iDivUp(M,BLOCK_SIZE));
parent_kernel<<<dimGrid,dimBlock>>>(d_x,M);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
return 0;
}
Feel free to add further decoding of the err variable in the parent kernel to convince yourself that you are exceeding the launch pending limit. As another test, you can set M to 2048 instead of 19000 in your host code, and all the ! printouts go away. (launch pending limit default == 2048)
As I've stated in the comments, I think the uncorrectable ECC error is a separate issue, and I suggest trying the driver 321.01 that I linked in the comments.
Related
I'm running a toy CUDA sample on my GeForce 1080 Ti (Pascal) on windows 10 and CUDA 9.2.
Goal is to test cudaMemPrefetchAsync to the CPU, as it's supposed to work.
However, I get a CUDA error (invalid device ordinal) on this particular line.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>
void fill(int* a, int val, int N) {
for (int k = 0; k < N; ++k) {
a[k] = val;
}
}
__global__ void add(int* a, int* b, int N)
{
for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += blockDim.x * gridDim.x) {
a[i] += b[i];
}
}
inline void check(cudaError_t err, const char* file, int line) {
if (err != cudaSuccess) {
::fprintf(stderr, "ERROR at %s[%d] : %s\n", file, line, cudaGetErrorString(err));
abort();
}
}
#define CUDA_CHECK(err) do { check(err, __FILE__, __LINE__); } while(0)
int main()
{
int deviceId;
CUDA_CHECK(cudaGetDevice(&deviceId));
const int N = 1024*1024*32;
int *a, *b;
CUDA_CHECK(cudaMallocManaged(&a, N * sizeof(int)));
CUDA_CHECK(cudaMallocManaged(&b, N * sizeof(int)));
CUDA_CHECK(cudaMemPrefetchAsync(a, N * sizeof(int), cudaCpuDeviceId)); // program breaks here
CUDA_CHECK(cudaMemPrefetchAsync(b, N * sizeof(int), cudaCpuDeviceId));
fill(a, 1, N);
fill(a, 2, N);
CUDA_CHECK(cudaMemPrefetchAsync(a, N * sizeof(int), deviceId));
CUDA_CHECK(cudaMemPrefetchAsync(b, N * sizeof(int), deviceId));
add<<<32, 256>>>(a, b, N);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
return 0;
}
Is that a hardware/driver/OS limitation? Can I simply ignore the error?
Is that a hardware/driver/OS limitation?
Yes, the latter. Quoting from the documentation
GPUs with SM architecture 6.x or higher (Pascal class or newer)
provide additional Unified Memory features such as on-demand page
migration and GPU memory oversubscription that are outlined throughout
this document. Note that currently these features are only supported
on Linux operating systems.
So asynchronous page migration is not supported in Windows at the moment and that it why you get an error when you try to enable it.
I noticed that when calling cublasSgemm function for each call of gemm from a host, there are 3 kernel invocations: memset, scal_kernel and gemm kernel itself (e.g. sgemm_large). This happens even if I use constants alpha/beta allocated in device memory. While the overhead of memset and scal_kernel is relatively small, the problem is memset is always launched in default stream which causes unnecessary synchronization.
The code:
__constant__ __device__ float alpha = 1;
__constant__ __device__ float beta = 1;
int main()
{
// ... memory allocation skipped ...
float* px = thrust::raw_pointer_cast(x.data());
float* py = thrust::raw_pointer_cast(y.data());
float* pmat = thrust::raw_pointer_cast(mat.data());
for (int iter = 0; iter < 3; ++iter)
{
cbstatus = cublasSgemm(cbh, CUBLAS_OP_N, CUBLAS_OP_N, crow, ccol, cshared, &alpha, px, crow, py, cshared, &beta, pmat, crow);
assert(0 == cbstatus);
}
}
This is what I see in profiler:
The question: is there a way to avoid memset or make it run in the stream assigned to CUBLAS handle?
One idea is to use DP and run device version of the gemm function, but this will work only on CC 3.0 and higher.
There was a bug in CUBLAS5.5 where a cudaMemset was used instead of cudaMemsetAsync in the specialized path where k >> m,n.
It is fixed in CUBLAS6.0 RC. And you can have access to it if you are a registered developer.
Btw, I wonder why you use __constant__ __device__ for alpha,beta.
Are you using pointerMode = DEVICE?
If not, you could simply use alpha,beta on the host.
Try the code below. The code is conceived to have only a cublasSgemm call, apart from unavoidable memory allocations and copies. You will see that
You have only one kernel launched (gemm_kernel1x1_core);
The two calls to cublasSgemm run perfectly in two different streams.
In the picture, the Visual Profiler timeline is shown.
My system: GeForce 540M, Windows 7, CUDA 5.5.
#include <conio.h>
#include <stdio.h>
#include <assert.h>
#include <cublas_v2.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getchar(); exit(code); }
}
}
/**********************/
/* cuBLAS ERROR CHECK */
/**********************/
#ifndef cublasSafeCall
#define cublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
#endif
inline void __cublasSafeCall(cublasStatus_t err, const char *file, const int line)
{
if( CUBLAS_STATUS_SUCCESS != err) {
fprintf(stderr, "CUBLAS error in file '%s', line %d\n \nerror %d \nterminating!\n",__FILE__, __LINE__,err);
getch(); cudaDeviceReset(); assert(0);
}
}
/********/
/* MAIN */
/********/
int main()
{
int N = 5;
float *A1, *A2, *B1, *B2, *C1, *C2;
float *d_A1, *d_A2, *d_B1, *d_B2, *d_C1, *d_C2;
A1 = (float*)malloc(N*N*sizeof(float));
B1 = (float*)malloc(N*N*sizeof(float));
C1 = (float*)malloc(N*N*sizeof(float));
A2 = (float*)malloc(N*N*sizeof(float));
B2 = (float*)malloc(N*N*sizeof(float));
C2 = (float*)malloc(N*N*sizeof(float));
gpuErrchk(cudaMalloc((void**)&d_A1,N*N*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_B1,N*N*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_C1,N*N*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_A2,N*N*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_B2,N*N*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_C2,N*N*sizeof(float)));
for (int i=0; i<N*N; i++) {
A1[i] = ((float)rand()/(float)RAND_MAX);
A2[i] = ((float)rand()/(float)RAND_MAX);
B1[i] = ((float)rand()/(float)RAND_MAX);
B2[i] = ((float)rand()/(float)RAND_MAX);
}
gpuErrchk(cudaMemcpy(d_A1, A1, N*N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_B1, B1, N*N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_A2, A2, N*N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_B2, B2, N*N*sizeof(float), cudaMemcpyHostToDevice));
cublasHandle_t handle;
cublasSafeCall(cublasCreate(&handle));
cudaStream_t stream1, stream2;
gpuErrchk(cudaStreamCreate(&stream1));
gpuErrchk(cudaStreamCreate(&stream2));
float alpha = 1.f;
float beta = 1.f;
cublasSafeCall(cublasSetStream(handle,stream1));
cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A1, N, d_B1, N, &beta, d_C1, N));
cublasSafeCall(cublasSetStream(handle,stream2));
cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A2, N, d_B2, N, &beta, d_C2, N));
gpuErrchk(cudaDeviceReset());
return 0;
}
I'm now only need to show an intermediate progress of matrix multiplication.
for(unsigned int col=0; col<mtxSize; col++) {
unsigned tmp = 0;
for(unsigned int row=0; row<mtxSize; row++) {
for(unsigned int idx=0; idx<mtxSize; idx++) {
tmp += h_A[col*mtxSize+idx] * h_B[idx*mtxSize+row];
}
h_Rs[col*mtxSize+row] = tmp;
tmp = 0;
int rate_tmp = (col*mtxSize + (row+1))*100;
// Maybe like this...
fprintf(stdout, "Progress : %d.%d %%\r", rate_tmp/actMtxSize, rate_tmp%actMtxSize);
fflush(stdout);
}
}
In the case of the host code(use CPU), it is very easy beacause it process sequentially so we can check very easily.
But in the case of the GPU which process in parallel, what should I do?
Once the kernel is running, it does not return until finish the kernel execution.
So I can't check mid-data during the kernel execution time.
I think I need to use asynchronous kernel call, but I do not know well.
And even if the asynchronous kernel call is used, to see all of the data into several blocks over processors, do I have to write atomicAdd() (in other words, global memory access) function which is including some overhead?
Give me some advice or hint.
And I want to know in the case of CUDA.
Here is a code which demonstrates how to check progress from a matrix multiply kernel:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define TIME_INC 100000000
#define INCS 10
#define USE_PROGRESS 1
#define MAT_DIMX 4000
#define MAT_DIMY MAT_DIMX
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void mykernel(volatile int *data){
unsigned long time;
for (int i = 0; i < INCS; i++){
atomicAdd((int *)data,1);
__threadfence_system();
time = clock64();
while((clock64() - time)<TIME_INC) {};
}
printf("progress check finished\n");
}
__global__ void matmult(float *a, float *b, float *c, unsigned int rowA, unsigned int colA, unsigned int colB, volatile int *progress){
unsigned int row = threadIdx.x+blockDim.x*blockIdx.x;
unsigned int col = threadIdx.y+blockDim.y*blockIdx.y;
if ((row < rowA) && (col < colB)){
float temp = 0.0f;
for (unsigned int k = 0; k < colA; k++)
temp += a[(row*colA)+k] * b[(k*colB) + col];
c[(row*colB)+col] = temp;
#if USE_PROGRESS
if (!(threadIdx.x || threadIdx.y)){
atomicAdd((int *)progress, 1);
__threadfence_system();
}
#endif
}
}
int main(){
// simple test to demonstrate reading progress data from kernel
volatile int *d_data, *h_data;
cudaSetDeviceFlags(cudaDeviceMapHost);
cudaCheckErrors("cudaSetDeviceFlags error");
cudaHostAlloc((void **)&h_data, sizeof(int), cudaHostAllocMapped);
cudaCheckErrors("cudaHostAlloc error");
cudaHostGetDevicePointer((int **)&d_data, (int *)h_data, 0);
cudaCheckErrors("cudaHostGetDevicePointer error");
*h_data = 0;
printf("kernel starting\n");
mykernel<<<1,1>>>(d_data);
cudaCheckErrors("kernel fail");
int value = 0;
do{
int value1 = *h_data;
if (value1 > value){
printf("h_data = %d\n", value1);
value = value1;}}
while (value < (INCS-1));
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail 2");
// now try matrix multiply with progress
float *h_c, *d_a, *d_b, *d_c;
h_c = (float *)malloc(MAT_DIMX*MAT_DIMY*sizeof(float));
if (h_c == NULL) {printf("malloc fail\n"); return 1;}
cudaMalloc((void **)&d_a, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc a fail");
cudaMalloc((void **)&d_b, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc b fail");
cudaMalloc((void **)&d_c, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc c fail");
for (int i = 0; i < MAT_DIMX*MAT_DIMY; i++) h_c[i] = rand()/(float)RAND_MAX;
cudaMemcpy(d_a, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy a fail");
cudaMemcpy(d_b, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy b fail");
cudaEvent_t start, stop;
cudaEventCreate(&start); cudaEventCreate(&stop);
*h_data=0;
dim3 block(16,16);
dim3 grid(((MAT_DIMX+block.x-1)/block.x), ((MAT_DIMY+block.y-1)/block.y));
printf("matrix multiply kernel starting\n");
cudaEventRecord(start);
matmult<<<grid,block>>>(d_a, d_b, d_c, MAT_DIMY, MAT_DIMX, MAT_DIMX, d_data);
cudaEventRecord(stop);
#if USE_PROGRESS
unsigned int num_blocks = grid.x*grid.y;
float my_progress = 0.0f;
value = 0;
printf("Progress:\n");
do{
cudaEventQuery(stop); // may help WDDM scenario
int value1 = *h_data;
float kern_progress = (float)value1/(float)num_blocks;
if ((kern_progress - my_progress)> 0.1f) {
printf("percent complete = %2.1f\n", (kern_progress*100));
my_progress = kern_progress;}}
while (my_progress < 0.9f);
printf("\n");
#endif
cudaEventSynchronize(stop);
cudaCheckErrors("event sync fail");
float et;
cudaEventElapsedTime(&et, start, stop);
cudaCheckErrors("event elapsed time fail");
cudaDeviceSynchronize();
cudaCheckErrors("mat mult kernel fail");
printf("matrix multiply finished. elapsed time = %f milliseconds\n", et);
return 0;
}
The code associated with the first kernel call is just to demonstrate the basic idea of having a kernel report it's progress back.
The second part of the code shows a sample, naive matrix multiply on the GPU, with the GPU reporting it's progress back. I have included the ability to remove the progress check code via a preprocessor macro, as well as the ability to time the matrix multiply kernel. For the case I have here, there was no discernible difference in timing with or without the progress code. So while the progress reporting code probably does add some overhead, when compared to the scope of a reasonable sized matrix multiply kernel, it adds no significant time that I can see.
Some other uses of signalling are discussed here
I am trying to find the maximum of an array.. I took the help from CUDA Maximum Reduction Algorithm Not Working. and do some own modification. However I am running it for 16 data. I am finding that in kernel code shared memory copies only 1st 4data. rest are lost. I put two cuPrintf..1st printf shows data is their in the shared memory. But the 2nd cuPrintf is just after __syncthreads.. and that shows 0 from thread ids 4 onwords.. pls help
#include
#include
#include
#include
#include
#include "cuPrintf.cu"
#include "cuPrintf.cuh"
__device__ float MaxOf2(float a, float b)
{
if(a > b) return a;
else return b;
}
__global__ void findMax(int size,float *array_device , float *outPut)
{
extern __shared__ float sdata[];
int tid = threadIdx.x;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if(i< size)
{
sdata[tid] = array_device[i];
cuPrintf(" array_d[%d]===%f, sdata[%d]===%f\n ",i,array_device[i],tid,sdata[tid]);
__threadfence();
}
__syncthreads();
if(tid<size)
cuPrintf(" array_d[%d]===%f, sdata[%d]===%f\n ",i,array_device[i],tid,sdata[tid]);
for ( int s=blockDim.x/2; s>0; s=s>>1)//s=blockDim.x/2
{
if (tid < s)
{
sdata[tid]= MaxOf2(sdata[tid],sdata[tid+s]);
}
__syncthreads();
}
if (tid == 0) outPut[blockIdx.x] = sdata[0];
}
int main()
{
long double M = pow(2,20);
long double N = 2;
int noThreadsPerBlock = 512 ;
printf("\n Provide the array Size N.(array will be of size N * 2^20 ) :-");
scanf("%Lf",&N);
long int size = 16;
int numOfBlock = (int)size /noThreadsPerBlock + 1;
printf("\n num of blocks==%ld",numOfBlock);
float *array_device , *outPut;
float array_host[]={221,100,2,340,47,36,500,1,33,4460,5,6,7,8,9,11};
cudaMalloc((void **)&array_device, size*sizeof(float));
cudaMalloc((void **)&outPut, size*sizeof(float));
cudaError_t error0 = cudaGetLastError();
printf("\n 0CUDA error: %s\n", cudaGetErrorString(error0));
printf("size===%ld",size);
cudaMemcpy(array_device, array_host, size*sizeof(float), cudaMemcpyHostToDevice);
cudaError_t error1 = cudaGetLastError();
printf("\n1CUDA error: %s\n", cudaGetErrorString(error1));
while(size>1 )
{
cudaPrintfInit();
findMax<<< numOfBlock,noThreadsPerBlock>>>(size,array_device, outPut);cudaPrintfDisplay(stdout, true);
cudaPrintfEnd();
cudaError_t error2 = cudaGetLastError();
printf(" 2CUDA error: %s\n", cudaGetErrorString(error2));
cudaMemcpy(array_device, outPut, size*sizeof(float), cudaMemcpyDeviceToDevice);
size = numOfBlock;
printf("\n ****size==%ld\n",size);
numOfBlock = (int)size /noThreadsPerBlock + 1;
}
cudaMemcpy(array_host, outPut, size*sizeof(float), cudaMemcpyDeviceToHost);
cudaError_t error3 = cudaGetLastError();
printf("\n3CUDA error: %s\n", cudaGetErrorString(error3));
for(int i=0;i<size;i++)
printf("\n index==%d ;data=%f ",i,array_host[i]);
return 0;
}
I'm posting my comment as an answer as requested.
Firstly, you havent specified dynamic size of shared memory in kernel launch. It should look something like:
findMax<<< numOfBlock,noThreadsPerBlock,sizeof(float)*noThreadsPerBlock>>>
Secondly, what was the concept behind condition if(tid<size) on second cuPrintf? Providing output of the program could also help.
I'm trying to learn CUDA by myself, and I'm now into the issue of branch divergence. As far as I understand, this is the name given to the problem that arises when several threads in a block are said to take a branch (due to if or switch statements, for example), but others in that block don't have to take it.
In order to investigate a little bit further this phenomena and its consequences, I've written a little file with a couple of CUDA functions. One of them is supposed to take lots of time, since the threads are stopped for much more time (9999... iterations) than in the other one (in which they're only stopped for an assignation).
However, when I run the code, I'm getting very similar times. Furthermore, even measuring the time that running both of them takes I get a time similar to running only one. Did I code anything wrong, or is there a logical explanation for this?
Code:
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
#define ITERATIONS 9999999999999999999
#define BLOCK_SIZE 16
unsigned int hTimer;
void checkCUDAError (const char *msg)
{
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg,cudaGetErrorString( err) );
getchar();
exit(EXIT_FAILURE);
}
}
__global__ void divergence(float *A, float *B){
float result = 0;
if(threadIdx.x % 2 == 0)
{
for(int i=0;i<ITERATIONS;i++){
result+=A[threadIdx.x]*A[threadIdx.x];
}
} else
for(int i=0;i<ITERATIONS;i++){
result+=A[threadIdx.x]*B[threadIdx.x];
}
}
__global__ void betterDivergence(float *A, float *B){
float result = 0;
float *aux;
//This structure should not affect performance that much
if(threadIdx.x % 2 == 0)
aux = A;
else
aux = B;
for(int i=0;i<ITERATIONS;i++){
result+=A[threadIdx.x]*aux[threadIdx.x];
}
}
// ------------------------
// MAIN function
// ------------------------
int main(int argc, char ** argv){
float* d_a;
float* d_b;
float* d_result;
float *elementsA;
float *elementsB;
elementsA = (float *)malloc(BLOCK_SIZE*sizeof(float));
elementsB = (float *)malloc(BLOCK_SIZE*sizeof(float));
//"Randomly" filling the arrays
for(int x=0;x<BLOCK_SIZE;x++){
elementsA[x] = (x%2==0)?2:1;
elementsB[x] = (x%2==0)?1:3;
}
cudaMalloc((void**) &d_a, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_b, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_result, sizeof(float));
cudaMemcpy(d_a, elementsA, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, elementsB, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
CUT_SAFE_CALL(cutCreateTimer(&hTimer));
CUT_CHECK_ERROR("cudaCreateTimer\n");
CUT_SAFE_CALL( cutResetTimer(hTimer) );
CUT_CHECK_ERROR("reset timer\n");
CUT_SAFE_CALL( cutStartTimer(hTimer) );
CUT_CHECK_ERROR("start timer\n");
float timerValue;
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid(32/dimBlock.x, 32/dimBlock.y);
divergence<<<dimBlock, dimGrid>>>(d_a, d_b);
betterDivergence<<<dimBlock, dimGrid>>>(d_a, d_b);
checkCUDAError("kernel invocation");
cudaThreadSynchronize();
CUT_SAFE_CALL(cutStopTimer(hTimer));
CUT_CHECK_ERROR("stop timer\n");
timerValue = cutGetTimerValue(hTimer);
printf("kernel execution time (secs): %f s\n", timerValue);
return 0;
}
1) You have no memory writes in your __global__ code except the local variable(result). I'm not sure that cuda compiler does that, but all your code can be safely removed with no side effect(and maybe the compiler had done that).
2) All your reads from device memory in __global__ functions are from one place on each iteration. Cuda will store the value in register memory and the longest operation(memory access) will be done very fast here.
3) May be the compiler had replaced your cycles with single multiplication like `result=ITERATIONS*A[threadIdx.x]*B[threadIdx.x]
4) If all the code in your functions will be executed as you wrote it, your betterDivergence is going to be approximately 2 times faster than your another function because you have the loops in if branches in slower one and no loops in branches in faster one. But there won't be any idle time in threads among the threads that execute same loop because all threads are going to execute the body of the loop each iteration.
I suggest you to write another example where you will store the result in some device memory and then copy that memory back to host and make some more unpredictable calculations to prevent possible optimizations.
Below is shown the final, tested, right example of a code that allows to compare the performance between CUDA code with and without branch divergence:
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
//#define ITERATIONS 9999999999999999999
#define ITERATIONS 999999
#define BLOCK_SIZE 16
#define WARP_SIZE 32
unsigned int hTimer;
void checkCUDAError (const char *msg)
{
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg,cudaGetErrorString( err) );
getchar();
exit(EXIT_FAILURE);
}
}
__global__ void divergence(float *A, float *B){
int a = blockIdx.x*blockDim.x + threadIdx.x;
if (a >= ITERATIONS) return;
if(threadIdx.x > 2)
{
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]+1;
}
} else
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]-1;
}
}
__global__ void noDivergence(float *A, float *B){
int a = blockIdx.x*blockDim.x + threadIdx.x;
if (a >= ITERATIONS) return;
if(threadIdx.x > WARP_SIZE)
{
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]+1;
}
} else
for(int i=0;i<ITERATIONS;i++){
B[a]=A[a]-1;
}
}
// ------------------------
// MAIN function
// ------------------------
int main(int argc, char ** argv){
float* d_a;
float* d_b;
float* d_result;
float *elementsA;
float *elementsB;
elementsA = (float *)malloc(BLOCK_SIZE*sizeof(float));
elementsB = (float *)malloc(BLOCK_SIZE*sizeof(float));
//"Randomly" filling the arrays
for(int x=0;x<BLOCK_SIZE;x++){
elementsA[x] = (x%2==0)?2:1;
}
cudaMalloc((void**) &d_a, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_b, BLOCK_SIZE*sizeof(float));
cudaMalloc((void**) &d_result, sizeof(float));
cudaMemcpy(d_a, elementsA, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, elementsB, BLOCK_SIZE*sizeof(float), cudaMemcpyHostToDevice);
CUT_SAFE_CALL(cutCreateTimer(&hTimer));
CUT_CHECK_ERROR("cudaCreateTimer\n");
CUT_SAFE_CALL( cutResetTimer(hTimer) );
CUT_CHECK_ERROR("reset timer\n");
CUT_SAFE_CALL( cutStartTimer(hTimer) );
CUT_CHECK_ERROR("start timer\n");
float timerValue;
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid(128/dimBlock.x, 128/dimBlock.y);
//divergence<<<dimGrid, dimBlock>>>(d_a, d_b);
noDivergence<<<dimGrid, dimBlock>>>(d_a, d_b);
checkCUDAError("kernel invocation");
cudaThreadSynchronize();
CUT_SAFE_CALL(cutStopTimer(hTimer));
CUT_CHECK_ERROR("stop timer\n");
timerValue = cutGetTimerValue(hTimer)/1000;
printf("kernel execution time (secs): %f s\n", timerValue);
cudaMemcpy(elementsB, d_b, BLOCK_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
return 0;
}