Improving kernel performance by increasing occupancy? - cuda

Here is an output of Compute Visual Profiler for my kernel on GT 440:
Kernel details: Grid size: [100 1 1], Block size: [256 1 1]
Register Ratio: 0.84375 ( 27648 / 32768 ) [35 registers per thread]
Shared Memory Ratio: 0.336914 ( 16560 / 49152 ) [5520 bytes per
Block]
Active Blocks per SM: 3 (Maximum Active Blocks per SM: 8)
Active threads per SM: 768 (Maximum Active threads per SM: 1536)
Potential Occupancy: 0.5 ( 24 / 48 )
Occupancy limiting factor: Registers
Please, pay your attention to the bullets marked bold. Kernel execution time is 121195 us.
I reduced a number of registers per thread by moving some local variables to the shared memory. The Compute Visual Profiler output became:
Kernel details: Grid size: [100 1 1], Block size: [256 1 1]
Register Ratio: 1 ( 32768 / 32768 ) [30 registers per thread]
Shared Memory Ratio: 0.451823 ( 22208 / 49152 ) [5552 bytes per Block]
Active Blocks per SM: 4 (Maximum Active Blocks per SM: 8)
Active threads per SM: 1024 (Maximum Active threads per SM: 1536)
Potential Occupancy: 0.666667 ( 32 / 48 )
Occupancy limiting factor: Registers
Hence, now 4 blocks are simultaneously executed on a single SM versus 3 blocks in the previous version. However, the execution time is 115756 us, which is almost the same! Why? Aren't the blocks totally independent being executed on different CUDA cores?

You are implicitly assuming that higher occupancy automatically translates into higher performance. That is most often not the case.
The NVIDIA architecture needs a certain number of active warps per MP in order to hide the instruction pipeline latency of the GPU. On your Fermi based card, that requirement translates to a minimum occupancy of about 30%. Aiming for higher occupancies than that minimum will not necessarily result in higher throughput, as the latency bottleneck can have moved to another part of the GPU. Your entry level GPU doesn't have a lot of memory bandwidth, and it is quite possible that 3 blocks per MP is sufficient to make you code memory bandwidth limited, in which case increasing the number of blocks won't have any effect on performance (it might even go down because of increased memory controller contention and cache misses). Further, you said you spilled variables to shared memory in order to reduce the register foot print of the kernel. On Fermi, shared memory only has about 1000 Gb/s of bandwidth, compared to about 8000 Gb/s for registers (see the link below for the microbenchmarking results which demonstrate this). So you have moved variables to slower memory, which may also have a negative effect on performance, offsetting any benefit which high occupancy affords.
If you have not already seen it, I highly recommend Vasily Volkov's presentation from GTC 2010 "Better performance at lower occupancy" (pdf). Here is it shown how exploiting instruction level parallelism can increase GPU throughput to very high levels at very, very low levels of occupancy.

talonmies has already answered your question, so I just want to share a code inspired by the first part of V. Volkov's presentation mentioned in the answer above.
This is the code:
#include<stdio.h>
#define N_ITERATIONS 8192
//#define DEBUG
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/********************************************************/
/* KERNEL0 - NO INSTRUCTION LEVEL PARALLELISM (ILP = 0) */
/********************************************************/
__global__ void kernel0(int *d_a, int *d_b, int *d_c, unsigned int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x ;
if (tid < N) {
int a = d_a[tid];
int b = d_b[tid];
int c = d_c[tid];
for(unsigned int i = 0; i < N_ITERATIONS; i++) {
a = a * b + c;
}
d_a[tid] = a;
}
}
/*****************************************************/
/* KERNEL1 - INSTRUCTION LEVEL PARALLELISM (ILP = 2) */
/*****************************************************/
__global__ void kernel1(int *d_a, int *d_b, int *d_c, unsigned int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N/2) {
int a1 = d_a[tid];
int b1 = d_b[tid];
int c1 = d_c[tid];
int a2 = d_a[tid+N/2];
int b2 = d_b[tid+N/2];
int c2 = d_c[tid+N/2];
for(unsigned int i = 0; i < N_ITERATIONS; i++) {
a1 = a1 * b1 + c1;
a2 = a2 * b2 + c2;
}
d_a[tid] = a1;
d_a[tid+N/2] = a2;
}
}
/*****************************************************/
/* KERNEL2 - INSTRUCTION LEVEL PARALLELISM (ILP = 4) */
/*****************************************************/
__global__ void kernel2(int *d_a, int *d_b, int *d_c, unsigned int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N/4) {
int a1 = d_a[tid];
int b1 = d_b[tid];
int c1 = d_c[tid];
int a2 = d_a[tid+N/4];
int b2 = d_b[tid+N/4];
int c2 = d_c[tid+N/4];
int a3 = d_a[tid+N/2];
int b3 = d_b[tid+N/2];
int c3 = d_c[tid+N/2];
int a4 = d_a[tid+3*N/4];
int b4 = d_b[tid+3*N/4];
int c4 = d_c[tid+3*N/4];
for(unsigned int i = 0; i < N_ITERATIONS; i++) {
a1 = a1 * b1 + c1;
a2 = a2 * b2 + c2;
a3 = a3 * b3 + c3;
a4 = a4 * b4 + c4;
}
d_a[tid] = a1;
d_a[tid+N/4] = a2;
d_a[tid+N/2] = a3;
d_a[tid+3*N/4] = a4;
}
}
/********/
/* MAIN */
/********/
void main() {
const int N = 1024;
int *h_a = (int*)malloc(N*sizeof(int));
int *h_a_result_host = (int*)malloc(N*sizeof(int));
int *h_a_result_device = (int*)malloc(N*sizeof(int));
int *h_b = (int*)malloc(N*sizeof(int));
int *h_c = (int*)malloc(N*sizeof(int));
for (int i=0; i<N; i++) {
h_a[i] = 2;
h_b[i] = 1;
h_c[i] = 2;
h_a_result_host[i] = h_a[i];
for(unsigned int k = 0; k < N_ITERATIONS; k++) {
h_a_result_host[i] = h_a_result_host[i] * h_b[i] + h_c[i];
}
}
int *d_a; gpuErrchk(cudaMalloc((void**)&d_a, N*sizeof(int)));
int *d_b; gpuErrchk(cudaMalloc((void**)&d_b, N*sizeof(int)));
int *d_c; gpuErrchk(cudaMalloc((void**)&d_c, N*sizeof(int)));
gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_b, h_b, N*sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_c, h_c, N*sizeof(int), cudaMemcpyHostToDevice));
// --- Creating events for timing
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
/***********/
/* KERNEL0 */
/***********/
cudaEventRecord(start, 0);
kernel0<<<1, N>>>(d_a, d_b, d_c, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("GFlops = %f\n", (1.e-6)*(float)(N*N_ITERATIONS)/time);
gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(int), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) if(h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %i; Device = %i\n", i, h_a_result_host[i], h_a_result_device[i]); return; }
/***********/
/* KERNEL1 */
/***********/
gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(int), cudaMemcpyHostToDevice));
cudaEventRecord(start, 0);
kernel1<<<1, N/2>>>(d_a, d_b, d_c, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("GFlops = %f\n", (1.e-6)*(float)(N*N_ITERATIONS)/time);
gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(int), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) if(h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %i; Device = %i\n", i, h_a_result_host[i], h_a_result_device[i]); return; }
/***********/
/* KERNEL2 */
/***********/
gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(int), cudaMemcpyHostToDevice));
cudaEventRecord(start, 0);
kernel2<<<1, N/4>>>(d_a, d_b, d_c, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("GFlops = %f\n", (1.e-6)*(float)(N*N_ITERATIONS)/time);
gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(int), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) if(h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %i; Device = %i\n", i, h_a_result_host[i], h_a_result_device[i]); return; }
cudaDeviceReset();
}
On my GeForce GT540M, the result is
kernel0 GFlops = 21.069281 Occupancy = 66%
kernel1 GFlops = 21.183354 Occupancy = 33%
kernel2 GFlops = 21.224517 Occupancy = 16.7%
which means that kernels with lower occupancy can still exhibit high performance, if Instruction Level Parallelism (ILP) is exploited.

Related

Incorrect addition of Prime numbers in CUDA [duplicate]

This question already has an answer here:
How to find the sum of array in CUDA by reduction
(1 answer)
Closed 3 years ago.
I use reduction logic in code by referring How to find the sum of array in CUDA by reduction.
But It is giving some errors. I am not getting my mistake, could you please help me out??
required specification:
1.Cuda toolkit v6.5
2. graphics: GTX 210 (compute capability 1.2)
3. visual studio 2013
#include<stdio.h>
#include<cuda.h>
#include<malloc.h>
#include<conio.h>
#include<time.h>
#include<windows.h>
#define SIZE 10
#define N 100
__global__ void vectoreAdd(int *d_a, int *d_b, int *d_c)
{
__shared__ int sdata[256];
int i = threadIdx.x + (blockIdx.x*blockDim.x);
sdata[threadIdx.x] = d_a[i];
__syncthreads();
if (i<SIZE)
for (i = 2; i<SIZE; i++)
{
int counter = 0;
for (int j = 2; j<d_a[i]; j++)
{
if (d_a[i] % j == 0)
{
counter = 1; break;
}
}
if (counter == 0)
{
d_b[i] = d_a[i];
}
}
// do reduction in shared mem
for (int s = 1; s < blockDim.x; s *= 2)
{
int index = 2 * s * threadIdx.x;;
if (index < blockDim.x)
{
sdata[index] += sdata[index + s];
}
__syncthreads();
}
// write result for this block to global mem
if (threadIdx.x == 0)
atomicAdd(d_c, sdata[0]);
}
}
int main()
{
clock_t tic = clock();
int *a, *b, *summation=0, sum = 0,count=-1; //declare summation as double/long if needed
int *d_a, *d_b, *d_c;
//int blocks, block_size = 512;
int size = N * sizeof(int);
a = (int *)malloc(SIZE*sizeof(int));
b = (int *)malloc(SIZE*sizeof(int));
summation = (int *)malloc(SIZE*sizeof(int));
cudaMalloc((void**)&d_a, SIZE * sizeof(int));
cudaMalloc((void**)&d_b, SIZE * sizeof(int));
cudaMalloc((void**)&d_c, SIZE * sizeof(int));
for (int i = 1; i<SIZE; i++)
{
a[i] = i;
b[i] = 0;
}
cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice);
/*blocks = SIZE / block_size;
if (SIZE% block_size != 0)
blocks++; */
dim3 blocksize(256); // create 1D threadblock
dim3 gridsize(N / blocksize.x); //create 1D grid
vectoreAdd << < gridsize, blocksize >> >(d_a, d_b, d_c);
//cudaThreadSynchronize();
cudaMemcpy(b, d_b, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(summation, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
for (int m = 0; m < SIZE; m++)
{
if (b[m] != 0)
{
printf("\n prime no is:%d", b[m]);
count = count + 1;
}
}
printf("\n\n Total prime no. are: %d", count);
/* for (int j = 1; j<SIZE; j++)
{
sum = sum + b[j];
}*/
printf("\n \nsum of all prime no upto %d is:%d", SIZE, summation);
clock_t toc = clock();
printf("\n\nElapsed: %f seconds\n", (double)(toc - tic) / CLOCKS_PER_SEC);
free(a); free(b); free(summation);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
getchar(); return 0;
}
There are lots of mistakes in your code :
cudaMalloc((void**)&d_a, SIZE * sizeof(int));
should be :
cudaMalloc((void**)&d_a, N * sizeof(int)); //OR
cudaMalloc((void**)&d_a, size);
as you already calculated but didnt passed it. same in case of malloc() //Host code

The efficiency and performance of ILP for the NVIDIA Kepler architecture

Quoting the "Kepler Tuning Guide" provided by NVIDIA:
Also note that Kepler GPUs can utilize ILP in place of
thread/warp-level parallelism (TLP) more readily than Fermi GPUs can.
In my opinion, the following code snippet
a = .....;
a2 = f(a);
a3 = g(a2);
can be improved as follows
a = ...;
b = ....;
a2 = f(a);
b2 = f(b);
a3 = g(a2);
b3 = g(b2);
So in my projects, I have a section of code as follows (example 1)
if(x < src.cols && y < src.rows)
{
if(!mask(y,x))
{
src.ptr(y)[x] = make_short4(0,0,0,0);
}
}
and I rewrite it as follows (example2)
if(x < src.cols && y < src.rows)
{
if(!mask(y,x))
{
short4 t;
t.x = 0;
t.y = 0;
t.z = 0;
t.w = 0;
src.ptr(y)[x].x = t.x;
src.ptr(y)[x].y = t.y;
src.ptr(y)[x].z = t.z;
src.ptr(y)[x].w = t.w;
}
}
In the Kepler architecture, the example2 will be more efficient and exhibit better performance than example1, is that right?
A good explanation on Instruction Level Parallelism (ILP) can be found at CUDA Performance: Maximizing Instruction-Level Parallelism.
It has been pointed out by Robert Crovella and talonmies, and it has been recognized by yourself, that your example above does not reach ILP.
Concerning how implementing ILP, I'm showing below the classical example, translated from the PyCUDA code at numbapro-examples, which I have tested for a Fermi and for a Kepler GPU. Please, notice that for the latter case I have not observed relevant speedups.
THE CODE
#include <stdio.h>
#include <time.h>
#define BLOCKSIZE 64
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){
return ((a % b) != 0) ? (a / b + 1) : (a / b);
}
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/************************************/
/* NO INSTRUCTION LEVEL PARALLELISM */
/************************************/
__global__ void ILP0(float* d_a, float* d_b, float* d_c) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
d_c[i] = d_a[i] + d_b[i];
}
/************************************/
/* INSTRUCTION LEVEL PARALLELISM X2 */
/************************************/
__global__ void ILP2(float* d_a, float* d_b, float* d_c) {
// --- Loading the data
int i = threadIdx.x + blockIdx.x * blockDim.x;
float ai = d_a[i];
float bi = d_b[i];
int stride = gridDim.x * blockDim.x;
int j = i + stride;
float aj = d_a[j];
float bj = d_b[j];
// --- Computing
float ci = ai + bi;
float cj = aj + bj;
// --- Writing the data
d_c[i] = ci;
d_c[j] = cj;
}
/************************************/
/* INSTRUCTION LEVEL PARALLELISM X4 */
/************************************/
__global__ void ILP4(float* d_a, float* d_b, float* d_c) {
// --- Loading the data
int i = threadIdx.x + blockIdx.x * blockDim.x;
float ai = d_a[i];
float bi = d_b[i];
int stride = gridDim.x * blockDim.x;
int j = i + stride;
float aj = d_a[j];
float bj = d_b[j];
int k = j + stride;
float ak = d_a[k];
float bk = d_b[k];
int l = k + stride;
float al = d_a[l];
float bl = d_b[l];
// --- Computing
float ci = ai + bi;
float cj = aj + bj;
float ck = ak + bk;
float cl = al + bl;
// --- Writing the data
d_c[i] = ci;
d_c[j] = cj;
d_c[k] = ck;
d_c[l] = cl;
}
/************************************/
/* INSTRUCTION LEVEL PARALLELISM X8 */
/************************************/
__global__ void ILP8(float* d_a, float* d_b, float* d_c) {
// --- Loading the data
int i = threadIdx.x + blockIdx.x * blockDim.x;
float ai = d_a[i];
float bi = d_b[i];
int stride = gridDim.x * blockDim.x;
int j = i + stride;
float aj = d_a[j];
float bj = d_b[j];
int k = j + stride;
float ak = d_a[k];
float bk = d_b[k];
int l = k + stride;
float al = d_a[l];
float bl = d_b[l];
int m = l + stride;
float am = d_a[m];
float bm = d_b[m];
int n = m + stride;
float an = d_a[n];
float bn = d_b[n];
int p = n + stride;
float ap = d_a[p];
float bp = d_b[p];
int q = p + stride;
float aq = d_a[q];
float bq = d_b[q];
// --- Computing
float ci = ai + bi;
float cj = aj + bj;
float ck = ak + bk;
float cl = al + bl;
float cm = am + bm;
float cn = an + bn;
float cp = ap + bp;
float cq = aq + bq;
// --- Writing the data
d_c[i] = ci;
d_c[j] = cj;
d_c[k] = ck;
d_c[l] = cl;
d_c[m] = cm;
d_c[n] = cn;
d_c[p] = cp;
d_c[q] = cq;
}
/********/
/* MAIN */
/********/
void main() {
float timing;
cudaEvent_t start, stop;
const int N = 65536*4; // --- ASSUMPTION: N can be divided by BLOCKSIZE
float* a = (float*)malloc(N*sizeof(float));
float* b = (float*)malloc(N*sizeof(float));
float* c = (float*)malloc(N*sizeof(float));
float* c_ref = (float*)malloc(N*sizeof(float));
srand(time(NULL));
for (int i=0; i<N; i++) {
a[i] = rand() / RAND_MAX;
b[i] = rand() / RAND_MAX;
c_ref[i] = a[i] + b[i];
}
float* d_a; gpuErrchk(cudaMalloc((void**)&d_a,N*sizeof(float)));
float* d_b; gpuErrchk(cudaMalloc((void**)&d_b,N*sizeof(float)));
float* d_c0; gpuErrchk(cudaMalloc((void**)&d_c0,N*sizeof(float)));
float* d_c2; gpuErrchk(cudaMalloc((void**)&d_c2,N*sizeof(float)));
float* d_c4; gpuErrchk(cudaMalloc((void**)&d_c4,N*sizeof(float)));
float* d_c8; gpuErrchk(cudaMalloc((void**)&d_c8,N*sizeof(float)));
gpuErrchk(cudaMemcpy(d_a, a, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_b, b, N*sizeof(float), cudaMemcpyHostToDevice));
/******************/
/* ILP0 TEST CASE */
/******************/
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
ILP0<<<iDivUp(N,BLOCKSIZE),BLOCKSIZE>>>(d_a, d_b, d_c0);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP0: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c0, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("Error!\n");
return;
}
printf("Test passed!\n");
/******************/
/* ILP2 TEST CASE */
/******************/
cudaEventRecord(start, 0);
ILP2<<<(N/2)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c2);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP2: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c2, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("Error!\n");
return;
}
printf("Test passed!\n");
/******************/
/* ILP4 TEST CASE */
/******************/
cudaEventRecord(start, 0);
ILP4<<<(N/4)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c4);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP4: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c4, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("Error!\n");
return;
}
printf("Test passed!\n");
/******************/
/* ILP8 TEST CASE */
/******************/
cudaEventRecord(start, 0);
ILP8<<<(N/8)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c8);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP8: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c8, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("%f %f\n",c[i],c_ref[i]);
printf("Error!\n");
return;
}
printf("Test passed!\n");
}
PERFORMANCE
Card Kernel Time [ms] Speedup
GeForce GT540M ILP0 4.609 1
" ILP2 2.666 1.72
" ILP4 1.675 2.76
" ILP8 1.477 3.12
Kepler K20c ILP0 0.045
" ILP2 0.043
" ILP4 0.043
" ILP8 0.042

Instructions Per Count (IPC) and Instruction Level Parallelism (ILP) in CUDA

I observe IPC drops as ILP goes up for 32-bit int operations when trying to speed up my cryptographic kernel. The kernel consists of fairly unrolled loops of long sequence of ADD and XOR operations, which should have a throughput of 160 ops per 192 cores per cycle on Kepler (GTX Titan/780).
IPC for my kernel hits the upper bound of 3.28. Using ILP even drops IPC. Apparently ILP fails to help achieve my goal -- fully utilize the pipeline, so I wrote some little experiments. I put the code for ILP 4 at the end.
Profiler Measurements
Results are measured on GTX Titan.
cubin outputs are examined to make sure no instructions are eliminated during optimization.
Executed IPC is almost the same as issued IPC, so I just list one of them.
ADD instructions (XORs have identical behavior)
| ILP 1 | ILP 2 | ILP 4 | ILP 8
--------------------------------------------------
IPC | 4.00 | 3.32 | 2.72 | 3.44
--------------------------------------------------
Issue Slot | 99.17% | 59.34% | 48.61% | 61.71%
Utilization | | | |
I expect ILP 2, 4 and 8 would give better performance, but not.
Recall the integer throughput is 160. The 4 warp scheduler per SM should dual issue up to 5 instructions per cycle, so that IPC should go up towards 5. How can I explain what I observed? Why is the issue slot 99% utilized when IPC = 4?
Float / Int ADD instruction mix
If I modify the code for ILP 4 to do two int ADDs and two float ADDs:
IPC: 5.1
Issue slot utilization: 99.12%
Strangely enough, it seems that the warp scheduler does a better job to issue floating operations.
Discussion
Available literature suggests using ILP help reach the peak performance for floating point operations. Why doesn't ILP apply to integers? How can I do this for integer operations?
My kernel theoretically should do 2.25 integer operations per candidate. This is consistent with what I observed in cuobjdump. There are 2^48 candidates, so the minimun runtime on GTX Titan should be 2.25 * 2^48 / (2688 * 160/192) / 876 MHz = 322.75s. Is this estimation reasonable?
The measured performance for my kernel is 523s. This does imply that integer throughput is only about 160 * 3.28 (measure IPC) / 5 (max IPC).
ILP test code
__device__ int x[10];
__global__ void test(int flag = 0)
{
int a = x[0], b = x[1], c = x[2], d = x[3];
int _a = x[4], _b = x[5], _c = x[6], _d = x[7];
#pragma unroll 128
for (int i = 0; i < 51200; ++i)
{
asm volatile("add.u32 %0, %0, %1;": "+r"(a): "r"(_a));
asm volatile("add.u32 %0, %0, %1;": "+r"(b): "r"(_b));
asm volatile("add.u32 %0, %0, %1;": "+r"(c): "r"(_c));
asm volatile("add.u32 %0, %0, %1;": "+r"(d): "r"(_d));
}
int v = a + b + c + d;
if (flag * v == 1)
x[0] = v;
}
Code fragment for 4 candidates
Each candidate takes 9 / 4 = 2.25 ops. Cuobjdump also verifies this.
d ^= d2(1, 3); // d2 is located in constant memory
s ^= d;
t ^= d2(1, 16);
u ^= d2(1, 17);
v ^= some_const;
flag_s = min(flag_s, s); // int min has throughput of 160
flag_t = flag_t || (s == t); // setp.or should be the same
flag_u = flag_u || (s == u);
flag_v = flag_v || (s == v);
I'm providing an answer to remove this question from the unanswered list.
I do not observe a change in executed Instructions Per Count (IPC) with Instruction Level Parallelism. Overall, it is difficult to argue the reason for the effect observed by the OP without knowing any further information but that provided by the OP himself (f.i., the launch configuration).
In the code below, I'm considering an example using floats, although I have tested the same code with ints without changing the conceptual results. The code implements cyclical Multiply Add (MAD) operations with ILP=1, ILP=2 and ILP=4.
The executed IPC has been the following
ILP IPC FLOPs
1 3.924 67108864
2 4.323 67108864
4 4.016 67108864
for N=8192. The code has been compiled with CUDA 8.0 and run on an NVIDIA GT920M. As it can be seen, IPC keeps almost constant for the differently considered values of ILP. The Floating Point Operations (FLOPs) as estimated by the code assuming 2 FLOPs per MAD coincides with that measured by the Visual Profiler.
THE CODE
#include<stdio.h>
#define N_ITERATIONS 8192
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define BLOCKSIZE 512
//#define DEBUG
/********************************************************/
/* KERNEL0 - NO INSTRUCTION LEVEL PARALLELISM (ILP = 0) */
/********************************************************/
__global__ void kernel0(float * __restrict__ d_a, const float * __restrict__ d_b, const float * __restrict__ d_c, const int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
float a = d_a[tid];
float b = d_b[tid];
float c = d_c[tid];
for (unsigned int i = 0; i < N_ITERATIONS; i++) {
a = a * b + c;
}
d_a[tid] = a;
}
}
/*****************************************************/
/* KERNEL1 - INSTRUCTION LEVEL PARALLELISM (ILP = 2) */
/*****************************************************/
__global__ void kernel1(float * __restrict__ d_a, const float * __restrict__ d_b, const float * __restrict__ d_c, const int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N / 2) {
float a1 = d_a[tid];
float b1 = d_b[tid];
float c1 = d_c[tid];
float a2 = d_a[tid + N / 2];
float b2 = d_b[tid + N / 2];
float c2 = d_c[tid + N / 2];
for (unsigned int i = 0; i < N_ITERATIONS; i++) {
a1 = a1 * b1 + c1;
a2 = a2 * b2 + c2;
}
d_a[tid] = a1;
d_a[tid + N / 2] = a2;
}
}
/*****************************************************/
/* KERNEL2 - INSTRUCTION LEVEL PARALLELISM (ILP = 4) */
/*****************************************************/
__global__ void kernel2(float * __restrict__ d_a, const float * __restrict__ d_b, const float * __restrict__ d_c, const int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N / 4) {
float a1 = d_a[tid];
float b1 = d_b[tid];
float c1 = d_c[tid];
float a2 = d_a[tid + N / 4];
float b2 = d_b[tid + N / 4];
float c2 = d_c[tid + N / 4];
float a3 = d_a[tid + N / 2];
float b3 = d_b[tid + N / 2];
float c3 = d_c[tid + N / 2];
float a4 = d_a[tid + 3 * N / 4];
float b4 = d_b[tid + 3 * N / 4];
float c4 = d_c[tid + 3 * N / 4];
for (unsigned int i = 0; i < N_ITERATIONS; i++) {
a1 = a1 * b1 + c1;
a2 = a2 * b2 + c2;
a3 = a3 * b3 + c3;
a4 = a4 * b4 + c4;
}
d_a[tid] = a1;
d_a[tid + N / 4] = a2;
d_a[tid + N / 2] = a3;
d_a[tid + 3 * N / 4] = a4;
}
}
/********/
/* MAIN */
/********/
int main() {
//const int N = 8192 * 64;
const int N = 8192;
//const int N = 1024;
TimingGPU timerGPU;
float *h_a = (float*)malloc(N*sizeof(float));
float *h_a_result_host = (float*)malloc(N*sizeof(float));
float *h_a_result_device = (float*)malloc(N*sizeof(float));
float *h_b = (float*)malloc(N*sizeof(float));
float *h_c = (float*)malloc(N*sizeof(float));
for (int i = 0; i<N; i++) {
h_a[i] = 2.;
h_b[i] = 1.;
h_c[i] = 2.;
h_a_result_host[i] = h_a[i];
for (unsigned int k = 0; k < N_ITERATIONS; k++) {
h_a_result_host[i] = h_a_result_host[i] * h_b[i] + h_c[i];
}
}
float *d_a; gpuErrchk(cudaMalloc((void**)&d_a, N*sizeof(float)));
float *d_b; gpuErrchk(cudaMalloc((void**)&d_b, N*sizeof(float)));
float *d_c; gpuErrchk(cudaMalloc((void**)&d_c, N*sizeof(float)));
gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_b, h_b, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_c, h_c, N*sizeof(float), cudaMemcpyHostToDevice));
/***********/
/* KERNEL0 */
/***********/
timerGPU.StartCounter();
kernel0 << <iDivUp(N, BLOCKSIZE), BLOCKSIZE >> >(d_a, d_b, d_c, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
// --- Remember: timing is in ms
printf("Number of operations = %f; GFlops = %f\n", (float)N*(float)N_ITERATIONS, (1.e-6)*((float)N*(float)N_ITERATIONS) / timerGPU.GetCounter());
gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(float), cudaMemcpyDeviceToHost));
for (int i = 0; i<N; i++) if (h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %f; Device = %f\n", i, h_a_result_host[i], h_a_result_device[i]); return 1; }
/***********/
/* KERNEL1 */
/***********/
gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice));
timerGPU.StartCounter();
kernel1 << <iDivUp(N / 2, BLOCKSIZE), BLOCKSIZE >> >(d_a, d_b, d_c, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
// --- Remember: timing is in ms
printf("Number of operations = %f; GFlops = %f\n", (float)N*(float)N_ITERATIONS, (1.e-6)*((float)N*(float)N_ITERATIONS) / timerGPU.GetCounter());
gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(float), cudaMemcpyDeviceToHost));
for (int i = 0; i<N; i++) if (h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %f; Device = %f\n", i, h_a_result_host[i], h_a_result_device[i]); return 1; }
/***********/
/* KERNEL2 */
/***********/
gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice));
timerGPU.StartCounter();
kernel2 << <iDivUp(N / 4, BLOCKSIZE), BLOCKSIZE >> >(d_a, d_b, d_c, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
// --- Remember: timing is in ms
printf("Number of operations = %f; GFlops = %f\n", (float)N*(float)N_ITERATIONS, (1.e-6)*((float)N*(float)N_ITERATIONS) / timerGPU.GetCounter());
gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(float), cudaMemcpyDeviceToHost));
for (int i = 0; i<N; i++) if (h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %f; Device = %f\n", i, h_a_result_host[i], h_a_result_device[i]); return 1; }
cudaDeviceReset();
return 0;
}

Matrix Multiplication giving wrong output [duplicate]

This question already has an answer here:
Unable to execute device kernel in CUDA
(1 answer)
Closed 7 years ago.
What I am attempting to do is Multiply Matrix A & Matrix B and then from the product matrix I get the index of the maximum value per column. But unfortunately, only the first 128*128 values of the matrix multiplication are correct while others are just garbage. I do not quite understand how this works. I request you to kindly guide me with this ..
#include<stdio.h>
#include "cuda.h"
#include<stdlib.h>
#define blockD 32
const int wA = 128;
const int hA = 4096;
const int wB = 4096;
const int hB = wA;
main(void){
void MatrixMultiplication(float *, float *, float *, float *);
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *M, *N, *P, *C;
// allocate memory on the CPU
M = (float*)malloc(size_A);
N = (float*)malloc(size_B);
P = (float*)malloc(size_max);
C = (float*)malloc(size_C);
// initialize the matrices
for (int y=0; y < hA; y++) {
for (int x=0; x < wA; x++){
M[y*wA + x] = 32; //x + y*wA;
}
}
for (int y=0; y<hB; y++) {
for (int x=0; x<wB; x++){
N[y*wB + x] = 21; //x + y*wB;
}
}
MatrixMultiplication(M, N, P, C);
//Write
FILE *f1;
int i,j;
f1 = fopen("C.txt","w");
for(i = hA - 2 ; i < hA; i ++){
for(j = 0; j < wB; j++){
fprintf(f1,"%d\t",int(C[i*wB + j]));
}
fprintf(f1,"\n");
}
fclose(f1);
// free the memory allocated on the CPU
free( M );
free( N );
free( P );
free( C );
cudaDeviceReset();
return 0;
}
__device__ void MaxFunction(float* Pd, float* max)
{
int x = (threadIdx.x + blockIdx.x * blockDim.x);
int y = (threadIdx.y + blockIdx.y * blockDim.y);
int k = 0;
int temp = 0; int temp_idx = 0;
for (k = 0; k < wB; ++k) {
if(Pd[x*wB + k] > temp){
temp = Pd[x*wB + k];
temp_idx = x*wB + k;
}
}
max[y*2 + 0] = temp;
max[y*2 + 1] = temp_idx;
}
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, float* max)
{
// declare cache in the shared memory
__shared__ float Mds[blockD][blockD];
__shared__ float Nds[blockD][blockD];
float Pvalue = 0;
// Loop over the Md and Nd block dimension required to compute the Pd element
for (int m = (wA * blockD * blockIdx.y), n = (blockD * blockIdx.x);
m < ((wA * blockD * blockIdx.y)+wA-1);
m += blockD, n += (blockD*hB)){
// collaboratively loading of Md and Nd blocks into shared memory
Mds[threadIdx.y][threadIdx.x] = Md[m + wA * threadIdx.y + threadIdx.x];
Nds[threadIdx.y][threadIdx.x] = Nd[n + wA * threadIdx.y + threadIdx.x];
__syncthreads();
// keep track of the running sum
for (int k = 0; k < blockD; k++)
Pvalue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];
__syncthreads();
}
// write back to the global memory
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
__syncthreads();
MaxFunction(Pd, max);
}
void MatrixMultiplication(float *M, float *N, float *P, float *C) {
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *Md, *Nd, *Pd, *max;
// allocate memory on the GPU
cudaMalloc((void**)&Md, size_A);
cudaMalloc((void**)&Nd, size_B);
cudaMalloc((void**)&Pd, size_C);
cudaMalloc((void**)&max, size_max);
// transfer M and N to device memory
cudaMemcpy(Md, M, size_A, cudaMemcpyHostToDevice);
cudaMemcpy(Nd, N, size_B, cudaMemcpyHostToDevice);
// kernel invocation code
dim3 dimBlock(blockD, blockD);
dim3 dimGrid(wA/blockD, hB/blockD);
//Execute Kernel
MatrixMulKernel<<<dimGrid, dimBlock>>>( Md, Nd, Pd, max);
// transfer P from device
cudaMemcpy(P, max, size_max, cudaMemcpyDeviceToHost);
cudaMemcpy(C, Pd, size_C, cudaMemcpyDeviceToHost);
// free the memory allocated on the GPU
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
cudaFree(max);
}
In your code you seem to have more than one problem. One of the problems is, in place of this:
dim3 dimGrid(wA/blockD, hB/blockD);
You should have this:
dim3 dimGrid(wB/blockD, hA/blockD);
Ultimately you need one thread in your grid for each output point. Your formulation was giving you a grid of 4 blocks by 4 blocks, whereas you need a grid of 128 blocks by 128 blocks.
The other problem I found with your code was in these lines in the kernel:
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
They are not indexing properly through the output array. Rather than try to sort it out using your scheme, I used this instead:
Pd[(threadIdx.x + (blockIdx.x * blockDim.x)) + ((threadIdx.y + (blockIdx.y * blockDim.y))*(gridDim.x*blockDim.x))] = Pvalue;
When I made the above two changes to your code, I got what I believe are correct results throughout the array. And it took about 32 seconds on my machine to run it. (Note that I haven't tried fixing your original max-finding code -- see below for a better approach.)
Based on your previous question, you seemed to be concerned about speed. If you want to do fast matrix multiply, you should use cublas. The following code shows how to use cublas to multiply two ordinary C-style matrices (they don't have to be square). I've also included a column-max finding kernel that will be fast when the number of columns is large (say, over 500 or so. You have 4096 columns in your example). For small numbers of columns, there may be quicker ways to perform this function, but small numbers of columns also suggests that the overall problem size may be small and so speed (of this piece of code) will not really be an issue.
Here's the code:
#include <stdio.h>
#include <cublas_v2.h>
#define VERBOSE 1
#define nTPB 64
#define ROW_A 4
#define COL_A 4
#define ROW_B COL_A
#define COL_B 4
#define ROW_C ROW_A
#define COL_C COL_B
#define SIZ_A (ROW_A*COL_A)
#define SIZ_B (ROW_B*COL_B)
#define SIZ_C (ROW_C*COL_C)
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void col_max(float *mat, float *max, unsigned int *midx, unsigned int rows, unsigned int cols){
int idx = threadIdx.x + blockDim.x*blockIdx.x;
if (idx < cols){
float tempmax = mat[idx];
unsigned int tempmidx = 0;
for (int i = 1; i< rows; i++)
if (mat[idx + (i*cols)] > tempmax){
tempmax = mat[idx + (i*cols)];
tempmidx = i;}
max[idx] = tempmax;
midx[idx] = tempmidx;
}
}
int main(){
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C, *h_max, *d_max;
unsigned int *h_idx, *d_idx;
h_A = (float *)malloc(SIZ_A*sizeof(float));
if (h_A==0) {printf("malloc fail\n"); return -1;}
h_B = (float *)malloc(SIZ_B*sizeof(float));
if (h_B==0) {printf("malloc fail\n"); return -1;}
h_C = (float *)malloc(SIZ_C*sizeof(float));
if (h_C==0) {printf("malloc fail\n"); return -1;}
h_max = (float *)malloc(COL_C*sizeof(float));
if (h_max==0) {printf("malloc fail\n"); return -1;}
h_idx = (unsigned int*)malloc(COL_C*sizeof(unsigned int));
if (h_idx==0) {printf("malloc fail\n"); return -1;}
cudaMalloc((void **)&d_A, SIZ_A*sizeof(float));
cudaMalloc((void **)&d_B, SIZ_B*sizeof(float));
cudaMalloc((void **)&d_C, SIZ_C*sizeof(float));
cudaMalloc((void **)&d_max, COL_C*sizeof(float));
cudaMalloc((void **)&d_idx, COL_C*sizeof(unsigned int));
cudaCheckErrors("cuda malloc fail");
// initialize data
for (int i=0; i< SIZ_A; i++) h_A[i] = (float)(i+1);
for (int i=0; i< SIZ_B; i++) h_B[i] = (float)(i+2);
cudaMemcpy(d_A, h_A, SIZ_A*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, SIZ_B*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cuda memcpy 1 fail");
const float alpha = 1.0f;
const float beta = 0.0f;
cublasHandle_t handle;
cublasCheckErrors(cublasCreate(&handle));
// C = A*B
// due to cublas expecting column-major storage, parameters
// are scrambled
cublasCheckErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, COL_B, ROW_A, COL_A, &alpha, d_B, COL_B, d_A, COL_A, &beta, d_C, COL_C));
cudaMemcpy(h_C, d_C, SIZ_C*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cuda memcpy 2 fail");
col_max<<<(COL_C + nTPB - 1)/nTPB, nTPB>>>(d_C, d_max, d_idx, ROW_C, COL_C);
cudaCheckErrors("kernel launch fail");
cudaMemcpy(h_max, d_max, COL_C*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(h_idx, d_idx, COL_C*sizeof(unsigned int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cuda memcpy 3 fail/kernel fail");
if (VERBOSE){
printf("A: \n");
for (int i=0; i< ROW_A; i++){
for (int j=0; j< COL_A; j++)
printf("%7.5G", h_A[j+(i*COL_A)]);
printf("\n");}
printf("B: \n");
for (int i=0; i< ROW_B; i++){
for (int j=0; j< COL_B; j++)
printf("%7.5G", h_B[j+(i*COL_B)]);
printf("\n");}
printf("C = A*B: \n");
for (int i=0; i< ROW_C; i++){
for (int j=0; j< COL_C; j++)
printf("%7.5G", h_C[j+(i*COL_C)]);
printf("\n");}
printf("COLUMN MAX:\n");
for (int i=0; i< COL_C; i++)
printf("%7.5G", h_max[i]);
printf("\nCOLUMN MAX IDX:\n");
for (int i=0; i< COL_C; i++)
printf("%7d", h_idx[i]);
}
printf("\n finished!\n");
return 0;
}
Here's what I used to compile:
$ nvcc -arch=sm_20 -O3 -o t221 t221.cu -lcublas
And here's the sample output:
$ cuda-memcheck ./t221
========= CUDA-MEMCHECK
A:
1 2 3 4
5 6 7 8
9 10 11 12
13 14 15 16
B:
2 3 4 5
6 7 8 9
10 11 12 13
14 15 16 17
C = A*B:
100 110 120 130
228 254 280 306
356 398 440 482
484 542 600 658
COLUMN MAX:
484 542 600 658
COLUMN MAX IDX:
3 3 3 3
finished!
========= ERROR SUMMARY: 0 errors
$
When I extended my code to handle the same sizes you indicated, (A = 4096x128, B=128x4096) it took about 1 second on my machine. So it's much faster than your code. However, when I take your code and comment out your call to MaxFunction in the kernel, it also only takes about 1 second to compute the matrix multiply result. So if you wanted to keep your matrix multiply code (i.e. not use cublas) you could break the code into 2 kernels, and use your multiply routine in the first kernel with my max-finding routine (col_max) in the second kernel, and also probably get a pretty fast result.
As #talonmies indicated, if you are running on a windows machine, be sure you are aware of the ramifications of windows TDR. (search that in the upper right corner search box if needed)

Difference between program using constant memory and global memory

I have two programs. the only difference is that one uses constant memory to store input while the other uses global memory.I want to know why the global memory one is faster than the constant memory one? They both compute dot product btw 2 matrices
#include<cuda_runtime.h>
#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
#define intMin(a,b) ((a<b)?a:b)
//Threads per block
#define TPB 128
//blocks per grid
#define BPG intMin(128, ((n+TPB-1)/TPB))
const int n = 4;
__constant__ float deva[n],devb[n];
__global__ void addVal( float *c){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
//Using shared memory to temporary store results
__shared__ float cache[TPB];
float temp = 0;
while(tid < n){
temp += deva[tid] * devb[tid];
tid += gridDim.x * blockDim.x;
}
cache[threadIdx.x] = temp;
__syncthreads();
int i = blockDim.x/2;
while( i !=0){
if(threadIdx.x < i){
cache[threadIdx.x] = cache[threadIdx.x] +cache[threadIdx.x + i] ;
}
__syncthreads();
i = i/2;
}
if(threadIdx.x == 1){
c[blockIdx.x ] = cache[0];
}
}
int main(){
float a[n] , b[n] , c[BPG];
//float *deva, *devb, *devc;
float *devc;
int i;
//Filling with random values to test
for( i =0; i< n; i++){
a[i] = i;
b[i] = i*2;
}
//cudaMalloc((void**)&deva, n * sizeof(float));
//cudaMalloc((void**)&devb, n * sizeof(float));
cudaMalloc((void**)&devc, BPG * sizeof(float));
//cudaMemcpy(deva, a, n *sizeof(float), cudaMemcpyHostToDevice);
//cudaMemcpy(devb, b, n *sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(deva, a, n * sizeof(float));
cudaMemcpyToSymbol(devb, b, n * sizeof(float));
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//Call function to do dot product
addVal<<<BPG, TPB>>>( devc);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float time;
cudaEventElapsedTime(&time,start, stop);
printf("The elapsed time is: %f\n", time);
//copy result back
cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost);
float sum =0 ;
for ( i = 0 ; i< BPG; i++){
sum+=c[i];
}
//display answer
printf("%f\n",sum);
getchar();
return 0;
}
Below is the global memory version.
#include<cuda_runtime.h>
#include<cuda.h>
#include<stdio.h>
#include<stdlib.h>
#define intMin(a,b) ((a<b)?a:b)
//Threads per block
#define TPB 128
//blocks per grid
#define BPG intMin(128, ((n+TPB-1)/TPB))
const int n = 4;
__global__ void addVal(float *a, float *b, float *c){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
//Using shared memory to temporary store results
__shared__ float cache[TPB];
float temp = 0;
while(tid < n){
temp += a[tid] * b[tid];
tid += gridDim.x * blockDim.x;
}
cache[threadIdx.x] = temp;
__syncthreads();
int i = blockDim.x/2;
while( i !=0){
if(threadIdx.x < i){
cache[threadIdx.x] = cache[threadIdx.x] +cache[threadIdx.x + i] ;
}
__syncthreads();
i = i/2;
}
if(threadIdx.x == 1){
c[blockIdx.x ] = cache[0];
}
}
int main(){
float a[n] , b[n] , c[BPG];
float *deva, *devb, *devc;
int i;
//Filling with random values to test
for( i =0; i< n; i++){
a[i] = i;
b[i] = i*2;
}
printf("Not using constant memory\n");
cudaMalloc((void**)&deva, n * sizeof(float));
cudaMalloc((void**)&devb, n * sizeof(float));
cudaMalloc((void**)&devc, BPG * sizeof(float));
cudaMemcpy(deva, a, n *sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(devb, b, n *sizeof(float), cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//Call function to do dot product
addVal<<<BPG, TPB>>>(deva, devb, devc);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float time;
cudaEventElapsedTime(&time,start, stop);
printf("The elapsed time is: %f\n", time);
//copy result back
cudaMemcpy(c, devc, BPG * sizeof(float), cudaMemcpyDeviceToHost);
float sum =0 ;
for ( i = 0 ; i< BPG; i++){
sum+=c[i];
}
//display answer
printf("%f\n",sum);
getchar();
return 0;
}
You are not getting advantage of the constant memory.
A single read from constant memory can be broadcast to a half-warp (not your case as every thread load from its own tid).
Constant memory is cached (not used in your case as you only read once from each position in the constant memory array).
As each thread in a half-warp does a single read to different data, the 16 different reads get serialized, taking 16 times the amount of time to place the request.
If they are reading from global memory, the request are done at the same time, coalesced. That's why your global memory example is better than the constant memory.
Of course, this conclusion can vary with devices of compute capability 2.x with a L1 and L2 cache.
Regards!