Surface memory takes twice as long as global memory - CUDA

I am working on optimizing a CUDA program, and I started with a matrix multiplication program. The threading scheme I have used for parallelization is Blocksize(1, 1), Gridsize(N, N). I am using surface memory as a memory optimization (shared memory is not possible with this threading scheme). When I compare the time before and after this optimization, I find that execution takes twice as long after switching to surface memory (I have tried different threading schemes, but the problem remains the same). From what I have read so far, global memory is slower than surface memory, so using surface memory should take less time. Below is the matrix multiplication program using surface memory. Can somebody tell me what the problem is?
#include <stdio.h>
#include <cuda.h>
//#define N 3

surface<void, 2> a_surf;
surface<void, 2> b_surf;
surface<void, 2> c_surf;

void CUDA_SAFE_CALL(cudaError_t call, int line) {
    switch (call) {
    case cudaSuccess:
        break;
    default:
        printf("ERROR at line :%i.%d' ' %s\n",
               line, call, cudaGetErrorString(call));
        exit(-1);
        break;
    }
}

__global__ void mul(int N) {
    int a, b, c, temp;
    int i;
    unsigned int x = blockIdx.x * blockDim.x + (threadIdx.x);
    unsigned int y = blockIdx.y * blockDim.y + (threadIdx.y);
    if (x < N && y < N) {
        temp = 0;
        for (i = 0; i < N; i++) {
            surf2Dread(&a, a_surf, (x) * 4, i);
            surf2Dread(&b, b_surf, (i) * 4, y);
            temp += a * b;
        }
        c = temp;
        // Write to output surface
        surf2Dwrite(c, c_surf, x * 4, y);
    }
}

int main() {
    int N = 100;
    int a[N][N], b[N][N], c[N][N];
    int i, j;
    int temp;
    clock_t t1, t2;
    cudaArray *da, *db, *dc;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<int>();
    dim3 dimBlock(1, 1);
    dim3 dimGrid(N, N);
    temp = 0;
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            a[i][j] = ++temp;
    temp = 0;
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            b[i][j] = ++temp;
    CUDA_SAFE_CALL(cudaMallocArray(&da, &channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
    CUDA_SAFE_CALL(cudaMallocArray(&db, &channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
    CUDA_SAFE_CALL(cudaMallocArray(&dc, &channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
    int s = N * N * sizeof(int);
    CUDA_SAFE_CALL(cudaMemcpyToArray(da, 0, 0, a, s, cudaMemcpyHostToDevice), __LINE__);
    CUDA_SAFE_CALL(cudaMemcpyToArray(db, 0, 0, b, s, cudaMemcpyHostToDevice), __LINE__);
    CUDA_SAFE_CALL(cudaBindSurfaceToArray(a_surf, da), __LINE__);
    CUDA_SAFE_CALL(cudaBindSurfaceToArray(b_surf, db), __LINE__);
    CUDA_SAFE_CALL(cudaBindSurfaceToArray(c_surf, dc), __LINE__);
    t1 = clock();
    mul<<<dimGrid, dimBlock>>>(N);
    t2 = clock();
    CUDA_SAFE_CALL(cudaMemcpyFromArray(c, dc, 0, 0, s, cudaMemcpyDeviceToHost), __LINE__);
    double t3 = (double) t2 - (double) t1;
    t3 = t3 / CLOCKS_PER_SEC;
    printf("\n CUDA time :%lf", t3);
    CUDA_SAFE_CALL(cudaFreeArray(da), __LINE__);
    CUDA_SAFE_CALL(cudaFreeArray(db), __LINE__);
    CUDA_SAFE_CALL(cudaFreeArray(dc), __LINE__);
}

Optimizing for caches is not a trivial matter, so a trivialized generalization like this:
From what I have read so far, global memory is slower than surface memory, so using surface memory should take less time.
is simply too broad to be correct, in my opinion. It will frequently be true, but not always. The specifics matter, and proper programming practice matters, too.
Surface memory is nothing more than global memory with an intervening cache. But global memory (on all GPUs supported by current CUDA versions) already has support from L2 (and in some cases L1) cache(s).
The code you have proposed for test/comparison has a number of issues that I would point out:
Your timing methodology is incorrect. This:
t1 = clock();
mul<<<dimGrid, dimBlock>>>(N);
t2 = clock();
will time the duration of the kernel launch, not the duration of the kernel execution, so this is almost never the correct way to time things. We can fix it by putting a cudaDeviceSynchronize() call inside the timing region, to force completion of the kernel before the timing region is closed.
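In other words, the timing region should look like this (as it does in the full code further below):
t1 = clock();
mul<<<dimGrid, dimBlock>>>(N);
cudaDeviceSynchronize();  // force kernel completion before reading the clock again
t2 = clock();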
This is a particularly bad construct if you are interested in performance:
dim3 dimBlock(1, 1);
because 31 out of every 32 threads in each GPU warp will be inactive, you are leaving 31/32 of the GPU's performance unused. This has wide-ranging implications. I have no interest in studying the performance of such a scenario, and you shouldn't either (as it is not reflective of real-world performance of well-written codes), unless you are interested in microbenchmarking (rather than comparative benchmarking). So your code should be fixed to use at least 32, and ideally 256 or more, threads per block.
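For example, a 16x16 thread block with a grid sized to cover the matrix (this is what the code further below uses):
dim3 dimBlock(16, 16);  // 256 threads per block
dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x, (N + dimBlock.y - 1) / dimBlock.y);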
You've provided no "global memory" comparison case. So I shall provide one.
You've not stated other factors important for comparative benchmarking or performance analysis, such as the GPU and platform you are running on, or the compile command.
In my opinion, the problem size is too small. A 100x100 matrix multiply is at the low end of what could reasonably occupy the GPU or test its performance limits. So I shall make the problem size larger.
With respect to the problem size argument, this matters for the cache discussion. First of all, the surface cache tends to be a spatially-optimized cache, whereas the ordinary L1 and L2 caches are linearly (cache-line) optimized. For very large 2D problems, the surface cache might give better behavior than the L2; for very small problems, the difference will be less pronounced. Secondly, the surface cache is in addition to the L1 and L2 caches, so a good optimization strategy is to funnel some data through L1 and L2 and other data through surface, to maximize the available cache lines. In fact, since your input matrices are read-only, a further optimization might be to use textures rather than surfaces for those. From the contrary point of view, if the problem is so small that it fits entirely in the L2 cache, then the surface cache is not likely to give a significant improvement. Your original problem size involved 3 matrices of 100x100 int quantities, so about 40K bytes each, or 120K bytes total. This will fit in the L2 cache of most GPUs. By increasing the problem size (as we shall see, to about 12MB total) we can severely handicap the global-memory-only case.
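As a sketch only (the code below does not do this), the read-only texture path for an input matrix could look roughly like the following, using the legacy texture reference API to mirror the surface references already in use. Note that tex2D takes element coordinates rather than byte offsets:
texture<int, 2, cudaReadModeElementType> a_tex;  // read-only path for matrix A
...
CUDA_SAFE_CALL(cudaBindTextureToArray(a_tex, da), __LINE__);
...
// inside the kernel, replacing the surf2Dread of matrix A:
a = tex2D(a_tex, x, i);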
Here's a fully worked example, modified to address most of the above issues. When I run this code on my Quadro 5000 GPU with CUDA 7.5 on Fedora 20, I observe the surface case to be about 8x faster than the global memory case:
$ cat t1129.cu
#include <stdio.h>
#include <iostream>

typedef int mytype;
const int blk_dim=16;
#define my_N 1000
#define A_VAL 1
#define B_VAL 2

surface<void, 2> a_surf;
surface<void, 2> b_surf;
surface<void, 2> c_surf;

void CUDA_SAFE_CALL(cudaError_t call, int line) {
    switch (call) {
    case cudaSuccess:
        break;
    default:
        printf("ERROR at line :%i.%d' ' %s\n",
               line, call, cudaGetErrorString(call));
        exit(-1);
        break;
    }
}

#ifdef USE_GLOBAL
__global__ void mul(const mytype * __restrict__ d_a, const mytype * __restrict__ d_b, mytype * __restrict__ d_c, const int N)
#else
__global__ void mul(const int N)
#endif
{
    mytype a, b, c, temp;
    int i;
    unsigned int x = blockIdx.x * blockDim.x + (threadIdx.x);
    unsigned int y = blockIdx.y * blockDim.y + (threadIdx.y);
    if (x < N && y < N) {
        temp = 0;
        for (i = 0; i < N; i++) {
#ifdef USE_GLOBAL
            a = d_a[x*N+i];
            b = d_b[i*N+y];
#else
            surf2Dread(&a, a_surf, (x) * sizeof(mytype), i);
            surf2Dread(&b, b_surf, (i) * sizeof(mytype), y);
#endif
            temp += a * b;
        }
        c = temp;
#ifdef USE_GLOBAL
        d_c[x*N+y] = c;
#else
        // Write to output surface
        surf2Dwrite(c, c_surf, x * sizeof(mytype), y);
#endif
    }
}

int main() {
    const int N = my_N;
    mytype *a, *b, *c, *d_a, *d_b, *d_c;
    int i, j;
    clock_t t1, t2;
    cudaArray *da, *db, *dc;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<mytype>();
    dim3 dimBlock(blk_dim, blk_dim);
    dim3 dimGrid((N+dimBlock.x-1)/dimBlock.x, (N+dimBlock.y-1)/dimBlock.y);
    int s = N * N * sizeof(mytype);
    a = (mytype *)malloc(s);
    b = (mytype *)malloc(s);
    c = (mytype *)malloc(s);
    CUDA_SAFE_CALL(cudaMalloc(&d_a, s), __LINE__);
    CUDA_SAFE_CALL(cudaMalloc(&d_b, s), __LINE__);
    CUDA_SAFE_CALL(cudaMalloc(&d_c, s), __LINE__);
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            a[i*N+j] = A_VAL;
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            b[i*N+j] = B_VAL;
    CUDA_SAFE_CALL(cudaMallocArray(&da, &channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
    CUDA_SAFE_CALL(cudaMallocArray(&db, &channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
    CUDA_SAFE_CALL(cudaMallocArray(&dc, &channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
    CUDA_SAFE_CALL(cudaMemcpyToArray(da, 0, 0, a, s, cudaMemcpyHostToDevice), __LINE__);
    CUDA_SAFE_CALL(cudaMemcpyToArray(db, 0, 0, b, s, cudaMemcpyHostToDevice), __LINE__);
    CUDA_SAFE_CALL(cudaBindSurfaceToArray(a_surf, da), __LINE__);
    CUDA_SAFE_CALL(cudaBindSurfaceToArray(b_surf, db), __LINE__);
    CUDA_SAFE_CALL(cudaBindSurfaceToArray(c_surf, dc), __LINE__);
#ifdef USE_GLOBAL
    CUDA_SAFE_CALL(cudaMemcpy(d_a, a, s, cudaMemcpyHostToDevice), __LINE__);
    CUDA_SAFE_CALL(cudaMemcpy(d_b, b, s, cudaMemcpyHostToDevice), __LINE__);
#endif
    t1 = clock();
#ifdef USE_GLOBAL
    mul<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
#else
    mul<<<dimGrid, dimBlock>>>(N);
#endif
    cudaDeviceSynchronize();
    t2 = clock();
    CUDA_SAFE_CALL(cudaMemcpyFromArray(c, dc, 0, 0, s, cudaMemcpyDeviceToHost), __LINE__);
#ifdef USE_GLOBAL
    CUDA_SAFE_CALL(cudaMemcpy(c, d_c, s, cudaMemcpyDeviceToHost), __LINE__);
#endif
    double t3 = (double) t2 - (double) t1;
    t3 = t3 / CLOCKS_PER_SEC;
    printf("\n CUDA time :%lf\n", t3);
    for (i=0; i < N*N; i++)
        if(c[i] != A_VAL*B_VAL*N) {std::cout << "mismatch at: " << i << ", was: " << c[i] << " should be: " << A_VAL*B_VAL*N << std::endl; return 1;}
    CUDA_SAFE_CALL(cudaFreeArray(da), __LINE__);
    CUDA_SAFE_CALL(cudaFreeArray(db), __LINE__);
    CUDA_SAFE_CALL(cudaFreeArray(dc), __LINE__);
    std::cout << "Success!" << std::endl;
    return 0;
}
[bob#cluster1 misc]$ nvcc -O3 -o t1129 t1129.cu
[bob#cluster1 misc]$ ./t1129
CUDA time :0.028771
Success!
$ nvcc -O3 -DUSE_GLOBAL -o t1129 t1129.cu
$ ./t1129
CUDA time :0.243635
Success!
$
As a final note, there are many other optimizations we could talk about, which would probably shift the comparison one way or the other. But if you actually want to do fast matrix multiply operations, you should use CUBLAS. You should not write your own matrix multiply routines.
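For reference, a minimal sketch of the CUBLAS path for a float matrix multiply (link with -lcublas). CUBLAS expects column-major storage; d_A, d_B, and d_C here are illustrative device pointers to N x N float matrices, not variables from the code above:
#include <cublas_v2.h>

cublasHandle_t handle;
cublasCreate(&handle);
const float alpha = 1.0f, beta = 0.0f;
// C = alpha * A * B + beta * C, all N x N, column-major
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
            &alpha, d_A, N, d_B, N, &beta, d_C, N);
cublasDestroy(handle);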

Related

Performance difference due to indexing during matrix multiplication

I'm trying out the difference between a tiled and a naive matrix multiplication implementation in CUDA C++. I expect to see a performance gap between these variations because the tiled version repeatedly reuses data from shared memory. However, the speedup was only about 2x (naive ~12 ms and tiled ~6 ms). Here are the code snippets:
#include <iostream>
#include <assert.h>
using namespace std;
# define N 1024
# define THREADS 16
# define IDX(x, y, s) (x*s + y)
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
void init_values(int *a, int *b, int sz) {
for(int i=0; i<sz; i++) {
a[i] = rand()%513 - 256;
b[i] = rand()%513 - 256;
}
}
__global__
void matmul(int *a, int *b, int *c, int n) {
// perform parallel matmul
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int t = 0;
for(int i=0; i<n; i++) {
t += (a[IDX(x, i, n)] * b[IDX(i, y, n)]);
}
c[IDX(x, y, n)] = t;
}
void matmul_verify(int *a, int *b, int *c, int n) {
for(int i=0; i<n; i++) {
for(int j=0; j<n; j++) {
int t = 0;
for(int k=0; k<n; k++)
t += a[IDX(i, k, n)] * b[IDX(k, j, n)];
// cout << i << " " << j << " " << c[IDX(i, j, n)] << " " << t << endl;
assert(c[IDX(i, j, n)] == t);
}
}
}
int main()
{
int *a, *b, *c;
int *da, *db, *dc;
size_t sz = N * N * sizeof(int);
a = (int*)malloc(sz);
b = (int*)malloc(sz);
c = (int*)malloc(sz);
init_values(a, b, N*N);
gpuErrchk(cudaMalloc((void**)&da, sz));
gpuErrchk(cudaMalloc((void**)&db, sz));
gpuErrchk(cudaMalloc((void**)&dc, sz));
gpuErrchk(cudaMemcpy(da, a, sz, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(db, b, sz, cudaMemcpyHostToDevice));
// init grid size
dim3 grids(N/THREADS, N/THREADS);
dim3 blocks(THREADS, THREADS);
// time it
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
matmul<<<grids, blocks>>>(da, db, dc, N);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
cout << "Took " << milliseconds << " milliseconds.\n";
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(c, dc, sz, cudaMemcpyDeviceToHost));
matmul_verify(a, b, c, N);
cudaFree(da);
cudaFree(db);
cudaFree(dc);
free(a);
free(b);
free(c);
cudaEventDestroy(start);
cudaEventDestroy(stop);
return 0;
}
and for the tiled implementation, I change the kernel as
__global__
void matmul(int *a, int *b, int *c, int n) {
// perform parallel matmul
int ty = threadIdx.y, by = blockIdx.y;
int tx = threadIdx.x, bx = blockIdx.x;
int x = bx * blockDim.x + tx;
int y = by * blockDim.y + ty;
// block IDs tell us which block to solve for
// (bx, by) --> (bx: bx + tx, by:by + ty)
__shared__ int A[SHMEM_SIZE]; // SHMEM_SIZE (definition not shown) must be at least THREADS * THREADS
__shared__ int B[SHMEM_SIZE];
const int tile_size = THREADS;
// to get value of tile [tx, ty] in block [bx, by], we need blocks A[bx, *] and blocks B[*, by]
int res = 0;
for(int blk=0; blk < n; blk+=tile_size) {
// block index
A[IDX(tx, ty, tile_size)] = a[IDX(x, blk + ty, n)];
B[IDX(tx, ty, tile_size)] = b[IDX(blk + tx, y, n)];
__syncthreads();
for(int k=0; k<tile_size; k++) {
res += (A[IDX(tx, k, tile_size)] * B[IDX(k, ty, tile_size)]);
}
__syncthreads();
}
// for(int k=0; k<n; k++)
// res += a[IDX(x, k, n)] * b[IDX(k, y, n)];
c[IDX(x, y, n)] = res;
}
nothing else really changes. However, in the tiled implementation, if I simply change
int ty = threadIdx.x, by = blockIdx.x;
int tx = threadIdx.y, bx = blockIdx.y;
for the initialization of the thread and block indices, I get a runtime of about 1 ms (a 12x speedup). How is this happening? I read in the book "CUDA By Example" that the thread and block indices in 2 dimensions are just for programmer convenience and do not reflect any difference in performance. This seems to be false. Any clarification is really appreciated.
CUDA thread blocks are partitioned into warps of 32 threads. Ideally the neighboring lanes of a warp should always load neighboring elements from global memory. This is called coalescing and allows for maximum memory bandwidth. In hardware all the coalesced loads from a warp will be bundled into a minimal number of memory transactions.
Other factors that can deteriorate memory bandwidth are the size of the load (one can try to use the builtin vector types to get bigger loads for optimization, e.g. int2, int4, float2, etc.) and alignment.
The mapping from 3D threadIdx to warp lanes always takes the first dimension .x as the continuous dimension, i.e. a block of dimensions (32, 2, 1) will have one warp with threadIdx.y == 0 and one warp with threadIdx.y == 1 where the lanes of each warp correspond to threadIdx.x.
Therefore to allow for coalescing, you have to access memory as
A[ty * s + tx] // coalesced access
instead of
A[tx * s + ty] // strided access
to achieve optimal performance.
What is probably meant in the book you mentioned is that there shouldn't be a performance difference between launching blocks of dimensions (32, 2, 1) and blocks of dimensions (64, 1, 1) where you manually compute ty = threadIdx.x / 32 and tx = threadIdx.x % 32. These divisions probably happen internally anyway when the block is not flat in the first place.
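A minimal sketch of that flat-block equivalent (illustrative only, assuming a 32-wide tile):
// launched with dim3 block(64, 1, 1) instead of dim3 block(32, 2, 1)
int ty = threadIdx.x / 32;  // which row of the 32x2 tile this thread covers
int tx = threadIdx.x % 32;  // lane within the row; stays contiguous across the warp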

issues of cuBLAS performance on batched complex linear system solver

I am new to CUDA and cuBLAS, and recently I have been trying to use the batched cuBLAS API to solve multiple systems of linear equations. Here's my code:
The size of the matrix is N, and the number of matrices (batch size) is numOfMat.
#include <stdio.h>
#include <stdlib.h>
#include <cstdio>
#include <iostream>
#include <chrono>
#include <random>
#include <cuda.h>
#include <cusolverDn.h>
#include <cuda_runtime.h>
#include <cuComplex.h> // deal with complex numbers
#include <cuda_profiler_api.h>
using namespace std::chrono;
#define N 6
#define numOfMat 500000
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
int main() {
std::random_device device;
std::mt19937 generator(device());
std::uniform_real_distribution<double> distribution(1., 5.);
high_resolution_clock::time_point t1;
high_resolution_clock::time_point t2;
double duration = 0;
double duration_1 = 0;
// step 1: cuda solver initialization
cublasHandle_t cublas_handle;
cublasCreate_v2(&cublas_handle);
cublasStatus_t stat;
int* PivotArray;
int* infoArray;
cudaError_t cudaStatUnified1 = cudaSuccess;
cudaError_t cudaStatUnified2 = cudaSuccess;
const cuDoubleComplex alpha = make_cuDoubleComplex(1.0f, 0.0f);
cudaStatUnified1 = cudaMallocManaged(&PivotArray, N * numOfMat * sizeof(int));
cudaStatUnified2 = cudaMallocManaged(&infoArray, numOfMat * sizeof(int));
if ((cudaSuccess != cudaStatUnified1) || (cudaSuccess != cudaStatUnified2))
std::cout <<"unified memory allocated unsuccessful!"<<std::endl;
//ALLOCATE MEMORY - using unified memory
cuDoubleComplex** h_A;
cudaMallocManaged(&h_A, sizeof(cuDoubleComplex*) * numOfMat);
cudaMallocManaged(&(h_A[0]), sizeof(cuDoubleComplex)*numOfMat*N*N);
for (int nm = 1; nm < numOfMat; nm++)
h_A[nm] = h_A[nm-1]+ N * N;
cuDoubleComplex** h_b;
cudaMallocManaged(&h_b, sizeof(cuDoubleComplex*) * numOfMat);
cudaMallocManaged(&(h_b[0]), sizeof(cuDoubleComplex) * numOfMat * N);
for (int nm = 1; nm < numOfMat; nm++)
h_b[nm] = h_b[nm-1] + N;
// FILL MATRICES
for (int nm = 0; nm < numOfMat; nm++)
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
h_A[nm][j * N + i] = make_cuDoubleComplex(distribution(generator), distribution(generator));
// FILL COEFFICIENTS
for (int nm = 0; nm < numOfMat; nm++)
for (int i = 0; i < N; i++)
h_b[nm][i] = make_cuDoubleComplex(distribution(generator), distribution(generator));
t1 = high_resolution_clock::now();
// step 2: Perform CUBLAS LU solver
stat = cublasZgetrfBatched(cublas_handle, N, h_A, N, PivotArray, infoArray, numOfMat);
if (stat != CUBLAS_STATUS_SUCCESS) printf ("-data download failed");
gpuErrchk( cudaDeviceSynchronize() );
// check if the input matrix is singular
/*for (int i = 0; i < numOfMat; i++)
if (infoArray[i] != 0) {
fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
}*/
// step 3: INVERT UPPER AND LOWER TRIANGULAR MATRICES
// --- Function solves the triangular linear system with multiple RHSs
// --- Function overrides b as a result
stat = cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, h_A, N, h_b, N, numOfMat);
if (stat != CUBLAS_STATUS_SUCCESS) printf ("--data download failed");
gpuErrchk( cudaDeviceSynchronize() );
stat = cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, h_A, N, h_b, N, numOfMat);
if (stat != CUBLAS_STATUS_SUCCESS) printf ("---data download failed");
gpuErrchk( cudaDeviceSynchronize() );
t2 = high_resolution_clock::now();
duration = duration_cast<microseconds>(t2 - t1).count();
std::cout<<duration<<std::endl;
}
The code works fine, but when I plot the computational time versus the number of matrices, the curve looks like this:
My question is: why does the computational time scale linearly with the number of matrices? Intuitively, I would expect the curve to flatten out once the batch size becomes large enough. However, even when the batch size reaches 500,000, the time still appears to be linear in the batch size.
How can this be? Is there an explanation for such behaviour?
I think you need to look more closely at your data. If I run a modification of your code on Google Colab (Tesla T4) I get this:
Which looks largely like your figure. But look more closely (log scales help):
You can clearly see that up to a certain point (around 2^6 = 64 matrices), the runtime is largely independent of the number of matrices, but then scaling becomes linear as sizes increase. That is the transition from being able to parallelize the whole workload to reaching parallel capacity and having to schedule many parallel groups of operations to execute the workload. You might infer that for this particular GPU, the GPU runs out of parallel capacity at between 64 and 128 concurrent operations (the T4 has 40 SMs, so it might well be 80 if an SM can accommodate 2 of these operations concurrently), after which runtime scales with multiples of that limiting size.
This is completely normal behaviour for any parallel computation architecture I am familiar with.
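A minimal sketch of the kind of batch-size sweep that produces such a curve (illustrative only; it reuses the handle and arrays from the question's code and times only the factorization step at each batch size):
// time cublasZgetrfBatched for increasing batch sizes (powers of two)
for (int batch = 1; batch <= numOfMat; batch *= 2) {
    auto start = high_resolution_clock::now();
    cublasZgetrfBatched(cublas_handle, N, h_A, N, PivotArray, infoArray, batch);
    cudaDeviceSynchronize();
    auto stop = high_resolution_clock::now();
    std::cout << batch << " "
              << duration_cast<microseconds>(stop - start).count() << " us\n";
}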

CUB reduction using 2D grid of blocks

I'm trying to compute a sum using the CUB block reduction method.
My big problem is:
I'm not sure how to return the values of each block to the host when using 2-dimensional grids.
#include <iostream>
#include <math.h>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <iomanip>
#define nat 1024
#define BLOCK_SIZE 32
#define GRID_SIZE 32
struct frame
{
int natm;
char title[100];
float conf[nat][3];
};
using namespace std;
using namespace cub;
__global__
void add(frame* s, float L, float rc, float* blocksum)
{
int i = blockDim.x*blockIdx.x + threadIdx.x;
int j = blockDim.y*blockIdx.y + threadIdx.y;
float E=0.0, rij, dx, dy, dz;
// Your calculations first so that each thread holds its result
dx = fabs(s->conf[j][0] - s->conf[i][0]);
dy = fabs(s->conf[j][1] - s->conf[i][1]);
dz = fabs(s->conf[j][2] - s->conf[i][2]);
dx = dx - round(dx/L)*L;
dy = dy - round(dy/L)*L;
dz = dz - round(dz/L)*L;
rij = sqrt(dx*dx + dy*dy + dz*dz);
if ((rij <= rc) && (rij > 0.0))
{E = (4*((1/pow(rij,12))-(1/pow(rij,6))));}
// E = 1.0;
__syncthreads();
// Block wise reduction so that one thread in each block holds sum of thread results
typedef cub::BlockReduce<float, BLOCK_SIZE, BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
float aggregate = BlockReduce(temp_storage).Sum(E);
if (threadIdx.x == 0 && threadIdx.y == 0)
blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;
}
int main(void)
{
frame * state = (frame*)malloc(sizeof(frame));
float *blocksum = (float*)malloc(GRID_SIZE*GRID_SIZE*sizeof(float));
state->natm = nat; // initializing the number of atoms
char name[] = "estado1";
strcpy(state->title,name);
for (int i = 0; i < nat; i++) {
state->conf[i][0] = i;
state->conf[i][1] = i;
state->conf[i][2] = i;
}
frame * d_state;
float *d_blocksum;
cudaMalloc((void**)&d_state, sizeof(frame));
cudaMalloc((void**)&d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)));
cudaMemcpy(d_state, state, sizeof(frame),cudaMemcpyHostToDevice);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 gridBlock(GRID_SIZE,GRID_SIZE);
add<<<gridBlock,dimBlock>>>(d_state, 3000, 15, d_blocksum);
cudaError_t status = cudaMemcpy(blocksum, d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);
float Etotal = 0.0;
for (int k = 0; k < GRID_SIZE*GRID_SIZE; k++){
Etotal += blocksum[k];
}
cout << endl << "energy: " << Etotal << endl;
if (cudaSuccess != status)
{
cout << cudaGetErrorString(status) << endl;
}
// Free memory
cudaFree(d_state);
cudaFree(d_blocksum);
return cudaThreadExit();
}
What is happening is that if the value of GRID_SIZE is the same as BLOCK_SIZE, as written above, the calculation is correct. But if I change the value of GRID_SIZE, the result goes wrong, which leads me to think that the error is in this code:
blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;
The idea here is to return a 1D array containing the sum of each block.
I do not intend to change the BLOCK_SIZE value, but the value of GRID_SIZE depends on the system I'm looking at, and I intend to use values greater than 32 (always multiples of 32).
I looked for an example that uses a 2D grid with CUB, but did not find one.
I am really new to CUDA programming, so maybe I'm making a mistake.
Edit: I have included the complete code.
For comparison, when I calculate these exact values for a serial program, it gives me energy: -297,121
Probably the main issue is that your output indexing is not correct. Here's a reduced version of your code demonstrating correct results for arbitrary GRID_SIZE:
$ cat t1360.cu
#include <stdio.h>
#include <cub/cub.cuh>
#define BLOCK_SIZE 32
#define GRID_SIZE 25

__global__
void add(float* blocksum)
{
    float E = 1.0;
    // Block-wise reduction so that one thread in each block holds the sum of thread results
    typedef cub::BlockReduce<float, BLOCK_SIZE, cub::BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    float aggregate = BlockReduce(temp_storage).Sum(E);
    __syncthreads();
    if (threadIdx.x == 0 && threadIdx.y == 0)
        blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate;
}

int main(){
    float *d_result, *h_result;
    h_result = (float *)malloc(GRID_SIZE*GRID_SIZE*sizeof(float));
    cudaMalloc(&d_result, GRID_SIZE*GRID_SIZE*sizeof(float));
    dim3 grid = dim3(GRID_SIZE, GRID_SIZE);
    dim3 block = dim3(BLOCK_SIZE, BLOCK_SIZE);
    add<<<grid, block>>>(d_result);
    cudaMemcpy(h_result, d_result, GRID_SIZE*GRID_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {printf("cuda error: %s\n", cudaGetErrorString(err)); return -1;}
    float result = 0;
    for (int i = 0; i < GRID_SIZE*GRID_SIZE; i++) result += h_result[i];
    if (result != (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE)) printf("mismatch, should be: %f, was: %f\n", (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE), result);
    else printf("Success\n");
    return 0;
}
$ nvcc -o t1360 t1360.cu
$ ./t1360
Success
$
The important change I made to your kernel code was in the output indexing:
blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate;
We want a simulated 2D index into an array that has width and height of GRID_SIZE consisting of one float quantity per point. Therefore the width of this array is given by gridDim.x (not blockDim). The gridDim variable gives the dimensions of the grid in terms of blocks - and this lines up exactly with how our results array is set up.
Your posted code will fail if GRID_SIZE and BLOCK_SIZE are different (for example, if GRID_SIZE were smaller than BLOCK_SIZE, cuda-memcheck will show illegal accesses, and if GRID_SIZE is larger than BLOCK_SIZE then this indexing error will result in blocks overwriting each other's values in the output array) because of this mixup between blockDim and gridDim.
Also note that float operations typically only have around 5 decimal digits of precision. So small differences in the 5th or 6th decimal place may be attributable to order of operations differences when doing floating-point arithmetic. You can prove this to yourself by switching to double arithmetic.

Optimize vector matrix multiplication in cuda with large number of zeros

I am using the following kernel to optimize vector-matrix multiplication for the case where both the vector and the matrix have a large number of zeros. Using this kernel can reduce the time taken for such a multiplication to about half of the time taken by cublasSgemv when more than 90% of the entries are zero. However, it is still much slower than an equivalent BLAS gemm host call on Ubuntu 14.04.
vec = 1 x m, mat = m x m and prod = 1 x m; all are in row-major order
m >= 5000
__global__ void calc_v_m(float *vec, float *mat, float *prod, int m)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
if(x < m)
{
prod[x] = 0;
for(int i = 0; i < m; i++)
{
int offset = i*m + x;
if( mat[offset] != 0 && vec[i] != 0 )
prod[x] += vec[i] * mat[i*m+x];
}
}
}
What can be done to further enhance the performance of this kernel, apart from using libraries like cuSPARSE?
It would be nice if this optimization were compatible with compute capability 1.2.
Thanks
EDIT
Corrected: prod = 1 x m
GPU = Quadro FX 1800M, CUDA 5.0 on Ubuntu 14.04
EDIT
Complete code that performs the multiplication using (i) BLAS, (ii) cuBLAS, and (iii) the above kernel, for m = 6000. Please enter 0 when asked to enter a value.
#include <iostream>
#include <stdio.h>
#include <time.h>
#include <cblas.h>
#include <cublas_v2.h>
#include <math.h>
using namespace std;
const int m = 6000;
const int BS = 512; // threads per block
const int NB = ceil((float) m / BS); // number of blocks
__global__ void calc_v_m(float *vec, float *mat, float *prod, int m)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
if(x < m)
{
prod[x] = 0;
for(int i = 0; i < m; i++)
{
int offset = i*m + x;
if( mat[offset] != 0 && vec[i] != 0 )
prod[x] += vec[i] * mat[i*m+x];
}
}
}
int main()
{
timespec blas_start, blas_end, cublas_start, cublas_end, opt_start, opt_end;
long totalnsec; //total nano sec
double totalsec, totaltime;
int i, j;
float *A = new float[m]; // 1 x m
float *B = new float[m*m]; // m x m
float *C = new float[m]; // 1 x m
float input;
cout<<"Enter a value to populate the vector (0 to make it sparse) ";
cin>>input;
// input martix A: every 600th element is non-zero i.e 90% zero
for(i = 0; i < m; i++)
{
A[i] = input;
if( i % 600 == 0) //adjust for sparsity
A[i] = i;
}
// input matrix B: identity matrix
for(i = 0; i < m; i++)
for(j = 0; j < m; j++)
B[j*m + i] = (i==j);
//blas on host
clock_gettime(CLOCK_REALTIME, &blas_start);
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 1, m, m, 1.0f, A, m, B, m, 0.0f, C, m);
//cblas_sgemv(CblasRowMajor, CblasTrans, m, m, 1.0f, B, m, A, 1, 0.0f, C, 1);
clock_gettime(CLOCK_REALTIME, &blas_end);
/* for(i = 0; i < m; i++) printf("%f ", C[i]); */
//cublas section
cudaError_t cudaStat;
cublasHandle_t handle;
cublasCreate(&handle);
float *A_d, *B_d, *C_d;
cudaStat = cudaMalloc(&A_d, sizeof(float)*m);
if(cudaStat != cudaSuccess) printf("Error Allocating Memory for A_d\n");
cudaStat = cudaMalloc(&B_d, sizeof(float)*m*m);
if(cudaStat != cudaSuccess) printf("Error Allocating Memory for B_d\n");
cudaStat = cudaMalloc(&C_d, sizeof(float)*m);
if(cudaStat != cudaSuccess) printf("Error Allocating Memory for C_d\n");
cudaMemcpy(A_d, A, sizeof(float)*m, cudaMemcpyHostToDevice);
cudaMemcpy(B_d, B, sizeof(float)*m*m, cudaMemcpyHostToDevice);
float alpha = 1.0f, beta = 0.0f;
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &cublas_start);
cublasSgemv(handle, CUBLAS_OP_N, m, m, &alpha, B_d, m, A_d, 1, &beta, C_d, 1);
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &cublas_end);
cudaMemcpy(C, C_d, sizeof(float)*m, cudaMemcpyDeviceToHost);
/* for(i = 0; i < m; i++) printf("%f ", C[i]); */
// Call kernel having Optimization for Zeros
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &opt_start);
/////////////////// call kernel //////////////////
calc_v_m<<<NB, BS>>>(A_d, B_d, C_d, m);
//////////////////////////////////////////////////
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &opt_end);
cudaMemcpy(C, C_d, sizeof(float)*m, cudaMemcpyDeviceToHost);
/*for(i = 0; i < m; i++) printf("%f ", C[i]); */
// Print times
// blas time
totalsec = (double)blas_end.tv_sec - (double)blas_start.tv_sec;
totalnsec = blas_end.tv_nsec - blas_start.tv_nsec;
if(totalnsec < 0)
{
totalnsec += 1e9;
totalsec -= 1;
}
totaltime = totalsec + (double)totalnsec*1e-9;
cout<<"blas Time = "<< totaltime << "\n";
//cublas time
totalsec = (double)cublas_end.tv_sec - (double)cublas_start.tv_sec;
totalnsec = cublas_end.tv_nsec - cublas_start.tv_nsec;
if(totalnsec < 0)
{
totalnsec += 1e9;
totalsec -= 1;
}
totaltime = totalsec + (double)totalnsec*1e-9;
cout<<"cublas Time = "<< totaltime << "\n";
//Optimized Kernel Time
totalsec = (double)opt_end.tv_sec - (double)opt_start.tv_sec;
totalnsec = opt_end.tv_nsec - opt_start.tv_nsec;
if(totalnsec < 0)
{
totalnsec += 1e9;
totalsec -= 1;
}
totaltime = totalsec + (double)totalnsec*1e-9;
cout<<"Opt Kernel Time = "<< totaltime << "\n";
return 0;
}
Results
$ nvcc -arch=sm_12 blascomp.cu -o blascomp.o -lblas -lcublas
$ ./blascomp.o
Enter a value to populate the vector (0 to make it sparse) 0
blas Time = 0.000105207
cublas Time = 0.0070294
Opt Kernel Time = 0.00642797
At least on my system, BLAS is still the fastest for such a scenario.
Things get even more interesting if every 1200th element, instead of every 600th, is populated (i.e. the vector is made even sparser):
Enter a value to populate the vector (0 to make it sparse) 0
blas Time = 7.84e-05
cublas Time = 0.00698783
Opt Kernel Time = 0.00643042
The important thing to recognise here is that the gemv operation you are concerned with is fundamentally memory throughput limited on GPUs, rather than compute throughput limited. This implies that an "optimisation" as you have shown in your kernel:
__global__ void calc_v_m(float *vec, float *mat, float *prod, int m)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    if(x < m)
    {
        prod[x] = 0;
        for(int i = 0; i < m; i++)
        {
            int offset = i*m + x;
            if( mat[offset] != 0 && vec[i] != 0 )
                prod[x] += vec[i] * mat[i*m+x];
        }
    }
}
isn't really an optimisation at all, simply because the memory transactions are the performance bottleneck in the kernel, not the floating point arithmetic, and your code must perform most of those memory transactions irrespective of whether the multiply-add operation is skipped because of zero detection or not.
Consider the following, instrumented version of roughly the same code:
__constant__ float cvec1[2];

__global__ void
__launch_bounds__(512,4)
calc_v_m1(const float* __restrict__ vec,
          const float* __restrict__ mat,
          float* __restrict__ prod,
          int m,
          int do_reads = 1,
          int do_write = 1)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    if(x < m)
    {
        float res = 0;
        float mval = cvec1[0], vval = cvec1[1];
#pragma unroll 8
        for(int i = 0; i < m; i++)
        {
            int offset = i*m + x;
            if (do_reads) {
                mval = mat[offset];
                vval = vec[i];
            }
            res += mval * vval;
        }
        if (do_write) prod[x] = res;
    }
}
Here I have added two optional arguments which control whether the kernel will load from global memory, and whether the kernel will store to global memory. This allows me to quantify the performance impact of the memory loads, computation, and memory stores independently. The results using your test code are instructive:
Function nvprof time
-----------------------------------------------
cublasSgemv 942.75us
calc_v_m 2798.4us
calc_v_m1(do_reads=1, do_write=1) 962.40us
calc_v_m1(do_reads=1, do_write=0) 970.40us
calc_v_m1(do_reads=0, do_write=1) 55.166us
calc_v_m1(do_reads=0, do_write=0) 55.102us
[All benchmarking done on a GTX970 using the CUDA 7.5 release toolchain and CUBLAS 7.5 library]
In no particular order:
The full instrumented kernel runtime is within a few percent of the equivalent CUBLAS call
The memory fetches from global memory are the bottleneck
The actual computations in the kernel only constitute 5% of the kernel running time
The "fire-and-forget" nature of write operations in CUDA means that the latency of the write has no significant effect on throughput.
Your "optimised" kernel is considerably slower than either CUBLAS or the instrumented kernel, probably because all you are introducing is branch divergence without addressing the source of the kernel bottleneck (the latency of the memory loads).
The only time conditionally executing the FMAD operation makes sense would be on an architecture where memory had near-zero latency and floating point throughput was severely constrained. The GPU definitely doesn't fall into that category.
The only other option for optimising this would be to exploit a priori information about the sparsity patterns in the LHS matrix to remove the need to read zero entries. Which is precisely what sparse matrix formats and linear algebra codes are designed to accommodate.
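For illustration only (this is not part of the original answer), a minimal sketch of what a CSR-based kernel looks like: the matrix is stored as (row_ptr, col_idx, vals), and only the stored non-zeros are ever read, which is exactly the point of sparse formats:
// y = A * x with A in CSR format; one thread per row, only non-zeros are read
__global__ void csr_spmv(const int *row_ptr, const int *col_idx, const float *vals,
                         const float *x, float *y, int m)
{
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    if (row < m) {
        float sum = 0.0f;
        for (int k = row_ptr[row]; k < row_ptr[row + 1]; k++)
            sum += vals[k] * x[col_idx[k]];
        y[row] = sum;
    }
}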

Unspecified launch failure after cudaDeviceSynchronize() call when program starts. But no errors using step-through debugging. CUDA

I've spent several hours struggling with an unspecified launch failure.
I came up with a tiny task for myself in order to understand how shared memory works.
The task is to divide the array [1, 2, 3, ..., N] into K groups of (N / K) elements and find the sum of each group. (The difference between consecutive elements of the array equals 1.)
I was planning to use N threads in the grid, divided between K blocks, so every thread block contains (N / K) threads. Thus one thread block can be used to compute the sum of one group. I also wanted to allocate the shared memory dynamically.
When I start the program I get an unspecified launch failure after the cudaDeviceSynchronize() call. But when I try step-through debugging, everything is fine and works correctly.
What am I doing wrong? (Visual Studio 2012 Professional, compute capability 2.1.) I would very much appreciate any help.
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
printf("%s\n",cudaGetErrorString(x)); \
system("pause"); \
return EXIT_FAILURE;}} while(0)
extern __shared__ double shrd[];
__global__ void kernel(double * a){
size_t threadID_block = blockDim.x * threadIdx.y + threadIdx.x;
size_t blockID_global = (gridDim.x * blockIdx.y + blockIdx.x );
size_t threadID_global = blockID_global * blockDim.x * blockDim.y + threadID_block;
double * temp = &shrd[blockID_global * blockDim.x * blockDim.y];
temp[threadID_block] = static_cast<double>(threadID_global);
__syncthreads();
if (threadID_block == 0){
a[blockID_global] = 0.0;
for (size_t index = 0; index < blockDim.x * blockDim.y; index++){
a[blockID_global] += temp[index];
}
}
}
int main(){
int devNum = 0;
CUDA_CALL(cudaGetDevice(&devNum));
CUDA_CALL(cudaSetDevice(devNum));
dim3 gridSize(2,2,1);
dim3 blockSize(4,4,1);
double * dev_a = NULL;
size_t length = gridSize.x * gridSize.y ;
size_t byteSize = length * sizeof(double);
CUDA_CALL(cudaMalloc(&dev_a,byteSize));
size_t shmem_perBlock = blockSize.x * blockSize.y * sizeof(double);
kernel <<< gridSize, blockSize, shmem_perBlock >>> (dev_a);
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaDeviceSynchronize());
double * a = new double [length];
CUDA_CALL(cudaMemcpy(a,dev_a,byteSize,cudaMemcpyDeviceToHost));
for (size_t index = 0; index < length; index++){
printf("%.3f\n",a[index]);
}
printf("\n");
CUDA_CALL(cudaFree(dev_a));
CUDA_CALL(cudaDeviceReset());
delete[]a;
system("pause");
return 0;
}
If you are on Kepler or later, first read this:
http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
Otherwise, if you are pre-Kepler, read this:
http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf
There are some fundamentals you are missing in terms of CUDA programming. I have given you a template of your code below to clarify some of them. Do not expect this to be optimized, as I am expecting you to program the parallel reduction yourself. This will get you started with an understanding of how to use shared memory.
Good luck!
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define N 10000
#define K 100

#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__); \
    printf("%s\n",cudaGetErrorString(x)); \
    system("pause"); \
    return EXIT_FAILURE;}} while(0)

__global__ void kernel(double* a, double* results){
    extern __shared__ double shared[];
    size_t tid, tid_local, stride;
    tid = blockDim.x*blockIdx.x+threadIdx.x; // thread id within all blocks
    tid_local = threadIdx.x;                 // thread id within a block
    stride = blockDim.x*gridDim.x;           // total number of threads
    double *start = &a[K*blockIdx.x];        // each block will get K elements of a
    shared[tid_local] = start[tid_local];    // copy K elements into shared memory
    __syncthreads();
    // Perform parallel reduction, you will have to implement this
    // After parallel reduction, the result should be in shared[0]
    // For demonstration I made the code serial for each block on thread 0.
    // This is for demonstration only.
    double sum = 0;
    if(tid_local == 0){
        for(int i = 0; i < K; i++){
            sum += shared[i];
        }
        results[blockIdx.x] = sum; // store the per-block sum in the results array
    }
}

int main(){
    int devNum = 0;
    CUDA_CALL(cudaGetDevice(&devNum));
    CUDA_CALL(cudaSetDevice(devNum));
    double * dev_a = NULL;
    double * dev_results = NULL;
    CUDA_CALL(cudaMalloc(&dev_a, N*sizeof(double)));
    CUDA_CALL(cudaMalloc(&dev_results, (N/K)*sizeof(double)));
    // copy dev_a onto the GPU (this is the array you are summing).
    dim3 block_size(K, 1, 1);
    dim3 grid_size(N/K, 1, 1);
    size_t shmem_perBlock = K * sizeof(double);
    kernel <<< grid_size, block_size, shmem_perBlock >>> (dev_a, dev_results);
    CUDA_CALL(cudaGetLastError());
    CUDA_CALL(cudaDeviceSynchronize());
    // copy dev_results back to the CPU, this is your result.
    CUDA_CALL(cudaFree(dev_a));
    CUDA_CALL(cudaFree(dev_results));
    system("pause");
    return 0;
}
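For reference, a minimal sketch of the shared-memory tree reduction that the template above leaves as an exercise, replacing the serial loop on thread 0 (this assumes the block size is a power of two, which K = 100 is not, so padding or a final serial tail would be needed; illustrative only):
// tree reduction in shared memory; assumes blockDim.x is a power of two
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (tid_local < s)
        shared[tid_local] += shared[tid_local + s];
    __syncthreads();
}
if (tid_local == 0)
    results[blockIdx.x] = shared[0]; // per-block sum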