issues of cuBLAS performance on batched complex linear system solver - cuda

I am new to CUDA and cuBLAS, and recently I have been trying to use the batched cuBLAS API to solve multiple systems of linear equations.
The size of each matrix is N, and the number of matrices (the batch size) is numOfMat. Here's my code:
#include <stdio.h>
#include <stdlib.h>
#include <cstdio>
#include <iostream>
#include <chrono>
#include <random>
#include <cuda.h>
#include <cusolverDn.h>
#include <cuda_runtime.h>
#include <cuComplex.h> // deal with complex numbers
#include <cuda_profiler_api.h>
using namespace std::chrono;
#define N 6
#define numOfMat 500000
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
int main() {
std::random_device device;
std::mt19937 generator(device());
std::uniform_real_distribution<double> distribution(1., 5.);
high_resolution_clock::time_point t1;
high_resolution_clock::time_point t2;
double duration = 0;
double duration_1 = 0;
// step 1: cuda solver initialization
cublasHandle_t cublas_handle;
cublasCreate_v2(&cublas_handle);
cublasStatus_t stat;
int* PivotArray;
int* infoArray;
cudaError_t cudaStatUnified1 = cudaSuccess;
cudaError_t cudaStatUnified2 = cudaSuccess;
const cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0);
cudaStatUnified1 = cudaMallocManaged(&PivotArray, N * numOfMat * sizeof(int));
cudaStatUnified2 = cudaMallocManaged(&infoArray, numOfMat * sizeof(int));
if ((cudaSuccess != cudaStatUnified1) || (cudaSuccess != cudaStatUnified2))
std::cout <<"unified memory allocated unsuccessful!"<<std::endl;
//ALLOCATE MEMORY - using unified memory
cuDoubleComplex** h_A;
cudaMallocManaged(&h_A, sizeof(cuDoubleComplex*) * numOfMat);
cudaMallocManaged(&(h_A[0]), sizeof(cuDoubleComplex)*numOfMat*N*N);
for (int nm = 1; nm < numOfMat; nm++)
h_A[nm] = h_A[nm-1]+ N * N;
cuDoubleComplex** h_b;
cudaMallocManaged(&h_b, sizeof(cuDoubleComplex*) * numOfMat);
cudaMallocManaged(&(h_b[0]), sizeof(cuDoubleComplex) * numOfMat * N);
for (int nm = 1; nm < numOfMat; nm++)
h_b[nm] = h_b[nm-1] + N;
// FILL MATRICES
for (int nm = 0; nm < numOfMat; nm++)
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
h_A[nm][j * N + i] = make_cuDoubleComplex(distribution(generator), distribution(generator));
// FILL COEFFICIENTS
for (int nm = 0; nm < numOfMat; nm++)
for (int i = 0; i < N; i++)
h_b[nm][i] = make_cuDoubleComplex(distribution(generator), distribution(generator));
t1 = high_resolution_clock::now();
// step 2: Perform CUBLAS LU solver
stat = cublasZgetrfBatched(cublas_handle, N, h_A, N, PivotArray, infoArray, numOfMat);
if (stat != CUBLAS_STATUS_SUCCESS) printf ("-data download failed");
gpuErrchk( cudaDeviceSynchronize() );
// check if the input matrix is singular
/*for (int i = 0; i < numOfMat; i++)
if (infoArray[i] != 0) {
fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
}*/
// step 3: INVERT UPPER AND LOWER TRIANGULAR MATRICES
// --- Function solves the triangular linear system with multiple RHSs
// --- Function overrides b as a result
stat = cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, h_A, N, h_b, N, numOfMat);
if (stat != CUBLAS_STATUS_SUCCESS) printf ("--data download failed");
gpuErrchk( cudaDeviceSynchronize() );
stat = cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, h_A, N, h_b, N, numOfMat);
if (stat != CUBLAS_STATUS_SUCCESS) printf ("---data download failed");
gpuErrchk( cudaDeviceSynchronize() );
t2 = high_resolution_clock::now();
duration = duration_cast<microseconds>(t2 - t1).count();
std::cout<<duration<<std::endl;
}
The code works fine, but when I plot the computational time against the number of matrices, the curve looks like this:
My question is: why does the computational time scale linearly with the number of matrices? Intuitively, I would expect the curve to flatten out once the batch size is large enough, but even when the batch size reaches 500,000 the time still appears linear in the batch size.
How can that be? Is there any explanation for this behaviour?

I think you need to look more closely at your data. If I run a modification of your code on Google Colab (Tesla T4) I get this:
That looks largely like your figure. But look more closely (log scales help):
You can clearly see that up to a certain point (around 2^6 = 64 matrices) the runtime is largely independent of the number of matrices, but beyond that the scaling becomes linear as the batch size increases. That is the transition from being able to run the whole workload in parallel to reaching the GPU's parallel capacity and having to schedule many parallel groups of operations to execute the workload. You might infer that this particular GPU runs out of parallel capacity somewhere between 64 and 128 concurrent operations (the T4 has 40 SMs, so it might well be 80 if each SM could accommodate 2 of these operations concurrently), after which runtime scales with multiples of that limiting size.
This is completely normal behaviour for any parallel computation architecture I am familiar with.
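If you want to see where the transition happens on your own GPU, one simple approach is to sweep the batch size and time each factorization. Here is a minimal sketch (not from the code above; it assumes the cublas_handle, h_A, PivotArray and infoArray from your code have already been set up for the largest batch):
// sweep batch sizes; note getrf overwrites h_A, so strictly the matrices
// should be refilled before each run for a clean measurement
for (int batch = 64; batch <= numOfMat; batch *= 2) {
    auto t_start = std::chrono::high_resolution_clock::now();
    cublasZgetrfBatched(cublas_handle, N, h_A, N, PivotArray, infoArray, batch);
    cudaDeviceSynchronize();
    auto t_end = std::chrono::high_resolution_clock::now();
    std::cout << batch << " "
              << std::chrono::duration_cast<std::chrono::microseconds>(t_end - t_start).count()
              << std::endl;
}
Plotting batch size against time on log-log axes should show the flat region followed by the linear region described above.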

Related

sum vectors values with cuda C++

I am trying to sum the values of many vectors using CUDA C++. I found a solution for two vectors. As you can see, it is only possible to add two vectors, but I want to generate the vectors dynamically, all with the same length.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n)
{
// Get our global thread ID
int id = blockIdx.x*blockDim.x+threadIdx.x;
// Make sure we do not go out of bounds
if (id < n)
c[id] = a[id] + b[id];
}
int main( int argc, char* argv[] )
{
// Size of vectors
int n = 100000;
// Host input vectors
double *h_a;
double *h_b;
//Host output vector
double *h_c;
// Device input vectors
double *d_a;
double *d_b;
//Device output vector
double *d_c;
// Size, in bytes, of each vector
size_t bytes = n*sizeof(double);
// Allocate memory for each vector on host
h_a = (double*)malloc(bytes);
h_b = (double*)malloc(bytes);
h_c = (double*)malloc(bytes);
// Allocate memory for each vector on GPU
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, bytes);
cudaMalloc(&d_c, bytes);
int i;
// Initialize vectors on host
for( i = 0; i < n; i++ ) {
h_a[i] = sin(i)*sin(i);
h_b[i] = cos(i)*cos(i);
}
// Copy host vectors to device
cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);
cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);
int blockSize, gridSize;
// Number of threads in each thread block
blockSize = 1024;
// Number of thread blocks in grid
gridSize = (int)ceil((float)n/blockSize);
// Execute the kernel
vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
// Copy array back to host
cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost );
// Sum up vector c and print the result divided by n; this should equal 1 within error
double sum = 0;
for(i=0; i<n; i++)
sum += h_c[i];
printf("final result: %f\n", sum/n);
// Release device memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
// Release host memory
free(h_a);
free(h_b);
free(h_c);
return 0;
}
Is there a way to do this for many vectors? My vector sizes are:
#vector length
N = 1000
#number of vectors
i = 300000
v[i] = [1,2,..., N]
As a result I need to get:
out[i]= [sum(v[1]), sum(v[2]),..., sum(v[i])]
Thanks for any advice.
Summing multiple vectors together in a fashion similar to the code you have shown (i.e. generating elementwise sums) is equivalent to summing the columns of a matrix. And this idea represents a sensible way to realize the solution.
We will treat your vectors as a matrix, where each vector is a row in the matrix. The CUDA kernel will assign one thread to each column, and will sum the elements of that column, producing a single number result. That single number result will become one element of the vector result of the entire problem.
Here is a fully worked example demonstrating one possible approach:
$ cat t2.cu
#include <iostream>
typedef double mt;
const int nTPB = 64;
template <typename T>
__global__ void column_sum(T *matrix, T *sums, unsigned n_vectors, unsigned vector_length){
unsigned idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < vector_length){
T temp = 0;
for (unsigned i = 0; i < n_vectors; i++)
temp += matrix[i*vector_length+idx];
sums[idx] = temp;}
}
int main(){
const unsigned vlen = 1000;
const unsigned nvec = 300000;
mt *h_matrix, *d_matrix, *h_sums, *d_sums;
// create the desired number of vectors as a single matrix
h_sums = new mt[vlen];
h_matrix = new mt[vlen*nvec];
cudaMalloc(&d_matrix, vlen*nvec*sizeof(mt));
cudaMalloc(&d_sums, vlen*sizeof(mt));
size_t count = 0;
for (unsigned i = 0; i < nvec; i++)
for (unsigned j = 0; j < vlen; j++)
h_matrix[count++] = j;
cudaMemcpy(d_matrix, h_matrix, vlen*nvec*sizeof(mt), cudaMemcpyHostToDevice);
column_sum<<<(vlen+nTPB-1)/nTPB,nTPB>>>(d_matrix, d_sums, nvec, vlen);
cudaMemcpy(h_sums, d_sums, vlen*sizeof(mt), cudaMemcpyDeviceToHost);
for (unsigned i = 0; i < vlen; i++) if (h_sums[i] != ((mt)nvec)*i) {std::cout << " mismatch at " << i << " was: " << h_sums[i] << " should be: " << ((mt)nvec)*i << std::endl; return -1;}
std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
}
$ nvcc -o t2 t2.cu
$ cuda-memcheck ./t2
========= CUDA-MEMCHECK
no error
========= ERROR SUMMARY: 0 errors
$
Note that this methodology creates only as many threads on the GPU as there are vector elements (1000 in the above example). 1000 threads is enough to keep only the smallest GPUs busy. However, this algorithm will be efficient on most GPUs if your vector length is 10,000 or longer. If you'd like to explore creating more efficient algorithms for small problem sizes, you can study the idea of a classical parallel reduction.
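For reference, a minimal sketch of such a classical block-level reduction (not part of the code above, and assuming the block size is a power of two) could look like this; each block produces one partial sum, which still has to be combined on the host or in a second kernel:
template <typename T>
__global__ void block_reduce(const T *in, T *out, unsigned n){
  extern __shared__ unsigned char smem_raw[];
  T *smem = reinterpret_cast<T*>(smem_raw);
  unsigned tid = threadIdx.x;
  unsigned idx = blockIdx.x*blockDim.x + tid;
  smem[tid] = (idx < n) ? in[idx] : T(0);           // one element per thread
  __syncthreads();
  for (unsigned s = blockDim.x/2; s > 0; s >>= 1){  // tree reduction in shared memory
    if (tid < s) smem[tid] += smem[tid+s];
    __syncthreads();
  }
  if (tid == 0) out[blockIdx.x] = smem[0];          // one partial sum per block
}
// launch example: block_reduce<<<(n+255)/256, 256, 256*sizeof(double)>>>(d_in, d_partial, n);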

CUB reduction using 2D grid of blocks

I'm trying to make a sum using the CUB reduction method.
The big problem is:
I'm not sure how to return the values of each block to the Host when using 2-dimensional grids.
#include <iostream>
#include <math.h>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <iomanip>
#define nat 1024
#define BLOCK_SIZE 32
#define GRID_SIZE 32
struct frame
{
int natm;
char title[100];
float conf[nat][3];
};
using namespace std;
using namespace cub;
__global__
void add(frame* s, float L, float rc, float* blocksum)
{
int i = blockDim.x*blockIdx.x + threadIdx.x;
int j = blockDim.y*blockIdx.y + threadIdx.y;
float E=0.0, rij, dx, dy, dz;
// Your calculations first so that each thread holds its result
dx = fabs(s->conf[j][0] - s->conf[i][0]);
dy = fabs(s->conf[j][1] - s->conf[i][1]);
dz = fabs(s->conf[j][2] - s->conf[i][2]);
dx = dx - round(dx/L)*L;
dy = dy - round(dy/L)*L;
dz = dz - round(dz/L)*L;
rij = sqrt(dx*dx + dy*dy + dz*dz);
if ((rij <= rc) && (rij > 0.0))
{E = (4*((1/pow(rij,12))-(1/pow(rij,6))));}
// E = 1.0;
__syncthreads();
// Block wise reduction so that one thread in each block holds sum of thread results
typedef cub::BlockReduce<float, BLOCK_SIZE, BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
float aggregate = BlockReduce(temp_storage).Sum(E);
if (threadIdx.x == 0 && threadIdx.y == 0)
blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;
}
int main(void)
{
frame * state = (frame*)malloc(sizeof(frame));
float *blocksum = (float*)malloc(GRID_SIZE*GRID_SIZE*sizeof(float));
state->natm = nat; //inicializando o numero de atomos;
char name[] = "estado1";
strcpy(state->title,name);
for (int i = 0; i < nat; i++) {
state->conf[i][0] = i;
state->conf[i][1] = i;
state->conf[i][2] = i;
}
frame * d_state;
float *d_blocksum;
cudaMalloc((void**)&d_state, sizeof(frame));
cudaMalloc((void**)&d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)));
cudaMemcpy(d_state, state, sizeof(frame),cudaMemcpyHostToDevice);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 gridBlock(GRID_SIZE,GRID_SIZE);
add<<<gridBlock,dimBlock>>>(d_state, 3000, 15, d_blocksum);
cudaError_t status = cudaMemcpy(blocksum, d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);
float Etotal = 0.0;
for (int k = 0; k < GRID_SIZE*GRID_SIZE; k++){
Etotal += blocksum[k];
}
cout << endl << "energy: " << Etotal << endl;
if (cudaSuccess != status)
{
cout << cudaGetErrorString(status) << endl;
}
// Free memory
cudaFree(d_state);
cudaFree(d_blocksum);
return cudaThreadExit();
}
What is happening is that if the value of GRID_SIZE is the same as BLOCK_SIZE, as written above, the calculation is correct. But if I change the value of GRID_SIZE, the result goes wrong. That leads me to think that the error is in this line of code:
blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;
The idea here is to return a 1D array, which contains the sum of each block.
I do not intend to change the BLOCK_SIZE value, but the value of GRID_SIZE depends on the system I'm looking at, I intend to use values greater than 32 (always multiples of that).
I looked for examples that use a 2D grid with CUB, but did not find any.
I am really new to CUDA programming, so maybe I'm making a mistake.
Edit: I have added the complete code.
For comparison, when I calculate these exact values with a serial program, it gives me energy: -297,121
Probably the main issue is that your output indexing is not correct. Here's a reduced version of your code demonstrating correct results for arbitrary GRID_SIZE:
$ cat t1360.cu
#include <stdio.h>
#include <cub/cub.cuh>
#define BLOCK_SIZE 32
#define GRID_SIZE 25
__global__
void add(float* blocksum)
{
float E = 1.0;
// Block wise reduction so that one thread in each block holds sum of thread results
typedef cub::BlockReduce<float, BLOCK_SIZE, cub::BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
float aggregate = BlockReduce(temp_storage).Sum(E);
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0)
blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate;
}
int main(){
float *d_result, *h_result;
h_result = (float *)malloc(GRID_SIZE*GRID_SIZE*sizeof(float));
cudaMalloc(&d_result, GRID_SIZE*GRID_SIZE*sizeof(float));
dim3 grid = dim3(GRID_SIZE,GRID_SIZE);
dim3 block = dim3(BLOCK_SIZE, BLOCK_SIZE);
add<<<grid, block>>>(d_result);
cudaMemcpy(h_result, d_result, GRID_SIZE*GRID_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {printf("cuda error: %s\n", cudaGetErrorString(err)); return -1;}
float result = 0;
for (int i = 0; i < GRID_SIZE*GRID_SIZE; i++) result += h_result[i];
if (result != (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE)) printf("mismatch, should be: %f, was: %f\n", (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE), result);
else printf("Success\n");
return 0;
}
$ nvcc -o t1360 t1360.cu
$ ./t1360
Success
$
The important change I made to your kernel code was in the output indexing:
blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate;
We want a simulated 2D index into an array that has width and height of GRID_SIZE consisting of one float quantity per point. Therefore the width of this array is given by gridDim.x (not blockDim). The gridDim variable gives the dimensions of the grid in terms of blocks - and this lines up exactly with how our results array is set up.
Your posted code will fail if GRID_SIZE and BLOCK_SIZE are different (for example, if GRID_SIZE were smaller than BLOCK_SIZE, cuda-memcheck will show illegal accesses, and if GRID_SIZE is larger than BLOCK_SIZE then this indexing error will result in blocks overwriting each other's values in the output array) because of this mixup between blockDim and gridDim.
Also note that float operations typically only have around 5 decimal digits of precision. So small differences in the 5th or 6th decimal place may be attributable to order of operations differences when doing floating-point arithmetic. You can prove this to yourself by switching to double arithmetic.

Surface memory takes more time(twice) than global memory

I am working on optimizing a CUDA program, so I first started with optimizing a matrix multiplication program. The threading scheme I have used for parallelization is blockSize(1, 1), gridSize(N, N). I am using surface memory for memory optimization (shared memory cannot be used with this threading scheme). When I compare the time before and after the optimization, I find that execution takes twice as long after switching to surface memory (I have tried different threading schemes, but the problem remains the same). From what I have read so far, global memory is slower than surface memory, so the use of surface memory should take less time. Below is the matrix multiplication program with surface memory used. Can somebody tell me what the problem is?
#include <stdio.h>
#include <cuda.h>
//#define N 3
surface < void, 2 > a_surf;
surface < void, 2 > b_surf;
surface < void, 2 > c_surf;
void CUDA_SAFE_CALL(cudaError_t call, int line) {
switch (call) {
case cudaSuccess:
break;
default:
printf("ERROR at line :%i.%d' ' %s\n",
line, call, cudaGetErrorString(call));
exit(-1);
break;
}
}
__global__ void mul(int N) {
int a, b, c, temp;
int i;
unsigned int x = blockIdx.x * blockDim.x + (threadIdx.x);
unsigned int y = blockIdx.y * blockDim.y + (threadIdx.y);
if (x < N && y < N) {
temp = 0;
for (i = 0; i < N; i++) {
surf2Dread( & a, a_surf, (x) * 4, i);
surf2Dread( & b, b_surf, (i) * 4, y);
temp += a * b;
}
c = temp;
// Write to output surface
surf2Dwrite(c, c_surf, x * 4, y);
}
}
int main() {
int N = 100;
int a[N][N], b[N][N], c[N][N];
int i, j;
int temp;
clock_t t1, t2;
cudaArray * da, * db, * dc;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc < int > ();
dim3 dimBlock(1, 1);
dim3 dimGrid(N, N);
temp = 0;
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
a[i][j] = ++temp;
temp = 0;
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
b[i][j] = ++temp;
CUDA_SAFE_CALL(cudaMallocArray( & da, & channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
CUDA_SAFE_CALL(cudaMallocArray( & db, & channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
CUDA_SAFE_CALL(cudaMallocArray( & dc, & channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
int s = N * N * sizeof(int);
CUDA_SAFE_CALL(cudaMemcpyToArray(da, 0, 0, a, s, cudaMemcpyHostToDevice), __LINE__);
CUDA_SAFE_CALL(cudaMemcpyToArray(db, 0, 0, b, s, cudaMemcpyHostToDevice), __LINE__);
CUDA_SAFE_CALL(cudaBindSurfaceToArray(a_surf, da), __LINE__);
CUDA_SAFE_CALL(cudaBindSurfaceToArray(b_surf, db), __LINE__);
CUDA_SAFE_CALL(cudaBindSurfaceToArray(c_surf, dc), __LINE__);
t1 = clock();
mul <<<dimGrid, dimBlock>>> (N);
t2 = clock();
CUDA_SAFE_CALL(cudaMemcpyFromArray(c, dc, 0, 0, s, cudaMemcpyDeviceToHost), __LINE__);
double t3 = (double) t2 - (double) t1;
t3 = t3 / CLOCKS_PER_SEC;
printf("\n CUDA time :%lf", t3);
CUDA_SAFE_CALL(cudaFreeArray(da), __LINE__);
CUDA_SAFE_CALL(cudaFreeArray(db), __LINE__);
CUDA_SAFE_CALL(cudaFreeArray(dc), __LINE__);
}
Optimizing for caches is not a trivial matter, so a trivialized generalization such as this:
From whatever I have read till now, global memory is slower than surface memory. So use of surface memory should take less time.
is simply so broad as to be incorrect, in my opinion. It will frequently be true, but not always. The specifics matter, and proper programming practice matters, too.
Surface memory is nothing more than global memory with an intervening cache. But global memory (on all GPUs supported by current CUDA versions) already has support from L2 (and in some cases L1) cache(s).
The code you have proposed for test/comparison has a number of issues that I would point out:
Your timing methodology is incorrect. This:
t1 = clock();
mul <<<dimGrid, dimBlock>>> (N);
t2 = clock();
will time the duration of the kernel launch, not the duration of the kernel execution, so this is almost never the correct way to time things. We can fix it by putting a cudaDeviceSynchronize() call in the timing region to force completion of the kernel before the second timestamp is taken.
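An alternative (a sketch only, not used in the fixed code below) is to time the kernel with CUDA events, which measure execution on the device directly:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
mul <<<dimGrid, dimBlock>>> (N);
cudaEventRecord(stop);
cudaEventSynchronize(stop);   // wait for the kernel and the stop event to complete
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);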
This is a particularly bad construct if you are interested in performance:
dim3 dimBlock(1, 1);
because 31 out of every 32 threads in every GPU warp will be inactive, you are leaving 31/32 of the performance of the GPU unused. This has wide-ranging implications. I have no interest in studying the performance of such a scenario, and you shouldn't either (as it is not reflective of real-world performance on well-written codes), unless you are interested in microbenchmarking (not comparative benchmarking). So your code should be fixed to handle at least 32, and ideally 256 or more threads per block.
You've provided no "global memory" comparison case. So I shall provide one.
You've not stated many other factors important for comparative benchmarking, or perf analysis, such as the GPU and platform you are running on, as well as the compile command.
In my opinion, the problem size is too small. A matrix multiply of 100x100 matrices is barely enough work to occupy the GPU or test its performance limits, so I shall make the problem size larger.
With respect to the problem size argument, this is important for the cache discussion. First of all, the surface cache tends to be a spatially-optimized cache, whereas the ordinary L1 and L2 caches are linearly (cache-line) optimized. For very large 2D problems, the surface cache might give better behavior than the L2. But for very small problems, the difference will be less pronounced. Secondly, the surface cache is in addition to the L1 and L2 caches, so a good optimization strategy is to funnel some data through L1 and L2, and other data through surface, to maximize the available cache lines. In fact, since your input matrices are read-only, a further optimization might be to use textures rather than surface for those. But from a contrary point of view, if my problem is so small as to completely fit in the L2 cache, then the surface cache is not likely to give a significant improvement. Your original problem size included 3 matrices of 100x100 int quantities, so about 40Kbytes each, or 120K bytes total. This problem size will fit in the L2 cache of most GPUs. By increasing the problem size (as we shall see - to about 12MB total) we can severely handicap the global-memory-only case.
Here's a code and fully worked example, that has been modified to address most of the above issues. When I run this code on my Quadro5000 GPU on CUDA 7.5/Fedora 20, I observe the surface case to be about 8x faster than the global memory case:
$ cat t1129.cu
#include <stdio.h>
#include <iostream>
typedef int mytype;
const int blk_dim=16;
#define my_N 1000
#define A_VAL 1
#define B_VAL 2
surface < void, 2 > a_surf;
surface < void, 2 > b_surf;
surface < void, 2 > c_surf;
void CUDA_SAFE_CALL(cudaError_t call, int line) {
switch (call) {
case cudaSuccess:
break;
default:
printf("ERROR at line :%i.%d' ' %s\n",
line, call, cudaGetErrorString(call));
exit(-1);
break;
}
}
#ifdef USE_GLOBAL
__global__ void mul(const mytype * __restrict__ d_a, const mytype * __restrict__ d_b, mytype * __restrict__ d_c, const int N)
#else
__global__ void mul(const int N)
#endif
{
mytype a, b, c, temp;
int i;
unsigned int x = blockIdx.x * blockDim.x + (threadIdx.x);
unsigned int y = blockIdx.y * blockDim.y + (threadIdx.y);
if (x < N && y < N) {
temp = 0;
for (i = 0; i < N; i++) {
#ifdef USE_GLOBAL
a = d_a[x*N+i];
b = d_b[i*N+y];
#else
surf2Dread( & a, a_surf, (x) * sizeof(mytype), i);
surf2Dread( & b, b_surf, (i) * sizeof(mytype), y);
#endif
temp += a * b;
}
c = temp;
#ifdef USE_GLOBAL
d_c[x*N+y] = c;
#else
// Write to output surface
surf2Dwrite(c, c_surf, x * sizeof(mytype), y);
#endif
}
}
int main() {
const int N = my_N;
mytype *a, *b, *c, *d_a, *d_b, *d_c;
int i, j;
clock_t t1, t2;
cudaArray * da, * db, * dc;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc < mytype > ();
dim3 dimBlock(blk_dim, blk_dim);
dim3 dimGrid((N+dimBlock.x-1)/dimBlock.x, (N+dimBlock.y-1)/dimBlock.y);
int s = N * N * sizeof(mytype);
a = (mytype *)malloc(s);
b = (mytype *)malloc(s);
c = (mytype *)malloc(s);
CUDA_SAFE_CALL(cudaMalloc(&d_a, s), __LINE__);
CUDA_SAFE_CALL(cudaMalloc(&d_b, s), __LINE__);
CUDA_SAFE_CALL(cudaMalloc(&d_c, s), __LINE__);
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
a[i*N+j] = A_VAL;
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
b[i*N+j] = B_VAL;
CUDA_SAFE_CALL(cudaMallocArray( & da, & channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
CUDA_SAFE_CALL(cudaMallocArray( & db, & channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
CUDA_SAFE_CALL(cudaMallocArray( & dc, & channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
CUDA_SAFE_CALL(cudaMemcpyToArray(da, 0, 0, a, s, cudaMemcpyHostToDevice), __LINE__);
CUDA_SAFE_CALL(cudaMemcpyToArray(db, 0, 0, b, s, cudaMemcpyHostToDevice), __LINE__);
CUDA_SAFE_CALL(cudaBindSurfaceToArray(a_surf, da), __LINE__);
CUDA_SAFE_CALL(cudaBindSurfaceToArray(b_surf, db), __LINE__);
CUDA_SAFE_CALL(cudaBindSurfaceToArray(c_surf, dc), __LINE__);
#ifdef USE_GLOBAL
CUDA_SAFE_CALL(cudaMemcpy(d_a, a, s, cudaMemcpyHostToDevice), __LINE__);
CUDA_SAFE_CALL(cudaMemcpy(d_b, b, s, cudaMemcpyHostToDevice), __LINE__);
#endif
t1 = clock();
#ifdef USE_GLOBAL
mul <<<dimGrid, dimBlock>>> (d_a, d_b, d_c, N);
#else
mul <<<dimGrid, dimBlock>>> (N);
#endif
cudaDeviceSynchronize();
t2 = clock();
CUDA_SAFE_CALL(cudaMemcpyFromArray(c, dc, 0, 0, s, cudaMemcpyDeviceToHost), __LINE__);
#ifdef USE_GLOBAL
CUDA_SAFE_CALL(cudaMemcpy(c, d_c, s, cudaMemcpyDeviceToHost), __LINE__);
#endif
double t3 = (double) t2 - (double) t1;
t3 = t3 / CLOCKS_PER_SEC;
printf("\n CUDA time :%lf\n", t3);
for (i=0; i < N*N; i++)
if(c[i] != A_VAL*B_VAL*N) {std::cout << "mismatch at: " << i << ", was: " << c[i] << " should be: " << A_VAL*B_VAL*N << std::endl; return 1;}
CUDA_SAFE_CALL(cudaFreeArray(da), __LINE__);
CUDA_SAFE_CALL(cudaFreeArray(db), __LINE__);
CUDA_SAFE_CALL(cudaFreeArray(dc), __LINE__);
std::cout << "Success!" << std::endl;
return 0;
}
[bob#cluster1 misc]$ nvcc -O3 -o t1129 t1129.cu
[bob#cluster1 misc]$ ./t1129
CUDA time :0.028771
Success!
$ nvcc -O3 -DUSE_GLOBAL -o t1129 t1129.cu
$ ./t1129
CUDA time :0.243635
Success!
$
As a final note, there are many other optimizations we could talk about, which would probably shift the comparison one way or the other. But if you actually want to do fast matrix multiply operations, you should use CUBLAS. You should not write your own matrix multiply routines.
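For example, a single-precision multiply of two N x N matrices could look like this (a sketch only, assuming device arrays d_A, d_B and d_C of float in cuBLAS's column-major layout, unlike the int row-major arrays above):
cublasHandle_t handle;
cublasCreate(&handle);
const float alpha = 1.0f, beta = 0.0f;
// C = alpha*A*B + beta*C; all matrices N x N with leading dimension N
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
            &alpha, d_A, N, d_B, N, &beta, d_C, N);
cublasDestroy(handle);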

Unspecified launch failure after cudaDeviceSynchronize() call when program starts. But no errors using step-through debugging. CUDA

I've spent several hours struggling with unspecified launch failure.
I've come up with a tiny task for myself in order to understand how shared memory works.
The task is to divide the array [1, 2, 3, ..., N] into K groups of (N / K) elements and find the sum of each group. (The difference between the current and previous element of the array equals 1.)
I was planning to use N threads in the grid, divided between K blocks, so every thread block contains (N / K) threads. Thus one thread block could be used to compute the sum of one group. I also wanted to allocate the shared memory dynamically.
When I start the program I get an unspecified launch failure after the cudaDeviceSynchronize() call. But when I try step-through debugging, everything is OK and works fine.
What am I doing wrong? (Visual Studio 2012 Professional, Compute Capability 2.1) I would very much appreciate any help.
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
printf("%s\n",cudaGetErrorString(x)); \
system("pause"); \
return EXIT_FAILURE;}} while(0)
extern __shared__ double shrd[];
__global__ void kernel(double * a){
size_t threadID_block = blockDim.x * threadIdx.y + threadIdx.x;
size_t blockID_global = (gridDim.x * blockIdx.y + blockIdx.x );
size_t threadID_global = blockID_global * blockDim.x * blockDim.y + threadID_block;
double * temp = &shrd[blockID_global * blockDim.x * blockDim.y];
temp[threadID_block] = static_cast<double>(threadID_global);
__syncthreads();
if (threadID_block == 0){
a[blockID_global] = 0.0;
for (size_t index = 0; index < blockDim.x * blockDim.y; index++){
a[blockID_global] += temp[index];
}
}
}
int main(){
int devNum = 0;
CUDA_CALL(cudaGetDevice(&devNum));
CUDA_CALL(cudaSetDevice(devNum));
dim3 gridSize(2,2,1);
dim3 blockSize(4,4,1);
double * dev_a = NULL;
size_t length = gridSize.x * gridSize.y ;
size_t byteSize = length * sizeof(double);
CUDA_CALL(cudaMalloc(&dev_a,byteSize));
size_t shmem_perBlock = blockSize.x * blockSize.y * sizeof(double);
kernel <<< gridSize, blockSize, shmem_perBlock >>> (dev_a);
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaDeviceSynchronize());
double * a = new double [length];
CUDA_CALL(cudaMemcpy(a,dev_a,byteSize,cudaMemcpyDeviceToHost));
for (size_t index = 0; index < length; index++){
printf("%.3f\n",a[index]);
}
printf("\n");
CUDA_CALL(cudaFree(dev_a));
CUDA_CALL(cudaDeviceReset());
delete[]a;
system("pause");
return 0;
}
If you are on Kepler or later, first read this:
http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
Otherwise, if you are pre-Kepler, read this:
http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf
There are some fundamentals you are missing in terms of CUDA programming. I have given you a template of your code below. It is for clarification of some of these fundamentals. Do not expect this to be optimized as I am expecting you to program the parallel reduction. This will get you started with an understanding on how to use shared memory.
Good Luck!
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N 10000
#define K 100
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
printf("%s\n",cudaGetErrorString(x)); \
system("pause"); \
return EXIT_FAILURE;}} while(0)
__global__ void kernel(double* a, double* results){
extern __shared__ double shared[];
size_t tid, tid_local, stride;
tid = blockDim.x*blockIdx.x+threadIdx.x; //thread id within all blocks
tid_local = threadIdx.x; //thread id within a block
stride = blockDim.x*gridDim.x; //total number of threads
double *start = &a[K*blockIdx.x]; // each block handles K consecutive elements of a
shared[tid_local]=start[tid_local]; //copy K elements into shared memory
__syncthreads();
//Perform Parallel reduction, you will have to implement this
//After parallel reduction, result should be in shared[0]
//for demonstration I made the code serial for each block on thread 0.
//This is for demonstration only.
double sum=0;
if(tid_local==0){
for(int i=0; i<K; i++){
sum+=shared[i];
}
results[blockIdx.x]=sum;
}
}
int main(){
int devNum = 0;
CUDA_CALL(cudaGetDevice(&devNum));
CUDA_CALL(cudaSetDevice(devNum));
double * dev_a = NULL;
double * dev_results=NULL;
CUDA_CALL(cudaMalloc(&dev_a, N*sizeof(double) ));
CUDA_CALL(cudaMalloc(&dev_results, (N/K)*sizeof(double)));
//copy dev_a onto GPU (this is the array you are summing).
dim3 block_size(K, 1, 1);
dim3 grid_size (N/K, 1, 1);
size_t shmem_perBlock = K * sizeof(double);
kernel <<< grid_size, block_size, shmem_perBlock >>> (dev_a, dev_results);
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaDeviceSynchronize());
//copy dev_results back to CPU, this is your result.
CUDA_CALL(cudaFree(dev_a));
CUDA_CALL(cudaFree(dev_results));
system("pause");
return 0;
}
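As a hint for the reduction step the template leaves open, the serial loop on thread 0 could be replaced by a shared-memory tree reduction along these lines (a sketch only; it assumes blockDim.x is a power of two, which K = 100 above is not, so you would need to pad the block or adjust the loop):
// replaces the "serial for demonstration" loop in the kernel above
for (size_t s = blockDim.x / 2; s > 0; s >>= 1){
    if (tid_local < s) shared[tid_local] += shared[tid_local + s];
    __syncthreads();
}
if (tid_local == 0) results[blockIdx.x] = shared[0];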

Is there a cuda function to copy a row from a Matrix in column major?

I have a device matrix U of dimensions MxN in column-major ordering. Now I'd like to extract row K into a vector u. Is there a function to accomplish this? Note the copy would need to take into account an offset of K and a stride of M.
I was looking at the function cudaMemcpy2D, but it rings no bells; coming from a more LAPACK-style API, I don't understand what these pitch parameters are. Why are they not called simply rows and cols, or M and N?
You can use
cublas<t>copy(handle, N, U+K, M, u, 1);
as
#include<stdio.h>
#include<conio.h>
#include<assert.h>
#include<cublas_v2.h>
/***********************/
/* CUDA ERROR CHECKING */
/***********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*************************/
/* cuBLAS ERROR CHECKING */
/*************************/
#ifndef cublasSafeCall
#define cublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
#endif
inline void __cublasSafeCall(cublasStatus_t err, const char *file, const int line)
{
if( CUBLAS_STATUS_SUCCESS != err) {
fprintf(stderr, "CUBLAS error in file '%s', line %d\n \nerror %d \nterminating!\n",__FILE__, __LINE__,err);
getch(); cudaDeviceReset(); assert(0);
}
}
int main() {
const int M = 5;
const int N = 4;
const int K = 2;
cublasHandle_t handle;
cublasSafeCall(cublasCreate(&handle));
float* U = (float*)malloc(M*N*sizeof(float));
float* d_U;
gpuErrchk(cudaMalloc((void**)&d_U,M*N*sizeof(float)));
float* u = (float*)malloc(N*sizeof(float));
float* d_u;
gpuErrchk(cudaMalloc((void**)&d_u,N*sizeof(float)));
for (int j=0; j<N; j++)
for (int i=0; i<M; i++)
U[j*M+i] = (float)(i*j); // Column-major ordering
printf("K-th row - Input\n");
for (int j=0; j<N; j++) printf("U(K,%i) = %f\n",j,U[j*M+K]);
printf("\n\n");
gpuErrchk(cudaMemcpy(d_U,U,M*N*sizeof(float),cudaMemcpyHostToDevice));
cublasSafeCall(cublasScopy(handle, N, d_U+K, M, d_u, 1));
gpuErrchk(cudaMemcpy(u,d_u,N*sizeof(float),cudaMemcpyDeviceToHost));
printf("K-th row - Output\n");
for (int j=0; j<N; j++) printf("u(%i) = %f\n",j,u[j]);
getchar();
}
As #Farzad noted, the memory access pattern for the operation you want is inefficient, but other than that, what you want can be accomplished with a call to cudaMemcpy2D (assuming u and U are of type int):
cudaMemcpy2D((void*)u, sizeof(int), (void*)(U+K), sizeof(int)*M, sizeof(int), N, cudaMemcpyDeviceToDevice);
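To relate that call to the pitch parameters you asked about, here it is again with each argument annotated (same assumptions as above):
cudaMemcpy2D((void*)u,            // dst
             sizeof(int),         // dst pitch: consecutive elements of u are adjacent
             (void*)(U + K),      // src: first element of row K
             sizeof(int) * M,     // src pitch: column-major, so row elements are M apart
             sizeof(int),         // width in bytes of each "row" of the copy (one element)
             N,                   // number of "rows" to copy (the N elements of matrix row K)
             cudaMemcpyDeviceToDevice);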
The answer to the first part is no. The memory inside the GPU is linear, like on the host side. If you want to access only the row elements of a 2D matrix that is stored in column-major order, it will be costly because of non-coalesced accesses. Since GPU memory is organized in segments, every access to an element requires fetching not only the element itself but also its neighboring elements in the segment, which in column-major ordering happen to be mostly elements of the column the element resides in. Whereas if you store your matrix in row-major order and access the elements of a row, the GPU tries to merge the simultaneous memory requests into the minimum number of segment transactions.
cudaMallocPitch, which is preferred for storing 2D data, pads the memory allocation so that the starting address of each and every row/column of length width coincides with the starting address of a segment. As a result, when you access all the elements of a row/column, the number of fetched segments is minimized. The cost of using this method is wasted memory space.
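As a small illustration of the pitch idea (hypothetical dimensions, not from the question):
const int width = 1024, height = 1024;   // hypothetical matrix dimensions
float *d_mat;
size_t pitch;                            // pitch in bytes, chosen by the runtime (>= width*sizeof(float))
cudaMallocPitch((void**)&d_mat, &pitch, width * sizeof(float), height);
// element (row, col) is then addressed as:
// float val = *((float*)((char*)d_mat + row * pitch) + col);
cudaFree(d_mat);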