Unable to use cublasXt - cuda

I tried the following simple program, which uses cublasXt to multiply two matrices, and I get all-zero output. Can someone tell me why? My computer can use other CUDA libraries normally, and I have two GPUs. My machine is 64-bit, as cublasXt requires.
Btw, I've checked that none of the function calls in the program returns an error.
#include <stdio.h>
#include "cublasXt.h"
#include <curand.h>

void fill(double* &x, long m, long n, double val) {
    x = new double[m * n];
    for (long i = 0; i < m; ++i) {
        for (long j = 0; j < n; ++j) {
            x[i * n + j] = val;
        }
    }
}

int main() {
    cublasXtHandle_t xt_;
    cublasXtCreate(&xt_);
    double *A, *B, *C;
    long m = 10, n = 10, k = 20;
    fill(A, m, k, 0.2);
    fill(B, k, n, 0.3);
    fill(C, m, n, 0.0);
    double alpha = 1.0;
    double beta = 0.0;
    cublasXtDgemm(xt_, CUBLAS_OP_N, CUBLAS_OP_N,
                  m, n, k, &alpha, A, m, B, k, &beta, C, m);
    cudaDeviceSynchronize();
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            printf("%lf ", C[i * n + j]);
        }
        printf("\n");
    }
    cublasXtDestroy(xt_);
    return 0;
}

The first issue with your code is that you have no call to cublasXtDeviceSelect. This is a necessary part of a cublasXt program: it tells the CUBLAS runtime how many devices to use, and which devices to use.
As a simple proof point, try adding the following immediately after your handle creation call:
if(cublasXtCreate(&xt_) != CUBLAS_STATUS_SUCCESS) {printf("handle create fail\n"); return 1;}
int devices[1] = { 0 }; // add this line
if(cublasXtDeviceSelect(xt_, 1, devices) != CUBLAS_STATUS_SUCCESS) {printf("set devices fail\n"); return 1;} // add this line
This should cause your output to change from all zeros to all 1.2 (each element of C is the sum of k = 20 products of 0.2 × 0.3), although it is only using 1 GPU.
However, you will probably want to read the cublasXt section of the documentation (for example, if you want to use 2 GPUs, they must be of the correct type). The cublasXt multi-GPU functionality included in the toolkit at this time is limited to 2 devices (but note my comments below), and those 2 GPUs must be on a dual-GPU board, such as a Tesla K10 or GeForce GTX 690 (I think a Titan Z or Tesla K80 should also work, just to pick other examples).
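If you do have a supported two-GPU configuration, the only change needed is the device list passed to cublasXtDeviceSelect; a minimal sketch (assuming the pair enumerates as devices 0 and 1):
int devices[2] = { 0, 1 }; // assumes device IDs 0 and 1 form the supported pair
if(cublasXtDeviceSelect(xt_, 2, devices) != CUBLAS_STATUS_SUCCESS) {printf("set devices fail\n"); return 1;}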
Additional details of licensing are here. You can get an evaluation version of the "Premier" package that has fewer restrictions on GPUs.

Related

Issues of cuBLAS performance on a batched complex linear system solver

I am new to CUDA and cuBLAS, and recently I have been trying to use the batched cuBLAS API to solve multiple systems of linear equations. The size of each matrix is N, and the number of matrices (the batch size) is numOfMat. Here's my code:
#include <stdio.h>
#include <stdlib.h>
#include <cstdio>
#include <iostream>
#include <chrono>
#include <random>
#include <cuda.h>
#include <cusolverDn.h>
#include <cuda_runtime.h>
#include <cuComplex.h> // deal with complex numbers
#include <cuda_profiler_api.h>
using namespace std::chrono;

#define N 6
#define numOfMat 500000

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

int main() {
    std::random_device device;
    std::mt19937 generator(device());
    std::uniform_real_distribution<double> distribution(1., 5.);
    high_resolution_clock::time_point t1;
    high_resolution_clock::time_point t2;
    double duration = 0;
    double duration_1 = 0;

    // step 1: cuda solver initialization
    cublasHandle_t cublas_handle;
    cublasCreate_v2(&cublas_handle);
    cublasStatus_t stat;

    int* PivotArray;
    int* infoArray;
    cudaError_t cudaStatUnified1 = cudaSuccess;
    cudaError_t cudaStatUnified2 = cudaSuccess;
    const cuDoubleComplex alpha = make_cuDoubleComplex(1.0f, 0.0f);
    cudaStatUnified1 = cudaMallocManaged(&PivotArray, N * numOfMat * sizeof(int));
    cudaStatUnified2 = cudaMallocManaged(&infoArray, numOfMat * sizeof(int));
    if ((cudaSuccess != cudaStatUnified1) || (cudaSuccess != cudaStatUnified2))
        std::cout << "unified memory allocated unsuccessful!" << std::endl;

    // ALLOCATE MEMORY - using unified memory
    cuDoubleComplex** h_A;
    cudaMallocManaged(&h_A, sizeof(cuDoubleComplex*) * numOfMat);
    cudaMallocManaged(&(h_A[0]), sizeof(cuDoubleComplex) * numOfMat * N * N);
    for (int nm = 1; nm < numOfMat; nm++)
        h_A[nm] = h_A[nm-1] + N * N;

    cuDoubleComplex** h_b;
    cudaMallocManaged(&h_b, sizeof(cuDoubleComplex*) * numOfMat);
    cudaMallocManaged(&(h_b[0]), sizeof(cuDoubleComplex) * numOfMat * N);
    for (int nm = 1; nm < numOfMat; nm++)
        h_b[nm] = h_b[nm-1] + N;

    // FILL MATRICES
    for (int nm = 0; nm < numOfMat; nm++)
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++)
                h_A[nm][j * N + i] = make_cuDoubleComplex(distribution(generator), distribution(generator));

    // FILL COEFFICIENTS
    for (int nm = 0; nm < numOfMat; nm++)
        for (int i = 0; i < N; i++)
            h_b[nm][i] = make_cuDoubleComplex(distribution(generator), distribution(generator));

    t1 = high_resolution_clock::now();

    // step 2: Perform CUBLAS LU solver
    stat = cublasZgetrfBatched(cublas_handle, N, h_A, N, PivotArray, infoArray, numOfMat);
    if (stat != CUBLAS_STATUS_SUCCESS) printf("-data download failed");
    gpuErrchk( cudaDeviceSynchronize() );

    // check if the input matrix is singular
    /*for (int i = 0; i < numOfMat; i++)
        if (infoArray[i] != 0) {
            fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
        }*/

    // step 3: INVERT UPPER AND LOWER TRIANGULAR MATRICES
    // --- Function solves the triangular linear system with multiple RHSs
    // --- Function overrides b as a result
    stat = cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, h_A, N, h_b, N, numOfMat);
    if (stat != CUBLAS_STATUS_SUCCESS) printf("--data download failed");
    gpuErrchk( cudaDeviceSynchronize() );

    stat = cublasZtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, h_A, N, h_b, N, numOfMat);
    if (stat != CUBLAS_STATUS_SUCCESS) printf("---data download failed");
    gpuErrchk( cudaDeviceSynchronize() );

    t2 = high_resolution_clock::now();
    duration = duration_cast<microseconds>(t2 - t1).count();
    std::cout << duration << std::endl;
}
The code works fine, but when I plot the computational time versus the number of matrices, the curve looks like this:
My question is: why does the computational time scale linearly with the number of matrices? Intuitively, I would expect the curve to stay flat once the batch size becomes large enough, yet even at a batch size of 500,000 the time still appears linear in the batch size.
How can that be? Is there an explanation for this behaviour?
I think you need to look more closely at your data. If I run a modification of your code on Google Colab (Tesla T4) I get this:
Which looks largely like your figure. But look more closely (log scales help):
You can clearly see that up to a certain point (around 2^6 = 64 matrices), the runtime is largely independent of the number of matrices, but then the scaling is linear as sizes increase beyond that. That is the transition from being able to parallelize the whole workload to reaching parallel capacity and having to schedule many parallel groups of operations to execute the workload. You might infer that this particular GPU runs out of parallel capacity somewhere between 64 and 128 concurrent operations (the T4 has 40 SMs, so it might well be 80 if each SM can accommodate 2 of these operations concurrently), after which runtime scales with multiples of that limiting size.
This is completely normal behaviour for any parallel computation architecture I am familiar with.
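If you want to see this transition on your own hardware, a quick sketch (my suggestion, not the original benchmark) is to sweep the batch size and time only the factorization step, reusing the question's setup with h_A, PivotArray and infoArray allocated for the maximum batch size:

for (int batch = 64; batch <= numOfMat; batch *= 2) {
    // note: getrf factors in place, so strictly h_A should be refilled
    // between iterations; for a scaling plot the difference is negligible
    t1 = high_resolution_clock::now();
    stat = cublasZgetrfBatched(cublas_handle, N, h_A, N, PivotArray, infoArray, batch);
    gpuErrchk( cudaDeviceSynchronize() );
    t2 = high_resolution_clock::now();
    std::cout << batch << " " << duration_cast<microseconds>(t2 - t1).count() << std::endl;
}

Plotted on log-log axes, the flat region and the linear region are immediately visible.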

CUB reduction using 2D grid of blocks

I'm trying to make a sum using the CUB reduction method.
The big problem is:
I'm not sure how to return the values of each block to the Host when using 2-dimensional grids.
#include <iostream>
#include <math.h>
#include <cstring> // for strcpy
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <iomanip>

#define nat 1024
#define BLOCK_SIZE 32
#define GRID_SIZE 32

struct frame
{
    int natm;
    char title[100];
    float conf[nat][3];
};

using namespace std;
using namespace cub;

__global__
void add(frame* s, float L, float rc, float* blocksum)
{
    int i = blockDim.x*blockIdx.x + threadIdx.x;
    int j = blockDim.y*blockIdx.y + threadIdx.y;
    float E=0.0, rij, dx, dy, dz;

    // Your calculations first so that each thread holds its result
    dx = fabs(s->conf[j][0] - s->conf[i][0]);
    dy = fabs(s->conf[j][1] - s->conf[i][1]);
    dz = fabs(s->conf[j][2] - s->conf[i][2]);
    dx = dx - round(dx/L)*L;
    dy = dy - round(dy/L)*L;
    dz = dz - round(dz/L)*L;

    rij = sqrt(dx*dx + dy*dy + dz*dz);

    if ((rij <= rc) && (rij > 0.0))
        {E = (4*((1/pow(rij,12))-(1/pow(rij,6))));}

    // E = 1.0;
    __syncthreads();

    // Block wise reduction so that one thread in each block holds sum of thread results
    typedef cub::BlockReduce<float, BLOCK_SIZE, BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    float aggregate = BlockReduce(temp_storage).Sum(E);
    if (threadIdx.x == 0 && threadIdx.y == 0)
        blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;
}

int main(void)
{
    frame * state = (frame*)malloc(sizeof(frame));
    float *blocksum = (float*)malloc(GRID_SIZE*GRID_SIZE*sizeof(float));

    state->natm = nat; // initializing the number of atoms
    char name[] = "estado1";
    strcpy(state->title, name);

    for (int i = 0; i < nat; i++) {
        state->conf[i][0] = i;
        state->conf[i][1] = i;
        state->conf[i][2] = i;
    }

    frame * d_state;
    float *d_blocksum;

    cudaMalloc((void**)&d_state, sizeof(frame));
    cudaMalloc((void**)&d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)));

    cudaMemcpy(d_state, state, sizeof(frame), cudaMemcpyHostToDevice);

    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 gridBlock(GRID_SIZE, GRID_SIZE);

    add<<<gridBlock, dimBlock>>>(d_state, 3000, 15, d_blocksum);

    cudaError_t status = cudaMemcpy(blocksum, d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)), cudaMemcpyDeviceToHost);

    float Etotal = 0.0;
    for (int k = 0; k < GRID_SIZE*GRID_SIZE; k++) {
        Etotal += blocksum[k];
    }
    cout << endl << "energy: " << Etotal << endl;

    if (cudaSuccess != status)
    {
        cout << cudaGetErrorString(status) << endl;
    }

    // Free memory
    cudaFree(d_state);
    cudaFree(d_blocksum);

    return cudaThreadExit();
}
What is happening is that if the value of GRID_SIZE is the same as BLOCK_SIZE, as written above, the calculation is correct. But if I change the value of GRID_SIZE, the result goes wrong. Which leads me to think the error is in this line:
blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;
The idea here is to return a 1D array, which contains the sum of each block.
I do not intend to change the BLOCK_SIZE value, but the value of GRID_SIZE depends on the system I'm simulating; I intend to use values greater than 32 (always multiples of 32).
I looked for an example that uses a 2D grid with CUB, but did not find one.
I am really new to CUDA programming, so maybe I'm making a mistake.
Edit: I've included the complete code above.
For comparison, when I calculate these exact values with a serial program, it gives me energy: -297,121
Probably the main issue is that your output indexing is not correct. Here's a reduced version of your code demonstrating correct results for arbitrary GRID_SIZE:
$ cat t1360.cu
#include <stdio.h>
#include <cub/cub.cuh>

#define BLOCK_SIZE 32
#define GRID_SIZE 25

__global__
void add(float* blocksum)
{
    float E = 1.0;

    // Block wise reduction so that one thread in each block holds sum of thread results
    typedef cub::BlockReduce<float, BLOCK_SIZE, cub::BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    float aggregate = BlockReduce(temp_storage).Sum(E);
    __syncthreads();
    if (threadIdx.x == 0 && threadIdx.y == 0)
        blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate;
}

int main(){
    float *d_result, *h_result;
    h_result = (float *)malloc(GRID_SIZE*GRID_SIZE*sizeof(float));
    cudaMalloc(&d_result, GRID_SIZE*GRID_SIZE*sizeof(float));
    dim3 grid = dim3(GRID_SIZE, GRID_SIZE);
    dim3 block = dim3(BLOCK_SIZE, BLOCK_SIZE);
    add<<<grid, block>>>(d_result);
    cudaMemcpy(h_result, d_result, GRID_SIZE*GRID_SIZE*sizeof(float), cudaMemcpyDeviceToHost);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {printf("cuda error: %s\n", cudaGetErrorString(err)); return -1;}
    float result = 0;
    for (int i = 0; i < GRID_SIZE*GRID_SIZE; i++) result += h_result[i];
    if (result != (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE)) printf("mismatch, should be: %f, was: %f\n", (float)(GRID_SIZE*GRID_SIZE*BLOCK_SIZE*BLOCK_SIZE), result);
    else printf("Success\n");
    return 0;
}
$ nvcc -o t1360 t1360.cu
$ ./t1360
Success
$
The important change I made to your kernel code was in the output indexing:
blocksum[blockIdx.y*gridDim.x + blockIdx.x] = aggregate;
We want a simulated 2D index into an array that has width and height of GRID_SIZE consisting of one float quantity per point. Therefore the width of this array is given by gridDim.x (not blockDim). The gridDim variable gives the dimensions of the grid in terms of blocks - and this lines up exactly with how our results array is set up.
Your posted code will fail if GRID_SIZE and BLOCK_SIZE are different (for example, if GRID_SIZE were smaller than BLOCK_SIZE, cuda-memcheck will show illegal accesses, and if GRID_SIZE is larger than BLOCK_SIZE then this indexing error will result in blocks overwriting each other's values in the output array) because of this mixup between blockDim and gridDim.
Also note that float operations typically only have around 6 significant decimal digits of precision. So small differences in the 5th or 6th decimal place may be attributable to order-of-operations differences when doing floating-point arithmetic. You can prove this to yourself by switching to double arithmetic.
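A minimal sketch of that change in the kernel above (the blocksum array and the host-side accumulation would need the same type change):

// accumulate the per-thread value in double to reduce ordering effects
typedef cub::BlockReduce<double, BLOCK_SIZE, cub::BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
double aggregate = BlockReduce(temp_storage).Sum((double)E);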

Optimize vector matrix multiplication in cuda with large number of zeros

I am using the following kernel to optimize vector-matrix multiplication for the case where both the vector and the matrix contain a large number of zeros. Using this kernel can cut the time taken for such a multiplication to as little as half the time taken by cublasSgemv when more than 90% of the entries are zero, but it is still much slower than an equivalent BLAS gemm host call on Ubuntu 14.04.
vec = 1 x m, mat = m x m and prod = 1 x m; all are in row-major order
m >= 5000
__global__ void calc_v_m(float *vec, float *mat, float *prod, int m)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    if (x < m)
    {
        prod[x] = 0;
        for (int i = 0; i < m; i++)
        {
            int offset = i*m + x;
            if (mat[offset] != 0 && vec[i] != 0)
                prod[x] += vec[i] * mat[i*m+x];
        }
    }
}
What can be done to further enhance the performance of this kernel, apart from using libraries like cuSPARSE?
It would be nice if this optimization were compatible with compute capability 1.2.
Thanks
EDIT
Corrected: prod = 1 x m
GPU = Quadro FX 1800M, Cuda v.5.0 on Ubuntu 14.04
EDIT
Complete code that performs the multiplication using (i) BLAS, (ii) CUBLAS, (iii) the above kernel, for m = 6000. Please enter 0 when asked to enter a value.
#include <iostream>
#include <stdio.h>
#include <time.h>
#include <cblas.h>
#include <cublas_v2.h>
#include <math.h>
using namespace std;

const int m = 6000;
const int BS = 512; // threads per block
const int NB = ceil((float) m / BS); // number of blocks

__global__ void calc_v_m(float *vec, float *mat, float *prod, int m)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    if (x < m)
    {
        prod[x] = 0;
        for (int i = 0; i < m; i++)
        {
            int offset = i*m + x;
            if (mat[offset] != 0 && vec[i] != 0)
                prod[x] += vec[i] * mat[i*m+x];
        }
    }
}

int main()
{
    timespec blas_start, blas_end, cublas_start, cublas_end, opt_start, opt_end;
    long totalnsec; // total nanoseconds
    double totalsec, totaltime;
    int i, j;
    float *A = new float[m];   // 1 x m
    float *B = new float[m*m]; // m x m
    float *C = new float[m];   // 1 x m
    float input;

    cout << "Enter a value to populate the vector (0 to make it sparse) ";
    cin >> input;

    // input vector A: every 600th element is non-zero, i.e. ~90% zero
    for (i = 0; i < m; i++)
    {
        A[i] = input;
        if (i % 600 == 0) // adjust for sparsity
            A[i] = i;
    }

    // input matrix B: identity matrix
    for (i = 0; i < m; i++)
        for (j = 0; j < m; j++)
            B[j*m + i] = (i == j);

    // blas on host
    clock_gettime(CLOCK_REALTIME, &blas_start);
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 1, m, m, 1.0f, A, m, B, m, 0.0f, C, m);
    //cblas_sgemv(CblasRowMajor, CblasTrans, m, m, 1.0f, B, m, A, 1, 0.0f, C, 1);
    clock_gettime(CLOCK_REALTIME, &blas_end);
    /* for(i = 0; i < m; i++) printf("%f ", C[i]); */

    // cublas section
    cudaError_t cudaStat;
    cublasHandle_t handle;
    cublasCreate(&handle);

    float *A_d, *B_d, *C_d;
    cudaStat = cudaMalloc(&A_d, sizeof(float)*m);
    if (cudaStat != cudaSuccess) printf("Error Allocating Memory for A_d\n");
    cudaStat = cudaMalloc(&B_d, sizeof(float)*m*m);
    if (cudaStat != cudaSuccess) printf("Error Allocating Memory for B_d\n");
    cudaStat = cudaMalloc(&C_d, sizeof(float)*m);
    if (cudaStat != cudaSuccess) printf("Error Allocating Memory for C_d\n");

    cudaMemcpy(A_d, A, sizeof(float)*m, cudaMemcpyHostToDevice);
    cudaMemcpy(B_d, B, sizeof(float)*m*m, cudaMemcpyHostToDevice);

    float alpha = 1.0f, beta = 0.0f;
    cudaDeviceSynchronize();
    clock_gettime(CLOCK_REALTIME, &cublas_start);
    cublasSgemv(handle, CUBLAS_OP_N, m, m, &alpha, B_d, m, A_d, 1, &beta, C_d, 1);
    cudaDeviceSynchronize();
    clock_gettime(CLOCK_REALTIME, &cublas_end);
    cudaMemcpy(C, C_d, sizeof(float)*m, cudaMemcpyDeviceToHost);
    /* for(i = 0; i < m; i++) printf("%f ", C[i]); */

    // Call kernel having Optimization for Zeros
    cudaDeviceSynchronize();
    clock_gettime(CLOCK_REALTIME, &opt_start);
    /////////////////// call kernel //////////////////
    calc_v_m<<<NB, BS>>>(A_d, B_d, C_d, m);
    //////////////////////////////////////////////////
    cudaDeviceSynchronize();
    clock_gettime(CLOCK_REALTIME, &opt_end);
    cudaMemcpy(C, C_d, sizeof(float)*m, cudaMemcpyDeviceToHost);
    /* for(i = 0; i < m; i++) printf("%f ", C[i]); */

    // Print times
    // blas time
    totalsec = (double)blas_end.tv_sec - (double)blas_start.tv_sec;
    totalnsec = blas_end.tv_nsec - blas_start.tv_nsec;
    if (totalnsec < 0)
    {
        totalnsec += 1e9;
        totalsec -= 1;
    }
    totaltime = totalsec + (double)totalnsec*1e-9;
    cout << "blas Time = " << totaltime << "\n";

    // cublas time
    totalsec = (double)cublas_end.tv_sec - (double)cublas_start.tv_sec;
    totalnsec = cublas_end.tv_nsec - cublas_start.tv_nsec;
    if (totalnsec < 0)
    {
        totalnsec += 1e9;
        totalsec -= 1;
    }
    totaltime = totalsec + (double)totalnsec*1e-9;
    cout << "cublas Time = " << totaltime << "\n";

    // optimized kernel time
    totalsec = (double)opt_end.tv_sec - (double)opt_start.tv_sec;
    totalnsec = opt_end.tv_nsec - opt_start.tv_nsec;
    if (totalnsec < 0)
    {
        totalnsec += 1e9;
        totalsec -= 1;
    }
    totaltime = totalsec + (double)totalnsec*1e-9;
    cout << "Opt Kernel Time = " << totaltime << "\n";

    return 0;
}
Results
$ nvcc -arch=sm_12 blascomp.cu -o blascomp.o -lblas -lcublas
$ ./blascomp.o
Enter a value to populate the vector (0 to make it sparse) 0
blas Time = 0.000105207
cublas Time = 0.0070294
Opt Kernel Time = 0.00642797
At least on my system, BLAS is still the fastest for such a scenario.
Things get even more interesting if every 1200th element, instead of every 600th, is set non-zero:
Enter a value to populate the vector (0 to make it sparse) 0
blas Time = 7.84e-05
cublas Time = 0.00698783
Opt Kernel Time = 0.00643042
The important thing to recognise here is that the gemv operation you are concerned with is fundamentally memory throughput limited on GPUs, rather than compute throughput limited. This implies that an "optimisation" as you have shown in your kernel:
__global__ void calc_v_m(float *vec, float *mat, float *prod, int m)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    if (x < m)
    {
        prod[x] = 0;
        for (int i = 0; i < m; i++)
        {
            int offset = i*m + x;
            if (mat[offset] != 0 && vec[i] != 0)
                prod[x] += vec[i] * mat[i*m+x];
        }
    }
}
isn't really an optimisation at all, simply because the memory transactions are the performance bottleneck in the kernel, not the floating point arithmetic, and your code must perform most of the memory transactions regardless of whether the multiply-add operation will be skipped because of zero detection.
Consider the following, instrumented version of roughly the same code:
__constant__ float cvec1[2];

__global__ void
__launch_bounds__(512, 4)
calc_v_m1(const float* __restrict__ vec,
          const float* __restrict__ mat,
          float* __restrict__ prod,
          int m,
          int do_reads = 1,
          int do_write = 1)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    if (x < m)
    {
        float res = 0;
        float mval = cvec1[0], vval = cvec1[1];
#pragma unroll 8
        for (int i = 0; i < m; i++)
        {
            int offset = i*m + x;
            if (do_reads) {
                mval = mat[offset];
                vval = vec[i];
            }
            res += mval * vval;
        }
        if (do_write) prod[x] = res;
    }
}
Here I have added two optional arguments which control whether the kernel will load from global memory, and whether the kernel will store to global memory. This allows me to quantify the performance impact of the memory loads, computation, and memory stores independently. The results using your test code are instructive:
Function                              nvprof time
-------------------------------------------------
cublasSgemv                           942.75us
calc_v_m                              2798.4us
calc_v_m1(do_reads=1, do_write=1)     962.40us
calc_v_m1(do_reads=1, do_write=0)     970.40us
calc_v_m1(do_reads=0, do_write=1)     55.166us
calc_v_m1(do_reads=0, do_write=0)     55.102us
[All benchmarking done on a GTX970 using the CUDA 7.5 release toolchain and CUBLAS 7.5 library]
In no particular order:
The full instrumented kernel runtime is within a few percent of the equivalent CUBLAS call
The memory fetches from global memory are the bottleneck
The actual computations in the kernel only constitute 5% of the kernel running time
The "fire-and-forget" nature of write operations in CUDA means that the latency of the write has no significant effect on throughput.
Your "optimised" kernel is considerably slower than either CUBLAS or the instrumented kernel, probably because all you are introducing is branch divergence without addressing the source of the kernel bottleneck (the latency of the memory loads).
The only time conditionally executing the FMAD operation would make sense is on an architecture where memory had near-zero latency and floating point throughput was severely constrained. The GPU definitely doesn't fall into that category.
The only other option for optimising this would be to exploit a priori information about the sparsity patterns in the LHS matrix to remove the need to read zero entries. Which is precisely what sparse matrix formats and linear algebra codes are designed to accommodate.
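For illustration only, here is a minimal sketch of such a kernel for the CSR format, where the matrix is stored as row pointers, column indices and the non-zero values only, so zero entries are never fetched from memory at all (this computes y = A * x with one thread per row; the vector-matrix product in the question is the same operation with the matrix stored transposed):

__global__ void csr_spmv(const int *row_ptr, const int *col_idx,
                         const float *val, const float *x, float *y, int m)
{
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    if (row < m)
    {
        float sum = 0.0f;
        // only the stored (non-zero) entries of this row are visited
        for (int k = row_ptr[row]; k < row_ptr[row + 1]; k++)
            sum += val[k] * x[col_idx[k]];
        y[row] = sum;
    }
}

This is untuned (a real implementation would also vectorize across each row), but it shows where the saving actually comes from: fewer memory transactions, not fewer FMADs.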

Surface memory takes more time(twice) than global memory

I am working on optimizing a CUDA program, and I started with a matrix multiplication program. The threading scheme I have used for parallelization is Blocksize(1, 1), Gridsize(N, N). I am using surface memory for memory optimization (shared memory is not possible with this threading scheme). When I compare the time before and after this optimization, I find that execution takes twice as long after using surface memory (I have tried different threading schemes, but the problem remains the same). From whatever I have read till now, global memory is slower than surface memory, so using surface memory should take less time. Below is the matrix multiplication program using surface memory. Can somebody tell me what the problem is?
#include <stdio.h>
#include <cuda.h>
//#define N 3

surface<void, 2> a_surf;
surface<void, 2> b_surf;
surface<void, 2> c_surf;

void CUDA_SAFE_CALL(cudaError_t call, int line) {
    switch (call) {
    case cudaSuccess:
        break;
    default:
        printf("ERROR at line :%i.%d' ' %s\n",
               line, call, cudaGetErrorString(call));
        exit(-1);
        break;
    }
}

__global__ void mul(int N) {
    int a, b, c, temp;
    int i;
    unsigned int x = blockIdx.x * blockDim.x + (threadIdx.x);
    unsigned int y = blockIdx.y * blockDim.y + (threadIdx.y);
    if (x < N && y < N) {
        temp = 0;
        for (i = 0; i < N; i++) {
            surf2Dread(&a, a_surf, (x) * 4, i);
            surf2Dread(&b, b_surf, (i) * 4, y);
            temp += a * b;
        }
        c = temp;
        // Write to output surface
        surf2Dwrite(c, c_surf, x * 4, y);
    }
}

int main() {
    int N = 100;
    int a[N][N], b[N][N], c[N][N];
    int i, j;
    int temp;
    clock_t t1, t2;
    cudaArray *da, *db, *dc;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<int>();
    dim3 dimBlock(1, 1);
    dim3 dimGrid(N, N);

    temp = 0;
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            a[i][j] = ++temp;

    temp = 0;
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            b[i][j] = ++temp;

    CUDA_SAFE_CALL(cudaMallocArray(&da, &channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
    CUDA_SAFE_CALL(cudaMallocArray(&db, &channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
    CUDA_SAFE_CALL(cudaMallocArray(&dc, &channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);

    int s = N * N * sizeof(int);
    CUDA_SAFE_CALL(cudaMemcpyToArray(da, 0, 0, a, s, cudaMemcpyHostToDevice), __LINE__);
    CUDA_SAFE_CALL(cudaMemcpyToArray(db, 0, 0, b, s, cudaMemcpyHostToDevice), __LINE__);

    CUDA_SAFE_CALL(cudaBindSurfaceToArray(a_surf, da), __LINE__);
    CUDA_SAFE_CALL(cudaBindSurfaceToArray(b_surf, db), __LINE__);
    CUDA_SAFE_CALL(cudaBindSurfaceToArray(c_surf, dc), __LINE__);

    t1 = clock();
    mul<<<dimGrid, dimBlock>>>(N);
    t2 = clock();

    CUDA_SAFE_CALL(cudaMemcpyFromArray(c, dc, 0, 0, s, cudaMemcpyDeviceToHost), __LINE__);

    double t3 = (double) t2 - (double) t1;
    t3 = t3 / CLOCKS_PER_SEC;
    printf("\n CUDA time :%lf", t3);

    CUDA_SAFE_CALL(cudaFreeArray(da), __LINE__);
    CUDA_SAFE_CALL(cudaFreeArray(db), __LINE__);
    CUDA_SAFE_CALL(cudaFreeArray(dc), __LINE__);
}
Optimizing for caches is not a trivial matter, so a trivialized generalization such as this:
From whatever I have read till now, global memory is slower than surface memory. So use of surface memory should take less time.
is simply so broad as to be incorrect, in my opinion. It will frequently be true, but not always. The specifics matter, and proper programming practice matters, too.
Surface memory is nothing more than global memory with an intervening cache. But global memory (on all GPUs supported by current CUDA versions) already has support from L2 (and in some cases L1) cache(s).
The code you have proposed for test/comparison has a number of issues that I would point out:
Your timing methodology is incorrect. This:
t1 = clock();
mul <<<dimGrid, dimBlock>>> (N);
t2 = clock();
will time the duration of the kernel launch, not the duration of the kernel execution, so this is almost never the correct way to time things. We can fix it by putting a cudaDeviceSynchronize() call in the timing region, to force completion of the kernel before timing closure.
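That is, the timed region becomes:
t1 = clock();
mul <<<dimGrid, dimBlock>>> (N);
cudaDeviceSynchronize(); // wait for the kernel to actually finish
t2 = clock();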
This is a particularly bad construct if you are interested in performance:
dim3 dimBlock(1, 1);
Because 31 out of every 32 threads in every GPU warp will be inactive, you are leaving 31/32 of the performance of the GPU unused. This has wide-ranging implications. I have no interest in studying the performance of such a scenario, and you shouldn't either (as it is not reflective of real-world performance of well-written codes), unless you are interested in microbenchmarking (not comparative benchmarking). So your code should be fixed to use at least 32, and ideally 256 or more, threads per block.
You've provided no "global memory" comparison case. So I shall provide one.
You've not stated many other factors important for comparative benchmarking, or perf analysis, such as the GPU and platform you are running on, as well as the compile command.
In my opinion, the problem size is too small. A matrix multiply of 100x100 matrices is on the edge of a code that could reasonably occupy the GPU or test its performance limits. So I shall make the problem size larger.
With respect to the problem size argument, this is important for the cache discussion. First of all, the surface cache tends to be a spatially-optimized cache, whereas the ordinary L1 and L2 caches are linearly (cache-line) optimized. For very large 2D problems, the surface cache might give better behavior than the L2. But for very small problems, the difference will be less pronounced. Secondly, the surface cache is in addition to the L1 and L2 caches, so a good optimization strategy is to funnel some data through L1 and L2, and other data through surface, to maximize the available cache lines. In fact, since your input matrices are read-only, a further optimization might be to use textures rather than surface for those. But from a contrary point of view, if my problem is so small as to completely fit in the L2 cache, then the surface cache is not likely to give a significant improvement. Your original problem size included 3 matrices of 100x100 int quantities, so about 40Kbytes each, or 120K bytes total. This problem size will fit in the L2 cache of most GPUs. By increasing the problem size (as we shall see - to about 12MB total) we can severely handicap the global-memory-only case.
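As a sketch of that last idea (my suggestion, not something measured below), the texture object API could be used to bind the same cudaArray to a texture and fetch the read-only inputs through the texture path:

// Hypothetical read-only path for input array 'da'; note that tex2D takes
// element coordinates, not the byte x-coordinate that surf2Dread uses.
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = da;
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.filterMode = cudaFilterModePoint;
texDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t a_tex = 0;
cudaCreateTextureObject(&a_tex, &resDesc, &texDesc, NULL);
// in the kernel (a_tex passed as a kernel parameter):
//     a = tex2D<mytype>(a_tex, x, i);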
Here's a code and fully worked example, that has been modified to address most of the above issues. When I run this code on my Quadro5000 GPU on CUDA 7.5/Fedora 20, I observe the surface case to be about 8x faster than the global memory case:
$ cat t1129.cu
#include <stdio.h>
#include <iostream>

typedef int mytype;
const int blk_dim = 16;
#define my_N 1000
#define A_VAL 1
#define B_VAL 2

surface<void, 2> a_surf;
surface<void, 2> b_surf;
surface<void, 2> c_surf;

void CUDA_SAFE_CALL(cudaError_t call, int line) {
    switch (call) {
    case cudaSuccess:
        break;
    default:
        printf("ERROR at line :%i.%d' ' %s\n",
               line, call, cudaGetErrorString(call));
        exit(-1);
        break;
    }
}

#ifdef USE_GLOBAL
__global__ void mul(const mytype * __restrict__ d_a, const mytype * __restrict__ d_b, mytype * __restrict__ d_c, const int N)
#else
__global__ void mul(const int N)
#endif
{
    mytype a, b, c, temp;
    int i;
    unsigned int x = blockIdx.x * blockDim.x + (threadIdx.x);
    unsigned int y = blockIdx.y * blockDim.y + (threadIdx.y);
    if (x < N && y < N) {
        temp = 0;
        for (i = 0; i < N; i++) {
#ifdef USE_GLOBAL
            a = d_a[x*N+i];
            b = d_b[i*N+y];
#else
            surf2Dread(&a, a_surf, (x) * sizeof(mytype), i);
            surf2Dread(&b, b_surf, (i) * sizeof(mytype), y);
#endif
            temp += a * b;
        }
        c = temp;
#ifdef USE_GLOBAL
        d_c[x*N+y] = c;
#else
        // Write to output surface
        surf2Dwrite(c, c_surf, x * sizeof(mytype), y);
#endif
    }
}

int main() {
    const int N = my_N;
    mytype *a, *b, *c, *d_a, *d_b, *d_c;
    int i, j;
    clock_t t1, t2;
    cudaArray *da, *db, *dc;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<mytype>();
    dim3 dimBlock(blk_dim, blk_dim);
    dim3 dimGrid((N+dimBlock.x-1)/dimBlock.x, (N+dimBlock.y-1)/dimBlock.y);
    int s = N * N * sizeof(mytype);
    a = (mytype *)malloc(s);
    b = (mytype *)malloc(s);
    c = (mytype *)malloc(s);
    CUDA_SAFE_CALL(cudaMalloc(&d_a, s), __LINE__);
    CUDA_SAFE_CALL(cudaMalloc(&d_b, s), __LINE__);
    CUDA_SAFE_CALL(cudaMalloc(&d_c, s), __LINE__);

    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            a[i*N+j] = A_VAL;

    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            b[i*N+j] = B_VAL;

    CUDA_SAFE_CALL(cudaMallocArray(&da, &channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
    CUDA_SAFE_CALL(cudaMallocArray(&db, &channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
    CUDA_SAFE_CALL(cudaMallocArray(&dc, &channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);

    CUDA_SAFE_CALL(cudaMemcpyToArray(da, 0, 0, a, s, cudaMemcpyHostToDevice), __LINE__);
    CUDA_SAFE_CALL(cudaMemcpyToArray(db, 0, 0, b, s, cudaMemcpyHostToDevice), __LINE__);

    CUDA_SAFE_CALL(cudaBindSurfaceToArray(a_surf, da), __LINE__);
    CUDA_SAFE_CALL(cudaBindSurfaceToArray(b_surf, db), __LINE__);
    CUDA_SAFE_CALL(cudaBindSurfaceToArray(c_surf, dc), __LINE__);

#ifdef USE_GLOBAL
    CUDA_SAFE_CALL(cudaMemcpy(d_a, a, s, cudaMemcpyHostToDevice), __LINE__);
    CUDA_SAFE_CALL(cudaMemcpy(d_b, b, s, cudaMemcpyHostToDevice), __LINE__);
#endif

    t1 = clock();
#ifdef USE_GLOBAL
    mul<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
#else
    mul<<<dimGrid, dimBlock>>>(N);
#endif
    cudaDeviceSynchronize();
    t2 = clock();

    CUDA_SAFE_CALL(cudaMemcpyFromArray(c, dc, 0, 0, s, cudaMemcpyDeviceToHost), __LINE__);
#ifdef USE_GLOBAL
    CUDA_SAFE_CALL(cudaMemcpy(c, d_c, s, cudaMemcpyDeviceToHost), __LINE__);
#endif

    double t3 = (double) t2 - (double) t1;
    t3 = t3 / CLOCKS_PER_SEC;
    printf("\n CUDA time :%lf\n", t3);

    for (i = 0; i < N*N; i++)
        if (c[i] != A_VAL*B_VAL*N) {std::cout << "mismatch at: " << i << ", was: " << c[i] << " should be: " << A_VAL*B_VAL*N << std::endl; return 1;}

    CUDA_SAFE_CALL(cudaFreeArray(da), __LINE__);
    CUDA_SAFE_CALL(cudaFreeArray(db), __LINE__);
    CUDA_SAFE_CALL(cudaFreeArray(dc), __LINE__);
    std::cout << "Success!" << std::endl;
    return 0;
}
$ nvcc -O3 -o t1129 t1129.cu
$ ./t1129
CUDA time :0.028771
Success!
$ nvcc -O3 -DUSE_GLOBAL -o t1129 t1129.cu
$ ./t1129
CUDA time :0.243635
Success!
$
As a final note, there are many other optimizations we could talk about, which would probably shift the comparison one way or the other. But if you actually want to do fast matrix multiply operations, you should use CUBLAS. You should not write your own matrix multiply routines.
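For reference, a minimal sketch of the equivalent CUBLAS call, assuming float matrices d_a, d_b, d_c of size N x N already resident on the device (CUBLAS does not provide an integer gemm, so the int data of the example above would need to become float):

cublasHandle_t handle;
cublasCreate(&handle);
const float one = 1.0f, zero = 0.0f;
// CUBLAS is column-major; a row-major C = A * B can be computed by
// requesting B * A, which CUBLAS sees as C^T = B^T * A^T
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
            &one, d_b, N, d_a, N, &zero, d_c, N);
cublasDestroy(handle);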

Cuda Summation per block. I get 0 returned to the sums. What is wrong?

I tried summation in CUDA. I can't find what I did wrong here; the sum always comes back as 0. Can anyone help?
The __shared__ qualifier makes a variable common to all threads in a block, so I tried to sum one block at a time and finally sum up the per-block results for the overall sum. But the sum doesn't work per block, and I am stuck.
Can anyone help?
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <stdlib.h>

//#define BLOCK_SIZE 32 // size of vectors

__global__ void add(float *i_data, float *sum) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ float s_data;
    s_data = 0;
    // must be synchronized
    __syncthreads();
    // reduce and sum
    // typical in GPU computings
    for (int i = 0; i < blockDim.x; i++)
    {
        __syncthreads();
        if (tid <= i)
        {
            //s_data[blockIdx.x]+ = s_data[tid] + s_data[i+tid];
            s_data += i_data[tid];
        }
    }
    if (tid == 0)
        sum[blockIdx.x] = s_data;
}

int main() {
    int T = 10, B = 5;      // threads per block and blocks per grid
    float *a, *b;           // host pointers
    float *dev_a, *dev_b;   // device pointers to host memory
    int sizeIN = T*B*sizeof(int);
    int sizeOUT = B*sizeof(int);
    a = new float[T*B];
    b = new float[B];
    for (int i = 0; i < B; i++)
    {
        for (int j = 0; j < T; j++)
        {
            a[i*T+j] = i;
        }
    }
    for (int i = 0; i < B; i++)
    {
        b[i] = 0;
    }
    cudaMalloc((void **) &dev_a, sizeIN);
    cudaMalloc((void **) &dev_b, sizeOUT);
    cudaMemcpy(dev_a, a, sizeIN, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, sizeOUT, cudaMemcpyHostToDevice);
    add<<<B, T>>>(dev_a, dev_b);
    cudaMemcpy(a, dev_a, sizeIN, cudaMemcpyDeviceToHost);
    cudaMemcpy(b, dev_b, sizeOUT, cudaMemcpyDeviceToHost);
    for (int i = 0; i < B; i++)
    {
        for (int j = 0; j < T; j++)
        {
            std::cout << a[i*T+j] << "\t";
            std::cout << std::endl;
        }
        std::cout << std::endl << std::endl << "sum is: " << b[i] << std::endl;
    }
    std::cout << std::endl << std::endl;
    cudaFree(dev_a);
    cudaFree(dev_b);
    free(a);
    free(b);
    return 0;
}
This is wrong in 2 ways:
if (tid = 0)
First, you should be doing a comparison (==), not an assignment (=). I don't know why your compiler didn't warn you about this.
Second, tid is only zero for one thread in the entire grid:
int tid = blockIdx.x * blockDim.x + threadIdx.x;
You want one thread in each block to write the block result out to global memory:
if (threadIdx.x == 0)
This is also a problem, similarly:
if (tid <= i)
This is only satisfied for threads in the first block. Beyond that, I have to start to guess at what you want. I guess you're trying to sum the values in each block. Your construction is not a parallel reduction, but to make the minimum changes to get it "functional" I would rewrite the end of your kernel like this:
    // reduce and sum
    // typical in GPU computings
    for (int i = 0; i < blockDim.x; i++)
    {
        if (threadIdx.x == i)
        {
            //s_data[blockIdx.x]+ = s_data[tid] + s_data[i+tid];
            s_data += i_data[tid];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0)
        sum[blockIdx.x] = s_data;
}
Although you didn't have any CUDA API errors, it's good practice to use proper CUDA error checking, and also to run your code with cuda-memcheck, any time you are having trouble with a CUDA code.
As I mentioned, your code above is not a classical reduction; it's just an unoptimized for-loop.
To learn about CUDA parallel reduction, study the CUDA reduction sample code and the accompanying presentation; there are also many examples here on the CUDA tag on SO that you can search for.
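As a pointer in that direction, here is a minimal sketch of a classical shared-memory tree reduction (an illustration, not your code: it assumes the block size is a power of 2 and the input length is a multiple of the block size):

__global__ void block_sum(const float *i_data, float *sum)
{
    extern __shared__ float s_data[]; // one float per thread in the block
    unsigned int t = threadIdx.x;
    s_data[t] = i_data[blockIdx.x * blockDim.x + t];
    __syncthreads();
    // pairwise tree reduction: log2(blockDim.x) steps instead of blockDim.x
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (t < stride)
            s_data[t] += s_data[t + stride];
        __syncthreads();
    }
    if (t == 0)
        sum[blockIdx.x] = s_data[0];
}
// launch: block_sum<<<B, T, T*sizeof(float)>>>(dev_a, dev_b); // T must be a power of 2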