Solve Ax=b using CUDA, works for matrix sizes less than 128x128

I wrote code that uses the GPU to do the forward solve (obtaining the U matrix from A = LU, where the diagonal entries of U are not set to unity). My code works fine for matrices of dimension less than 128x128 that are a multiple of the block size, which is 16. I am puzzled as to why larger matrices give me the wrong result. Something goes wrong in the last row of blocks.
------------ gaussian.h --------------------
// Thread block size
#define BLOCK_SIZE 16 //for forward solve A = LU
#define BLOCK_SIZE2 32 //for finding the pivot factor
// A[i,j] = A[i,j] - A[k,j] / A[k,k] * A[i,k]
// pivot factor = A[k,j] / A[k,k]
////////////////////////////////////////////////////////////////////////////////////////
------------ gaussian.cu --------------------
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <cuda.h>
#include "gaussian_kernel.cu"
#define OUTPUT
void runTest(int argc, char** argv);
double gettime() {
struct timeval t;
gettimeofday(&t,NULL);
return t.tv_sec+t.tv_usec*1e-6;
}
int main(int argc, char** argv)
{
runTest(argc, argv);
}
void runTest(int argc, char** argv)
{
int dim;
if (argc == 2)
{
dim = atoi(argv[1]);
}
else{
printf("Wrong Usage\n");
exit(1);
}
// Note: A is a square matrix of size NxN where N % BLOCK_SIZE = 0
// Every row of A has a pivot column
// It is known that all the arithmetic operations involved for
// Gaussian Elimination are of type int (not float)
// allocate host memory for matrix A augmented with vector b (Ax=b)
unsigned int size_A = dim * (dim + BLOCK_SIZE);
unsigned int mem_size_A = sizeof(int) * size_A;
int* h_A = (int*) malloc(mem_size_A);
// initialize host memory, generate a test case such as below
// augmented matrix A with padding to make the size evenly divisible by BLOCK_SIZE
// ----A----- | b --padding--
// 1 1 1 1 .. | 0 * * * ..
// 1 2 2 2 .. | 1 * * * ..
// 1 2 3 3 .. | 2 * * * ..
// 1 2 3 4 .. | 3 * * * ..
// .......... | ...
// * means uninitialized entry
// A is size NxN
// Augmented matrix with padding is size Nx(N+BLOCK_SIZE)
int dimRow = dim;
int dimCol = dim + BLOCK_SIZE;
for(int i = 0; i < dim; i++)
{
h_A[(i + 1) * dimCol - BLOCK_SIZE] = i; // b vector stored in (N+1)th column
// of augmented matrix
for (int j = 0; j < dimRow - i; j++)
{
h_A[j + i + i * dimCol] = i + 1; // A[i][j] entry
h_A[j * dimCol + i + i * dimCol] = i + 1; // A[j][i] entry
}
}
//display the test case [ A | b ]
for ( int m = 0 ; m < dimRow; m++)
{
for ( int n = 0 ; n <= dimRow; n++)
//for ( int n = 0 ; n < dimCol; n++)
{
printf("%d ", h_A[m * dimCol + n]);
}
printf("\n");
}
// allocate device memory for the augmented matrix A
int* d_A;
cudaMalloc(&d_A, mem_size_A);
// start timer
double timer1 = gettime();
// copy host memory to device
cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
// setup execution parameters for obtaining the pivot factor
int gridP = (dimRow / BLOCK_SIZE2) + ( (dimRow % BLOCK_SIZE2) == 0 ? 0 : 1 );
// add 1 if dimRow/BLOCK_SIZE2 is not evenly divisible
// setup execution parameters for the forward solve (Gaussian Elimination)
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(dimCol / threads.x, dimRow / threads.y);
// execute the kernel
for ( int i = 0 ; i < dim; i++)
{
Gaussian_Pivot_CUDA<<< gridP, BLOCK_SIZE2 >>>(d_A, dimCol, i);
Gaussian_Elim_CUDA<<< grid, threads >>>(d_A, dimCol, i);
}
// copy result from device to host
cudaMemcpy(h_A, d_A, mem_size_A, cudaMemcpyDeviceToHost);
// stop timer
double timer2 = gettime();
printf("GPU time = %lf\n",(timer2-timer1)*1000);
// Back substitution
int X[dimRow];
for (int row = dimRow - 1; row >= 0; row--)
{
X[row] = h_A[(row + 1) * dimCol - BLOCK_SIZE]; // b[row] entry
for (int col = dimRow - 1; col > row; col--)
{
X[row] -= h_A[row * dimCol + col] * X[col];
}
X[row] /= h_A[row * dimCol + row];
printf("X[%d] = %d\t",row,X[row]);
}
printf("\n");
#ifdef OUTPUT
// result of Gaussian Elimination
// ----A----- | b --padding--
// 1 1 1 1 .. | 0 * * * ..
// 0 1 1 1 .. | 1 * * * ..
// 0 0 1 1 .. | 1 * * * ..
// 0 0 0 1 .. | 1 * * * ..
// .......... | ...
// * means garbage entry
for ( int m = 0 ; m < dimRow; m++)
{
for ( int n = 0 ; n <= dimRow; n++)
{
printf("%d ", h_A[m * dimCol + n]);
}
printf("\n");
}
#endif
free(h_A);
cudaFree(d_A);
}
///////////////////////////////////////////////////////////////////////////////
------------ gaussian_kernel.cu --------------------
#include "gaussian.h"
__global__
void Gaussian_Pivot_CUDA(int* A, int widthA, int Pcol)
{
// find the pivot factor
// A[i,j] = A[i,j] - A[k,j] / A[k,k] * A[i,k]
// pivot factor = A[k,j] / A[k,k]
int bx = blockIdx.x;
int tx = threadIdx.x;
int index_row = BLOCK_SIZE2 * bx + tx;
int index_rowA = index_row * widthA;
__shared__ int A_RowRow;
if (tx == 0)
{
A_RowRow = A[Pcol * widthA + Pcol]; // get A[k,k] where k is the pivot column
// for this problem pivot column = pivot row
}
__syncthreads();
A[index_rowA - 1] = A[index_rowA + Pcol] / A_RowRow;
// INVALID WRITE HERE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// store pivot factor at A[dimCol - 1][row]
}
__global__
void Gaussian_Elim_CUDA(int* A, int widthA, int Pcol)
{
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
int index_col = BLOCK_SIZE * bx + tx;
int index_row = BLOCK_SIZE * by + ty;
int index = widthA * index_row + index_col;
// d_A[index] "=" d_A[index_row][index_col]
// store pivot factor for rows that we are working on in this block
__shared__ int pivotFactor[BLOCK_SIZE];
if ( ty == 0 ) // use ty instead of tx for coalesced accesses
{
pivotFactor[tx] = A[(index_row + tx) * widthA - 1];
// INVALID WRITE HERE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
}
__syncthreads();
// implement the gaussian elimination for the current row
if ( index_row > Pcol )
{
A[index] -= A[Pcol * widthA + index_col] * pivotFactor[ty];
// A[i,j] = A[i,j] - A[k,j] / A[k,k] * A[i,k]
// pivot factor = A[k,j] / A[k,k]
}
}

cuda-memcheck told me that I was reading and writing out of bounds; my indexing was off.
In Gaussian_Pivot_CUDA, the last line should be A[widthA*(index_row+1)-1] = ...
In Gaussian_Elim_CUDA, when writing pivotFactor I have to change ty to tx and read A[widthA*(index_row+1)-1].
Thank you @Eugene!
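For reference, a minimal sketch of what the two corrected kernels could look like under this reading of the fix (a reconstruction from the description above, not necessarily the exact final code):

// Reconstruction of the corrected kernels (BLOCK_SIZE/BLOCK_SIZE2 as in gaussian.h).
#define BLOCK_SIZE 16
#define BLOCK_SIZE2 32
__global__ void Gaussian_Pivot_CUDA(int* A, int widthA, int Pcol)
{
    int index_row = BLOCK_SIZE2 * blockIdx.x + threadIdx.x;
    __shared__ int A_RowRow;
    if (threadIdx.x == 0)
        A_RowRow = A[Pcol * widthA + Pcol];   // A[k,k], the pivot element
    __syncthreads();
    // Store the pivot factor A[i,k]/A[k,k] in the last (padding) column of row i.
    // widthA*(index_row+1)-1 stays inside row index_row, unlike the original
    // index_row*widthA - 1, which walked off the front of the row.
    // (A bounds check on index_row would also be needed if dimRow is not a
    // multiple of BLOCK_SIZE2.)
    A[widthA * (index_row + 1) - 1] = A[index_row * widthA + Pcol] / A_RowRow;
}
__global__ void Gaussian_Elim_CUDA(int* A, int widthA, int Pcol)
{
    int index_col = BLOCK_SIZE * blockIdx.x + threadIdx.x;
    int index_row = BLOCK_SIZE * blockIdx.y + threadIdx.y;
    int index     = widthA * index_row + index_col;
    __shared__ int pivotFactor[BLOCK_SIZE];
    // One thread per row of the block fetches that row's pivot factor from the
    // padding column written by Gaussian_Pivot_CUDA.
    if (threadIdx.x == 0)
        pivotFactor[threadIdx.y] = A[widthA * (index_row + 1) - 1];
    __syncthreads();
    if (index_row > Pcol)
        A[index] -= A[Pcol * widthA + index_col] * pivotFactor[threadIdx.y];
}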

If sizeof(int) == 2, then this array index will overflow when dim is 128:
h_A[j * dimCol + i + i * dimCol] = i + 1; // A[j][i] entry
(127*144)+127+(127*144) = 36703 > 32767.

Related

Generate Cartesian Product using more than two lists on GPU

I would like to know how to generate a Cartesian product of more than two lists using CUDA.
How do I make this code work with three or more lists?
It works with two lists but not with three; I tried / and % without success.
It's basic.
#include <thrust/device_vector.h>
#include <thrust/pair.h>
#include <thrust/copy.h>
#include <iterator>
__global__ void cartesian_product(const int *a, size_t a_size,
const int *b, size_t b_size,
const int *c, size_t c_size)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx < a_size * b_size * c_size)
{
unsigned int a_idx = idx / a_size;
unsigned int b_idx = idx % a_size;
// ?
unsigned int c_idx = idx % a_size;
printf("a[a_idx] and b[b_idx] and c[c_idx] are: %d %d %d\n\n",a[a_idx], b[b_idx], c[c_idx]);
//1 3 5 , 1 3 6 , 1 4 5 , 1 4 6 , 2 3 5 , 2 3 6 , 2 4 5 , 2 4 6
//0 0 0 , 0 0 1 , 0 1 0 , 0 1 1 , 1 0 0 , 1 0 1 , 1 1 0 , 1 1 1
}
}
int main()
{
// host_vector is stored in host memory while device_vector lives in GPU device memory.
// a has storage for 2 integers
thrust::device_vector<int> a(2);
// initialize individual elements
a[0] = 1;
a[1] = 2;
// b has storage for 2 integers
thrust::device_vector<int> b(2);
// initialize individual elements
b[0] = 3;
b[1] = 4;
// d has storage for 2 integers
thrust::device_vector<int> c(2);
// initialize individual elements
c[0] = 5;
c[1] = 6;
unsigned int block_size = 256;
unsigned int num_blocks = (8 + (block_size - 1)) / block_size;
// raw_pointer_cast creates a "raw" pointer from a pointer-like type, simply returning the wrapped pointer, should it exist.
cartesian_product<<<num_blocks, block_size>>>(thrust::raw_pointer_cast(a.data()), a.size(),
thrust::raw_pointer_cast(b.data()), b.size(),
thrust::raw_pointer_cast(c.data()), c.size());
return 0;
}
How do I get the right c_idx in the kernel and subsequent arrays if I want more than three lists?
It seems to me that you want "lexicographic indexing":
idx == (a_idx * b_size + b_idx) * c_size + c_idx
So you get your indices like this:
c_idx = idx % c_size;
b_idx = (idx / c_size) % b_size;
a_idx = (idx / c_size) / b_size;
This is easily generalizable to more dimensions. E.g. in four dimensions you have
idx == ((a_idx * b_size + b_idx) * c_size + c_idx) * d_size + d_idx
Then:
d_idx = idx % d_size;
c_idx = (idx / d_size) % c_size;
b_idx = ((idx / d_size) / c_size) % b_size;
a_idx = ((idx / d_size) / c_size) / b_size;
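As a generalization (my own hedged sketch, not from the original answer): given an array of list sizes ordered from outermost to innermost, the per-list indices can be peeled off a flat index in a loop. For the three-list case, sizes = {a_size, b_size, c_size} matches the formulas above.

// Hedged sketch: decompose a flat index into per-list indices for an arbitrary
// number of lists. sizes[0] is the outermost (slowest-varying) list, sizes[n-1]
// the innermost, matching the lexicographic indexing described above.
__host__ __device__
void unflatten_index(unsigned int idx, const size_t *sizes, unsigned int *out, int n)
{
    for (int d = n - 1; d >= 0; --d) {
        out[d] = idx % sizes[d];   // index within list d
        idx    = idx / sizes[d];   // strip off that dimension
    }
}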
In C/C++ programming one likes to use this to calculate indices into a one-dimensional dynamic array representing a multidimensional dataset. In CUDA you normally don't need it as much, since CUDA gives you up to three-dimensional threadIdx/blockIdx/etc. So for the Cartesian product of three arrays you won't need this technique at all, but can just use the intrinsic CUDA features. Even with more than three dimensions, the most performant solution will get two indices from two of the three dimensions of the kernel and use lexicographic indexing on the remaining ones:
__global__ void cartesian_product_5d(const int *a, size_t a_size,
const int *b, size_t b_size,
const int *c, size_t c_size,
const int *d, size_t d_size,
const int *e, size_t e_size)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int d_idx = blockIdx.y * blockDim.y + threadIdx.y;
int e_idx = blockIdx.z * blockDim.z + threadIdx.z;
/* idx == (c_idx * b_size + b_idx) * a_size + a_idx */
int a_idx = idx % a_size;
int b_idx = (idx / a_size) % b_size;
int c_idx = (idx / a_size) / b_size;
/* ... */
}
int main()
{
/* ... */
dim3 threadsPerBlock(8, 8, 8);
dim3 numBlocks((a_size * b_size * c_size + threadsPerBlock.x - 1) /
threadsPerBlock.x,
(d_size + threadsPerBlock.y - 1) / threadsPerBlock.y,
(e_size + threadsPerBlock.z - 1) / threadsPerBlock.z);
cartesian_product_5d<<<numBlocks, threadsPerBlock>>>(/* ... */);
/* ... */
}

CUDA in-place transpose doesn't complete transpose total matrix [closed]

I've written the CUDA code below. It's supposed to transpose a matrix using tiled blocks, and the code works for small values, but with, for example,
TILE = 32 and a 128 x 128 matrix, it doesn't complete the transpose; it stops after 96. On the host, these are my thread/block dimensions:
dim3 dimGrid((nEven + TILE_DIM - 1) / TILE_DIM, (nEven + TILE_DIM - 1) / TILE_DIM);
dim3 dimBlock(TILE_DIM, TILE_DIM);
where I set the number of threads per block equal to the tile dimension.
The kernel is simple and should theoretically work:
__global__ void transposeMain( int *idata)
{
__shared__ int tile2[TILE_DIM][TILE_DIM];
int yyy = blockIdx.y * TILE_DIM ; // col values (0,32,64,96)
int xxx = blockIdx.x * TILE_DIM ; // row values (0,32,64,96)
if (xxx < nEven && yyy < nEven)
{
tile2[threadIdx.x][threadIdx.y] = idata[(threadIdx.x + xxx)*nEven + (threadIdx.y + yyy)];
__syncthreads();
idata[(threadIdx.y + yyy)*nEven + (threadIdx.x + xxx)] = tile2[threadIdx.x][threadIdx.y];
}
}
Any idea what might be the problem?
The problem is you are trying to do an in-place transpose.
CUDA device code execution is broken up into threadblocks. Threadblocks (groups of threads) can execute in any order, and do not all (typically) execute at the same time. So when you read a tile in here:
tile2[threadIdx.x][threadIdx.y] = idata[(threadIdx.x + xxx)*nEven + (threadIdx.y + yyy)];
That is OK. But when you write the tile:
idata[(threadIdx.y + yyy)*nEven + (threadIdx.x + xxx)] = tile2[threadIdx.x][threadIdx.y];
You are frequently over-writing data (in some other tile in the original matrix) which you haven't read yet (because the threadblock responsible for reading that tile hasn't even begun to execute yet). Once you overwrite it like this, it's lost.
The solution (for square matrix transpose) has several aspects to it:
Each threadblock must first read 2 tiles. These 2 tiles from the input data will be swapped.
Then each threadblock can write those two tiles.
The tiles along the main diagonal need special casing.
Since most threadblocks are handling 2 tiles, only threadblocks on the main diagonal, or on one side of it, need to do any work.
You haven't shown a complete MCVE (which is expected when you have questions like this), and your code has other issues such as the potential for uncoalesced access (lower performance) so I'm not going to try to "fix" your code.
Instead, here's a fully worked example, lifted from here:
$ cat t469.cu
#include <stdio.h>
#include <cublas_v2.h>
#include <time.h>
#include <sys/time.h>
#define uS_PER_SEC 1000000
#define uS_PER_mS 1000
#define N 4096
#define M 4096
#define TILE_DIM 32
#define BLOCK_ROWS 8
__global__ void transposeCoalesced(float *odata, const float *idata)
{
__shared__ float tile[TILE_DIM][TILE_DIM+1];
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
__syncthreads();
x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset
y = blockIdx.x * TILE_DIM + threadIdx.y;
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
}
__global__ void iptransposeCoalesced(float *data)
{
__shared__ float tile_s[TILE_DIM][TILE_DIM+1];
__shared__ float tile_d[TILE_DIM][TILE_DIM+1];
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
if (blockIdx.y>blockIdx.x) { // handle off-diagonal case
int dx = blockIdx.y * TILE_DIM + threadIdx.x;
int dy = blockIdx.x * TILE_DIM + threadIdx.y;
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile_s[threadIdx.y+j][threadIdx.x] = data[(y+j)*width + x];
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile_d[threadIdx.y+j][threadIdx.x] = data[(dy+j)*width + dx];
__syncthreads();
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
data[(dy+j)*width + dx] = tile_s[threadIdx.x][threadIdx.y + j];
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
data[(y+j)*width + x] = tile_d[threadIdx.x][threadIdx.y + j];
}
else if (blockIdx.y==blockIdx.x){ // handle on-diagonal case
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
tile_s[threadIdx.y+j][threadIdx.x] = data[(y+j)*width + x];
__syncthreads();
for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
data[(y+j)*width + x] = tile_s[threadIdx.x][threadIdx.y + j];
}
}
int validate(const float *mat, const float *mat_t, int n, int m){
int result = 1;
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++)
if (mat[(i*m)+j] != mat_t[(j*n)+i]) result = 0;
return result;
}
int main(){
timeval t1, t2;
float *matrix = (float *) malloc (N * M * sizeof(float));
for (int i = 0; i < N; i ++)
for (int j = 0; j < M; j++)
matrix[(i*M) + j] = i;
// Starting the timer
gettimeofday(&t1, NULL);
float *matrixT = (float *) malloc (N * M * sizeof(float));
for (int i = 0; i < N; i++)
for (int j = 0; j < M; j++)
matrixT[(j*N)+i] = matrix[(i*M)+j]; // matrix is obviously filled
//Ending the timer
gettimeofday(&t2, NULL);
if (!validate(matrix, matrixT, N, M)) {printf("fail!\n"); return 1;}
float et1 = (((t2.tv_sec*uS_PER_SEC)+t2.tv_usec) - ((t1.tv_sec*uS_PER_SEC)+t1.tv_usec))/(float)uS_PER_mS;
printf("CPU time = %fms\n", et1);
float *h_matrixT , *d_matrixT , *d_matrix;
h_matrixT = (float *) (malloc (N * M * sizeof(float)));
cudaMalloc((void **)&d_matrixT , N * M * sizeof(float));
cudaMalloc((void**)&d_matrix , N * M * sizeof(float));
cudaMemcpy(d_matrix , matrix , N * M * sizeof(float) , cudaMemcpyHostToDevice);
//Starting the timer
gettimeofday(&t1, NULL);
const float alpha = 1.0;
const float beta = 0.0;
cublasHandle_t handle;
//gettimeofday(&t1, NULL);
cublasCreate(&handle);
gettimeofday(&t1, NULL);
cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, M, &alpha, d_matrix, M, &beta, d_matrix, N, d_matrixT, N);
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
cublasDestroy(handle);
//Ending the timer
float et2 = (((t2.tv_sec*uS_PER_SEC)+t2.tv_usec) - ((t1.tv_sec*uS_PER_SEC)+t1.tv_usec))/(float)uS_PER_mS;
printf("GPU Sgeam time = %fms\n", et2);
cudaMemcpy(h_matrixT , d_matrixT , N * M * sizeof(float) , cudaMemcpyDeviceToHost);
if (!validate(matrix, h_matrixT, N, M)) {printf("fail!\n"); return 1;}
cudaMemset(d_matrixT,0, N*M*sizeof(float));
memset(h_matrixT, 0, N*M*sizeof(float));
dim3 threads(TILE_DIM, BLOCK_ROWS);
dim3 blocks(N/TILE_DIM, M/TILE_DIM);
gettimeofday(&t1, NULL);
transposeCoalesced<<<blocks, threads >>>(d_matrixT, d_matrix);
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
cudaMemcpy(h_matrixT , d_matrixT , N * M * sizeof(float) , cudaMemcpyDeviceToHost);
if (!validate(matrix, h_matrixT, N, M)) {printf("fail!\n"); return 1;}
float et3 = (((t2.tv_sec*uS_PER_SEC)+t2.tv_usec) - ((t1.tv_sec*uS_PER_SEC)+t1.tv_usec))/(float)uS_PER_mS;
printf("GPU kernel time = %fms\n", et3);
memset(h_matrixT, 0, N*M*sizeof(float));
gettimeofday(&t1, NULL);
iptransposeCoalesced<<<blocks, threads >>>(d_matrix);
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
cudaMemcpy(h_matrixT , d_matrix , N * M * sizeof(float) , cudaMemcpyDeviceToHost);
if (!validate(matrix, h_matrixT, N, M)) {printf("fail!\n"); return 1;}
float et4 = (((t2.tv_sec*uS_PER_SEC)+t2.tv_usec) - ((t1.tv_sec*uS_PER_SEC)+t1.tv_usec))/(float)uS_PER_mS;
printf("GPU in-place kernel time = %fms\n", et4);
cudaFree(d_matrix);
cudaFree(d_matrixT);
return 0;
}
$ nvcc -arch=sm_20 -o t469 t469.cu -lcublas
$ ./t469
CPU time = 450.095001ms
GPU Sgeam time = 1.937000ms
GPU kernel time = 1.694000ms
GPU in-place kernel time = 1.839000ms
$
Note that this compares several different approaches to matrix transpose.
If you study the iptransposeCoalesced you will see that it is adhering to the 4 specific aspects I outlined above.
It is fishy to use __syncthreads() inside an if statement in CUDA. Try to move it outside this block, like this:
if (xxx < nEven && yyy < nEven)
{
tile2[threadIdx.x][threadIdx.y] = idata[(threadIdx.x + xxx)*nEven + (threadIdx.y + yyy)];
}
__syncthreads();
if (xxx < nEven && yyy < nEven)
{
idata[(threadIdx.y + yyy)*nEven + (threadIdx.x + xxx)] = tile2[threadIdx.x][threadIdx.y];
}

Matrix-vector multiplication in CUDA: benchmarking & performance

I'm updating my question with some new benchmarking results (I also reformulated the question to be more specific and I updated the code)...
I implemented a kernel for matrix-vector multiplication in CUDA C following the CUDA C Programming Guide using shared memory. Let me first present some benchmarking results which I did on a Jetson TK1 (GPU: Tegra K1, compute capability 3.2) and a comparison with cuBLAS:
Here I guess cuBLAS does some magic since it seems that its execution is not affected by the number of columns of A, which, in turn, implies that there is some sort of parallelisation along the columns of A.
Now, here is the source code of my kernel and a host function to call it (file: mv.cuh):
#include <cuda_runtime.h>
#define BLOCK_SIZE 16
/* Set to __restrict__ */
#define RESTRICT
/**
* Performs matrix-vector multiplication on the device.
*
* @param dA Address of matrix `A` on the device
* @param dx Address of vector `x` on the device
* @param dy Address of result y = A*x
* @param nRows Number of rows of `A`
* @param nx Size of `x` (number of columns of `A`)
*
* @tparam T Data type
*
*/
template<typename T>
__global__ void matvec_kernel(
const T * RESTRICT dA,
const T * RESTRICT dx,
T * RESTRICT dy,
const unsigned int nRows,
const unsigned int nx);
/**
* Host-side wrapper for matvec_kernel.
*
* @param dA Address of matrix `A` on the device
* @param dx Address of vector `x` on the device
* @param dy Address of result y = A*x
* @param nRows Number of rows of `A`
* @param nx Size of `x` (number of columns of `A`)
* @param elapsed_time Time for the kernel to complete the execution in `ms`.
* If NULL is passed to this argument, the elapsed time
* will not be computed.
*
* @tparam T Data type for `A` and `x`
*/
template<typename T>
__host__ void matvec(
const T * RESTRICT dA,
const T * RESTRICT dx,
T * RESTRICT dy,
const unsigned int nRows,
const unsigned int nx);
/* -------------------------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------------------------- */
template<typename T>
__global__ void matvec_kernel(const T * RESTRICT dA, const T * RESTRICT dx,
T * RESTRICT dy,
const unsigned int nRows, const unsigned int nx)
{
unsigned int bid = blockIdx.x;
unsigned int row = threadIdx.x;
const unsigned int block_size = blockDim.x;
const unsigned int num_hor_blocks = ((nx + block_size - 1)/ block_size);
unsigned int n_star;
unsigned int idx_x;
unsigned int idx_Asub;
unsigned int idx_y;
const T * Asub;
const T * xsub;
/* Only `x` is copied to shared memory */
__shared__ T x_shared[BLOCK_SIZE];
idx_y = bid * block_size;
T * y_sub = dy + idx_y;
T y_val = 0.0;
#pragma unroll
for (unsigned int m = 0; m < num_hor_blocks; ++m)
{
idx_Asub = block_size * (bid + m * nRows);
idx_x = m * block_size;
Asub = dA + idx_Asub;
xsub = dx + idx_x;
if (idx_x + row < nx) {
x_shared[row] = xsub[row];
}
__syncthreads();
/* If the tiling is exact */
if ( (nRows % block_size == 0 && nx % block_size == 0 ) ||
(m != block_size - 1 || bid != gridDim.x - 1)) {
y_val += Asub[row] * x_shared[0];
y_val += Asub[row + nRows] * x_shared[1];
y_val += Asub[row + 2 * nRows] * x_shared[2];
y_val += Asub[row + 3 * nRows] * x_shared[3];
y_val += Asub[row + 4 * nRows] * x_shared[4];
y_val += Asub[row + 5 * nRows] * x_shared[5];
y_val += Asub[row + 6 * nRows] * x_shared[6];
y_val += Asub[row + 7 * nRows] * x_shared[7];
y_val += Asub[row + 8 * nRows] * x_shared[8];
y_val += Asub[row + 9 * nRows] * x_shared[9];
y_val += Asub[row + 10 * nRows] * x_shared[10];
y_val += Asub[row + 11 * nRows] * x_shared[11];
y_val += Asub[row + 12 * nRows] * x_shared[12];
y_val += Asub[row + 13 * nRows] * x_shared[13];
y_val += Asub[row + 14 * nRows] * x_shared[14];
y_val += Asub[row + 15 * nRows] * x_shared[15];
} else {
n_star = min(BLOCK_SIZE, nx - idx_x);
#pragma unroll
for (unsigned int e = 0; e < n_star; ++e) {
y_val += Asub[row + e * nRows] * x_shared[e];
}
}
__syncthreads();
}
if (row + idx_y < nRows)
y_sub[row] = y_val;
}
template<typename T>
__host__ void matvec(
const T * RESTRICT dA,
const T * RESTRICT dx,
T * RESTRICT dy,
const unsigned int nRows,
const unsigned int nx)
{
dim3 dim_grid( (nRows + BLOCK_SIZE -1)/ BLOCK_SIZE );
dim3 dim_block(BLOCK_SIZE);
matvec_kernel<T> <<<dim_grid, dim_block>>>(dA, dx, dy, nRows, nx);
}
I'm using this to time my execution (file: cuda_timer.cuh):
#include <cuda_runtime.h>
#include "error_handles.cuh"
static cudaEvent_t start;
static cudaEvent_t stop;
static short timer_running = 0;
static short tic_called = 0;
/**
* Sets up the timer.
*
* Must be called before any invocation to
* tic() or toc(), preferably at the beginning of your
* application.
*/
void start_tictoc();
/**
* Starts the timer.
*
* Use `toc()` to get the elapsed time; `tic()` must
* be called before a `toc()`.
*/
void tic();
/**
* Returns the elapsed time between its invocation
* and a previous invocation of `tic()`. Returns `-1`
* and prints a warning message if `tic()` was not
* previously called. Returns `-2` and prints an error
* message if `start_tictoc()` has not been called.
*
* @return Elapsed time between `tic()` and `toc()` in milliseconds
* with a resolution of `0.5` microseconds.
*/
float toc();
/**
* This function should be called when the
* timer is no longer needed. It destroys
* the events used to time CUDA kernels. If the timer
* is not running, this function does nothing and
* prints a warning message.
*/
void stop_tictoc();
void start_tictoc() {
_CUDA(cudaEventCreate(&start));
_CUDA(cudaEventCreate(&stop));
timer_running = 1;
}
void tic() {
if (timer_running) {
_CUDA(cudaEventRecord(start, 0));
tic_called = 1;
} else {
printf("WARNING: tic() called without a timer running!\n");
}
}
float toc() {
float elapsed_time;
if (tic_called == 0) {
printf("WARNING: toc() called without a previous tic()!\n");
return -1;
}
if (timer_running == 1) {
// _CUDA(cudaDeviceSynchronize()); // Removed! (See discussion below)
_CUDA(cudaEventRecord(stop, 0));
_CUDA(cudaEventSynchronize(stop));
_CUDA(cudaEventElapsedTime(&elapsed_time, start, stop));
tic_called = 0;
return elapsed_time;
} else {
printf("WARNING: toc() called without a timer running!\n");
return -2;
}
}
void stop_tictoc()
{
if (timer_running == 1){
_CUDA(cudaEventDestroy(start));
_CUDA(cudaEventDestroy(stop));
timer_running = 0;
} else{
printf("WARNING: stop_tictoc() called without a timer running!\n");
}
}
and my main file (main.cu) is the following:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <assert.h>
#include "cublas_v2.h"
#include <math.h>
#include <curand.h>
#include <stdbool.h>
#include "mv.cuh"
#include "cuda_timer.cuh"
#include "error_handles.cuh"
typedef float real_t;
#define _CUDA(x) do { if((x)!=cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(EXIT_FAILURE);}} while(0)
#define _CUBLAS(x) do { if((x) != CUBLAS_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(EXIT_FAILURE);}} while(0)
#define _CURAND(x) do { if((x) != CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(EXIT_FAILURE);}} while(0)
#define TEST_COLUMNS 1
#define TEST_ROWS 0
/**
* If `TEST_WRT_` is set to `TEST_COLUMNS`, then a benchmark
* will be performed with respect to columns (with a fixed
* number of rows). If it is set to `TEST_ROWS`, then a benchmark will
* run with respect to rows (fixed number of columns).
*/
#define TEST_WRT_ TEST_ROWS
#define CONSTANT_COLS 300
#define CONSTANT_ROWS 256
/**
* In order to estimate the execution time, every
* kernel is run `RUNS` times and the average is taken.
*/
#define RUNS 50
void compare_results(real_t *dev_y_cublas, real_t * dev_y,unsigned int nrows)
{
real_t * hst_y_cublas;
real_t * hst_y;
const size_t s = nrows * sizeof(real_t);
hst_y_cublas = (real_t*) malloc(s);
hst_y = (real_t*) malloc(s);
_CUDA(cudaMemcpy(hst_y, dev_y, s, cudaMemcpyDeviceToHost));
_CUDA(cudaMemcpy(hst_y_cublas, dev_y_cublas, s, cudaMemcpyDeviceToHost));
for (unsigned int i = 0; i < nrows; ++i) {
if (fabsf(hst_y_cublas[i] - hst_y[i]) > 0.001) {
printf("ERROR ------ %f\n", fabsf(hst_y_cublas[i] - hst_y[i]));
exit(EXIT_FAILURE);
}
}
if (hst_y_cublas) free(hst_y_cublas);
if (hst_y) free(hst_y);
}
void do_benchmark() {
curandGenerator_t gen;
real_t *dev_rand_data = NULL; // Random data will be allocated here!
real_t *dev_y = NULL;
real_t *dev_y_cublas = NULL;
real_t t;
real_t t_cublas;
const size_t n_rows_max = 1500;
const size_t n_cols_max = 300;
const size_t ntot = n_cols_max * (1 + n_rows_max);
const size_t size_tot = sizeof(real_t) * ntot;
float alpha = 1.0, beta = 0.0; // beta was initially set to 1.0 by mistake
cublasHandle_t handle;
_CUBLAS(cublasCreate(&handle));
start_tictoc();
_CUDA(cudaMalloc((void** )&dev_rand_data, size_tot));
_CUDA(cudaMalloc((void** )&dev_y, n_rows_max * sizeof(real_t)));
_CUDA(cudaMalloc((void** )&dev_y_cublas, n_rows_max * sizeof(real_t)));
_CURAND(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
_CURAND(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
tic();
_CURAND(curandGenerateUniform(gen, dev_rand_data, ntot));
t = toc();
printf("RNG in %f ms\n", t);
_CURAND(curandDestroyGenerator(gen));
size_t ncols = CONSTANT_COLS;
size_t nrows = CONSTANT_ROWS;
size_t runs = RUNS;
cudaMemset(dev_y_cublas, 0, n_rows_max * sizeof(real_t));
matvec<real_t>(dev_rand_data + ncols, dev_rand_data, dev_y, nrows, ncols);
_CUBLAS(cublasSgemv(handle, CUBLAS_OP_N, nrows, ncols, &alpha, dev_rand_data + ncols,
nrows, dev_rand_data, 1, &beta, dev_y_cublas, 1));
/* Compare results */
compare_results(dev_y_cublas,dev_y, nrows);
FILE * pFile;
char filename[50];
#if (TEST_WRT_ == TEST_COLUMNS)
sprintf(filename, "times_rows%lu_cols.txt", nrows);
#else
sprintf(filename, "times_cols%lu_rows.txt", ncols);
#endif
printf("Logging to : '%s'\n", filename);
pFile = fopen(filename, "w");
if (pFile == NULL) {
perror("Error opening file.");
exit(79);
}
#if (TEST_WRT_ == TEST_COLUMNS)
fprintf(pFile, "0, %lu, 0, 0\n", nrows);
for (ncols = 32; ncols < n_cols_max; ncols += 32) {
#else
fprintf(pFile, "1, %lu, 0, 0\n", ncols);
for (nrows = 32; nrows < n_rows_max; nrows += 32) {
#endif
tic();
for (short i = 0; i < runs; i++) {
matvec<real_t>(dev_rand_data + ncols, dev_rand_data, dev_y, nrows,
ncols);
}
t = toc() / runs;
tic();
for (short i = 0; i < runs; i++) {
_CUBLAS(cublasSgemv(handle, CUBLAS_OP_N, nrows, ncols, &alpha, dev_rand_data + ncols,
nrows, dev_rand_data, 1, &beta, dev_y_cublas, 1));
}
t_cublas = toc() / runs;
#if (TEST_WRT_ == TEST_COLUMNS)
fprintf(pFile, "%lu, %f, %f\n", ncols, t, t_cublas);
#else
fprintf(pFile, "%lu, %f, %f\n", nrows, t, t_cublas);
#endif
}
_CUBLAS(cublasDestroy(handle));
fclose(pFile);
if (dev_rand_data != NULL)
_CUDA(cudaFree(dev_rand_data));
stop_tictoc();
}
int main(void)
{
do_benchmark();
return EXIT_SUCCESS;
}
Finally, this is a MATLAB script I'm using to plot the execution times:
fetch_this = 'times_cols512_rows.txt';
username = 'ubuntu';
target_hostname = 'jetson';
% Do not modify below this line
eval_this=['! scp ' username '@' target_hostname ':~/mv/Debug/' fetch_this ' .'];
eval(eval_this)
set(0, 'DefaultAxesFontSize', 14);
r = csvread(fetch_this);
r_header = r(1,:);
plot(r(2:end,1), r(2:end,2)*1000, '-');
hold on
plot(r(2:end,1), r(2:end,3)*1000, '-r');
grid on;
fig_title = 'Matvec on Tegra K1 - %d %s';
if (r_header(1)==1),
xlabel('Number of rows');
title(sprintf(fig_title, r_header(2),'columns'));
else
xlabel('Number of columns');
title(sprintf(fig_title, r_header(2),'rows'));
end
ylabel('Computation time [us]');
legend('Kernel', 'cuBLAS');
axis tight
I am concerned about the performance and the scalability of my kernel, so first I would like to know how to improve the scalability with respect to the number of rows of matrix A. Second, I know that it is not very good practice to have branch divergence (and my code has it), and I would appreciate some hints on how to improve it.
UPDATE :
Thanks to all your comments and suggestions, I concluded that cudaDeviceSynchronize() was, in the first place, causing some peculiarities with my timing, so my initial measurements were inaccurate. Row-major ordering leads to worse results. The block size is an important tuning parameter, and changing it from 16 to 32 or 64 improves the execution time. Further benchmarking is necessary to choose the block size. To this end, one may use the following API for the kernel:
template<typename T, const uint_t blk>
__global__ void matvec_kernel(const T * RESTRICT dA, const T * RESTRICT dx,
T * RESTRICT dy, const uint_t nRows, const uint_t nx);
and call it like this from the host:
template<typename T>
__host__ void matvec(const T * RESTRICT dA, const T * RESTRICT dx,
T * RESTRICT dy, const uint_t nRows, const uint_t nx) {
uint_t blk_size_opt = 64;
/* Add code to decide the value of `blk_size_opt` */
if (blk_size_opt == 32) {
matvec_engine<T, 32>(dA, dx, dy, nRows, nx);
} else if (blk_size_opt == 64) {
matvec_engine<T, 64>(dA, dx, dy, nRows, nx);
} else if (blk_size_opt == 128) {
matvec_engine<T, 128>(dA, dx, dy, nRows, nx);
} else if (blk_size_opt == 256) {
matvec_engine<T, 256>(dA, dx, dy, nRows, nx);
}
}
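matvec_engine is not shown above; presumably it is just a thin wrapper that derives the launch configuration from the compile-time block size and launches the templated kernel. A hedged sketch of what it might look like (assuming uint_t is a typedef for unsigned int, as in the declarations above):

template<typename T, const uint_t blk>
__host__ void matvec_engine(const T * RESTRICT dA, const T * RESTRICT dx,
                            T * RESTRICT dy, const uint_t nRows, const uint_t nx)
{
    // One block of `blk` threads per `blk` rows of A, as in the original wrapper.
    dim3 dim_grid((nRows + blk - 1) / blk);
    dim3 dim_block(blk);
    matvec_kernel<T, blk><<<dim_grid, dim_block>>>(dA, dx, dy, nRows, nx);
}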
Let me provide some benchmarking results. First a comparison with cublasSgemv:
and the effect of block size on the execution time:
First, let me write down the full working Matrix-Vector multiplication kernel employing shared memory:
template<typename T>
__global__ void matvec_kernel(const T * __restrict__ dA, const T * __restrict__ dx, T * __restrict__ dy, const unsigned int nRows, const unsigned int nCols)
{
const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
__shared__ T x_shared[BLOCK_SIZE];
T y_val = 0.0;
#pragma unroll
for (unsigned int m = 0; m < ((nCols + BLOCK_SIZE - 1)/ BLOCK_SIZE); ++m)
{
if ((m * BLOCK_SIZE + threadIdx.x) < nCols) x_shared[threadIdx.x] = dx[threadIdx.x + m * BLOCK_SIZE];
else x_shared[threadIdx.x] = 0.f;
__syncthreads();
#pragma unroll
for (unsigned int e = 0; e < BLOCK_SIZE; ++e) {
// --- Column-major ordering - faster
y_val += dA[tid + (e + BLOCK_SIZE * m) * nRows] * x_shared[e];
// --- Row-major ordering - slower
//y_val += dA[tid * nCols + (e + BLOCK_SIZE * m)] * x_shared[e];
}
__syncthreads();
}
if (tid < nRows) dy[tid] = y_val;
}
Unless otherwise specified, all the tests were done on a GT540M card.
A first parameter to be optimized is the BLOCK_SIZE. Changing the BLOCK_SIZE changes the algorithm performance, as witnessed by the following graph:
The following graph compares row-major ordering vs. column-major ordering. The latter is faster:
Another optimization you may wish to try is exploiting more Instruction Level Parallelism (ILP) with this modified kernel employing ILP = 2:
template<typename T>
__global__ void matvec_kernel_ILP2(const T * __restrict__ dA, const T * __restrict__ dx, T * __restrict__ dy, const unsigned int nRows, const unsigned int nCols)
{
const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
__shared__ T x_shared[BLOCK_SIZE];
T y_val1 = 0.0;
T y_val2 = 0.0;
#pragma unroll
for (unsigned int m = 0; m < ((nCols + BLOCK_SIZE - 1)/ BLOCK_SIZE); ++m)
{
if ((m * BLOCK_SIZE + threadIdx.x) < nCols) x_shared[threadIdx.x] = dx[threadIdx.x + m * BLOCK_SIZE];
else x_shared[threadIdx.x] = 0.f;
__syncthreads();
#pragma unroll
for (unsigned int e = 0; e < BLOCK_SIZE; ++e) {
y_val1 += dA[tid + (e + BLOCK_SIZE * m) * nRows] * x_shared[e];
y_val2 += dA[tid + gridDim.x * BLOCK_SIZE + (e + BLOCK_SIZE * m) * nRows] * x_shared[e];
}
__syncthreads();
}
if (tid < nRows) dy[tid] = y_val1;
if ((tid + gridDim.x * BLOCK_SIZE) < nRows) dy[tid + gridDim.x * BLOCK_SIZE] = y_val2;
}
This kernel should be called with half of the threads, as
dim3 dim_grid((nRows/2 + BLOCK_SIZE -1)/ BLOCK_SIZE);
dim3 dim_block(BLOCK_SIZE);
matvec_kernel_ILP2<T> <<<dim_grid, dim_block>>>(dA, dx, dy, nRows, nx);
Finally, since you are using a device with compute capability 3.2, you can try using shuffle operations. I'm providing here the kernel using shuffle operations instead of shared memory. In this case, you should set BLOCK_SIZE = 32:
template<typename T>
__global__ void matvec_kernel_shfl(const T * __restrict__ dA, const T * __restrict__ dx, T * __restrict__ dy, const unsigned int nRows, const unsigned int nCols)
{
const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
T x_shfl_src, x_shfl_dest;
T y_val = 0.0;
#pragma unroll
for (unsigned int m = 0; m < ((nCols + BLOCK_SIZE - 1)/ BLOCK_SIZE); ++m)
{
if ((m * BLOCK_SIZE + threadIdx.x) < nCols) x_shfl_src = dx[threadIdx.x + m * BLOCK_SIZE];
else x_shfl_src = 0.f;
__syncthreads();
// #pragma unroll
for (int e = 0; e < 32; ++e) {
// --- Column-major ordering - faster
x_shfl_dest = __shfl(x_shfl_src, e);
y_val += dA[tid + (e + BLOCK_SIZE * m) * nRows] * x_shfl_dest;
// --- Row-major ordering - slower
//y_val += dA[tid * nCols + (e + BLOCK_SIZE * m)] * x_shared[e];
}
__syncthreads();
}
if (tid < nRows) dy[tid] = y_val;
}
Shuffle operations improve the performance over shared memory for BLOCK_SIZE = 32 on a Kepler K20c as shown by the graph below:
Looking at your code I think that the way you traverse the elements of A may be the problem:
for (unsigned int e = 0; e < n_star; ++e) {
y_val += Asub[row + e * nRows] * x_shared[e];
}
So, when nRows becomes large, you actually read from global memory (where A is stored) with a large stride. In particular, this happens in every block: threads inside the same block read from global memory in a non-consecutive fashion. This could be improved by storing the values of A row-by-row from the beginning (i.e., using row-major order). This is just a guess, and I would have written a comment, but it requires a higher reputation on Stack Overflow...

The Floyd-Warshall algorithm in CUDA

This is the sequential piece of code I am trying to parallelize in CUDA
/*
Sequential (Single Thread) APSP on CPU.
*/
void floyd_sequential(int *mat, const size_t N)
{
for(int k = 0; k < N; k ++)
for(int i = 0; i < N; i ++)
for(int j = 0; j < N; j ++)
{
int i0 = i*N + j;
int i1 = i*N + k;
int i2 = k*N + j;
if(mat[i1] != -1 && mat[i2] != -1)
mat[i0] = (mat[i0] != -1 && mat[i0] < mat[i1] + mat[i2]) ?
mat[i0] : (mat[i1] + mat[i2]);
}
}
This is my CUDA implementation
// ParallelComputing.cpp : Defines the entry point for the console application.
//
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#define DIMENSION 10;
__global__ void gpu_Floyd(int *result, int N)
{
int j,k;
int Row = blockIdx.y * blockDim.y + threadIdx.y;
for(k = 0; k < N; k++)
{
for(j = 0; j < N; j++)
{
int i0 = Row * N + j;
int i1 = Row * N + k;
int i2 = k * N + j;
if(result[i0] != -1 && result[i2] != -1)
result[i0] = (result[i0] != -1 && result[i0] < result[i1] + result[i2]) ?
result[i0] : (result[i1] + result[i2]);
__syncthreads();
}
}
}
void GenMatrix(int *mat, const size_t N)
{
for(int i = 0; i < N*N; i ++)
mat[i] = rand()%32 - 1;
}
bool CmpArray(const int *l, const int *r, const size_t eleNum)
{
for(int i = 0; i < eleNum; i ++)
if(l[i] != r[i])
{
printf("ERROR: l[%d] = %d, r[%d] = %d\n", i, l[i], i, r[i]);
return false;
}
return true;
}
int main(int argc, char **argv)
{
// generate a random matrix.
size_t N = 10;
int *mat = (int*)malloc(sizeof(int)*N*N);
GenMatrix(mat, N);
// compute the reference result.
int *ref = (int*)malloc(sizeof(int)*N*N);
memcpy(ref, mat, sizeof(int)*N*N);
Floyd_sequential(ref, N);
//CUDA Portion
int Grid_Dim_x = 1, Grid_Dim_y = 1;
int noThreads_x, noThreads_y;
int *result = (int*)malloc(sizeof(int)*N*N);
memcpy(result, mat, sizeof(int)*N*N);
int *d_result;
// compute your results
cudaMalloc((void **)&d_result, N*N);
cudaMemcpy(result, N * N, cudaMemcpyHostToDevice);
gpu_Floyd<<<1024, 256>>>(d_result, N);
cudaMemcpy(result, d_result, cudaMemcpyDeviceToHost);
// compare your result with reference result
if(CmpArray(result, ref, N*N))
printf("The matrix matches.\n");
else
printf("The matrix do not match.\n");
free(ref);
free(result);
cudaFree(d_result);
}
However, my output always shows that the matrices do not match.
I understand that in CUDA one usually tries to map each element of the matrix to a thread. However, I am trying to explore other possibilities by mapping each row of the matrix to a thread instead.
As has already been mentioned, your provided GPU code does not compile, so I'm curious how you got to the observation that your output matrices do not match.
Here are some of the problems with your code:
cudaMalloc, just like malloc, allocates bytes, so this is not correct:
cudaMalloc((void **)&d_result, N*N);
instead you want this:
cudaMalloc((void **)&d_result, N*N*sizeof(int));
likewise cudaMemcpy, just like memcpy, operates on bytes, and furthermore cudaMemcpy requires 4 parameters so this is not correct:
cudaMemcpy(result, N * N, cudaMemcpyHostToDevice);
instead you probably want this:
cudaMemcpy(d_result, result, N * N*sizeof(int), cudaMemcpyHostToDevice);
and your other cudaMemcpy line needs to be fixed similarly.
I'd also advise doing proper CUDA error checking.
Your kernel is written as if it's expecting a 2 dimensional thread array, or at least one dimensional in y, whereas you are launching a one dimensional grid in x:
gpu_Floyd<<<1024, 256>>>(d_result, N);
therefore all your kernel built-in variables in y will be 1 or 0 always, and this line of code:
int Row = blockIdx.y * blockDim.y + threadIdx.y;
will evaluate to zero for all threads in your 1-D grid in x.
Your gpu kernel is putting the results in the same matrix as the input data. For sequential code this may or may not matter, but for code that is intended to run in parallel, it can often lead to race conditions, because the order of operations (i.e. order of thread execution) is largely undefined.
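Putting the allocation, copy, and launch-configuration points together, a hedged sketch of the corrected host-side calls might look like the following (the block and grid sizes are illustrative choices, and the kernel itself would still need a bounds check on Row and a fix for the in-place update issue noted above):

int *d_result;
cudaMalloc((void **)&d_result, N * N * sizeof(int));                        // bytes, not elements
cudaMemcpy(d_result, result, N * N * sizeof(int), cudaMemcpyHostToDevice);  // 4 arguments
// gpu_Floyd indexes rows via blockIdx.y/threadIdx.y, so the grid must span
// N rows in the y dimension rather than launching blocks only in x.
dim3 block(1, 256);
dim3 grid(1, (unsigned int)((N + block.y - 1) / block.y));
gpu_Floyd<<<grid, block>>>(d_result, N);
cudaMemcpy(result, d_result, N * N * sizeof(int), cudaMemcpyDeviceToHost);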
Below you will find a canonical, simple implementation of the Floyd-Warshall algorithm in CUDA.
The CUDA code is accompanied by a sequential implementation, and both are based on the simplifying assumption that the edges are non-negative. The full minimum-distance paths are also reconstructed in both cases. Despite the simplifying assumption, it should be possible to grasp the relevant parallelization idea, namely that a two-dimensional thread grid is exploited and that each thread along x is assigned to a matrix column, while each block along y is assigned to a matrix row. In this way, the element (row, k) needed by all the threads of a block is loaded into shared memory once by that block's threadIdx.x == 0 thread.
// --- Assumption: graph with positive edges
#include <stdio.h>
#include <string>
#include <map>
#include <iostream>
#include <fstream>
#include "Utilities.cuh"
#define BLOCKSIZE 256
using namespace std;
map<string, int> nameToNum; // --- names of vertices
map<string, map<string, int>> weightMap; // --- weights of edges
/************************/
/* READ GRAPH FROM FILE */
/************************/
int *readGraphFromFile(int &N, char *fileName) {
string vertex1, vertex2;
ifstream graphFile;
int currentWeight;
N = 0; // --- Init the number of found vertices
graphFile.open(fileName); // --- Open the graph file
graphFile >> vertex1; // --- Read first vertex
while(vertex1 != "--END--") { // --- Loop until the end-of-file marker is found
graphFile >> vertex2; // --- Read second vertex
graphFile >> currentWeight; // --- Read weight between first and second vertex
if (nameToNum.count(vertex1) == 0) { // --- If vertex has not yet been added ...
nameToNum[vertex1] = N; // assign a progressive number to the vertex
weightMap[vertex1][vertex1] = 0; // assign a zero weight to the "self-edge"
N++; // --- Update the found number of vertices
}
if (nameToNum.count(vertex2) == 0) {
nameToNum[vertex2] = N;
weightMap[vertex2][vertex2] = 0;
N++;
}
weightMap[vertex1][vertex2] = currentWeight; // --- Update weight between vertices 1 and 2
graphFile >> vertex1;
}
graphFile.close(); // --- Close the graph file
// --- Construct the array
int *weightMatrix = (int*) malloc(N * N * sizeof(int));
// --- Loop over all the vertex couples in the weights matrix
for (int ii = 0; ii < N; ii++)
for (int jj = 0; jj < N; jj++)
weightMatrix[ii * N + jj] = INT_MAX / 2; // --- Init the weights matrix elements to infinity
map<string, int>::iterator i, j;
// --- Loop over all the vertex couples in the map
// (*i).first and (*j).first are the vertex names (map keys), while (*i).second and (*j).second are their corresponding indices
for (i = nameToNum.begin(); i != nameToNum.end(); ++i)
for (j = nameToNum.begin(); j != nameToNum.end(); ++j) {
// --- If there is a connection between vertices (*i).first and (*j).first, then update the weight matrix
if (weightMap[(*i).first].count((*j).first) != 0)
weightMatrix[N * (*i).second + (*j).second] = weightMap[(*i).first][(*j).first];
}
return weightMatrix;
}
/************************************/
/* PRINT MINIMUM DISTANCES FUNCTION */
/************************************/
void printMinimumDistances(int N, int *a) {
map<string, int>::iterator i;
// --- Prints all the node labels at the first row
for (i = nameToNum.begin(); i != nameToNum.end(); ++i) printf("\t%s", i->first.c_str());
printf("\n");
i = nameToNum.begin();
// --- Loop over the rows
for (int p = 0; p < N; p++) {
printf("%s\t", i -> first.c_str());
// --- Loop over the columns
for (int q = 0; q < N; q++) {
int dd = a[p * N + q];
if (dd != INT_MAX / 2) printf("%d\t", dd);
else printf("--\t");
}
printf("\n");
i++;
}
}
void printPathRecursive(int row, int col, int *minimumDistances, int *path, int N) {
map<string, int>::iterator i = nameToNum.begin();
map<string, int>::iterator j = nameToNum.begin();
if (row == col) {advance(i, row); printf("%s\t", i -> first.c_str()); }
else {
if (path[row * N + col] == INT_MAX / 2) printf("%d %d %d No path exists\t\n", minimumDistances[row * N + col], row, col);
else {
printPathRecursive(row, path[row * N + col], minimumDistances, path, N);
advance(j, col);
printf("%s\t", j -> first.c_str());
}
}
}
void printPath(int N, int *minimumDistances, int *path) {
map<string, int>::iterator i;
map<string, int>::iterator j;
// --- Loop over the rows
i = nameToNum.begin();
for (int p = 0; p < N; p++) {
// --- Loop over the columns
j = nameToNum.begin();
for (int q = 0; q < N; q++) {
printf("From %s to %s\t", i -> first.c_str(), j -> first.c_str());
printPathRecursive(p, q, minimumDistances, path, N);
printf("\n");
j++;
}
i++;
}
}
/**********************/
/* FLOYD-WARSHALL CPU */
/**********************/
void h_FloydWarshall(int *h_graphMinimumDistances, int *h_graphPath, const int N) {
for (int k = 0; k < N; k++)
for (int row = 0; row < N; row++)
for (int col = 0; col < N; col++) {
if (h_graphMinimumDistances[row * N + col] > (h_graphMinimumDistances[row * N + k] + h_graphMinimumDistances[k * N + col])) {
h_graphMinimumDistances[row * N + col] = (h_graphMinimumDistances[row * N + k] + h_graphMinimumDistances[k * N + col]);
h_graphPath[row * N + col] = h_graphPath[k * N + col];
}
}
}
/*************************/
/* FLOYD-WARSHALL KERNEL */
/*************************/
__global__ void d_FloydWarshall(int k, int *d_graphMinimumDistances, int *d_graphPath, int N) {
int col = blockIdx.x * blockDim.x + threadIdx.x; // --- Each thread along x is assigned to a matrix column
int row = blockIdx.y; // --- Each block along y is assigned to a matrix row
if (col >= N) return;
int arrayIndex = N * row + col;
// --- Each block loads the element (row, k) of the k-th column into shared memory
__shared__ int d_graphMinimumDistances_row_k;
if(threadIdx.x == 0) d_graphMinimumDistances_row_k = d_graphMinimumDistances[N * row + k];
__syncthreads();
if (d_graphMinimumDistances_row_k == INT_MAX / 2) // --- If element (row, k) = infinity, no update is needed
return;
int d_graphMinimumDistances_k_col = d_graphMinimumDistances[k * N + col];
if(d_graphMinimumDistances_k_col == INT_MAX / 2) // --- If element (k, col) = infinity, no update is needed
return;
int candidateBetterDistance = d_graphMinimumDistances_row_k + d_graphMinimumDistances_k_col;
if (candidateBetterDistance < d_graphMinimumDistances[arrayIndex]) {
d_graphMinimumDistances[arrayIndex] = candidateBetterDistance;
d_graphPath[arrayIndex] = d_graphPath[k * N + col];
}
}
/********/
/* MAIN */
/********/
int main() {
int N = 0; // --- Number of vertices
// --- Read graph array from file
int *h_graphArray = readGraphFromFile(N, "graph2.txt");
printf("\n******************\n");
printf("* Original graph *\n");
printf("******************\n");
printMinimumDistances(N, h_graphArray);
// --- Floyd-Warshall on CPU
int *h_graphMinimumDistances = (int *) malloc(N * N * sizeof(int));
int *h_graphPath = (int *) malloc(N * N * sizeof(int));
memcpy(h_graphMinimumDistances, h_graphArray, N * N * sizeof(int));
for (int k = 0; k < N; k++)
for (int l = 0; l < N; l++)
if (h_graphArray[k * N + l] == INT_MAX / 2) h_graphPath[k * N + l] = INT_MAX / 2;
else h_graphPath[k * N + l] = k;
h_FloydWarshall(h_graphMinimumDistances, h_graphPath, N);
printf("\n*************************\n");
printf("* CPU result: distances *\n");
printf("*************************\n");
printMinimumDistances(N, h_graphMinimumDistances);
printf("\n********************\n");
printf("* CPU result: path *\n");
printf("********************\n");
printPath(N, h_graphMinimumDistances, h_graphPath);
// --- Graph array device allocation and host-device memory transfer
int *d_graphMinimumDistances; gpuErrchk(cudaMalloc(&d_graphMinimumDistances, N * N * sizeof(int)));
gpuErrchk(cudaMemcpy(d_graphMinimumDistances, h_graphArray, N * N * sizeof(int), cudaMemcpyHostToDevice));
int *d_graphPath; gpuErrchk(cudaMalloc(&d_graphPath, N * N * sizeof(int)));
for (int k = 0; k < N; k++)
for (int l = 0; l < N; l++)
if (h_graphArray[k * N + l] == INT_MAX / 2) h_graphPath[k * N + l] = INT_MAX / 2;
else h_graphPath[k * N + l] = k;
gpuErrchk(cudaMemcpy(d_graphPath, h_graphPath, N * N * sizeof(int), cudaMemcpyHostToDevice));
// --- Iterations
for (int k = 0; k < N; k++) {
d_FloydWarshall <<<dim3(iDivUp(N, BLOCKSIZE), N), BLOCKSIZE>>>(k, d_graphMinimumDistances, d_graphPath, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
// --- Copy results back to the host
gpuErrchk(cudaMemcpy(h_graphMinimumDistances, d_graphMinimumDistances, N * N * sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_graphPath, d_graphPath, N * N * sizeof(int), cudaMemcpyDeviceToHost));
printf("\n**************\n");
printf("* GPU result *\n");
printf("**************\n");
printMinimumDistances(N, h_graphMinimumDistances);
printf("\n********************\n");
printf("* GPU result: path *\n");
printf("********************\n");
printPath(N, h_graphMinimumDistances, h_graphPath);
}
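The listing relies on gpuErrchk and iDivUp from "Utilities.cuh", which is not shown here. Presumably they are the usual error-checking macro and rounded-up integer division; a minimal sketch of what they might look like:

#include <stdio.h>
#include <stdlib.h>
// Error-checking macro: abort with a message if a CUDA runtime call fails.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}
// Smallest number of blocks of size b needed to cover a elements.
inline int iDivUp(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); }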

Unable to execute device kernel in CUDA

I am trying to call a device function from within a global kernel. My global kernel performs a matrix multiplication, and my device function finds the maximum value and its index in each column of the product matrix. Following is the code:
__device__ void MaxFunction(float* Pd, float* max)
{
int x = (threadIdx.x + blockIdx.x * blockDim.x);
int y = (threadIdx.y + blockIdx.y * blockDim.y);
int k = 0;
int temp = 0; int temp_idx = 0;
for (k = 0; k < wB; ++k) {
if(Pd[x*wB + y] > temp){
temp = Pd[x*wB + y];
temp_idx = x*wB + y;
}
max[y*2 + 0] = temp;
max[y*2 + 1] = temp_idx;
}
}
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, float* max)
{
// declare cache in the shared memory
__shared__ float Mds[blockD][blockD];
__shared__ float Nds[blockD][blockD];
float Pvalue = 0;
// Loop over the Md and Nd block dimension required to compute the Pd element
for (int m = (wA * blockD * blockIdx.y), n = (blockD * blockIdx.x);
m < ((wA * blockD * blockIdx.y)+wA-1);
m += blockD, n += (blockD*hB)){
// collaboratively loading of Md and Nd blocks into shared memory
Mds[threadIdx.y][threadIdx.x] = Md[m + wA * threadIdx.y + threadIdx.x];
Nds[threadIdx.y][threadIdx.x] = Nd[n + wA * threadIdx.y + threadIdx.x];
__syncthreads();
// keep track of the running sum
for (int k = 0; k < blockD; k++)
Pvalue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];
__syncthreads();
}
// write back to the global memory
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
__syncthreads();
MaxFunction(Pd, max);
}
The Main code :
#include<stdio.h>
#include "cuda.h"
#include<stdlib.h>
#define blockD 32
const int wA = 128;
const int hA = 1024;
const int wB = 128;
const int hB = wA;
main(void){
void MatrixMultiplication(float *, float *, float *, float *);
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *M, *N, *P, *C;
// allocate memory on the CPU
M = (float*)malloc(size_A);
N = (float*)malloc(size_B);
P = (float*)malloc(size_max);
C = (float*)malloc(size_C);
// initialize the matrices
for (int y=0; y < hA; y++) {
for (int x=0; x < wA; x++){
M[y*wA + x] = x;
}
}
for (int y=0; y<hB; y++) {
for (int x=0; x<wB; x++){
N[y*wB + x] = x;
}
}
MatrixMultiplication(M, N, P, C);
//Write
FILE *f1;
int i, j;
f1 = fopen("max_val.txt","w");
for(i=0; i < (wB * 2); i+=2){
fprintf(f1,"%d\t%d\n",int(P[i]),int(P[i+1]));
}
fclose(f1);
f1 = fopen("Prod_mat.txt","w");
for(i=0; i < 2; i++){
for(j=0; j < wB; j++){
fprintf(f1,"%d\t",int(C[i*wB + j]));
}
fprintf(f1,"\n");
}
fclose(f1);
free( M );
free( N );
free( P );
free( C );
cudaDeviceReset();
return 0;
}
void MatrixMultiplication(float *M, float *N, float *P, float *C) {
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *Md, *Nd, *Pd, *max;
// allocate memory on the GPU
cudaMalloc((void**)&Md, size_A);
cudaMalloc((void**)&Nd, size_B);
cudaMalloc((void**)&Pd, size_C);
cudaMalloc((void**)&max, size_max);
// transfer M and N to device memory
cudaMemcpy(Md, M, size_A, cudaMemcpyHostToDevice);
cudaMemcpy(Nd, N, size_B, cudaMemcpyHostToDevice);
// kernel invocation code
dim3 dimBlock(blockD, blockD);
dim3 dimGrid(wA/blockD, hB/blockD);
//Execute Kernel
MatrixMulKernel<<<dimGrid, dimBlock>>>( Md, Nd, Pd, max);
// transfer P from device
cudaMemcpy(P, max, size_max, cudaMemcpyDeviceToHost);
cudaMemcpy(C, Pd, size_C, cudaMemcpyDeviceToHost);
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
cudaFree(max);
}
The matrix multiplication result is fine (verified using MATLAB), but I am not able to get the max values and their corresponding indices. I would appreciate it if anyone could kindly point out what I am doing wrong. The max array contains only garbage when I run the above code.
Apparently you are attempting to find the maximum value in each column, as well as the offset to that value.
But multiple threads are hammering on the same location for the max value (max[y*2 + 0]): every thread with the same y but a different x writes there. This isn't recommended, as there is no way to sort out the resulting race condition. You should use atomic operations, or other methods (e.g. a reduction), to handle multiple threads updating a single max value this way.
Since you have a need to update two values atomically (the max value and its location), it's not a simple matter of replacing your plain access with a standard atomic function. However, since you are dealing with two 32-bit adjacent quantities, you may be interested in my answer here.
By the way, I think MATLAB's native matrix multiply on gpuArray should be faster than any matrix multiply code you write, but it would require the Parallel Computing Toolbox.
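As an illustration of the reduction route mentioned above, here is a hedged sketch (my own example, not the asker's code) of a separate kernel that finds the maximum of each column of Pd and its flat index, using one block per column. It assumes the same layout as in the question: Pd is nRows x nCols in row-major order, and the result array holds value/index pairs per column, matching size_max = 2 * wB * sizeof(float).

#include <cfloat>  // for FLT_MAX
__global__ void ColumnMaxKernel(const float* Pd, float* max_out, int nRows, int nCols)
{
    __shared__ float s_val[256];   // one slot per thread; launch with 256 threads per block
    __shared__ int   s_idx[256];
    int col = blockIdx.x;          // one block per column of the product matrix
    float best_val = -FLT_MAX;
    int   best_idx = -1;
    // Each thread scans a strided subset of the rows of this column.
    for (int row = threadIdx.x; row < nRows; row += blockDim.x) {
        float v = Pd[row * nCols + col];
        if (v > best_val) { best_val = v; best_idx = row * nCols + col; }
    }
    s_val[threadIdx.x] = best_val;
    s_idx[threadIdx.x] = best_idx;
    __syncthreads();
    // Standard tree reduction in shared memory: keep the larger value and its index.
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (threadIdx.x < stride && s_val[threadIdx.x + stride] > s_val[threadIdx.x]) {
            s_val[threadIdx.x] = s_val[threadIdx.x + stride];
            s_idx[threadIdx.x] = s_idx[threadIdx.x + stride];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        max_out[col * 2 + 0] = s_val[0];
        max_out[col * 2 + 1] = (float)s_idx[0];  // index stored as float, as in the question
    }
}

It would be launched after MatrixMulKernel completes, e.g. ColumnMaxKernel<<<wB, 256>>>(Pd, max, hA, wB);, instead of calling MaxFunction from inside the multiplication kernel.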