I'm new to using the CUDA libraries and would like to solve a symmetric banded matrix equation. I found sample code that solves this using LU factorization. I am now trying to use the cuBLAS routine cublasDtbsv to solve the equations. I was not able to find sample code for this function, so I have put together my own solution. The problem, I believe, is that I do not understand the correct way to lay out the A matrix for this routine. Here is my sample code for a very simple 3x3 matrix with one right-hand side. It includes the correct solution using LU factorization and my attempt at using the cublasDtbsv routine:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
void test();
void printMatrix2(int m, int n, const double* A, int lda, const char* name);
int LUFactorizationSolver2();
int TriBandedSymSolver2();
int main(int argc, char* argv[])
{
test();
return 0;
}
void test()
{
LUFactorizationSolver2();
TriBandedSymSolver2();
return;
}
int TriBandedSymSolver2()
{
printf("\n**** example of cublasDtbsv \n\n");
const int n = 3;
const int ldm = n;
const int k = 1;// n - 1;
const int lda = n;
const int nrhs = 1;
const int incx = 1;
double M[ldm * n] = { 1.0, 0.0, 0.0
, 0.0, 2.0, 3.0
, 0.0, 3.0, 4.0
};
double A[lda * n] = { 1.0, 2.0, 4.0
, 0.0, 3.0, 0.0
, 0.0, 0.0, 0.0
};
double x[n * nrhs] = { 00.0
, 40.0
, 00.0
};
cublasHandle_t cublasHandle = NULL;
cudaStream_t stream = NULL;
cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
cudaError_t cudaStat4 = cudaSuccess;
double* d_A = NULL; /* device copy of A */
double* d_x = NULL; /* device copy of x */
printf("example of tbsv \n");
printf("A = (matlab base-1)\n");
printMatrix2(n, n, A, lda, "A");
printf("=====\n");
printf("x (b) = (matlab base-1)\n");
printMatrix2(n, nrhs, x, nrhs, "x");
printf("=====\n");
/* step 1: create cusolver handle, bind a stream */
cublasStatus = cublasCreate(&cublasHandle);
assert(CUBLAS_STATUS_SUCCESS == cublasStatus);
/* step 2: copy A to device */
cudaStat1 = cudaMalloc((void**)&d_A, sizeof(double) * lda * n);
cudaStat2 = cudaMalloc((void**)&d_x, sizeof(double) * n * nrhs);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
cudaStat1 = cudaMemcpy(d_A, A, sizeof(double) * lda * n, cudaMemcpyHostToDevice);
cudaStat2 = cudaMemcpy(d_x, x, sizeof(double) * n * nrhs, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
/*
* step 5: solve A*x = b
*
*/
cublasStatus = cublasDtbsv(cublasHandle
, CUBLAS_FILL_MODE_LOWER
, CUBLAS_OP_N
, CUBLAS_DIAG_NON_UNIT
, n
, k
, d_A
, lda
, d_x
, incx
);
cudaStat1 = cudaDeviceSynchronize();
assert(CUBLAS_STATUS_SUCCESS == cublasStatus);
cudaStat1 = cudaMemcpy(x, d_x, sizeof(double) * n * nrhs, cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
printf("X = (matlab base-1)\n");
printMatrix2(n, nrhs, x, nrhs, "x");
printf("=====\n");
/* free resources */
if (d_A) cudaFree(d_A);
if (d_x) cudaFree(d_x);
if (cublasHandle) cublasDestroy(cublasHandle);
if (stream) cudaStreamDestroy(stream);
cudaDeviceReset();
return 0;
}
int LUFactorizationSolver2()
{
printf("\n**** example of cusolverDnDgetrs \n\n");
cusolverDnHandle_t cusolverH = NULL;
cudaStream_t stream = NULL;
cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
cudaError_t cudaStat4 = cudaSuccess;
const int m = 3;
const int lda = m;
const int nrhs = 1; // number of right-hand sides
const int ldb = m;
double A[lda * m] = { 1.0, 0.0, 0.0
, 0.0, 2.0, 3.0
, 0.0, 3.0, 4.0
};
double B[m * nrhs] = { 00.0
, 40.0
, 00.0
};
double X[m * nrhs]; /* X = A\B */
double LU[lda * m]; /* L and U */
int Ipiv[m]; /* host copy of pivoting sequence */
int info = 0; /* host copy of error info */
double* d_A = NULL; /* device copy of A */
double* d_B = NULL; /* device copy of B */
int* d_Ipiv = NULL; /* pivoting sequence */
int* d_info = NULL; /* error info */
int lwork = 0; /* size of workspace */
double* d_work = NULL; /* device workspace for getrf */
const int pivot_on = 0; // 1;
if (pivot_on) {
printf("pivot is on : compute P*A = L*U \n");
}
else {
printf("pivot is off: compute A = L*U (not numerically stable)\n");
}
printf("A = (matlab base-1)\n");
printMatrix2(m, m, A, lda, "A");
printf("=====\n");
printf("B = (matlab base-1)\n");
printMatrix2(m, nrhs, B, ldb, "B");
printf("=====\n");
/* step 1: create cusolver handle, bind a stream */
status = cusolverDnCreate(&cusolverH);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
assert(cudaSuccess == cudaStat1);
status = cusolverDnSetStream(cusolverH, stream);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* step 2: copy A to device */
cudaStat1 = cudaMalloc((void**)&d_A, sizeof(double) * lda * m);
cudaStat2 = cudaMalloc((void**)&d_B, sizeof(double) * m * nrhs);
cudaStat3 = cudaMalloc((void**)&d_Ipiv, sizeof(int) * m);
cudaStat4 = cudaMalloc((void**)&d_info, sizeof(int));
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
assert(cudaSuccess == cudaStat4);
cudaStat1 = cudaMemcpy(d_A, A, sizeof(double) * lda * m, cudaMemcpyHostToDevice);
cudaStat2 = cudaMemcpy(d_B, B, sizeof(double) * m * nrhs, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
/* step 3: query working space of getrf */
status = cusolverDnDgetrf_bufferSize(
cusolverH,
m,
m,
d_A,
lda,
&lwork);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaMalloc((void**)&d_work, sizeof(double) * lwork);
assert(cudaSuccess == cudaStat1);
/* step 4: LU factorization */
if (pivot_on) {
status = cusolverDnDgetrf(
cusolverH,
m,
m,
d_A,
lda,
d_work,
d_Ipiv,
d_info);
}
else {
status = cusolverDnDgetrf(
cusolverH,
m,
m,
d_A,
lda,
d_work,
NULL,
d_info);
}
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == status);
assert(cudaSuccess == cudaStat1);
if (pivot_on) {
cudaStat1 = cudaMemcpy(Ipiv, d_Ipiv, sizeof(int) * m, cudaMemcpyDeviceToHost);
}
cudaStat2 = cudaMemcpy(LU, d_A, sizeof(double) * lda * m, cudaMemcpyDeviceToHost);
cudaStat3 = cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
if (0 > info) {
printf("%d-th parameter is wrong \n", -info);
exit(1);
}
if (pivot_on) {
printf("pivoting sequence, matlab base-1\n");
for (int j = 0; j < m; j++) {
printf("Ipiv(%d) = %d\n", j + 1, Ipiv[j]);
}
}
printf("L and U = (matlab base-1)\n");
printMatrix2(m, m, LU, lda, "LU");
printf("=====\n");
/*
* step 5: solve A*X = B
*
*/
if (pivot_on) {
status = cusolverDnDgetrs(
cusolverH,
CUBLAS_OP_N,
m,
nrhs, /* nrhs */
d_A,
lda,
d_Ipiv,
d_B,
ldb,
d_info);
}
else {
status = cusolverDnDgetrs(
cusolverH,
CUBLAS_OP_N,
m,
nrhs, /* nrhs */
d_A,
lda,
NULL,
d_B,
ldb,
d_info);
}
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == status);
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMemcpy(X, d_B, sizeof(double) * m * nrhs, cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
printf("X = (matlab base-1)\n");
printMatrix2(m, nrhs, X, ldb, "X");
printf("=====\n");
/* free resources */
if (d_A) cudaFree(d_A);
if (d_B) cudaFree(d_B);
if (d_Ipiv) cudaFree(d_Ipiv);
if (d_info) cudaFree(d_info);
if (d_work) cudaFree(d_work);
if (cusolverH) cusolverDnDestroy(cusolverH);
if (stream) cudaStreamDestroy(stream);
cudaDeviceReset();
return 0;
}
void printMatrix2(int m, int n, const double* A, int lda, const char* name)
{
printf("%18s", "");
for (int col = 0; col < n; col++) { printf("%7s(*,%2d) ", name, col + 1); }
printf("\n");
for (int row = 0; row < m; row++) {
printf("%4s(%2d,*) = ", name, row + 1);
for (int col = 0; col < n; col++) {
double Areg = A[row + col * lda];
printf("%20.9f", Areg);
}
printf("\n");
}
return;
}
The correct answer should be:
0.00
-160.00
120.00
But I get:
0.000000000
inf
-inf
I'm developing this on Windows 10 using Visual Studio 2019.
What am I missing, or can someone point me to a working sample for the cublasDtbsv routine?
CUBLAS tbsv is a banded triangular solver. It expects your M matrix to be banded and triangular. If you would like to see what that looks like, this is a good reference.
Your M matrix is not triangular. A triangular matrix has either the upper triangular part (not including the main diagonal) or the lower triangular part (not including the main diagonal) as all zeros. Your M matrix does not fit that definition.
A banded triangular matrix M might look like this:
| 2.0 0.0 0.0 |
M = | 1.0 1.0 0.0 |
| 0.0 1.0 1.0 |
Let's use that as our example, with a RHS of | 2.0 2.0 2.0 |, and use the suggestions given for A matrix formatting here. In that case our A matrix would look like:
A = | 2.0 1.0 1.0 | (the main diagonal of M)
| 1.0 1.0 0.0 | (the first sub-diagonal of M)
In that case our A matrix has 2 rows, and therefore the leading dimension of A is given by lda = 2
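As an aside, the packing can be done mechanically. Here is a minimal sketch of a host-side helper (my own, not part of cuBLAS) that packs the lower band of a column-major n x n matrix M into the layout cublasDtbsv expects, using the rule A[(i - j) + j * lda] = M(i, j) with lda = k + 1:
void pack_lower_band(int n, int k, const double *M, double *A)
{
    /* column j of the band array holds column j of M, shifted so the
       main diagonal lands in row 0 and the sub-diagonals below it;
       entries past the band (the padding) are not referenced by tbsv */
    for (int j = 0; j < n; j++)
        for (int i = j; i <= j + k && i < n; i++)
            A[(i - j) + j * (k + 1)] = M[i + j * n];
}
For the M above (k = 1), this reproduces exactly the 2-row A shown.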
If we put all of that into your test framework, it seems to give the correct result:
$ cat t158.cu
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
void printMatrix2(int m, int n, const double* A, int lda, const char* name)
{
printf("%18s", "");
for (int col = 0; col < n; col++) { printf("%7s(*,%2d) ", name, col + 1); }
printf("\n");
for (int row = 0; row < m; row++) {
printf("%4s(%2d,*) = ", name, row + 1);
for (int col = 0; col < n; col++) {
double Areg = A[row + col * lda];
printf("%20.9f", Areg);
}
printf("\n");
}
return;
}
int main(int argc, char* argv[])
{
printf("\n**** example of cublasDtbsv \n\n");
const int n = 3;
// const int ldm = n;
const int k = 1;
const int lda = k+1;
const int nrhs = 1;
const int incx = 1;
/*
double M[ldm * n] = { 2.0, 0.0, 0.0
, 1.0, 1.0, 0.0
, 0.0, 1.0, 1.0
};
*/
double A[lda * n] = { 2.0, 1.0, 1.0
, 1.0, 1.0, 0.0
};
double x[n * nrhs] = { 2.0
, 2.0
, 2.0
};
cublasHandle_t cublasHandle = NULL;
cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
double* d_A = NULL; /* device copy of A */
double* d_x = NULL; /* device copy of x */
printf("example of tbsv \n");
printf("A = (matlab base-1)\n");
printMatrix2(n, n, A, lda, "A");
printf("=====\n");
printf("x (b) = (matlab base-1)\n");
printMatrix2(n, nrhs, x, nrhs, "x");
printf("=====\n");
/* step 1: create cublas handle */
cublasStatus = cublasCreate(&cublasHandle);
assert(CUBLAS_STATUS_SUCCESS == cublasStatus);
/* step 2: copy A to device */
cudaStat1 = cudaMalloc((void**)&d_A, sizeof(double) * lda * n);
cudaStat2 = cudaMalloc((void**)&d_x, sizeof(double) * n * nrhs);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
cudaStat1 = cudaMemcpy(d_A, A, sizeof(double) * lda * n, cudaMemcpyHostToDevice);
cudaStat2 = cudaMemcpy(d_x, x, sizeof(double) * n * nrhs, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
/*
* step 5: solve A*x = b
*
*/
cublasStatus = cublasDtbsv(cublasHandle
, CUBLAS_FILL_MODE_LOWER
, CUBLAS_OP_N
, CUBLAS_DIAG_NON_UNIT
, n
, k
, d_A
, lda
, d_x
, incx
);
cudaStat1 = cudaDeviceSynchronize();
assert(CUBLAS_STATUS_SUCCESS == cublasStatus);
cudaStat1 = cudaMemcpy(x, d_x, sizeof(double) * n * nrhs, cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
printf("X = (matlab base-1)\n");
printMatrix2(n, nrhs, x, nrhs, "x");
printf("=====\n");
/* free resources */
if (d_A) cudaFree(d_A);
if (d_x) cudaFree(d_x);
if (cublasHandle) cublasDestroy(cublasHandle);
return 0;
}
$ nvcc -o t158 t158.cu -lcublas
$ ./t158
**** example of cublasDtbsv
example of tbsv
A = (matlab base-1)
A(*, 1) A(*, 2) A(*, 3)
A( 1,*) = 2.000000000 1.000000000 1.000000000
A( 2,*) = 1.000000000 1.000000000 0.000000000
A( 3,*) = 1.000000000 1.000000000 0.000000000
=====
x (b) = (matlab base-1)
x(*, 1)
x( 1,*) = 2.000000000
x( 2,*) = 2.000000000
x( 3,*) = 2.000000000
=====
X = (matlab base-1)
x(*, 1)
x( 1,*) = 1.000000000
x( 2,*) = 1.000000000
x( 3,*) = 1.000000000
=====
$
Related
I need to calculate the eigenvalues of a big matrix in parallel. For that purpose I use cuSOLVER. To test how it works, I took code from the documentation:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
void printMatrix(int m, int n, const double*A, int lda, const char* name)
{
for(int row = 0 ; row < m ; row++){
for(int col = 0 ; col < n ; col++){
double Areg = A[row + col*lda];
printf("%s(%d,%d) = %f\n", name, row+1, col+1, Areg);
}
}
}
int main(int argc, char*argv[])
{
cusolverDnHandle_t cusolverH = NULL;
cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
const int m = 3;
const int lda = m;
/* | 3.5 0.5 0 |
* A = | 0.5 3.5 0 |
* | 0 0 2 |
*
*/
double A[lda*m] = { 3.5, 0.5, 0, 0.5, 3.5, 0, 0, 0, 2.0};
double lambda[m] = { 2.0, 3.0, 4.0};
double V[lda*m]; // eigenvectors
double W[m]; // eigenvalues
double *d_A = NULL;
double *d_W = NULL;
int *devInfo = NULL;
double *d_work = NULL;
int lwork = 0;
int info_gpu = 0;
printf("A = (matlab base-1)\n");
printMatrix(m, m, A, lda, "A");
printf("=====\n");
// step 1: create cusolver/cublas handle
cusolver_status = cusolverDnCreate(&cusolverH);
assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
// step 2: copy A and B to device
cudaStat1 = cudaMalloc ((void**)&d_A, sizeof(double) * lda * m);
cudaStat2 = cudaMalloc ((void**)&d_W, sizeof(double) * m);
cudaStat3 = cudaMalloc ((void**)&devInfo, sizeof(int));
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
cudaStat1 = cudaMemcpy(d_A, A, sizeof(double) * lda * m, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
// step 3: query working space of syevd
cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // compute eigenvalues and eigenvectors.
cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
cusolver_status = cusolverDnDsyevd_bufferSize(
cusolverH,
jobz,
uplo,
m,
d_A,
lda,
d_W,
&lwork);
assert (cusolver_status == CUSOLVER_STATUS_SUCCESS);
cudaStat1 = cudaMalloc((void**)&d_work, sizeof(double)*lwork);
assert(cudaSuccess == cudaStat1);
// step 4: compute spectrum
cusolver_status = cusolverDnDsyevd(
cusolverH,
jobz,
uplo,
m,
d_A,
lda,
d_W,
d_work,
lwork,
devInfo);
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMemcpy(W, d_W, sizeof(double)*m, cudaMemcpyDeviceToHost);
cudaStat2 = cudaMemcpy(V, d_A, sizeof(double)*lda*m, cudaMemcpyDeviceToHost);
cudaStat3 = cudaMemcpy(&info_gpu, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
printf("after syevd: info_gpu = %d\n", info_gpu);
assert(0 == info_gpu);
printf("eigenvalue = (matlab base-1), ascending order\n");
for(int i = 0 ; i < m ; i++){
printf("W[%d] = %E\n", i+1, W[i]);
}
printf("V = (matlab base-1)\n");
printMatrix(m, m, V, lda, "V");
printf("=====\n");
// step 4: check eigenvalues
double lambda_sup = 0;
for(int i = 0 ; i < m ; i++){
double error = fabs( lambda[i] - W[i]);
lambda_sup = (lambda_sup > error)? lambda_sup : error;
}
printf("|lambda - W| = %E\n", lambda_sup);
// free resources
if (d_A ) cudaFree(d_A);
if (d_W ) cudaFree(d_W);
if (devInfo) cudaFree(devInfo);
if (d_work ) cudaFree(d_work);
if (cusolverH) cusolverDnDestroy(cusolverH);
cudaDeviceReset();
return 0;
}
But after compiling it I got the following errors: undefined reference to `cusolverDnCreate', undefined reference to `cusolverDnDsyevd_bufferSize', undefined reference to `cusolverDnDsyevd', and undefined reference to `cusolverDnDestroy', even though all of those functions exist in <cusolverDn.h>.
The strangest thing is that when I try to compile this file as suggested in undefined reference to cusolverDn, instead of the first two errors listed above (the others stay the same) I get new errors: identifier "cusolverEigMode_t" is undefined and identifier "CUSOLVER_EIG_MODE_VECTOR" is undefined.
I have been trying to solve this for a couple of days, and I would be really grateful for any help!
You have to link -lcusolver in your Makefile
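For example (a minimal sketch; eigen_test.cu is a placeholder for your source file name):
$ nvcc -o eigen_test eigen_test.cu -lcusolver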
I'm trying to do a simple test of the Csrmv_mp function. I have a working program, but I'm getting a wrong result for a specific pair of matrix and vector. If I run the exact same program but with Csrmv I get the correct result.
Here is my code (it is a reduced version of the example in appendix C of the cuSPARSE documentation: http://docs.nvidia.com/cuda/cusparse/index.html#csrmv_examples):
/*
* How to compile (assume cuda is installed at /usr/local/cuda/)
* nvcc -c -I/usr/local/cuda/include csrmvmp_example.cpp
* g++ -fopenmp -o csrmvmp_example csrmvmp_example.o -L/usr/local/cuda/lib64 -lcublas -lcusparse -lcudart
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusparse.h>
void printMatrix(int m, int n, const double*A, int lda, const char* name)
{
for (int row = 0; row < m; row++) {
for (int col = 0; col < n; col++) {
double Areg = A[row + col*lda];
printf("%s(%d,%d) = %f\n", name, row + 1, col + 1, Areg);
}
}
}
int main(int argc, char*argv[])
{
cublasHandle_t cublasH = NULL;
cusparseHandle_t cusparseH = NULL;
cudaStream_t stream = NULL;
cusparseMatDescr_t descrA = NULL;
cublasStatus_t cublasStat = CUBLAS_STATUS_SUCCESS;
cusparseStatus_t cusparseStat = CUSPARSE_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
cudaError_t cudaStat4 = cudaSuccess;
cudaError_t cudaStat5 = cudaSuccess;
const int n = 3;
const int nnzA = 6;
/*
* | 0 1 2 |
* A = | 1 0 3 |
* | 2 3 0 |
*
* Initial vector
*
* | 1/3 |
* v = | 1/3 |
* | 1/3 |
*
*/
const int csrRowPtrA[n + 1] = { 0, 2, 4, 6 };
const int csrColIndA[nnzA] = { 1, 2, 0, 2, 0, 1 };
const double csrValA[nnzA] = { 1.0, 2.0, 1.0, 3.0, 2.0, 3.0 };
const double x0[n] = { 1.0/3.0, 1.0/3.0, 1.0/3.0 }; /* initial guess */
double x[n]; /* numerical eigenvector */
int *d_csrRowPtrA = NULL;
int *d_csrColIndA = NULL;
double *d_csrValA = NULL;
double *d_x = NULL; /* eigenvector */
double *d_y = NULL; /* workspace */
const double tol = 1.e-6;
const int max_ites = 30;
const double h_one = 1.0;
const double h_zero = 0.0;
/* step 1: create cublas/cusparse handle, bind a stream */
cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
assert(cudaSuccess == cudaStat1);
cublasStat = cublasCreate(&cublasH);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
cublasStat = cublasSetStream(cublasH, stream);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
cusparseStat = cusparseCreate(&cusparseH);
assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
cusparseStat = cusparseSetStream(cusparseH, stream);
assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
/* step 2: configuration of matrix A */
cusparseStat = cusparseCreateMatDescr(&descrA);
assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
/* step 3: copy A and x0 to device */
cudaStat1 = cudaMalloc((void**)&d_csrRowPtrA, sizeof(int) * (n + 1));
cudaStat2 = cudaMalloc((void**)&d_csrColIndA, sizeof(int) * nnzA);
cudaStat3 = cudaMalloc((void**)&d_csrValA, sizeof(double) * nnzA);
cudaStat4 = cudaMalloc((void**)&d_x, sizeof(double) * n);
cudaStat5 = cudaMalloc((void**)&d_y, sizeof(double) * n);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
assert(cudaSuccess == cudaStat4);
assert(cudaSuccess == cudaStat5);
cudaStat1 = cudaMemcpy(d_csrRowPtrA, csrRowPtrA, sizeof(int) * (n + 1),
cudaMemcpyHostToDevice);
cudaStat2 = cudaMemcpy(d_csrColIndA, csrColIndA, sizeof(int) * nnzA,
cudaMemcpyHostToDevice);
cudaStat3 = cudaMemcpy(d_csrValA, csrValA, sizeof(double) * nnzA,
cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
/*
* 4.1: initial guess x0
*/
cudaStat1 = cudaMemcpy(d_x, x0, sizeof(double) * n, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
/*
* 4.3: y = A*x
*/
cusparseStat = cusparseDcsrmv_mp(cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, n, n, nnzA, &h_one, descrA, d_csrValA, d_csrRowPtrA, d_csrColIndA, d_x, &h_zero, d_y);
assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
/*
* step 5: report result
*/
cudaStat1 = cudaMemcpy(x, d_y, sizeof(double) * n, cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
printf("vector = \n");
printMatrix(n, 1, x, n, "V0");
printf("=====\n");
/* free resources */
if (d_csrRowPtrA) cudaFree(d_csrRowPtrA);
if (d_csrColIndA) cudaFree(d_csrColIndA);
if (d_csrValA) cudaFree(d_csrValA);
if (d_x) cudaFree(d_x);
if (d_y) cudaFree(d_y);
if (cublasH) cublasDestroy(cublasH);
if (cusparseH) cusparseDestroy(cusparseH);
if (stream) cudaStreamDestroy(stream);
if (descrA) cusparseDestroyMatDescr(descrA);
cudaDeviceReset();
return 0;
}
The resulting vector is {1, 1, 1}, but doing the calculation by hand or with the Csrmv function I get the vector {1, 4/3, 5/3} as a result.
I really don't understand why I have this problem. The only thing I can think of is that I wrote the matrix incorrectly in its CSR format. Also, I don't use CUSPARSE_MATRIX_TYPE_SYMMETRIC because the function doesn't accept this type of matrix (the documentation is wrong).
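To rule out a CSR mistake, I also expanded the arrays on the host with a plain SpMV loop (a minimal sketch, no cuSPARSE involved), and it does print the expected {1, 4/3, 5/3}:
/* host-side sanity check: y = A*x0 computed straight from the CSR arrays */
for (int row = 0; row < n; row++) {
    double sum = 0.0;
    for (int idx = csrRowPtrA[row]; idx < csrRowPtrA[row + 1]; idx++)
        sum += csrValA[idx] * x0[csrColIndA[idx]];
    printf("y[%d] = %f\n", row, sum);
}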
If someone could help me I would appreciate it very much.
Edit:
I'm using CUDA 9.0, my OS is Windows 10 Home, and my GPU is a GTX 960M.
I just updated to CUDA 9.1 and, as Robert Crovella said, the bug has been solved.
I'm new to parallel programming using GPU so I apologize if the question is broad or vague. I'm aware there is some parallel SVD function in the CULA library, but what should be the strategy if I have a large number of relatively small matrices to factorize? For example I have n matrices with dimension d, n is large and d is small. How to parallelize this process? Could anyone give me a hint?
My previous answer is now out-of-date. As of February 2015, CUDA 7 (currently in release candidate version) offers full SVD capabilities in its cuSOLVER library. Below, I'm providing an example of generating the singular value decomposition using CUDA cuSOLVER.
Concerning the specific issue you are raising (calculating the SVD of several matrices of small size), you should adapt the example I'm providing below by using streams. To associate a stream to each task you can use
cudaStreamCreate()
and
cusolverDnSetStream()
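A minimal sketch of that pattern (my own, assuming one handle and one stream per small-matrix task; NTASKS is a placeholder):
cudaStream_t streams[NTASKS];
cusolverDnHandle_t handles[NTASKS];
for (int t = 0; t < NTASKS; t++) {
    cudaStreamCreate(&streams[t]);               // one stream per task
    cusolverDnCreate(&handles[t]);               // one handle per task
    cusolverDnSetStream(handles[t], streams[t]); // bind the handle to its stream
    // ... issue the gesvd calls for matrix t through handles[t] ...
}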
kernel.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include<iostream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>
#include<math.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>
#include "Utilities.cuh"
/********/
/* MAIN */
/********/
int main(){
// --- gesvd only supports Nrows >= Ncols
// --- column major memory ordering
const int Nrows = 7;
const int Ncols = 5;
// --- cuSOLVE input/output parameters/arrays
int work_size = 0;
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle;
cusolverDnCreate(&solver_handle);
// --- Setting the host, Nrows x Ncols matrix
double *h_A = (double *)malloc(Nrows * Ncols * sizeof(double));
for(int j = 0; j < Nrows; j++)
for(int i = 0; i < Ncols; i++)
h_A[j + i*Nrows] = (i + j*j) * sqrt((double)(i + j));
// --- Setting the device matrix and moving the host matrix to the device
double *d_A; gpuErrchk(cudaMalloc(&d_A, Nrows * Ncols * sizeof(double)));
gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(double), cudaMemcpyHostToDevice));
// --- host side SVD results space
double *h_U = (double *)malloc(Nrows * Nrows * sizeof(double));
double *h_V = (double *)malloc(Ncols * Ncols * sizeof(double));
double *h_S = (double *)malloc(min(Nrows, Ncols) * sizeof(double));
// --- device side SVD workspace and matrices
double *d_U; gpuErrchk(cudaMalloc(&d_U, Nrows * Nrows * sizeof(double)));
double *d_V; gpuErrchk(cudaMalloc(&d_V, Ncols * Ncols * sizeof(double)));
double *d_S; gpuErrchk(cudaMalloc(&d_S, min(Nrows, Ncols) * sizeof(double)));
// --- CUDA SVD initialization
cusolveSafeCall(cusolverDnDgesvd_bufferSize(solver_handle, Nrows, Ncols, &work_size));
double *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));
// --- CUDA SVD execution
cusolveSafeCall(cusolverDnDgesvd(solver_handle, 'A', 'A', Nrows, Ncols, d_A, Nrows, d_S, d_U, Nrows, d_V, Ncols, work, work_size, NULL, devInfo));
int devInfo_h = 0; gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
if (devInfo_h != 0) std::cout << "Unsuccessful SVD execution\n\n";
// --- Moving the results from device to host
gpuErrchk(cudaMemcpy(h_S, d_S, min(Nrows, Ncols) * sizeof(double), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_U, d_U, Nrows * Nrows * sizeof(double), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_V, d_V, Ncols * Ncols * sizeof(double), cudaMemcpyDeviceToHost));
std::cout << "Singular values\n";
for(int i = 0; i < min(Nrows, Ncols); i++)
std::cout << "d_S["<<i<<"] = " << std::setprecision(15) << h_S[i] << std::endl;
std::cout << "\nLeft singular vectors - For y = A * x, the columns of U span the space of y\n";
for(int j = 0; j < Nrows; j++) {
printf("\n");
for(int i = 0; i < Nrows; i++)
printf("U[%i,%i]=%f\n",i,j,h_U[j*Nrows + i]);
}
std::cout << "\nRight singular vectors - For y = A * x, the columns of V span the space of x\n";
for(int i = 0; i < Ncols; i++) {
printf("\n");
for(int j = 0; j < Ncols; j++)
printf("V[%i,%i]=%f\n",i,j,h_V[j*Ncols + i]);
}
cusolverDnDestroy(solver_handle);
return 0;
}
Utilities.cuh
#ifndef UTILITIES_CUH
#define UTILITIES_CUH
extern "C" int iDivUp(int, int);
extern "C" void gpuErrchk(cudaError_t);
extern "C" void cusolveSafeCall(cusolverStatus_t);
#endif
Utilities.cu
#include <stdio.h>
#include <assert.h>
#include "cuda_runtime.h"
#include <cuda.h>
#include <cusolverDn.h>
/*******************/
/* iDivUp FUNCTION */
/*******************/
extern "C" int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
static const char *_cudaGetErrorEnum(cusolverStatus_t error)
{
switch (error)
{
case CUSOLVER_STATUS_SUCCESS:
return "CUSOLVER_SUCCESS";
case CUSOLVER_STATUS_NOT_INITIALIZED:
return "CUSOLVER_STATUS_NOT_INITIALIZED";
case CUSOLVER_STATUS_ALLOC_FAILED:
return "CUSOLVER_STATUS_ALLOC_FAILED";
case CUSOLVER_STATUS_INVALID_VALUE:
return "CUSOLVER_STATUS_INVALID_VALUE";
case CUSOLVER_STATUS_ARCH_MISMATCH:
return "CUSOLVER_STATUS_ARCH_MISMATCH";
case CUSOLVER_STATUS_EXECUTION_FAILED:
return "CUSOLVER_STATUS_EXECUTION_FAILED";
case CUSOLVER_STATUS_INTERNAL_ERROR:
return "CUSOLVER_STATUS_INTERNAL_ERROR";
case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
}
return "<unknown>";
}
inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
if(CUSOLVER_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSOLVE error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }
You can take a look at the Batched Operations post of the CULA blog for a discussion of your problem.
EDIT
From what I understand from your comment below, you would like each thread to calculate a separate SVD. So, basically each thread should execute a standard, sequential SVD scheme. For that some possibly useful references:
Numerical Recipes
Golub, Van Loan, Matrix Computations
If you use this approach, though, I'm afraid you will no longer be able to use cuBLAS, as those are host functions not callable from the device (unless you have a compute capability >= 3.5; see the simpleDevLibCUBLAS example). But basically, in this way you would be implementing the batch concept by yourself.
If you decide to go to a more standard parallel GPU implementation, the reference below could be of interest:
Singular Value Decomposition on GPU using CUDA
The above answers are now out of date. As of CUDA 9.0, the cuSOLVER library has been equipped with a batched SVD calculation based on the Jacobi method. Below is a fully worked example:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
//#define FULLSVD
//#define PRINTRESULTS
/********/
/* MAIN */
/********/
int main() {
const int M = 3;
const int N = 3;
const int lda = M;
//const int numMatrices = 3;
const int numMatrices = 16384;
TimingGPU timerGPU;
// --- Setting the host matrix
double *h_A = (double *)malloc(lda * N * numMatrices * sizeof(double));
for (unsigned int k = 0; k < numMatrices; k++)
for (unsigned int i = 0; i < M; i++){
for (unsigned int j = 0; j < N; j++){
h_A[k * M * N + j * M + i] = (1. / (k + 1)) * (i + j * j) * (i + j);
//printf("%d %d %f\n", i, j, h_A[j*M + i]);
}
}
// --- Setting the device matrix and moving the host matrix to the device
double *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * numMatrices * sizeof(double)));
gpuErrchk(cudaMemcpy(d_A, h_A, M * N * numMatrices * sizeof(double), cudaMemcpyHostToDevice));
// --- host side SVD results space
double *h_S = (double *)malloc(N * numMatrices * sizeof(double));
double *h_U = NULL;
double *h_V = NULL;
#ifdef FULLSVD
h_U = (double *)malloc(M * M * numMatrices * sizeof(double));
h_V = (double *)malloc(N * N * numMatrices * sizeof(double));
#endif
// --- device side SVD workspace and matrices
int work_size = 0;
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
double *d_S; gpuErrchk(cudaMalloc(&d_S, N * numMatrices * sizeof(double)));
double *d_U = NULL;
double *d_V = NULL;
#ifdef FULLSVD
gpuErrchk(cudaMalloc(&d_U, M * M * numMatrices * sizeof(double)));
gpuErrchk(cudaMalloc(&d_V, N * N * numMatrices * sizeof(double)));
#endif
double *d_work = NULL; /* device workspace for gesvdj */
int devInfo_h = 0; /* host copy of devInfo */
// --- Parameters configuration of Jacobi-based SVD
const double tol = 1.e-7;
const int maxSweeps = 15;
cusolverEigMode_t jobz; // --- CUSOLVER_EIG_MODE_VECTOR - Compute eigenvectors; CUSOLVER_EIG_MODE_NOVECTOR - Compute singular values only
#ifdef FULLSVD
jobz = CUSOLVER_EIG_MODE_VECTOR;
#else
jobz = CUSOLVER_EIG_MODE_NOVECTOR;
#endif
const int econ = 0; // --- econ = 1 for economy size
// --- Numerical result parameters of gesvdj
double residual = 0;
int executedSweeps = 0;
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle = NULL;
cusolveSafeCall(cusolverDnCreate(&solver_handle));
// --- Configuration of gesvdj
gesvdjInfo_t gesvdj_params = NULL;
cusolveSafeCall(cusolverDnCreateGesvdjInfo(&gesvdj_params));
// --- Set the computation tolerance, since the default tolerance is machine precision
cusolveSafeCall(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol));
// --- Set the maximum number of sweeps, since the default value of max. sweeps is 100
cusolveSafeCall(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, maxSweeps));
// --- Query the SVD workspace
cusolveSafeCall(cusolverDnDgesvdjBatched_bufferSize(
solver_handle,
jobz, // --- Compute the singular vectors or not
M, // --- Number of rows of A, 0 <= M
N, // --- Number of columns of A, 0 <= N
d_A, // --- M x N
lda, // --- Leading dimension of A
d_S, // --- Square matrix of size min(M, N) x min(M, N)
d_U, // --- M x M if econ = 0, M x min(M, N) if econ = 1
lda, // --- Leading dimension of U, ldu >= max(1, M)
d_V, // --- N x N if econ = 0, N x min(M,N) if econ = 1
lda, // --- Leading dimension of V, ldv >= max(1, N)
&work_size,
gesvdj_params,
numMatrices));
gpuErrchk(cudaMalloc(&d_work, sizeof(double) * work_size));
// --- Compute SVD
timerGPU.StartCounter();
cusolveSafeCall(cusolverDnDgesvdjBatched(
solver_handle,
jobz, // --- Compute the singular vectors or not
M, // --- Number of rows of A, 0 <= M
N, // --- Number of columns of A, 0 <= N
d_A, // --- M x N
lda, // --- Leading dimension of A
d_S, // --- Square matrix of size min(M, N) x min(M, N)
d_U, // --- M x M if econ = 0, M x min(M, N) if econ = 1
lda, // --- Leading dimension of U, ldu >= max(1, M)
d_V, // --- N x N if econ = 0, N x min(M, N) if econ = 1
lda, // --- Leading dimension of V, ldv >= max(1, N)
d_work,
work_size,
devInfo,
gesvdj_params,
numMatrices));
printf("Calculation of the singular values only: %f ms\n\n", timerGPU.GetCounter());
gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_S, d_S, sizeof(double) * N * numMatrices, cudaMemcpyDeviceToHost));
#ifdef FULLSVD
gpuErrchk(cudaMemcpy(h_U, d_U, sizeof(double) * lda * M * numMatrices, cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_V, d_V, sizeof(double) * lda * N * numMatrices, cudaMemcpyDeviceToHost));
#endif
#ifdef PRINTRESULTS
printf("SINGULAR VALUES \n");
printf("_______________ \n");
for (int k = 0; k < numMatrices; k++) {
for (int p = 0; p < N; p++)
printf("Matrix nr. %d; SV nr. %d; Value = %f\n", k, p, h_S[k * N + p]);
printf("\n");
}
#ifdef FULLSVD
printf("SINGULAR VECTORS U \n");
printf("__________________ \n");
for (int k = 0; k < numMatrices; k++) {
for (int q = 0; q < (1 - econ) * M + econ * min(M, N); q++)
for (int p = 0; p < M; p++)
printf("Matrix nr. %d; U nr. %d; Value = %f\n", k, p, h_U[((1 - econ) * M + econ * min(M, N)) * M * k + q * M + p]);
printf("\n");
}
printf("SINGULAR VECTORS V \n");
printf("__________________ \n");
for (int k = 0; k < numMatrices; k++) {
for (int q = 0; q < (1 - econ) * N + econ * min(M, N); q++)
for (int p = 0; p < N; p++)
printf("Matrix nr. %d; V nr. %d; Value = %f\n", k, p, h_V[((1 - econ) * N + econ * min(M, N)) * N * k + q * N + p]);
printf("\n");
}
#endif
#endif
if (0 == devInfo_h){
printf("gesvdj converges \n");
}
else if (0 > devInfo_h){
printf("%d-th parameter is wrong \n", -devInfo_h);
exit(1);
}
else{
printf("WARNING: devInfo_h = %d : gesvdj does not converge \n", devInfo_h);
}
// --- Free resources
if (d_A) gpuErrchk(cudaFree(d_A));
if (d_S) gpuErrchk(cudaFree(d_S));
#ifdef FULLSVD
if (d_U) gpuErrchk(cudaFree(d_U));
if (d_V) gpuErrchk(cudaFree(d_V));
#endif
if (devInfo) gpuErrchk(cudaFree(devInfo));
if (d_work) gpuErrchk(cudaFree(d_work));
if (solver_handle) cusolveSafeCall(cusolverDnDestroy(solver_handle));
if (gesvdj_params) cusolveSafeCall(cusolverDnDestroyGesvdjInfo(gesvdj_params));
gpuErrchk(cudaDeviceReset());
return 0;
}
Quoting the "Kepler Tuning Guide" provided by NVIDIA:
Also note that Kepler GPUs can utilize ILP in place of
thread/warp-level parallelism (TLP) more readily than Fermi GPUs can.
In my opinion, the following code snippet
a = .....;
a2 = f(a);
a3 = g(a2);
can be improved as follows
a = ...;
b = ....;
a2 = f(a);
b2 = f(b);
a3 = g(a2);
b3 = g(b2);
So in my projects, I have a section of code as follows (example 1)
if(x < src.cols && y < src.rows)
{
if(!mask(y,x))
{
src.ptr(y)[x] = make_short4(0,0,0,0);
}
}
and I rewrite it as follows (example 2)
if(x < src.cols && y < src.rows)
{
if(!mask(y,x))
{
short4 t;
t.x = 0;
t.y = 0;
t.z = 0;
t.w = 0;
src.ptr(y)[x].x = t.x;
src.ptr(y)[x].y = t.y;
src.ptr(y)[x].z = t.z;
src.ptr(y)[x].w = t.w;
}
}
On the Kepler architecture, will example 2 be more efficient and exhibit better performance than example 1?
A good explanation on Instruction Level Parallelism (ILP) can be found at CUDA Performance: Maximizing Instruction-Level Parallelism.
It has been pointed out by Robert Crovella and talonmies, and you have recognized it yourself, that your example above does not achieve ILP.
Concerning how to implement ILP, I'm showing below the classical example, translated from the PyCUDA code at numbapro-examples, which I have tested on a Fermi and on a Kepler GPU. Please notice that for the latter case I have not observed relevant speedups.
THE CODE
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define BLOCKSIZE 64
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){
return ((a % b) != 0) ? (a / b + 1) : (a / b);
}
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/************************************/
/* NO INSTRUCTION LEVEL PARALLELISM */
/************************************/
__global__ void ILP0(float* d_a, float* d_b, float* d_c) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
d_c[i] = d_a[i] + d_b[i];
}
/************************************/
/* INSTRUCTION LEVEL PARALLELISM X2 */
/************************************/
__global__ void ILP2(float* d_a, float* d_b, float* d_c) {
// --- Loading the data
int i = threadIdx.x + blockIdx.x * blockDim.x;
float ai = d_a[i];
float bi = d_b[i];
int stride = gridDim.x * blockDim.x;
int j = i + stride;
float aj = d_a[j];
float bj = d_b[j];
// --- Computing
float ci = ai + bi;
float cj = aj + bj;
// --- Writing the data
d_c[i] = ci;
d_c[j] = cj;
}
/************************************/
/* INSTRUCTION LEVEL PARALLELISM X4 */
/************************************/
__global__ void ILP4(float* d_a, float* d_b, float* d_c) {
// --- Loading the data
int i = threadIdx.x + blockIdx.x * blockDim.x;
float ai = d_a[i];
float bi = d_b[i];
int stride = gridDim.x * blockDim.x;
int j = i + stride;
float aj = d_a[j];
float bj = d_b[j];
int k = j + stride;
float ak = d_a[k];
float bk = d_b[k];
int l = k + stride;
float al = d_a[l];
float bl = d_b[l];
// --- Computing
float ci = ai + bi;
float cj = aj + bj;
float ck = ak + bk;
float cl = al + bl;
// --- Writing the data
d_c[i] = ci;
d_c[j] = cj;
d_c[k] = ck;
d_c[l] = cl;
}
/************************************/
/* INSTRUCTION LEVEL PARALLELISM X8 */
/************************************/
__global__ void ILP8(float* d_a, float* d_b, float* d_c) {
// --- Loading the data
int i = threadIdx.x + blockIdx.x * blockDim.x;
float ai = d_a[i];
float bi = d_b[i];
int stride = gridDim.x * blockDim.x;
int j = i + stride;
float aj = d_a[j];
float bj = d_b[j];
int k = j + stride;
float ak = d_a[k];
float bk = d_b[k];
int l = k + stride;
float al = d_a[l];
float bl = d_b[l];
int m = l + stride;
float am = d_a[m];
float bm = d_b[m];
int n = m + stride;
float an = d_a[n];
float bn = d_b[n];
int p = n + stride;
float ap = d_a[p];
float bp = d_b[p];
int q = p + stride;
float aq = d_a[q];
float bq = d_b[q];
// --- Computing
float ci = ai + bi;
float cj = aj + bj;
float ck = ak + bk;
float cl = al + bl;
float cm = am + bm;
float cn = an + bn;
float cp = ap + bp;
float cq = aq + bq;
// --- Writing the data
d_c[i] = ci;
d_c[j] = cj;
d_c[k] = ck;
d_c[l] = cl;
d_c[m] = cm;
d_c[n] = cn;
d_c[p] = cp;
d_c[q] = cq;
}
/********/
/* MAIN */
/********/
int main() {
float timing;
cudaEvent_t start, stop;
const int N = 65536*4; // --- ASSUMPTION: N can be divided by BLOCKSIZE
float* a = (float*)malloc(N*sizeof(float));
float* b = (float*)malloc(N*sizeof(float));
float* c = (float*)malloc(N*sizeof(float));
float* c_ref = (float*)malloc(N*sizeof(float));
srand(time(NULL));
for (int i=0; i<N; i++) {
a[i] = rand() / (float)RAND_MAX;
b[i] = rand() / (float)RAND_MAX;
c_ref[i] = a[i] + b[i];
}
float* d_a; gpuErrchk(cudaMalloc((void**)&d_a,N*sizeof(float)));
float* d_b; gpuErrchk(cudaMalloc((void**)&d_b,N*sizeof(float)));
float* d_c0; gpuErrchk(cudaMalloc((void**)&d_c0,N*sizeof(float)));
float* d_c2; gpuErrchk(cudaMalloc((void**)&d_c2,N*sizeof(float)));
float* d_c4; gpuErrchk(cudaMalloc((void**)&d_c4,N*sizeof(float)));
float* d_c8; gpuErrchk(cudaMalloc((void**)&d_c8,N*sizeof(float)));
gpuErrchk(cudaMemcpy(d_a, a, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_b, b, N*sizeof(float), cudaMemcpyHostToDevice));
/******************/
/* ILP0 TEST CASE */
/******************/
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
ILP0<<<iDivUp(N,BLOCKSIZE),BLOCKSIZE>>>(d_a, d_b, d_c0);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP0: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c0, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("Error!\n");
return 1;
}
printf("Test passed!\n");
/******************/
/* ILP2 TEST CASE */
/******************/
cudaEventRecord(start, 0);
ILP2<<<(N/2)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c2);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP2: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c2, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("Error!\n");
return 1;
}
printf("Test passed!\n");
/******************/
/* ILP4 TEST CASE */
/******************/
cudaEventRecord(start, 0);
ILP4<<<(N/4)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c4);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP4: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c4, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("Error!\n");
return 1;
}
printf("Test passed!\n");
/******************/
/* ILP8 TEST CASE */
/******************/
cudaEventRecord(start, 0);
ILP8<<<(N/8)/BLOCKSIZE,BLOCKSIZE>>>(d_a, d_b, d_c8);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timing, start, stop);
printf("Elapsed time - ILP8: %3.3f ms \n", timing);
gpuErrchk(cudaMemcpy(c, d_c8, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Checking the results
for (int i=0; i<N; i++)
if (c[i] != c_ref[i]) {
printf("%f %f\n",c[i],c_ref[i]);
printf("Error!\n");
return 1;
}
printf("Test passed!\n");
return 0;
}
PERFORMANCE
Card Kernel Time [ms] Speedup
GeForce GT540M ILP0 4.609 1
" ILP2 2.666 1.72
" ILP4 1.675 2.76
" ILP8 1.477 3.12
Kepler K20c ILP0 0.045
" ILP2 0.043
" ILP4 0.043
" ILP8 0.042