I'm new to using the CUDA libraries and would like to solve a symmetric banded matrix equation. I found sample code that solves it using LU factorization, and I am now trying to use the cuBLAS routine cublasDtbsv to solve the same system. I was not able to find sample code for this function, so I have put together my own solution. The problem, I believe, is that I do not understand the correct way to lay out the A matrix for this routine. Here is my sample code for a very simple 3x3 matrix with one right-hand side. It includes the correct solution using LU factorization and my attempt to use the cublasDtbsv routine:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
void test();
void printMatrix2(int m, int n, const double* A, int lda, const char* name);
int LUFactorizationSolver2();
int TriBandedSymSolver2();
int main(int argc, char* argv[])
{
test();
return 0;
}
void test()
{
LUFactorizationSolver2();
TriBandedSymSolver2();
return;
}
int TriBandedSymSolver2()
{
printf("\n**** example of cublasDtbsv \n\n");
const int n = 3;
const int ldm = n;
const int k = 1;// n - 1;
const int lda = n;
const int nrhs = 1;
const int incx = 1;
double M[ldm * n] = { 1.0, 0.0, 0.0
, 0.0, 2.0, 3.0
, 0.0, 3.0, 4.0
};
double A[lda * n] = { 1.0, 2.0, 4.0
, 0.0, 3.0, 0.0
, 0.0, 0.0, 0.0
};
double x[n * nrhs] = { 00.0
, 40.0
, 00.0
};
cublasHandle_t cublasHandle = NULL;
cudaStream_t stream = NULL;
cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
cudaError_t cudaStat4 = cudaSuccess;
double* d_A = NULL; /* device copy of A */
double* d_x = NULL; /* device copy of x */
printf("example of tbsv \n");
printf("A = (matlab base-1)\n");
printMatrix2(n, n, A, lda, "A");
printf("=====\n");
printf("x (b) = (matlab base-1)\n");
printMatrix2(n, nrhs, x, nrhs, "x");
printf("=====\n");
/* step 1: create cublas handle */
cublasStatus = cublasCreate(&cublasHandle);
assert(CUBLAS_STATUS_SUCCESS == cublasStatus);
/* step 2: copy A to device */
cudaStat1 = cudaMalloc((void**)&d_A, sizeof(double) * lda * n);
cudaStat2 = cudaMalloc((void**)&d_x, sizeof(double) * n * nrhs);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
cudaStat1 = cudaMemcpy(d_A, A, sizeof(double) * lda * n, cudaMemcpyHostToDevice);
cudaStat2 = cudaMemcpy(d_x, x, sizeof(double) * n * nrhs, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
/*
* step 5: solve A*x = b
*
*/
cublasStatus = cublasDtbsv(cublasHandle
, CUBLAS_FILL_MODE_LOWER
, CUBLAS_OP_N
, CUBLAS_DIAG_NON_UNIT
, n
, k
, d_A
, lda
, d_x
, incx
);
cudaStat1 = cudaDeviceSynchronize();
assert(CUBLAS_STATUS_SUCCESS == cublasStatus);
cudaStat1 = cudaMemcpy(x, d_x, sizeof(double) * n * nrhs, cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
printf("X = (matlab base-1)\n");
printMatrix2(n, nrhs, x, nrhs, "x");
printf("=====\n");
/* free resources */
if (d_A) cudaFree(d_A);
if (d_x) cudaFree(d_x);
if (cublasHandle) cublasDestroy(cublasHandle);
if (stream) cudaStreamDestroy(stream);
cudaDeviceReset();
return 0;
}
int LUFactorizationSolver2()
{
printf("\n**** example of cusolverDnDgetrs \n\n");
cusolverDnHandle_t cusolverH = NULL;
cudaStream_t stream = NULL;
cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
cudaError_t cudaStat4 = cudaSuccess;
const int m = 3;
const int lda = m;
const int nrhs = 1; // number of right-hand sides
const int ldb = m;
double A[lda * m] = { 1.0, 0.0, 0.0
, 0.0, 2.0, 3.0
, 0.0, 3.0, 4.0
};
double B[m * nrhs] = { 00.0
, 40.0
, 00.0
};
double X[m * nrhs]; /* X = A\B */
double LU[lda * m]; /* L and U */
int Ipiv[m]; /* host copy of pivoting sequence */
int info = 0; /* host copy of error info */
double* d_A = NULL; /* device copy of A */
double* d_B = NULL; /* device copy of B */
int* d_Ipiv = NULL; /* pivoting sequence */
int* d_info = NULL; /* error info */
int lwork = 0; /* size of workspace */
double* d_work = NULL; /* device workspace for getrf */
const int pivot_on = 0; // 1;
if (pivot_on) {
printf("pivot is on : compute P*A = L*U \n");
}
else {
printf("pivot is off: compute A = L*U (not numerically stable)\n");
}
printf("A = (matlab base-1)\n");
printMatrix2(m, m, A, lda, "A");
printf("=====\n");
printf("B = (matlab base-1)\n");
printMatrix2(m, nrhs, B, ldb, "B");
printf("=====\n");
/* step 1: create cusolver handle, bind a stream */
status = cusolverDnCreate(&cusolverH);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
assert(cudaSuccess == cudaStat1);
status = cusolverDnSetStream(cusolverH, stream);
assert(CUSOLVER_STATUS_SUCCESS == status);
/* step 2: copy A to device */
cudaStat1 = cudaMalloc((void**)&d_A, sizeof(double) * lda * m);
cudaStat2 = cudaMalloc((void**)&d_B, sizeof(double) * m * nrhs);
cudaStat3 = cudaMalloc((void**)&d_Ipiv, sizeof(int) * m);
cudaStat4 = cudaMalloc((void**)&d_info, sizeof(int));
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
assert(cudaSuccess == cudaStat4);
cudaStat1 = cudaMemcpy(d_A, A, sizeof(double) * lda * m, cudaMemcpyHostToDevice);
cudaStat2 = cudaMemcpy(d_B, B, sizeof(double) * m * nrhs, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
/* step 3: query working space of getrf */
status = cusolverDnDgetrf_bufferSize(
cusolverH,
m,
m,
d_A,
lda,
&lwork);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaStat1 = cudaMalloc((void**)&d_work, sizeof(double) * lwork);
assert(cudaSuccess == cudaStat1);
/* step 4: LU factorization */
if (pivot_on) {
status = cusolverDnDgetrf(
cusolverH,
m,
m,
d_A,
lda,
d_work,
d_Ipiv,
d_info);
}
else {
status = cusolverDnDgetrf(
cusolverH,
m,
m,
d_A,
lda,
d_work,
NULL,
d_info);
}
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == status);
assert(cudaSuccess == cudaStat1);
if (pivot_on) {
cudaStat1 = cudaMemcpy(Ipiv, d_Ipiv, sizeof(int) * m, cudaMemcpyDeviceToHost);
}
cudaStat2 = cudaMemcpy(LU, d_A, sizeof(double) * lda * m, cudaMemcpyDeviceToHost);
cudaStat3 = cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
if (0 > info) {
printf("%d-th parameter is wrong \n", -info);
exit(1);
}
if (pivot_on) {
printf("pivoting sequence, matlab base-1\n");
for (int j = 0; j < m; j++) {
printf("Ipiv(%d) = %d\n", j + 1, Ipiv[j]);
}
}
printf("L and U = (matlab base-1)\n");
printMatrix2(m, m, LU, lda, "LU");
printf("=====\n");
/*
* step 5: solve A*X = B
*
*/
if (pivot_on) {
status = cusolverDnDgetrs(
cusolverH,
CUBLAS_OP_N,
m,
nrhs, /* nrhs */
d_A,
lda,
d_Ipiv,
d_B,
ldb,
d_info);
}
else {
status = cusolverDnDgetrs(
cusolverH,
CUBLAS_OP_N,
m,
nrhs, /* nrhs */
d_A,
lda,
NULL,
d_B,
ldb,
d_info);
}
cudaStat1 = cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == status);
assert(cudaSuccess == cudaStat1);
cudaStat1 = cudaMemcpy(X, d_B, sizeof(double) * m * nrhs, cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
printf("X = (matlab base-1)\n");
printMatrix2(m, nrhs, X, ldb, "X");
printf("=====\n");
/* free resources */
if (d_A) cudaFree(d_A);
if (d_B) cudaFree(d_B);
if (d_Ipiv) cudaFree(d_Ipiv);
if (d_info) cudaFree(d_info);
if (d_work) cudaFree(d_work);
if (cusolverH) cusolverDnDestroy(cusolverH);
if (stream) cudaStreamDestroy(stream);
cudaDeviceReset();
return 0;
}
void printMatrix2(int m, int n, const double* A, int lda, const char* name)
{
printf("%18s", "");
for (int col = 0; col < n; col++) { printf("%7s(*,%2d) ", name, col + 1); }
printf("\n");
for (int row = 0; row < m; row++) {
printf("%4s(%2d,*) = ", name, row + 1);
for (int col = 0; col < n; col++) {
double Areg = A[row + col * lda];
printf("%20.9f", Areg);
}
printf("\n");
}
return;
}
The correct answer should be:
0.00
-160.00
120.00
But I get:
0.000000000
inf
-inf
I'm developing this on Windows 10 using Visual Studio 2019.
What am I missing or can someone point me to a working sample for the cublasDtbsv routine?
CUBLAS tbsv is a banded triangular solver. It expects your M matrix to be banded and triangular. If you would like to see what that looks like, this is a good reference.
Your M matrix is not triangular. A triangular matrix would either have the upper triangular part (not including main diagonal) or lower triangular part (not including main diagonal) as all zeros. Your M matrix does not fit that definition.
A banded triangular matrix M might look like this:
| 2.0 0.0 0.0 |
M = | 1.0 1.0 0.0 |
| 0.0 1.0 1.0 |
Let's use that as our example, with a RHS of | 2.0 2.0 2.0 |, and use the suggestions given for A matrix formatting here. In that case our A matrix would look like:
A = | 2.0 1.0 1.0 | (the main diagonal of M)
| 1.0 1.0 0.0 | (the first sub-diagonal of M)
In that case our A matrix has 2 rows, and therefore the leading dimension of A is lda = k + 1 = 2.
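If you start from a full column-major matrix, packing it into this banded layout is a small loop. Here is a minimal sketch (packLowerBand is a hypothetical helper, not a cuBLAS routine): row 0 of A holds the main diagonal of M, row d holds the d-th sub-diagonal, zero-padded at the right end:
// Pack the lower band of a full n x n column-major matrix M into the
// (k+1) x n column-major banded storage used by cublasDtbsv with
// CUBLAS_FILL_MODE_LOWER (lda = k + 1).
void packLowerBand(int n, int k, const double* M, double* A, int lda)
{
    for (int j = 0; j < n; j++)          // column of M
        for (int d = 0; d <= k; d++)     // diagonal offset
            A[d + j * lda] = (j + d < n) ? M[(j + d) + j * n] : 0.0;
}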
If we put all of that into your test framework, it seems to give the correct result:
$ cat t158.cu
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
void printMatrix2(int m, int n, const double* A, int lda, const char* name)
{
printf("%18s", "");
for (int col = 0; col < n; col++) { printf("%7s(*,%2d) ", name, col + 1); }
printf("\n");
for (int row = 0; row < m; row++) {
printf("%4s(%2d,*) = ", name, row + 1);
for (int col = 0; col < n; col++) {
double Areg = A[row + col * lda];
printf("%20.9f", Areg);
}
printf("\n");
}
return;
}
int main(int argc, char* argv[])
{
printf("\n**** example of cublasDtbsv \n\n");
const int n = 3;
// const int ldm = n;
const int k = 1;
const int lda = k+1;
const int nrhs = 1;
const int incx = 1;
/*
double M[ldm * n] = { 2.0, 0.0, 0.0
, 1.0, 1.0, 0.0
, 0.0, 1.0, 1.0
};
*/
double A[lda * n] = { 2.0, 1.0, 1.0
, 1.0, 1.0, 0.0
};
double x[n * nrhs] = { 2.0
, 2.0
, 2.0
};
cublasHandle_t cublasHandle = NULL;
cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
double* d_A = NULL; /* device copy of A */
double* d_x = NULL; /* device copy of x */
printf("example of tbsv \n");
printf("A = (matlab base-1)\n");
printMatrix2(n, n, A, lda, "A");
printf("=====\n");
printf("x (b) = (matlab base-1)\n");
printMatrix2(n, nrhs, x, nrhs, "x");
printf("=====\n");
/* step 1: create cublas handle */
cublasStatus = cublasCreate(&cublasHandle);
assert(CUBLAS_STATUS_SUCCESS == cublasStatus);
/* step 2: copy A to device */
cudaStat1 = cudaMalloc((void**)&d_A, sizeof(double) * lda * n);
cudaStat2 = cudaMalloc((void**)&d_x, sizeof(double) * n * nrhs);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
cudaStat1 = cudaMemcpy(d_A, A, sizeof(double) * lda * n, cudaMemcpyHostToDevice);
cudaStat2 = cudaMemcpy(d_x, x, sizeof(double) * n * nrhs, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
/*
* step 5: solve A*x = b
*
*/
cublasStatus = cublasDtbsv(cublasHandle
, CUBLAS_FILL_MODE_LOWER
, CUBLAS_OP_N
, CUBLAS_DIAG_NON_UNIT
, n
, k
, d_A
, lda
, d_x
, incx
);
cudaStat1 = cudaDeviceSynchronize();
assert(CUBLAS_STATUS_SUCCESS == cublasStatus);
cudaStat1 = cudaMemcpy(x, d_x, sizeof(double) * n * nrhs, cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
printf("X = (matlab base-1)\n");
printMatrix2(n, nrhs, x, nrhs, "x");
printf("=====\n");
/* free resources */
if (d_A) cudaFree(d_A);
if (d_x) cudaFree(d_x);
if (cublasHandle) cublasDestroy(cublasHandle);
return 0;
}
$ nvcc -o t158 t158.cu -lcublas
$ ./t158
**** example of cublasDtbsv
example of tbsv
A = (matlab base-1)
A(*, 1) A(*, 2) A(*, 3)
A( 1,*) = 2.000000000 1.000000000 1.000000000
A( 2,*) = 1.000000000 1.000000000 0.000000000
A( 3,*) = 1.000000000 1.000000000 0.000000000
=====
x (b) = (matlab base-1)
x(*, 1)
x( 1,*) = 2.000000000
x( 2,*) = 2.000000000
x( 3,*) = 2.000000000
=====
X = (matlab base-1)
x(*, 1)
x( 1,*) = 1.000000000
x( 2,*) = 1.000000000
x( 3,*) = 1.000000000
=====
$
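As an extra sanity check, a host-side forward substitution that reads A in the same banded layout (a minimal sketch, not part of cuBLAS) reproduces the {1, 1, 1} result for this example:
// Solve the lower-triangular banded system on the host; A is in the
// (k+1) x n banded layout described above, x holds b on input and the
// solution on output.
void tbsvLowerHost(int n, int k, const double* A, int lda, double* x)
{
    for (int i = 0; i < n; i++) {
        double s = x[i];
        for (int d = 1; d <= k && d <= i; d++)
            s -= A[d + (i - d) * lda] * x[i - d]; // M(i, i-d), d-th sub-diagonal
        x[i] = s / A[0 + i * lda];                // M(i, i), main diagonal
    }
}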
I'm trying to do a simple test of the Csrmv_mp function. I have a working program, but I'm getting a wrong result for a specific pair of matrix and vector. If I run the exact same program but with Csrmv, I get the correct result.
Here is my code (it is a reduced version of the example in appendix C of the cusparse library: http://docs.nvidia.com/cuda/cusparse/index.html#csrmv_examples):
/*
* How to compile (assume cuda is installed at /usr/local/cuda/)
* nvcc -c -I/usr/local/cuda/include csrmvmp_example.cpp
* g++ -fopenmp -o csrmvmp_example csrmvmp_example.o -L/usr/local/cuda/lib64 -lcublas -lcusparse -lcudart
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusparse.h>
void printMatrix(int m, int n, const double*A, int lda, const char* name)
{
for (int row = 0; row < m; row++) {
for (int col = 0; col < n; col++) {
double Areg = A[row + col*lda];
printf("%s(%d,%d) = %f\n", name, row + 1, col + 1, Areg);
}
}
}
int main(int argc, char*argv[])
{
cublasHandle_t cublasH = NULL;
cusparseHandle_t cusparseH = NULL;
cudaStream_t stream = NULL;
cusparseMatDescr_t descrA = NULL;
cublasStatus_t cublasStat = CUBLAS_STATUS_SUCCESS;
cusparseStatus_t cusparseStat = CUSPARSE_STATUS_SUCCESS;
cudaError_t cudaStat1 = cudaSuccess;
cudaError_t cudaStat2 = cudaSuccess;
cudaError_t cudaStat3 = cudaSuccess;
cudaError_t cudaStat4 = cudaSuccess;
cudaError_t cudaStat5 = cudaSuccess;
const int n = 3;
const int nnzA = 6;
/*
* | 0 1 2 |
* A = | 1 0 3 |
* | 2 3 0 |
*
* Initial vector
*
* | 1/3 |
* v = | 1/3 |
* | 1/3 |
*
*/
const int csrRowPtrA[n + 1] = { 0, 2, 4, 6 };
const int csrColIndA[nnzA] = { 1, 2, 0, 2, 0, 1 };
const double csrValA[nnzA] = { 1.0, 2.0, 1.0, 3.0, 2.0, 3.0 };
const double x0[n] = { 1.0/3.0, 1.0/3.0, 1.0/3.0 }; /* initial guess */
double x[n]; /* numerical eigenvector */
int *d_csrRowPtrA = NULL;
int *d_csrColIndA = NULL;
double *d_csrValA = NULL;
double *d_x = NULL; /* eigenvector */
double *d_y = NULL; /* workspace */
const double tol = 1.e-6;
const int max_ites = 30;
const double h_one = 1.0;
const double h_zero = 0.0;
/* step 1: create cublas/cusparse handle, bind a stream */
cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
assert(cudaSuccess == cudaStat1);
cublasStat = cublasCreate(&cublasH);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
cublasStat = cublasSetStream(cublasH, stream);
assert(CUBLAS_STATUS_SUCCESS == cublasStat);
cusparseStat = cusparseCreate(&cusparseH);
assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
cusparseStat = cusparseSetStream(cusparseH, stream);
assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
/* step 2: configuration of matrix A */
cusparseStat = cusparseCreateMatDescr(&descrA);
assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
/* step 3: copy A and x0 to device */
cudaStat1 = cudaMalloc((void**)&d_csrRowPtrA, sizeof(int) * (n + 1));
cudaStat2 = cudaMalloc((void**)&d_csrColIndA, sizeof(int) * nnzA);
cudaStat3 = cudaMalloc((void**)&d_csrValA, sizeof(double) * nnzA);
cudaStat4 = cudaMalloc((void**)&d_x, sizeof(double) * n);
cudaStat5 = cudaMalloc((void**)&d_y, sizeof(double) * n);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
assert(cudaSuccess == cudaStat4);
assert(cudaSuccess == cudaStat5);
cudaStat1 = cudaMemcpy(d_csrRowPtrA, csrRowPtrA, sizeof(int) * (n + 1),
cudaMemcpyHostToDevice);
cudaStat2 = cudaMemcpy(d_csrColIndA, csrColIndA, sizeof(int) * nnzA,
cudaMemcpyHostToDevice);
cudaStat3 = cudaMemcpy(d_csrValA, csrValA, sizeof(double) * nnzA,
cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
assert(cudaSuccess == cudaStat2);
assert(cudaSuccess == cudaStat3);
/*
* 4.1: initial guess x0
*/
cudaStat1 = cudaMemcpy(d_x, x0, sizeof(double) * n, cudaMemcpyHostToDevice);
assert(cudaSuccess == cudaStat1);
/*
* 4.3: y = A*x
*/
cusparseStat = cusparseDcsrmv_mp(cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE, n, n, nnzA, &h_one, descrA, d_csrValA, d_csrRowPtrA, d_csrColIndA, d_x, &h_zero, d_y);
assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
/*
* step 5: report result
*/
cudaStat1 = cudaMemcpy(x, d_y, sizeof(double) * n, cudaMemcpyDeviceToHost);
assert(cudaSuccess == cudaStat1);
printf("vector = \n");
printMatrix(n, 1, x, n, "V0");
printf("=====\n");
/* free resources */
if (d_csrRowPtrA) cudaFree(d_csrRowPtrA);
if (d_csrColIndA) cudaFree(d_csrColIndA);
if (d_csrValA) cudaFree(d_csrValA);
if (d_x) cudaFree(d_x);
if (d_y) cudaFree(d_y);
if (cublasH) cublasDestroy(cublasH);
if (cusparseH) cusparseDestroy(cusparseH);
if (stream) cudaStreamDestroy(stream);
if (descrA) cusparseDestroyMatDescr(descrA);
cudaDeviceReset();
return 0;
}
The resulting vector is {1, 1, 1}, but doing the calculation by hand or with the Csrmv function I get {1, 4/3, 5/3}.
I really don't understand why I have this problem. The only thing I can think of is that I wrote the matrix incorrectly in its CSR format. Also, I don't use CUSPARSE_MATRIX_TYPE_SYMMETRIC because the function doesn't accept this type of matrix (the documentation is wrong).
If someone could help me I would appreciate it very much.
Edit:
I'm using CUDA 9.0, my OS is Windows 10 Home, and my GPU is a GTX 960m.
I just updated to CUDA 9.1 and, as Robert Crovella said, the bug has been solved.
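For anyone who wants to double-check the expected result, a plain host-side CSR matrix-vector product (a minimal sketch, independent of cuSPARSE) confirms that the CSR encoding above is correct and yields {1, 4/3, 5/3}:
// y = A*x with A in CSR format, using the same arrays as in the program above
const int n = 3;
const int csrRowPtrA[4] = { 0, 2, 4, 6 };
const int csrColIndA[6] = { 1, 2, 0, 2, 0, 1 };
const double csrValA[6] = { 1.0, 2.0, 1.0, 3.0, 2.0, 3.0 };
const double x[3] = { 1.0/3.0, 1.0/3.0, 1.0/3.0 };
double y[3];
for (int row = 0; row < n; row++) {
    double sum = 0.0;
    for (int idx = csrRowPtrA[row]; idx < csrRowPtrA[row + 1]; idx++)
        sum += csrValA[idx] * x[csrColIndA[idx]];
    y[row] = sum; // y = { 1.0, 1.333333..., 1.666666... }
}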
I want to calculate the distance between every two points and decide whether they are neighbours. Here is my simple code in CUDA:
__global__ void calcNeighbors(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
float dis = 0.0f;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=0; i<N; i++) {
DataPoint p2 = points[i];
dis = 0;
dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
(p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
(p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[tid*N+i] = true;
} else {
neighbors[tid*N+i] = false;
}
}
tid += blockDim.x * gridDim.x;
}
}
DataPoint is a struct:
typedef struct DataPoint {
float pfDimens[3];
} DataPoint;
So here I want to reduce the time. How can I do that? I have tried memory coalescing and shared memory, but I didn't get a good speedup.
=============== using shared memory ===============
__global__ void calcNeighbors2(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
__shared__ DataPoint sharedpoints[threadsPerBlock];
int start = blockIdx.x * blockDim.x;
int len = start+threadIdx.x;
if (len < N) {
sharedpoints[threadIdx.x] = points[len];
}
len = imin(N, blockDim.x + start);
__syncthreads();
int tid = threadIdx.x;
float dis;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=start; i<len; i++) {
dis = 0;
dis += (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) * (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) +
(p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) * (p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) +
(p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]) * (p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[i*N+tid] = true;
} else {
neighbors[i*N+tid] = false;
}
}
tid += blockDim.x;
}
}
Here I changed neighbors[tid*N+i] to neighbors[i*N+tid], which gives me almost an 8x speedup on a Tesla K10.G2.8GB. But when I use shared memory to store some points, it doesn't help. Why?
There are at least 4 ideas, some of which have already been stated in the comments:
Transform your point distance storage from AoS format:
struct DataPoint {
float pfDimens[3];
};
to SoA format:
struct DataPoint {
float pfDimens_x[NPTS];
float pfDimens_y[NPTS];
float pfDimens_z[NPTS];
};
this will enable full coalescing on loading of the data. In fact, to help with point 4 below, I would just switch to using 3 bare arrays, rather than a structure.
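For example, the conversion from the AoS layout to three bare arrays could look like this on the host (a sketch, assuming points is the host AoS array and hx, hy, hz are allocations of N floats):
for (int i = 0; i < N; i++) {
    hx[i] = points[i].pfDimens[0];
    hy[i] = points[i].pfDimens[1];
    hz[i] = points[i].pfDimens[2];
}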
reduce the computation to (slightly less than) half:
for (int i=N-1; i>tid; i--) {
then, either in the thread code itself or on the host, you can populate the other "half" of the output matrix by copying data, as in the sketch below.
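A host-side sketch of that copy, for the storage order used in your original kernel (neighbors[tid*N+i]):
// the kernel computed entries [i*N+j] for j > i only; the neighbour
// relation is symmetric, so mirror the results and set the diagonal
// (each point is within distance 0 of itself)
for (int i = 0; i < N; i++) {
    neighbors[i*N + i] = true;
    for (int j = i + 1; j < N; j++)
        neighbors[j*N + i] = neighbors[i*N + j];
}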
Transpose the storage in your output matrix, so that you can write a storage operation like this:
neighbors[i*N+tid] = true;
which will nicely coalesce, as opposed to this:
neighbors[tid*N+i] = true;
which will not.
Since your input point data is read only, mark the kernel parameter appropriately:
const float * __restrict__ points_x, const float * __restrict__ points_y, const float * __restrict__ points_z
in some cases, and on some GPUs, this will often lead to a speed-up due to use of the read-only cache. If you really want to get aggressive with caching, and your data array is small enough (4K or less float points), you could put a copy of the point data in global memory as well as a copy in __constant__ memory, and load the "uniform" load you are doing here through constant memory:
DataPoint p2 = c_points[i];
thus you could perform the coalesced load through the read-only cache, the uniform load through the constant cache, and the coalesced store going to ordinary global memory.
On a K40c, on linux/CUDA 7, for N = 4096, the net effect of these changes appears to be about a 3.5x speedup, at the kernel level:
$ cat t749.cu
#include <stdio.h>
#define N 4096
// if N is 16K/3 or less, we can use constant
#define USE_CONSTANT
#define THRESH 0.2f
#define nTPB 256
#define nBLK (N/nTPB+1)
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
struct DataPoint {
float pfDimens[3];
};
__global__ void calcNeighbors(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
float dis = 0.0f;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=0; i<N; i++) {
DataPoint p2 = points[i];
dis = 0;
dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
(p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
(p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[tid*N+i] = true;
} else {
neighbors[tid*N+i] = false;
}
}
tid += blockDim.x * gridDim.x;
}
}
#ifdef USE_CONSTANT
__constant__ float cpx[N];
__constant__ float cpy[N];
__constant__ float cpz[N];
#endif
__global__ void calcNeighbors2(const float * __restrict__ pts_x, const float * __restrict__ pts_y, const float * __restrict__ pts_z, const float doubleRadius, bool * __restrict__ neighbors) {
int tid = threadIdx.x+blockDim.x*blockIdx.x;
while (tid < N) {
float p1x = pts_x[tid];
float p1y = pts_y[tid];
float p1z = pts_z[tid];
for (int i = N-1; i > tid; i--){
float p2x, p2y, p2z;
#ifdef USE_CONSTANT
p2x = cpx[i];
p2y = cpy[i];
p2z = cpz[i];
#else
p2x = pts_x[i];
p2y = pts_y[i];
p2z = pts_z[i];
#endif
float dis = ((p1x-p2x)*(p1x-p2x)) + ((p1y-p2y)*(p1y-p2y)) + ((p1z-p2z)*(p1z-p2z));
neighbors[i*N+tid] = (dis <= doubleRadius);
}
tid += blockDim.x * gridDim.x;
}
}
int main(){
float *dx, *dy, *dz, *hx, *hy, *hz;
DataPoint *dp, *hp;
bool *dn, *hn1, *hn2;
hx =(float *)malloc(N*sizeof(float));
hy =(float *)malloc(N*sizeof(float));
hz =(float *)malloc(N*sizeof(float));
hp =(DataPoint *)malloc(N*sizeof(DataPoint));
hn1=(bool *)malloc(N*N*sizeof(bool));
hn2=(bool *)malloc(N*N*sizeof(bool));
cudaMalloc(&dx, N*sizeof(float));
cudaMalloc(&dy, N*sizeof(float));
cudaMalloc(&dz, N*sizeof(float));
cudaMalloc(&dp, N*sizeof(DataPoint));
cudaMalloc(&dn, N*N*sizeof(bool));
for (int i =0; i < N; i++){
hx[i] = rand()/(float)RAND_MAX;
hy[i] = rand()/(float)RAND_MAX;
hz[i] = rand()/(float)RAND_MAX;
hp[i].pfDimens[0] = hx[i];
hp[i].pfDimens[1] = hy[i];
hp[i].pfDimens[2] = hz[i];}
cudaMemcpy(dx, hx, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dy, hy, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dz, hz, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dp, hp, N*sizeof(DataPoint), cudaMemcpyHostToDevice);
#ifdef USE_CONSTANT
// copy the point data into the __constant__ arrays read by calcNeighbors2
cudaMemcpyToSymbol(cpx, hx, N*sizeof(float));
cudaMemcpyToSymbol(cpy, hy, N*sizeof(float));
cudaMemcpyToSymbol(cpz, hz, N*sizeof(float));
#endif
// warm-up
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t1 = dtime_usec(0);
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 1 error");
t1 = dtime_usec(t1);
cudaMemcpy(hn1, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
// warm-up
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t2 = dtime_usec(0);
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 2 error");
t2 = dtime_usec(t2);
cudaMemcpy(hn2, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
cudaCheckErrors("some error");
printf("t1: %fs, t2: %fs\n", t1/(float)USECPSEC, t2/(float)USECPSEC);
// results validation
for (int i = 0; i < N; i++)
for (int j = i+1; j < N; j++)
if (hn1[i*N+j] != hn2[j*N+i]) {printf("mismatch at %d, %d, was: %d, should be: %d\n", i, j, hn2[j*N+i], hn1[i*N+j]); return 1;}
return 0;
}
$ nvcc -arch=sm_35 -o t749 t749.cu
$ ./t749
t1: 0.004903s, t2: 0.001395s
$
In the case of K40c, the limited number of blocks being launched above (16) is a significant impediment to performance, due to latency. If we comment out the USE_CONSTANT define, and change N to 16384, we observe an even higher speedup with the improved kernel:
$ ./t749
t1: 0.267107s, t2: 0.008209s
$
the resultant ~48 blocks being enough to approximately "fill" the K40c which has 15 SMs.
EDIT: now that you've posted a shared memory kernel, I added it to my test case as calcNeighbors3 and compared its timing performance (as t3). It is almost as fast as my kernel, and it seems to provide the correct result (it matches your original kernel), so I'm not sure what your concerns are.
Here's the updated code and test case:
$ cat t749.cu
#include <stdio.h>
#include <math.h>
#define imin(X,Y) ((X)<(Y))?(X):(Y)
#define N 32768
// if N is 16K/3 or less, we can use constant
// #define USE_CONSTANT
#define THRESH 0.2f
#define nTPB 256
#define nBLK (N/nTPB+1)
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
struct DataPoint {
float pfDimens[3];
};
__global__ void calcNeighbors(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
float dis = 0.0f;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=0; i<N; i++) {
DataPoint p2 = points[i];
dis = 0;
dis += (p1.pfDimens[0]-p2.pfDimens[0]) * (p1.pfDimens[0]-p2.pfDimens[0]) +
(p1.pfDimens[1]-p2.pfDimens[1]) * (p1.pfDimens[1]-p2.pfDimens[1]) +
(p1.pfDimens[2]-p2.pfDimens[2]) * (p1.pfDimens[2]-p2.pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[tid*N+i] = true;
} else {
neighbors[tid*N+i] = false;
}
}
tid += blockDim.x * gridDim.x;
}
}
#ifdef USE_CONSTANT
__constant__ float cpx[N];
__constant__ float cpy[N];
__constant__ float cpz[N];
#endif
__global__ void calcNeighbors2(const float * __restrict__ pts_x, const float * __restrict__ pts_y, const float * __restrict__ pts_z, const float doubleRadius, bool * __restrict__ neighbors) {
int tid = threadIdx.x+blockDim.x*blockIdx.x;
while (tid < N) {
float p1x = pts_x[tid];
float p1y = pts_y[tid];
float p1z = pts_z[tid];
for (int i = N-1; i > tid; i--){
float p2x, p2y, p2z;
#ifdef USE_CONSTANT
p2x = cpx[i];
p2y = cpy[i];
p2z = cpz[i];
#else
p2x = pts_x[i];
p2y = pts_y[i];
p2z = pts_z[i];
#endif
float dis = ((p1x-p2x)*(p1x-p2x)) + ((p1y-p2y)*(p1y-p2y)) + ((p1z-p2z)*(p1z-p2z));
neighbors[i*N+tid] = (dis <= doubleRadius);
}
tid += blockDim.x * gridDim.x;
}
}
__global__ void calcNeighbors3(const DataPoint* points,
const float doubleRadius, bool* neighbors) {
__shared__ DataPoint sharedpoints[nTPB];
int start = blockIdx.x * blockDim.x;
int len = start+threadIdx.x;
if (len < N) {
sharedpoints[threadIdx.x] = points[len];
}
len = imin(N, blockDim.x + start);
__syncthreads();
int tid = threadIdx.x;
float dis;
while (tid < N) {
DataPoint p1 = points[tid];
for (int i=start; i<len; i++) {
dis = 0;
dis += (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) * (p1.pfDimens[0]-sharedpoints[i-start].pfDimens[0]) +
(p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) * (p1.pfDimens[1]-sharedpoints[i-start].pfDimens[1]) +
(p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]) * (p1.pfDimens[2]-sharedpoints[i-start].pfDimens[2]);
if (dis <= doubleRadius) {
neighbors[i*N+tid] = true;
} else {
neighbors[i*N+tid] = false;
}
}
tid += blockDim.x;
}
}
int main(){
float *dx, *dy, *dz, *hx, *hy, *hz;
DataPoint *dp, *hp;
bool *dn, *hn1, *hn2, *hn3;
hx =(float *)malloc(N*sizeof(float));
hy =(float *)malloc(N*sizeof(float));
hz =(float *)malloc(N*sizeof(float));
hp =(DataPoint *)malloc(N*sizeof(DataPoint));
hn1=(bool *)malloc(N*N*sizeof(bool));
hn2=(bool *)malloc(N*N*sizeof(bool));
hn3=(bool *)malloc(N*N*sizeof(bool));
cudaMalloc(&dx, N*sizeof(float));
cudaMalloc(&dy, N*sizeof(float));
cudaMalloc(&dz, N*sizeof(float));
cudaMalloc(&dp, N*sizeof(DataPoint));
cudaMalloc(&dn, N*N*sizeof(bool));
for (int i =0; i < N; i++){
hx[i] = rand()/(float)RAND_MAX;
hy[i] = rand()/(float)RAND_MAX;
hz[i] = rand()/(float)RAND_MAX;
hp[i].pfDimens[0] = hx[i];
hp[i].pfDimens[1] = hy[i];
hp[i].pfDimens[2] = hz[i];}
cudaMemcpy(dx, hx, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dy, hy, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dz, hz, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dp, hp, N*sizeof(DataPoint), cudaMemcpyHostToDevice);
#ifdef USE_CONSTANT
cudaMemcpyToSymbol(cpx, hx, N*sizeof(float));
cudaMemcpyToSymbol(cpy, hy, N*sizeof(float));
cudaMemcpyToSymbol(cpz, hz, N*sizeof(float));
#endif
// warm-up
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t1 = dtime_usec(0);
calcNeighbors<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 1 error");
t1 = dtime_usec(t1);
cudaMemcpy(hn1, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
// warm-up
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t2 = dtime_usec(0);
calcNeighbors2<<<nBLK, nTPB>>>(dx, dy, dz, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 2 error");
t2 = dtime_usec(t2);
cudaMemcpy(hn2, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
// warm-up
calcNeighbors3<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaMemset(dn, 0, N*N*sizeof(bool));
unsigned long long t3 = dtime_usec(0);
calcNeighbors3<<<nBLK, nTPB>>>(dp, THRESH, dn);
cudaDeviceSynchronize();
cudaCheckErrors("kernel 3 error");
t3 = dtime_usec(t3);
cudaMemcpy(hn3, dn, N*N*sizeof(bool), cudaMemcpyDeviceToHost);
cudaCheckErrors("some error");
printf("t1: %fs, t2: %fs, t3: %fs\n", t1/(float)USECPSEC, t2/(float)USECPSEC, t3/(float)USECPSEC);
// results validation
for (int i = 0; i < N; i++)
for (int j = i+1; j < N; j++)
if (hn1[i*N+j] != hn2[j*N+i]) {printf("1:2 mismatch at %d, %d, was: %d, should be: %d\n", i, j, hn2[j*N+i], hn1[i*N+j]); return 1;}
for (int i = 0; i < N*N; i++)
if (hn1[i] != hn3[i]) {printf("1:3 mismatch at %d, was: %d, should be: %d\n", i, hn1[i], hn3[i]); return 1;}
return 0;
}
$ nvcc -arch=sm_35 -o t749 t749.cu
$ ./t749
t1: 1.260010s, t2: 0.022661s, t3: 0.029632s
$
For this test, I have changed the data set size to 32768 since that is closer to the range you care about. Your shared memory kernel shows about a 42x speedup over your original kernel, and my kernel shows about a 55x speedup, on my K40c.
I'm new to parallel programming on GPUs, so I apologize if the question is broad or vague. I'm aware there are parallel SVD functions in the CULA library, but what should the strategy be if I have a large number of relatively small matrices to factorize? For example, I have n matrices of dimension d, where n is large and d is small. How do I parallelize this process? Could anyone give me a hint?
My previous answer is now out-of-date. As of February 2015, CUDA 7 (currently in release candidate version) offers full SVD capabilities in its cuSOLVER library. Below, I'm providing an example of generating the singular value decomposition using CUDA cuSOLVER.
Concerning the specific issue you are raising (calculating the SVD of several matrices of small size), you should adapt the example I'm providing below by using streams. To associate a stream with each task you can use
cudaStreamCreate()
and
cusolverDnSetStream()
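A minimal sketch of that association (an assumed workflow, not code from the original example): create one handle and one stream per task, so the small SVDs can overlap:
const int numTasks = 4; // hypothetical number of small matrices
cudaStream_t streams[numTasks];
cusolverDnHandle_t handles[numTasks];
for (int i = 0; i < numTasks; i++) {
    cudaStreamCreate(&streams[i]);
    cusolverDnCreate(&handles[i]);
    cusolverDnSetStream(handles[i], streams[i]);
    // ... enqueue the i-th cusolverDnDgesvd call on handles[i] ...
}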
kernel.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include<iostream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>
#include<math.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>
#include "Utilities.cuh"
/********/
/* MAIN */
/********/
int main(){
// --- gesvd only supports Nrows >= Ncols
// --- column major memory ordering
const int Nrows = 7;
const int Ncols = 5;
// --- cuSOLVE input/output parameters/arrays
int work_size = 0;
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle;
cusolverDnCreate(&solver_handle);
// --- Setting the host, Nrows x Ncols matrix
double *h_A = (double *)malloc(Nrows * Ncols * sizeof(double));
for(int j = 0; j < Nrows; j++)
for(int i = 0; i < Ncols; i++)
h_A[j + i*Nrows] = (i + j*j) * sqrt((double)(i + j));
// --- Setting the device matrix and moving the host matrix to the device
double *d_A; gpuErrchk(cudaMalloc(&d_A, Nrows * Ncols * sizeof(double)));
gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(double), cudaMemcpyHostToDevice));
// --- host side SVD results space
double *h_U = (double *)malloc(Nrows * Nrows * sizeof(double));
double *h_V = (double *)malloc(Ncols * Ncols * sizeof(double));
double *h_S = (double *)malloc(min(Nrows, Ncols) * sizeof(double));
// --- device side SVD workspace and matrices
double *d_U; gpuErrchk(cudaMalloc(&d_U, Nrows * Nrows * sizeof(double)));
double *d_V; gpuErrchk(cudaMalloc(&d_V, Ncols * Ncols * sizeof(double)));
double *d_S; gpuErrchk(cudaMalloc(&d_S, min(Nrows, Ncols) * sizeof(double)));
// --- CUDA SVD initialization
cusolveSafeCall(cusolverDnDgesvd_bufferSize(solver_handle, Nrows, Ncols, &work_size));
double *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));
// --- CUDA SVD execution
cusolveSafeCall(cusolverDnDgesvd(solver_handle, 'A', 'A', Nrows, Ncols, d_A, Nrows, d_S, d_U, Nrows, d_V, Ncols, work, work_size, NULL, devInfo));
int devInfo_h = 0; gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
if (devInfo_h != 0) std::cout << "Unsuccessful SVD execution\n\n";
// --- Moving the results from device to host
gpuErrchk(cudaMemcpy(h_S, d_S, min(Nrows, Ncols) * sizeof(double), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_U, d_U, Nrows * Nrows * sizeof(double), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_V, d_V, Ncols * Ncols * sizeof(double), cudaMemcpyDeviceToHost));
std::cout << "Singular values\n";
for(int i = 0; i < min(Nrows, Ncols); i++)
std::cout << "d_S["<<i<<"] = " << std::setprecision(15) << h_S[i] << std::endl;
std::cout << "\nLeft singular vectors - For y = A * x, the columns of U span the space of y\n";
for(int j = 0; j < Nrows; j++) {
printf("\n");
for(int i = 0; i < Nrows; i++)
printf("U[%i,%i]=%f\n",i,j,h_U[j*Nrows + i]);
}
std::cout << "\nRight singular vectors - For y = A * x, the columns of V span the space of x\n";
for(int i = 0; i < Ncols; i++) {
printf("\n");
for(int j = 0; j < Ncols; j++)
printf("V[%i,%i]=%f\n",i,j,h_V[j*Ncols + i]);
}
cusolverDnDestroy(solver_handle);
return 0;
}
Utilities.cuh
#ifndef UTILITIES_CUH
#define UTILITIES_CUH
extern "C" int iDivUp(int, int);
extern "C" void gpuErrchk(cudaError_t);
extern "C" void cusolveSafeCall(cusolverStatus_t);
#endif
Utilities.cu
#include <stdio.h>
#include <assert.h>
#include "cuda_runtime.h"
#include <cuda.h>
#include <cusolverDn.h>
/*******************/
/* iDivUp FUNCTION */
/*******************/
extern "C" int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
static const char *_cudaGetErrorEnum(cusolverStatus_t error)
{
switch (error)
{
case CUSOLVER_STATUS_SUCCESS:
return "CUSOLVER_SUCCESS";
case CUSOLVER_STATUS_NOT_INITIALIZED:
return "CUSOLVER_STATUS_NOT_INITIALIZED";
case CUSOLVER_STATUS_ALLOC_FAILED:
return "CUSOLVER_STATUS_ALLOC_FAILED";
case CUSOLVER_STATUS_INVALID_VALUE:
return "CUSOLVER_STATUS_INVALID_VALUE";
case CUSOLVER_STATUS_ARCH_MISMATCH:
return "CUSOLVER_STATUS_ARCH_MISMATCH";
case CUSOLVER_STATUS_EXECUTION_FAILED:
return "CUSOLVER_STATUS_EXECUTION_FAILED";
case CUSOLVER_STATUS_INTERNAL_ERROR:
return "CUSOLVER_STATUS_INTERNAL_ERROR";
case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
}
return "<unknown>";
}
inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
if(CUSOLVER_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSOLVE error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }
You can take a look at the Batched Operations post of the CULA blog for a discussion of your problem.
EDIT
From what I understand from your comment below, you would like each thread to calculate a separate SVD. So, basically each thread should execute a standard, sequential SVD scheme. For that some possibly useful references:
Numerical Recipes
Golub, Van Loan, Matrix Computations
If you use this approach, though, I'm afraid you will no longer be able to use cuBLAS, as those are host functions not callable from the device (unless you have a compute capability >= 3.5; see the simpleDevLibCUBLAS example). But basically, in this way, I think you are somehow implementing the batch concept by yourself.
If you decide to go to a more standard parallel GPU implementation, the reference below could be of interest:
Singular Value Decomposition on GPU using CUDA
The above answers are now out of date. As of CUDA 9.0, the cuSOLVER library has been equipped with a batched SVD calculation based on the Jacobi method. Below, a fully worked example:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
//#define FULLSVD
//#define PRINTRESULTS
/********/
/* MAIN */
/********/
int main() {
const int M = 3;
const int N = 3;
const int lda = M;
//const int numMatrices = 3;
const int numMatrices = 16384;
TimingGPU timerGPU;
// --- Setting the host matrix
double *h_A = (double *)malloc(lda * N * numMatrices * sizeof(double));
for (unsigned int k = 0; k < numMatrices; k++)
for (unsigned int i = 0; i < M; i++){
for (unsigned int j = 0; j < N; j++){
h_A[k * M * N + j * M + i] = (1. / (k + 1)) * (i + j * j) * (i + j);
//printf("%d %d %f\n", i, j, h_A[j*M + i]);
}
}
// --- Setting the device matrix and moving the host matrix to the device
double *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * numMatrices * sizeof(double)));
gpuErrchk(cudaMemcpy(d_A, h_A, M * N * numMatrices * sizeof(double), cudaMemcpyHostToDevice));
// --- host side SVD results space
double *h_S = (double *)malloc(N * numMatrices * sizeof(double));
double *h_U = NULL;
double *h_V = NULL;
#ifdef FULLSVD
h_U = (double *)malloc(M * M * numMatrices * sizeof(double));
h_V = (double *)malloc(N * N * numMatrices * sizeof(double));
#endif
// --- device side SVD workspace and matrices
int work_size = 0;
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
double *d_S; gpuErrchk(cudaMalloc(&d_S, N * numMatrices * sizeof(double)));
double *d_U = NULL;
double *d_V = NULL;
#ifdef FULLSVD
gpuErrchk(cudaMalloc(&d_U, M * M * numMatrices * sizeof(double)));
gpuErrchk(cudaMalloc(&d_V, N * N * numMatrices * sizeof(double)));
#endif
double *d_work = NULL; /* device workspace for gesvdj */
int devInfo_h = 0; /* host copy of devInfo */
// --- Parameters configuration of Jacobi-based SVD
const double tol = 1.e-7;
const int maxSweeps = 15;
cusolverEigMode_t jobz; // --- CUSOLVER_EIG_MODE_VECTOR - Compute eigenvectors; CUSOLVER_EIG_MODE_NOVECTOR - Compute singular values only
#ifdef FULLSVD
jobz = CUSOLVER_EIG_MODE_VECTOR;
#else
jobz = CUSOLVER_EIG_MODE_NOVECTOR;
#endif
const int econ = 0; // --- econ = 1 for economy size
// --- Numerical result parameters of gesvdj
double residual = 0;
int executedSweeps = 0;
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle = NULL;
cusolveSafeCall(cusolverDnCreate(&solver_handle));
// --- Configuration of gesvdj
gesvdjInfo_t gesvdj_params = NULL;
cusolveSafeCall(cusolverDnCreateGesvdjInfo(&gesvdj_params));
// --- Set the computation tolerance, since the default tolerance is machine precision
cusolveSafeCall(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol));
// --- Set the maximum number of sweeps, since the default value of max. sweeps is 100
cusolveSafeCall(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, maxSweeps));
// --- Query the SVD workspace
cusolveSafeCall(cusolverDnDgesvdjBatched_bufferSize(
solver_handle,
jobz, // --- Compute the singular vectors or not
M, // --- Number of rows of A, 0 <= M
N, // --- Number of columns of A, 0 <= N
d_A, // --- M x N
lda, // --- Leading dimension of A
d_S, // --- Square matrix of size min(M, N) x min(M, N)
d_U, // --- M x M if econ = 0, M x min(M, N) if econ = 1
lda, // --- Leading dimension of U, ldu >= max(1, M)
d_V, // --- N x N if econ = 0, N x min(M,N) if econ = 1
lda, // --- Leading dimension of V, ldv >= max(1, N)
&work_size,
gesvdj_params,
numMatrices));
gpuErrchk(cudaMalloc(&d_work, sizeof(double) * work_size));
// --- Compute SVD
timerGPU.StartCounter();
cusolveSafeCall(cusolverDnDgesvdjBatched(
solver_handle,
jobz, // --- Compute the singular vectors or not
M, // --- Number of rows of A, 0 <= M
N, // --- Number of columns of A, 0 <= N
d_A, // --- M x N
lda, // --- Leading dimension of A
d_S, // --- Square matrix of size min(M, N) x min(M, N)
d_U, // --- M x M if econ = 0, M x min(M, N) if econ = 1
lda, // --- Leading dimension of U, ldu >= max(1, M)
d_V, // --- N x N if econ = 0, N x min(M, N) if econ = 1
lda, // --- Leading dimension of V, ldv >= max(1, N)
d_work,
work_size,
devInfo,
gesvdj_params,
numMatrices));
printf("Calculation of the singular values only: %f ms\n\n", timerGPU.GetCounter());
gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_S, d_S, sizeof(double) * N * numMatrices, cudaMemcpyDeviceToHost));
#ifdef FULLSVD
gpuErrchk(cudaMemcpy(h_U, d_U, sizeof(double) * lda * M * numMatrices, cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_V, d_V, sizeof(double) * lda * N * numMatrices, cudaMemcpyDeviceToHost));
#endif
#ifdef PRINTRESULTS
printf("SINGULAR VALUES \n");
printf("_______________ \n");
for (int k = 0; k < numMatrices; k++) {
for (int p = 0; p < N; p++)
printf("Matrix nr. %d; SV nr. %d; Value = %f\n", k, p, h_S[k * N + p]);
printf("\n");
}
#ifdef FULLSVD
printf("SINGULAR VECTORS U \n");
printf("__________________ \n");
for (int k = 0; k < numMatrices; k++) {
for (int q = 0; q < (1 - econ) * M + econ * min(M, N); q++)
for (int p = 0; p < M; p++)
printf("Matrix nr. %d; U nr. %d; Value = %f\n", k, p, h_U[((1 - econ) * M + econ * min(M, N)) * M * k + q * M + p]);
printf("\n");
}
printf("SINGULAR VECTORS V \n");
printf("__________________ \n");
for (int k = 0; k < numMatrices; k++) {
for (int q = 0; q < (1 - econ) * N + econ * min(M, N); q++)
for (int p = 0; p < N; p++)
printf("Matrix nr. %d; V nr. %d; Value = %f\n", k, p, h_V[((1 - econ) * N + econ * min(M, N)) * N * k + q * N + p]);
printf("\n");
}
#endif
#endif
if (0 == devInfo_h){
printf("gesvdj converges \n");
}
else if (0 > devInfo_h){
printf("%d-th parameter is wrong \n", -devInfo_h);
exit(1);
}
else{
printf("WARNING: devInfo_h = %d : gesvdj does not converge \n", devInfo_h);
}
// --- Free resources
if (d_A) gpuErrchk(cudaFree(d_A));
if (d_S) gpuErrchk(cudaFree(d_S));
#ifdef FULLSVD
if (d_U) gpuErrchk(cudaFree(d_U));
if (d_V) gpuErrchk(cudaFree(d_V));
#endif
if (devInfo) gpuErrchk(cudaFree(devInfo));
if (d_work) gpuErrchk(cudaFree(d_work));
if (solver_handle) cusolveSafeCall(cusolverDnDestroy(solver_handle));
if (gesvdj_params) cusolveSafeCall(cusolverDnDestroyGesvdjInfo(gesvdj_params));
gpuErrchk(cudaDeviceReset());
return 0;
}