Singular values calculation only with CUDA - cuda

I'm trying to use the new cusolverDnSgesvd routine of CUDA 7.0 for the calculation of the singular values. The full code is reported below:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<iostream>
#include<stdlib.h>
#include<stdio.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>
/***********************/
/* CUDA ERROR CHECKING */
/***********************/
// Reports a failed CUDA runtime call on stderr and optionally terminates.
//   code  - status returned by the CUDA runtime call being checked
//   file  - source file name printed in the diagnostic
//   line  - source line number printed in the diagnostic
//   abort - when true (the default) the process exits with `code` as status
// `file` is a pointer to const so that string literals such as __FILE__ can be
// passed without an invalid string-literal-to-char* conversion.
void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}
// Checks the status of a CUDA runtime call by forwarding it to gpuAssert.
// NOTE: because this is a function and not a macro, __FILE__ and __LINE__ are
// expanded here, so every diagnostic reports this line rather than the line of
// the failing call.
void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
// Computes the singular values of a small M x N single-precision matrix with
// cusolverDnSgesvd, then prints the solver status, devInfo and the values.
int main(){
    const int M = 10;
    const int N = 10;

    // --- Setting the host matrix (element (i, j) is stored at j*M + i)
    float *h_A = (float *)malloc(M * N * sizeof(float));
    for (int i = 0; i < M; i++){
        for (int j = 0; j < N; j++){
            h_A[j*M + i] = (i + j) * (i + j);
        }
    }

    // --- Setting the device matrix and moving the host matrix to the device
    float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

    // --- host side SVD results space (only the singular values are read back)
    float *h_S = (float *)malloc(N * sizeof(float));

    // --- device side SVD workspace and matrices
    int work_size = 0;
    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
    float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
    float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));
    cusolverStatus_t stat;

    // --- CUDA solver initialization (a failed handle creation makes every later call meaningless)
    cusolverDnHandle_t solver_handle;
    stat = cusolverDnCreate(&solver_handle);
    if (stat != CUSOLVER_STATUS_SUCCESS) {
        std::cout << "Library cuSolver not initialized correctly\n";
        return 1;
    }
    stat = cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size);
    if (stat != CUSOLVER_STATUS_SUCCESS) std::cout << "Initialization of cuSolver failed. \n";
    float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));
    //float *rwork; gpuErrchk(cudaMalloc(&rwork, work_size * sizeof(float)));

    // --- CUDA SVD execution
    //stat = cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
    stat = cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
    gpuErrchk(cudaDeviceSynchronize());

    int devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    std::cout << "devInfo = " << devInfo_h << "\n";
    switch(stat){
        case CUSOLVER_STATUS_SUCCESS: std::cout << "SVD computation success\n"; break;
        case CUSOLVER_STATUS_NOT_INITIALIZED: std::cout << "Library cuSolver not initialized correctly\n"; break;
        case CUSOLVER_STATUS_INVALID_VALUE: std::cout << "Invalid parameters passed\n"; break;
        case CUSOLVER_STATUS_INTERNAL_ERROR: std::cout << "Internal operation failed\n"; break;
        default: std::cout << "cuSolver returned status " << (int)stat << "\n"; break;
    }
    if (devInfo_h == 0 && stat == CUSOLVER_STATUS_SUCCESS) std::cout << "SVD successful\n\n";

    // --- Moving the results from device to host
    gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));
    for (int i = 0; i < N; i++) std::cout << "d_S["<<i<<"] = " << h_S[i] << std::endl;

    // --- Releasing the solver handle, the device buffers and the host buffers
    cusolverDnDestroy(solver_handle);
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(devInfo));
    gpuErrchk(cudaFree(d_A));
    free(h_S);
    free(h_A);
    return 0;
}
If I ask for the computation of the full SVD (commented line with jobu = 'A' and jobvt = 'A') everything works fine. If I ask for the computation of the singular values only (line with jobu = 'N' and jobvt = 'N'), cusolverDnSgesvd returns
CUSOLVER_STATUS_INVALID_VALUE
Please note that, in this case devInfo = 0, so I cannot spot the invalid parameter.
Please also note that the documentation PDF lacks information about the rwork parameter so that I have dealt with it as a dummy parameter.

At this time the cuSolver gesvd function only supports jobu = 'A' and jobvt = 'A'
So the error when you specify other combinations is expected. From the documentation:
Remark 2: gesvd only supports jobu='A' and jobvt='A' and returns matrix U and VH

USE OF cusolverDnSgesvd
As remarked by lebedov, as of CUDA 8.0, it is now possible to calculate the singular values only by cusolverDnSgesvd. I report below a slightly modified version of your code with two calls to cusolverDnSgesvd, one performing the singular values calculation only
cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo)
and one performing the full SVD calculation
cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo)
As you already remarked, the two 'A' fields for the full SVD case are changed to 'N' in the singular values only case. Please, note that, in the singular values only case, there is no need to store space for the singular vector matrices U and V. Indeed, a NULL pointer is passed.
The singular values calculation only is faster than the full SVD calculation. On a GTX 960, for a 1000x1000 matrix, the timing has been the following:
Singular values only: 559 ms
Full SVD: 2239 ms
Here is the full code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<iostream>
#include<stdlib.h>
#include<stdio.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
/********/
/* MAIN */
/********/
// Times cusolverDnSgesvd on the same M x N matrix twice: once computing the
// singular values only (jobu = jobvt = 'N') and once computing the full SVD
// (jobu = jobvt = 'A'), printing the devInfo diagnosis and the elapsed time
// reported by TimingGPU for each run.
int main(){
    const int M = 1000;
    const int N = 1000;
    TimingGPU timerGPU;
    float elapsedTime;

    // --- Setting the host matrix (element (i, j) is stored at j*M + i)
    float *h_A = (float *)malloc(M * N * sizeof(float));
    for (int i = 0; i < M; i++){
        for (int j = 0; j < N; j++){
            h_A[j*M + i] = (i + j) * (i + j);
        }
    }

    // --- Setting the device matrix and moving the host matrix to the device
    float *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

    // --- host side SVD results space
    float *h_S = (float *)malloc(N * sizeof(float));

    // --- device side SVD workspace and matrices
    int work_size = 0;
    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    float *d_U; gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
    float *d_V; gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
    float *d_S; gpuErrchk(cudaMalloc(&d_S, N * sizeof(float)));

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));
    cusolveSafeCall(cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size));
    float *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));

    // --- CUDA SVD execution - Singular values only
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo));
    elapsedTime = timerGPU.GetCounter();
    int devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h == 0)
        printf("SVD successfull for the singular values calculation only\n\n");
    else if (devInfo_h < 0)
        printf("SVD unsuccessfull for the singular values calculation only. Parameter %i is wrong\n", -devInfo_h);
    else
        printf("SVD unsuccessfull for the singular values calculation only. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", devInfo_h);
    printf("Calculation of the singular values only: %f ms\n\n", elapsedTime);

    // --- Moving the results from device to host
    //gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));
    //for (int i = 0; i < N; i++) std::cout << "d_S[" << i << "] = " << h_S[i] << std::endl;

    // --- gesvd destroys the contents of its input matrix, so the original A must be
    //     uploaded again (outside the timed region) before the second factorization
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

    // --- CUDA SVD execution - Full SVD
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo));
    elapsedTime = timerGPU.GetCounter();
    devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h == 0)
        printf("SVD successfull for the full SVD calculation\n\n");
    else if (devInfo_h < 0)
        printf("SVD unsuccessfull for the full SVD calculation. Parameter %i is wrong\n", -devInfo_h);
    else
        printf("SVD unsuccessfull for the full SVD calculation. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", devInfo_h);
    printf("Calculation of the full SVD calculation: %f ms\n\n", elapsedTime);

    // --- Releasing the solver handle, the device buffers and the host buffers
    cusolveSafeCall(cusolverDnDestroy(solver_handle));
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(devInfo));
    gpuErrchk(cudaFree(d_A));
    free(h_S);
    free(h_A);
    return 0;
}
EDIT - PERFORMANCE ACROSS DIFFERENT VERSIONS OF CUDA
I have compared the performance of the singular-values-only calculation and the full SVD computation for CUDA 8.0, CUDA 9.1 and CUDA 10.0, for a 5000x5000 matrix. Here are the results on a GTX 960.
Computation type CUDA 8.0 CUDA 9.1 CUDA 10.0
__________________________________________________________________
Singular values only 17s 15s 15s
Full SVD 161s 159s 457s
__________________________________________________________________

Related

Cuda GPUassert: an illegal memory access was encountered

I was trying to make a game program using __device __ variables instead of declaring it dynamically using cudaMalloc, but it keeps telling me that GPUassert: illegal memory access was encountered at the third last line where the cudaDeviceSynchronization() is called. I have tried the version using cudaMalloc and it worked out fine.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
// Wraps a CUDA runtime call so that a failure is reported with the file and
// line of the call site. The do { } while (0) wrapper makes the macro behave
// as a single statement, so it is safe inside un-braced if/else branches.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic for a failed CUDA runtime call and optionally exits.
//   code  - status returned by the CUDA runtime call
//   file  - source file of the call site
//   line  - source line of the call site
//   abort - when true (the default) the process exits with `code` as status
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
#define M 3
#define N 3
#define K 3
using namespace std;
__device__ double A_dev[M * K];
__device__ double B_dev[K * N];
__device__ double C_dev[M * N];
// Computes C = A * B, one output element per thread.
//   A - m x k matrix, element (r, c) read from A[r * k + c]
//   B - k x n matrix, element (r, c) read from B[n * r + c]
//   C - m x n matrix, element (r, c) written to C[r * n + c]
// Launch layout: the x thread index selects the row of C and the y thread
// index selects the column. Threads that fall outside the m x n output return
// immediately, so the launch grid may be larger than the matrix.
__global__ void gemm(double* A, double* B, double* C, int m, int n, int k)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    // Guard against reads and writes past the ends of A, B and C.
    if (x >= m || y >= n) return;
    int i = x * n + y;
    double sum = 0.0;
    for (int j = 0; j < k; j++)
    {
        sum += A[x * k + j] * B[n * j + y];
    }
    C[i] = sum;
    printf("The value is %f", C[i]);
}
// Fills A and B on the host, copies them into the __device__ arrays, runs the
// gemm kernel on them and copies the product back into C_h.
int main(void)
{
    double A_h[M * K];
    double B_h[K * N];
    double C_h[M * N];
    // Each array has its own extent, so each one is initialised with its own
    // bound instead of sharing the M * K bound of A.
    for (int i = 0; i < M * K; i++) A_h[i] = (double)i;
    for (int i = 0; i < K * N; i++) B_h[i] = (double)i;
    for (int i = 0; i < M * N; i++) C_h[i] = 0.0;

    gpuErrchk(cudaMemcpyToSymbol(A_dev, A_h, M * K * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpyToSymbol(B_dev, B_h, K * N * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpyToSymbol(C_dev, C_h, M * N * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // In host code the names A_dev/B_dev/C_dev do not denote device addresses,
    // so passing them straight to the kernel makes it dereference invalid
    // pointers. Ask the runtime for the device addresses of the symbols.
    double *d_A = NULL;
    double *d_B = NULL;
    double *d_C = NULL;
    gpuErrchk(cudaGetSymbolAddress((void **)&d_A, A_dev));
    gpuErrchk(cudaGetSymbolAddress((void **)&d_B, B_dev));
    gpuErrchk(cudaGetSymbolAddress((void **)&d_C, C_dev));

    dim3 dimGrid(1, 1, 1);
    dim3 dimBlock(M, N, 1);   // one thread per element of C
    gemm <<<dimGrid, dimBlock >>> (d_A, d_B, d_C, M, N, K);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpyFromSymbol(C_h, C_dev, M * N * sizeof(double), 0, cudaMemcpyDeviceToHost));
    return 0;
}
When using __device__ variables, they are inherently at global scope, and we do not pass those as kernel arguments. You use those variables directly in kernel code without having to have a kernel argument for them.
If you make the following changes to your code, it will run without error:
#include <iostream>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
// Wraps a CUDA runtime call so that a failure is reported with the file and
// line of the call site. The do { } while (0) wrapper makes the macro behave
// as a single statement, so it is safe inside un-braced if/else branches.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic for a failed CUDA runtime call and optionally exits.
//   code  - status returned by the CUDA runtime call
//   file  - source file of the call site
//   line  - source line of the call site
//   abort - when true (the default) the process exits with `code` as status
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
#define M 3
#define N 3
#define K 3
using namespace std;
__device__ double A_dev[M * K];
__device__ double B_dev[K * N];
__device__ double C_dev[M * N];
// Computes C_dev = A_dev * B_dev on the file-scope __device__ arrays, one
// output element per thread.
//   m, n, k - A_dev is read as m x k (index r * k + c), B_dev as k x n
//             (index n * r + c) and C_dev is written as m x n (index r * n + c)
// Launch layout: the x thread index selects the row of C_dev and the y thread
// index selects the column. Threads outside the m x n output return at once,
// so the launch grid may be larger than the matrix.
__global__ void gemm(int m, int n, int k)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    // Guard against reads and writes past the ends of the device arrays.
    if (x >= m || y >= n) return;
    int i = x * n + y;
    double sum = 0.0;
    for (int j = 0; j < k; j++)
    {
        sum += A_dev[x * k + j] * B_dev[n * j + y];
    }
    C_dev[i] = sum;
    printf("The value is %f", C_dev[i]);
}
// Fills A and B on the host, copies them into the __device__ arrays, runs the
// gemm kernel (which works on those arrays directly) and copies the product
// back into C_h.
int main(void)
{
    double A_h[M * K];
    double B_h[K * N];
    double C_h[M * N];
    // Each array has its own extent, so each one is initialised with its own
    // bound instead of sharing the M * K bound of A.
    for (int i = 0; i < M * K; i++) A_h[i] = (double)i;
    for (int i = 0; i < K * N; i++) B_h[i] = (double)i;
    for (int i = 0; i < M * N; i++) C_h[i] = 0.0;

    gpuErrchk(cudaMemcpyToSymbol(A_dev, A_h, M * K * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpyToSymbol(B_dev, B_h, K * N * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpyToSymbol(C_dev, C_h, M * N * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    dim3 dimGrid(1, 1, 1);
    dim3 dimBlock(M, N, 1);   // one thread per element of C
    // The dimensions come from the same macros that size the device arrays,
    // so the launch cannot drift out of sync with them.
    gemm <<<dimGrid, dimBlock >>> (M, N, K);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpyFromSymbol(C_h, C_dev, M * N * sizeof(double), 0, cudaMemcpyDeviceToHost));
    return 0;
}

Call multiple CUDA SVD (in cuSolver) [duplicate]

I'm new to parallel programming using GPU so I apologize if the question is broad or vague. I'm aware there is some parallel SVD function in the CULA library, but what should be the strategy if I have a large number of relatively small matrices to factorize? For example I have n matrices with dimension d, n is large and d is small. How to parallelize this process? Could anyone give me a hint?
My previous answer is now out-of-date. As of February 2015, CUDA 7 (currently in release candidate version) offers full SVD capabilities in its cuSOLVER library. Below, I'm providing an example of generating the singular value decomposition using CUDA cuSOLVER.
Concerning the specific issue you are raising (calculating the SVD of several matrices of small size), you should adapt the example I'm providing below by using streams. To associate a stream to each task you can use
cudaStreamCreate()
and
cusolverDnSetStream()
kernel.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include<iostream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>
#include<math.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>
#include "Utilities.cuh"
/********/
/* MAIN */
/********/
// Computes the full SVD of a small Nrows x Ncols double-precision matrix with
// cusolverDnDgesvd and prints the singular values and both sets of singular
// vectors.
int main(){
    // --- gesvd only supports Nrows >= Ncols
    // --- column major memory ordering
    const int Nrows = 7;
    const int Ncols = 5;
    const int Nsv = (Nrows < Ncols) ? Nrows : Ncols;   // number of singular values

    // --- cuSOLVE input/output parameters/arrays
    int work_size = 0;
    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));

    // --- Setting the host, Nrows x Ncols matrix
    double *h_A = (double *)malloc(Nrows * Ncols * sizeof(double));
    for (int j = 0; j < Nrows; j++)
        for (int i = 0; i < Ncols; i++)
            h_A[j + i*Nrows] = (i + j*j) * sqrt((double)(i + j));

    // --- Setting the device matrix and moving the host matrix to the device
    double *d_A; gpuErrchk(cudaMalloc(&d_A, Nrows * Ncols * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(double), cudaMemcpyHostToDevice));

    // --- host side SVD results space
    double *h_U = (double *)malloc(Nrows * Nrows * sizeof(double));
    double *h_V = (double *)malloc(Ncols * Ncols * sizeof(double));
    double *h_S = (double *)malloc(Nsv * sizeof(double));

    // --- device side SVD workspace and matrices
    double *d_U; gpuErrchk(cudaMalloc(&d_U, Nrows * Nrows * sizeof(double)));
    double *d_V; gpuErrchk(cudaMalloc(&d_V, Ncols * Ncols * sizeof(double)));
    double *d_S; gpuErrchk(cudaMalloc(&d_S, Nsv * sizeof(double)));

    // --- CUDA SVD initialization
    cusolveSafeCall(cusolverDnDgesvd_bufferSize(solver_handle, Nrows, Ncols, &work_size));
    double *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));

    // --- CUDA SVD execution
    cusolveSafeCall(cusolverDnDgesvd(solver_handle, 'A', 'A', Nrows, Ncols, d_A, Nrows, d_S, d_U, Nrows, d_V, Ncols, work, work_size, NULL, devInfo));
    int devInfo_h = 0; gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h != 0) std::cout << "Unsuccessful SVD execution\n\n";

    // --- Moving the results from device to host
    gpuErrchk(cudaMemcpy(h_S, d_S, Nsv * sizeof(double), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_U, d_U, Nrows * Nrows * sizeof(double), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_V, d_V, Ncols * Ncols * sizeof(double), cudaMemcpyDeviceToHost));

    std::cout << "Singular values\n";
    for (int i = 0; i < Nsv; i++)
        std::cout << "d_S["<<i<<"] = " << std::setprecision(15) << h_S[i] << std::endl;

    std::cout << "\nLeft singular vectors - For y = A * x, the columns of U span the space of y\n";
    for (int j = 0; j < Nrows; j++) {
        printf("\n");
        for (int i = 0; i < Nrows; i++)
            printf("U[%i,%i]=%f\n",i,j,h_U[j*Nrows + i]);
    }

    std::cout << "\nRight singular vectors - For y = A * x, the columns of V span the space of x\n";
    // gesvd returns V^T (column-major, leading dimension Ncols), not V, so
    // V(i, j) = V^T(j, i) is stored at h_V[i*Ncols + j].
    for (int i = 0; i < Ncols; i++) {
        printf("\n");
        for (int j = 0; j < Ncols; j++)
            printf("V[%i,%i]=%f\n",i,j,h_V[i*Ncols + j]);
    }

    // --- Releasing the solver handle, the device buffers and the host buffers
    cusolveSafeCall(cusolverDnDestroy(solver_handle));
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(d_A));
    gpuErrchk(cudaFree(devInfo));
    free(h_S);
    free(h_V);
    free(h_U);
    free(h_A);
    return 0;
}
Utilities.cuh
// Declarations of the helper functions shared by the examples.
// The include guard below keeps the declarations from being processed twice.
#ifndef UTILITIES_CUH
#define UTILITIES_CUH
// Integer division helper taking two ints and returning an int.
extern "C" int iDivUp(int, int);
// Status check for a CUDA runtime call (takes the cudaError_t it returned).
extern "C" void gpuErrchk(cudaError_t);
// Status check for a cuSOLVER call (takes the cusolverStatus_t it returned).
extern "C" void cusolveSafeCall(cusolverStatus_t);
#endif
Utilities.cu
#include <stdio.h>
#include <assert.h>
#include "cuda_runtime.h"
#include <cuda.h>
#include <cusolverDn.h>
/*******************/
/* iDivUp FUNCTION */
/*******************/
// Divides a by b with truncating integer division and adds one to the
// quotient whenever the division leaves a non-zero remainder.
extern "C" int iDivUp(int a, int b)
{
    const int quotient = a / b;
    if (a % b == 0) return quotient;
    return quotient + 1;
}
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
// Reports a failed CUDA runtime call on stderr and optionally terminates.
//   code  - status returned by the CUDA runtime call being checked
//   file  - source file name printed in the diagnostic
//   line  - source line number printed in the diagnostic
//   abort - when true (the default) the process exits with `code` as status
// `file` is a pointer to const so that string literals such as __FILE__ can be
// passed without an invalid string-literal-to-char* conversion.
void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}
// Checks the status of a CUDA runtime call by forwarding it to gpuAssert.
// NOTE: because this is a function and not a macro, __FILE__ and __LINE__ are
// expanded here, so every diagnostic reports this line rather than the line of
// the failing call.
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
// Maps a cuSOLVER status code to a printable name.
// Status values without a case below yield "<unknown>".
static const char *_cudaGetErrorEnum(cusolverStatus_t error)
{
    const char *name = "<unknown>";
    switch (error)
    {
        case CUSOLVER_STATUS_SUCCESS:                   name = "CUSOLVER_SUCCESS";                          break;
        case CUSOLVER_STATUS_NOT_INITIALIZED:           name = "CUSOLVER_STATUS_NOT_INITIALIZED";           break;
        case CUSOLVER_STATUS_ALLOC_FAILED:              name = "CUSOLVER_STATUS_ALLOC_FAILED";              break;
        case CUSOLVER_STATUS_INVALID_VALUE:             name = "CUSOLVER_STATUS_INVALID_VALUE";             break;
        case CUSOLVER_STATUS_ARCH_MISMATCH:             name = "CUSOLVER_STATUS_ARCH_MISMATCH";             break;
        case CUSOLVER_STATUS_EXECUTION_FAILED:          name = "CUSOLVER_STATUS_EXECUTION_FAILED";          break;
        case CUSOLVER_STATUS_INTERNAL_ERROR:            name = "CUSOLVER_STATUS_INTERNAL_ERROR";            break;
        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: name = "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; break;
    }
    return name;
}
// Terminates the program with a diagnostic when a cuSOLVER call has failed.
//   err  - status returned by the cuSOLVER call
//   file - source file name printed in the diagnostic
//   line - source line number printed in the diagnostic
// Does nothing when err is CUSOLVER_STATUS_SUCCESS.
inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
    if (CUSOLVER_STATUS_SUCCESS != err) {
        // One conversion per argument: the caller-supplied location, then the
        // numeric status and its symbolic name.
        fprintf(stderr, "CUSOLVE error in file '%s', line %d\nerror %d: %s\nterminating!\n",
                file, line, (int)err, _cudaGetErrorEnum(err));
        cudaDeviceReset();
        assert(0);
        // assert() compiles to nothing when NDEBUG is defined; still terminate.
        exit(1);
    }
}
// Checks the status of a cuSOLVER call by forwarding it to __cusolveSafeCall.
// NOTE: because this is a function and not a macro, __FILE__ and __LINE__ are
// expanded here, so the location passed on is this line rather than the line
// of the failing call.
extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }
You can take a look at the Batched Operations post of the CULA blog for a discussion of your problem.
EDIT
From what I understand from your comment below, you would like each thread to calculate a separate SVD. So, basically each thread should execute a standard, sequential SVD scheme. For that some possibly useful references:
Numerical Recipes
Golub, Van Loan, Matrix Computations
If you use this approach, though, I'm afraid you will no longer be able to use cuBLAS, as those are host functions not callable from the device (unless you have a device of compute capability 3.5 or higher; see the simpleDevLibCUBLAS example). But basically in this way I think you are somehow implementing the batch concept by yourself.
If you decide to go to a more standard parallel GPU implementation, the reference below could be of interest:
Singular Value Decomposition on GPU using CUDA
The above answers are now out of date. As of CUDA 9.0, the cuSOLVER library has been equipped with a batched SVD calculation based on the Jacobi method. Below, a fully worked example:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
//#define FULLSVD
//#define PRINTRESULTS
/********/
/* MAIN */
/********/
// Runs the batched Jacobi SVD (cusolverDnDgesvdjBatched) on numMatrices small
// M x N matrices, times the call and reports the per-matrix convergence status.
// Define FULLSVD to also compute the singular vectors and PRINTRESULTS to dump
// the results.
int main() {

    const int M = 3;
    const int N = 3;
    const int lda = M;                        // leading dimension of each A
    const int ldu = M;                        // leading dimension of each U (M x M)
    const int ldv = N;                        // leading dimension of each V (N x N)
    const int minMN = (M < N) ? M : N;        // singular values per matrix
    //const int numMatrices = 3;
    const int numMatrices = 16384;

    TimingGPU timerGPU;

    // --- Setting the host matrices (matrix k starts at k * M * N; element (i, j) at j * M + i)
    double *h_A = (double *)malloc(lda * N * numMatrices * sizeof(double));
    for (int k = 0; k < numMatrices; k++)
        for (int i = 0; i < M; i++) {
            for (int j = 0; j < N; j++) {
                h_A[k * M * N + j * M + i] = (1. / (k + 1)) * (i + j * j) * (i + j);
                //printf("%d %d %f\n", i, j, h_A[j*M + i]);
            }
        }

    // --- Setting the device matrices and moving the host matrices to the device
    double *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * numMatrices * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * numMatrices * sizeof(double), cudaMemcpyHostToDevice));

    // --- host side SVD results space
    double *h_S = (double *)malloc(minMN * numMatrices * sizeof(double));
    double *h_U = NULL;
    double *h_V = NULL;
#ifdef FULLSVD
    h_U = (double *)malloc(ldu * M * numMatrices * sizeof(double));
    h_V = (double *)malloc(ldv * N * numMatrices * sizeof(double));
#endif

    // --- device side SVD workspace and matrices
    int work_size = 0;

    // --- The batched routine writes one status entry per matrix, so the info
    //     buffer must hold numMatrices ints (a single int would be overrun).
    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, numMatrices * sizeof(int)));
    int *h_info = (int *)malloc(numMatrices * sizeof(int));   /* host copy of devInfo */

    double *d_S; gpuErrchk(cudaMalloc(&d_S, minMN * numMatrices * sizeof(double)));
    double *d_U = NULL;
    double *d_V = NULL;
#ifdef FULLSVD
    gpuErrchk(cudaMalloc(&d_U, ldu * M * numMatrices * sizeof(double)));
    gpuErrchk(cudaMalloc(&d_V, ldv * N * numMatrices * sizeof(double)));
#endif

    double *d_work = NULL; /* device workspace for gesvdj */

    // --- Parameters configuration of Jacobi-based SVD
    const double tol = 1.e-7;
    const int maxSweeps = 15;
    cusolverEigMode_t jobz; // --- CUSOLVER_EIG_MODE_VECTOR - Compute eigenvectors; CUSOLVER_EIG_MODE_NOVECTOR - Compute singular values only
#ifdef FULLSVD
    jobz = CUSOLVER_EIG_MODE_VECTOR;
#else
    jobz = CUSOLVER_EIG_MODE_NOVECTOR;
#endif

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle = NULL;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));

    // --- Configuration of gesvdj
    gesvdjInfo_t gesvdj_params = NULL;
    cusolveSafeCall(cusolverDnCreateGesvdjInfo(&gesvdj_params));

    // --- Set the computation tolerance, since the default tolerance is machine precision
    cusolveSafeCall(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol));

    // --- Set the maximum number of sweeps, since the default value of max. sweeps is 100
    cusolveSafeCall(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, maxSweeps));

    // --- Query the SVD workspace
    cusolveSafeCall(cusolverDnDgesvdjBatched_bufferSize(
        solver_handle,
        jobz,                                       // --- Compute the singular vectors or not
        M,                                          // --- Number of rows of A, 0 <= M
        N,                                          // --- Number of columns of A, 0 <= N
        d_A,                                        // --- M x N, one per matrix
        lda,                                        // --- Leading dimension of A
        d_S,                                        // --- min(M, N) singular values per matrix
        d_U,                                        // --- M x M, one per matrix
        ldu,                                        // --- Leading dimension of U, ldu >= max(1, M)
        d_V,                                        // --- N x N, one per matrix
        ldv,                                        // --- Leading dimension of V, ldv >= max(1, N)
        &work_size,
        gesvdj_params,
        numMatrices));

    gpuErrchk(cudaMalloc(&d_work, sizeof(double) * work_size));

    // --- Compute SVD
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnDgesvdjBatched(
        solver_handle,
        jobz,                                       // --- Compute the singular vectors or not
        M,                                          // --- Number of rows of A, 0 <= M
        N,                                          // --- Number of columns of A, 0 <= N
        d_A,                                        // --- M x N, one per matrix
        lda,                                        // --- Leading dimension of A
        d_S,                                        // --- min(M, N) singular values per matrix
        d_U,                                        // --- M x M, one per matrix
        ldu,                                        // --- Leading dimension of U, ldu >= max(1, M)
        d_V,                                        // --- N x N, one per matrix
        ldv,                                        // --- Leading dimension of V, ldv >= max(1, N)
        d_work,
        work_size,
        devInfo,                                    // --- numMatrices status entries
        gesvdj_params,
        numMatrices));

#ifdef FULLSVD
    printf("Calculation of the full SVD: %f ms\n\n", timerGPU.GetCounter());
#else
    printf("Calculation of the singular values only: %f ms\n\n", timerGPU.GetCounter());
#endif

    gpuErrchk(cudaMemcpy(h_info, devInfo, sizeof(int) * numMatrices, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_S, d_S, sizeof(double) * minMN * numMatrices, cudaMemcpyDeviceToHost));
#ifdef FULLSVD
    gpuErrchk(cudaMemcpy(h_U, d_U, sizeof(double) * ldu * M * numMatrices, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_V, d_V, sizeof(double) * ldv * N * numMatrices, cudaMemcpyDeviceToHost));
#endif

#ifdef PRINTRESULTS
    printf("SINGULAR VALUES \n");
    printf("_______________ \n");
    for (int k = 0; k < numMatrices; k++) {
        for (int p = 0; p < minMN; p++)
            printf("Matrix nr. %d; SV nr. %d; Value = %f\n", k, p, h_S[k * minMN + p]);
        printf("\n");
    }
#ifdef FULLSVD
    printf("SINGULAR VECTORS U \n");
    printf("__________________ \n");
    for (int k = 0; k < numMatrices; k++) {
        for (int q = 0; q < M; q++)
            for (int p = 0; p < M; p++)
                printf("Matrix nr. %d; U nr. %d; Value = %f\n", k, p, h_U[ldu * M * k + q * ldu + p]);
        printf("\n");
    }

    printf("SINGULAR VECTORS V \n");
    printf("__________________ \n");
    for (int k = 0; k < numMatrices; k++) {
        for (int q = 0; q < N; q++)
            for (int p = 0; p < N; p++)
                printf("Matrix nr. %d; V nr. %d; Value = %f\n", k, p, h_V[ldv * N * k + q * ldv + p]);
        printf("\n");
    }
#endif
#endif

    // --- Status check: a negative first entry flags a wrong parameter; any
    //     other non-zero entry means that matrix did not converge.
    if (h_info[0] < 0) {
        printf("%d-th parameter is wrong \n", -h_info[0]);
        exit(1);
    }
    int notConverged = 0;
    int firstFailed = -1;
    for (int k = 0; k < numMatrices; k++) {
        if (h_info[k] != 0) {
            if (firstFailed < 0) firstFailed = k;
            notConverged++;
        }
    }
    if (notConverged == 0) {
        printf("gesvdj converges \n");
    }
    else {
        printf("WARNING: gesvdj does not converge on %d of %d matrices (first: matrix %d, devInfo_h = %d) \n",
               notConverged, numMatrices, firstFailed, h_info[firstFailed]);
    }

    // --- Free resources
    if (d_A) gpuErrchk(cudaFree(d_A));
    if (d_S) gpuErrchk(cudaFree(d_S));
#ifdef FULLSVD
    if (d_U) gpuErrchk(cudaFree(d_U));
    if (d_V) gpuErrchk(cudaFree(d_V));
#endif
    if (devInfo) gpuErrchk(cudaFree(devInfo));
    if (d_work) gpuErrchk(cudaFree(d_work));
    if (solver_handle) cusolveSafeCall(cusolverDnDestroy(solver_handle));
    if (gesvdj_params) cusolveSafeCall(cusolverDnDestroyGesvdjInfo(gesvdj_params));

    free(h_info);
    free(h_V);
    free(h_U);
    free(h_S);
    free(h_A);

    gpuErrchk(cudaDeviceReset());

    return 0;
}

Solving dense linear systems AX = B with CUDA

Can I use the new cuSOLVER library (CUDA 7) to solve linear systems of the form
AX = B
where A, X and B are NxN dense matrices ?
Yes.
Approach nr. 1
In the framework of cuSOLVER you can use QR decomposition, see QR decomposition to solve linear systems in CUDA.
Approach nr. 2
Alternatively, you can calculate the matrix inverse by the successive invocation of
cublas<t>getrfBatched()
which calculates the LU decomposition of a matrix, and
cublas<t>getriBatched()
which calculates the inverse of the matrix starting from its LU decomposition.
Approach nr. 3
A final possibility is using
cublas<t>getrfBatched()
followed by a twofold invocation of
cublas<t>trsm()
which solves upper or lower triangular linear systems.
As pointed out by Robert Crovella, the answer may vary on the size and the type of the involved matrices.
Code for approach nr. 1
Please, see QR decomposition to solve linear systems in CUDA.
Code for approaches nr. 2 and nr. 3
Below, I'm reporting a worked example for the implementation of approaches nr. 2 and 3. Hankel matrices are used to feed the approaches with well-conditioned, invertible matrices. Please note that approach nr. 3 requires permuting (rearranging) the system coefficients vector according to the pivot array obtained following the invocation of cublas<t>getrfBatched(). This permutation can be conveniently done on the CPU.
#include <stdio.h>
#include <fstream>
#include <iomanip>
#include <stdlib.h> /* srand, rand */
#include <time.h> /* time */
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define prec_save 10
#define BLOCKSIZE 256
#define BLOCKSIZEX 16
#define BLOCKSIZEY 16
/************************************/
/* SAVE REAL ARRAY FROM CPU TO FILE */
/************************************/
// Writes the M elements of the host array h_in to the text file `filename`,
// one value per line, formatted with prec_save digits of precision.
template <class T>
void saveCPUrealtxt(const T * h_in, const char *filename, const int M) {
    std::ofstream out(filename);
    int idx = 0;
    while (idx < M) {
        out << std::setprecision(prec_save) << h_in[idx] << "\n";
        ++idx;
    }
    out.close();
}
/************************************/
/* SAVE REAL ARRAY FROM GPU TO FILE */
/************************************/
// Copies the M elements of the device array d_in to a temporary host buffer
// and writes them to the text file `filename`, one value per line, formatted
// with prec_save digits of precision.
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {
    T *h_in = (T *)malloc(M * sizeof(T));
    gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));
    std::ofstream outfile;
    outfile.open(filename);
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
    outfile.close();
    // The staging buffer is only needed while writing; release it so that
    // repeated calls do not leak host memory.
    free(h_in);
}
/***************************************************/
/* FUNCTION TO SET THE VALUES OF THE HANKEL MATRIX */
/***************************************************/
// --- https://en.wikipedia.org/wiki/Hankel_matrix
// Fills the N x N matrix h_A with a random Hankel matrix: entry (i, j) holds
// the (i + j)-th of 2N - 1 values drawn from rand(), after seeding the
// generator with the current time.
// --- https://en.wikipedia.org/wiki/Hankel_matrix
void setHankelMatrix(double * __restrict h_A, const int N) {
    const int numDiagonals = 2 * N - 1;
    double *diagValues = (double *)malloc(numDiagonals * sizeof(double));

    // --- Seed the generator, then draw one value per anti-diagonal
    srand(time(NULL));
    for (int d = 0; d < numDiagonals; d++) diagValues[d] = rand();

    // --- Entry (row, col) depends only on row + col, so the matrix is symmetric
    //     and the storage order (by row or by column) does not matter.
    for (int row = 0; row < N; row++) {
        double *rowStart = h_A + row * N;
        for (int col = 0; col < N; col++)
            rowStart[col] = diagValues[row + col];
    }

    free(diagValues);
}
/***********************************************/
/* FUNCTION TO COMPUTE THE COEFFICIENTS VECTOR */
/***********************************************/
// Computes h_y = A * h_xref for an N x N matrix whose element (m, n) is read
// from h_A[n * N + m]; h_y receives the N results.
void computeCoefficientsVector(const double * __restrict h_A, const double * __restrict h_xref,
                               double * __restrict h_y, const int N) {
    for (int row = 0; row < N; row++) {
        // Accumulate in the same column order as a plain running sum.
        double acc = 0.f;
        for (int col = 0; col < N; col++)
            acc = acc + h_A[col * N + row] * h_xref[col];
        h_y[row] = acc;
    }
}
/************************************/
/* COEFFICIENT REARRANGING FUNCTION */
/************************************/
// Applies the row interchanges recorded in pivotArray to vec, in order:
// for i = 0 .. N-1, element i is swapped with element pivotArray[i] - 1
// (the pivot entries are 1-based).
void rearrange(double *vec, int *pivotArray, int N){
    int i = 0;
    while (i < N) {
        const int partner = pivotArray[i] - 1;
        if (partner != i) {
            // Swapping an element with itself is a no-op, so it is skipped.
            const double moved = vec[partner];
            vec[partner] = vec[i];
            vec[i] = moved;
        }
        ++i;
    }
}
/********/
/* MAIN */
/********/
int main() {
    // --- Problem size: a single N x N system, processed through the batched cuBLAS routines
    const unsigned int N = 1000;
    const unsigned int Nmatrices = 1;
    // --- Byte counts are computed once, in size_t, so the products cannot overflow a 32-bit integer
    const size_t matrixBytes  = (size_t)N * N * Nmatrices * sizeof(double);
    const size_t vectorBytes  = (size_t)N * sizeof(double);
    const size_t pivotBytes   = (size_t)N * Nmatrices * sizeof(int);
    const size_t infoBytes    = (size_t)Nmatrices * sizeof(int);
    const size_t pointerBytes = (size_t)Nmatrices * sizeof(double *);
    // --- CUBLAS initialization
    cublasHandle_t cublas_handle;
    cublasSafeCall(cublasCreate(&cublas_handle));
    TimingGPU timerLU, timerApproach1, timerApproach2;
    double timingLU, timingApproach1, timingApproach2;
    /***********************/
    /* SETTING THE PROBLEM */
    /***********************/
    // --- Host buffers: matrix to be inverted, reference solution, coefficient vector and result
    //     (only one system in this example)
    double *h_A    = (double *)malloc(matrixBytes);
    double *h_xref = (double *)malloc(vectorBytes);
    double *h_y    = (double *)malloc(vectorBytes);
    double *h_x    = (double *)malloc(vectorBytes);
    int *h_InfoArray  = (int *)malloc(infoBytes);
    int *h_pivotArray = (int *)malloc(pivotBytes);
    if (!h_A || !h_xref || !h_y || !h_x || !h_InfoArray || !h_pivotArray) {
        fprintf(stderr, "Host memory allocation failed\n");
        exit(EXIT_FAILURE);
    }
    // --- Setting the Hankel matrix
    setHankelMatrix(h_A, N);
    // --- Defining the solution
    for (unsigned int k = 0; k < N; k++) h_xref[k] = 1.0;
    // --- Coefficient vector y = A * xref
    computeCoefficientsVector(h_A, h_xref, h_y, N);
    // --- Allocate device space for the input matrices
    double *d_A; gpuErrchk(cudaMalloc(&d_A, matrixBytes));
    double *d_y; gpuErrchk(cudaMalloc(&d_y, vectorBytes));
    double *d_x; gpuErrchk(cudaMalloc(&d_x, vectorBytes));
    // --- Move the relevant matrices from host to device
    gpuErrchk(cudaMemcpy(d_A, h_A, matrixBytes, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_y, h_y, vectorBytes, cudaMemcpyHostToDevice));
    /**********************************/
    /* COMPUTING THE LU DECOMPOSITION */
    /**********************************/
    timerLU.StartCounter();
    // --- Creating the array of pointers needed as input/output to the batched getrf
    double **h_inout_pointers = (double **)malloc(pointerBytes);
    if (!h_inout_pointers) {
        fprintf(stderr, "Host memory allocation failed\n");
        exit(EXIT_FAILURE);
    }
    for (unsigned int i = 0; i < Nmatrices; i++) h_inout_pointers[i] = d_A + (size_t)i * N * N;
    double **d_inout_pointers;
    gpuErrchk(cudaMalloc(&d_inout_pointers, pointerBytes));
    gpuErrchk(cudaMemcpy(d_inout_pointers, h_inout_pointers, pointerBytes, cudaMemcpyHostToDevice));
    free(h_inout_pointers);
    int *d_pivotArray; gpuErrchk(cudaMalloc(&d_pivotArray, pivotBytes));
    int *d_InfoArray;  gpuErrchk(cudaMalloc(&d_InfoArray, infoBytes));
    // --- In-place factorization: on exit d_A holds L (unit diagonal, not stored) and U
    cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, d_pivotArray, d_InfoArray, Nmatrices));
    //cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, NULL, d_InfoArray, Nmatrices));
    gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, infoBytes, cudaMemcpyDeviceToHost));
    for (int i = 0; i < (int)Nmatrices; i++)
        if (h_InfoArray[i] != 0) {
            fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }
    timingLU = timerLU.GetCounter();
    printf("Timing LU decomposition %f [ms]\n", timingLU);
    /*********************************/
    /* CHECKING THE LU DECOMPOSITION */
    /*********************************/
    saveCPUrealtxt(h_A, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\A.txt", N * N);
    saveCPUrealtxt(h_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\y.txt", N);
    saveGPUrealtxt(d_A, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\Adecomposed.txt", N * N);
    saveGPUrealtxt(d_pivotArray, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\pivotArray.txt", N);
    /******************************************************************************/
    /* APPROACH NR.1: COMPUTE THE INVERSE OF A STARTING FROM ITS LU DECOMPOSITION */
    /******************************************************************************/
    timerApproach1.StartCounter();
    // --- Allocate device space for the inverted matrices
    double *d_Ainv; gpuErrchk(cudaMalloc(&d_Ainv, matrixBytes));
    // --- Creating the array of pointers needed as output to the batched getri
    double **h_out_pointers = (double **)malloc(pointerBytes);
    if (!h_out_pointers) {
        fprintf(stderr, "Host memory allocation failed\n");
        exit(EXIT_FAILURE);
    }
    for (unsigned int i = 0; i < Nmatrices; i++) h_out_pointers[i] = d_Ainv + (size_t)i * N * N;
    double **d_out_pointers;
    gpuErrchk(cudaMalloc(&d_out_pointers, pointerBytes));
    gpuErrchk(cudaMemcpy(d_out_pointers, h_out_pointers, pointerBytes, cudaMemcpyHostToDevice));
    free(h_out_pointers);
    cublasSafeCall(cublasDgetriBatched(cublas_handle, N, (const double **)d_inout_pointers, N, d_pivotArray, d_out_pointers, N, d_InfoArray, Nmatrices));
    gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, infoBytes, cudaMemcpyDeviceToHost));
    for (int i = 0; i < (int)Nmatrices; i++)
        if (h_InfoArray[i] != 0) {
            fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singular\n", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }
    // --- x = inv(A) * y
    double alpha1 = 1.0;
    double beta1 = 0.0;
    cublasSafeCall(cublasDgemv(cublas_handle, CUBLAS_OP_N, N, N, &alpha1, d_Ainv, N, d_y, 1, &beta1, d_x, 1));
    timingApproach1 = timingLU + timerApproach1.GetCounter();
    printf("Timing approach 1 %f [ms]\n", timingApproach1);
    /**************************/
    /* CHECKING APPROACH NR.1 */
    /**************************/
    saveGPUrealtxt(d_x, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach1.txt", N);
    /*************************************************************/
    /* APPROACH NR.2: INVERT UPPER AND LOWER TRIANGULAR MATRICES */
    /*************************************************************/
    timerApproach2.StartCounter();
    // --- Bring the right-hand side and the pivots to the host, apply the row interchanges
    //     to the right-hand side there, and send the permuted vector back to the device.
    //     The right-hand side is made of doubles, so the copy size must use sizeof(double).
    gpuErrchk(cudaMemcpy(h_y, d_y, vectorBytes, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_pivotArray, d_pivotArray, pivotBytes, cudaMemcpyDeviceToHost));
    rearrange(h_y, h_pivotArray, N);
    gpuErrchk(cudaMemcpy(d_y, h_y, vectorBytes, cudaMemcpyHostToDevice));
    // --- Now P*A=L*U
    //     Linear system A*x=y => P.'*L*U*x=y => L*U*x=P*y
    const double alpha = 1.0;
    // --- cublasDtrsm solves a triangular system with multiple right-hand sides and
    //     overwrites the right-hand side (d_y) with the solution
    // --- 1st phase - lower triangular part (unit diagonal)
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, d_A, N, d_y, N));
    // --- 2nd phase - upper triangular part
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, d_A, N, d_y, N));
    timingApproach2 = timingLU + timerApproach2.GetCounter();
    printf("Timing approach 2 %f [ms]\n", timingApproach2);
    /**************************/
    /* CHECKING APPROACH NR.2 */
    /**************************/
    saveGPUrealtxt(d_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach2.txt", N);
    /*********************/
    /* RELEASE RESOURCES */
    /*********************/
    gpuErrchk(cudaFree(d_out_pointers));
    gpuErrchk(cudaFree(d_Ainv));
    gpuErrchk(cudaFree(d_InfoArray));
    gpuErrchk(cudaFree(d_pivotArray));
    gpuErrchk(cudaFree(d_inout_pointers));
    gpuErrchk(cudaFree(d_x));
    gpuErrchk(cudaFree(d_y));
    gpuErrchk(cudaFree(d_A));
    cublasSafeCall(cublasDestroy(cublas_handle));
    free(h_pivotArray);
    free(h_InfoArray);
    free(h_x);
    free(h_y);
    free(h_xref);
    free(h_A);
    return 0;
}
The Utilities.cu and Utilities.cuh files needed to run such an example are maintained at this github page. The TimingGPU.cu and TimingGPU.cuh files are maintained at this github page.
Some useful references on the third approach:
NAG Fortran Library Routine Document
Scientific Computing Software Library (SCSL) User’s Guide
https://www.cs.drexel.edu/~jjohnson/2010-11/summer/cs680/programs/lapack/Danh/verify_sequential.c
EDIT
Timings (in ms) for approaches nr. 2 and 3 (tests performed on a GTX960 card, cc. 5.2).
N LU decomposition Approach nr. 2 Approach nr. 3
100 1.08 2.75 1.28
500 45.4 161 45.7
1000 302 1053 303
As it emerges, approach nr. 3 is more convenient and its cost is essentially the cost of computing the LU factorization. Furthermore:
Solving linear systems by LU decomposition is faster than using QR decomposition (see QR decomposition to solve linear systems in CUDA);
LU decomposition is limited to square linear systems, while QR decomposition helps in case of non-square linear systems.
The below Matlab code can be used for checking the results
% --- Verification script: loads the text files written by the CUDA program (A.txt, y.txt,
% --- Adecomposed.txt, pivotArray.txt, xApproach1.txt, xApproach2.txt) from the current
% --- folder and prints percentage rms errors against quantities recomputed in Matlab.
clear all
close all
clc
warning off
% --- Must match the value of N used by the CUDA program that produced the files
N = 1000;
% --- Setting the problem solution
x = ones(N, 1);
%%%%%%%%%%%%%%%%%%%%%
% NxN HANKEL MATRIX %
%%%%%%%%%%%%%%%%%%%%%
% --- https://en.wikipedia.org/wiki/Hankel_matrix
load A.txt
load y.txt
% --- A.txt is loaded as a flat list of N * N values; reshape fills the N x N matrix column by column
A = reshape(A, N, N);
% --- Coefficients vector recomputed in Matlab, compared against the one saved by CUDA
yMatlab = A * x;
fprintf('Percentage rms between coefficients vectors in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(yMatlab - y).^2)) / sum(sum(abs(yMatlab).^2))));
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% COMPUTATION OF THE LU DECOMPOSITION %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% --- Matlab reference factors, used further below to compute the Matlab solution
[Lmatlab, Umatlab] = lu(A);
load Adecomposed.txt
Adecomposed = reshape(Adecomposed, N, N);
% --- L: identity on the diagonal plus the strictly lower triangular part of Adecomposed
L = eye(N);
for k = 1 : N
L(k + 1 : N, k) = Adecomposed(k + 1 : N, k);
end
% --- U: the upper triangular part of Adecomposed, diagonal included
U = zeros(N);
for k = 1 : N
U(k, k : N) = Adecomposed(k, k : N);
end
load pivotArray.txt
% --- Build the permutation matrix Pj by applying, in order, the row interchange
% --- j <-> pivotArray(j) to the identity matrix
Pj = eye(N);
for j = 1 : N
tempVector = Pj(j, :);
Pj(j, :) = Pj(pivotArray(j), :);
Pj(pivotArray(j), :) = tempVector;
end
fprintf('Percentage rms between Pj * A and L * U in CUDA %f\n', 100 * sqrt(sum(sum(abs(Pj * A - L * U).^2)) / sum(sum(abs(Pj * A).^2))));
% --- Matlab solution obtained from the Matlab factors: first undo Lmatlab, then Umatlab
xprime = inv(Lmatlab) * yMatlab;
xMatlab = inv(Umatlab) * xprime;
fprintf('Percentage rms between reference solution and solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));
% --- Compare the two CUDA solutions against the reference solution x
load xApproach1.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.1 %f\n', 100 * sqrt(sum(sum(abs(xApproach1 - x).^2)) / sum(sum(abs(x).^2))));
load xApproach2.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.2 %f\n', 100 * sqrt(sum(sum(abs(xApproach2 - x).^2)) / sum(sum(abs(x).^2))));

Parallel implementation for multiple SVDs using CUDA

I'm new to parallel programming using GPU so I apologize if the question is broad or vague. I'm aware there is some parallel SVD function in the CULA library, but what should be the strategy if I have a large number of relatively small matrices to factorize? For example I have n matrices with dimension d, n is large and d is small. How to parallelize this process? Could anyone give me a hint?
My previous answer is now out-of-date. As of February 2015, CUDA 7 (currently in release candidate version) offers full SVD capabilities in its cuSOLVER library. Below, I'm providing an example of generating the singular value decomposition using CUDA cuSOLVER.
Concerning the specific issue you are raising (calculating the SVD of several small matrices), you should adapt the example I'm providing below by using streams. To associate a stream with each task you can use
cudaStreamCreate()
and
cusolverDnSetStream()
kernel.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include<iostream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>
#include<math.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>
#include "Utilities.cuh"
/********/
/* MAIN */
/********/
int main(){
    // --- gesvd only supports Nrows >= Ncols
    // --- column major memory ordering
    const int Nrows = 7;
    const int Ncols = 5;
    // --- Number of singular values
    const int Nsv = min(Nrows, Ncols);
    // --- cuSOLVE input/output parameters/arrays
    int work_size = 0;
    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    // --- CUDA solver initialization (status checked like every other cuSOLVER call)
    cusolverDnHandle_t solver_handle;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));
    // --- Setting the host, Nrows x Ncols matrix
    double *h_A = (double *)malloc(Nrows * Ncols * sizeof(double));
    // --- host side SVD results space
    double *h_U = (double *)malloc(Nrows * Nrows * sizeof(double));
    double *h_V = (double *)malloc(Ncols * Ncols * sizeof(double));
    double *h_S = (double *)malloc(Nsv * sizeof(double));
    if (!h_A || !h_U || !h_V || !h_S) {
        fprintf(stderr, "Host memory allocation failed\n");
        return EXIT_FAILURE;
    }
    for(int j = 0; j < Nrows; j++)
        for(int i = 0; i < Ncols; i++)
            h_A[j + i*Nrows] = (i + j*j) * sqrt((double)(i + j));
    // --- Setting the device matrix and moving the host matrix to the device
    double *d_A; gpuErrchk(cudaMalloc(&d_A, Nrows * Ncols * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(double), cudaMemcpyHostToDevice));
    // --- device side SVD workspace and matrices
    double *d_U; gpuErrchk(cudaMalloc(&d_U, Nrows * Nrows * sizeof(double)));
    // --- d_V receives the gesvd "VT" output, i.e. the transpose of V (see the printing loop below)
    double *d_V; gpuErrchk(cudaMalloc(&d_V, Ncols * Ncols * sizeof(double)));
    double *d_S; gpuErrchk(cudaMalloc(&d_S, Nsv * sizeof(double)));
    // --- CUDA SVD initialization
    cusolveSafeCall(cusolverDnDgesvd_bufferSize(solver_handle, Nrows, Ncols, &work_size));
    double *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));
    // --- CUDA SVD execution
    cusolveSafeCall(cusolverDnDgesvd(solver_handle, 'A', 'A', Nrows, Ncols, d_A, Nrows, d_S, d_U, Nrows, d_V, Ncols, work, work_size, NULL, devInfo));
    int devInfo_h = 0; gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h != 0) std::cout << "Unsuccessful SVD execution\n\n";
    // --- Moving the results from device to host
    gpuErrchk(cudaMemcpy(h_S, d_S, Nsv * sizeof(double), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_U, d_U, Nrows * Nrows * sizeof(double), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_V, d_V, Ncols * Ncols * sizeof(double), cudaMemcpyDeviceToHost));
    std::cout << "Singular values\n";
    for(int i = 0; i < Nsv; i++)
        std::cout << "d_S["<<i<<"] = " << std::setprecision(15) << h_S[i] << std::endl;
    std::cout << "\nLeft singular vectors - For y = A * x, the columns of U span the space of y\n";
    for(int j = 0; j < Nrows; j++) {
        printf("\n");
        for(int i = 0; i < Nrows; i++)
            printf("U[%i,%i]=%f\n",i,j,h_U[j*Nrows + i]);
    }
    std::cout << "\nRight singular vectors - For y = A * x, the columns of V span the space of x\n";
    for(int i = 0; i < Ncols; i++) {
        printf("\n");
        for(int j = 0; j < Ncols; j++)
            // --- h_V stores VT column-major with leading dimension Ncols, so
            //     V(i,j) = VT(j,i) = h_V[i*Ncols + j]
            printf("V[%i,%i]=%f\n",i,j,h_V[i*Ncols + j]);
    }
    // --- Release device and host resources
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(d_A));
    gpuErrchk(cudaFree(devInfo));
    cusolveSafeCall(cusolverDnDestroy(solver_handle));
    free(h_S);
    free(h_V);
    free(h_U);
    free(h_A);
    // --- Report a failed factorization through the exit status
    return (devInfo_h == 0) ? 0 : EXIT_FAILURE;
}
Utilities.cuh
#ifndef UTILITIES_CUH
#define UTILITIES_CUH
// --- The prototypes below mention cudaError_t and cusolverStatus_t: include their
//     declarations here so this header does not depend on the includer's include order.
#include <cuda_runtime.h>
#include <cusolverDn.h>
// --- Integer division helper taking two ints and returning an int
extern "C" int iDivUp(int, int);
// --- Checks the status returned by a CUDA runtime call
extern "C" void gpuErrchk(cudaError_t);
// --- Checks the status returned by a cuSOLVER call
extern "C" void cusolveSafeCall(cusolverStatus_t);
#endif
Utilities.cu
#include <stdio.h>
#include <assert.h>
#include "cuda_runtime.h"
#include <cuda.h>
#include <cusolverDn.h>
/*******************/
/* iDivUp FUNCTION */
/*******************/
extern "C" int iDivUp(int a, int b){
    // --- Truncated quotient a / b, increased by one whenever the division leaves a remainder
    const int quotient = a / b;
    if ((a % b) == 0) return quotient;
    return quotient + 1;
}
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
// --- Reports a failed CUDA runtime call on stderr and, when abort is true, terminates the
//     program using the error code as exit status. Does nothing when code is cudaSuccess.
//     file is a pointer to const because callers pass the string literal __FILE__.
void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}
// --- Function wrapper around gpuAssert. NOTE: being a function and not a macro, __FILE__ and
//     __LINE__ expand here, so the reported location is this line, not the caller's.
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
// --- Maps a cuSOLVER status code to a printable name.
//     Returns "<unknown>" for any value that has no case below.
static const char *_cudaGetErrorEnum(cusolverStatus_t error)
{
    switch (error)
    {
        case CUSOLVER_STATUS_SUCCESS:
            return "CUSOLVER_SUCCESS";
        case CUSOLVER_STATUS_NOT_INITIALIZED:
            return "CUSOLVER_STATUS_NOT_INITIALIZED";
        case CUSOLVER_STATUS_ALLOC_FAILED:
            return "CUSOLVER_STATUS_ALLOC_FAILED";
        case CUSOLVER_STATUS_INVALID_VALUE:
            return "CUSOLVER_STATUS_INVALID_VALUE";
        case CUSOLVER_STATUS_ARCH_MISMATCH:
            return "CUSOLVER_STATUS_ARCH_MISMATCH";
        case CUSOLVER_STATUS_MAPPING_ERROR:
            return "CUSOLVER_STATUS_MAPPING_ERROR";
        case CUSOLVER_STATUS_EXECUTION_FAILED:
            return "CUSOLVER_STATUS_EXECUTION_FAILED";
        case CUSOLVER_STATUS_INTERNAL_ERROR:
            return "CUSOLVER_STATUS_INTERNAL_ERROR";
        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
        case CUSOLVER_STATUS_NOT_SUPPORTED:
            return "CUSOLVER_STATUS_NOT_SUPPORTED";
        case CUSOLVER_STATUS_ZERO_PIVOT:
            return "CUSOLVER_STATUS_ZERO_PIVOT";
        case CUSOLVER_STATUS_INVALID_LICENSE:
            return "CUSOLVER_STATUS_INVALID_LICENSE";
        default:
            // --- Status values without a case above fall through to the generic string
            break;
    }
    return "<unknown>";
}
// --- Reports a failed cuSOLVER call on stderr (location, numeric status and its name),
//     resets the device and terminates the program. Does nothing on CUSOLVER_STATUS_SUCCESS.
//     err  : status returned by the cuSOLVER call
//     file : source file to report
//     line : source line to report
inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
    if (CUSOLVER_STATUS_SUCCESS != err) {
        // --- One argument per conversion: '%s' file, %d line, %d numeric status, %s status name
        fprintf(stderr, "CUSOLVE error in file '%s', line %d\nerror %d: %s\nterminating!\n",
                file, line, (int)err, _cudaGetErrorEnum(err));
        cudaDeviceReset();
        assert(0);
        // --- assert is compiled out when NDEBUG is defined: make sure we still terminate
        exit(EXIT_FAILURE);
    }
}
// --- Function wrapper around __cusolveSafeCall. NOTE: being a function and not a macro,
//     __FILE__ and __LINE__ expand here, so the reported location is this line, not the caller's.
extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }
You can take a look at the Batched Operations post of the CULA blog for a discussion of your problem.
EDIT
From what I understand from your comment below, you would like each thread to calculate a separate SVD. So, basically each thread should execute a standard, sequential SVD scheme. For that some possibly useful references:
Numerical Recipes
Golub, Van Loan, Matrix Computations
If you use this approach, though, I'm afraid you will no longer be able to use cuBLAS inside your kernels, since its routines are host functions that are not callable from the device (unless you have a device of compute capability 3.5 or higher; see the simpleDevLibCUBLAS example). But basically, in this way, I think you are somehow implementing the batch concept yourself.
If you decide to go to a more standard parallel GPU implementation, the reference below could be of interest:
Singular Value Decomposition on GPU using CUDA
The above answers are now out of date. As of CUDA 9.0, the cuSOLVER library has been equipped with a batched SVD calculation based on the Jacobi method. Below, a fully worked example:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
//#define FULLSVD
//#define PRINTRESULTS
/********/
/* MAIN */
/********/
int main() {
    // --- Size of each matrix (M rows, N columns) and leading dimensions of A, U and V
    const int M = 3;
    const int N = 3;
    const int lda = M;
    const int ldu = M;                          // --- ldu >= max(1, M)
    const int ldv = N;                          // --- ldv >= max(1, N)
    const int minmn = (M < N) ? M : N;          // --- Number of singular values per matrix
    //const int numMatrices = 3;
    const int numMatrices = 16384;
    int status = 0;                             // --- Exit status of the program
    TimingGPU timerGPU;
    // --- Setting the host matrices (column major, one after the other)
    double *h_A = (double *)malloc((size_t)lda * N * numMatrices * sizeof(double));
    // --- host side SVD results space
    double *h_S = (double *)malloc((size_t)minmn * numMatrices * sizeof(double));
    // --- host copy of the per-matrix status array
    int *h_info = (int *)malloc((size_t)numMatrices * sizeof(int));
    double *h_U = NULL;
    double *h_V = NULL;
#ifdef FULLSVD
    h_U = (double *)malloc((size_t)ldu * M * numMatrices * sizeof(double));
    h_V = (double *)malloc((size_t)ldv * N * numMatrices * sizeof(double));
    if (!h_U || !h_V) {
        fprintf(stderr, "Host memory allocation failed\n");
        exit(EXIT_FAILURE);
    }
#endif
    if (!h_A || !h_S || !h_info) {
        fprintf(stderr, "Host memory allocation failed\n");
        exit(EXIT_FAILURE);
    }
    for (int k = 0; k < numMatrices; k++)
        for (int i = 0; i < M; i++){
            for (int j = 0; j < N; j++){
                h_A[(size_t)k * lda * N + j * lda + i] = (1. / (k + 1)) * (i + j * j) * (i + j);
                //printf("%d %d %f\n", i, j, h_A[j*M + i]);
            }
        }
    // --- Setting the device matrix and moving the host matrix to the device
    double *d_A; gpuErrchk(cudaMalloc(&d_A, (size_t)lda * N * numMatrices * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A, h_A, (size_t)lda * N * numMatrices * sizeof(double), cudaMemcpyHostToDevice));
    // --- device side SVD workspace and matrices
    int work_size = 0;
    // --- The batched routine reports one status per matrix, so devInfo needs numMatrices entries
    int *devInfo; gpuErrchk(cudaMalloc(&devInfo, (size_t)numMatrices * sizeof(int)));
    double *d_S; gpuErrchk(cudaMalloc(&d_S, (size_t)minmn * numMatrices * sizeof(double)));
    double *d_U = NULL;
    double *d_V = NULL;
#ifdef FULLSVD
    gpuErrchk(cudaMalloc(&d_U, (size_t)ldu * M * numMatrices * sizeof(double)));
    gpuErrchk(cudaMalloc(&d_V, (size_t)ldv * N * numMatrices * sizeof(double)));
#endif
    double *d_work = NULL; /* device workspace for gesvdj */
    // --- Parameters configuration of Jacobi-based SVD
    const double tol = 1.e-7;
    const int maxSweeps = 15;
    cusolverEigMode_t jobz; // --- CUSOLVER_EIG_MODE_VECTOR - Compute eigenvectors; CUSOLVER_EIG_MODE_NOVECTOR - Compute singular values only
#ifdef FULLSVD
    jobz = CUSOLVER_EIG_MODE_VECTOR;
#else
    jobz = CUSOLVER_EIG_MODE_NOVECTOR;
#endif
    const int econ = 0; // --- econ = 1 for economy size
    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle = NULL;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));
    // --- Configuration of gesvdj
    gesvdjInfo_t gesvdj_params = NULL;
    cusolveSafeCall(cusolverDnCreateGesvdjInfo(&gesvdj_params));
    // --- Set the computation tolerance, since the default tolerance is machine precision
    cusolveSafeCall(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol));
    // --- Set the maximum number of sweeps, since the default value of max. sweeps is 100
    cusolveSafeCall(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, maxSweeps));
    // --- Query the SVD workspace
    cusolveSafeCall(cusolverDnDgesvdjBatched_bufferSize(
        solver_handle,
        jobz,                                   // --- Compute the singular vectors or not
        M,                                      // --- Number of rows of A, 0 <= M
        N,                                      // --- Number of columns of A, 0 <= N
        d_A,                                    // --- M x N, one matrix after the other
        lda,                                    // --- Leading dimension of A
        d_S,                                    // --- min(M, N) singular values per matrix
        d_U,                                    // --- M x M per matrix
        ldu,                                    // --- Leading dimension of U, ldu >= max(1, M)
        d_V,                                    // --- N x N per matrix
        ldv,                                    // --- Leading dimension of V, ldv >= max(1, N)
        &work_size,
        gesvdj_params,
        numMatrices));
    gpuErrchk(cudaMalloc(&d_work, sizeof(double) * work_size));
    // --- Compute SVD
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnDgesvdjBatched(
        solver_handle,
        jobz,                                   // --- Compute the singular vectors or not
        M,                                      // --- Number of rows of A, 0 <= M
        N,                                      // --- Number of columns of A, 0 <= N
        d_A,                                    // --- M x N, one matrix after the other
        lda,                                    // --- Leading dimension of A
        d_S,                                    // --- min(M, N) singular values per matrix
        d_U,                                    // --- M x M per matrix
        ldu,                                    // --- Leading dimension of U, ldu >= max(1, M)
        d_V,                                    // --- N x N per matrix
        ldv,                                    // --- Leading dimension of V, ldv >= max(1, N)
        d_work,
        work_size,
        devInfo,                                // --- One status entry per matrix
        gesvdj_params,
        numMatrices));
    printf("Calculation of the singular values only: %f ms\n\n", timerGPU.GetCounter());
    gpuErrchk(cudaMemcpy(h_info, devInfo, sizeof(int) * numMatrices, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_S, d_S, sizeof(double) * minmn * numMatrices, cudaMemcpyDeviceToHost));
#ifdef FULLSVD
    gpuErrchk(cudaMemcpy(h_U, d_U, sizeof(double) * ldu * M * numMatrices, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_V, d_V, sizeof(double) * ldv * N * numMatrices, cudaMemcpyDeviceToHost));
#endif
#ifdef PRINTRESULTS
    printf("SINGULAR VALUES \n");
    printf("_______________ \n");
    for (int k = 0; k < numMatrices; k++) {
        for (int p = 0; p < minmn; p++)
            printf("Matrix nr. %d; SV nr. %d; Value = %f\n", k, p, h_S[k * minmn + p]);
        printf("\n");
    }
#ifdef FULLSVD
    printf("SINGULAR VECTORS U \n");
    printf("__________________ \n");
    for (int k = 0; k < numMatrices; k++) {
        for (int q = 0; q < (1 - econ) * M + econ * min(M, N); q++)
            for (int p = 0; p < M; p++)
                printf("Matrix nr. %d; U nr. %d; Value = %f\n", k, p, h_U[((1 - econ) * M + econ * min(M, N)) * ldu * k + q * ldu + p]);
        printf("\n");
    }
    printf("SINGULAR VECTORS V \n");
    printf("__________________ \n");
    for (int k = 0; k < numMatrices; k++) {
        for (int q = 0; q < (1 - econ) * N + econ * min(M, N); q++)
            for (int p = 0; p < N; p++)
                printf("Matrix nr. %d; V nr. %d; Value = %f\n", k, p, h_V[((1 - econ) * N + econ * min(M, N)) * ldv * k + q * ldv + p]);
        printf("\n");
    }
#endif
#endif
    // --- Inspect the status of every matrix of the batch:
    //     0 -> converged, < 0 -> wrong parameter, > 0 -> no convergence for that matrix
    int numNotConverged = 0;
    int firstNotConverged = -1;
    for (int k = 0; k < numMatrices; k++) {
        if (h_info[k] == 0) continue;
        if (h_info[k] < 0) {
            printf("%d-th parameter is wrong \n", -h_info[k]);
            status = 1;
            break;
        }
        if (firstNotConverged < 0) firstNotConverged = k;
        numNotConverged++;
    }
    if (status == 0) {
        if (numNotConverged == 0) {
            printf("gesvdj converges \n");
        }
        else {
            printf("WARNING: gesvdj does not converge for %d of %d matrices (first: matrix nr. %d, info = %d) \n",
                   numNotConverged, numMatrices, firstNotConverged, h_info[firstNotConverged]);
        }
    }
    // --- Free resources
    if (d_A) gpuErrchk(cudaFree(d_A));
    if (d_S) gpuErrchk(cudaFree(d_S));
#ifdef FULLSVD
    if (d_U) gpuErrchk(cudaFree(d_U));
    if (d_V) gpuErrchk(cudaFree(d_V));
#endif
    if (devInfo) gpuErrchk(cudaFree(devInfo));
    if (d_work) gpuErrchk(cudaFree(d_work));
    if (solver_handle) cusolveSafeCall(cusolverDnDestroy(solver_handle));
    if (gesvdj_params) cusolveSafeCall(cusolverDnDestroyGesvdjInfo(gesvdj_params));
    free(h_V);
    free(h_U);
    free(h_info);
    free(h_S);
    free(h_A);
    gpuErrchk(cudaDeviceReset());
    return status;
}

Cannot read out Values from Texture Memory

Hi I'm writing a simple Program for practicing to work with texture memory. I Just want to write my data into Texture Memory and write it back into Global Memory. But i cannont read out the Values. Here is the code.
#include <stdio.h>
#include <iostream>
#include "cuda.h"
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "HelloWorld.h"
#include "linearInterpolation_kernel4.cu"
using namespace std;
using std::cout;
const int blocksize = 16;
__global__
void hello(char *a, int *b) {
    // --- Each thread adds its own entry of b to its own entry of a.
    // --- Only threadIdx.x is used for indexing, so the block index plays no role.
    const unsigned int lane = threadIdx.x;
    a[lane] = a[lane] + b[lane];
}
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
// --- Prints file, line, numeric code and description of a failed CUDA runtime call.
// --- The error is only reported: the function returns normally in every case.
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
    // --- Nothing to report for a successful call
    if( err == cudaSuccess ) return;
    printf("%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
}
// This will output the proper error string when calling cudaGetLastError
#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
// --- Fetches the status returned by cudaGetLastError() and, when it is not cudaSuccess,
// --- prints it together with the caller-supplied message, file and line.
// --- The error is only reported: the function returns normally in every case.
inline void __getLastCudaError( const char *errorMessage, const char *file, const int line )
{
    const cudaError_t lastError = cudaGetLastError();
    // --- Nothing to report when no error is pending
    if( lastError == cudaSuccess ) return;
    printf("%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", file, line, errorMessage, (int)lastError, cudaGetErrorString( lastError ) );
}
int main()
{
    // --- Two input rows A and B of N samples each, plus the buffer receiving the device output
    const int N = 40;
    float *A = (float *) malloc(N*sizeof(float));
    float *B = (float *) malloc(N*sizeof(float));
    float *result = (float *) malloc(N*sizeof(float));
    if (!A || !B || !result) {
        printf("Host memory allocation failed\n");
        free(A);
        free(B);
        free(result);
        return 1;
    }
    float angle = 0.8f;
    for(int i = 0; i < N; i++){
        A[i] = i; //(float)rand();
        B[i] = i+1; //(float)rand();
    }
    ipLinearTexture2(A,B,result,angle,N);
    // --- Host-side weighted combination of A[4] and B[4], printed next to the device result
    float result2;
    result2 = (angle)*A[4] + (1-angle)*B[4];
    printf(" A %f B %f Result %f\n", A[4], B[4], result[4]);
    cout << result2 << endl;
    free(result);
    free(B);
    free(A);
    // --- 0 is the conventional exit status for success
    return 0;
}
// --- Uploads A and B as the two rows of an N x 2 CUDA array, binds the array to the texture
//     reference tex2, runs transformKernel4 over it and copies N output values into result.
//     A, B   : host input rows, N floats each
//     result : host output buffer, N floats
//     angle  : forwarded to the kernel as its last argument
//     N      : number of samples per row; nothing is done when N <= 0 or a pointer is NULL
void ipLinearTexture2(float *A, float* B, float* result, float angle, int N)
{
    if (N <= 0 || !A || !B || !result) return;
    const size_t rowBytes = (size_t)N * sizeof(float);
    // --- Device output buffer: one row of N floats
    float *dev_result = NULL;
    checkCudaErrors(cudaMalloc(&dev_result, rowBytes));
    //cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    // --- CUDA array of width N and height 2
    cudaArray* cu_array = NULL;
    checkCudaErrors(cudaMallocArray( &cu_array, &channelDesc,N,2));
    // --- The copy source must be contiguous data, not a table of row pointers: each host row
    //     is copied directly into its row of the array (row 0 <- A, row 1 <- B)
    checkCudaErrors(cudaMemcpy2DToArray(cu_array,0,0,A,rowBytes,rowBytes,1,cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy2DToArray(cu_array,0,1,B,rowBytes,rowBytes,1,cudaMemcpyHostToDevice));
    // set texture parameters
    tex2.normalized = true;
    tex2.filterMode = cudaFilterModeLinear;
    tex2.addressMode[0] = cudaAddressModeWrap; //cudaAddressModeWrap;
    tex2.addressMode[1] = cudaAddressModeWrap; //cudaAddressModeClamp;
    checkCudaErrors(cudaBindTextureToArray( tex2, cu_array, channelDesc));
    // --- One-dimensional launch covering the N columns. The y index of every thread is 0,
    //     so the kernel writes exactly N values, which is what dev_result can hold.
    const int threadsPerBlock = 256;
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    transformKernel4<<< blocks, threadsPerBlock, 0 >>>( dev_result, N, 2, angle);
    getLastCudaError("transformKernel4 launch failed");
    checkCudaErrors(cudaMemcpy(result, dev_result, rowBytes, cudaMemcpyDeviceToHost));
    cout << "==================================================" << endl;
    for (int i = 0 ; i < N ;i++)
    {
        cout << result[i] << " on " << i << endl;
    }
    cout << "==================================================" << endl;
    // --- Unbind the texture reference that was actually bound above (tex2)
    checkCudaErrors(cudaUnbindTexture(tex2));
    checkCudaErrors(cudaFree(dev_result));
    checkCudaErrors(cudaFreeArray(cu_array));
}
and here is the kernel code
#ifndef _SIMPLETEXTURE_KERNEL5_H_
#define _SIMPLETEXTURE_KERNEL5_H_
// Texture references
texture<float, 2, cudaReadModeElementType> tex2;
__global__ void
transformKernel4(float* g_odata, int width, int height, float theta)
{
    // --- Global 2-D coordinates of this thread
    const unsigned int col = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int row = blockDim.y * blockIdx.y + threadIdx.y;
    // --- Only threads falling inside the width x height window do any work
    if (col < width && row < height) {
        // --- Coordinates scaled by 1/width and 1/height, with a half-sample offset
        const float u = ((float)col + 0.5f) * (1.0f / (float)width);
        const float v = ((float)row + 0.5f) * (1.0f / (float)height);
        // --- Fetch from the texture reference tex2, trace the value and store it
        const float sample = tex2D(tex2, u , v);
        printf("wert %f xid %i yid %i \n",sample, col, row);
        g_odata[row * width + col] = sample;
    }
}
#endif // #ifndef _SIMPLETEXTURE_KERNEL_H_
Can somebody tell what i am doing wrong?
I have edited it to remove the first 2 logical mistakes. But why am I not able to print out my data?
It was the wrong binding of the arrays. You cannot use multidimensional arrays (arrays of pointers) in C for this kind of copy. You have to use a one-dimensional array that represents the multidimensional one.
I can see 2 logical errors here.
The first one is the one pointed out by #asm.
The output should be stored by calculating linear index from 2D x and y indices.
outputIndex = yid * width + xid;
The second one is that the memory allocation for the cudaArray structure is internally aligned.
You should consider using cudaMemcpy2DToArray function to avoid erroneous data copying.
cudaMemcpy2DToArray(cu_array,0,0,AB,N * sizeof(float), N * sizeof(float), 2, cudaMemcpyHostToDevice);