Solving dense linear systems AX = B with CUDA

Solving dense linear systems AX = B with CUDA - cuda

Can I use the new cuSOLVER library (CUDA 7) to solve linear systems of the form
AX = B
where A, X and B are NxN dense matrices ?

Yes.
Approach nr. 1
In the framework of cuSOLVER you can use QR decomposition, see QR decomposition to solve linear systems in CUDA.
Approach nr. 2
Alternatively, you can calculate the matrix inverse by the successive involation of
cublas<t>getrfBatched()
which calculates the LU decomposition of a matrix, and
cublas<t>getriBatched()
which calculates the inverse of the matrix starting from its LU decomposition.
Approach nr. 3
A final possibility is using
cublas<t>getrfBatched()
followed by a twofold invocation of
cublas<t>trsm()
which solves upper or lower triangular linear systems.
As pointed out by Robert Crovella, the answer may vary on the size and the type of the involved matrices.
Code for approach nr. 1
Please, see QR decomposition to solve linear systems in CUDA.
Code for approaches nr. 2 and nr. 3
Below, I'm reporting a worked example for the implementation of approaches nr. 2 and 3. Hankel matrices are used to feed the approaches with well-conditioned, invertible matrices. Please, note that approach nr. 3 requires permuting (rearranging) the system coefficients vector according to the pivot array obtained following the invokation of cublas<t>getrfBatched(). This permutation can be conveniently done on the CPU.
#include <stdio.h>
#include <fstream>
#include <iomanip>
#include <stdlib.h> /* srand, rand */
#include <time.h> /* time */
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define prec_save 10
#define BLOCKSIZE 256
#define BLOCKSIZEX 16
#define BLOCKSIZEY 16
/************************************/
/* SAVE REAL ARRAY FROM CPU TO FILE */
/************************************/
template <class T>
void saveCPUrealtxt(const T * h_in, const char *filename, const int M) {
std::ofstream outfile;
outfile.open(filename);
for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
outfile.close();
}
/************************************/
/* SAVE REAL ARRAY FROM GPU TO FILE */
/************************************/
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {
T *h_in = (T *)malloc(M * sizeof(T));
gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));
std::ofstream outfile;
outfile.open(filename);
for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
outfile.close();
}
/***************************************************/
/* FUNCTION TO SET THE VALUES OF THE HANKEL MATRIX */
/***************************************************/
// --- https://en.wikipedia.org/wiki/Hankel_matrix
void setHankelMatrix(double * __restrict h_A, const int N) {
double *h_atemp = (double *)malloc((2 * N - 1) * sizeof(double));
// --- Initialize random seed
srand(time(NULL));
// --- Generate random numbers
for (int k = 0; k < 2 * N - 1; k++) h_atemp[k] = rand();
// --- Fill the Hankel matrix. The Hankel matrix is symmetric, so filling by row or column is equivalent.
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
h_A[i * N + j] = h_atemp[(i + 1) + (j + 1) - 2];
free(h_atemp);
}
/***********************************************/
/* FUNCTION TO COMPUTE THE COEFFICIENTS VECTOR */
/***********************************************/
void computeCoefficientsVector(const double * __restrict h_A, const double * __restrict h_xref,
double * __restrict h_y, const int N) {
for (int k = 0; k < N; k++) h_y[k] = 0.f;
for (int m = 0; m < N; m++)
for (int n = 0; n < N; n++)
h_y[m] = h_y[m] + h_A[n * N + m] * h_xref[n];
}
/************************************/
/* COEFFICIENT REARRANGING FUNCTION */
/************************************/
void rearrange(double *vec, int *pivotArray, int N){
for (int i = 0; i < N; i++) {
double temp = vec[i];
vec[i] = vec[pivotArray[i] - 1];
vec[pivotArray[i] - 1] = temp;
}
}
/********/
/* MAIN */
/********/
int main() {
const unsigned int N = 1000;
const unsigned int Nmatrices = 1;
// --- CUBLAS initialization
cublasHandle_t cublas_handle;
cublasSafeCall(cublasCreate(&cublas_handle));
TimingGPU timerLU, timerApproach1, timerApproach2;
double timingLU, timingApproach1, timingApproach2;
/***********************/
/* SETTING THE PROBLEM */
/***********************/
// --- Matrices to be inverted (only one in this example)
double *h_A = (double *)malloc(N * N * Nmatrices * sizeof(double));
// --- Setting the Hankel matrix
setHankelMatrix(h_A, N);
// --- Defining the solution
double *h_xref = (double *)malloc(N * sizeof(double));
for (int k = 0; k < N; k++) h_xref[k] = 1.f;
// --- Coefficient vectors (only one in this example)
double *h_y = (double *)malloc(N * sizeof(double));
computeCoefficientsVector(h_A, h_xref, h_y, N);
// --- Result (only one in this example)
double *h_x = (double *)malloc(N * sizeof(double));
// --- Allocate device space for the input matrices
double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * Nmatrices * sizeof(double)));
double *d_y; gpuErrchk(cudaMalloc(&d_y, N * sizeof(double)));
double *d_x; gpuErrchk(cudaMalloc(&d_x, N * sizeof(double)));
// --- Move the relevant matrices from host to device
gpuErrchk(cudaMemcpy(d_A, h_A, N * N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_y, h_y, N * sizeof(double), cudaMemcpyHostToDevice));
/**********************************/
/* COMPUTING THE LU DECOMPOSITION */
/**********************************/
timerLU.StartCounter();
// --- Creating the array of pointers needed as input/output to the batched getrf
double **h_inout_pointers = (double **)malloc(Nmatrices * sizeof(double *));
for (int i = 0; i < Nmatrices; i++) h_inout_pointers[i] = d_A + i * N * N;
double **d_inout_pointers;
gpuErrchk(cudaMalloc(&d_inout_pointers, Nmatrices * sizeof(double *)));
gpuErrchk(cudaMemcpy(d_inout_pointers, h_inout_pointers, Nmatrices * sizeof(double *), cudaMemcpyHostToDevice));
free(h_inout_pointers);
int *d_pivotArray; gpuErrchk(cudaMalloc(&d_pivotArray, N * Nmatrices * sizeof(int)));
int *d_InfoArray; gpuErrchk(cudaMalloc(&d_InfoArray, Nmatrices * sizeof(int)));
int *h_InfoArray = (int *)malloc(Nmatrices * sizeof(int));
cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, d_pivotArray, d_InfoArray, Nmatrices));
//cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, NULL, d_InfoArray, Nmatrices));
gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));
for (int i = 0; i < Nmatrices; i++)
if (h_InfoArray[i] != 0) {
fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
cudaDeviceReset();
exit(EXIT_FAILURE);
}
timingLU = timerLU.GetCounter();
printf("Timing LU decomposition %f [ms]\n", timingLU);
/*********************************/
/* CHECKING THE LU DECOMPOSITION */
/*********************************/
saveCPUrealtxt(h_A, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\A.txt", N * N);
saveCPUrealtxt(h_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\y.txt", N);
saveGPUrealtxt(d_A, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\Adecomposed.txt", N * N);
saveGPUrealtxt(d_pivotArray, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\pivotArray.txt", N);
/******************************************************************************/
/* APPROACH NR.1: COMPUTE THE INVERSE OF A STARTING FROM ITS LU DECOMPOSITION */
/******************************************************************************/
timerApproach1.StartCounter();
// --- Allocate device space for the inverted matrices
double *d_Ainv; gpuErrchk(cudaMalloc(&d_Ainv, N * N * Nmatrices * sizeof(double)));
// --- Creating the array of pointers needed as output to the batched getri
double **h_out_pointers = (double **)malloc(Nmatrices * sizeof(double *));
for (int i = 0; i < Nmatrices; i++) h_out_pointers[i] = (double *)((char*)d_Ainv + i * ((size_t)N * N) * sizeof(double));
double **d_out_pointers;
gpuErrchk(cudaMalloc(&d_out_pointers, Nmatrices*sizeof(double *)));
gpuErrchk(cudaMemcpy(d_out_pointers, h_out_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice));
free(h_out_pointers);
cublasSafeCall(cublasDgetriBatched(cublas_handle, N, (const double **)d_inout_pointers, N, d_pivotArray, d_out_pointers, N, d_InfoArray, Nmatrices));
gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));
for (int i = 0; i < Nmatrices; i++)
if (h_InfoArray[i] != 0) {
fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singular\n", i);
cudaDeviceReset();
exit(EXIT_FAILURE);
}
double alpha1 = 1.f;
double beta1 = 0.f;
cublasSafeCall(cublasDgemv(cublas_handle, CUBLAS_OP_N, N, N, &alpha1, d_Ainv, N, d_y, 1, &beta1, d_x, 1));
timingApproach1 = timingLU + timerApproach1.GetCounter();
printf("Timing approach 1 %f [ms]\n", timingApproach1);
/**************************/
/* CHECKING APPROACH NR.1 */
/**************************/
saveGPUrealtxt(d_x, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach1.txt", N);
/*************************************************************/
/* APPROACH NR.2: INVERT UPPER AND LOWER TRIANGULAR MATRICES */
/*************************************************************/
timerApproach2.StartCounter();
double *d_P; gpuErrchk(cudaMalloc(&d_P, N * N * sizeof(double)));
gpuErrchk(cudaMemcpy(h_y, d_y, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));
int *h_pivotArray = (int *)malloc(N * Nmatrices*sizeof(int));
gpuErrchk(cudaMemcpy(h_pivotArray, d_pivotArray, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));
rearrange(h_y, h_pivotArray, N);
gpuErrchk(cudaMemcpy(d_y, h_y, N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));
// --- Now P*A=L*U
// Linear system A*x=y => P.'*L*U*x=y => L*U*x=P*y
// --- 1st phase - solve Ly = b
const double alpha = 1.f;
// --- Function solves the triangular linear system with multiple right hand sides, function overrides b as a result
// --- Lower triangular part
cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, d_A, N, d_y, N));
// --- Upper triangular part
cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, d_A, N, d_y, N));
timingApproach2 = timingLU + timerApproach2.GetCounter();
printf("Timing approach 2 %f [ms]\n", timingApproach2);
/**************************/
/* CHECKING APPROACH NR.2 */
/**************************/
saveGPUrealtxt(d_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach2.txt", N);
return 0;
}
The Utilities.cu and Utilities.cuh files needed to run such an example are maintained at this github page. The TimingGPU.cu and TimingGPU.cuh files are maintained at this github page.
Some useful references on the third approach:
NAG Fortran Library Routine Document
Scientific Computing Software Library (SCSL) User’s Guide
https://www.cs.drexel.edu/~jjohnson/2010-11/summer/cs680/programs/lapack/Danh/verify_sequential.c
EDIT
Timings (in ms) for approaches nr. 2 and 3 (tests performed on a GTX960 card, cc. 5.2).
N LU decomposition Approach nr. 2 Approach nr. 3
100 1.08 2.75 1.28
500 45.4 161 45.7
1000 302 1053 303
As it emerges, approach nr. 3 is more convenient and its cost is essentially the cost of computing the LU factorization. Furthermore:
Solving linear systems by LU decomposition is faster than using QR decomposition (see QR decomposition to solve linear systems in CUDA);
LU decomposition is limited to square linear systems, while QR decomposition helps in case of non-square linear systems.
The below Matlab code can be used for checking the results
clear all
close all
clc
warning off
N = 1000;
% --- Setting the problem solution
x = ones(N, 1);
%%%%%%%%%%%%%%%%%%%%%
% NxN HANKEL MATRIX %
%%%%%%%%%%%%%%%%%%%%%
% --- https://en.wikipedia.org/wiki/Hankel_matrix
load A.txt
load y.txt
A = reshape(A, N, N);
yMatlab = A * x;
fprintf('Percentage rms between coefficients vectors in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(yMatlab - y).^2)) / sum(sum(abs(yMatlab).^2))));
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% COMPUTATION OF THE LU DECOMPOSITION %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
[Lmatlab, Umatlab] = lu(A);
load Adecomposed.txt
Adecomposed = reshape(Adecomposed, N, N);
L = eye(N);
for k = 1 : N
L(k + 1 : N, k) = Adecomposed(k + 1 : N, k);
end
U = zeros(N);
for k = 1 : N
U(k, k : N) = Adecomposed(k, k : N);
end
load pivotArray.txt
Pj = eye(N);
for j = 1 : N
tempVector = Pj(j, :);
Pj(j, :) = Pj(pivotArray(j), :);
Pj(pivotArray(j), :) = tempVector;
end
fprintf('Percentage rms between Pj * A and L * U in CUDA %f\n', 100 * sqrt(sum(sum(abs(Pj * A - L * U).^2)) / sum(sum(abs(Pj * A).^2))));
xprime = inv(Lmatlab) * yMatlab;
xMatlab = inv(Umatlab) * xprime;
fprintf('Percentage rms between reference solution and solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));
load xApproach1.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.1 %f\n', 100 * sqrt(sum(sum(abs(xApproach1 - x).^2)) / sum(sum(abs(x).^2))));
load xApproach2.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.2 %f\n', 100 * sqrt(sum(sum(abs(xApproach2 - x).^2)) / sum(sum(abs(x).^2))));

Related

Why PyCUDA is faster than C CUDA in this example

I am exploring to move from OpenCL to CUDA, and did a few tests to benchmark the speed of CUDA in various implementations. To my surprise, in the examples below, the PyCUDA implementation is about 20% faster than the C CUDA example.
I read many posts talking about "release build" of C CUDA code. I did try having -Xptxas -O3 in the makefile and that really did not make a difference. I also tried to adjust the block size, with which the kernel was executed. Unfortunately, it did not help improve the speed, either.
My questions here are:
What could be the reasons leading to the speed difference between C CUDA and PYCUDA?
If the "advanced" (lack of a better word) compiling in PYCUDA is one of reasons, how can I optimize the compiling of my C CUDA code?
Are there any other ways to improve the speed of C CUDA in this case?
While I appreciate general comments, I am looking for actionable suggestions that I can validate on my machine. Thanks!
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule
import time
mod = SourceModule(
"""
__global__ void saxpy(int n, const float a, float *x, float *y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n){
y[i] = a * x[i] + y[i];
}
}
"""
)
saxpy = mod.get_function("saxpy")
N = 1 << 25
time_elapse = 0.0
for i in range(100):
# print(i)
# print(N)
x = np.ones(N).astype(np.float32)
y = 2 * np.ones(N).astype(np.float32)
start = time.time()
saxpy(
np.int32(N),
np.float32(2.0),
drv.In(x),
drv.InOut(y),
block=(512, 1, 1),
grid=(int(N / 512) + 1, 1),
)
time_elapse += (time.time() - start)
print(time_elapse )
print(y[-100:-1])
print(y.sum())
print(N * 4.0)
#include <stdio.h>
#include <time.h>
#define DIM 512
__global__ void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n)
y[i] = a * x[i] + y[i];
}
int main(int num_iterations)
{
double start;
double cputime;
int N = 1 << 25;
float *x, *y, *d_x, *d_y;
int i, j;
for (j = 0; j < num_iterations; j++)
{
x = (float *)malloc(N * sizeof(float));
y = (float *)malloc(N * sizeof(float));
cudaMalloc(&d_x, N * sizeof(float));
cudaMalloc(&d_y, N * sizeof(float));
for (i = 0; i < N; i++)
{
x[i] = 1.0f;
y[i] = 2.0f;
}
cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
// Perform SAXPY on 1M elements
start = clock();
saxpy<<<(N + DIM) / DIM, DIM>>>(N, 2.0f, d_x, d_y);
cputime += ((double)(clock() - start) / CLOCKS_PER_SEC);
cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
// float maxError = 0.0f;
// for (int i = 0; i < N; i++){
// maxError = max(maxError, abs(y[i] - 4.0f));
// //printf("y[%d]: %f\n", i,y[i]);
// }
// printf("Max error: %f\n", maxError);
cudaFree(d_x);
cudaFree(d_y);
free(x);
free(y);
}
printf("cpu time is %f\n", cputime);
return 0;
}
I saved the above file as cuda_example.cu and compile it with the following commands in a makefile:
nvcc -arch=sm_61 -Xptxas -O3,-v -o main cuda_example.cu

If I execute your CUDA-C code as is, and set num_iterations to 300 like this:
int num_iterations =300;
then the execution of your program takes about 60s on a Geforce GTX 1650. Your code is extremely inefficient, as you copy data back and forth between GPU and device at every iteration.
So, lets restrict the loop to just the kernel execution:
#include <stdio.h>
#include <time.h>
#define DIM 512
__global__ void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n)
y[i] = a * x[i] + y[i];
}
int main()
{
double start = clock();
int N = 1 << 25;
float *x, *y, *d_x, *d_y;
int i, j;
int num_iterations = 300;
x = (float *)malloc(N * sizeof(float));
y = (float *)malloc(N * sizeof(float));
cudaMalloc(&d_x, N * sizeof(float));
cudaMalloc(&d_y, N * sizeof(float));
for (i = 0; i < N; i++)
{
x[i] = 1.0f;
y[i] = 2.0f;
}
cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
for (j = 0; j < num_iterations; j++){
saxpy<<<(N + DIM) / DIM, DIM>>>(N, 2.0f, d_x, d_y);
cudaDeviceSynchronize();
}
cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_x);
cudaFree(d_y);
free(x);
free(y);
double cputime = ((double)(clock() - start) / CLOCKS_PER_SEC);
printf("cpu time is %f\n", cputime);
return 0;
}
If I do that, then the execution time becomes 1.36 seconds. Doing sth similar to the PyCUDA code I got about 19s of execution time.

Solve the Poisson equation using FFT with CUDA

I'm following a tutorial on using the cuFFT library here: http://gpgpu.org/static/sc2007/SC07_CUDA_3_Libraries.pdf
After following line by line of its code, I'm getting really strange results.
I have input data that is an NxN array of floats. The program does a FFT forward transform, solves Poisson's equation, and then does an inverse FFT. The input data (and output data) is referred to as a square image with sidelength N. When I comment out solve_poisson <<<dimGrid, dimBlock>>> (r_complex_d, kx_d, ky_d, N);, it correctly forward transforms the data and then performs an inverse transform, which causes the output data to be the same as the input data. This is supposed to happen.
Here is the output without calling the solve_poisson method.
0 r_initial: 0.00125126 r: 0.00125132
1 r_initial: 0.563585 r: 0.563585
2 r_initial: 0.193304 r: 0.193304
3 r_initial: 0.80874 r: 0.80874
4 r_initial: 0.585009 r: 0.585009
5 r_initial: 0.479873 r: 0.479873
6 r_initial: 0.350291 r: 0.350291
7 r_initial: 0.895962 r: 0.895962
8 r_initial: 0.82284 r: 0.82284
9 r_initial: 0.746605 r: 0.746605
10 r_initial: 0.174108 r: 0.174108
11 r_initial: 0.858943 r: 0.858943
12 r_initial: 0.710501 r: 0.710502
13 r_initial: 0.513535 r: 0.513535
14 r_initial: 0.303995 r: 0.303995
15 r_initial: 0.0149846 r: 0.0149846
Press any key to continue . . .
However, when I uncomment out the solve_poisson method, the output data is inf or nan, which leads me to believe that the scale variable was somehow close to zero in the solve_poisson method.
So I changed float scale = -(kx[idx] * kx[idx] + ky[idy] * ky[idy]); to float scale = -(kx[idx] * kx[idx] + ky[idy] * ky[idy]) + 0.00001f. This change is not in the original tutorial. The results computed here are not supposed to have extreme positive or negative values.
0 r_initial: 0.00125126 r: -11448.1
1 r_initial: 0.563585 r: 11449.3
2 r_initial: 0.193304 r: -11448.3
3 r_initial: 0.80874 r: 11449.2
4 r_initial: 0.585009 r: 11449.4
5 r_initial: 0.479873 r: -11448.4
6 r_initial: 0.350291 r: 11449.5
7 r_initial: 0.895962 r: -11448.6
8 r_initial: 0.82284 r: -11448.5
9 r_initial: 0.746605 r: 11449.4
10 r_initial: 0.174108 r: -11448.3
11 r_initial: 0.858943 r: 11449.3
12 r_initial: 0.710501 r: 11449.2
13 r_initial: 0.513535 r: -11448.4
14 r_initial: 0.303995 r: 11449.3
15 r_initial: 0.0149846 r: -11448.1
Press any key to continue . . .
In the tutorial, a sample calculation on slide 43 on page 22 is computed=0.975879 reference=0.975882, yet my results are completely different and really large.
The following code is what I used.
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cufft.h>
#include <stdlib.h>
#include <iostream>
#define N 4 //4 X 4 // N is the sidelength of the image -> 16 pixels in entire image
#define block_size_x 2
#define block_size_y 2
__global__ void real2complex(cufftComplex *c, float *a, int n);
__global__ void complex2real_scaled(float *a, cufftComplex *c, float scale, int n);
__global__ void solve_poisson(cufftComplex *c, float *kx, float *ky, int n);
int main()
{
float *kx, *ky, *r;
kx = (float *)malloc(sizeof(float) * N);
ky = (float *)malloc(sizeof(float) * N);
r = (float *)malloc(sizeof(float) * N * N);
float *kx_d, *ky_d, *r_d;
cufftComplex *r_complex_d;
cudaMalloc((void **)&kx_d, sizeof(float) * N);
cudaMalloc((void **)&ky_d, sizeof(float) * N);
cudaMalloc((void **)&r_d, sizeof(float) * N * N);
cudaMalloc((void **)&r_complex_d, sizeof(cufftComplex) * N * N);
for (int y = 0; y < N; y++)
for (int x = 0; x < N; x++)
r[x + y * N] = rand() / (float)RAND_MAX;
//r[x + y * N] = sin(exp(-((x - N / 2.0f) * (x - N / 2.0f) + (N / 2.0f - y) * (N / 2.0f - y)) / (20 * 20))) * 255 / sin(1); //Here is sample data that will high values at the center of the image and low values as you go farther and farther away from the center.
float* r_inital = (float *)malloc(sizeof(float) * N * N);
for (int i = 0; i < N * N; i++)
r_inital[i] = r[i];
for (int i = 0; i < N; i++)
{
kx[i] = i - N / 2.0f; //centers kx values to be at center of image
ky[i] = N / 2.0f - i; //centers ky values to be at center of image
}
cudaMemcpy(kx_d, kx, sizeof(float) * N, cudaMemcpyHostToDevice);
cudaMemcpy(ky_d, ky, sizeof(float) * N, cudaMemcpyHostToDevice);
cudaMemcpy(r_d, r, sizeof(float) * N * N, cudaMemcpyHostToDevice);
cufftHandle plan;
cufftPlan2d(&plan, N, N, CUFFT_C2C);
/* Compute the execution configuration, block_size_x*block_size_y = number of threads */
dim3 dimBlock(block_size_x, block_size_y);
dim3 dimGrid(N / dimBlock.x, N / dimBlock.y);
/* Handle N not multiple of block_size_x or block_size_y */
if (N % block_size_x != 0) dimGrid.x += 1;
if (N % block_size_y != 0) dimGrid.y += 1;
real2complex << < dimGrid, dimBlock >> > (r_complex_d, r_d, N);
cufftExecC2C(plan, r_complex_d, r_complex_d, CUFFT_FORWARD);
solve_poisson << <dimGrid, dimBlock >> > (r_complex_d, kx_d, ky_d, N);
cufftExecC2C(plan, r_complex_d, r_complex_d, CUFFT_INVERSE);
float scale = 1.0f / (N * N);
complex2real_scaled << <dimGrid, dimBlock >> > (r_d, r_complex_d, scale, N);
cudaMemcpy(r, r_d, sizeof(float) * N * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N * N; i++)
std::cout << i << "\tr_initial: " << r_inital[i] << "\tr: " << r[i] << std::endl;
system("pause");
/* Destroy plan and clean up memory on device*/
free(kx);
free(ky);
free(r);
free(r_inital);
cufftDestroy(plan);
cudaFree(r_complex_d);
cudaFree(kx_d);
}
__global__ void real2complex(cufftComplex *c, float *a, int n)
{
/* compute idx and idy, the location of the element in the original NxN array */
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
if (idx < n && idy < n)
{
int index = idx + idy * n;
c[index].x = a[index];
c[index].y = 0.0f;
}
}
__global__ void complex2real_scaled(float *a, cufftComplex *c, float scale, int n)
{
/* compute idx and idy, the location of the element in the original NxN array */
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
if (idx < n && idy < n)
{
int index = idx + idy * n;
a[index] = scale * c[index].x;
}
}
__global__ void solve_poisson(cufftComplex *c, float *kx, float *ky, int n)
{
/* compute idx and idy, the location of the element in the original NxN array */
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
if (idx < n && idy < n)
{
int index = idx + idy * n;
float scale = -(kx[idx] * kx[idx] + ky[idy] * ky[idy]) + 0.00001f;
if (idx == 0 && idy == 0) scale = 1.0f;
scale = 1.0f / scale;
c[index].x *= scale;
c[index].y *= scale;
}
}
Is there anything I messed up on? I would really appreciate if anyone could help me out.

Although the poster has found the mistake by himself, I want to share my own implementation of the 2D Poisson equation solver.
The implementation slightly differs from the one linked to by the poster.
The theory is reported at Solve Poisson Equation Using FFT.
MATLAB VERSION
I first report the Matlab version for reference:
clear all
close all
clc
M = 64; % --- Number of Fourier harmonics along x (should be a multiple of 2)
N = 32; % --- Number of Fourier harmonics along y (should be a multiple of 2)
Lx = 3; % --- Domain size along x
Ly = 1.5; % --- Domain size along y
sigma = 0.1; % --- Characteristic width of f (make << 1)
% --- Wavenumbers
kx = (2 * pi / Lx) * [0 : (M / 2 - 1) (- M / 2) : (-1)]; % --- Wavenumbers along x
ky = (2 * pi / Ly) * [0 : (N / 2 - 1) (- N / 2) : (-1)]; % --- Wavenumbers along y
[Kx, Ky] = meshgrid(kx, ky);
% --- Right-hand side of differential equation
hx = Lx / M; % --- Grid spacing along x
hy = Ly / N; % --- Grid spacing along y
x = (0 : (M - 1)) * hx;
y = (0 : (N - 1)) * hy;
[X, Y] = meshgrid(x, y);
rSquared = (X - 0.5 * Lx).^2 + (Y - 0.5 * Ly).^2;
sigmaSquared = sigma^2;
f = exp(-rSquared / (2 * sigmaSquared)) .* (rSquared - 2 * sigmaSquared) / (sigmaSquared^2);
fHat = fft2(f);
% --- Denominator of the unknown spectrum
den = -(Kx.^2 + Ky.^2);
den(1, 1) = 1; % --- Avoid division by zero at wavenumber (0, 0)
% --- Unknown determination
uHat = ifft2(fHat ./ den);
% uHat(1, 1) = 0; % --- Force the unknown spectrum at (0, 0) to be zero
u = real(uHat);
u = u - u(1,1); % --- Force arbitrary constant to be zero by forcing u(1, 1) = 0
% --- Plots
uRef = exp(-rSquared / (2 * sigmaSquared));
err = 100 * sqrt(sum(sum(abs(u - uRef).^2)) / sum(sum(abs(uRef).^2)));
errMax = norm(u(:)-uRef(:),inf)
fprintf('Percentage root mean square error = %f\n', err);
fprintf('Maximum error = %f\n', errMax);
surf(X, Y, u)
xlabel('x')
ylabel('y')
zlabel('u')
title('Solution of 2D Poisson equation by spectral method')
CUDA VERSION
Here is the corresponding CUDA version:
#include <stdio.h>
#include <fstream>
#include <iomanip>
// --- Greek pi
#define _USE_MATH_DEFINES
#include <math.h>
#include <cufft.h>
#define BLOCKSIZEX 16
#define BLOCKSIZEY 16
#define prec_save 10
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/**************************************************/
/* COMPUTE RIGHT HAND SIDE OF 2D POISSON EQUATION */
/**************************************************/
__global__ void computeRHS(const float * __restrict__ d_x, const float * __restrict__ d_y,
float2 * __restrict__ d_r, const float Lx, const float Ly, const float sigma,
const int M, const int N) {
const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
const int tidy = threadIdx.y + blockIdx.y * blockDim.y;
if ((tidx >= M) || (tidy >= N)) return;
const float sigmaSquared = sigma * sigma;
const float rSquared = (d_x[tidx] - 0.5f * Lx) * (d_x[tidx] - 0.5f * Lx) +
(d_y[tidy] - 0.5f * Ly) * (d_y[tidy] - 0.5f * Ly);
d_r[tidy * M + tidx].x = expf(-rSquared / (2.f * sigmaSquared)) * (rSquared - 2.f * sigmaSquared) / (sigmaSquared * sigmaSquared);
d_r[tidy * M + tidx].y = 0.f;
}
/****************************************************/
/* SOLVE 2D POISSON EQUATION IN THE SPECTRAL DOMAIN */
/****************************************************/
__global__ void solvePoisson(const float * __restrict__ d_kx, const float * __restrict__ d_ky,
float2 * __restrict__ d_r, const int M, const int N)
{
const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
const int tidy = threadIdx.y + blockIdx.y * blockDim.y;
if ((tidx >= M) || (tidy >= N)) return;
float scale = -(d_kx[tidx] * d_kx[tidx] + d_ky[tidy] * d_ky[tidy]);
if (tidx == 0 && tidy == 0) scale = 1.f;
scale = 1.f / scale;
d_r[M * tidy + tidx].x *= scale;
d_r[M * tidy + tidx].y *= scale;
}
/****************************************************************************/
/* SOLVE 2D POISSON EQUATION IN THE SPECTRAL DOMAIN - SHARED MEMORY VERSION */
/****************************************************************************/
__global__ void solvePoissonShared(const float * __restrict__ d_kx, const float * __restrict__ d_ky,
float2 * __restrict__ d_r, const int M, const int N)
{
const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
const int tidy = threadIdx.y + blockIdx.y * blockDim.y;
if ((tidx >= M) || (tidy >= N)) return;
// --- Use shared memory to minimize multiple access to same spectral coordinate values
__shared__ float kx_s[BLOCKSIZEX], ky_s[BLOCKSIZEY];
kx_s[threadIdx.x] = d_kx[tidx];
ky_s[threadIdx.y] = d_ky[tidy];
__syncthreads();
float scale = -(kx_s[threadIdx.x] * kx_s[threadIdx.x] + ky_s[threadIdx.y] * ky_s[threadIdx.y]);
if (tidx == 0 && tidy == 0) scale = 1.f;
scale = 1.f / scale;
d_r[M * tidy + tidx].x *= scale;
d_r[M * tidy + tidx].y *= scale;
}
/******************************/
/* COMPLEX2REAL SCALED KERNEL */
/******************************/
__global__ void complex2RealScaled(float2 * __restrict__ d_r, float * __restrict__ d_result, const int M, const int N, float scale)
{
const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
const int tidy = threadIdx.y + blockIdx.y * blockDim.y;
if ((tidx >= M) || (tidy >= N)) return;
d_result[tidy * M + tidx] = scale * (d_r[tidy * M + tidx].x - d_r[0].x);
}
/******************************************/
/* COMPLEX2REAL SCALED KERNEL - OPTIMIZED */
/******************************************/
__global__ void complex2RealScaledOptimized(float2 * __restrict__ d_r, float * __restrict__ d_result, const int M, const int N, float scale)
{
const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
const int tidy = threadIdx.y + blockIdx.y * blockDim.y;
if ((tidx >= M) || (tidy >= N)) return;
__shared__ float d_r_0[1];
if (threadIdx.x == 0) d_r_0[0] = d_r[0].x;
volatile float2 c;
c.x = d_r[tidy * M + tidx].x;
c.y = d_r[tidy * M + tidx].y;
d_result[tidy * M + tidx] = scale * (c.x - d_r_0[0]);
}
/**************************************/
/* SAVE FLOAT2 ARRAY FROM GPU TO FILE */
/**************************************/
void saveGPUcomplextxt(const float2 * d_in, const char *filename, const int M) {
float2 *h_in = (float2 *)malloc(M * sizeof(float2));
gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(float2), cudaMemcpyDeviceToHost));
std::ofstream outfile;
outfile.open(filename);
for (int i = 0; i < M; i++) {
//printf("%f %f\n", h_in[i].c.x, h_in[i].c.y);
outfile << std::setprecision(prec_save) << h_in[i].x << "\n"; outfile << std::setprecision(prec_save) << h_in[i].y << "\n";
}
outfile.close();
}
/*************************************/
/* SAVE FLOAT ARRAY FROM GPU TO FILE */
/*************************************/
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {
T *h_in = (T *)malloc(M * sizeof(T));
gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));
std::ofstream outfile;
outfile.open(filename);
for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
outfile.close();
}
/********/
/* MAIN */
/********/
int main()
{
const int M = 64; // --- Number of Fourier harmonics along x (should be a multiple of 2)
const int N = 32; // --- Number of Fourier harmonics along y(should be a multiple of 2)
const float Lx = 3.f; // --- Domain size along x
const float Ly = 1.5f; // --- Domain size along y
const float sigma = 0.1f; // --- Characteristic width of f(make << 1)
// --- Wavenumbers on the host
float *h_kx = (float *)malloc(M * sizeof(float));
float *h_ky = (float *)malloc(N * sizeof(float));
for (int k = 0; k < M / 2; k++) h_kx[k] = (2.f * M_PI / Lx) * k;
for (int k = -M / 2; k < 0; k++) h_kx[k + M] = (2.f * M_PI / Lx) * k;
for (int k = 0; k < N / 2; k++) h_ky[k] = (2.f * M_PI / Ly) * k;
for (int k = -N / 2; k < 0; k++) h_ky[k + N] = (2.f * M_PI / Ly) * k;
// --- Wavenumbers on the device
float *d_kx; gpuErrchk(cudaMalloc(&d_kx, M * sizeof(float)));
float *d_ky; gpuErrchk(cudaMalloc(&d_ky, N * sizeof(float)));
gpuErrchk(cudaMemcpy(d_kx, h_kx, M * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_ky, h_ky, N * sizeof(float), cudaMemcpyHostToDevice));
// --- Domain discretization on the host
float *h_x = (float *)malloc(M * sizeof(float));
float *h_y = (float *)malloc(N * sizeof(float));
for (int k = 0; k < M; k++) h_x[k] = Lx / (float)M * k;
for (int k = 0; k < N; k++) h_y[k] = Ly / (float)N * k;
// --- Domain discretization on the device
float *d_x; gpuErrchk(cudaMalloc(&d_x, M * sizeof(float)));
float *d_y; gpuErrchk(cudaMalloc(&d_y, N * sizeof(float)));
gpuErrchk(cudaMemcpy(d_x, h_x, M * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_y, h_y, N * sizeof(float), cudaMemcpyHostToDevice));
// --- Compute right-hand side of the equation on the device
float2 *d_r; gpuErrchk(cudaMalloc(&d_r, M * N * sizeof(float2)));
dim3 dimBlock(BLOCKSIZEX, BLOCKSIZEY);
dim3 dimGrid(iDivUp(M, BLOCKSIZEX), iDivUp(N, BLOCKSIZEY));
computeRHS << <dimGrid, dimBlock >> >(d_x, d_y, d_r, Lx, Ly, sigma, M, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
// --- Create plan for CUDA FFT
cufftHandle plan;
cufftPlan2d(&plan, N, M, CUFFT_C2C);
// --- Compute in place forward FFT of right-hand side
cufftExecC2C(plan, d_r, d_r, CUFFT_FORWARD);
// --- Solve Poisson equation in Fourier space
//solvePoisson << <dimGrid, dimBlock >> > (d_kx, d_ky, d_r, M, N);
solvePoissonShared << <dimGrid, dimBlock >> > (d_kx, d_ky, d_r, M, N);
// --- Compute in place inverse FFT
cufftExecC2C(plan, d_r, d_r, CUFFT_INVERSE);
//saveGPUcomplextxt(d_r, "D:\\Project\\poisson2DFFT\\poisson2DFFT\\d_r.txt", M * N);
// --- With cuFFT, an FFT followed by an IFFT will return the same array times the size of the transform
// --- Accordingly, we need to scale the result.
const float scale = 1.f / ((float)M * (float)N);
float *d_result; gpuErrchk(cudaMalloc(&d_result, M * N * sizeof(float)));
//complex2RealScaled << <dimGrid, dimBlock >> > (d_r, d_result, M, N, scale);
complex2RealScaledOptimized << <dimGrid, dimBlock >> > (d_r, d_result, M, N, scale);
//saveGPUrealtxt(d_result, "D:\\Project\\poisson2DFFT\\poisson2DFFT\\d_result.txt", M * N);
// --- Transfer data from device to host
float *h_result = (float *)malloc(M * N * sizeof(float));
gpuErrchk(cudaMemcpy(h_result, d_result, M * N * sizeof(float), cudaMemcpyDeviceToHost));
return 0;
}

As it shows in the tutorial, the Matlab implementation on slide 33 on page 17 shows that the Poisson calculations are based on the top left corner of the screen as the origin. The x and y data values are then x = (0:(N-1))*h; and y = (0:(N-1))*h;, which is why the meshgrid created from these x and y values both start from 0 and increase, as shown on the graph's x and y axes on slide 31 on page 16. In this case, where the image length was 1 (I refer to the input data of the NxN float array or the meshgrid as an image), the center of the image is actually (0.5, 0.5). I wanted to translate these points, so the center point would instead be (0,0) and followed a typical representation of the Cartesian Plane.
So in my code, instead of the Matlab code of
x = (0:(N-1))*h;
y = (0:(N-1))*h;
which could be implemented as
for (int i = 0; i < N; i++)
{
kx[i] = i;
ky[i] = i;
}
I replaced it with
for (int i = 0; i < N; i++)
{
kx[i] = i - N / 2.0f; //centers kx values to be at center of image
ky[i] = N / 2.0f - i; //centers ky values to be at center of image
}
However, I had forgot to change the Poisson calculation so it recognizes the center of the image as the origin instead of the top right corner as the origin. So as Mr. Robert Crovella said, I would have to
change this line: if (idx == 0 && idy == 0) scale = 1.0f; to this: if
(idx == 2 && idy == 2) scale = 1.0f;
for the case where the image length, or N, is 4.
To generalize this for any image length, this line of code could be then changed to if (idx == n/2 && idy == n/2) scale = 1.0f;

Call multiple CUDA SVD (in cuSolver) [duplicate]

I'm new to parallel programming using GPU so I apologize if the question is broad or vague. I'm aware there is some parallel SVD function in the CULA library, but what should be the strategy if I have a large number of relatively small matrices to factorize? For example I have n matrices with dimension d, n is large and d is small. How to parallelize this process? Could anyone give me a hint?

My previous answer is now out-of-date. As of February 2015, CUDA 7 (currently in release candidate version) offers full SVD capabilities in its cuSOLVER library. Below, I'm providing an example of generating the singular value decomposition using CUDA cuSOLVER.
Concerning the specific issue you are rising (calculating the SVD of several matrices of small size), you should adapt the example I'm providing below by using streams. To associate a stream to each task you can use
cudaStreamCreate()
and
cusolverDnSetStream()
kernel.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include<iostream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>
#include<math.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>
#include "Utilities.cuh"
/********/
/* MAIN */
/********/
int main(){
// --- gesvd only supports Nrows >= Ncols
// --- column major memory ordering
const int Nrows = 7;
const int Ncols = 5;
// --- cuSOLVE input/output parameters/arrays
int work_size = 0;
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle;
cusolverDnCreate(&solver_handle);
// --- Setting the host, Nrows x Ncols matrix
double *h_A = (double *)malloc(Nrows * Ncols * sizeof(double));
for(int j = 0; j < Nrows; j++)
for(int i = 0; i < Ncols; i++)
h_A[j + i*Nrows] = (i + j*j) * sqrt((double)(i + j));
// --- Setting the device matrix and moving the host matrix to the device
double *d_A; gpuErrchk(cudaMalloc(&d_A, Nrows * Ncols * sizeof(double)));
gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(double), cudaMemcpyHostToDevice));
// --- host side SVD results space
double *h_U = (double *)malloc(Nrows * Nrows * sizeof(double));
double *h_V = (double *)malloc(Ncols * Ncols * sizeof(double));
double *h_S = (double *)malloc(min(Nrows, Ncols) * sizeof(double));
// --- device side SVD workspace and matrices
double *d_U; gpuErrchk(cudaMalloc(&d_U, Nrows * Nrows * sizeof(double)));
double *d_V; gpuErrchk(cudaMalloc(&d_V, Ncols * Ncols * sizeof(double)));
double *d_S; gpuErrchk(cudaMalloc(&d_S, min(Nrows, Ncols) * sizeof(double)));
// --- CUDA SVD initialization
cusolveSafeCall(cusolverDnDgesvd_bufferSize(solver_handle, Nrows, Ncols, &work_size));
double *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));
// --- CUDA SVD execution
cusolveSafeCall(cusolverDnDgesvd(solver_handle, 'A', 'A', Nrows, Ncols, d_A, Nrows, d_S, d_U, Nrows, d_V, Ncols, work, work_size, NULL, devInfo));
int devInfo_h = 0; gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
if (devInfo_h != 0) std::cout << "Unsuccessful SVD execution\n\n";
// --- Moving the results from device to host
gpuErrchk(cudaMemcpy(h_S, d_S, min(Nrows, Ncols) * sizeof(double), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_U, d_U, Nrows * Nrows * sizeof(double), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_V, d_V, Ncols * Ncols * sizeof(double), cudaMemcpyDeviceToHost));
std::cout << "Singular values\n";
for(int i = 0; i < min(Nrows, Ncols); i++)
std::cout << "d_S["<<i<<"] = " << std::setprecision(15) << h_S[i] << std::endl;
std::cout << "\nLeft singular vectors - For y = A * x, the columns of U span the space of y\n";
for(int j = 0; j < Nrows; j++) {
printf("\n");
for(int i = 0; i < Nrows; i++)
printf("U[%i,%i]=%f\n",i,j,h_U[j*Nrows + i]);
}
std::cout << "\nRight singular vectors - For y = A * x, the columns of V span the space of x\n";
for(int i = 0; i < Ncols; i++) {
printf("\n");
for(int j = 0; j < Ncols; j++)
printf("V[%i,%i]=%f\n",i,j,h_V[j*Ncols + i]);
}
cusolverDnDestroy(solver_handle);
return 0;
}
Utilities.cuh
#ifndef UTILITIES_CUH
#define UTILITIES_CUH
extern "C" int iDivUp(int, int);
extern "C" void gpuErrchk(cudaError_t);
extern "C" void cusolveSafeCall(cusolverStatus_t);
#endif
Utilities.cu
#include <stdio.h>
#include <assert.h>
#include "cuda_runtime.h"
#include <cuda.h>
#include <cusolverDn.h>
/*******************/
/* iDivUp FUNCTION */
/*******************/
extern "C" int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
static const char *_cudaGetErrorEnum(cusolverStatus_t error)
{
switch (error)
{
case CUSOLVER_STATUS_SUCCESS:
return "CUSOLVER_SUCCESS";
case CUSOLVER_STATUS_NOT_INITIALIZED:
return "CUSOLVER_STATUS_NOT_INITIALIZED";
case CUSOLVER_STATUS_ALLOC_FAILED:
return "CUSOLVER_STATUS_ALLOC_FAILED";
case CUSOLVER_STATUS_INVALID_VALUE:
return "CUSOLVER_STATUS_INVALID_VALUE";
case CUSOLVER_STATUS_ARCH_MISMATCH:
return "CUSOLVER_STATUS_ARCH_MISMATCH";
case CUSOLVER_STATUS_EXECUTION_FAILED:
return "CUSOLVER_STATUS_EXECUTION_FAILED";
case CUSOLVER_STATUS_INTERNAL_ERROR:
return "CUSOLVER_STATUS_INTERNAL_ERROR";
case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
}
return "<unknown>";
}
inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
if(CUSOLVER_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSOLVE error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }

You can take a look at the Batched Operations post of the CULA blog for a discussion of your problem.
EDIT
From what I understand from your comment below, you would like each thread to calculate a separate SVD. So, basically each thread should execute a standard, sequential SVD scheme. For that some possibly useful references:
Numerical Recipes
Golub, Van Loan, Matrix Computations
If you use this approach, though, I'm afraid you will not be able anymore to use cuBLAS, as those are host functions not callable from the device (unless you do not have a compute capability >3.5, see the the simpleDevLibCUBLAS example.). But basically in this way I think you are somehow implementing the batch concept by yourself.
If you decide to go to a more standard parallel GPU implementation, the reference below could be of interest:
Singular Value Decomposition on GPU using CUDA

The above answers are now out of date. As of CUDA 9.0, the cuSOLVER library has been equipped with a batched SVD calculation based on the Jacobi method. Below, a fully worked example:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
//#define FULLSVD
//#define PRINTRESULTS
/********/
/* MAIN */
/********/
int main() {
const int M = 3;
const int N = 3;
const int lda = M;
//const int numMatrices = 3;
const int numMatrices = 16384;
TimingGPU timerGPU;
// --- Setting the host matrix
double *h_A = (double *)malloc(lda * N * numMatrices * sizeof(double));
for (unsigned int k = 0; k < numMatrices; k++)
for (unsigned int i = 0; i < M; i++){
for (unsigned int j = 0; j < N; j++){
h_A[k * M * N + j * M + i] = (1. / (k + 1)) * (i + j * j) * (i + j);
//printf("%d %d %f\n", i, j, h_A[j*M + i]);
}
}
// --- Setting the device matrix and moving the host matrix to the device
double *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * numMatrices * sizeof(double)));
gpuErrchk(cudaMemcpy(d_A, h_A, M * N * numMatrices * sizeof(double), cudaMemcpyHostToDevice));
// --- host side SVD results space
double *h_S = (double *)malloc(N * numMatrices * sizeof(double));
double *h_U = NULL;
double *h_V = NULL;
#ifdef FULLSVD
h_U = (double *)malloc(M * M * numMatrices * sizeof(double));
h_V = (double *)malloc(N * N * numMatrices * sizeof(double));
#endif
// --- device side SVD workspace and matrices
int work_size = 0;
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
double *d_S; gpuErrchk(cudaMalloc(&d_S, N * numMatrices * sizeof(double)));
double *d_U = NULL;
double *d_V = NULL;
#ifdef FULLSVD
gpuErrchk(cudaMalloc(&d_U, M * M * numMatrices * sizeof(double)));
gpuErrchk(cudaMalloc(&d_V, N * N * numMatrices * sizeof(double)));
#endif
double *d_work = NULL; /* devie workspace for gesvdj */
int devInfo_h = 0; /* host copy of error devInfo_h */
// --- Parameters configuration of Jacobi-based SVD
const double tol = 1.e-7;
const int maxSweeps = 15;
cusolverEigMode_t jobz; // --- CUSOLVER_EIG_MODE_VECTOR - Compute eigenvectors; CUSOLVER_EIG_MODE_NOVECTOR - Compute singular values only
#ifdef FULLSVD
jobz = CUSOLVER_EIG_MODE_VECTOR;
#else
jobz = CUSOLVER_EIG_MODE_NOVECTOR;
#endif
const int econ = 0; // --- econ = 1 for economy size
// --- Numerical result parameters of gesvdj
double residual = 0;
int executedSweeps = 0;
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle = NULL;
cusolveSafeCall(cusolverDnCreate(&solver_handle));
// --- Configuration of gesvdj
gesvdjInfo_t gesvdj_params = NULL;
cusolveSafeCall(cusolverDnCreateGesvdjInfo(&gesvdj_params));
// --- Set the computation tolerance, since the default tolerance is machine precision
cusolveSafeCall(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol));
// --- Set the maximum number of sweeps, since the default value of max. sweeps is 100
cusolveSafeCall(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, maxSweeps));
// --- Query the SVD workspace
cusolveSafeCall(cusolverDnDgesvdjBatched_bufferSize(
solver_handle,
jobz, // --- Compute the singular vectors or not
M, // --- Nubmer of rows of A, 0 <= M
N, // --- Number of columns of A, 0 <= N
d_A, // --- M x N
lda, // --- Leading dimension of A
d_S, // --- Square matrix of size min(M, N) x min(M, N)
d_U, // --- M x M if econ = 0, M x min(M, N) if econ = 1
lda, // --- Leading dimension of U, ldu >= max(1, M)
d_V, // --- N x N if econ = 0, N x min(M,N) if econ = 1
lda, // --- Leading dimension of V, ldv >= max(1, N)
&work_size,
gesvdj_params,
numMatrices));
gpuErrchk(cudaMalloc(&d_work, sizeof(double) * work_size));
// --- Compute SVD
timerGPU.StartCounter();
cusolveSafeCall(cusolverDnDgesvdjBatched(
solver_handle,
jobz, // --- Compute the singular vectors or not
M, // --- Number of rows of A, 0 <= M
N, // --- Number of columns of A, 0 <= N
d_A, // --- M x N
lda, // --- Leading dimension of A
d_S, // --- Square matrix of size min(M, N) x min(M, N)
d_U, // --- M x M if econ = 0, M x min(M, N) if econ = 1
lda, // --- Leading dimension of U, ldu >= max(1, M)
d_V, // --- N x N if econ = 0, N x min(M, N) if econ = 1
lda, // --- Leading dimension of V, ldv >= max(1, N)
d_work,
work_size,
devInfo,
gesvdj_params,
numMatrices));
printf("Calculation of the singular values only: %f ms\n\n", timerGPU.GetCounter());
gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_S, d_S, sizeof(double) * N * numMatrices, cudaMemcpyDeviceToHost));
#ifdef FULLSVD
gpuErrchk(cudaMemcpy(h_U, d_U, sizeof(double) * lda * M * numMatrices, cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_V, d_V, sizeof(double) * lda * N * numMatrices, cudaMemcpyDeviceToHost));
#endif
#ifdef PRINTRESULTS
printf("SINGULAR VALUES \n");
printf("_______________ \n");
for (int k = 0; k < numMatrices; k++) {
for (int p = 0; p < N; p++)
printf("Matrix nr. %d; SV nr. %d; Value = %f\n", k, p, h_S[k * N + p]);
printf("\n");
}
#ifdef FULLSVD
printf("SINGULAR VECTORS U \n");
printf("__________________ \n");
for (int k = 0; k < numMatrices; k++) {
for (int q = 0; q < (1 - econ) * M + econ * min(M, N); q++)
for (int p = 0; p < M; p++)
printf("Matrix nr. %d; U nr. %d; Value = %f\n", k, p, h_U[((1 - econ) * M + econ * min(M, N)) * M * k + q * M + p]);
printf("\n");
}
printf("SINGULAR VECTORS V \n");
printf("__________________ \n");
for (int k = 0; k < numMatrices; k++) {
for (int q = 0; q < (1 - econ) * N + econ * min(M, N); q++)
for (int p = 0; p < N; p++)
printf("Matrix nr. %d; V nr. %d; Value = %f\n", k, p, h_V[((1 - econ) * N + econ * min(M, N)) * N * k + q * N + p]);
printf("\n");
}
#endif
#endif
if (0 == devInfo_h){
printf("gesvdj converges \n");
}
else if (0 > devInfo_h){
printf("%d-th parameter is wrong \n", -devInfo_h);
exit(1);
}
else{
printf("WARNING: devInfo_h = %d : gesvdj does not converge \n", devInfo_h);
}
// --- Free resources
if (d_A) gpuErrchk(cudaFree(d_A));
if (d_S) gpuErrchk(cudaFree(d_S));
#ifdef FULLSVD
if (d_U) gpuErrchk(cudaFree(d_U));
if (d_V) gpuErrchk(cudaFree(d_V));
#endif
if (devInfo) gpuErrchk(cudaFree(devInfo));
if (d_work) gpuErrchk(cudaFree(d_work));
if (solver_handle) cusolveSafeCall(cusolverDnDestroy(solver_handle));
if (gesvdj_params) cusolveSafeCall(cusolverDnDestroyGesvdjInfo(gesvdj_params));
gpuErrchk(cudaDeviceReset());
return 0;
}

Instructions Per Count (IPC) and Instruction Level Parallelism (ILP) in CUDA

I observe IPC drops as ILP goes up for 32-bit int operations when trying to speed up my cryptographic kernel. The kernel consists of fairly unrolled loops of long sequence of ADD and XOR operations, which should have a throughput of 160 ops per 192 cores per cycle on Kepler (GTX Titan/780).
IPC for my kernel hits the upper bound of 3.28. Using ILP even drops IPC. Apparently ILP fails to help achieve my goal -- fully utilize the pipeline, so I wrote some little experiments. I put the code for ILP 4 at the end.
Profiler Measurements
Results are measured on GTX Titan.
cubin outputs are examined to make sure no instructions are eliminated during optimization.
Executed IPC is almost the same as issued IPC, so I just list one of them.
ADD instructions (XORs have identical behavior)
| ILP 1 | ILP 2 | ILP 4 | ILP 8
--------------------------------------------------
IPC | 4.00 | 3.32 | 2.72 | 3.44
--------------------------------------------------
Issue Slot | 99.17% | 59.34% | 48.61% | 61.71%
Utilization | | | |
I expect ILP 2, 4 and 8 would give better performance, but not.
Recall the integer throughput is 160. The 4 warp scheduler per SM should dual issue up to 5 instructions per cycle, so that IPC should go up towards 5. How can I explain what I observed? Why is the issue slot 99% utilized when IPC = 4?
Float / Int ADD instruction mix
If I modify the code for ILP 4 to do two int ADDs and two float ADDs:
IPC: 5.1
Issue slot utilization: 99.12%
Strangely enough, it seems that the warp scheduler does a better job to issue floating operations.
Discussion
Available literature suggests using ILP help reach the peak performance for floating point operations. Why doesn't ILP apply to integers? How can I do this for integer operations?
My kernel theoretically should do 2.25 integer operations per candidate. This is consistent with what I observed in cuobjdump. There are 2^48 candidates, so the minimun runtime on GTX Titan should be 2.25 * 2^48 / (2688 * 160/192) / 876 MHz = 322.75s. Is this estimation reasonable?
The measured performance for my kernel is 523s. This does imply that integer throughput is only about 160 * 3.28 (measure IPC) / 5 (max IPC).
ILP test code
__device__ int x[10];
__global__ void test(int flag = 0)
{
int a = x[0], b = x[1], c = x[2], d = x[3];
int _a = x[4], _b = x[5], _c = x[6], _d = x[7];
#pragma unroll 128
for (int i = 0; i < 51200; ++i)
{
asm volatile("add.u32 %0, %0, %1;": "+r"(a): "r"(_a));
asm volatile("add.u32 %0, %0, %1;": "+r"(b): "r"(_b));
asm volatile("add.u32 %0, %0, %1;": "+r"(c): "r"(_c));
asm volatile("add.u32 %0, %0, %1;": "+r"(d): "r"(_d));
}
int v = a + b + c + d;
if (flag * v == 1)
x[0] = v;
}
Code fragment for 4 candidates
Each candidate takes 9 / 4 = 2.25 ops. Cuobjdump also verifies this.
d ^= d2(1, 3); // d2 is located in constant memory
s ^= d;
t ^= d2(1, 16);
u ^= d2(1, 17);
v ^= some_const;
flag_s = min(flag_s, s); // int min has throughput of 160
flag_t = flag_t || (s == t); // setp.or should be the same
flag_u = flag_u || (s == u);
flag_v = flag_v || (s == v);

I'm providing an answer to remove this question from the unanswered list.
I do not observe a change in executed Instructions Per Count (IPC) with Instruction Level Parallelism. Overall, it is difficult to argue the reason for the effect observed by the OP without knowing any further information but that provided by the OP himself (f.i., the launch configuration).
In the code below, I'm considering an example using floats, although I have tested the same code with ints without changing the conceptual results. The code implements cyclical Multiply Add (MAD) operations with ILP=1, ILP=2 and ILP=4.
The executed IPC has been the following
ILP IPC FLOPs
1 3.924 67108864
2 4.323 67108864
4 4.016 67108864
for N=8192. The code has been compiled with CUDA 8.0 and run on an NVIDIA GT920M. As it can be seen, IPC keeps almost constant for the differently considered values of ILP. The Floating Point Operations (FLOPs) as estimated by the code assuming 2 FLOPs per MAD coincides with that measured by the Visual Profiler.
THE CODE
#include<stdio.h>
#define N_ITERATIONS 8192
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define BLOCKSIZE 512
//#define DEBUG
/********************************************************/
/* KERNEL0 - NO INSTRUCTION LEVEL PARALLELISM (ILP = 0) */
/********************************************************/
__global__ void kernel0(float * __restrict__ d_a, const float * __restrict__ d_b, const float * __restrict__ d_c, const int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) {
float a = d_a[tid];
float b = d_b[tid];
float c = d_c[tid];
for (unsigned int i = 0; i < N_ITERATIONS; i++) {
a = a * b + c;
}
d_a[tid] = a;
}
}
/*****************************************************/
/* KERNEL1 - INSTRUCTION LEVEL PARALLELISM (ILP = 2) */
/*****************************************************/
__global__ void kernel1(float * __restrict__ d_a, const float * __restrict__ d_b, const float * __restrict__ d_c, const int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N / 2) {
float a1 = d_a[tid];
float b1 = d_b[tid];
float c1 = d_c[tid];
float a2 = d_a[tid + N / 2];
float b2 = d_b[tid + N / 2];
float c2 = d_c[tid + N / 2];
for (unsigned int i = 0; i < N_ITERATIONS; i++) {
a1 = a1 * b1 + c1;
a2 = a2 * b2 + c2;
}
d_a[tid] = a1;
d_a[tid + N / 2] = a2;
}
}
/*****************************************************/
/* KERNEL2 - INSTRUCTION LEVEL PARALLELISM (ILP = 4) */
/*****************************************************/
__global__ void kernel2(float * __restrict__ d_a, const float * __restrict__ d_b, const float * __restrict__ d_c, const int N) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N / 4) {
float a1 = d_a[tid];
float b1 = d_b[tid];
float c1 = d_c[tid];
float a2 = d_a[tid + N / 4];
float b2 = d_b[tid + N / 4];
float c2 = d_c[tid + N / 4];
float a3 = d_a[tid + N / 2];
float b3 = d_b[tid + N / 2];
float c3 = d_c[tid + N / 2];
float a4 = d_a[tid + 3 * N / 4];
float b4 = d_b[tid + 3 * N / 4];
float c4 = d_c[tid + 3 * N / 4];
for (unsigned int i = 0; i < N_ITERATIONS; i++) {
a1 = a1 * b1 + c1;
a2 = a2 * b2 + c2;
a3 = a3 * b3 + c3;
a4 = a4 * b4 + c4;
}
d_a[tid] = a1;
d_a[tid + N / 4] = a2;
d_a[tid + N / 2] = a3;
d_a[tid + 3 * N / 4] = a4;
}
}
/********/
/* MAIN */
/********/
int main() {
//const int N = 8192 * 64;
const int N = 8192;
//const int N = 1024;
TimingGPU timerGPU;
float *h_a = (float*)malloc(N*sizeof(float));
float *h_a_result_host = (float*)malloc(N*sizeof(float));
float *h_a_result_device = (float*)malloc(N*sizeof(float));
float *h_b = (float*)malloc(N*sizeof(float));
float *h_c = (float*)malloc(N*sizeof(float));
for (int i = 0; i<N; i++) {
h_a[i] = 2.;
h_b[i] = 1.;
h_c[i] = 2.;
h_a_result_host[i] = h_a[i];
for (unsigned int k = 0; k < N_ITERATIONS; k++) {
h_a_result_host[i] = h_a_result_host[i] * h_b[i] + h_c[i];
}
}
float *d_a; gpuErrchk(cudaMalloc((void**)&d_a, N*sizeof(float)));
float *d_b; gpuErrchk(cudaMalloc((void**)&d_b, N*sizeof(float)));
float *d_c; gpuErrchk(cudaMalloc((void**)&d_c, N*sizeof(float)));
gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_b, h_b, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_c, h_c, N*sizeof(float), cudaMemcpyHostToDevice));
/***********/
/* KERNEL0 */
/***********/
timerGPU.StartCounter();
kernel0 << <iDivUp(N, BLOCKSIZE), BLOCKSIZE >> >(d_a, d_b, d_c, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
// --- Remember: timing is in ms
printf("Number of operations = %f; GFlops = %f\n", (float)N*(float)N_ITERATIONS, (1.e-6)*((float)N*(float)N_ITERATIONS) / timerGPU.GetCounter());
gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(float), cudaMemcpyDeviceToHost));
for (int i = 0; i<N; i++) if (h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %f; Device = %f\n", i, h_a_result_host[i], h_a_result_device[i]); return 1; }
/***********/
/* KERNEL1 */
/***********/
gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice));
timerGPU.StartCounter();
kernel1 << <iDivUp(N / 2, BLOCKSIZE), BLOCKSIZE >> >(d_a, d_b, d_c, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
// --- Remember: timing is in ms
printf("Number of operations = %f; GFlops = %f\n", (float)N*(float)N_ITERATIONS, (1.e-6)*((float)N*(float)N_ITERATIONS) / timerGPU.GetCounter());
gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(float), cudaMemcpyDeviceToHost));
for (int i = 0; i<N; i++) if (h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %f; Device = %f\n", i, h_a_result_host[i], h_a_result_device[i]); return 1; }
/***********/
/* KERNEL2 */
/***********/
gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice));
timerGPU.StartCounter();
kernel2 << <iDivUp(N / 4, BLOCKSIZE), BLOCKSIZE >> >(d_a, d_b, d_c, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
// --- Remember: timing is in ms
printf("Number of operations = %f; GFlops = %f\n", (float)N*(float)N_ITERATIONS, (1.e-6)*((float)N*(float)N_ITERATIONS) / timerGPU.GetCounter());
gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(float), cudaMemcpyDeviceToHost));
for (int i = 0; i<N; i++) if (h_a_result_device[i] != h_a_result_host[i]) { printf("Error at i=%i! Host = %f; Device = %f\n", i, h_a_result_host[i], h_a_result_device[i]); return 1; }
cudaDeviceReset();
return 0;
}

Parallel implementation for multiple SVDs using CUDA

I'm new to parallel programming using GPU so I apologize if the question is broad or vague. I'm aware there is some parallel SVD function in the CULA library, but what should be the strategy if I have a large number of relatively small matrices to factorize? For example I have n matrices with dimension d, n is large and d is small. How to parallelize this process? Could anyone give me a hint?

My previous answer is now out-of-date. As of February 2015, CUDA 7 (currently in release candidate version) offers full SVD capabilities in its cuSOLVER library. Below, I'm providing an example of generating the singular value decomposition using CUDA cuSOLVER.
Concerning the specific issue you are rising (calculating the SVD of several matrices of small size), you should adapt the example I'm providing below by using streams. To associate a stream to each task you can use
cudaStreamCreate()
and
cusolverDnSetStream()
kernel.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include<iostream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>
#include<math.h>
#include <cusolverDn.h>
#include <cuda_runtime_api.h>
#include "Utilities.cuh"
/********/
/* MAIN */
/********/
int main(){
// --- gesvd only supports Nrows >= Ncols
// --- column major memory ordering
const int Nrows = 7;
const int Ncols = 5;
// --- cuSOLVE input/output parameters/arrays
int work_size = 0;
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle;
cusolverDnCreate(&solver_handle);
// --- Setting the host, Nrows x Ncols matrix
double *h_A = (double *)malloc(Nrows * Ncols * sizeof(double));
for(int j = 0; j < Nrows; j++)
for(int i = 0; i < Ncols; i++)
h_A[j + i*Nrows] = (i + j*j) * sqrt((double)(i + j));
// --- Setting the device matrix and moving the host matrix to the device
double *d_A; gpuErrchk(cudaMalloc(&d_A, Nrows * Ncols * sizeof(double)));
gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(double), cudaMemcpyHostToDevice));
// --- host side SVD results space
double *h_U = (double *)malloc(Nrows * Nrows * sizeof(double));
double *h_V = (double *)malloc(Ncols * Ncols * sizeof(double));
double *h_S = (double *)malloc(min(Nrows, Ncols) * sizeof(double));
// --- device side SVD workspace and matrices
double *d_U; gpuErrchk(cudaMalloc(&d_U, Nrows * Nrows * sizeof(double)));
double *d_V; gpuErrchk(cudaMalloc(&d_V, Ncols * Ncols * sizeof(double)));
double *d_S; gpuErrchk(cudaMalloc(&d_S, min(Nrows, Ncols) * sizeof(double)));
// --- CUDA SVD initialization
cusolveSafeCall(cusolverDnDgesvd_bufferSize(solver_handle, Nrows, Ncols, &work_size));
double *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));
// --- CUDA SVD execution
cusolveSafeCall(cusolverDnDgesvd(solver_handle, 'A', 'A', Nrows, Ncols, d_A, Nrows, d_S, d_U, Nrows, d_V, Ncols, work, work_size, NULL, devInfo));
int devInfo_h = 0; gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
if (devInfo_h != 0) std::cout << "Unsuccessful SVD execution\n\n";
// --- Moving the results from device to host
gpuErrchk(cudaMemcpy(h_S, d_S, min(Nrows, Ncols) * sizeof(double), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_U, d_U, Nrows * Nrows * sizeof(double), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_V, d_V, Ncols * Ncols * sizeof(double), cudaMemcpyDeviceToHost));
std::cout << "Singular values\n";
for(int i = 0; i < min(Nrows, Ncols); i++)
std::cout << "d_S["<<i<<"] = " << std::setprecision(15) << h_S[i] << std::endl;
std::cout << "\nLeft singular vectors - For y = A * x, the columns of U span the space of y\n";
for(int j = 0; j < Nrows; j++) {
printf("\n");
for(int i = 0; i < Nrows; i++)
printf("U[%i,%i]=%f\n",i,j,h_U[j*Nrows + i]);
}
std::cout << "\nRight singular vectors - For y = A * x, the columns of V span the space of x\n";
for(int i = 0; i < Ncols; i++) {
printf("\n");
for(int j = 0; j < Ncols; j++)
printf("V[%i,%i]=%f\n",i,j,h_V[j*Ncols + i]);
}
cusolverDnDestroy(solver_handle);
return 0;
}
Utilities.cuh
#ifndef UTILITIES_CUH
#define UTILITIES_CUH
extern "C" int iDivUp(int, int);
extern "C" void gpuErrchk(cudaError_t);
extern "C" void cusolveSafeCall(cusolverStatus_t);
#endif
Utilities.cu
#include <stdio.h>
#include <assert.h>
#include "cuda_runtime.h"
#include <cuda.h>
#include <cusolverDn.h>
/*******************/
/* iDivUp FUNCTION */
/*******************/
extern "C" int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
static const char *_cudaGetErrorEnum(cusolverStatus_t error)
{
switch (error)
{
case CUSOLVER_STATUS_SUCCESS:
return "CUSOLVER_SUCCESS";
case CUSOLVER_STATUS_NOT_INITIALIZED:
return "CUSOLVER_STATUS_NOT_INITIALIZED";
case CUSOLVER_STATUS_ALLOC_FAILED:
return "CUSOLVER_STATUS_ALLOC_FAILED";
case CUSOLVER_STATUS_INVALID_VALUE:
return "CUSOLVER_STATUS_INVALID_VALUE";
case CUSOLVER_STATUS_ARCH_MISMATCH:
return "CUSOLVER_STATUS_ARCH_MISMATCH";
case CUSOLVER_STATUS_EXECUTION_FAILED:
return "CUSOLVER_STATUS_EXECUTION_FAILED";
case CUSOLVER_STATUS_INTERNAL_ERROR:
return "CUSOLVER_STATUS_INTERNAL_ERROR";
case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
}
return "<unknown>";
}
inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
if(CUSOLVER_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSOLVE error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }

You can take a look at the Batched Operations post of the CULA blog for a discussion of your problem.
EDIT
From what I understand from your comment below, you would like each thread to calculate a separate SVD. So, basically each thread should execute a standard, sequential SVD scheme. For that some possibly useful references:
Numerical Recipes
Golub, Van Loan, Matrix Computations
If you use this approach, though, I'm afraid you will not be able anymore to use cuBLAS, as those are host functions not callable from the device (unless you do not have a compute capability >3.5, see the the simpleDevLibCUBLAS example.). But basically in this way I think you are somehow implementing the batch concept by yourself.
If you decide to go to a more standard parallel GPU implementation, the reference below could be of interest:
Singular Value Decomposition on GPU using CUDA

The above answers are now out of date. As of CUDA 9.0, the cuSOLVER library has been equipped with a batched SVD calculation based on the Jacobi method. Below, a fully worked example:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
//#define FULLSVD
//#define PRINTRESULTS
/********/
/* MAIN */
/********/
int main() {
const int M = 3;
const int N = 3;
const int lda = M;
//const int numMatrices = 3;
const int numMatrices = 16384;
TimingGPU timerGPU;
// --- Setting the host matrix
double *h_A = (double *)malloc(lda * N * numMatrices * sizeof(double));
for (unsigned int k = 0; k < numMatrices; k++)
for (unsigned int i = 0; i < M; i++){
for (unsigned int j = 0; j < N; j++){
h_A[k * M * N + j * M + i] = (1. / (k + 1)) * (i + j * j) * (i + j);
//printf("%d %d %f\n", i, j, h_A[j*M + i]);
}
}
// --- Setting the device matrix and moving the host matrix to the device
double *d_A; gpuErrchk(cudaMalloc(&d_A, M * N * numMatrices * sizeof(double)));
gpuErrchk(cudaMemcpy(d_A, h_A, M * N * numMatrices * sizeof(double), cudaMemcpyHostToDevice));
// --- host side SVD results space
double *h_S = (double *)malloc(N * numMatrices * sizeof(double));
double *h_U = NULL;
double *h_V = NULL;
#ifdef FULLSVD
h_U = (double *)malloc(M * M * numMatrices * sizeof(double));
h_V = (double *)malloc(N * N * numMatrices * sizeof(double));
#endif
// --- device side SVD workspace and matrices
int work_size = 0;
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
double *d_S; gpuErrchk(cudaMalloc(&d_S, N * numMatrices * sizeof(double)));
double *d_U = NULL;
double *d_V = NULL;
#ifdef FULLSVD
gpuErrchk(cudaMalloc(&d_U, M * M * numMatrices * sizeof(double)));
gpuErrchk(cudaMalloc(&d_V, N * N * numMatrices * sizeof(double)));
#endif
double *d_work = NULL; /* devie workspace for gesvdj */
int devInfo_h = 0; /* host copy of error devInfo_h */
// --- Parameters configuration of Jacobi-based SVD
const double tol = 1.e-7;
const int maxSweeps = 15;
cusolverEigMode_t jobz; // --- CUSOLVER_EIG_MODE_VECTOR - Compute eigenvectors; CUSOLVER_EIG_MODE_NOVECTOR - Compute singular values only
#ifdef FULLSVD
jobz = CUSOLVER_EIG_MODE_VECTOR;
#else
jobz = CUSOLVER_EIG_MODE_NOVECTOR;
#endif
const int econ = 0; // --- econ = 1 for economy size
// --- Numerical result parameters of gesvdj
double residual = 0;
int executedSweeps = 0;
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle = NULL;
cusolveSafeCall(cusolverDnCreate(&solver_handle));
// --- Configuration of gesvdj
gesvdjInfo_t gesvdj_params = NULL;
cusolveSafeCall(cusolverDnCreateGesvdjInfo(&gesvdj_params));
// --- Set the computation tolerance, since the default tolerance is machine precision
cusolveSafeCall(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol));
// --- Set the maximum number of sweeps, since the default value of max. sweeps is 100
cusolveSafeCall(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, maxSweeps));
// --- Query the SVD workspace
cusolveSafeCall(cusolverDnDgesvdjBatched_bufferSize(
solver_handle,
jobz, // --- Compute the singular vectors or not
M, // --- Nubmer of rows of A, 0 <= M
N, // --- Number of columns of A, 0 <= N
d_A, // --- M x N
lda, // --- Leading dimension of A
d_S, // --- Square matrix of size min(M, N) x min(M, N)
d_U, // --- M x M if econ = 0, M x min(M, N) if econ = 1
lda, // --- Leading dimension of U, ldu >= max(1, M)
d_V, // --- N x N if econ = 0, N x min(M,N) if econ = 1
lda, // --- Leading dimension of V, ldv >= max(1, N)
&work_size,
gesvdj_params,
numMatrices));
gpuErrchk(cudaMalloc(&d_work, sizeof(double) * work_size));
// --- Compute SVD
timerGPU.StartCounter();
cusolveSafeCall(cusolverDnDgesvdjBatched(
solver_handle,
jobz, // --- Compute the singular vectors or not
M, // --- Number of rows of A, 0 <= M
N, // --- Number of columns of A, 0 <= N
d_A, // --- M x N
lda, // --- Leading dimension of A
d_S, // --- Square matrix of size min(M, N) x min(M, N)
d_U, // --- M x M if econ = 0, M x min(M, N) if econ = 1
lda, // --- Leading dimension of U, ldu >= max(1, M)
d_V, // --- N x N if econ = 0, N x min(M, N) if econ = 1
lda, // --- Leading dimension of V, ldv >= max(1, N)
d_work,
work_size,
devInfo,
gesvdj_params,
numMatrices));
printf("Calculation of the singular values only: %f ms\n\n", timerGPU.GetCounter());
gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_S, d_S, sizeof(double) * N * numMatrices, cudaMemcpyDeviceToHost));
#ifdef FULLSVD
gpuErrchk(cudaMemcpy(h_U, d_U, sizeof(double) * lda * M * numMatrices, cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_V, d_V, sizeof(double) * lda * N * numMatrices, cudaMemcpyDeviceToHost));
#endif
#ifdef PRINTRESULTS
printf("SINGULAR VALUES \n");
printf("_______________ \n");
for (int k = 0; k < numMatrices; k++) {
for (int p = 0; p < N; p++)
printf("Matrix nr. %d; SV nr. %d; Value = %f\n", k, p, h_S[k * N + p]);
printf("\n");
}
#ifdef FULLSVD
printf("SINGULAR VECTORS U \n");
printf("__________________ \n");
for (int k = 0; k < numMatrices; k++) {
for (int q = 0; q < (1 - econ) * M + econ * min(M, N); q++)
for (int p = 0; p < M; p++)
printf("Matrix nr. %d; U nr. %d; Value = %f\n", k, p, h_U[((1 - econ) * M + econ * min(M, N)) * M * k + q * M + p]);
printf("\n");
}
printf("SINGULAR VECTORS V \n");
printf("__________________ \n");
for (int k = 0; k < numMatrices; k++) {
for (int q = 0; q < (1 - econ) * N + econ * min(M, N); q++)
for (int p = 0; p < N; p++)
printf("Matrix nr. %d; V nr. %d; Value = %f\n", k, p, h_V[((1 - econ) * N + econ * min(M, N)) * N * k + q * N + p]);
printf("\n");
}
#endif
#endif
if (0 == devInfo_h){
printf("gesvdj converges \n");
}
else if (0 > devInfo_h){
printf("%d-th parameter is wrong \n", -devInfo_h);
exit(1);
}
else{
printf("WARNING: devInfo_h = %d : gesvdj does not converge \n", devInfo_h);
}
// --- Free resources
if (d_A) gpuErrchk(cudaFree(d_A));
if (d_S) gpuErrchk(cudaFree(d_S));
#ifdef FULLSVD
if (d_U) gpuErrchk(cudaFree(d_U));
if (d_V) gpuErrchk(cudaFree(d_V));
#endif
if (devInfo) gpuErrchk(cudaFree(devInfo));
if (d_work) gpuErrchk(cudaFree(d_work));
if (solver_handle) cusolveSafeCall(cusolverDnDestroy(solver_handle));
if (gesvdj_params) cusolveSafeCall(cusolverDnDestroyGesvdjInfo(gesvdj_params));
gpuErrchk(cudaDeviceReset());
return 0;
}

We Keep Coding

html mysql json google-apps-script actionscript-3 ms-access google-chrome google-maps reporting-services sql-server-2008

Solving dense linear systems AX = B with CUDA - cuda

Can I use the new cuSOLVER library (CUDA 7) to solve linear systems of the form AX = B where A, X and B are NxN dense matrices ?

Related

Why PyCUDA is faster than C CUDA in this example

Solve the Poisson equation using FFT with CUDA

Call multiple CUDA SVD (in cuSolver) [duplicate]

Instructions Per Count (IPC) and Instruction Level Parallelism (ILP) in CUDA

Parallel implementation for multiple SVDs using CUDA

Categories

Resources