Gradient Descent Optimization in CUDA - cuda

I will code my first relatively big CUDA project as Gradient Descent Optimization for machine learning purposes. I would like to get benefit from crowd wisdom about some useful native functions of the CUDA that might be short cut to use in the project. Any ideas/suggestions?

Gradient Descent (AKA steepest descent) aims at finding a local minimum of a multivariate function F(x) by taking steps proportional to the negative of the gradient of F(x) at the current point. The update rule is the following:
where the step size gamma_n is allowed to change at every step and can be determined, for example, by line searches.
Implementing the above update rule in CUDA is pretty easy. Below, I'm providing a full example using the Rosenbrock function as the cost functional to be optimized, exploiting the analytical gradient, and considering a constant value for the step size through the iterations (namely, gamma_n = gamma). The Utilities.cu and Utilities.cuh files are mantained at OrangeOwlSolutions/CUDA_Utilities and omitted here. The example implements the CPU as well as the GPU approach.
**kernel.cu**
#include <stdio.h>
#include <float.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "GradientDescentCPU.h"
#include "GradientDescentGPU.cuh"
#include "Utilities.cuh"
/********/
/* MAIN */
/********/
int main()
{
/********************/
/* INPUT PARAMETERS */
/********************/
// --- Number of unknowns
const int M = 5;
// --- Starting point
float *h_x0 = (float*)malloc(M * sizeof(float));
for (int i=0; i<M; i++) h_x0[i] = 1.2f;
// --- Termination tolerance
const float tol = 1.e-6;
// --- Maximum number of allowed iterations
const int maxiter = 10000;
// --- Step size
const float alpha = 0.001f;
// --- Derivative step
const float h = 0.0001f;
// --- Minimum allowed perturbations
const float dxmin = 1e-5;
/*********************/
/* OUTPUT PARAMETERS */
/*********************/
// --- Optimal point
float* h_xopt = (float*)malloc(M * sizeof(float));
for (int i=0; i<M; i++) h_xopt[i] = 0.f;
// --- Optimal functional
float fopt = 0.f;
// --- Number of performed iterations
int niter = 0;
// --- Gradient norm at optimal point
float gnorm = 0.f;
// --- Distance between last and penultimate solutions found
float dx = 0.f;
/***************************/
/* OPTIMIZATION - CPU CASE */
/***************************/
GradientDescentCPU(h_x0, tol, maxiter, alpha, h, dxmin, M, h_xopt, &fopt, &niter, &gnorm, &dx);
printf("Solution found - CPU case:\n");
printf("fopt = %f; niter = %i; gnorm = %f; dx = %f\n", fopt, niter, gnorm, dx);
printf("\n\n");
#ifdef VERBOSE
printf("Found minimum - CPU case:\n");
for (int i=0; i<M; i++) printf("i = %i; h_xopt = %f\n", i, h_xopt[i]);
printf("\n\n");
#endif
/***************************/
/* OPTIMIZATION - GPU CASE */
/***************************/
// --- Starting point
float *d_x0; gpuErrchk(cudaMalloc((void**)&d_x0, M * sizeof(float)));
gpuErrchk(cudaMemcpy(d_x0, h_x0, M * sizeof(float), cudaMemcpyHostToDevice));
// --- Optimal point
float *d_xopt; gpuErrchk(cudaMalloc((void**)&d_xopt, M * sizeof(float)));
GradientDescentGPU(d_x0, tol, maxiter, alpha, h, dxmin, M, d_xopt, &fopt, &niter, &gnorm, &dx);
printf("Solution found - GPU case:\n");
printf("fopt = %f; niter = %i; gnorm = %f; dx = %f\n", fopt, niter, gnorm, dx);
printf("\n\n");
#ifdef VERBOSE
gpuErrchk(cudaMemcpy(h_xopt, d_xopt, M * sizeof(float), cudaMemcpyDeviceToHost));
printf("Found minimum - GPU case:\n");
for (int i=0; i<M; i++) printf("i = %i; h_xopt = %f\n", i, h_xopt[i]);
printf("\n\n");
#endif
return 0;
}
GradientDescentCPU.cu
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "GradientDescentGPU.cuh"
/*******************************/
/* GRADIENT DESCENT - CPU CASE */
/*******************************/
// --- Version using finite differences
//void CostFunctionGradientCPU(float * __restrict h_x, float * __restrict h_g, const float h, const int M) {
//
// for (int i=0; i<M; i++) {
// h_x[i] = h_x[i] + h / 2.f;
// h_g[i] = CostFunction(h_x, M);
// h_x[i] = h_x[i] - h;
// h_g[i] = (h_g[i] - CostFunction(h_x, M)) / (2.f * h);
// h_x[i] = h_x[i] + h / 2.f;
// }
//}
// --- Version using analytical gradient (Rosenbrock function)
void CostFunctionGradientCPU(float * __restrict h_x, float * __restrict h_g, const float h, const int M) {
h_g[0] = -400.f * (h_x[1] - h_x[0] * h_x[0]) * h_x[0] + 2.f * (h_x[0] - 1.f);
for (int i=1; i<M-1; i++) {
h_g[i] = -400.f * h_x[i] * (h_x[i+1] - h_x[i] * h_x[i]) + 2.f * (h_x[i] - 1.f) + 200.f * (h_x[i] - h_x[i-1] * h_x[i-1]);
}
h_g[M-1] = 200.f * (h_x[M-1] - h_x[M-2] * h_x[M-2]);
}
/********/
/* NORM */
/********/
float normCPU(const float * __restrict h_x, const int M) {
float sum = 0.f;
for(int i=0; i<M; i++) sum = sum + h_x[i] * h_x[i];
return sqrt(sum);
}
/****************************************/
/* GRADIENT DESCENT FUNCTION - CPU CASE */
/****************************************/
// x0 - Starting point
// tol - Termination tolerance
// maxiter - Maximum number of allowed iterations
// alpha - Step size
// dxmin - Minimum allowed perturbations
void GradientDescentCPU(const float * __restrict h_x0, const float tol, const int maxiter, const float alpha, const float h, const float dxmin, const int M,
float * __restrict h_xopt, float *fopt, int *niter, float *gnorm, float *dx) {
// --- Initialize gradient norm, optimization vector, iteration counter, perturbation
*gnorm = FLT_MAX;
float *h_x = (float *)malloc(M * sizeof(float));
for (int i=0; i<M; i++) h_x[i] = h_x0[i];
*niter = 0;
*dx = FLT_MAX;
// --- Allocating space for the gradient, for the new actual solution and for the difference between actual and old solutions
float *h_g = (float *)malloc(M * sizeof(float));
float *h_xnew = (float *)malloc(M * sizeof(float));
float *h_xdiff = (float *)malloc(M * sizeof(float));
// --- Gradient Descent iterations
while ((*gnorm >= tol) && (*niter <= maxiter) && (*dx >= dxmin)) {
// --- Calculate gradient
CostFunctionGradientCPU(h_x, h_g, h, M);
*gnorm = normCPU(h_g, M);
// --- Take step:
for (int i=0; i<M; i++) h_xnew[i] = h_x[i] - alpha * h_g[i];
// --- Update termination metrics
*niter = *niter + 1;
for (int i=0; i<M; i++) h_xdiff[i] = h_xnew[i] - h_x[i];
*dx = normCPU(h_xdiff, M);
for (int i=0; i<M; i++) h_x[i] = h_xnew[i];
}
for (int i=0; i<M; i++) h_xopt[i] = h_x[i];
*fopt = CostFunction(h_xopt, M);
*niter = *niter - 1;
}
GradientDescentCPU.h
#ifndef GRADIENT_DESCENT_CPU
#define GRADIENT_DESCENT_CPU
void GradientDescentCPU(const float * __restrict, const float, const int, const float, const float, const float, const int,
float * __restrict, float *, int *, float *, float *);
#endif
GradientDescentGPU.cu
#include <thrust\device_ptr.h>
#include <thrust\inner_product.h>
#include "Utilities.cuh"
#define BLOCK_SIZE 256
//#define VERBOSE
//#define DEBUG
/***********************************/
/* COST FUNCTION - CPU & GPU CASES */
/***********************************/
__host__ __device__ float CostFunction(const float * __restrict h_x, const int M) {
// --- Rosenbrock function
float sum = 0.f;
for (int i=0; i<M-1; i++) {
float temp1 = (h_x[i+1] - h_x[i] * h_x[i]);
float temp2 = (h_x[i] - 1.f);
sum = sum + 100.f * temp1 * temp1 + temp2 * temp2;
}
return sum;
}
/*******************************/
/* GRADIENT DESCENT - GPU CASE */
/*******************************/
// --- Version using finite differences
//__device__ void CostFunctionGradientGPU(float * __restrict d_x, float * __restrict d_g, const float h, const int tid, const int M) {
//
// int test1, test2;
// float h_test1_plus, h_test1_minus, h_test2_plus, h_test2_minus, temp1_plus, temp1_minus, temp2_plus, temp2_minus;
//
// // --- Rosenbrock function
// float sum_plus = 0.f, sum_minus = 0.f;
// for (int i=0; i<M-1; i++) {
// h_test1_plus = d_x[i] + (h / 2.f) * (tid == i);
// h_test1_minus = d_x[i] - (h / 2.f) * (tid == i);
// h_test2_plus = d_x[i + 1] + (h / 2.f) * (tid == (i + 1));
// h_test2_minus = d_x[i + 1] - (h / 2.f) * (tid == (i + 1));
// temp1_plus = (h_test2_plus - h_test1_plus * h_test1_plus);
// temp2_plus = (h_test1_plus - 1.f);
// temp1_minus = (h_test2_minus - h_test1_minus * h_test1_minus);
// temp2_minus = (h_test1_minus - 1.f);
// sum_plus = sum_plus + 100.f * temp1_plus * temp1_plus + temp2_plus * temp2_plus;
// sum_minus = sum_minus + 100.f * temp1_minus * temp1_minus + temp2_minus * temp2_minus;
// }
// d_g[tid] = (sum_plus - sum_minus) / (2.f * h);
//}
// --- Version using analytical gradient (Rosenbrock function)
__device__ void CostFunctionGradientGPU(float * __restrict d_x, float * __restrict d_g, const float h, const int tid, const int M) {
if (tid == 0) d_g[0] = -400.f * (d_x[1] - d_x[0] * d_x[0]) * d_x[0] + 2.f * (d_x[0] - 1.f);
else if (tid == M-1) d_g[M-1] = 200.f * (d_x[M-1] - d_x[M-2] * d_x[M-2]);
else {
for (int i=1; i<M-1; i++) {
d_g[i] = -400.f * d_x[i] * (d_x[i+1] - d_x[i] * d_x[i]) + 2.f * (d_x[i] - 1.f) + 200.f * (d_x[i] - d_x[i-1] * d_x[i-1]);
}
}
}
/*******************/
/* STEP - GPU CASE */
/*******************/
__global__ void StepGPU(float * __restrict d_x, float * __restrict d_xnew, float * __restrict d_xdiff, float * __restrict d_g, const float alpha, const float h, const int M) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < M) {
// --- Calculate gradient
CostFunctionGradientGPU(d_x, d_g, h, tid, M);
// --- Take step
d_xnew[tid] = d_x[tid] - alpha * d_g[tid];
// --- Update termination metrics
d_xdiff[tid] = d_xnew[tid] - d_x[tid];
// --- Update current solution
d_x[tid] = d_xnew[tid];
}
}
/***********************************/
/* COST FUNCTION STRUCT - GPU CASE */
/***********************************/
// --- Rosenbrock function struct for thrust reduction
struct CostFunctionStructGPU{
template <typename Tuple>
__host__ __device__ float operator()(Tuple a) {
float temp1 = (thrust::get<1>(a) - thrust::get<0>(a) * thrust::get<0>(a));
float temp2 = (thrust::get<0>(a) - 1.f);
return 100.f * temp1 * temp1 + temp2 * temp2;
}
};
/****************************************/
/* GRADIENT DESCENT FUNCTION - GPU CASE */
/****************************************/
// x0 - Starting point
// tol - Termination tolerance
// maxiter - Maximum number of allowed iterations
// alpha - Step size
// dxmin - Minimum allowed perturbations
void GradientDescentGPU(const float * __restrict__ d_x0, const float tol, const int maxiter, const float alpha, const float h,
const float dxmin, const int M, float * __restrict__ d_xopt, float *fopt, int *niter, float *gnorm, float *dx) {
thrust::device_ptr<float> dev_ptr_xopt = thrust::device_pointer_cast(d_xopt);
// --- Initialize gradient norm, optimization vector, iteration counter, perturbation
*gnorm = FLT_MAX;
float *d_x; gpuErrchk(cudaMalloc((void**)&d_x, M * sizeof(float)));
gpuErrchk(cudaMemcpy(d_x, d_x0, M * sizeof(float), cudaMemcpyDeviceToDevice));
*niter = 0;
*dx = FLT_MAX;
// --- Allocating space for the gradient, for the new actual solution and for the difference between actual and old solutions
float *d_g; gpuErrchk(cudaMalloc((void**)&d_g, M * sizeof(float))); thrust::device_ptr<float> dev_ptr_g = thrust::device_pointer_cast(d_g);
float *d_xnew; gpuErrchk(cudaMalloc((void**)&d_xnew, M * sizeof(float)));
float *d_xdiff; gpuErrchk(cudaMalloc((void**)&d_xdiff, M * sizeof(float))); thrust::device_ptr<float> dev_ptr_xdiff = thrust::device_pointer_cast(d_xdiff);
// --- Gradient Descent iterations
while ((*gnorm >= tol) && (*niter <= maxiter) && (*dx >= dxmin)) {
// --- Iteration step
StepGPU<<<iDivUp(M, BLOCK_SIZE), BLOCK_SIZE>>>(d_x, d_xnew, d_xdiff, d_g, alpha, h, M);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
*gnorm = sqrt(thrust::inner_product(dev_ptr_g, dev_ptr_g + M, dev_ptr_g, 0.0f));
*dx = sqrt(thrust::inner_product(dev_ptr_xdiff, dev_ptr_xdiff + M, dev_ptr_xdiff, 0.0f));
*niter = *niter + 1;
}
gpuErrchk(cudaMemcpy(d_xopt, d_x, M * sizeof(float), cudaMemcpyDeviceToDevice));
// --- Functional calculation
*fopt = thrust::transform_reduce(thrust::make_zip_iterator(thrust::make_tuple(dev_ptr_xopt, dev_ptr_xopt + 1)), thrust::make_zip_iterator(thrust::make_tuple(dev_ptr_xopt + M - 1, dev_ptr_xopt + M)), CostFunctionStructGPU(), 0.0f, thrust::plus<float>());
*niter = *niter - 1;
}
GradientDescentGPU.cuh
#ifndef GRADIENT_DESCENT_GPU
#define GRADIENT_DESCENT_GPU
void GradientDescentGPU(const float * __restrict__, const float, const int, const float, const float, const float, const int,
float * __restrict__, float *, int *, float *, float *);
__host__ __device__ float CostFunction(const float * __restrict, const int);
#endif

Related

Computing all-pairs distances between points in different sets with CUDA

I am trying to implement a brute force distance computation algorithm in CUDA.
#define VECTOR_DIM 128
thrust::device_vector<float> feature_data_1;
feature_data_1.resize(VECTOR_DIM * 1000); // 1000 128 dimensional points
thrust::device_vector<float> feature_data_2;
feature_data_2.resize(VECTOR_DIM * 2000); // 2000 128 dimensional points
Now what I would like to do is to compute the L2 distances (sum of the squared differences) from every vector in the first matrix to every vector in the second matrix.
So, if array 1 is of size 1000 and array 2 is of size 2000, the result would be a floating point matrix of 1000*2000 in size.
I was wondering if there is a way to achieve this using Thrust algorithms alone.
Calculating the all-pairs distances between points in two different sets in CUDA can be solved by observing that
||x-y||^2=||x||^2+||y||^2-2*<x,y>
where || || is the l2 norm and <x,y> denotes the scalar product between x and y.
The norms ||x|| and ||y|| can be calculated by approaches inspired by Reduce matrix rows with CUDA, while the scalar products <x,y> can then be calculated as the matrix-matrix multiplication X*Y^T using cublas<t>gemm().
Below is a fully worked out implementation. Please, note that for the calculation of the norms || || two approaches are reported, one using cuBLAS cublas<t>gemv and one using Thurst's transform. For the problem size of your interest, I have experienced the following timings on my GT540M card:
Approach nr. 1 0.12ms
Approach nr. 2 0.59ms
include <cublas_v2.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/random.h>
#include <thrust/sequence.h>
#include <stdio.h>
#include <iostream>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define BLOCK_SIZE_X 16
#define BLOCK_SIZE_Y 16
/***********************************************************/
/* SQUARED ABSOLUTE VALUE FUNCTOR - NEEDED FOR APPROACH #1 */
/***********************************************************/
struct abs2 {
__host__ __device__ double operator()(const float &x) const { return x * x; }
};
// --- Required for approach #2
__device__ float *vals;
/******************************************/
/* ROW_REDUCTION - NEEDED FOR APPROACH #2 */
/******************************************/
struct row_reduction {
const int Ncols; // --- Number of columns
row_reduction(int _Ncols) : Ncols(_Ncols) {}
__device__ float operator()(float& x, int& y ) {
float temp = 0.f;
for (int i = 0; i<Ncols; i++)
temp += vals[i + (y*Ncols)] * vals[i + (y*Ncols)];
return temp;
}
};
/************************************************/
/* KERNEL FUNCTION TO ASSEMBLE THE FINAL RESULT */
/************************************************/
__global__ void assemble_final_result(const float * __restrict__ d_norms_x_2, const float * __restrict__ d_norms_y_2, float * __restrict__ d_dots,
const int NX, const int NY) {
const int i = threadIdx.x + blockIdx.x * gridDim.x;
const int j = threadIdx.y + blockIdx.y * gridDim.y;
if ((i < NY) && (j < NX)) d_dots[i * NX+ j] = d_norms_x_2[j] + d_norms_y_2[i] - 2 * d_dots[i * NX+ j];
}
/********/
/* MAIN */
/********/
int main()
{
//const int Ndims = 128; // --- Number of rows
//const int NX = 1000; // --- Number of columns
//const int NY = 2000; // --- Number of columns
const int Ndims = 3; // --- Number of rows
const int NX = 4; // --- Number of columns
const int NY = 5; // --- Number of columns
// --- Random uniform integer distribution between 10 and 99
thrust::default_random_engine rng;
thrust::uniform_int_distribution<int> dist(10, 99);
// --- Matrices allocation and initialization
thrust::device_vector<float> d_X(Ndims * NX);
thrust::device_vector<float> d_Y(Ndims * NY);
for (size_t i = 0; i < d_X.size(); i++) d_X[i] = (float)dist(rng);
for (size_t i = 0; i < d_Y.size(); i++) d_Y[i] = (float)dist(rng);
TimingGPU timerGPU;
// --- cuBLAS handle creation
cublasHandle_t handle;
cublasSafeCall(cublasCreate(&handle));
/**********************************************/
/* CALCULATING THE NORMS OF THE ELEMENTS OF X */
/**********************************************/
thrust::device_vector<float> d_norms_x_2(NX);
// --- Approach nr. 1
//timerGPU.StartCounter();
thrust::device_vector<float> d_X_2(Ndims * NX);
thrust::transform(d_X.begin(), d_X.end(), d_X_2.begin(), abs2());
thrust::device_vector<float> d_ones(Ndims, 1.f);
float alpha = 1.f;
float beta = 0.f;
cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_T, Ndims, NX, &alpha, thrust::raw_pointer_cast(d_X_2.data()), Ndims,
thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_norms_x_2.data()), 1));
//printf("Timing for approach #1 = %f\n", timerGPU.GetCounter());
// --- Approach nr. 2
//timerGPU.StartCounter();
// float *s_vals = thrust::raw_pointer_cast(&d_X[0]);
// gpuErrchk(cudaMemcpyToSymbol(vals, &s_vals, sizeof(float *)));
// thrust::transform(d_norms_x_2.begin(), d_norms_x_2.end(), thrust::counting_iterator<int>(0), d_norms_x_2.begin(), row_reduction(Ndims));
//printf("Timing for approach #2 = %f\n", timerGPU.GetCounter());
/**********************************************/
/* CALCULATING THE NORMS OF THE ELEMENTS OF Y */
/**********************************************/
thrust::device_vector<float> d_norms_y_2(NX);
thrust::device_vector<float> d_Y_2(Ndims * NX);
thrust::transform(d_Y.begin(), d_Y.end(), d_Y_2.begin(), abs2());
cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_T, Ndims, NY, &alpha, thrust::raw_pointer_cast(d_Y_2.data()), Ndims,
thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_norms_y_2.data()), 1));
/***********************************/
/* CALCULATING THE SCALAR PRODUCTS */
/***********************************/
thrust::device_vector<float> d_dots(NX * NY);
cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, NX, NY, Ndims, &alpha,
thrust::raw_pointer_cast(d_X.data()), Ndims, thrust::raw_pointer_cast(d_Y.data()), Ndims, &beta,
thrust::raw_pointer_cast(d_dots.data()), NX));
/*****************************/
/* ASSEMBLE THE FINAL RESULT */
/*****************************/
dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y);
dim3 dimGrid(iDivUp(NX, BLOCK_SIZE_X), iDivUp(NY, BLOCK_SIZE_Y));
assemble_final_result<<<dimGrid, dimBlock>>>(thrust::raw_pointer_cast(d_norms_x_2.data()), thrust::raw_pointer_cast(d_norms_y_2.data()),
thrust::raw_pointer_cast(d_dots.data()), NX, NY);
for(int i = 0; i < NX * NY; i++) std::cout << d_dots[i] << "\n";
return 0;
}
The Utilities.cu and Utilities.cuh files are mantained here and omitted here. The TimingGPU.cu and TimingGPU.cuh are maintained here and are omitted as well.

Cholesky decomposition with CUDA

I am trying to implement Cholesky decomposition using the cuSOLVER library. I am a beginner CUDA programmer and I have always specified block-sizes and grid-sizes, but I am not able to find out how this can be set explicitly by the programmer with cuSOLVER functions.
Here is the documentation: http://docs.nvidia.com/cuda/cusolver/index.html#introduction
The QR decomposition is implemented using the cuSOLVER library (see the example here: http://docs.nvidia.com/cuda/cusolver/index.html#ormqr-example1) and even there the above two parameters are not set.
To summarize, I have the following questions
How can the parameters: block-size and grid-size can be set with the cuSOLVER library?
How is the same being done with the mentioned QR example code in the NVIDIA documentation?
Robert Crovella has already answered this question. Here, I'm just providing a full example showing how Cholesky decomposition can be easily performed using the potrf function provided by the cuSOLVER library.
The Utilities.cu and Utilities.cuh files are mantained at this page and omitted here. The example implements the CPU as well as the GPU approach.
#include "cuda_runtime.h"
#include "device_launch_paraMeters.h"
#include<iostream>
#include <fstream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>
#include <cusolverDn.h>
#include <cublas_v2.h>
#include <cuda_runtime_api.h>
#include "Utilities.cuh"
#define prec_save 10
/******************************************/
/* SET HERMITIAN POSITIVE DEFINITE MATRIX */
/******************************************/
// --- Credit to: https://math.stackexchange.com/questions/357980/how-to-generate-random-symmetric-positive-definite-matrices-using-matlab
void setPDMatrix(double * __restrict h_A, const int N) {
// --- Initialize random seed
srand(time(NULL));
double *h_A_temp = (double *)malloc(N * N * sizeof(double));
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
h_A_temp[i * N + j] = (float)rand() / (float)RAND_MAX;
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
h_A[i * N + j] = 0.5 * (h_A_temp[i * N + j] + h_A_temp[j * N + i]);
for (int i = 0; i < N; i++) h_A[i * N + i] = h_A[i * N + i] + N;
}
/************************************/
/* SAVE REAL ARRAY FROM CPU TO FILE */
/************************************/
template <class T>
void saveCPUrealtxt(const T * h_in, const char *filename, const int M) {
std::ofstream outfile;
outfile.open(filename);
for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
outfile.close();
}
/************************************/
/* SAVE REAL ARRAY FROM GPU TO FILE */
/************************************/
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {
T *h_in = (T *)malloc(M * sizeof(T));
gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));
std::ofstream outfile;
outfile.open(filename);
for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
outfile.close();
}
/********/
/* MAIN */
/********/
int main(){
const int N = 1000;
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle;
cusolveSafeCall(cusolverDnCreate(&solver_handle));
// --- CUBLAS initialization
cublasHandle_t cublas_handle;
cublasSafeCall(cublasCreate(&cublas_handle));
/***********************/
/* SETTING THE PROBLEM */
/***********************/
// --- Setting the host, N x N matrix
double *h_A = (double *)malloc(N * N * sizeof(double));
setPDMatrix(h_A, N);
// --- Allocate device space for the input matrix
double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * sizeof(double)));
// --- Move the relevant matrix from host to device
gpuErrchk(cudaMemcpy(d_A, h_A, N * N * sizeof(double), cudaMemcpyHostToDevice));
/****************************************/
/* COMPUTING THE CHOLESKY DECOMPOSITION */
/****************************************/
// --- cuSOLVE input/output parameters/arrays
int work_size = 0;
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
// --- CUDA CHOLESKY initialization
cusolveSafeCall(cusolverDnDpotrf_bufferSize(solver_handle, CUBLAS_FILL_MODE_LOWER, N, d_A, N, &work_size));
// --- CUDA POTRF execution
double *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));
cusolveSafeCall(cusolverDnDpotrf(solver_handle, CUBLAS_FILL_MODE_LOWER, N, d_A, N, work, work_size, devInfo));
int devInfo_h = 0; gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
if (devInfo_h != 0) std::cout << "Unsuccessful potrf execution\n\n" << "devInfo = " << devInfo_h << "\n\n";
// --- At this point, the lower triangular part of A contains the elements of L.
/***************************************/
/* CHECKING THE CHOLESKY DECOMPOSITION */
/***************************************/
saveCPUrealtxt(h_A, "D:\\Project\\solveSquareLinearSystemCholeskyCUDA\\solveSquareLinearSystemCholeskyCUDA\\h_A.txt", N * N);
saveGPUrealtxt(d_A, "D:\\Project\\solveSquareLinearSystemCholeskyCUDA\\solveSquareLinearSystemCholeskyCUDA\\d_A.txt", N * N);
cusolveSafeCall(cusolverDnDestroy(solver_handle));
return 0;
}
EDIT
Cholesky decomposition requires that the relevant matrix is Hermitian and positive definite. Symmetric and positive definite matrices can be generated by the approach in How to generate random symmetric positive definite matrices using MATLAB?.
The following Matlab code can be used for checking the results
clear all
close all
clc
warning off
N = 1000;
% --- Setting the problem solution
x = ones(N, 1);
load h_A.txt
A = reshape(h_A, N, N);
yMatlab = A * x;
Lmatlab = chol(A, 'lower');
xprime = inv(Lmatlab) * yMatlab;
xMatlab = inv(Lmatlab') * xprime;
fprintf('Percentage rms of solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));
load d_A.txt
LCUDA = tril(reshape(d_A, N, N));
fprintf('Percentage rms of Cholesky decompositions in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(Lmatlab - LCUDA).^2)) / sum(sum(abs(Lmatlab).^2))));
load xCUDA.txt
fprintf('Percentage rms of solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xCUDA - x).^2)) / sum(sum(abs(x).^2))));
You don't set block sizes (or grid sizes) explicitly when using a library like cusolver, or cublas, or cusparse.
The library chooses these, when it actually runs device code internal to the library.

Converting a dense matrix to sparse CSR format with cuSPARSE

I want to use Scsrmv cusparse function.
Reading the documentation from here, I can't figure how to define csrRowPtrA and csrColIndA:
csrRowPtrA : integer array of m+1 elements that contains the start of
every row and the end of the last row plus one.
csrColIndA : integer array of nnz ( = csrRowPtrA(m) - csrRowPtrA(0) )
column indices of the nonzero elements of matrix A.
So , for example:
float *devRow;
cudaMalloc((void **)&devRow, (m+1)*sizeof(float));
and if A is the matrix, then:
for (int i=0; i<m; i+= n) //m is rows , n is columns
devRow[i] = A[i];
This is for the start of every row. For the last row and plus 1? This confused me.
And for columns? Something like:
for (int i=0;i<nnz;i++)
devCol = devRow[m] - devRow[0];
Following Robert Crovella's answer, this is a fully worked example on how converting a sparse matrix stored in dense format into a CSR (compressed row storage) format. I hope it will be useful to other users.
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusparse_v2.h>
#include "Utilities.cuh"
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
if(CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %Ndims\Nobjs %s\nerror %Ndims: %s\nterminating!\Nobjs",__FILE__, __LINE__,err, \
_cusparseGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
const int Nrows = 4; // --- Number of rows
const int Ncols = 5; // --- Number of columns
// --- Host side dense matrix
double *h_A_dense = (double*)malloc(Nrows*Ncols*sizeof(*h_A_dense));
// --- Column-major ordering
h_A_dense[0] = 1.0f; h_A_dense[4] = 4.0f; h_A_dense[8] = 0.0f; h_A_dense[12] = 0.0f; h_A_dense[16] = 0.0f;
h_A_dense[1] = 0.0f; h_A_dense[5] = 2.0f; h_A_dense[9] = 3.0f; h_A_dense[13] = 0.0f; h_A_dense[17] = 0.0f;
h_A_dense[2] = 5.0f; h_A_dense[6] = 0.0f; h_A_dense[10] = 0.0f; h_A_dense[14] = 7.0f; h_A_dense[18] = 8.0f;
h_A_dense[3] = 0.0f; h_A_dense[7] = 0.0f; h_A_dense[11] = 9.0f; h_A_dense[15] = 0.0f; h_A_dense[19] = 6.0f;
//create device array and copy host to it
double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(*d_A_dense)));
gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));
// --- Descriptor for sparse matrix A
cusparseMatDescr_t descrA; cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSetMatType (descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase (descrA, CUSPARSE_INDEX_BASE_ZERO);
int nnz = 0; // --- Number of nonzero elements in dense matrix
const int lda = Nrows; // --- Leading dimension of dense matrix
// --- Device side number of nonzero elements per row
int *d_nnzPerVector; gpuErrchk(cudaMalloc(&d_nnzPerVector, Nrows * sizeof(*d_nnzPerVector)));
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, &nnz));
// --- Host side number of nonzero elements per row
int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(*h_nnzPerVector));
gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(*h_nnzPerVector), cudaMemcpyDeviceToHost));
printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
printf("\n");
// --- Device side dense matrix
double *d_A; gpuErrchk(cudaMalloc(&d_A, nnz * sizeof(*d_A)));
int *d_A_RowIndices; gpuErrchk(cudaMalloc(&d_A_RowIndices, (Nrows + 1) * sizeof(*d_A_RowIndices)));
int *d_A_ColIndices; gpuErrchk(cudaMalloc(&d_A_ColIndices, nnz * sizeof(*d_A_ColIndices)));
cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, d_A, d_A_RowIndices, d_A_ColIndices));
// --- Host side dense matrix
double *h_A = (double *)malloc(nnz * sizeof(*h_A));
int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(*h_A_RowIndices));
int *h_A_ColIndices = (int *)malloc(nnz * sizeof(*h_A_ColIndices));
gpuErrchk(cudaMemcpy(h_A, d_A, nnz*sizeof(*h_A), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));
for (int i = 0; i < nnz; ++i) printf("A[%i] = %.0f ", i, h_A[i]); printf("\n");
for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");
for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);
}
You can convert a dense matrix to sparse with code you write yourself. For the CSR (compressed-sparse-row) formulation, you could also use the CUSPARSE function for this.
The general format of a CSR sparse matrix representation is documented in many places, including the CUSPARSE manual.

Solving tridiagonal linear systems in CUDA

I am trying to implement a tridiagonal system solver based on the Cyclic Reduction method on my GTS450.
Cyclic Reduction is illustrated in this paper
Y. Zhang, J. Cohen, J.D. Owens, "Fast Tridiagonal Solvers on GPU"
However, whatever I do, my CUDA code is far slower than the sequential counterpart. My result for a total of 512 x 512 points is 7ms, however on my i7 3.4GHz it is 5ms. The GPU is not accelerating!
Which could be the problem?
#include "cutrid.cuh"
__global__ void cutrid_RC_1b(double *a,double *b,double *c,double *d,double *x)
{
int idx_global=blockIdx.x*blockDim.x+threadIdx.x;
int idx=threadIdx.x;
__shared__ double asub[512];
__shared__ double bsub[512];
__shared__ double csub[512];
__shared__ double dsub[512];
double at=0;
double bt=0;
double ct=0;
double dt=0;
asub[idx]=a[idx_global];
bsub[idx]=b[idx_global];
csub[idx]=c[idx_global];
dsub[idx]=d[idx_global];
for(int stride=1;stride<N;stride*=2)
{
int margin_left,margin_right;
margin_left=idx-stride;
margin_right=idx+stride;
at=(margin_left>=0)?(-csub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f;
bt=bsub[idx]+((margin_left>=0)?(-csub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f)
-((margin_right<512)?asub[idx+stride]*csub[idx]/bsub[idx+stride]:0.f);
ct=(margin_right<512)?(-csub[idx+stride]*asub[idx]/bsub[idx+stride]):0.f;
dt=dsub[idx]+((margin_left>=0)?(-dsub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f)
-((margin_right<512)?dsub[idx+stride]*csub[idx]/bsub[idx+stride]:0.f);
__syncthreads();
asub[idx]=at;
bsub[idx]=bt;
csub[idx]=ct;
dsub[idx]=dt;
__syncthreads();
}
x[idx_global]=dsub[idx]/bsub[idx];
}/*}}}*/
I launched this kernel by cutrid_RC_1b<<<512,512>>>(d_a,d_b,d_c,d_d,d_x), and reached 100% device occupancy. This result has puzzled me for days.
There is an improved version of my code:
#include "cutrid.cuh"
__global__ void cutrid_RC_1b(float *a,float *b,float *c,float *d,float *x)
{/*{{{*/
int idx_global=blockIdx.x*blockDim.x+threadIdx.x;
int idx=threadIdx.x;
__shared__ float asub[512];
__shared__ float bsub[512];
__shared__ float csub[512];
__shared__ float dsub[512];
asub[idx]=a[idx_global];
bsub[idx]=b[idx_global];
csub[idx]=c[idx_global];
dsub[idx]=d[idx_global];
__syncthreads();
//Reduction
for(int stride=1;stride<512;stride*=2)
{
int margin_left=(idx-stride);
int margin_right=(idx+stride);
if(margin_left<0) margin_left=0;
if(margin_right>=512) margin_right=511;
float tmp1 = asub[idx] / bsub[margin_left];
float tmp2 = csub[idx] / bsub[margin_right];
float tmp3 = dsub[margin_right];
float tmp4 = dsub[margin_left];
__syncthreads();
dsub[idx] = dsub[idx] - tmp4*tmp1-tmp3*tmp2;
bsub[idx] = bsub[idx]-csub[margin_left]*tmp1-asub[margin_right]*tmp2;
tmp3 = -csub[margin_right];
tmp4 = -asub[margin_left];
__syncthreads();
asub[idx] = tmp3*tmp1;
csub[idx] = tmp4*tmp2;
__syncthreads();
}
x[idx_global]=dsub[idx]/bsub[idx];
}/*}}}*/
The speed is improved to 0.73ms on a Quadro k4000 for 512 x 512 system, however the code in the mentioned paper runs in 0.5ms on a GTX280.
Solving a tridiagonal system of equations is a challenging parallel problem since the classical solution scheme, i.e., Gaussian elimination, is inherently sequential.
Cyclic Reduction consists of two phases:
Forward Reduction. The original system is split in two independent tridiagonal systems for two sets of unknowns, the ones with odd index and the ones with even index. Such systems can be solved independently and this step can be seen as the first of a divide et impera scheme. The two smaller systems are split again in the same way in two subsystems and the process is repeated until a system of only 2 equations is reached.
Backward Substitution. The system of 2 equations is solved first. Then, the divide et impera structure is climbed up by solving the sub-systems independently on different cores.
I'm not sure (but correct me if I'm wrong) that your code will return consistent results. N does not appear to be defined. Also, you are accessing csub[idx-stride], but I'm not sure what does it mean when idx==0 and stride>1. Furthermore, you are using several conditional statements, essentially for boundary checkings. Finally, your code lacks a proper thread structure capable to deal with the mentioned divide et impera scheme, conceptually pretty much like the one used in the CUDA SDK reduction samples.
As mentioned in one of my comments above, I remembered that at tridiagonalsolvers you can find an implementation of the Cyclic Reduction scheme for solving tridiagonal equation systems. Browsing the related google pages, it seems to me that the code is mantained, among others, by the first Author of the above paper (Yao Zhang). The code is copied and pasted below. Note that the boundary check is done only once (if (iRight >= systemSize) iRight = systemSize - 1;), thus limiting the number of conditional statements involved. Note also the thread structure capable to deal with a divide et impera scheme.
The code by Zhang, Cohen and Owens
__global__ void crKernel(T *d_a, T *d_b, T *d_c, T *d_d, T *d_x)
{
int thid = threadIdx.x;
int blid = blockIdx.x;
int stride = 1;
int numThreads = blockDim.x;
const unsigned int systemSize = blockDim.x * 2;
int iteration = (int)log2(T(systemSize/2));
#ifdef GPU_PRINTF
if (thid == 0 && blid == 0) printf("iteration = %d\n", iteration);
#endif
__syncthreads();
extern __shared__ char shared[];
T* a = (T*)shared;
T* b = (T*)&a[systemSize];
T* c = (T*)&b[systemSize];
T* d = (T*)&c[systemSize];
T* x = (T*)&d[systemSize];
a[thid] = d_a[thid + blid * systemSize];
a[thid + blockDim.x] = d_a[thid + blockDim.x + blid * systemSize];
b[thid] = d_b[thid + blid * systemSize];
b[thid + blockDim.x] = d_b[thid + blockDim.x + blid * systemSize];
c[thid] = d_c[thid + blid * systemSize];
c[thid + blockDim.x] = d_c[thid + blockDim.x + blid * systemSize];
d[thid] = d_d[thid + blid * systemSize];
d[thid + blockDim.x] = d_d[thid + blockDim.x + blid * systemSize];
__syncthreads();
//forward elimination
for (int j = 0; j <iteration; j++)
{
__syncthreads();
stride *= 2;
int delta = stride/2;
if (threadIdx.x < numThreads)
{
int i = stride * threadIdx.x + stride - 1;
int iLeft = i - delta;
int iRight = i + delta;
if (iRight >= systemSize) iRight = systemSize - 1;
T tmp1 = a[i] / b[iLeft];
T tmp2 = c[i] / b[iRight];
b[i] = b[i] - c[iLeft] * tmp1 - a[iRight] * tmp2;
d[i] = d[i] - d[iLeft] * tmp1 - d[iRight] * tmp2;
a[i] = -a[iLeft] * tmp1;
c[i] = -c[iRight] * tmp2;
}
numThreads /= 2;
}
if (thid < 2)
{
int addr1 = stride - 1;
int addr2 = 2 * stride - 1;
T tmp3 = b[addr2]*b[addr1]-c[addr1]*a[addr2];
x[addr1] = (b[addr2]*d[addr1]-c[addr1]*d[addr2])/tmp3;
x[addr2] = (d[addr2]*b[addr1]-d[addr1]*a[addr2])/tmp3;
}
// backward substitution
numThreads = 2;
for (int j = 0; j <iteration; j++)
{
int delta = stride/2;
__syncthreads();
if (thid < numThreads)
{
int i = stride * thid + stride/2 - 1;
if(i == delta - 1)
x[i] = (d[i] - c[i]*x[i+delta])/b[i];
else
x[i] = (d[i] - a[i]*x[i-delta] - c[i]*x[i+delta])/b[i];
}
stride /= 2;
numThreads *= 2;
}
__syncthreads();
d_x[thid + blid * systemSize] = x[thid];
d_x[thid + blockDim.x + blid * systemSize] = x[thid + blockDim.x];
}
I want to add a further answer to mention that tridiagonal systems can be easily solved in the framework of the cuSPARSE library by aid of the function
cusparse<t>gtsv()
cuSPARSE also provides
cusparse<t>gtsv_nopivot()
which, at variance with the first mentioned routine, does not perform pivoting. Both the above functions solve the same linear system with multiple right hand sides. A batched routine
cusparse<t>gtsvStridedBatch()
also exists which solves multiple linear systems.
For all the above routines, the system matrix is fixed by simply specifying the lower diagonal, the main diagonal and the upper diagonal.
Below, I'm reporting a fully worked out example using cusparse<t>gtsv() to solve a tridiagonal linear system.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusparse_v2.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
if(CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %Ndims\Nobjs %s\nerror %Ndims: %s\nterminating!\Nobjs",__FILE__, __LINE__,err, \
_cusparseGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
const int N = 5; // --- Size of the linear system
// --- Lower diagonal, diagonal and upper diagonal of the system matrix
double *h_ld = (double*)malloc(N * sizeof(double));
double *h_d = (double*)malloc(N * sizeof(double));
double *h_ud = (double*)malloc(N * sizeof(double));
h_ld[0] = 0.;
h_ud[N-1] = 0.;
for (int k = 0; k < N - 1; k++) {
h_ld[k + 1] = -1.;
h_ud[k] = -1.;
}
for (int k = 0; k < N; k++) h_d[k] = 2.;
double *d_ld; gpuErrchk(cudaMalloc(&d_ld, N * sizeof(double)));
double *d_d; gpuErrchk(cudaMalloc(&d_d, N * sizeof(double)));
double *d_ud; gpuErrchk(cudaMalloc(&d_ud, N * sizeof(double)));
gpuErrchk(cudaMemcpy(d_ld, h_ld, N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_d, h_d, N * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_ud, h_ud, N * sizeof(double), cudaMemcpyHostToDevice));
// --- Allocating and defining dense host and device data vectors
double *h_x = (double *)malloc(N * sizeof(double));
h_x[0] = 100.0; h_x[1] = 200.0; h_x[2] = 400.0; h_x[3] = 500.0; h_x[4] = 300.0;
double *d_x; gpuErrchk(cudaMalloc(&d_x, N * sizeof(double)));
gpuErrchk(cudaMemcpy(d_x, h_x, N * sizeof(double), cudaMemcpyHostToDevice));
// --- Allocating the host and device side result vector
double *h_y = (double *)malloc(N * sizeof(double));
double *d_y; gpuErrchk(cudaMalloc(&d_y, N * sizeof(double)));
cusparseSafeCall(cusparseDgtsv(handle, N, 1, d_ld, d_d, d_ud, d_x, N));
cudaMemcpy(h_x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost);
for (int k=0; k<N; k++) printf("%f\n", h_x[k]);
}
At this gitHub repository, a comparison of different CUDA routines available in the cuSOLVER library for the solution of tridiagonal linear systems is reported.
Things I see:
1st __syncthreads() seems redundant.
There are repetitive sets of operations such as (-csub[idx-stride]*asub[idx]/bsub[idx-stride]) in your code. Use intermediate variables to hold the result and reuse them instead of making GPU calculate those sets each time.
Use NVIDIA profiler to see where issues are.

Solving 2d diffusion (heat) equation with CUDA

I am learning CUDA with trying to solve some standard problems. As a example, I am solving the diffusion equation in two dimensions with the following code. But my results are different than the standard results and I am not able to figure that out.
//kernel definition
__global__ void diffusionSolver(double* A, double * old,int n_x,int n_y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if(i*(n_x-i-1)*j*(n_y-j-1)!=0)
A[i+n_y*j] = A[i+n_y*j] + (old[i-1+n_y*j]+old[i+1+n_y*j]+
old[i+(j-1)*n_y]+old[i+(j+1)*n_y] -4*old[i+n_y*j])/40;
}
int main()
{
int i,j ,M;
M = n_y ;
phi = (double *) malloc( n_x*n_y* sizeof(double));
phi_old = (double *) malloc( n_x*n_y* sizeof(double));
dummy = (double *) malloc( n_x*n_y* sizeof(double));
int iterationMax =10;
//phase initialization
for(j=0;j<n_y ;j++)
{
for(i=0;i<n_x;i++)
{
if((.4*n_x-i)*(.6*n_x-i)<0)
phi[i+M*j] = -1;
else
phi[i+M*j] = 1;
phi_old[i+M*j] = phi[i+M*j];
}
}
double *dev_phi;
cudaMalloc((void **) &dev_phi, n_x*n_y*sizeof(double));
dim3 threadsPerBlock(100,10);
dim3 numBlocks(n_x*n_y / threadsPerBlock.x, n_x*n_y / threadsPerBlock.y);
//start iterating
for(int z=0; z<iterationMax; z++)
{
//copy array on host to device
cudaMemcpy(dev_phi, phi, n_x*n_y*sizeof(double),
cudaMemcpyHostToDevice);
//call kernel
diffusionSolver<<<numBlocks, threadsPerBlock>>>(dev_phi, phi_old,n_x,n_y);
//get updated array back on host
cudaMemcpy(phi, dev_phi,n_x*n_y*sizeof(double), cudaMemcpyDeviceToHost);
//old values will be assigned new values
for(j=0;j<n_y ;j++)
{
for(i=0;i<n_x;i++)
{
phi_old[i+n_y*j] = phi[i+n_y*j];
}
}
}
return 0;
}
Can someone tell me if there is anything wrong in this process? Any help will be greatly appreciated.
talonmies, brano and huseyin have already pointed out some mistakes of your code.
Diffusion (heat) equation is one of the classical example of partial differential equations solvable with CUDA. There is also a thorough example in Chapter 7 of the CUDA by Example book.
As a reference to future Users, I'm providing below a full worked example including both, CPU and GPU codes. Instead of swapping pointers, as suggested by talonmies, I'm just condensing two Jacobi iterations in a single loop.
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "Utilities.cuh"
#define BLOCK_SIZE_X 16
#define BLOCK_SIZE_Y 16
/***********************************/
/* JACOBI ITERATION FUNCTION - GPU */
/***********************************/
__global__ void Jacobi_Iterator_GPU(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY)
{
const int i = blockIdx.x * blockDim.x + threadIdx.x ;
const int j = blockIdx.y * blockDim.y + threadIdx.y ;
// N
int P = i + j*NX; // node (i,j) |
int N = i + (j+1)*NX; // node (i,j+1) |
int S = i + (j-1)*NX; // node (i,j-1) W ---- P ---- E
int E = (i+1) + j*NX; // node (i+1,j) |
int W = (i-1) + j*NX; // node (i-1,j) |
// S
// --- Only update "interior" (not boundary) node points
if (i>0 && i<NX-1 && j>0 && j<NY-1) T_new[P] = 0.25 * (T_old[E] + T_old[W] + T_old[N] + T_old[S]);
}
/***********************************/
/* JACOBI ITERATION FUNCTION - CPU */
/***********************************/
void Jacobi_Iterator_CPU(float * __restrict T, float * __restrict T_new, const int NX, const int NY, const int MAX_ITER)
{
for(int iter=0; iter<MAX_ITER; iter=iter+2)
{
// --- Only update "interior" (not boundary) node points
for(int j=1; j<NY-1; j++)
for(int i=1; i<NX-1; i++) {
float T_E = T[(i+1) + NX*j];
float T_W = T[(i-1) + NX*j];
float T_N = T[i + NX*(j+1)];
float T_S = T[i + NX*(j-1)];
T_new[i+NX*j] = 0.25*(T_E + T_W + T_N + T_S);
}
for(int j=1; j<NY-1; j++)
for(int i=1; i<NX-1; i++) {
float T_E = T_new[(i+1) + NX*j];
float T_W = T_new[(i-1) + NX*j];
float T_N = T_new[i + NX*(j+1)];
float T_S = T_new[i + NX*(j-1)];
T[i+NX*j] = 0.25*(T_E + T_W + T_N + T_S);
}
}
}
/******************************/
/* TEMPERATURE INITIALIZATION */
/******************************/
void Initialize(float * __restrict h_T, const int NX, const int NY)
{
// --- Set left wall to 1
for(int j=0; j<NY; j++) h_T[j * NX] = 1.0;
}
/********/
/* MAIN */
/********/
int main()
{
const int NX = 256; // --- Number of discretization points along the x axis
const int NY = 256; // --- Number of discretization points along the y axis
const int MAX_ITER = 1; // --- Number of Jacobi iterations
// --- CPU temperature distributions
float *h_T = (float *)calloc(NX * NY, sizeof(float));
float *h_T_old = (float *)calloc(NX * NY, sizeof(float));
Initialize(h_T, NX, NY);
Initialize(h_T_old, NX, NY);
float *h_T_GPU_result = (float *)malloc(NX * NY * sizeof(float));
// --- GPU temperature distribution
float *d_T; gpuErrchk(cudaMalloc((void**)&d_T, NX * NY * sizeof(float)));
float *d_T_old; gpuErrchk(cudaMalloc((void**)&d_T_old, NX * NY * sizeof(float)));
gpuErrchk(cudaMemcpy(d_T, h_T, NX * NY * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_T_old, d_T, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice));
// --- Grid size
dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y);
dim3 dimGrid (iDivUp(NX, BLOCK_SIZE_X), iDivUp(NY, BLOCK_SIZE_Y));
// --- Jacobi iterations on the host
Jacobi_Iterator_CPU(h_T, h_T_old, NX, NY, MAX_ITER);
// --- Jacobi iterations on the device
for (int k=0; k<MAX_ITER; k=k+2) {
Jacobi_Iterator_GPU<<<dimGrid, dimBlock>>>(d_T, d_T_old, NX, NY); // --- Update d_T_old starting from data stored in d_T
Jacobi_Iterator_GPU<<<dimGrid, dimBlock>>>(d_T_old, d_T , NX, NY); // --- Update d_T starting from data stored in d_T_old
}
// --- Copy result from device to host
gpuErrchk(cudaMemcpy(h_T_GPU_result, d_T, NX * NY * sizeof(float), cudaMemcpyDeviceToHost));
// --- Calculate percentage root mean square error between host and device results
float sum = 0., sum_ref = 0.;
for (int j=0; j<NY; j++)
for (int i=0; i<NX; i++) {
sum = sum + (h_T_GPU_result[j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_result[j * NX + i] - h_T[j * NX + i]);
sum_ref = sum_ref + h_T[j * NX + i] * h_T[j * NX + i];
}
printf("Percentage root mean square error = %f\n", 100.*sqrt(sum / sum_ref));
// --- Release host memory
free(h_T);
free(h_T_GPU_result);
// --- Release device memory
gpuErrchk(cudaFree(d_T));
gpuErrchk(cudaFree(d_T_old));
return 0;
}
The Utilities.cu and Utilities.cuh files needed to run such an example are maintained at this github page.
One big mistake you have is that phi_old is passed to the kernel and used by the kernel but this is a host pointer.
Malloc a dev_phi_old using cudaMalloc. Set it to default value and copy it to the GPU first time before entering the z loop.
Here:
A[i+n_y*j] = A[i+n_y*j] + (old[i-1+n_y*j]+old[i+1+n_y*j]+old[i+(j-1)*n_y]+old[i+(j+1)*n_y] -4*old[i+n_y*j])/40;
You are dividing by 40(integer) which can result in wrong diffusing rate. Actually can result in none-diffusing.
But A is an array of doubles.
Divide the diffuse rate by 40.0 and see if it works.
If this is from Jos-Stam's solver, it should be 4.0 not 40
Theres also another thing:
-4*old[i+n_y*j])/40;
Here you are multiplying with 4(integer). This can cause a integral-casting too!
This:
-4.0*old[i+n_y*j])/40.0;
decreases some errors.
Have a nice day.