Covariance calculation with CUDA - cuda

I am implementing Principal Component Analysis (PCA) based face recognition using CUDA. I used orl face database and calculated the mean image and normalized images. I'm facing a problem in calculating the covariance matrix.
// Per-pixel mean and absolute deviation over a stack of images.
//
// i_data     : `num` image planes stored back to back, each WIDTH*HEIGHT ints;
//              pixel (x, y) of plane z lives at z*WIDTH*HEIGHT + y*WIDTH + x.
// num        : number of image planes.
// size       : unused by the kernel (kept for interface compatibility).
// o_data     : output plane (WIDTH*HEIGHT ints) receiving the truncated
//              integer mean of every pixel.
// normalized : output stack with the same layout as i_data, receiving
//              |pixel - mean| for every image.
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel. The grid may
// overshoot the image (it is rounded up on the host), so threads outside
// WIDTH x HEIGHT exit immediately.
//
// NOTE(review): abs() discards the sign of the deviation. A covariance
// computation normally needs the signed difference - confirm this is intended.
__global__ void mean(int* i_data, int num, int size, int* o_data, int WIDTH, int HEIGHT, int* normalized)
{
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard the grid tail (and avoid dividing by zero when there are no images).
    if (x >= WIDTH || y >= HEIGHT || num <= 0)
        return;

    const int plane = WIDTH * HEIGHT;   // elements per image
    const int idx = x + y * WIDTH;      // pixel offset inside one image

    // Sum this pixel over all images.
    int r = 0;
    for (int z = 0; z < num; ++z)
        r += i_data[z * plane + idx];

    const int avg = r / num;            // integer (truncating) mean
    o_data[idx] = avg;

    // Absolute deviation of every image from the mean.
    for (int z = 0; z < num; ++z)
    {
        const int idx_z = z * plane + idx;
        normalized[idx_z] = abs(i_data[idx_z] - avg);
    }
}
dim3 dimBlock = dim3(8,4,1);
// Integer ceil-division: ceil(rows/dimBlock.x) truncated BEFORE ceil() ran, so the
// trailing partial block (e.g. the last 4 of 92 columns) was never launched.
dim3 dimGrid = dim3((rows + dimBlock.x - 1) / dimBlock.x, (cols + dimBlock.y - 1) / dimBlock.y);
mean<<<dimGrid,dimBlock>>>(dev_images, IMAGE_NUM,size,dev_output,rows,cols,dev_normalized);
The database images are of size (92,112).

Your code does not make any sense to me.
Covariance calculation in CUDA can be easily performed by using cuBLAS in conjunction with Thrust. Considering N realizations of K random variables, the covariance estimation formula is the following
$$q_{jk} = \frac{1}{N-1}\sum_{i=1}^{N}\left(X_{ij}-\bar{X}_j\right)\left(X_{ik}-\bar{X}_k\right)$$
where $q_{jk}$, $j,k=1,\ldots,K$, are the covariance estimate values, $X_{ij}$ is the i-th realization of the j-th random variable, and $\bar{X}_j$ and $\bar{X}_k$ are the random variable means as estimated from the available realizations.
Below, I'm reporting a fully worked example:
#include <cublas_v2.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/random.h>
#include <thrust/sequence.h>
#include <stdio.h>
#include <iostream>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
/*************************************/
/* CONVERT LINEAR INDEX TO ROW INDEX */
/*************************************/
// Unary functor that maps a linear element index i to the integer quotient
// i / Ncols, where Ncols is the divisor supplied at construction.
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T> {
    T Ncols; // --- Divisor applied to every linear index

    __host__ __device__ linear_index_to_row_index(T n_cols) : Ncols(n_cols) {}

    __host__ __device__ T operator()(T i) {
        const T quotient = i / Ncols;
        return quotient;
    }
};
/********/
/* MAIN */
/********/
// Worked example: sample covariance matrix of NX random variables from
// Nsamples realizations, using cuBLAS for the reductions/products and Thrust
// for the element-wise steps. X is stored column-major as an Nsamples x NX
// matrix (leading dimension Nsamples), i.e. one column per random variable.
int main()
{
    const int Nsamples = 3; // --- Number of realizations for each random variable (number of rows of the X matrix)
    const int NX = 4;       // --- Number of random variables (number of columns of the X matrix)

    // --- Random uniform integer distribution between 10 and 99
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist(10, 99);

    // --- Matrix allocation and initialization.
    //     Fill on the host and upload once: writing d_X[i] element by element
    //     issues one host-to-device copy per element.
    thrust::host_vector<float> h_X(Nsamples * NX);
    for (size_t i = 0; i < h_X.size(); i++) h_X[i] = (float)dist(rng);
    thrust::device_vector<float> d_X = h_X;

    // --- cuBLAS handle creation
    cublasHandle_t handle;
    cublasSafeCall(cublasCreate(&handle));

    /*************************************************/
    /* CALCULATING THE MEANS OF THE RANDOM VARIABLES */
    /*************************************************/
    // --- Array containing the means: d_means = (1 / Nsamples) * X^T * ones
    thrust::device_vector<float> d_means(NX);
    thrust::device_vector<float> d_ones(Nsamples, 1.f);

    float alpha = 1.f / (float)Nsamples;
    float beta = 0.f;
    cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_T, Nsamples, NX, &alpha, thrust::raw_pointer_cast(d_X.data()), Nsamples,
                               thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_means.data()), 1));

    /**********************************************/
    /* SUBTRACTING THE MEANS FROM THE MATRIX ROWS */
    /**********************************************/
    // --- Linear index / Nsamples selects the mean of the column the element belongs to
    thrust::transform(
        d_X.begin(), d_X.end(),
        thrust::make_permutation_iterator(
            d_means.begin(),
            thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nsamples))),
        d_X.begin(),
        thrust::minus<float>());

    /*************************************/
    /* CALCULATING THE COVARIANCE MATRIX */
    /*************************************/
    // --- d_cov = X^T * X on the centered data (NX x NX)
    thrust::device_vector<float> d_cov(NX * NX);
    alpha = 1.f;
    cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, NX, NX, Nsamples, &alpha,
                               thrust::raw_pointer_cast(d_X.data()), Nsamples, thrust::raw_pointer_cast(d_X.data()), Nsamples, &beta,
                               thrust::raw_pointer_cast(d_cov.data()), NX));

    // --- Final normalization by Nsamples - 1
    thrust::transform(
        d_cov.begin(), d_cov.end(),
        thrust::make_constant_iterator((float)(Nsamples-1)),
        d_cov.begin(),
        thrust::divides<float>());

    // --- Download the result once instead of one device read per printed element
    thrust::host_vector<float> h_cov = d_cov;
    for(int i = 0; i < NX * NX; i++) std::cout << h_cov[i] << "\n";

    // --- Release the cuBLAS context (it was previously leaked)
    cublasSafeCall(cublasDestroy(handle));

    return 0;
}

I implemented a covariance calculator with cuBLAS and CUDA Thrust and compared its output with online covariance calculation tools; it seems to produce correct results. The code below is intended for a QDA Bayes classifier, so the given matrix may contain samples from more than one class, and therefore one covariance matrix is calculated per class. I hope it will be useful to someone.
//! Calculates one covariance matrix per class from the given data and prints each of them.
/*!
\param inMatrix   Samples of all classes, one class after another. Every trial (sample)
                  occupies dimensionSize consecutive floats, so each class block is a
                  column-major (dimensionSize x trials) matrix: columns are trials,
                  rows are features.
                  For example, inMatrix=[1 2 3 4 5 6] with trialSizes=[2] is the matrix
                  |1 4 |
                  |2 5 |
                  |3 6 | -> 2 trials, 3 features.
\param trialSizes Number of trials (samples) of each class, in the order the classes
                  appear in inMatrix.
                  For example, inMatrix=[1 2 3 4 5 6 7 8 9 10 11 12] with trialSizes=[2,2] is
                  |1 4 | |7 10 |
                  |2 5 | |8 11 |
                  |3 6 | |9 12 | -> total number of trials 2 + 2 = 4,
                  so the feature vector size is inMatrix.size() / 4 = 3,
                  and each of the two classes has two trials.

For every class the covariance is computed as
    (X * transpose(X)) / N - (s * transpose(s)) / (N * N)
where X is the class block, N its number of trials and s the per-feature sum over the
trials (i.e. the division is by N, not N - 1).
Nothing is done when the input is empty; classes with a non-positive trial count are skipped.
*/
void multiQDACovianceCalculator(std::vector<float>& inMatrix, std::vector<int>& trialSizes)
{
    // Guard against empty input: the code below would otherwise divide by zero
    // and dereference max_element() of an empty range.
    if (inMatrix.empty() || trialSizes.empty())
        return;

    const int rowSize = std::accumulate(trialSizes.begin(), trialSizes.end(), 0);
    if (rowSize <= 0)
        return;

    const int dimensionSize = inMatrix.size() / rowSize;
    if (dimensionSize <= 0)
        return;

    // Scaling factors are kept in separate constants. The original reused one
    // 'beta' variable, set it to -1 at the end of the first iteration and never
    // reset it, so every class after the first was accumulated into stale data.
    const float zero = 0.0f;
    const float one = 1.0f;
    const float minusOne = -1.0f;

    thrust::device_vector<float> d_cov1(dimensionSize * dimensionSize);      // X * X^T / N
    thrust::device_vector<float> d_cov2(dimensionSize * dimensionSize);      // s * s^T / N^2
    thrust::device_vector<float> d_covResult(dimensionSize * dimensionSize); // d_cov1 - d_cov2
    thrust::device_vector<float> d_wholeMatrix(inMatrix);
    thrust::device_vector<float> d_meansVec(dimensionSize);                  // per-feature sums over the trials

    float *meanVecPtr = thrust::raw_pointer_cast(d_meansVec.data());
    float *device2DMatrixPtr = thrust::raw_pointer_cast(d_wholeMatrix.data());

    // Vector of ones, long enough for the largest class, used to sum over trials.
    const int maxTrialNumber = *std::max_element(trialSizes.begin(), trialSizes.end());
    if (maxTrialNumber <= 0)
        return;
    thrust::device_vector<float> d_ones(maxTrialNumber, 1.0f);

    cublasHandle_t handle; // CUBLAS context
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
        return;

    // One covariance matrix is calculated per iteration
    for (size_t i = 0; i < trialSizes.size(); i++)
    {
        const int trials = trialSizes[i];
        if (trials <= 0)
            continue; // an empty class has no covariance and occupies no data

        // X * transpose(X) / N
        const float invN = 1.0f / trials;
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, dimensionSize, dimensionSize, trials, &invN,
                    device2DMatrixPtr, dimensionSize, device2DMatrixPtr, dimensionSize, &zero,
                    thrust::raw_pointer_cast(d_cov1.data()), dimensionSize);

        // Per-feature sum over the trials: s = X * ones
        cublasSgemv(handle, CUBLAS_OP_N, dimensionSize, trials, &one, device2DMatrixPtr,
                    dimensionSize, thrust::raw_pointer_cast(d_ones.data()), 1, &zero, meanVecPtr, 1);

        // s * transpose(s) / (N * N); the product is formed in float to avoid int overflow
        const float invN2 = 1.0f / ((float)trials * (float)trials);
        cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, dimensionSize, dimensionSize, 1, &invN2,
                    meanVecPtr, 1, meanVecPtr, 1, &zero,
                    thrust::raw_pointer_cast(d_cov2.data()), dimensionSize);

        // (X * transpose(X) / N) - (s * transpose(s) / (N * N))
        cublasSgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, dimensionSize, dimensionSize, &one,
                    thrust::raw_pointer_cast(d_cov1.data()), dimensionSize, &minusOne, thrust::raw_pointer_cast(d_cov2.data()),
                    dimensionSize, thrust::raw_pointer_cast(d_covResult.data()), dimensionSize);

        // Print this class's matrix before d_covResult is overwritten by the next class
        printVector(d_covResult);

        // Go to the next class
        device2DMatrixPtr += trials * dimensionSize;
    }

    cublasDestroy(handle);
}

Related

CUBLAS Sgemm confusing results

For two matrices X and Q of size 4x3 and 2x3
which in memory look like
x = [0 1 2 3 4 5 6 7 8 9 10 11]
q = [3 4 5 6 7 8]
I tried to use cublas multiplication cublasSgemm, but I couldn't manage to get expected results.
Since they are stored in row-major order, cuBLAS (which is column-major) should see them as 3x4 and 3x2 matrices, so it seemed to me that
// cuBLAS argument order: handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc
cublasSgemm(cublas_handle,
CUBLAS_OP_T, CUBLAS_OP_N, // transa, transb
q_rows_num, x_rows_num, dim, // m, n, k
&alpha, // 1
q_device, q_rows_num, // A, lda
x, x_rows_num, // B, ldb
&beta, // 0
x_q_multiplication, q_rows_num); // C, ldc
where
dim = 3
x_rows_num = 4
q_rows_num = 2
would work but in that case I got error
** On entry to SGEMM parameter number 8 had an illegal value
I also tried shuffling parameters a bit but I couldn't find any setup that would work.
So is it possible to multiply them without changing to column-major order?
EDIT:
So I got the expected results with the changes made in this working example:
#include <cublas_v2.h>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
// Multiplies X (x_rows_num x dim) by the transpose of Q (q_rows_num x dim), both
// stored row-major, with a single column-major cuBLAS SGEMM. A row-major
// (rows x dim) array is read by cuBLAS as a column-major (dim x rows) matrix
// with leading dimension dim, hence OP_T on the first operand and lda = ldb = dim.
int main()
{
    int x_rows_num = 4;
    int q_rows_num = 2;
    int dim = 3;
    int N = x_rows_num*dim;
    int M = q_rows_num*dim;

    float *x, *q, *x_q_multiplication;
    // Sizes are in BYTES. The result holds x_rows_num * q_rows_num floats; the
    // original passed an element count (without sizeof), under-allocating it.
    if (cudaMallocManaged(&x, N*sizeof(float)) != cudaSuccess ||
        cudaMallocManaged(&q, M*sizeof(float)) != cudaSuccess ||
        cudaMallocManaged(&x_q_multiplication, x_rows_num*q_rows_num*sizeof(float)) != cudaSuccess)
    {
        std::cerr << "cudaMallocManaged failed" << std::endl;
        return 1;
    }

    for (int i = 0; i< N; i++) x[i] = i*1.0f;
    for (int i = 0; i< M; i++) q[i] = (i + 3)*1.0f;

    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
    {
        std::cerr << "cublasCreate failed" << std::endl;
        return 1;
    }

    float alpha = 1.f;
    float beta = 0.f;
    // C (x_rows_num x q_rows_num, column-major) = X * Q^T
    cublasStatus_t status = cublasSgemm(handle,
        CUBLAS_OP_T, CUBLAS_OP_N,
        x_rows_num, q_rows_num, dim,
        &alpha,
        x, dim,
        q, dim,
        &beta,
        x_q_multiplication, x_rows_num);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        std::cerr << "cublasSgemm failed" << std::endl;
        return 1;
    }

    // Managed memory must not be touched by the host until the GEMM has finished
    cudaDeviceSynchronize();
    for (int i = 0; i < q_rows_num*x_rows_num; i++) std::cout << x_q_multiplication[i] << " ";

    cublasDestroy(handle);
    cudaFree(x);
    cudaFree(q);
    cudaFree(x_q_multiplication);
    return 0;
}
However, I'm still not sure why dim became the leading dimension.
Your original CUBLAS call:
// cuBLAS argument order: handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc
cublasSgemm(cublas_handle,
CUBLAS_OP_T, CUBLAS_OP_N, // transa, transb
q_rows_num, x_rows_num, dim, // m, n, k
&alpha, // 1
q_device, q_rows_num, // A, lda
x, x_rows_num, // B, ldb
&beta, // 0
x_q_multiplication, q_rows_num); // C, ldc
was close to correct. Your interpretation of what the leading dimensions should be was correct. What you got wrong was the Op specifiers. If both matrices are row major ordered and the first array needs to be read in its (row major) transposed order, then the operation should be:
#include <cublas_v2.h>
#include <cstring>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
// Runs the SGEMM from the question with the Op specifiers swapped
// (OP_N on the first operand, OP_T on the second) and prints the
// q_rows_num x x_rows_num result in the order it is stored.
int main()
{
    int x_rows_num = 4;
    int q_rows_num = 2;
    int dim = 3;
    int N = x_rows_num*dim;
    int M = q_rows_num*dim;

    float x0[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
    float q0[6] = {3, 4, 5, 6, 7, 8 };

    float *x, *q, *x_q_multiplication, *q_device;
    // Sizes are in BYTES. The result holds q_rows_num * x_rows_num floats; the
    // original passed an element count (without sizeof), under-allocating it.
    if (cudaMallocManaged(&x, N*sizeof(float)) != cudaSuccess ||
        cudaMallocManaged(&q, M*sizeof(float)) != cudaSuccess ||
        cudaMallocManaged(&x_q_multiplication, q_rows_num*x_rows_num*sizeof(float)) != cudaSuccess ||
        cudaMallocManaged(&q_device, M*sizeof(float)) != cudaSuccess)
    {
        std::cerr << "cudaMallocManaged failed" << std::endl;
        return 1;
    }

    std::memcpy(x, x0, N*sizeof(float));
    std::memcpy(q, q0, M*sizeof(float));
    cudaMemcpy(q_device, q, M*sizeof(float), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS)
    {
        std::cerr << "cublasCreate failed" << std::endl;
        return 1;
    }

    float alpha = 1.f;
    float beta = 0.f;
    cublasStatus_t status = cublasSgemm(handle,
        CUBLAS_OP_N, CUBLAS_OP_T,
        q_rows_num, x_rows_num, dim,
        &alpha, // 1
        q_device, q_rows_num,
        x, x_rows_num,
        &beta, // 0
        x_q_multiplication, q_rows_num);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        std::cerr << "cublasSgemm failed" << std::endl;
        return 1;
    }

    // Managed memory must not be touched by the host until the GEMM has finished
    cudaDeviceSynchronize();
    for (int i = 0; i < q_rows_num*x_rows_num; i++) std::cout << x_q_multiplication[i] << " "; std::cout << std::endl;

    cublasDestroy(handle);
    cudaFree(x);
    cudaFree(q);
    cudaFree(q_device); // was allocated but never released
    cudaFree(x_q_multiplication);
    return 0;
}
which does this for me:
$ nvcc -arch=sm_52 cublas_trans.cu -o cublas_trans -lcublas
$ ./cublas_trans
76 88 91 106 106 124 121 142
and which I believe is the correct answer.
Incidentally, Robert Crovella's now deleted comment, which you say you take offense to was 100% correct. I suspect he read, as I did, your original CUBLAS call, interpreted the arguments and concluded, as I did, and as CUBLAS itself did, that you are trying to multiply a 3x4 matrix and a 3x2 matrix. Which is why the invalid argument error was raised.

Solving dense linear systems AX = B with CUDA

Can I use the new cuSOLVER library (CUDA 7) to solve linear systems of the form
AX = B
where A, X and B are NxN dense matrices ?
Yes.
Approach nr. 1
In the framework of cuSOLVER you can use QR decomposition, see QR decomposition to solve linear systems in CUDA.
Approach nr. 2
Alternatively, you can calculate the matrix inverse by the successive invocation of
cublas<t>getrfBatched()
which calculates the LU decomposition of a matrix, and
cublas<t>getriBatched()
which calculates the inverse of the matrix starting from its LU decomposition.
Approach nr. 3
A final possibility is using
cublas<t>getrfBatched()
followed by a twofold invocation of
cublas<t>trsm()
which solves upper or lower triangular linear systems.
As pointed out by Robert Crovella, the answer may vary on the size and the type of the involved matrices.
Code for approach nr. 1
Please, see QR decomposition to solve linear systems in CUDA.
Code for approaches nr. 2 and nr. 3
Below, I'm reporting a worked example for the implementation of approaches nr. 2 and 3. Hankel matrices are used to feed the approaches with well-conditioned, invertible matrices. Please, note that approach nr. 3 requires permuting (rearranging) the system coefficients vector according to the pivot array obtained following the invocation of cublas<t>getrfBatched(). This permutation can be conveniently done on the CPU.
#include <stdio.h>
#include <fstream>
#include <iomanip>
#include <stdlib.h> /* srand, rand */
#include <time.h> /* time */
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define prec_save 10
#define BLOCKSIZE 256
#define BLOCKSIZEX 16
#define BLOCKSIZEY 16
/************************************/
/* SAVE REAL ARRAY FROM CPU TO FILE */
/************************************/
// Writes the M elements of the host array h_in to the text file `filename`,
// one value per line, using prec_save significant digits.
template <class T>
void saveCPUrealtxt(const T * h_in, const char *filename, const int M) {
    std::ofstream out(filename);
    // The stream precision is sticky, so setting it once covers every value.
    out << std::setprecision(prec_save);
    int idx = 0;
    while (idx < M) {
        out << h_in[idx] << "\n";
        ++idx;
    }
    out.close();
}
/************************************/
/* SAVE REAL ARRAY FROM GPU TO FILE */
/************************************/
// Copies the M elements of the device array d_in to a temporary host buffer and
// writes them to the text file `filename`, one value per line, using prec_save
// significant digits. The temporary buffer is released before returning.
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {
    T *h_in = (T *)malloc(M * sizeof(T));
    if (h_in == NULL) {
        fprintf(stderr, "saveGPUrealtxt: host allocation failed for %s\n", filename);
        return;
    }
    gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));
    std::ofstream outfile;
    outfile.open(filename);
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
    outfile.close();
    free(h_in); // the staging buffer was previously leaked on every call
}
/***************************************************/
/* FUNCTION TO SET THE VALUES OF THE HANKEL MATRIX */
/***************************************************/
// --- https://en.wikipedia.org/wiki/Hankel_matrix
// Fills the N x N matrix h_A with a random Hankel matrix: entry (row, col)
// depends only on row + col, so every anti-diagonal holds a single value.
// The generator is re-seeded from the current time on every call.
void setHankelMatrix(double * __restrict h_A, const int N) {
    // --- One random value per anti-diagonal (there are 2N - 1 of them)
    double *diagValues = (double *)malloc((2 * N - 1) * sizeof(double));

    // --- Initialize random seed
    srand(time(NULL));

    // --- Generate random numbers
    int d = 0;
    while (d < 2 * N - 1) {
        diagValues[d] = rand();
        ++d;
    }

    // --- Fill the Hankel matrix. It is symmetric, so filling by row or by column is equivalent.
    for (int row = 0; row < N; row++) {
        double *rowPtr = h_A + row * N;
        for (int col = 0; col < N; col++)
            rowPtr[col] = diagValues[row + col];
    }

    free(diagValues);
}
/***********************************************/
/* FUNCTION TO COMPUTE THE COEFFICIENTS VECTOR */
/***********************************************/
// Computes h_y[m] = sum over n of h_A[n * N + m] * h_xref[n] for m = 0..N-1,
// i.e. the product of the N x N matrix h_A (element (m, n) stored at n * N + m)
// with the vector h_xref.
void computeCoefficientsVector(const double * __restrict h_A, const double * __restrict h_xref,
                               double * __restrict h_y, const int N) {
    for (int row = 0; row < N; row++) {
        double acc = 0.f;
        for (int col = 0; col < N; col++)
            acc = acc + h_A[col * N + row] * h_xref[col];
        h_y[row] = acc;
    }
}
/************************************/
/* COEFFICIENT REARRANGING FUNCTION */
/************************************/
// Applies the row interchanges recorded in pivotArray to vec, in order:
// for i = 0..N-1, element i is swapped with element pivotArray[i] - 1
// (the pivot entries are 1-based).
void rearrange(double *vec, int *pivotArray, int N){
    int i = 0;
    while (i < N) {
        const int target = pivotArray[i] - 1; // convert the 1-based pivot to a 0-based index
        const double displaced = vec[target];
        vec[target] = vec[i];
        vec[i] = displaced;
        ++i;
    }
}
/********/
/* MAIN */
/********/
// Solves the dense N x N system A * x = y on the GPU in two ways that share one
// batched LU factorization (cublasDgetrfBatched):
//   approach 1 - invert A from its LU factors (cublasDgetriBatched) and multiply by y;
//   approach 2 - permute y on the host with the pivot array, then solve the two
//                triangular systems with cublasDtrsm.
// Inputs, factors and both solutions are dumped to text files for offline checking.
int main() {

    const unsigned int N = 1000;

    const unsigned int Nmatrices = 1;

    // --- CUBLAS initialization
    cublasHandle_t cublas_handle;
    cublasSafeCall(cublasCreate(&cublas_handle));

    TimingGPU timerLU, timerApproach1, timerApproach2;
    double timingLU, timingApproach1, timingApproach2;

    /***********************/
    /* SETTING THE PROBLEM */
    /***********************/
    // --- Host buffers: matrix to be inverted, reference solution, coefficient vector, result
    //     (only one system in this example)
    double *h_A    = (double *)malloc(N * N * Nmatrices * sizeof(double));
    double *h_xref = (double *)malloc(N * sizeof(double));
    double *h_y    = (double *)malloc(N * sizeof(double));
    double *h_x    = (double *)malloc(N * sizeof(double));
    if (h_A == NULL || h_xref == NULL || h_y == NULL || h_x == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        exit(EXIT_FAILURE);
    }

    // --- Setting the Hankel matrix
    setHankelMatrix(h_A, N);

    // --- Defining the solution
    for (unsigned int k = 0; k < N; k++) h_xref[k] = 1.f;

    // --- Coefficient vectors (only one in this example)
    computeCoefficientsVector(h_A, h_xref, h_y, N);

    // --- Allocate device space for the input matrices
    double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * Nmatrices * sizeof(double)));
    double *d_y; gpuErrchk(cudaMalloc(&d_y, N * sizeof(double)));
    double *d_x; gpuErrchk(cudaMalloc(&d_x, N * sizeof(double)));

    // --- Move the relevant matrices from host to device
    gpuErrchk(cudaMemcpy(d_A, h_A, N * N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_y, h_y, N * sizeof(double), cudaMemcpyHostToDevice));

    /**********************************/
    /* COMPUTING THE LU DECOMPOSITION */
    /**********************************/
    timerLU.StartCounter();

    // --- Creating the array of pointers needed as input/output to the batched getrf
    double **h_inout_pointers = (double **)malloc(Nmatrices * sizeof(double *));
    for (unsigned int i = 0; i < Nmatrices; i++) h_inout_pointers[i] = d_A + (size_t)i * N * N;

    double **d_inout_pointers;
    gpuErrchk(cudaMalloc(&d_inout_pointers, Nmatrices * sizeof(double *)));
    gpuErrchk(cudaMemcpy(d_inout_pointers, h_inout_pointers, Nmatrices * sizeof(double *), cudaMemcpyHostToDevice));
    free(h_inout_pointers);

    int *d_pivotArray; gpuErrchk(cudaMalloc(&d_pivotArray, N * Nmatrices * sizeof(int)));
    int *d_InfoArray;  gpuErrchk(cudaMalloc(&d_InfoArray, Nmatrices * sizeof(int)));

    int *h_InfoArray = (int *)malloc(Nmatrices * sizeof(int));

    // --- In-place factorization: d_A holds the L and U factors afterwards
    cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, d_pivotArray, d_InfoArray, Nmatrices));
    //cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, NULL, d_InfoArray, Nmatrices));

    gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

    for (unsigned int i = 0; i < Nmatrices; i++)
        if (h_InfoArray[i] != 0) {
            fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }

    timingLU = timerLU.GetCounter();
    printf("Timing LU decomposition %f [ms]\n", timingLU);

    /*********************************/
    /* CHECKING THE LU DECOMPOSITION */
    /*********************************/
    saveCPUrealtxt(h_A, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\A.txt", N * N);
    saveCPUrealtxt(h_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\y.txt", N);
    saveGPUrealtxt(d_A, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\Adecomposed.txt", N * N);
    saveGPUrealtxt(d_pivotArray, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\pivotArray.txt", N);

    /******************************************************************************/
    /* APPROACH NR.1: COMPUTE THE INVERSE OF A STARTING FROM ITS LU DECOMPOSITION */
    /******************************************************************************/
    timerApproach1.StartCounter();

    // --- Allocate device space for the inverted matrices
    double *d_Ainv; gpuErrchk(cudaMalloc(&d_Ainv, N * N * Nmatrices * sizeof(double)));

    // --- Creating the array of pointers needed as output to the batched getri
    double **h_out_pointers = (double **)malloc(Nmatrices * sizeof(double *));
    for (unsigned int i = 0; i < Nmatrices; i++) h_out_pointers[i] = d_Ainv + (size_t)i * N * N;

    double **d_out_pointers;
    gpuErrchk(cudaMalloc(&d_out_pointers, Nmatrices*sizeof(double *)));
    gpuErrchk(cudaMemcpy(d_out_pointers, h_out_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice));
    free(h_out_pointers);

    cublasSafeCall(cublasDgetriBatched(cublas_handle, N, (const double **)d_inout_pointers, N, d_pivotArray, d_out_pointers, N, d_InfoArray, Nmatrices));

    gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

    for (unsigned int i = 0; i < Nmatrices; i++)
        if (h_InfoArray[i] != 0) {
            fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singular\n", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }

    // --- x = inv(A) * y
    double alpha1 = 1.f;
    double beta1 = 0.f;

    cublasSafeCall(cublasDgemv(cublas_handle, CUBLAS_OP_N, N, N, &alpha1, d_Ainv, N, d_y, 1, &beta1, d_x, 1));

    timingApproach1 = timingLU + timerApproach1.GetCounter();
    printf("Timing approach 1 %f [ms]\n", timingApproach1);

    /**************************/
    /* CHECKING APPROACH NR.1 */
    /**************************/
    saveGPUrealtxt(d_x, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach1.txt", N);

    /*************************************************************/
    /* APPROACH NR.2: INVERT UPPER AND LOWER TRIANGULAR MATRICES */
    /*************************************************************/
    timerApproach2.StartCounter();

    // --- Bring the coefficient vector back to the host. The elements are doubles:
    //     the original sized this copy with sizeof(int) and transferred only half of it.
    gpuErrchk(cudaMemcpy(h_y, d_y, N * Nmatrices * sizeof(double), cudaMemcpyDeviceToHost));

    int *h_pivotArray = (int *)malloc(N * Nmatrices*sizeof(int));
    gpuErrchk(cudaMemcpy(h_pivotArray, d_pivotArray, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

    // --- Apply the row interchanges of the factorization to the right-hand side
    rearrange(h_y, h_pivotArray, N);
    gpuErrchk(cudaMemcpy(d_y, h_y, N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));

    // --- Now P*A=L*U
    //     Linear system A*x=y => P.'*L*U*x=y => L*U*x=P*y

    // --- 1st phase - solve Ly = b
    const double alpha = 1.f;

    // --- Function solves the triangular linear system with multiple right hand sides, function overrides b as a result

    // --- Lower triangular part
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, d_A, N, d_y, N));

    // --- Upper triangular part
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, d_A, N, d_y, N));

    timingApproach2 = timingLU + timerApproach2.GetCounter();
    printf("Timing approach 2 %f [ms]\n", timingApproach2);

    /**************************/
    /* CHECKING APPROACH NR.2 */
    /**************************/
    saveGPUrealtxt(d_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach2.txt", N);

    /***********/
    /* CLEANUP */
    /***********/
    free(h_pivotArray);
    free(h_InfoArray);
    free(h_x);
    free(h_y);
    free(h_xref);
    free(h_A);

    gpuErrchk(cudaFree(d_out_pointers));
    gpuErrchk(cudaFree(d_Ainv));
    gpuErrchk(cudaFree(d_InfoArray));
    gpuErrchk(cudaFree(d_pivotArray));
    gpuErrchk(cudaFree(d_inout_pointers));
    gpuErrchk(cudaFree(d_x));
    gpuErrchk(cudaFree(d_y));
    gpuErrchk(cudaFree(d_A));

    cublasSafeCall(cublasDestroy(cublas_handle));

    return 0;
}
The Utilities.cu and Utilities.cuh files needed to run such an example are maintained at this github page. The TimingGPU.cu and TimingGPU.cuh files are maintained at this github page.
Some useful references on the third approach:
NAG Fortran Library Routine Document
Scientific Computing Software Library (SCSL) User’s Guide
https://www.cs.drexel.edu/~jjohnson/2010-11/summer/cs680/programs/lapack/Danh/verify_sequential.c
EDIT
Timings (in ms) for approaches nr. 2 and 3 (tests performed on a GTX960 card, cc. 5.2).
N LU decomposition Approach nr. 2 Approach nr. 3
100 1.08 2.75 1.28
500 45.4 161 45.7
1000 302 1053 303
As it emerges, approach nr. 3 is more convenient and its cost is essentially the cost of computing the LU factorization. Furthermore:
Solving linear systems by LU decomposition is faster than using QR decomposition (see QR decomposition to solve linear systems in CUDA);
LU decomposition is limited to square linear systems, while QR decomposition helps in case of non-square linear systems.
The below Matlab code can be used for checking the results
clear all
close all
clc
warning off
N = 1000;
% --- Setting the problem solution
x = ones(N, 1);
%%%%%%%%%%%%%%%%%%%%%
% NxN HANKEL MATRIX %
%%%%%%%%%%%%%%%%%%%%%
% --- https://en.wikipedia.org/wiki/Hankel_matrix
load A.txt
load y.txt
A = reshape(A, N, N);
yMatlab = A * x;
fprintf('Percentage rms between coefficients vectors in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(yMatlab - y).^2)) / sum(sum(abs(yMatlab).^2))));
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% COMPUTATION OF THE LU DECOMPOSITION %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
[Lmatlab, Umatlab] = lu(A);
load Adecomposed.txt
Adecomposed = reshape(Adecomposed, N, N);
L = eye(N);
for k = 1 : N
L(k + 1 : N, k) = Adecomposed(k + 1 : N, k);
end
U = zeros(N);
for k = 1 : N
U(k, k : N) = Adecomposed(k, k : N);
end
load pivotArray.txt
Pj = eye(N);
for j = 1 : N
tempVector = Pj(j, :);
Pj(j, :) = Pj(pivotArray(j), :);
Pj(pivotArray(j), :) = tempVector;
end
fprintf('Percentage rms between Pj * A and L * U in CUDA %f\n', 100 * sqrt(sum(sum(abs(Pj * A - L * U).^2)) / sum(sum(abs(Pj * A).^2))));
xprime = inv(Lmatlab) * yMatlab;
xMatlab = inv(Umatlab) * xprime;
fprintf('Percentage rms between reference solution and solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));
load xApproach1.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.1 %f\n', 100 * sqrt(sum(sum(abs(xApproach1 - x).^2)) / sum(sum(abs(x).^2))));
load xApproach2.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.2 %f\n', 100 * sqrt(sum(sum(abs(xApproach2 - x).^2)) / sum(sum(abs(x).^2))));

Instructions Per Cycle (IPC) and Instruction Level Parallelism (ILP) in CUDA

I observe IPC drops as ILP goes up for 32-bit int operations when trying to speed up my cryptographic kernel. The kernel consists of fairly unrolled loops of long sequence of ADD and XOR operations, which should have a throughput of 160 ops per 192 cores per cycle on Kepler (GTX Titan/780).
IPC for my kernel hits the upper bound of 3.28. Using ILP even drops IPC. Apparently ILP fails to help achieve my goal -- fully utilize the pipeline, so I wrote some little experiments. I put the code for ILP 4 at the end.
Profiler Measurements
Results are measured on GTX Titan.
cubin outputs are examined to make sure no instructions are eliminated during optimization.
Executed IPC is almost the same as issued IPC, so I just list one of them.
ADD instructions (XORs have identical behavior)
| ILP 1 | ILP 2 | ILP 4 | ILP 8
--------------------------------------------------
IPC | 4.00 | 3.32 | 2.72 | 3.44
--------------------------------------------------
Issue Slot | 99.17% | 59.34% | 48.61% | 61.71%
Utilization | | | |
I expected ILP 2, 4 and 8 to give better performance, but they do not.
Recall the integer throughput is 160. The 4 warp scheduler per SM should dual issue up to 5 instructions per cycle, so that IPC should go up towards 5. How can I explain what I observed? Why is the issue slot 99% utilized when IPC = 4?
Float / Int ADD instruction mix
If I modify the code for ILP 4 to do two int ADDs and two float ADDs:
IPC: 5.1
Issue slot utilization: 99.12%
Strangely enough, it seems that the warp scheduler does a better job to issue floating operations.
Discussion
Available literature suggests using ILP help reach the peak performance for floating point operations. Why doesn't ILP apply to integers? How can I do this for integer operations?
My kernel theoretically should do 2.25 integer operations per candidate. This is consistent with what I observed in cuobjdump. There are 2^48 candidates, so the minimum runtime on GTX Titan should be 2.25 * 2^48 / (2688 * 160/192) / 876 MHz = 322.75s. Is this estimation reasonable?
The measured runtime for my kernel is 523s. This does imply that integer throughput is only about 160 * 3.28 (measured IPC) / 5 (max IPC).
ILP test code
// Device-global scratch array: entries 0..7 provide the operands read by test(),
// and entry 0 is the only location test() may write.
__device__ int x[10];
// ILP-4 micro-benchmark: every thread runs four independent 32-bit add chains
// (a += _a, b += _b, c += _c, d += _d) for 51200 iterations, each add issued as
// an inline-PTX add.u32. The kernel uses no thread or block index, so all
// threads execute the same instruction stream on the same values.
//
// flag: multiplier for the final store condition; with the default 0 the
//       condition flag * v == 1 is false and nothing is written.
__global__ void test(int flag = 0)
{
// Accumulators (a..d) and their per-iteration addends (_a.._d).
int a = x[0], b = x[1], c = x[2], d = x[3];
int _a = x[4], _b = x[5], _c = x[6], _d = x[7];
// Ask the compiler to unroll the loop body 128 times.
#pragma unroll 128
for (int i = 0; i < 51200; ++i)
{
// Four adds with no data dependence on each other; each depends only on the
// previous value of its own accumulator.
asm volatile("add.u32 %0, %0, %1;": "+r"(a): "r"(_a));
asm volatile("add.u32 %0, %0, %1;": "+r"(b): "r"(_b));
asm volatile("add.u32 %0, %0, %1;": "+r"(c): "r"(_c));
asm volatile("add.u32 %0, %0, %1;": "+r"(d): "r"(_d));
}
// Combine the four chains into one value and conditionally store it.
// NOTE(review): presumably this keeps the accumulators observable so the adds
// are not optimized away - confirm against the generated cubin.
int v = a + b + c + d;
if (flag * v == 1)
x[0] = v;
}
Code fragment for 4 candidates
Each candidate takes 9 / 4 = 2.25 ops. Cuobjdump also verifies this.
// Nine operations in total (five XORs, one min, three compare-and-OR), shared
// by the four candidate values s, t, u and v.
d ^= d2(1, 3); // d2 is located in constant memory
s ^= d;
t ^= d2(1, 16);
u ^= d2(1, 17);
v ^= some_const;
flag_s = min(flag_s, s); // int min has throughput of 160
flag_t = flag_t || (s == t); // setp.or should be the same
flag_u = flag_u || (s == u);
flag_v = flag_v || (s == v);
I'm providing an answer to remove this question from the unanswered list.
I do not observe a change in executed Instructions Per Cycle (IPC) with Instruction Level Parallelism. Overall, it is difficult to identify the reason for the effect observed by the OP without any information beyond that provided by the OP himself (e.g., the launch configuration).
In the code below, I'm considering an example using floats, although I have tested the same code with ints without changing the conceptual results. The code implements cyclical Multiply Add (MAD) operations with ILP=1, ILP=2 and ILP=4.
The executed IPC has been the following
ILP IPC FLOPs
1 3.924 67108864
2 4.323 67108864
4 4.016 67108864
for N=8192. The code has been compiled with CUDA 8.0 and run on an NVIDIA GT920M. As it can be seen, IPC keeps almost constant for the differently considered values of ILP. The Floating Point Operations (FLOPs) as estimated by the code assuming 2 FLOPs per MAD coincides with that measured by the Visual Profiler.
THE CODE
#include<stdio.h>
#define N_ITERATIONS 8192
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define BLOCKSIZE 512
//#define DEBUG
/*************************************************************************/
/* KERNEL0 - ONE DEPENDENT MAD CHAIN PER THREAD (NO EXTRA ILP, ILP = 1)  */
/*************************************************************************/
// Each thread whose global index is below N loads one (a, b, c) triple,
// applies a = a * b + c N_ITERATIONS times and writes a back to d_a.
// Launch layout: 1D grid of 1D blocks covering at least N threads.
__global__ void kernel0(float * __restrict__ d_a, const float * __restrict__ d_b, const float * __restrict__ d_c, const int N) {

    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the arrays have nothing to do.
    if (gid >= N) return;

    float acc = d_a[gid];
    const float mul = d_b[gid];
    const float add = d_c[gid];

    for (unsigned int it = 0; it != N_ITERATIONS; ++it)
        acc = acc * mul + add;

    d_a[gid] = acc;
}
/*******************************************************/
/* KERNEL1 - TWO INDEPENDENT MAD CHAINS PER THREAD     */
/*           (INSTRUCTION LEVEL PARALLELISM, ILP = 2)  */
/*******************************************************/
// Each thread whose global index is below N / 2 processes two elements, at
// tid and tid + N / 2, applying a = a * b + c N_ITERATIONS times to each.
// The two chains do not depend on each other.
__global__ void kernel1(float * __restrict__ d_a, const float * __restrict__ d_b, const float * __restrict__ d_c, const int N) {

    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const int half = N / 2;

    if (gid >= half) return;

    const int first = gid;
    const int second = gid + half;

    float accA = d_a[first];
    const float mulA = d_b[first];
    const float addA = d_c[first];

    float accB = d_a[second];
    const float mulB = d_b[second];
    const float addB = d_c[second];

    for (unsigned int it = 0; it != N_ITERATIONS; ++it) {
        accA = accA * mulA + addA;
        accB = accB * mulB + addB;
    }

    d_a[first] = accA;
    d_a[second] = accB;
}
/*******************************************************/
/* KERNEL2 - FOUR INDEPENDENT MAD CHAINS PER THREAD    */
/*           (INSTRUCTION LEVEL PARALLELISM, ILP = 4)  */
/*******************************************************/
// Each thread whose global index is below N / 4 processes four elements, at
// tid, tid + N / 4, tid + N / 2 and tid + 3 * N / 4, applying a = a * b + c
// N_ITERATIONS times to each. The four chains do not depend on each other.
__global__ void kernel2(float * __restrict__ d_a, const float * __restrict__ d_b, const float * __restrict__ d_c, const int N) {

    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    if (gid >= N / 4) return;

    // Element indices handled by this thread.
    const int i0 = gid;
    const int i1 = gid + N / 4;
    const int i2 = gid + N / 2;
    const int i3 = gid + 3 * N / 4;

    float acc0 = d_a[i0];
    const float mul0 = d_b[i0];
    const float add0 = d_c[i0];

    float acc1 = d_a[i1];
    const float mul1 = d_b[i1];
    const float add1 = d_c[i1];

    float acc2 = d_a[i2];
    const float mul2 = d_b[i2];
    const float add2 = d_c[i2];

    float acc3 = d_a[i3];
    const float mul3 = d_b[i3];
    const float add3 = d_c[i3];

    for (unsigned int it = 0; it != N_ITERATIONS; ++it) {
        acc0 = acc0 * mul0 + add0;
        acc1 = acc1 * mul1 + add1;
        acc2 = acc2 * mul2 + add2;
        acc3 = acc3 * mul3 + add3;
    }

    d_a[i0] = acc0;
    d_a[i1] = acc1;
    d_a[i2] = acc2;
    d_a[i3] = acc3;
}
/********/
/* MAIN */
/********/
/*
 * Compares the device results with the host reference element by element.
 * Prints the first mismatch and returns false, or returns true when all N
 * elements are bit-for-bit equal.
 */
static bool ilpResultsMatch(const float *h_reference, const float *h_device, const int N) {
    for (int i = 0; i < N; i++) {
        if (h_device[i] != h_reference[i]) {
            printf("Error at i=%i! Host = %f; Device = %f\n", i, h_reference[i], h_device[i]);
            return false;
        }
    }
    return true;
}

/*
 * Runs kernel0, kernel1 and kernel2 on the same input, prints an operation
 * count and a throughput figure for each, and checks every result against a
 * host reference. Returns 0 when all three kernels match, 1 otherwise.
 * All host and device buffers are released on every exit path.
 */
int main() {
    //const int N = 8192 * 64;
    const int N = 8192;
    //const int N = 1024;
    TimingGPU timerGPU;

    float *h_a = (float*)malloc(N*sizeof(float));
    float *h_a_result_host = (float*)malloc(N*sizeof(float));
    float *h_a_result_device = (float*)malloc(N*sizeof(float));
    float *h_b = (float*)malloc(N*sizeof(float));
    float *h_c = (float*)malloc(N*sizeof(float));
    if (!h_a || !h_a_result_host || !h_a_result_device || !h_b || !h_c) {
        printf("Host memory allocation failed\n");
        free(h_a); free(h_a_result_host); free(h_a_result_device); free(h_b); free(h_c);
        return 1;
    }

    // Input data and host reference (same recurrence as the kernels).
    for (int i = 0; i<N; i++) {
        h_a[i] = 2.f;
        h_b[i] = 1.f;
        h_c[i] = 2.f;
        h_a_result_host[i] = h_a[i];
        for (unsigned int k = 0; k < N_ITERATIONS; k++) {
            h_a_result_host[i] = h_a_result_host[i] * h_b[i] + h_c[i];
        }
    }

    float *d_a; gpuErrchk(cudaMalloc((void**)&d_a, N*sizeof(float)));
    float *d_b; gpuErrchk(cudaMalloc((void**)&d_b, N*sizeof(float)));
    float *d_c; gpuErrchk(cudaMalloc((void**)&d_c, N*sizeof(float)));
    gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, h_b, N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_c, h_c, N*sizeof(float), cudaMemcpyHostToDevice));

    const float numOperations = (float)N*(float)N_ITERATIONS;
    int status = 0;

    /***********/
    /* KERNEL0 */
    /***********/
    timerGPU.StartCounter();
    kernel0<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_a, d_b, d_c, N);
#ifdef DEBUG
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif
    // --- Remember: timing is in ms
    printf("Number of operations = %f; GFlops = %f\n", numOperations, (1.e-6)*numOperations / timerGPU.GetCounter());
    gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(float), cudaMemcpyDeviceToHost));
    if (!ilpResultsMatch(h_a_result_host, h_a_result_device, N)) status = 1;

    /***********/
    /* KERNEL1 */
    /***********/
    if (status == 0) {
        // d_a was overwritten by the previous kernel: restore the input first.
        gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice));
        timerGPU.StartCounter();
        kernel1<<<iDivUp(N / 2, BLOCKSIZE), BLOCKSIZE>>>(d_a, d_b, d_c, N);
#ifdef DEBUG
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
#endif
        // --- Remember: timing is in ms
        printf("Number of operations = %f; GFlops = %f\n", numOperations, (1.e-6)*numOperations / timerGPU.GetCounter());
        gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(float), cudaMemcpyDeviceToHost));
        if (!ilpResultsMatch(h_a_result_host, h_a_result_device, N)) status = 1;
    }

    /***********/
    /* KERNEL2 */
    /***********/
    if (status == 0) {
        gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice));
        timerGPU.StartCounter();
        kernel2<<<iDivUp(N / 4, BLOCKSIZE), BLOCKSIZE>>>(d_a, d_b, d_c, N);
#ifdef DEBUG
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
#endif
        // --- Remember: timing is in ms
        printf("Number of operations = %f; GFlops = %f\n", numOperations, (1.e-6)*numOperations / timerGPU.GetCounter());
        gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(float), cudaMemcpyDeviceToHost));
        if (!ilpResultsMatch(h_a_result_host, h_a_result_device, N)) status = 1;
    }

    // Release everything regardless of the outcome.
    gpuErrchk(cudaFree(d_a));
    gpuErrchk(cudaFree(d_b));
    gpuErrchk(cudaFree(d_c));
    free(h_a);
    free(h_a_result_host);
    free(h_a_result_device);
    free(h_b);
    free(h_c);
    cudaDeviceReset();
    return status;
}

Matrix Multiplication giving wrong output [duplicate]

This question already has an answer here:
Unable to execute device kernel in CUDA
(1 answer)
Closed 7 years ago.
What I am attempting to do is multiply Matrix A by Matrix B and then, from the product matrix, get the index of the maximum value per column. Unfortunately, only the first 128*128 values of the matrix multiplication are correct, while the others are just garbage. I do not quite understand how this works. Could you please guide me with this?
#include<stdio.h>
#include "cuda.h"
#include<stdlib.h>
// Edge length of the square thread block and of the shared-memory tiles.
#define blockD 32
// Matrix dimensions used by both host and device code below.
// The host fills M as hA rows of wA values and N as hB rows of wB values.
const int wA = 128;
const int hA = 4096;
const int wB = 4096;
// hB is tied to wA, as the product M * N requires.
const int hB = wA;
/*
 * Host driver: fills M (hA x wA) and N (hB x wB) with constants, calls
 * MatrixMultiplication(), and writes the last two rows of the product C to
 * "C.txt" as tab-separated integers.
 *
 * Returns 0 on success, 1 if a host allocation fails or the output file
 * cannot be opened.
 */
int main(void){
    void MatrixMultiplication(float *, float *, float *, float *);

    const int size_A = wA * hA * sizeof(float);
    const int size_B = wB * hB * sizeof(float);
    const int size_C = wB * hA * sizeof(float);
    const int size_max = 2 * wB * sizeof(float);

    // allocate memory on the CPU
    float *M = (float*)malloc(size_A);
    float *N = (float*)malloc(size_B);
    float *P = (float*)malloc(size_max);
    float *C = (float*)malloc(size_C);
    if (M == NULL || N == NULL || P == NULL || C == NULL) {
        fprintf(stderr, "host memory allocation failed\n");
        free( M );
        free( N );
        free( P );
        free( C );
        return 1;
    }

    // initialize the matrices
    for (int y=0; y < hA; y++) {
        for (int x=0; x < wA; x++){
            M[y*wA + x] = 32; //x + y*wA;
        }
    }
    for (int y=0; y<hB; y++) {
        for (int x=0; x<wB; x++){
            N[y*wB + x] = 21; //x + y*wB;
        }
    }

    MatrixMultiplication(M, N, P, C);

    // Write rows hA-2 and hA-1 of C; report instead of crashing if the file
    // cannot be created.
    int status = 0;
    FILE *f1 = fopen("C.txt","w");
    if (f1 == NULL) {
        perror("C.txt");
        status = 1;
    } else {
        for (int i = hA - 2; i < hA; i++){
            for (int j = 0; j < wB; j++){
                fprintf(f1,"%d\t",int(C[i*wB + j]));
            }
            fprintf(f1,"\n");
        }
        fclose(f1);
    }

    // free the memory allocated on the CPU
    free( M );
    free( N );
    free( P );
    free( C );
    cudaDeviceReset();
    return status;
}
// Device helper called by every thread at the end of MatrixMulKernel.
// Scans wB consecutive elements of Pd starting at offset x*wB, remembers the
// largest value seen and its flat index, and stores both in max[y*2 + 0] and
// max[y*2 + 1].
//
// NOTE(review): `temp` is an int, so each float read from Pd is truncated when
// it is stored, and the running maximum starts at 0 — confirm this is intended.
// NOTE(review): the scanned range depends on x while the output slot depends
// on y, and every thread performs the full scan, so several threads write the
// same max[] entries — confirm against the intended "maximum per column".
__device__ void MaxFunction(float* Pd, float* max)
{
// Global thread coordinates.
int x = (threadIdx.x + blockIdx.x * blockDim.x);
int y = (threadIdx.y + blockIdx.y * blockDim.y);
int k = 0;
// Running maximum and the flat index at which it was found.
int temp = 0; int temp_idx = 0;
for (k = 0; k < wB; ++k) {
if(Pd[x*wB + k] > temp){
temp = Pd[x*wB + k];
temp_idx = x*wB + k;
}
}
// Value and index are both stored as floats in the same output array.
max[y*2 + 0] = temp;
max[y*2 + 1] = temp_idx;
}
// Tiled matrix-multiply kernel: each thread accumulates one output value in
// Pvalue by walking blockD-wide tiles of Md and Nd staged through shared
// memory, writes it to Pd, and then calls MaxFunction.
// The shared tiles are blockD x blockD, so the kernel is written for a
// blockD x blockD thread block.
//
// NOTE(review): the Nd tile load uses wA as the row stride and n advances by
// blockD*hB, while the host fills N with row stride wB — confirm the indexing.
// NOTE(review): the output index uses hB as the row stride although Pd is
// allocated with wB*hA elements — confirm the indexing.
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, float* max)
{
// declare cache in the shared memory
__shared__ float Mds[blockD][blockD];
__shared__ float Nds[blockD][blockD];
float Pvalue = 0;
// Loop over the Md and Nd block dimension required to compute the Pd element
for (int m = (wA * blockD * blockIdx.y), n = (blockD * blockIdx.x);
m < ((wA * blockD * blockIdx.y)+wA-1);
m += blockD, n += (blockD*hB)){
// collaboratively loading of Md and Nd blocks into shared memory
Mds[threadIdx.y][threadIdx.x] = Md[m + wA * threadIdx.y + threadIdx.x];
Nds[threadIdx.y][threadIdx.x] = Nd[n + wA * threadIdx.y + threadIdx.x];
// Wait until the whole tile is loaded before any thread reads it.
__syncthreads();
// keep track of the running sum
for (int k = 0; k < blockD; k++)
Pvalue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];
// Wait until every thread is done with this tile before it is overwritten.
__syncthreads();
}
// write back to the global memory
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
// NOTE(review): this barrier only covers the current block, but MaxFunction
// reads Pd ranges that other blocks may still be writing — confirm.
__syncthreads();
MaxFunction(Pd, max);
}
// Host wrapper around MatrixMulKernel.
//   M, N : host input matrices, copied to the device
//   P    : host buffer that receives the device `max` array (size_max bytes)
//   C    : host buffer that receives the device product Pd (size_C bytes)
// Device buffers are allocated here and released before returning.
//
// NOTE(review): no return value of cudaMalloc/cudaMemcpy is checked and there
// is no error query after the kernel launch, so a failed launch goes
// unnoticed and P/C are then filled from whatever the device buffers hold.
void MatrixMultiplication(float *M, float *N, float *P, float *C) {
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *Md, *Nd, *Pd, *max;
// allocate memory on the GPU
cudaMalloc((void**)&Md, size_A);
cudaMalloc((void**)&Nd, size_B);
cudaMalloc((void**)&Pd, size_C);
cudaMalloc((void**)&max, size_max);
// transfer M and N to device memory
cudaMemcpy(Md, M, size_A, cudaMemcpyHostToDevice);
cudaMemcpy(Nd, N, size_B, cudaMemcpyHostToDevice);
// kernel invocation code
dim3 dimBlock(blockD, blockD);
// NOTE(review): with wA = hB = 128 and blockD = 32 this is a 4 x 4 grid of
// 32 x 32 threads, while Pd holds wB * hA elements — confirm the grid size.
dim3 dimGrid(wA/blockD, hB/blockD);
//Execute Kernel
MatrixMulKernel<<<dimGrid, dimBlock>>>( Md, Nd, Pd, max);
// transfer P from device
cudaMemcpy(P, max, size_max, cudaMemcpyDeviceToHost);
cudaMemcpy(C, Pd, size_C, cudaMemcpyDeviceToHost);
// free the memory allocated on the GPU
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
cudaFree(max);
}
In your code you seem to have more than one problem. One of the problems is, in place of this:
dim3 dimGrid(wA/blockD, hB/blockD);
You should have this:
dim3 dimGrid(wB/blockD, hA/blockD);
Ultimately you need one thread in your grid for each output point. Your formulation was giving you a grid of 4 blocks by 4 blocks, whereas you need a grid of 128 blocks by 128 blocks.
The other problem I found with your code was in these lines in the kernel:
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
They are not indexing properly through the output array. Rather than try to sort it out using your scheme, I used this instead:
Pd[(threadIdx.x + (blockIdx.x * blockDim.x)) + ((threadIdx.y + (blockIdx.y * blockDim.y))*(gridDim.x*blockDim.x))] = Pvalue;
When I made the above two changes to your code, I got what I believe are correct results throughout the array. And it took about 32 seconds on my machine to run it. (Note that I haven't tried fixing your original max-finding code -- see below for a better approach.)
Based on your previous question, you seemed to be concerned about speed. If you want to do fast matrix multiply, you should use cublas. The following code shows how to use cublas to multiply two ordinary C-style matrices (they don't have to be square). I've also included a column-max finding kernel that will be fast when the number of columns is large (say, over 500 or so; you have 4096 columns in your example). For small numbers of columns, there may be quicker ways to perform this function, but a small number of columns also suggests that the overall problem size may be small, and so the speed of this piece of code will not really be an issue.
Here's the code:
#include <stdio.h>
#include <cublas_v2.h>
// Non-zero: main() prints A, B, C and the column maxima.
#define VERBOSE 1
// Threads per block used when launching col_max.
#define nTPB 64
// Matrix shapes. B's row count is tied to A's column count, and C takes its
// row count from A and its column count from B.
#define ROW_A 4
#define COL_A 4
#define ROW_B COL_A
#define COL_B 4
#define ROW_C ROW_A
#define COL_C COL_B
// Element counts of the three matrices.
#define SIZ_A (ROW_A*COL_A)
#define SIZ_B (ROW_B*COL_B)
#define SIZ_C (ROW_C*COL_C)
// error check macros
// cudaCheckErrors(msg): queries the most recent CUDA runtime error with
// cudaGetLastError(). If it is not cudaSuccess, prints msg, the CUDA error
// string and the file/line of the macro use to stderr, then exits with
// status 1. Place it after the runtime calls or kernel launch it should cover.
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
// cublasCheckErrors(fn): evaluates the cuBLAS call fn once and checks the
// returned cublasStatus_t. On anything other than CUBLAS_STATUS_SUCCESS it
// prints the numeric status and the file/line to stderr, then exits with
// status 1.
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
/*
 * Column-wise maximum of a row-major rows x cols matrix, one thread per column.
 *
 *   mat   in   matrix data, element (r, c) at mat[c + r * cols]
 *   max   out  max[c]  = largest value found in column c
 *   midx  out  midx[c] = row index of that value (first occurrence on ties)
 *   rows, cols matrix dimensions; threads with a column index >= cols do nothing
 */
__global__ void col_max(float *mat, float *max, unsigned int *midx, unsigned int rows, unsigned int cols){
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    if (col >= cols) return;

    // Row 0 seeds the search; later rows replace it only when strictly larger.
    float best = mat[col];
    unsigned int bestRow = 0;
    for (int r = 1; r < rows; ++r) {
        const float candidate = mat[col + (r * cols)];
        if (candidate > best) {
            best = candidate;
            bestRow = r;
        }
    }

    max[col] = best;
    midx[col] = bestRow;
}
/*
 * Demo driver: computes C = A * B with cublasSgemm on row-major (C-style)
 * matrices, then finds the maximum of every column of C with col_max and
 * prints the results when VERBOSE is non-zero.
 *
 * Returns 0 on success and -1 if a host allocation fails; CUDA and cuBLAS
 * failures terminate the program through the check macros. The cuBLAS handle
 * and all device and host buffers are released before returning.
 */
int main(){
    float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C, *h_max, *d_max;
    unsigned int *h_idx, *d_idx;

    h_A = (float *)malloc(SIZ_A*sizeof(float));
    if (h_A==0) {printf("malloc fail\n"); return -1;}
    h_B = (float *)malloc(SIZ_B*sizeof(float));
    if (h_B==0) {printf("malloc fail\n"); return -1;}
    h_C = (float *)malloc(SIZ_C*sizeof(float));
    if (h_C==0) {printf("malloc fail\n"); return -1;}
    h_max = (float *)malloc(COL_C*sizeof(float));
    if (h_max==0) {printf("malloc fail\n"); return -1;}
    h_idx = (unsigned int*)malloc(COL_C*sizeof(unsigned int));
    if (h_idx==0) {printf("malloc fail\n"); return -1;}

    cudaMalloc((void **)&d_A, SIZ_A*sizeof(float));
    cudaMalloc((void **)&d_B, SIZ_B*sizeof(float));
    cudaMalloc((void **)&d_C, SIZ_C*sizeof(float));
    cudaMalloc((void **)&d_max, COL_C*sizeof(float));
    cudaMalloc((void **)&d_idx, COL_C*sizeof(unsigned int));
    cudaCheckErrors("cuda malloc fail");

    // initialize data
    for (int i=0; i< SIZ_A; i++) h_A[i] = (float)(i+1);
    for (int i=0; i< SIZ_B; i++) h_B[i] = (float)(i+2);
    cudaMemcpy(d_A, h_A, SIZ_A*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, SIZ_B*sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("cuda memcpy 1 fail");

    const float alpha = 1.0f;
    const float beta = 0.0f;
    cublasHandle_t handle;
    cublasCheckErrors(cublasCreate(&handle));
    // C = A*B
    // due to cublas expecting column-major storage, parameters
    // are scrambled
    cublasCheckErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, COL_B, ROW_A, COL_A, &alpha, d_B, COL_B, d_A, COL_A, &beta, d_C, COL_C));
    cudaMemcpy(h_C, d_C, SIZ_C*sizeof(float), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cuda memcpy 2 fail");

    // One thread per column of C; the grid size is rounded up.
    col_max<<<(COL_C + nTPB - 1)/nTPB, nTPB>>>(d_C, d_max, d_idx, ROW_C, COL_C);
    cudaCheckErrors("kernel launch fail");
    cudaMemcpy(h_max, d_max, COL_C*sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_idx, d_idx, COL_C*sizeof(unsigned int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cuda memcpy 3 fail/kernel fail");

    if (VERBOSE){
        printf("A: \n");
        for (int i=0; i< ROW_A; i++){
            for (int j=0; j< COL_A; j++)
                printf("%7.5G", h_A[j+(i*COL_A)]);
            printf("\n");}
        printf("B: \n");
        for (int i=0; i< ROW_B; i++){
            for (int j=0; j< COL_B; j++)
                printf("%7.5G", h_B[j+(i*COL_B)]);
            printf("\n");}
        printf("C = A*B: \n");
        for (int i=0; i< ROW_C; i++){
            for (int j=0; j< COL_C; j++)
                printf("%7.5G", h_C[j+(i*COL_C)]);
            printf("\n");}
        printf("COLUMN MAX:\n");
        for (int i=0; i< COL_C; i++)
            printf("%7.5G", h_max[i]);
        printf("\nCOLUMN MAX IDX:\n");
        // h_idx holds unsigned values, so print them with %u.
        for (int i=0; i< COL_C; i++)
            printf("%7u", h_idx[i]);
    }
    printf("\n finished!\n");

    // Release the cuBLAS handle and every buffer allocated above.
    cublasCheckErrors(cublasDestroy(handle));
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaFree(d_max);
    cudaFree(d_idx);
    cudaCheckErrors("cuda free fail");
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_max);
    free(h_idx);
    return 0;
}
Here's what I used to compile:
$ nvcc -arch=sm_20 -O3 -o t221 t221.cu -lcublas
And here's the sample output:
$ cuda-memcheck ./t221
========= CUDA-MEMCHECK
A:
1 2 3 4
5 6 7 8
9 10 11 12
13 14 15 16
B:
2 3 4 5
6 7 8 9
10 11 12 13
14 15 16 17
C = A*B:
100 110 120 130
228 254 280 306
356 398 440 482
484 542 600 658
COLUMN MAX:
484 542 600 658
COLUMN MAX IDX:
3 3 3 3
finished!
========= ERROR SUMMARY: 0 errors
$
When I extended my code to handle the same sizes you indicated, (A = 4096x128, B=128x4096) it took about 1 second on my machine. So it's much faster than your code. However, when I take your code and comment out your call to MaxFunction in the kernel, it also only takes about 1 second to compute the matrix multiply result. So if you wanted to keep your matrix multiply code (i.e. not use cublas) you could break the code into 2 kernels, and use your multiply routine in the first kernel with my max-finding routine (col_max) in the second kernel, and also probably get a pretty fast result.
As @talonmies indicated, if you are running on a Windows machine, be sure you are aware of the ramifications of Windows TDR (search for that in the search box in the upper right corner if needed).

1D problems in CUDA and HPC

I'm looking for some 1D problems in CUDA and HPC, e.g. Black Scholes.
By 1D problems, I mean problems in which all the work is done on 1D arrays. Although matrix multiplication can be expressed in this way, I want problems in which the basic problem is just 1D.
I am trying to develop a 1D library for CUDA and would need some benchmark problems to test it. I realize that a lot of real world problems are expressed as 2D, I would really like to see some real world 1D problems.
Thanks.
EDIT: Thanks for all the answers. It'll be great if the answers contain more HPC problems, e.g. Black Scholes, rather than just generic algorithms.
Thanks.
A common problem in parallel programming is the prefix sum (also called a scan), a close relative of reduction: you are given an array of numbers and you have to compute, for every element, the sum of all the preceding elements (including the element itself or not; I prefer the inclusive version).
It is a fairly simple problem, but since it is repeated many times inside more complex algorithms, having an efficient implementation is crucial.
Another common problem is sorting.
There already some papers on that topic, take this one for example:
enter link description here
I think it is a good problem to start with, to solve bigger problems on top of it.
A simple problem you can use for 1 to 3 dimensions is the heat equation. There are several different numerical methods for solving it, and some of them can be implemented in parallel.
A method that works at least with OpenMP and MPI is the finite difference method. I suppose that if you combine it with a clever stencil, you should be able to implement it efficiently in CUDA C.
A classical 1D example is provided by the heat equation.
Below, I'm posting a concrete, fully worked CPU/GPU example on this topic exploiting the Jacobi solution scheme. Please, note that two time-step kernels are provided, one not using shared memory and one using shared memory.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <thrust\device_vector.h>
#include "Utilities.cuh"
#define BLOCKSIZE 512
/****************************/
/* CPU CALCULATION FUNCTION */
/****************************/
/*
 * Host reference for the 1D heat equation, advanced with explicit time steps.
 *
 * Each step first computes the temperature increment of every node from the
 * current temperatures and only then applies the increments, so a step never
 * reads a value it has already updated. Stepping stops when the largest
 * absolute increment is <= maxErr or *Niter reaches maxIterNumber.
 *
 *   h_T            in/out  N node temperatures; h_T[0] is set to T0 and kept
 *   Niter          in/out  step counter, incremented once per step
 *   T0             in      fixed temperature at the left end
 *   Q_N_1          in      heat-flux term used in the right-end update
 *   dx, dt         in      space and time discretization steps
 *   k, rho, cp     in      material constants used in the right-end update
 *   alpha          in      coefficient of the interior update
 *   maxErr         in      convergence threshold on the largest |increment|
 *   maxIterNumber  in      upper bound on *Niter
 *   N              in      number of nodes
 *
 * With N < 2 no step is taken (the right-end update needs a left neighbour);
 * for N == 1 only the left boundary value is stored. If the scratch buffer
 * cannot be allocated, a message is printed to stderr and h_T is left with
 * only the left boundary value applied.
 */
void HeatEquation1DCPU(float * __restrict__ h_T, int *Niter, const float T0, const float Q_N_1, const float dx, const float k, const float rho,
const float cp, const float alpha, const float dt, const float maxErr, const int maxIterNumber, const int N)
{
    if (N < 1) return;

    // --- Enforcing boundary condition at the left end.
    *h_T = T0;
    if (N < 2) return;

    float *h_DeltaT = (float *)malloc(N * sizeof(float));
    if (h_DeltaT == NULL) {
        fprintf(stderr, "HeatEquation1DCPU: cannot allocate %d floats\n", N);
        return;
    }
    h_DeltaT[0] = 0.f;

    float current_max;
    do {
        // --- Internal region between the two boundaries.
        for (int i = 1; i < N - 1; i++) h_DeltaT[i] = dt * alpha * ((h_T[i - 1] + h_T[i + 1] - 2.f * h_T[i]) / (dx * dx));
        // --- Enforcing boundary condition at the right end.
        h_DeltaT[N - 1] = dt * 2.f * ((k * ((h_T[N - 2] - h_T[N - 1]) / dx) + Q_N_1) / (dx * rho * cp));
        // --- Update the temperature and find the maximum |DeltaT| over all nodes
        current_max = h_DeltaT[0]; // --- Remember: h_DeltaT[0] = 0
        for (int i = 1; i < N; i++)
        {
            h_T[i] = h_T[i] + h_DeltaT[i]; // h_T[0] keeps its boundary value
            // fabsf keeps the magnitude in float regardless of which abs overloads are visible.
            const float magnitude = fabsf(h_DeltaT[i]);
            if (magnitude > current_max) current_max = magnitude;
        }
        // --- Increase iteration counter
        (*Niter)++;
    } while (*Niter < maxIterNumber && current_max > maxErr);

    // The buffer came from malloc, so it must be released with free.
    free(h_DeltaT);
}
/**************************/
/* GPU CALCULATION KERNEL */
/**************************/
/*
 * One time step of the 1D heat equation, one thread per node, reading all
 * temperatures straight from global memory.
 *
 *   d_T       in/out  N node temperatures, advanced by one step
 *   d_DeltaT  out     absolute value of the increment applied to each node
 *   N         in      number of nodes; threads with an index >= N do nothing
 * T0, maxErr and maxIterNumber are accepted but not used by this kernel.
 *
 * NOTE(review): each thread reads its neighbours' d_T entries and later
 * overwrites its own entry in the same launch with no barrier in between, so
 * a thread may see a neighbour value that was already advanced — confirm that
 * this is acceptable for the intended scheme.
 */
__global__ void HeatEquation1DGPU_IterationKernel(float * __restrict__ d_T, float * __restrict__ d_DeltaT, const float T0, const float Q_N_1, const float dx, const float k, const float rho,
const float cp, const float alpha, const float dt, const float maxErr, const int maxIterNumber, const int N)
{
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid >= N) return;

    // --- Interior nodes: second difference of the two neighbours.
    if ((tid > 0) && (tid < N - 1)) {
        d_DeltaT[tid] = dt * alpha *((d_T[tid - 1] + d_T[tid + 1] - 2.f * d_T[tid]) / (dx * dx));
    }

    // --- Left end: the boundary temperature does not change.
    if (tid == 0) {
        d_DeltaT[0] = 0.f;
    }

    // --- Right end: flux boundary condition.
    if (tid == N - 1) {
        d_DeltaT[tid] = dt * 2.f * ((k * ((d_T[tid - 1] - d_T[tid]) / dx) + Q_N_1) / (dx * rho * cp));
    }

    // --- Apply the increment, then keep only its magnitude for the caller.
    d_T[tid] = d_T[tid] + d_DeltaT[tid];
    d_DeltaT[tid] = abs(d_DeltaT[tid]);
}
/*
 * One time step of the 1D heat equation, one thread per node, with the
 * block's temperatures staged in shared memory.
 *
 *   d_T       in/out  N node temperatures, advanced by one step
 *   d_DeltaT  out     absolute value of the increment applied to each node
 *   N         in      number of nodes; need not be a multiple of the block size
 * T0, maxErr and maxIterNumber are accepted but not used by this kernel.
 *
 * Launch requirements: 1D blocks with blockDim.x <= BLOCKSIZE (the shared
 * array is sized from BLOCKSIZE at compile time).
 *
 * The block barrier is executed by every thread of the block, including
 * threads past the end of the array, so a partially filled last block does
 * not hit a divergent __syncthreads().
 *
 * NOTE(review): the halo cells are read from d_T entries owned by the
 * neighbouring blocks, which update d_T in this same launch — confirm that
 * this is acceptable for the intended scheme.
 */
__global__ void HeatEquation1DGPU_IterationSharedKernel(float * __restrict__ d_T, float * __restrict__ d_DeltaT, const float T0, const float Q_N_1, const float dx, const float k, const float rho,
const float cp, const float alpha, const float dt, const float maxErr, const int maxIterNumber, const int N)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const bool inRange = tid < N;

    // --- Locations 1, ..., BLOCKSIZE hold the block's own nodes; locations 0 and BLOCKSIZE + 1 are the left and right halo cells.
    __shared__ float d_T_shared[BLOCKSIZE + 2]; // --- Need to know BLOCKSIZE beforehand

    if (inRange) {
        // --- Load this thread's node into its shared memory slot
        d_T_shared[threadIdx.x + 1] = d_T[tid];
        // --- Left halo cell
        if ((threadIdx.x == 0) && (tid > 0)) { d_T_shared[0] = d_T[tid - 1]; }
        // --- Right halo cell
        if ((threadIdx.x == blockDim.x - 1) && (tid < N - 1)) { d_T_shared[threadIdx.x + 2] = d_T[tid + 1]; }
    }

    // --- Reached by all threads of the block, whether or not they own a node.
    __syncthreads();

    if (!inRange) return;

    // --- Internal region between the two boundaries.
    if ((tid > 0) && (tid < N - 1) ) d_DeltaT[tid] = dt * alpha *((d_T_shared[threadIdx.x] + d_T_shared[threadIdx.x + 2] - 2.f * d_T_shared[threadIdx.x + 1]) / (dx * dx));
    // --- Enforcing boundary condition at the left end.
    if (tid == 0) d_DeltaT[0] = 0.f;
    // --- Enforcing boundary condition at the right end.
    if (tid == N - 1) d_DeltaT[tid] = dt * 2.f * ((k * ((d_T_shared[threadIdx.x] - d_T_shared[threadIdx.x + 1]) / dx) + Q_N_1) / (dx * rho * cp));
    // --- Update the temperature and keep the magnitude of the increment
    d_T[tid] = d_T[tid] + d_DeltaT[tid];
    d_DeltaT[tid] = abs(d_DeltaT[tid]);
}
/****************************/
/* GPU CALCULATION FUNCTION */
/****************************/
/*
 * Device-side solver loop: repeatedly launches one time-step kernel and stops
 * when the largest absolute increment of the latest step is <= maxErr or
 * *Niter reaches maxIterNumber.
 *
 *   d_T    in/out  device array of N node temperatures; d_T[0] is set to T0
 *   Niter  in/out  step counter, incremented once per step
 * The remaining parameters are forwarded unchanged to the kernel.
 *
 * The maximum is recomputed from scratch on every step (the reduction is
 * seeded with 0, the identity for the non-negative values the kernel stores
 * in d_DeltaT), so the convergence test looks at the current step only and
 * not at a running maximum over all previous steps.
 */
void HeatEquation1DGPU(float * __restrict__ d_T, int *Niter, const float T0, const float Q_N_1, const float dx, const float k, const float rho,
const float cp, const float alpha, const float dt, const float maxErr, const int maxIterNumber, const int N)
{
    // --- Absolute values of DeltaT
    float *d_DeltaT; gpuErrchk(cudaMalloc(&d_DeltaT, N * sizeof(float)));
    // --- Enforcing boundary condition at the left end.
    gpuErrchk(cudaMemcpy(d_T, &T0, sizeof(float), cudaMemcpyHostToDevice));

    // --- The wrapped pointer does not change between steps
    const thrust::device_ptr<float> d = thrust::device_pointer_cast(d_DeltaT);

    float current_max = 0.f;
    do {
        //HeatEquation1DGPU_IterationKernel<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_T, d_DeltaT, T0, Q_N_1, dx, k, rho, cp, alpha, dt, maxErr, maxIterNumber, N);
        HeatEquation1DGPU_IterationSharedKernel<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_T, d_DeltaT, T0, Q_N_1, dx, k, rho, cp, alpha, dt, maxErr, maxIterNumber, N);
        // --- Catch launch-configuration errors right away
        gpuErrchk(cudaPeekAtLastError());
        // --- Largest |DeltaT| of this step only
        current_max = thrust::reduce(d, d + N, 0.f, thrust::maximum<float>());
        // --- Increase iteration counter
        (*Niter)++;
    } while (*Niter < maxIterNumber && current_max > maxErr);

    gpuErrchk(cudaFree(d_DeltaT));
}
/********/
/* MAIN */
/********/
/*
 * Test driver: solves the same 1D heat problem on the GPU and on the CPU and
 * compares the final temperatures node by node with exact float equality.
 *
 * Returns 0 when every node matches and 1 on the first mismatch or if a host
 * allocation fails. Host buffers are released with free() (they come from
 * malloc) and the device buffer with cudaFree() on every path past allocation.
 */
int main()
{
    // --- See https://en.wikipedia.org/wiki/Thermal_diffusivity
    // --- Parameters of the problem
    const float k = 0.19f; // --- Thermal conductivity [W / (m * K)]
    const float rho = 930.f; // --- Density [kg / m^3]
    const float cp = 1340.f; // --- Specific heat capacity [J / (kg * K)]
    const float alpha = k / (rho * cp); // --- Thermal diffusivity [m^2 / s]
    const float length = 1.6f; // --- Total length of the domain [m]
    const int N = 64 * BLOCKSIZE; // --- Number of grid points
    const float dx = (length / (float)(N - 1));// --- Discretization step [m]
    const float dt = (float)(dx * dx / (4.f * alpha));
    // --- Time step [s]
    const float T0 = 0.f; // --- Temperature at the first end of the domain [C]
    const float Q_N_1 = 10.f; // --- Heat flux at the second end of the domain [W / m^2]
    const float maxErr = 1.0e-5f; // --- Maximum admitted DeltaT
    const int maxIterNumber = 10.0 / dt; // --- Number of overall time steps

    // --- Host buffers for the two final results
    float *h_T_final_device = (float *)malloc(N * sizeof(float)); // --- Final "host-side" result of GPU calculations
    float *h_T_final_host = (float *)malloc(N * sizeof(float)); // --- Final result of CPU calculations
    if (h_T_final_device == NULL || h_T_final_host == NULL) {
        printf("Host memory allocation failed\n");
        free(h_T_final_device);
        free(h_T_final_host);
        return 1;
    }

    /********************/
    /* GPU CALCULATIONS */
    /********************/
    int Niter_GPU = 0; // --- Iteration counter for GPU calculations
    // --- Device temperature allocation and initialization
    float *d_T; gpuErrchk(cudaMalloc(&d_T, N * sizeof(float)));
    gpuErrchk(cudaMemset(d_T, 0, N * sizeof(float)));
    // --- GPU calculations
    HeatEquation1DGPU(d_T, &Niter_GPU, T0, Q_N_1, dx, k, rho, cp, alpha, dt, maxErr, maxIterNumber, N);
    // --- Transfer the GPU calculation results from device to host
    gpuErrchk(cudaMemcpy(h_T_final_device, d_T, N * sizeof(float), cudaMemcpyDeviceToHost));

    /********************/
    /* CPU CALCULATIONS */
    /********************/
    // --- Host temperature initialization
    memset(h_T_final_host, 0, N * sizeof(float));
    int Niter_CPU = 0;
    HeatEquation1DCPU(h_T_final_host, &Niter_CPU, T0, Q_N_1, dx, k, rho, cp, alpha, dt, maxErr, maxIterNumber, N);

    /************************/
    /* CHECKING THE RESULTS */
    /************************/
    int status = 0;
    for (int i = 0; i < N; i++) {
        printf("Node = %i; T_host = %3.10f; T_device = %3.10f\n", i, h_T_final_host[i], h_T_final_device[i]);
        if (h_T_final_host[i] != h_T_final_device[i]) {
            printf("Error at i = %i; T_host = %f; T_device = %f\n", i, h_T_final_host[i], h_T_final_device[i]);
            status = 1; // --- Report the failure through the exit code as well
            break;
        }
    }
    if (status == 0) printf("Test passed!\n");

    free(h_T_final_device);
    free(h_T_final_host);
    gpuErrchk(cudaFree(d_T));
    return status;
}
Reduction (finding the min, max or sum of an array) and sorting are the best examples of 1D problems. There are many variations of these algorithms, such as sorting arrays of structures.