cuFFT wrong results only when starting from complex - cuda

I was helped before in this answer to realise an in-place transform and it works well but ONLY if I start with real data. If I start with complex data, the results after IFT+FFT are wrong, and this happens only in the in-place version, I have perfect results with an out-of-place version of this transform.
This is the code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <complex.h>
#include <cuComplex.h>
#include <cufft.h>
#include <cufftXt.h>
#define N 4
#define N_PAD ( 2*(N/2+1) )
/* Dump an N x N x N real volume to stdout, one z-plane per section.
 * Elements are read with N_PAD as the stride of the innermost (z) index. */
void print_3D_Real(double *array){
    unsigned long int offset;
    printf("\nPrinting 3D real matrix \n");
    for (int plane = 0; plane < N; ++plane){
        printf("---------------------------------------------------------------------------- plane %d below\n", plane);
        for (int row = 0; row < N; ++row){
            /* x * N term shared by every column of this printed row */
            const int rowBase = row * N;
            for (int col = 0; col < N; ++col){
                offset = plane + N_PAD * (col + rowBase);
                printf("%.3f \t", array[offset]);
            }
            printf("\n");
        }
    }
}
/* Dump the N x N x (N/2+1) complex array to stdout, one z-plane per section.
 * Each entry is printed as "re+imi" using the .x / .y members. */
void print_3D_Comp(cuDoubleComplex *array){
    const int halfDepth = N/2+1;   /* extent and stride of the innermost (z) index */
    unsigned long int offset;
    printf("\nPrinting 3D complex matrix \n");
    for (int plane = 0; plane < halfDepth; ++plane){
        printf("---------------------------------------------------------------------------- plane %d below\n", plane);
        for (int row = 0; row < N; ++row){
            for (int col = 0; col < N; ++col){
                offset = plane + halfDepth * (col + row * N);
                const cuDoubleComplex value = array[offset];
                printf("%+.3f%+.3fi \t", value.x, value.y);
            }
            printf("\n");
        }
    }
}
// Main function
/* Abort with a readable message when a CUDA runtime call fails. */
static void check_cuda(cudaError_t status, const char *what){
    if (status != cudaSuccess){
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}
/* Abort with a readable message when a cuFFT call fails. */
static void check_cufft(cufftResult status, const char *what){
    if (status != CUFFT_SUCCESS){
        fprintf(stderr, "%s failed: cufftResult %d\n", what, (int)status);
        exit(EXIT_FAILURE);
    }
}
/* In-place round trip: complex half-spectrum -> Z2D -> normalise -> D2Z.
 * The same host buffer (and the same device buffer) is viewed both as
 * N*N*(N/2+1) complex values and as N*N*N_PAD doubles.
 * Returns 0 on success; exits with EXIT_FAILURE on any CUDA/cuFFT error. */
int main(int argc, char **argv){
    (void)argc;
    (void)argv;
    check_cuda(cudaSetDevice(0), "cudaSetDevice");

    unsigned long int idx, in_mem_size, out_mem_size;
    cuDoubleComplex *in = NULL, *d_in = NULL;
    double *out = NULL, *d_out = NULL;
    cufftHandle plan_r2c, plan_c2r;

    in_mem_size = sizeof(cuDoubleComplex) * N*N*(N/2+1);
    out_mem_size = in_mem_size;

    /* calloc gives the zero-filled buffer the original obtained via memset
     * (which was used without including <string.h>). */
    in = (cuDoubleComplex *) calloc(1, in_mem_size);
    if (in == NULL){
        fprintf(stderr, "host allocation of %lu bytes failed\n", in_mem_size);
        return EXIT_FAILURE;
    }
    out = (double *) in;            /* real view of the same host storage */

    check_cuda(cudaMalloc((void **)&d_in, in_mem_size), "cudaMalloc");
    d_out = (double *) d_in;        /* real view of the same device storage */

    check_cufft(cufftPlan3d(&plan_c2r, N, N, N, CUFFT_Z2D), "cufftPlan3d(Z2D)");
    check_cufft(cufftPlan3d(&plan_r2c, N, N, N, CUFFT_D2Z), "cufftPlan3d(D2Z)");

    // Initial complex data
    // NOTE: this fills the half-spectrum with arbitrary values. Z2D treats its
    // input as the spectrum of a purely real signal, so data that does not
    // have that symmetry is not guaranteed to survive the Z2D + D2Z round trip.
    for (int x = 0; x < N; x++){
        for (int y = 0; y < N; y++){
            for (int z = 0; z < (N/2+1); z++){
                idx = z + (N/2+1) * (y + x * N);
                in[idx].x = (double)idx;
            }
        }
    }
    print_3D_Comp(in);

    check_cuda(cudaMemcpy(d_in, in, in_mem_size, cudaMemcpyHostToDevice), "cudaMemcpy H2D (spectrum)");
    check_cufft(cufftExecZ2D(plan_c2r, (cufftDoubleComplex *)d_in, (cufftDoubleReal *)d_out), "cufftExecZ2D");
    check_cuda(cudaMemcpy(out, d_out, out_mem_size, cudaMemcpyDeviceToHost), "cudaMemcpy D2H (real)");

    // Normalisation (cuFFT transforms are unnormalised); padding elements are scaled too
    for (int i = 0; i < N*N*N_PAD; i++)
        out[i] /= (N*N*N);
    print_3D_Real(out);

    check_cuda(cudaMemcpy(d_out, out, out_mem_size, cudaMemcpyHostToDevice), "cudaMemcpy H2D (real)");
    check_cufft(cufftExecD2Z(plan_r2c, (cufftDoubleReal *)d_out, (cufftDoubleComplex *)d_in), "cufftExecD2Z");
    check_cuda(cudaMemcpy(in, d_in, in_mem_size, cudaMemcpyDeviceToHost), "cudaMemcpy D2H (spectrum)");
    print_3D_Comp(in);

    /* Release plans and buffers before tearing the device down. */
    check_cufft(cufftDestroy(plan_r2c), "cufftDestroy(D2Z)");
    check_cufft(cufftDestroy(plan_c2r), "cufftDestroy(Z2D)");
    check_cuda(cudaFree(d_in), "cudaFree");
    free(in);
    check_cuda(cudaDeviceReset(), "cudaDeviceReset");
    return 0;
}
The output of my program is on this pastebin.
Can someone direct me on the right path? Thank you very much in advance.

First of all, your code doesn't compile.
In its most general definition, the fourier transform performs a mapping from one complex domain to another complex domain, and this operation should be reversible.
However, the C2R and R2C are special cases, with an assumption that the signal is completely representable in one of the 2 domains (the "time" domain) as a purely real signal (all imaginary components are zero).
However, it should be evident that there will be some complex "frequency" domain representations that cannot be represented by a purely real time domain signal. If the counter case were true (any complex frequency domain signal can be represented as a purely real time domain signal) then the FFT could not be reversible for a complex time domain signal (since all frequency domain data sets map to purely real time domain data sets.)
Therefore you cannot choose arbitrary data in the frequency domain, and expect it to map correctly into a purely real time domain signal. (*)
As a demonstration, change your input data set to the following:
in[idx].x = (idx)?0:1;
and I believe you will get a "passing" test case.
Furthermore, your allegation that "I have perfect results with an out-of-place version of this transform" I believe cannot be supported, if you are in fact using this particular data set as posted in your question. If you disagree, please post a complete code demonstrating your passing test case with the out-of-place transform, that is otherwise identical to your posted code.
Finally, we can test this with fftw. A conversion of your program to use fftw instead of cufft produces exactly the same output:
$ cat t355.cpp
#include <stdio.h>
#include <stdlib.h>
#include <fftw3.h>
#include <string.h>
#define N 4
#define N_PAD ( 2*(N/2+1) )
/* Print the N x N x N real array plane by plane (fixed z per section).
 * The z index is the fastest-varying one and uses the padded stride N_PAD. */
void print_3D_Real(double *array){
    printf("\nPrinting 3D real matrix \n");
    int z = 0;
    while (z < N){
        printf("---------------------------------------------------------------------------- plane %d below\n", z);
        for (int x = 0; x < N; ++x){
            for (int y = 0; y < N; ++y){
                const unsigned long int pos = z + N_PAD * (y + x * N);
                printf("%.3f \t", array[pos]);
            }
            printf("\n");
        }
        ++z;
    }
}
/* Print the N x N x (N/2+1) complex array plane by plane (fixed z per section).
 * Component [0] is printed as the real part and [1] as the imaginary part. */
void print_3D_Comp(fftw_complex *array){
    const int depth = N/2+1;   /* extent and stride of the innermost (z) index */
    printf("\nPrinting 3D complex matrix \n");
    int z = 0;
    while (z < depth){
        printf("---------------------------------------------------------------------------- plane %d below\n", z);
        for (int x = 0; x < N; ++x){
            for (int y = 0; y < N; ++y){
                const unsigned long int pos = z + depth * (y + x * N);
                const double re = array[pos][0];
                const double im = array[pos][1];
                printf("%+.3f%+.3fi \t", re, im);
            }
            printf("\n");
        }
        ++z;
    }
}
// Main function
int main(int argc, char **argv){
unsigned long int idx, in_mem_size, out_mem_size;
fftw_complex *in = NULL;
double *out = NULL;
in_mem_size = sizeof(fftw_complex) * N*N*(N/2+1);
out_mem_size = in_mem_size;
in = (fftw_complex *) malloc (in_mem_size);
out = (double *) in;
memset(in, 0, in_mem_size);
memset(out, 0, out_mem_size);
// Initial complex data
for (int x = 0; x < N; x++){
for (int y = 0; y < N; y++){
for (int z = 0; z < (N/2+1); z++){
idx = z + (N/2+1) * (y + x * N);
in[idx][0] = idx;
}
}
}
print_3D_Comp(in);
fftw_plan plan_c2r = fftw_plan_dft_c2r_3d(N, N, N, in, out, FFTW_ESTIMATE);
fftw_plan plan_r2c = fftw_plan_dft_r2c_3d(N, N, N, out, in, FFTW_ESTIMATE);
fftw_execute(plan_c2r);
// Normalisation
for (int i = 0; i < N*N*N_PAD; i++)
out[i] /= (N*N*N);
print_3D_Real(out);
fftw_execute(plan_r2c);
print_3D_Comp(in);
return 0;
}
$ g++ t355.cpp -o t355 -lfftw3
$ ./t355
Printing 3D complex matrix
---------------------------------------------------------------------------- plane 0 below
+0.000+0.000i +3.000+0.000i +6.000+0.000i +9.000+0.000i
+12.000+0.000i +15.000+0.000i +18.000+0.000i +21.000+0.000i
+24.000+0.000i +27.000+0.000i +30.000+0.000i +33.000+0.000i
+36.000+0.000i +39.000+0.000i +42.000+0.000i +45.000+0.000i
---------------------------------------------------------------------------- plane 1 below
+1.000+0.000i +4.000+0.000i +7.000+0.000i +10.000+0.000i
+13.000+0.000i +16.000+0.000i +19.000+0.000i +22.000+0.000i
+25.000+0.000i +28.000+0.000i +31.000+0.000i +34.000+0.000i
+37.000+0.000i +40.000+0.000i +43.000+0.000i +46.000+0.000i
---------------------------------------------------------------------------- plane 2 below
+2.000+0.000i +5.000+0.000i +8.000+0.000i +11.000+0.000i
+14.000+0.000i +17.000+0.000i +20.000+0.000i +23.000+0.000i
+26.000+0.000i +29.000+0.000i +32.000+0.000i +35.000+0.000i
+38.000+0.000i +41.000+0.000i +44.000+0.000i +47.000+0.000i
Printing 3D real matrix
---------------------------------------------------------------------------- plane 0 below
23.500 -1.500 -1.500 -1.500
-6.000 0.000 0.000 0.000
-6.000 0.000 0.000 0.000
-6.000 0.000 0.000 0.000
---------------------------------------------------------------------------- plane 1 below
-0.500 0.750 0.000 -0.750
3.000 0.000 0.000 0.000
0.000 0.000 0.000 0.000
-3.000 0.000 0.000 0.000
---------------------------------------------------------------------------- plane 2 below
0.000 0.000 0.000 0.000
0.000 0.000 0.000 0.000
0.000 0.000 0.000 0.000
0.000 0.000 0.000 0.000
---------------------------------------------------------------------------- plane 3 below
-0.500 -0.750 0.000 0.750
-3.000 0.000 0.000 0.000
0.000 0.000 0.000 0.000
3.000 0.000 0.000 0.000
Printing 3D complex matrix
---------------------------------------------------------------------------- plane 0 below
+0.000+0.000i +6.000+0.000i +6.000+0.000i +6.000+0.000i
+24.000+0.000i +30.000+0.000i +30.000+0.000i +30.000+0.000i
+24.000+0.000i +30.000+0.000i +30.000+0.000i +30.000+0.000i
+24.000+0.000i +30.000+0.000i +30.000+0.000i +30.000+0.000i
---------------------------------------------------------------------------- plane 1 below
+1.000+0.000i +4.000+0.000i +7.000+0.000i +10.000+0.000i
+13.000+0.000i +16.000+0.000i +19.000+0.000i +22.000+0.000i
+25.000+0.000i +28.000+0.000i +31.000+0.000i +34.000+0.000i
+37.000+0.000i +40.000+0.000i +43.000+0.000i +46.000+0.000i
---------------------------------------------------------------------------- plane 2 below
+2.000+0.000i +8.000+0.000i +8.000+0.000i +8.000+0.000i
+26.000+0.000i +32.000+0.000i +32.000+0.000i +32.000+0.000i
+26.000+0.000i +32.000+0.000i +32.000+0.000i +32.000+0.000i
+26.000+0.000i +32.000+0.000i +32.000+0.000i +32.000+0.000i
$
(*) You can argue, if you wish, that the complex-conjugate symmetry feature of the C2R and R2C transforms should account for a correct mapping of all possible complex "frequency" domain signals into unique, purely real "time" domain signals. I claim, without proof, that it does not, based on 2 data points:
The example code in this question.
Since the complex space in a C2R or R2C transform is numerically larger than the real space (by a factor of (2*(N/2+1))/N), it stands to reason that there cannot be a unique 1:1 mapping of all possible complex signals into unique real signals. And the unique 1:1 mapping would be necessary for full reversibility.
For additional background on the possibility of lack of symmetry in random data, note the discussion around CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC in the cufft documentation.

Related

Wrong results cufft 3D in-place

I write because I'm facing problems with the cufft 3D transform in-place, while I have no problems for the out-of-place version. I tried to follow Robert Crovella's answer here but I'm not obtaining the correct results when I make a FFT+IFT.
This is my code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <complex.h>
#include <cuComplex.h>
#include <cufft.h>
// Main function
/* Abort with a readable message when a CUDA runtime call fails. */
static void check_cuda(cudaError_t status, const char *what){
    if (status != cudaSuccess){
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}
/* Abort with a readable message when a cuFFT call fails. */
static void check_cufft(cufftResult status, const char *what){
    if (status != CUFFT_SUCCESS){
        fprintf(stderr, "%s failed: cufftResult %d\n", what, (int)status);
        exit(EXIT_FAILURE);
    }
}
/* In-place real -> complex -> real round trip (D2Z followed by Z2D) on an
 * N x N x N volume, printing the data before and after.
 * For an in-place transform the real data must be laid out with the innermost
 * (z) dimension padded to 2*(N/2+1) doubles, so that each line occupies the
 * same storage as its N/2+1 complex outputs.
 * Returns 0 on success; exits with EXIT_FAILURE on any CUDA/cuFFT error. */
int main(int argc, char **argv){
    (void)argc;
    (void)argv;
    const int N = 4;
    const int N_pad = 2 * (N/2 + 1);    /* padded z stride of the real view */
    double *in = NULL, *d_in = NULL;
    cuDoubleComplex *d_out = NULL;
    cufftHandle plan_r2c, plan_c2r;
    unsigned int out_mem_size = sizeof(cuDoubleComplex) * N*N*(N/2 + 1);
    unsigned int in_mem_size = out_mem_size;   /* == N*N*N_pad doubles */

    /* calloc zero-fills the buffer, padding included. */
    in = (double *) calloc(1, in_mem_size);
    if (in == NULL){
        fprintf(stderr, "host allocation of %u bytes failed\n", in_mem_size);
        return EXIT_FAILURE;
    }
    check_cuda(cudaMalloc((void **)&d_in, in_mem_size), "cudaMalloc");
    d_out = (cuDoubleComplex *)d_in;    /* complex view of the same device storage */

    check_cufft(cufftPlan3d(&plan_r2c, N, N, N, CUFFT_D2Z), "cufftPlan3d(D2Z)");
    check_cufft(cufftPlan3d(&plan_c2r, N, N, N, CUFFT_Z2D), "cufftPlan3d(Z2D)");

    unsigned int idx;
    for (int z = 0; z < N; z++){
        for (int y = 0; y < N; y++){
            for (int x = 0; x < N; x++){
                idx = z + N_pad * ( y + x * N);   /* padded stride, not N */
                in[idx] = idx;
            }
        }
    }
    printf("\nStart: \n");
    for (int z = 0; z < N; z++){
        printf("plane = %d ----------------------------\n", z);
        for (int x = 0; x < N; x++){
            for (int y = 0; y < N; y++){
                idx = z + N_pad * ( y + x * N);
                printf("%.3f \t", in[idx]);
            }
            printf("\n");
        }
    }

    check_cuda(cudaMemcpy(d_in, in, in_mem_size, cudaMemcpyHostToDevice), "cudaMemcpy H2D");
    check_cufft(cufftExecD2Z(plan_r2c, (cufftDoubleReal *)d_in, (cufftDoubleComplex *)d_out), "cufftExecD2Z");
    check_cufft(cufftExecZ2D(plan_c2r, (cufftDoubleComplex *)d_out, (cufftDoubleReal *)d_in), "cufftExecZ2D");
    /* The copy below overwrites the whole host buffer, so it is not cleared first. */
    check_cuda(cudaMemcpy(in, d_in, in_mem_size, cudaMemcpyDeviceToHost), "cudaMemcpy D2H");

    printf("\nAfter FFT+IFT: \n");
    for (int z = 0; z < N; z++){
        printf("plane = %d ----------------------------\n", z);
        for (int x = 0; x < N; x++){
            for (int y = 0; y < N; y++){
                idx = z + N_pad * ( y + x * N);
                // Normalisation (cuFFT transforms are unnormalised)
                in[idx] /= (N*N*N);
                printf("%.3f \t", in[idx]);
            }
            printf("\n");
        }
    }

    /* Release plans and buffers. */
    check_cufft(cufftDestroy(plan_r2c), "cufftDestroy(D2Z)");
    check_cufft(cufftDestroy(plan_c2r), "cufftDestroy(Z2D)");
    check_cuda(cudaFree(d_in), "cudaFree");
    free(in);
    return 0;
}
The program outputs the following data:
Starting file
plane = 0 ----------------------------
0.000 4.000 8.000 12.000
16.000 20.000 24.000 28.000
32.000 36.000 40.000 44.000
48.000 52.000 56.000 60.000
plane = 1 ----------------------------
1.000 5.000 9.000 13.000
17.000 21.000 25.000 29.000
33.000 37.000 41.000 45.000
49.000 53.000 57.000 61.000
plane = 2 ----------------------------
2.000 6.000 10.000 14.000
18.000 22.000 26.000 30.000
34.000 38.000 42.000 46.000
50.000 54.000 58.000 62.000
plane = 3 ----------------------------
3.000 7.000 11.000 15.000
19.000 23.000 27.000 31.000
35.000 39.000 43.000 47.000
51.000 55.000 59.000 63.000
After FFT+IFT
plane = 0 ----------------------------
-0.000 -0.344 8.000 12.000
-0.031 20.000 24.000 -0.031
32.000 36.000 0.031 44.000
48.000 -0.094 56.000 60.000
plane = 1 ----------------------------
1.000 -0.000 9.000 13.000
-0.000 21.000 25.000 0.125
33.000 37.000 0.000 45.000
49.000 0.000 57.000 61.000
plane = 2 ----------------------------
2.000 6.000 -0.000 14.000
18.000 0.000 26.000 30.000
0.000 38.000 42.000 -0.000
50.000 54.000 -0.000 62.000
plane = 3 ----------------------------
3.000 7.000 0.031 15.000
19.000 -0.031 27.000 31.000
-0.031 39.000 43.000 0.031
51.000 55.000 0.031 63.000
I even tried to pad the data this way:
// With padding
// NOTE(review): z runs over the padded extent 2*(N/2+1), but idx below still
// uses N as the stride of the (y, x) part. With that stride the z >= N
// iterations address the start of the next (x, y) line instead of dedicated
// padding slots, so the array is not actually padded by this loop.
unsigned int idx;
for (int x = 0; x < N; x++){
for (int y = 0; y < N; y++){
for (int z = 0; z < 2*(N/2+1); z++){
idx = z + N * ( y + x * N);
if (z < 4) in[idx] = idx;
else in[idx] = 0;
}
}
}
What am I doing wrong?
As you already found out, you need padding if you use the CUFFT_COMPATIBILITY_FFTW_PADDING compatibility mode, which is the default. For your code to work you could use cufftSetCompatibilityMode() to set CUFFT_COMPATIBILITY_NATIVE. However, this mode is marked as deprecated in the current version of CUDA.
Therefore, I recommend using the default compatibility mode together with padding. Your attempt to implement padding is wrong. The formula to calculate a linear index for 3 dimensions x, y, z, where z is the fastest running index, is idx = z + Nz*(y + Ny*x). The size Nz of the z dimension including padding is Nz = (N/2+1)*2. The correct initialization of the array is then:
// Fill the N x N x N real values of the padded in-place layout.
// Only z < N is written; the stride of the (y, x) part is the padded z
// extent (N/2+1)*2, which leaves the trailing elements of each line untouched.
unsigned int idx;
for (int z = 0; z < N; z++){
for (int y = 0; y < N; y++){
for (int x = 0; x < N; x++){
// linear index with z as the fastest-running dimension
idx = z + (N/2+1)*2 * ( y + x * N);
in[idx] = idx;
}
}
}
Accordingly for the print loops.

Covariance calculation with CUDA

I am implementing Principal Component Analysis (PCA) based face recognition using CUDA. I used orl face database and calculated the mean image and normalized images. I'm facing a problem in calculating the covariance matrix.
// Per-pixel mean over a stack of `num` images, plus the absolute deviation of
// every image from that mean.
//   i_data     : input stack; image z starts at offset z * WIDTH * HEIGHT
//   num        : number of images in the stack
//   size       : unused by this kernel (kept for interface compatibility)
//   o_data     : output mean image, WIDTH * HEIGHT entries (integer division)
//   WIDTH      : extent of the x index, also the stride of the y index
//   HEIGHT     : extent of the y index
//   normalized : output stack, same layout as i_data, |i_data - mean|
// Launch: 2D grid of 2D blocks, one thread per pixel; the grid may overshoot
// the image, surplus threads exit immediately.
// NOTE(review): the deviation is stored as an absolute value; a covariance
// computation would normally need the signed difference - confirm intent.
__global__ void mean(int* i_data, int num, int size, int* o_data, int WIDTH, int HEIGHT, int* normalized)
{
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;
    // Guard: threads outside the image (grid tail) or an empty stack do nothing.
    if (x >= WIDTH || y >= HEIGHT || num <= 0) return;

    const int pixel = x + y * WIDTH;
    const int imageStride = WIDTH * HEIGHT;

    int sum = 0;
    for (int z = 0; z < num; ++z)
        sum += i_data[z * imageStride + pixel];

    const int average = sum / num;
    o_data[pixel] = average;

    for (int z = 0; z < num; ++z)
    {
        const int idx_z = z * imageStride + pixel;
        normalized[idx_z] = abs(i_data[idx_z] - average);
    }
}
dim3 dimBlock = dim3(8,4,1);
// Ceil-division in integer arithmetic: ceil(rows/dimBlock.x) truncated the
// quotient before ceil() was applied, leaving the image tail uncovered.
dim3 dimGrid = dim3((rows + dimBlock.x - 1) / dimBlock.x, (cols + dimBlock.y - 1) / dimBlock.y);
mean<<<dimGrid,dimBlock>>>(dev_images, IMAGE_NUM,size,dev_output,rows,cols,dev_normalized);
The database images are of size (92,112).
Your code does not make any sense to me.
Covariance calculation in CUDA can be easily performed by using cuBLAS in conjunction with Thrust. Considering N realizations of K random variables, the covariance estimation formula is the following
$$q_{jk} = \frac{1}{N-1}\sum_{i=1}^{N}\left(X_{ij}-\bar{X}_j\right)\left(X_{ik}-\bar{X}_k\right)$$
where $q_{jk}$, $j,k=1,\ldots,K$, are the covariance estimate values, $X_{ij}$ is the $i$-th realization of the $j$-th random variable, and $\bar{X}_j$ and $\bar{X}_k$ are the random variable means as estimated from the available realizations.
Below, I'm reporting a fully worked example:
#include <cublas_v2.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/random.h>
#include <thrust/sequence.h>
#include <stdio.h>
#include <iostream>
#include "Utilities.cuh"
#include "TimingGPU.cuh"
/*************************************/
/* CONVERT LINEAR INDEX TO ROW INDEX */
/*************************************/
// Functor mapping a linear index i to i / Ncols (integer division for integral T).
// The call operator is const so the functor can also be invoked through a
// const object or const reference.
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T> {
    T Ncols; // --- Number of columns (the divisor applied to the linear index)
    __host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}
    __host__ __device__ T operator()(T i) const { return i / Ncols; }
};
/********/
/* MAIN */
/********/
// Worked example: sample covariance of NX random variables from Nsamples
// realizations, using cuBLAS for the reductions/products and Thrust for the
// element-wise steps. d_X is handed to cuBLAS as a column-major
// Nsamples x NX matrix (leading dimension Nsamples), i.e. one column per
// random variable. Prints the NX*NX covariance entries, one per line.
int main()
{
    const int Nsamples = 3; // --- Number of realizations for each random variable (number of rows of the X matrix)
    const int NX = 4; // --- Number of random variables (number of columns of the X matrix)

    // --- Random uniform integer distribution between 10 and 99
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist(10, 99);

    // --- Matrix allocation and initialization
    thrust::device_vector<float> d_X(Nsamples * NX);
    for (size_t i = 0; i < d_X.size(); i++) d_X[i] = (float)dist(rng);

    // --- cuBLAS handle creation
    cublasHandle_t handle;
    cublasSafeCall(cublasCreate(&handle));

    /*************************************************/
    /* CALCULATING THE MEANS OF THE RANDOM VARIABLES */
    /*************************************************/
    // --- Array containing the means: alpha = 1 / Nsamples already scales the column sums
    thrust::device_vector<float> d_means(NX);
    thrust::device_vector<float> d_ones(Nsamples, 1.f);
    float alpha = 1.f / (float)Nsamples;
    float beta = 0.f;
    // --- d_means = alpha * X^T * ones
    cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_T, Nsamples, NX, &alpha, thrust::raw_pointer_cast(d_X.data()), Nsamples,
        thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_means.data()), 1));

    /**********************************************/
    /* SUBTRACTING THE MEANS FROM THE MATRIX ROWS */
    /**********************************************/
    // --- Element i of d_X gets d_means[i / Nsamples] subtracted, i.e. the mean
    //     of the column-major column (random variable) it belongs to
    thrust::transform(
        d_X.begin(), d_X.end(),
        thrust::make_permutation_iterator(
            d_means.begin(),
            thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nsamples))),
        d_X.begin(),
        thrust::minus<float>());

    /*************************************/
    /* CALCULATING THE COVARIANCE MATRIX */
    /*************************************/
    thrust::device_vector<float> d_cov(NX * NX);
    alpha = 1.f;
    // --- d_cov = X^T * X on the centred data (NX x NX)
    cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, NX, NX, Nsamples, &alpha,
        thrust::raw_pointer_cast(d_X.data()), Nsamples, thrust::raw_pointer_cast(d_X.data()), Nsamples, &beta,
        thrust::raw_pointer_cast(d_cov.data()), NX));

    // --- Final normalization by Nsamples - 1
    thrust::transform(
        d_cov.begin(), d_cov.end(),
        thrust::make_constant_iterator((float)(Nsamples-1)),
        d_cov.begin(),
        thrust::divides<float>());

    for(int i = 0; i < NX * NX; i++) std::cout << d_cov[i] << "\n";

    // --- Release the cuBLAS context (it was never destroyed before)
    cublasSafeCall(cublasDestroy(handle));
    return 0;
}
I implemented a covariance calculator with cuBLAS and CUDA Thrust and compared its output with online covariance calculation tools; it seems to produce good results. The code below was written for QDA (Bayes) classification, so the given matrix may contain more than one class, and therefore multiple covariance matrices are calculated. I hope it will be useful for someone.
//! Calculates one covariance matrix per class from the given data.
// There can be several classes, hence several covariance matrices.
/*!
\param inMatrix Matrix data for all classes, stored back to back. Each class is handed to
cuBLAS as a column-major (features x trials) matrix, i.e. every column holds one trial.
For example, if inMatrix=[1 2 3 4 5 6] and trialSizes=[2], the matrix worked on is:
|1 4 |
|2 5 |
|3 6 | -> 2 trials (columns), 3 features (rows)
\param trialSizes Number of trials (samples) of each class, in the order the classes appear in inMatrix.
For example, if inMatrix=[1 2 3 4 5 6 7 8 9 10 11 12] and trialSizes=[2,2],
the matrices worked on are:
|1 4 | |7 10 |
|2 5 | |8 11 |
|3 6 | |9 12 | --> total number of trials = 2 + 2 = 4,
so the feature vector size is inMatrix.size()/4 = 3
--> trialSizes has two elements, so there are two classes with two samples each

For each class the result is (X*transpose(X))/N - (s*transpose(s))/(N*N), where s is the
per-feature sum over the N trials of that class.
NOTE(review): only the matrix of the last class is printed, after the loop - confirm
whether every class was meant to be printed.
*/
void multiQDACovianceCalculator(std::vector<float>& inMatrix, std::vector<int>& trialSizes)
{
    // Nothing to do (and nothing safe to divide by) without any trials.
    if (trialSizes.empty()) return;
    const int rowSize = std::accumulate(trialSizes.begin(), trialSizes.end(), 0);
    if (rowSize <= 0) return;
    const int dimensionSize = inMatrix.size() / rowSize;

    // Scalars passed to cuBLAS by pointer. They are kept separate on purpose:
    // the original reused one `beta`, set it to -1 for the final subtraction and
    // never reset it, so every class after the first accumulated into stale data.
    const float zero = 0.0f;
    const float one = 1.0f;
    const float minusOne = -1.0f;
    float alpha = 1.0f;

    thrust::device_vector<float> d_cov1(dimensionSize * dimensionSize);
    thrust::device_vector<float> d_cov2(dimensionSize * dimensionSize);
    thrust::device_vector<float> d_covResult(dimensionSize * dimensionSize);
    thrust::device_vector<float> d_wholeMatrix(inMatrix);
    thrust::device_vector<float> d_meansVec(dimensionSize); // per-feature sums over the trials of one class
    float *meanVecPtr = thrust::raw_pointer_cast(d_meansVec.data());
    float *device2DMatrixPtr = thrust::raw_pointer_cast(d_wholeMatrix.data());
    auto maxTrialNumber = *std::max_element(trialSizes.begin(), trialSizes.end());
    thrust::device_vector<float> deviceVector(maxTrialNumber, 1.0f); // vector of ones

    cublasHandle_t handle; // CUBLAS context
    // Do not run on an invalid context if it cannot be created.
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) return;

    // Inside the loop one covariance matrix is calculated per iteration
    for (size_t i = 0; i < trialSizes.size(); i++)
    {
        const int trials = trialSizes[i];
        if (trials <= 0) continue; // empty class: no data to consume, avoids division by zero

        // X*transpose(X) / N
        alpha = 1.0f / trials;
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, dimensionSize, dimensionSize, trials, &alpha,
            device2DMatrixPtr, dimensionSize, device2DMatrixPtr, dimensionSize, &zero,
            thrust::raw_pointer_cast(d_cov1.data()), dimensionSize);

        // Per-feature sum over all trials: X * ones
        cublasSgemv(handle, CUBLAS_OP_N, dimensionSize, trials, &one, device2DMatrixPtr,
            dimensionSize, thrust::raw_pointer_cast(deviceVector.data()), 1, &zero, meanVecPtr, 1);

        // SumVec * transpose(SumVec) / (N*N); the product is formed in float to avoid int overflow
        alpha = 1.0f / ((float)trials * (float)trials);
        cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, dimensionSize, dimensionSize, 1, &alpha,
            meanVecPtr, 1, meanVecPtr, 1, &zero,
            thrust::raw_pointer_cast(d_cov2.data()), dimensionSize);

        // (X*transpose(X) / N) - (SumVec * transpose(SumVec) / (N*N))
        cublasSgeam(handle, CUBLAS_OP_N, CUBLAS_OP_N, dimensionSize, dimensionSize, &one,
            thrust::raw_pointer_cast(d_cov1.data()), dimensionSize, &minusOne, thrust::raw_pointer_cast(d_cov2.data()),
            dimensionSize, thrust::raw_pointer_cast(d_covResult.data()), dimensionSize);

        // Go to the next class and calculate its covariance matrix
        device2DMatrixPtr += trials * dimensionSize;
    }
    printVector(d_covResult);
    cublasDestroy(handle);
}

Solving dense linear systems AX = B with CUDA

Can I use the new cuSOLVER library (CUDA 7) to solve linear systems of the form
AX = B
where A, X and B are NxN dense matrices ?
Yes.
Approach nr. 1
In the framework of cuSOLVER you can use QR decomposition, see QR decomposition to solve linear systems in CUDA.
Approach nr. 2
Alternatively, you can calculate the matrix inverse by the successive invocation of
cublas<t>getrfBatched()
which calculates the LU decomposition of a matrix, and
cublas<t>getriBatched()
which calculates the inverse of the matrix starting from its LU decomposition.
Approach nr. 3
A final possibility is using
cublas<t>getrfBatched()
followed by a twofold invocation of
cublas<t>trsm()
which solves upper or lower triangular linear systems.
As pointed out by Robert Crovella, the answer may vary on the size and the type of the involved matrices.
Code for approach nr. 1
Please, see QR decomposition to solve linear systems in CUDA.
Code for approaches nr. 2 and nr. 3
Below, I'm reporting a worked example for the implementation of approaches nr. 2 and 3. Hankel matrices are used to feed the approaches with well-conditioned, invertible matrices. Please note that approach nr. 3 requires permuting (rearranging) the system coefficients vector according to the pivot array obtained from the invocation of cublas<t>getrfBatched(). This permutation can be conveniently done on the CPU. Note that, in the code below, the sections and timers labelled "approach nr. 1" and "approach nr. 2" implement approaches nr. 2 and nr. 3 of this answer, respectively.
#include <stdio.h>
#include <fstream>
#include <iomanip>
#include <stdlib.h> /* srand, rand */
#include <time.h> /* time */
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define prec_save 10
#define BLOCKSIZE 256
#define BLOCKSIZEX 16
#define BLOCKSIZEY 16
/************************************/
/* SAVE REAL ARRAY FROM CPU TO FILE */
/************************************/
// Writes the first M elements of the host array h_in to the text file
// `filename`, one value per line, using prec_save significant digits.
template <class T>
void saveCPUrealtxt(const T * h_in, const char *filename, const int M) {
	std::ofstream sink(filename);   // opening via the constructor is equivalent to open(filename)
	int written = 0;
	while (written < M) {
		sink << std::setprecision(prec_save) << h_in[written] << "\n";
		++written;
	}
	sink.close();
}
/************************************/
/* SAVE REAL ARRAY FROM GPU TO FILE */
/************************************/
// Copies the first M elements of the device array d_in to a temporary host
// buffer and writes them to the text file `filename`, one value per line
// (same format as saveCPUrealtxt). The temporary buffer is released before
// returning. If the host buffer cannot be allocated, an error is reported on
// stderr and nothing is written.
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {
	const size_t bytes = (size_t)M * sizeof(T);
	T *h_in = (T *)malloc(bytes);
	if (h_in == NULL && bytes != 0) {
		fprintf(stderr, "saveGPUrealtxt: host allocation of %zu bytes failed for %s\n", bytes, filename);
		return;
	}
	gpuErrchk(cudaMemcpy(h_in, d_in, bytes, cudaMemcpyDeviceToHost));
	// Reuse the host writer so both savers always produce the same format.
	saveCPUrealtxt(h_in, filename, M);
	free(h_in);   // the original leaked this staging buffer
}
/***************************************************/
/* FUNCTION TO SET THE VALUES OF THE HANKEL MATRIX */
/***************************************************/
// --- https://en.wikipedia.org/wiki/Hankel_matrix
// Fills the N x N array h_A with a random Hankel matrix: entry (i, j) depends
// only on i + j, so every anti-diagonal is constant and the matrix is symmetric.
// The generator is re-seeded from the current time on every call.
void setHankelMatrix(double * __restrict h_A, const int N) {
	// One random value per anti-diagonal; there are 2N - 1 of them.
	const int numDiagonals = 2 * N - 1;
	double *diagValues = (double *)malloc(numDiagonals * sizeof(double));

	// --- Initialize random seed
	srand(time(NULL));

	// --- Generate random numbers
	for (int d = 0; d < numDiagonals; ++d) diagValues[d] = rand();

	// --- Row `row` of the matrix is the window diagValues[row .. row + N - 1]
	for (int row = 0; row < N; ++row) {
		double *rowPtr = h_A + row * N;
		const double *window = diagValues + row;
		for (int col = 0; col < N; ++col) rowPtr[col] = window[col];
	}

	free(diagValues);
}
/***********************************************/
/* FUNCTION TO COMPUTE THE COEFFICIENTS VECTOR */
/***********************************************/
// Computes h_y[m] = sum over n of h_A[n * N + m] * h_xref[n] for m = 0..N-1,
// i.e. the right-hand side obtained from the reference solution h_xref.
// h_y is fully overwritten.
void computeCoefficientsVector(const double * __restrict h_A, const double * __restrict h_xref,
	double * __restrict h_y, const int N) {
	for (int outIdx = 0; outIdx < N; ++outIdx) {
		// Accumulate in the same n = 0..N-1 order as a running sum started at zero.
		double acc = 0.0;
		for (int inIdx = 0; inIdx < N; ++inIdx)
			acc = acc + h_A[inIdx * N + outIdx] * h_xref[inIdx];
		h_y[outIdx] = acc;
	}
}
/************************************/
/* COEFFICIENT REARRANGING FUNCTION */
/************************************/
// Applies the row interchanges recorded in pivotArray to vec, in order:
// for i = 0..N-1, element i is swapped with element pivotArray[i] - 1
// (the pivot indices are 1-based).
void rearrange(double *vec, int *pivotArray, int N){
	for (int step = 0; step < N; ++step) {
		double *partner = vec + (pivotArray[step] - 1);
		const double held = *partner;
		*partner = vec[step];
		vec[step] = held;
	}
}
/********/
/* MAIN */
/********/
// Solves A*x = y for one N x N Hankel matrix in two ways, both starting from
// the batched LU factorization of A:
//   "approach 1" - build the inverse with getriBatched and multiply it by y;
//   "approach 2" - permute y with the pivots and run two triangular solves.
// Timings are printed and the intermediate/final arrays are saved to text files.
int main() {

	const unsigned int N = 1000;

	const unsigned int Nmatrices = 1;

	// --- CUBLAS initialization
	cublasHandle_t cublas_handle;
	cublasSafeCall(cublasCreate(&cublas_handle));

	TimingGPU timerLU, timerApproach1, timerApproach2;
	double timingLU, timingApproach1, timingApproach2;

	/***********************/
	/* SETTING THE PROBLEM */
	/***********************/
	// --- Matrices to be inverted (only one in this example)
	double *h_A = (double *)malloc(N * N * Nmatrices * sizeof(double));

	// --- Setting the Hankel matrix
	setHankelMatrix(h_A, N);

	// --- Defining the solution
	double *h_xref = (double *)malloc(N * sizeof(double));
	for (unsigned int k = 0; k < N; k++) h_xref[k] = 1.0;

	// --- Coefficient vectors (only one in this example)
	double *h_y = (double *)malloc(N * sizeof(double));

	computeCoefficientsVector(h_A, h_xref, h_y, N);

	// --- Allocate device space for the input matrices
	double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * Nmatrices * sizeof(double)));
	double *d_y; gpuErrchk(cudaMalloc(&d_y, N * sizeof(double)));
	double *d_x; gpuErrchk(cudaMalloc(&d_x, N * sizeof(double)));

	// --- Move the relevant matrices from host to device
	gpuErrchk(cudaMemcpy(d_A, h_A, N * N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));
	gpuErrchk(cudaMemcpy(d_y, h_y, N * sizeof(double), cudaMemcpyHostToDevice));

	/**********************************/
	/* COMPUTING THE LU DECOMPOSITION */
	/**********************************/
	timerLU.StartCounter();

	// --- Creating the array of pointers needed as input/output to the batched getrf
	double **h_inout_pointers = (double **)malloc(Nmatrices * sizeof(double *));
	for (unsigned int i = 0; i < Nmatrices; i++) h_inout_pointers[i] = d_A + (size_t)i * N * N;

	double **d_inout_pointers;
	gpuErrchk(cudaMalloc(&d_inout_pointers, Nmatrices * sizeof(double *)));
	gpuErrchk(cudaMemcpy(d_inout_pointers, h_inout_pointers, Nmatrices * sizeof(double *), cudaMemcpyHostToDevice));
	free(h_inout_pointers);

	int *d_pivotArray; gpuErrchk(cudaMalloc(&d_pivotArray, N * Nmatrices * sizeof(int)));
	int *d_InfoArray; gpuErrchk(cudaMalloc(&d_InfoArray, Nmatrices * sizeof(int)));

	int *h_InfoArray = (int *)malloc(Nmatrices * sizeof(int));

	// --- In-place factorization: d_A is overwritten by its L and U factors
	cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, d_pivotArray, d_InfoArray, Nmatrices));
	//cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, NULL, d_InfoArray, Nmatrices));

	gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

	for (unsigned int i = 0; i < Nmatrices; i++)
		if (h_InfoArray[i] != 0) {
			fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
			cudaDeviceReset();
			exit(EXIT_FAILURE);
		}

	timingLU = timerLU.GetCounter();
	printf("Timing LU decomposition %f [ms]\n", timingLU);

	/*********************************/
	/* CHECKING THE LU DECOMPOSITION */
	/*********************************/
	saveCPUrealtxt(h_A, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\A.txt", N * N);
	saveCPUrealtxt(h_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\y.txt", N);
	saveGPUrealtxt(d_A, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\Adecomposed.txt", N * N);
	saveGPUrealtxt(d_pivotArray, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\pivotArray.txt", N);

	/******************************************************************************/
	/* APPROACH NR.1: COMPUTE THE INVERSE OF A STARTING FROM ITS LU DECOMPOSITION */
	/******************************************************************************/
	timerApproach1.StartCounter();

	// --- Allocate device space for the inverted matrices
	double *d_Ainv; gpuErrchk(cudaMalloc(&d_Ainv, N * N * Nmatrices * sizeof(double)));

	// --- Creating the array of pointers needed as output to the batched getri
	double **h_out_pointers = (double **)malloc(Nmatrices * sizeof(double *));
	for (unsigned int i = 0; i < Nmatrices; i++) h_out_pointers[i] = (double *)((char*)d_Ainv + i * ((size_t)N * N) * sizeof(double));

	double **d_out_pointers;
	gpuErrchk(cudaMalloc(&d_out_pointers, Nmatrices*sizeof(double *)));
	gpuErrchk(cudaMemcpy(d_out_pointers, h_out_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice));
	free(h_out_pointers);

	cublasSafeCall(cublasDgetriBatched(cublas_handle, N, (const double **)d_inout_pointers, N, d_pivotArray, d_out_pointers, N, d_InfoArray, Nmatrices));

	gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

	for (unsigned int i = 0; i < Nmatrices; i++)
		if (h_InfoArray[i] != 0) {
			fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singular\n", i);
			cudaDeviceReset();
			exit(EXIT_FAILURE);
		}

	// --- x = inv(A) * y
	double alpha1 = 1.0;
	double beta1 = 0.0;

	cublasSafeCall(cublasDgemv(cublas_handle, CUBLAS_OP_N, N, N, &alpha1, d_Ainv, N, d_y, 1, &beta1, d_x, 1));

	timingApproach1 = timingLU + timerApproach1.GetCounter();
	printf("Timing approach 1 %f [ms]\n", timingApproach1);

	/**************************/
	/* CHECKING APPROACH NR.1 */
	/**************************/
	saveGPUrealtxt(d_x, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach1.txt", N);

	/*************************************************************/
	/* APPROACH NR.2: INVERT UPPER AND LOWER TRIANGULAR MATRICES */
	/*************************************************************/
	timerApproach2.StartCounter();

	// --- Bring the coefficients vector back to the host to permute it.
	//     The vector holds doubles: the original copied N * sizeof(int) bytes,
	//     i.e. only half of it.
	gpuErrchk(cudaMemcpy(h_y, d_y, N * Nmatrices * sizeof(double), cudaMemcpyDeviceToHost));
	int *h_pivotArray = (int *)malloc(N * Nmatrices*sizeof(int));
	gpuErrchk(cudaMemcpy(h_pivotArray, d_pivotArray, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

	rearrange(h_y, h_pivotArray, N);
	gpuErrchk(cudaMemcpy(d_y, h_y, N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));

	// --- Now P*A=L*U
	// Linear system A*x=y => P.'*L*U*x=y => L*U*x=P*y

	// --- 1st phase - solve Ly = b
	const double alpha = 1.0;

	// --- Function solves the triangular linear system with multiple right hand sides, function overrides b as a result

	// --- Lower triangular part (unit diagonal)
	cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, d_A, N, d_y, N));

	// --- Upper triangular part
	cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, d_A, N, d_y, N));

	timingApproach2 = timingLU + timerApproach2.GetCounter();
	printf("Timing approach 2 %f [ms]\n", timingApproach2);

	/**************************/
	/* CHECKING APPROACH NR.2 */
	/**************************/
	saveGPUrealtxt(d_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach2.txt", N);

	/***********/
	/* CLEANUP */
	/***********/
	free(h_pivotArray);
	free(h_InfoArray);
	free(h_y);
	free(h_xref);
	free(h_A);

	gpuErrchk(cudaFree(d_Ainv));
	gpuErrchk(cudaFree(d_out_pointers));
	gpuErrchk(cudaFree(d_InfoArray));
	gpuErrchk(cudaFree(d_pivotArray));
	gpuErrchk(cudaFree(d_inout_pointers));
	gpuErrchk(cudaFree(d_x));
	gpuErrchk(cudaFree(d_y));
	gpuErrchk(cudaFree(d_A));

	cublasSafeCall(cublasDestroy(cublas_handle));

	return 0;
}
The Utilities.cu and Utilities.cuh files needed to run such an example are maintained at this github page. The TimingGPU.cu and TimingGPU.cuh files are maintained at this github page.
Some useful references on the third approach:
NAG Fortran Library Routine Document
Scientific Computing Software Library (SCSL) User’s Guide
https://www.cs.drexel.edu/~jjohnson/2010-11/summer/cs680/programs/lapack/Danh/verify_sequential.c
EDIT
Timings (in ms) for approaches nr. 2 and 3 (tests performed on a GTX960 card, cc. 5.2).
N LU decomposition Approach nr. 2 Approach nr. 3
100 1.08 2.75 1.28
500 45.4 161 45.7
1000 302 1053 303
As it emerges, approach nr. 3 is more convenient and its cost is essentially the cost of computing the LU factorization. Furthermore:
Solving linear systems by LU decomposition is faster than using QR decomposition (see QR decomposition to solve linear systems in CUDA);
LU decomposition is limited to square linear systems, while QR decomposition helps in case of non-square linear systems.
The below Matlab code can be used for checking the results
% Verification script: loads the text files written by the CUDA program and
% prints percentage rms errors for the matrix-vector product, the LU
% factorization and the two solution approaches.
clear all
close all
clc
warning off
% N must match the matrix size used when the text files were written
N = 1000;
% --- Setting the problem solution
x = ones(N, 1);
%%%%%%%%%%%%%%%%%%%%%
% NxN HANKEL MATRIX %
%%%%%%%%%%%%%%%%%%%%%
% --- https://en.wikipedia.org/wiki/Hankel_matrix
% --- A.txt and y.txt are loaded into the variables A and y (presumably the matrix
%     and right-hand side saved by the CUDA code; verify against the writer)
load A.txt
load y.txt
% --- A is read as a flat list and reshaped column-wise into an N x N matrix
A = reshape(A, N, N);
yMatlab = A * x;
fprintf('Percentage rms between coefficients vectors in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(yMatlab - y).^2)) / sum(sum(abs(yMatlab).^2))));
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% COMPUTATION OF THE LU DECOMPOSITION %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% --- Two-output lu: Lmatlab * Umatlab = A (see the MATLAB lu documentation for the
%     row-permuted form of the first output)
[Lmatlab, Umatlab] = lu(A);
load Adecomposed.txt
Adecomposed = reshape(Adecomposed, N, N);
% --- Unpack the combined factorization: the strictly lower part gives L (unit diagonal)...
L = eye(N);
for k = 1 : N
L(k + 1 : N, k) = Adecomposed(k + 1 : N, k);
end
% --- ...and the upper part, diagonal included, gives U
U = zeros(N);
for k = 1 : N
U(k, k : N) = Adecomposed(k, k : N);
end
load pivotArray.txt
% --- Build the permutation matrix by applying the row swaps j <-> pivotArray(j) in order
%     (pivotArray entries are used directly as 1-based row indices)
Pj = eye(N);
for j = 1 : N
tempVector = Pj(j, :);
Pj(j, :) = Pj(pivotArray(j), :);
Pj(pivotArray(j), :) = tempVector;
end
fprintf('Percentage rms between Pj * A and L * U in CUDA %f\n', 100 * sqrt(sum(sum(abs(Pj * A - L * U).^2)) / sum(sum(abs(Pj * A).^2))));
% --- Reference solution in Matlab: x = inv(U) * inv(L) * y
xprime = inv(Lmatlab) * yMatlab;
xMatlab = inv(Umatlab) * xprime;
fprintf('Percentage rms between reference solution and solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));
% --- Compare each CUDA solution file against the known solution x
load xApproach1.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.1 %f\n', 100 * sqrt(sum(sum(abs(xApproach1 - x).^2)) / sum(sum(abs(x).^2))));
load xApproach2.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.2 %f\n', 100 * sqrt(sum(sum(abs(xApproach2 - x).^2)) / sum(sum(abs(x).^2))));

Instructions Per Cycle (IPC) and Instruction Level Parallelism (ILP) in CUDA

I observe IPC drops as ILP goes up for 32-bit int operations when trying to speed up my cryptographic kernel. The kernel consists of fairly unrolled loops of long sequence of ADD and XOR operations, which should have a throughput of 160 ops per 192 cores per cycle on Kepler (GTX Titan/780).
IPC for my kernel hits the upper bound of 3.28. Using ILP even drops IPC. Apparently ILP fails to help achieve my goal -- fully utilize the pipeline, so I wrote some little experiments. I put the code for ILP 4 at the end.
Profiler Measurements
Results are measured on GTX Titan.
cubin outputs are examined to make sure no instructions are eliminated during optimization.
Executed IPC is almost the same as issued IPC, so I just list one of them.
ADD instructions (XORs have identical behavior)
| ILP 1 | ILP 2 | ILP 4 | ILP 8
--------------------------------------------------
IPC | 4.00 | 3.32 | 2.72 | 3.44
--------------------------------------------------
Issue Slot | 99.17% | 59.34% | 48.61% | 61.71%
Utilization | | | |
I expected ILP 2, 4 and 8 to give better performance, but they do not.
Recall the integer throughput is 160. The 4 warp scheduler per SM should dual issue up to 5 instructions per cycle, so that IPC should go up towards 5. How can I explain what I observed? Why is the issue slot 99% utilized when IPC = 4?
Float / Int ADD instruction mix
If I modify the code for ILP 4 to do two int ADDs and two float ADDs:
IPC: 5.1
Issue slot utilization: 99.12%
Strangely enough, it seems that the warp scheduler does a better job to issue floating operations.
Discussion
Available literature suggests that using ILP helps reach peak performance for floating point operations. Why doesn't ILP apply to integers? How can I do this for integer operations?
My kernel theoretically should do 2.25 integer operations per candidate. This is consistent with what I observed in cuobjdump. There are 2^48 candidates, so the minimum runtime on GTX Titan should be 2.25 * 2^48 / (2688 * 160/192) / 876 MHz = 322.75s. Is this estimation reasonable?
The measured performance for my kernel is 523s. This does imply that integer throughput is only about 160 * 3.28 (measured IPC) / 5 (max IPC).
ILP test code
// Global scratch array: supplies the kernel's initial operands and receives the
// (conditionally stored) result.
__device__ int x[10];
// ILP-4 microbenchmark: every loop iteration issues four add.u32 instructions on four
// separate accumulators (a, b, c, d), so the four adds of one iteration do not depend
// on each other.
// flag: with the default of 0, flag * v is 0 and the final store is never executed.
__global__ void test(int flag = 0)
{
// Accumulators
int a = x[0], b = x[1], c = x[2], d = x[3];
// Addends (read-only inside the loop)
int _a = x[4], _b = x[5], _c = x[6], _d = x[7];
#pragma unroll 128
for (int i = 0; i < 51200; ++i)
{
// "+r" marks each accumulator as read and written by the asm statement; volatile
// asm is presumably used so the compiler keeps every add -- confirm in the cubin
asm volatile("add.u32 %0, %0, %1;": "+r"(a): "r"(_a));
asm volatile("add.u32 %0, %0, %1;": "+r"(b): "r"(_b));
asm volatile("add.u32 %0, %0, %1;": "+r"(c): "r"(_c));
asm volatile("add.u32 %0, %0, %1;": "+r"(d): "r"(_d));
}
// Combine the accumulators and store the result behind a runtime condition, so that
// all four chains feed a possible global write
int v = a + b + c + d;
if (flag * v == 1)
x[0] = v;
}
Code fragment for 4 candidates
Each candidate takes 9 / 4 = 2.25 ops. Cuobjdump also verifies this.
// Nine operations in total: five XORs, one min and three compare-and-OR updates.
d ^= d2(1, 3); // d2 is located in constant memory
s ^= d;
t ^= d2(1, 16);
u ^= d2(1, 17);
v ^= some_const;
flag_s = min(flag_s, s); // int min has throughput of 160
flag_t = flag_t || (s == t); // setp.or should be the same
flag_u = flag_u || (s == u);
flag_v = flag_v || (s == v);
I'm providing an answer to remove this question from the unanswered list.
I do not observe a change in executed Instructions Per Cycle (IPC) with Instruction Level Parallelism. Overall, it is difficult to explain the effect observed by the OP without any further information beyond that provided by the OP himself (e.g., the launch configuration).
In the code below, I'm considering an example using floats, although I have tested the same code with ints without changing the conceptual results. The code implements cyclical Multiply Add (MAD) operations with ILP=1, ILP=2 and ILP=4.
The executed IPC has been the following
ILP IPC FLOPs
1 3.924 67108864
2 4.323 67108864
4 4.016 67108864
for N=8192. The code has been compiled with CUDA 8.0 and run on an NVIDIA GT920M. As can be seen, IPC stays almost constant across the considered values of ILP. The Floating Point Operations (FLOPs) as estimated by the code assuming 2 FLOPs per MAD coincide with those measured by the Visual Profiler.
THE CODE
#include<stdio.h>
#define N_ITERATIONS 8192
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define BLOCKSIZE 512
//#define DEBUG
/*****************************************************************/
/* KERNEL0 - ONE MULTIPLY-ADD CHAIN PER THREAD (ILP = 1)         */
/*****************************************************************/
// Each thread whose global index is below N loads one (a, b, c) triple, applies
// a = a * b + c N_ITERATIONS times and writes the result back to d_a.
__global__ void kernel0(float * __restrict__ d_a, const float * __restrict__ d_b, const float * __restrict__ d_c, const int N) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    // Threads past the end of the data have nothing to do
    if (gid >= N) return;

    const float mul = d_b[gid];
    const float add = d_c[gid];
    float acc = d_a[gid];
    for (unsigned int it = 0; it != N_ITERATIONS; ++it) {
        acc = acc * mul + add;
    }
    d_a[gid] = acc;
}
/***********************************************************/
/* KERNEL1 - TWO INDEPENDENT CHAINS PER THREAD (ILP = 2)   */
/***********************************************************/
// Each of the first N / 2 threads updates the elements at gid and gid + N / 2.
// When N is odd the element at index N - 1 is not touched.
__global__ void kernel1(float * __restrict__ d_a, const float * __restrict__ d_b, const float * __restrict__ d_c, const int N) {
    const int half = N / 2;
    const int lo = blockIdx.x * blockDim.x + threadIdx.x;
    if (lo >= half) return;
    const int hi = lo + half;

    // Two independent (a, b, c) triples: the two updates in the loop body do not
    // depend on each other.
    float accLo = d_a[lo];
    const float mulLo = d_b[lo];
    const float addLo = d_c[lo];
    float accHi = d_a[hi];
    const float mulHi = d_b[hi];
    const float addHi = d_c[hi];

    for (unsigned int it = 0; it != N_ITERATIONS; ++it) {
        accLo = accLo * mulLo + addLo;
        accHi = accHi * mulHi + addHi;
    }

    d_a[lo] = accLo;
    d_a[hi] = accHi;
}
/***********************************************************/
/* KERNEL2 - FOUR INDEPENDENT CHAINS PER THREAD (ILP = 4)  */
/***********************************************************/
// Each of the first N / 4 threads updates the elements at gid, gid + N / 4,
// gid + N / 2 and gid + 3 * N / 4. The four index ranges tile [0, N) only when
// N is a multiple of 4.
__global__ void kernel2(float * __restrict__ d_a, const float * __restrict__ d_b, const float * __restrict__ d_c, const int N) {
    const int i0 = blockIdx.x * blockDim.x + threadIdx.x;
    if (i0 >= N / 4) return;
    const int i1 = i0 + N / 4;
    const int i2 = i0 + N / 2;
    const int i3 = i0 + 3 * N / 4;

    // Four independent (a, b, c) triples
    float acc0 = d_a[i0];
    const float mul0 = d_b[i0];
    const float add0 = d_c[i0];
    float acc1 = d_a[i1];
    const float mul1 = d_b[i1];
    const float add1 = d_c[i1];
    float acc2 = d_a[i2];
    const float mul2 = d_b[i2];
    const float add2 = d_c[i2];
    float acc3 = d_a[i3];
    const float mul3 = d_b[i3];
    const float add3 = d_c[i3];

    for (unsigned int it = 0; it != N_ITERATIONS; ++it) {
        acc0 = acc0 * mul0 + add0;
        acc1 = acc1 * mul1 + add1;
        acc2 = acc2 * mul2 + add2;
        acc3 = acc3 * mul3 + add3;
    }

    d_a[i0] = acc0;
    d_a[i1] = acc1;
    d_a[i2] = acc2;
    d_a[i3] = acc3;
}
/***********************/
/* RESULT VERIFICATION */
/***********************/
// Returns true when every device result equals the host reference; otherwise prints the
// first mismatch and returns false. Exact comparison is intentional: with b = 1 and c = 2
// every intermediate value is a small integer that float represents exactly.
static bool resultsMatch(const float *h_reference, const float *h_device, const int n) {
    for (int i = 0; i < n; i++) {
        if (h_device[i] != h_reference[i]) {
            printf("Error at i=%i! Host = %f; Device = %f\n", i, h_reference[i], h_device[i]);
            return false;
        }
    }
    return true;
}
/********/
/* MAIN */
/********/
// Runs kernel0, kernel1 and kernel2 on the same input, prints the operation count and
// GFlops figure for each, and checks each result against a host-computed reference.
// Returns 0 when all three kernels match the reference, 1 on the first mismatch (later
// kernels are then skipped) or on host allocation failure.
int main() {
    //const int N = 8192 * 64;
    const int N = 8192;
    //const int N = 1024;
    TimingGPU timerGPU;

    float *h_a = (float*)malloc(N*sizeof(float));
    float *h_a_result_host = (float*)malloc(N*sizeof(float));
    float *h_a_result_device = (float*)malloc(N*sizeof(float));
    float *h_b = (float*)malloc(N*sizeof(float));
    float *h_c = (float*)malloc(N*sizeof(float));
    if (!h_a || !h_a_result_host || !h_a_result_device || !h_b || !h_c) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_a); free(h_a_result_host); free(h_a_result_device); free(h_b); free(h_c);
        return 1;
    }

    // --- Input data and host reference
    for (int i = 0; i<N; i++) {
        h_a[i] = 2.;
        h_b[i] = 1.;
        h_c[i] = 2.;
        h_a_result_host[i] = h_a[i];
        for (unsigned int k = 0; k < N_ITERATIONS; k++) {
            h_a_result_host[i] = h_a_result_host[i] * h_b[i] + h_c[i];
        }
    }

    float *d_a; gpuErrchk(cudaMalloc((void**)&d_a, N*sizeof(float)));
    float *d_b; gpuErrchk(cudaMalloc((void**)&d_b, N*sizeof(float)));
    float *d_c; gpuErrchk(cudaMalloc((void**)&d_c, N*sizeof(float)));
    gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, h_b, N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_c, h_c, N*sizeof(float), cudaMemcpyHostToDevice));

    int status = 0;

    /***********/
    /* KERNEL0 */
    /***********/
    timerGPU.StartCounter();
    kernel0<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_a, d_b, d_c, N);
    // Launch-configuration errors are reported in every build, not only under DEBUG
    gpuErrchk(cudaPeekAtLastError());
#ifdef DEBUG
    gpuErrchk(cudaDeviceSynchronize());
#endif
    // --- Remember: timing is in ms
    printf("Number of operations = %f; GFlops = %f\n", (float)N*(float)N_ITERATIONS, (1.e-6)*((float)N*(float)N_ITERATIONS) / timerGPU.GetCounter());
    gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(float), cudaMemcpyDeviceToHost));
    if (!resultsMatch(h_a_result_host, h_a_result_device, N)) status = 1;

    /***********/
    /* KERNEL1 */
    /***********/
    if (status == 0) {
        // d_a was overwritten by the previous kernel: restore the input
        gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice));
        timerGPU.StartCounter();
        kernel1<<<iDivUp(N / 2, BLOCKSIZE), BLOCKSIZE>>>(d_a, d_b, d_c, N);
        gpuErrchk(cudaPeekAtLastError());
#ifdef DEBUG
        gpuErrchk(cudaDeviceSynchronize());
#endif
        // --- Remember: timing is in ms
        printf("Number of operations = %f; GFlops = %f\n", (float)N*(float)N_ITERATIONS, (1.e-6)*((float)N*(float)N_ITERATIONS) / timerGPU.GetCounter());
        gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(float), cudaMemcpyDeviceToHost));
        if (!resultsMatch(h_a_result_host, h_a_result_device, N)) status = 1;
    }

    /***********/
    /* KERNEL2 */
    /***********/
    if (status == 0) {
        gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice));
        timerGPU.StartCounter();
        kernel2<<<iDivUp(N / 4, BLOCKSIZE), BLOCKSIZE>>>(d_a, d_b, d_c, N);
        gpuErrchk(cudaPeekAtLastError());
#ifdef DEBUG
        gpuErrchk(cudaDeviceSynchronize());
#endif
        // --- Remember: timing is in ms
        printf("Number of operations = %f; GFlops = %f\n", (float)N*(float)N_ITERATIONS, (1.e-6)*((float)N*(float)N_ITERATIONS) / timerGPU.GetCounter());
        gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(float), cudaMemcpyDeviceToHost));
        if (!resultsMatch(h_a_result_host, h_a_result_device, N)) status = 1;
    }

    // --- Release device and host buffers on every exit path
    gpuErrchk(cudaFree(d_a));
    gpuErrchk(cudaFree(d_b));
    gpuErrchk(cudaFree(d_c));
    free(h_a);
    free(h_a_result_host);
    free(h_a_result_device);
    free(h_b);
    free(h_c);
    cudaDeviceReset();
    return status;
}

CUDA C/C++: Calculate the average of inverse of distance per point (interaction energy, perhaps?)

I've been trying to write a kernel that calculates the sum of the inverse of the distance between N given points, divided by N. A serial code in C would look like
average = 0;
for(int i = 0; i < Np; i++){
for(int j = i + 1; j < Np; j++){
average += 1.0e0f/sqrtf((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
}
}
average = average/(float)N;
Where rx and ry are the x and y coordinates, respectively.
I generate the points via a kernel that uses a random number generator. For the kernel, I used 128(256) threads per block for 4k(8k) points. In it, every thread performs the inner loop above, then the results are passed to a reduce-sum function, as follows
Generate points:
// Initialises one curand state per launched thread: all threads share `seed`, each uses
// its global index as the sequence number, and the offset is 0.
// There is no bounds check, so `state` must hold at least gridDim.x * blockDim.x entries.
__global__ void InitRNG ( curandState * state, const int seed ){
    const int gid = threadIdx.x + blockDim.x*blockIdx.x;
    curandState *mine = state + gid;
    curand_init (seed, gid, 0, mine);
}
// Generates one random 2D point per thread from that thread's curand state.
// The point is built in polar form: radius sqrt(0.25 * N * u1) and angle 2 * pi * u2,
// where u1 and u2 are two consecutive curand_uniform draws.
// X, Y : output coordinate arrays (at least N entries each)
// N    : number of points; threads with a global index >= N do nothing
// state: per-thread generator states, indexed by global thread index
__global__
void SortPoints(float* X, float* Y,const int N, curandState *state){
    const unsigned int gid = threadIdx.x + blockDim.x*blockIdx.x;
    if(gid >= N) return;

    // Draw order matters: the first draw feeds the radius, the second the angle
    const float u1 = curand_uniform(&state[gid]);
    const float u2 = curand_uniform(&state[gid]);
    const float radius = sqrtf(0.25e0f*N*u1);
    const float angle = 2.0e0f*pi*u2;
    X[gid] = radius*cosf(angle);
    Y[gid] = radius*sinf(angle);
}
Reduction:
// Block-wide sum: returns the sum of `In` over all threads of the calling block.
// Contains __syncthreads(), so EVERY thread of the block must call it (never from inside
// a branch that only part of the block takes). Requires blockDim.x <= BlockSize.
// Works for any blockDim.x, not only powers of two; for power-of-two sizes the
// additions are performed in exactly the same order as a plain halving reduction.
__device__
float ReduceSum2(float In){
    __shared__ float data[BlockSize];
    const unsigned int tIdx = threadIdx.x;
    data[tIdx] = In;
    __syncthreads();
    // `active` is the number of live partial sums in data[0 .. active-1]
    for(unsigned int active = blockDim.x; active > 1; ){
        // Round up so that, for an odd count, the middle element stays in place
        // instead of being dropped
        const unsigned int half = (active + 1)/2;
        if(tIdx < active/2){
            data[tIdx] += data[tIdx + half];
        }
        __syncthreads();
        active = half;
    }
    return data[0];
}
Kernel:
// For each point handled by this block, sums 1/distance to every point with a larger
// index, then reduces the per-thread sums and stores one partial sum per block.
// X, Y : point coordinates (N entries each)
// Avg  : output, one entry per block (at least gridDim.x entries)
// N    : number of points
// Launch with a 1D grid and blockDim.x <= BlockSize.
__global__
void AvgDistance(float *X, float *Y, float *Avg, const int N){
    const int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    float d = 0.0f;
    if(tIdx < N){
        const float xi = X[tIdx];
        const float yi = Y[tIdx];
        for(int i = tIdx + 1; i < N ; i++){
            const float x = xi - X[i];
            const float y = yi - Y[i];
            d += 1.0e0f/(sqrtf(x*x + y*y));
        }
    }
    // The reduction contains block-wide barriers, so it is called outside the bounds
    // check: out-of-range threads take part with a contribution of 0.
    const float blockSum = ReduceSum2(d);
    // A single writer per block is enough
    if(threadIdx.x == 0){
        Avg[blockIdx.x] = blockSum;
    }
}
The kernel is configured and launched as follows:
// Only threads.x and blocks.x are used by the 1D launches below.
dim3 threads(BlockSize,BlockSize);
// NOTE(review): if Np is an integer, Np/threads.x is truncated before ceil() sees it, so
// a partial last block would not be launched -- confirm Np's type and that it is a
// multiple of BlockSize.
dim3 blocks(ceil(Np/threads.x),ceil(Np/threads.y));
InitRNG<<<blocks.x,threads.x>>>(d_state,seed);
SortPoints<<<blocks.x,threads.x>>>(d_rx,d_ry,Np,d_state);
// The third launch argument requests dynamic shared memory; ReduceSum2 declares its
// buffer as a static __shared__ array.
AvgDistance<<<blocks.x,threads.x,threads.x*sizeof(float)>>>(d_rx,d_ry,d_Avg,Np);
Finally, I copy the data back to host and then perform the remaining sum:
// Copy the per-block partial sums back to the host, add them up and divide by the
// number of points.
Avg = new float[blocks.x];
CHECK(cudaMemcpy(Avg,d_Avg,sizeof(float)*blocks.x,cudaMemcpyDeviceToHost),ERROR_CPY_DEVTOH);
float average = 0;
for(unsigned int b = 0; b != blocks.x; ++b){
    average = average + Avg[b];
}
average /= (float)Np;
For 4k points, ok! the results are:
Average distance between points (via Kernel) = 108.615
Average distance between points (via CPU) = 110.191
In this case the sum may be performed in different order, causing both results to diverge from each other, I don't know...
But when it comes to 8k, the results are quite different:
Average distance between points (via Kernel) = 153.63
Average distance between points (via CPU) = 131.471
To me it seems that both the kernel and the serial code are written the same way, which leads me to distrust the precision of CUDA's floating-point calculations. Does this make sense? Or is the access to global memory causing some conflicts when some threads load the same data from X and Y at the same time? Or is the way I wrote the kernel in some way 'wrong' (I mean, am I doing something that is causing both results to diverge from each other)?
Actually, from what I can tell, the problem seems to be on the CPU side. I created a sample code based on your code.
I was able to reproduce your results.
First I switched all instances of sinf, cosf, and sqrtf to their corresponding double versions. This made no difference in the results.
Next I included a typedef so I could easily switch the precision from float to double and back, replacing every relevant instance of float in the code with mytype which is my typedef.
When I run the code with typedef of float and a data size of 4096 I get these results:
GPU average = 108.294922
CPU average = 109.925285
When I run the code with typedef of double and a data size of 4096 I get these results:
GPU average = 108.294903
CPU average = 108.294903
When I run the code with typedef of float and a data size of 8192 I get these results:
GPU average = 153.447327
CPU average = 131.473526
When I run the code with typedef of double and a data size of 8192 I get these results:
GPU average = 153.447380
CPU average = 153.447380
There are at least 2 observations:
The GPU results don't vary between float and double, except in the 5th decimal place
The CPU results vary by 1-20% or so between float and double, but when double is selected, they line up exactly (to the 6th decimal place, anyway) with the GPU results.
Based on this, I believe the CPU is providing the variable, questionable behavior.
Here's my code for reference:
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#define DSIZE 8192
#define BlockSize 32
#define pi 3.14159f
// Checks the CUDA runtime's last recorded error via cudaGetLastError(). If an error is
// pending, prints `msg`, the CUDA error string and the file/line of the macro use to
// stderr, then terminates the program with exit(1).
// Wrapped in do { } while (0) so it behaves as a single statement.
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
typedef double mytype;
// Sets up one curand state per launched thread. Every thread uses the same seed, its own
// global index as the sequence number, and offset 0.
// No bounds check is performed: `state` needs gridDim.x * blockDim.x entries.
__global__ void InitRNG ( curandState * state, const int seed ){
    const int gid = threadIdx.x + blockDim.x*blockIdx.x;
    curandState *slot = &state[gid];
    curand_init (seed, gid, 0, slot);
}
// Generates one random 2D point per thread using that thread's curand state.
// Two consecutive curand_uniform draws (u1, u2) give the polar form of the point:
// radius sqrt(0.25 * N * u1), angle 2 * pi * u2.
// X, Y : output coordinates (N entries each)
// N    : number of points; threads whose global index is >= N return immediately
// state: per-thread generator states, indexed by global thread index
__global__
void SortPoints(mytype* X, mytype* Y,const int N, curandState *state){
    const unsigned int gid = threadIdx.x + blockDim.x*blockIdx.x;
    if(gid >= N) return;

    // First draw -> radius, second draw -> angle (the order is significant)
    const mytype u1 = curand_uniform(&state[gid]);
    const mytype u2 = curand_uniform(&state[gid]);
    const mytype radius = sqrt(0.25e0f*N*u1);
    const mytype angle = 2.0e0f*pi*u2;
    X[gid] = radius*cos(angle);
    Y[gid] = radius*sin(angle);
}
// Block-wide sum: returns the sum of `In` over all threads of the calling block.
// Contains __syncthreads(), so EVERY thread of the block must call it (never from inside
// a branch that only part of the block takes). Requires blockDim.x <= BlockSize.
// Works for any blockDim.x, not only powers of two; for power-of-two sizes the
// additions are performed in exactly the same order as a plain halving reduction.
__device__
mytype ReduceSum2(mytype In){
    __shared__ mytype data[BlockSize];
    const unsigned int tIdx = threadIdx.x;
    data[tIdx] = In;
    __syncthreads();
    // `active` is the number of live partial sums in data[0 .. active-1]
    for(unsigned int active = blockDim.x; active > 1; ){
        // Round up so that, for an odd count, the middle element stays in place
        // instead of being dropped
        const unsigned int half = (active + 1)/2;
        if(tIdx < active/2){
            data[tIdx] += data[tIdx + half];
        }
        __syncthreads();
        active = half;
    }
    return data[0];
}
// For each point handled by this block, sums 1/distance to every point with a larger
// index, then reduces the per-thread sums and stores one partial sum per block.
// X, Y : point coordinates (N entries each)
// Avg  : output, one entry per block (at least gridDim.x entries)
// N    : number of points
// Launch with a 1D grid and blockDim.x <= BlockSize.
__global__
void AvgDistance(mytype *X, mytype *Y, mytype *Avg, const int N){
    const int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    mytype d = 0.0f;
    if(tIdx < N){
        const mytype xi = X[tIdx];
        const mytype yi = Y[tIdx];
        for(int i = tIdx + 1; i < N ; i++){
            const mytype x = xi - X[i];
            const mytype y = yi - Y[i];
            d += 1.0e0f/(sqrt(x*x + y*y));
        }
    }
    // The reduction contains block-wide barriers, so it is called outside the bounds
    // check: out-of-range threads take part with a contribution of 0.
    const mytype blockSum = ReduceSum2(d);
    // A single writer per block is enough
    if(threadIdx.x == 0){
        Avg[blockIdx.x] = blockSum;
    }
}
// Host reference: adds 1/distance over every unordered pair of points into a single
// running accumulator, then divides the total by `size`.
// rx, ry: point coordinates (`size` entries each)
mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
    mytype total = 0.0f;
    int i = 0;
    while(i < size){
        // Pairs (i, j) with j > i only, so each pair is counted once
        int j = i + 1;
        while(j < size){
            total += 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
            j++;
        }
        i++;
    }
    total = total/(mytype)size;
    return total;
}
// Generates DSIZE random points on the GPU, computes the average inverse pair distance
// on the GPU (per-block partial sums finished on the host) and on the CPU, and prints
// both values. Returns 0 on success, 1 if a host allocation fails; CUDA errors terminate
// the program through cudaCheckErrors.
int main() {
    int Np = DSIZE;
    mytype *rx, *ry, *d_rx, *d_ry, *d_Avg, *Avg;
    curandState *d_state;
    int seed = 1;
    // Only the x extents are used by the 1D launches below
    dim3 threads(BlockSize,BlockSize);
    dim3 blocks((int)ceilf(Np/(float)threads.x),(int)ceilf(Np/(float)threads.y));
    printf("number of blocks = %d\n", blocks.x);
    printf("number of threads= %d\n", threads.x);
    rx = (mytype *)malloc(DSIZE*sizeof(mytype));
    if (rx == 0) {printf("malloc fail\n"); return 1;}
    ry = (mytype *)malloc(DSIZE*sizeof(mytype));
    if (ry == 0) {printf("malloc fail\n"); free(rx); return 1;}
    cudaMalloc((void**)&d_rx, DSIZE * sizeof(mytype));
    cudaMalloc((void**)&d_ry, DSIZE * sizeof(mytype));
    cudaMalloc((void**)&d_Avg, blocks.x * sizeof(mytype));
    // InitRNG writes one state per launched thread with no bounds check, so size the
    // state array by the launch size (equal to DSIZE when DSIZE is a multiple of BlockSize)
    cudaMalloc((void**)&d_state, (size_t)blocks.x * threads.x * sizeof(curandState));
    cudaCheckErrors("cudamalloc");
    InitRNG<<<blocks.x,threads.x>>>(d_state,seed);
    SortPoints<<<blocks.x,threads.x>>>(d_rx,d_ry,Np,d_state);
    AvgDistance<<<blocks.x,threads.x,threads.x*sizeof(mytype)>>>(d_rx,d_ry,d_Avg,Np);
    cudaCheckErrors("kernels");
    Avg = new mytype[blocks.x];
    // Blocking copies: they also wait for the kernels above to finish
    cudaMemcpy(Avg,d_Avg,blocks.x*sizeof(mytype),cudaMemcpyDeviceToHost);
    cudaMemcpy(rx, d_rx, DSIZE*sizeof(mytype),cudaMemcpyDeviceToHost);
    cudaMemcpy(ry, d_ry, DSIZE*sizeof(mytype),cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudamemcpy");
    // Finish the reduction on the host: add the per-block partial sums
    mytype average = 0;
    for(int i = 0; i < blocks.x; i++){
        average += Avg[i];
    }
    average = average/(mytype)Np;
    printf("GPU average = %f\n", average);
    average = cpu_avg(rx, ry, DSIZE);
    printf("CPU average = %f\n", average);
    // Release host and device resources
    delete[] Avg;
    free(rx);
    free(ry);
    cudaFree(d_rx);
    cudaFree(d_ry);
    cudaFree(d_Avg);
    cudaFree(d_state);
    cudaCheckErrors("cudafree");
    return 0;
}
I am running on RHEL 5.5, CUDA 5.0, Intel Xeon X5560
compiled with:
nvcc -O3 -arch=sm_20 -lcurand -lm -o t93 t93.cu
EDIT:
After observing that the variability was on the CPU side, I found that I could eliminate most of the CPU variability by modifying your CPU averaging code like this:
// Host reference with per-row accumulation: for each point i the 1/distance terms to all
// later points are summed separately, that row sum is divided by `size`, and only then
// added to the running average.
// rx, ry: point coordinates (`size` entries each)
mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
    mytype average = 0.0f;
    for(int i = 0; i < size; i++){
        // Fresh accumulator for every row
        mytype rowSum = 0.0f;
        for(int j = i + 1; j < size; j++){
            rowSum += 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
        }
        average += rowSum/(mytype)size;
    }
    return average;
}
So I would say there's a problem with intermediate results on the CPU side. It's interesting that it doesn't show up on the GPU result. I suspect the reason for this is that the final summation of GPU averages is done on the CPU (therefore each individual GPU block result is scaled down by the size, e.g. 8192), and these may have an intermediate precision that is sufficient to survive until the final division. If you inlined the CPU average calculation, you may observe something different again.