Matrix-vector multiplication in CUDA: benchmarking & performance

I'm updating my question with some new benchmarking results (I also reformulated the question to be more specific and I updated the code)...
I implemented a kernel for matrix-vector multiplication in CUDA C following the CUDA C Programming Guide, using shared memory. Let me first present some benchmarking results, which I obtained on a Jetson TK1 (GPU: Tegra K1, compute capability 3.2), together with a comparison with cuBLAS:
Here I guess cuBLAS does some magic, since its execution time seems unaffected by the number of columns of A, which in turn implies that there is some sort of parallelisation along the columns of A.
Now, here is the source code of my kernel and a host function to call it (file: mv.cuh):
#include <cuda_runtime.h>
#define BLOCK_SIZE 16
/* Define as __restrict__ to enable restricted pointers */
#define RESTRICT
/**
* Performs matrix-vector multiplication on the device.
*
* @param  dA     Address of matrix `A` on the device
* @param  dx     Address of vector `x` on the device
* @param  dy     Address of result y = A*x
* @param  nRows  Number of rows of `A`
* @param  nx     Size of `x` (number of columns of `A`)
*
* @tparam T      Data type
*
*/
template<typename T>
__global__ void matvec_kernel(
const T * RESTRICT dA,
const T * RESTRICT dx,
T * RESTRICT dy,
const unsigned int nRows,
const unsigned int nx);
/**
* Host-side wrapper for #matvec_kernel.
*
* @param  dA     Address of matrix `A` on the device
* @param  dx     Address of vector `x` on the device
* @param  dy     Address of result y = A*x
* @param  nRows  Number of rows of `A`
* @param  nx     Size of `x` (number of columns of `A`)
*
* @tparam T      Data type for `A` and `x`
*/
template<typename T>
__host__ void matvec(
const T * RESTRICT dA,
const T * RESTRICT dx,
T * RESTRICT dy,
const unsigned int nRows,
const unsigned int nx);
/* -------------------------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------------------------- */
template<typename T>
__global__ void matvec_kernel(const T * RESTRICT dA, const T * RESTRICT dx,
T * RESTRICT dy,
const unsigned int nRows, const unsigned int nx)
{
unsigned int bid = blockIdx.x;
unsigned int row = threadIdx.x;
const unsigned int block_size = blockDim.x;
const unsigned int num_hor_blocks = ((nx + block_size - 1)/ block_size);
unsigned int n_star;
unsigned int idx_x;
unsigned int idx_Asub;
unsigned int idx_y;
const T * Asub;
const T * xsub;
/* Only `x` is copied to shared memory */
__shared__ T x_shared[BLOCK_SIZE];
idx_y = bid * block_size;
T * y_sub = dy + idx_y;
T y_val = 0.0;
#pragma unroll
for (unsigned int m = 0; m < num_hor_blocks; ++m)
{
idx_Asub = block_size * (bid + m * nRows);
idx_x = m * block_size;
Asub = dA + idx_Asub;
xsub = dx + idx_x;
if (idx_x + row < nx) {
x_shared[row] = xsub[row];
}
__syncthreads();
/* If the tiling is exact */
if ( (nRows % block_size == 0 && nx % block_size == 0 ) ||
(m != block_size - 1 || bid != gridDim.x - 1)) {
y_val += Asub[row] * x_shared[0];
y_val += Asub[row + nRows] * x_shared[1];
y_val += Asub[row + 2 * nRows] * x_shared[2];
y_val += Asub[row + 3 * nRows] * x_shared[3];
y_val += Asub[row + 4 * nRows] * x_shared[4];
y_val += Asub[row + 5 * nRows] * x_shared[5];
y_val += Asub[row + 6 * nRows] * x_shared[6];
y_val += Asub[row + 7 * nRows] * x_shared[7];
y_val += Asub[row + 8 * nRows] * x_shared[8];
y_val += Asub[row + 9 * nRows] * x_shared[9];
y_val += Asub[row + 10 * nRows] * x_shared[10];
y_val += Asub[row + 11 * nRows] * x_shared[11];
y_val += Asub[row + 12 * nRows] * x_shared[12];
y_val += Asub[row + 13 * nRows] * x_shared[13];
y_val += Asub[row + 14 * nRows] * x_shared[14];
y_val += Asub[row + 15 * nRows] * x_shared[15];
} else {
n_star = min(BLOCK_SIZE, nx - idx_x);
#pragma unroll
for (unsigned int e = 0; e < n_star; ++e) {
y_val += Asub[row + e * nRows] * x_shared[e];
}
}
__syncthreads();
}
if (row + idx_y < nRows)
y_sub[row] = y_val;
}
template<typename T>
__host__ void matvec(
const T * RESTRICT dA,
const T * RESTRICT dx,
T * RESTRICT dy,
const unsigned int nRows,
const unsigned int nx)
{
dim3 dim_grid( (nRows + BLOCK_SIZE -1)/ BLOCK_SIZE );
dim3 dim_block(BLOCK_SIZE);
matvec_kernel<T> <<<dim_grid, dim_block>>>(dA, dx, dy, nRows, nx);
}
I'm using this to time my execution (file: cuda_timer.cuh):
#include <cuda_runtime.h>
#include "error_handles.cuh"
static cudaEvent_t start;
static cudaEvent_t stop;
static short timer_running = 0;
static short tic_called = 0;
/**
* Sets up the timer.
*
* Must be called before any invocation to
* tic() or toc(), preferably at the beginning of your
* application.
*/
void start_tictoc();
/**
* Starts the timer.
*
* Use `toc()` to get the elapsed time; `tic()` must
* be called before a `toc()`.
*/
void tic();
/**
* Returns the elapsed time between its invocation
* and a previous invocation of `tic()`. Returns `-1`
* and prints a warning message if `tic()` was not
* previously called. Returns `-2` and prints an error
* message if `start_tictoc()` has not been called.
*
* @return Elapsed time between `tic()` and `toc()` in milliseconds
* with a resolution of `0.5` microseconds.
*/
float toc();
/**
* This function should be called when the
* timer is no longer needed. It destroys
* the events used to time CUDA kernels. If the timer
* is not running, this function does nothing and
* prints a warning message.
*/
void stop_tictoc();
void start_tictoc() {
_CUDA(cudaEventCreate(&start));
_CUDA(cudaEventCreate(&stop));
timer_running = 1;
}
void tic() {
if (timer_running) {
_CUDA(cudaEventRecord(start, 0));
tic_called = 1;
} else {
printf("WARNING: tic() called without a timer running!\n");
}
}
float toc() {
float elapsed_time;
if (tic_called == 0) {
printf("WARNING: toc() called without a previous tic()!\n");
return -1;
}
if (timer_running == 1) {
// _CUDA(cudaDeviceSynchronize()); // Removed! (See discussion below)
_CUDA(cudaEventRecord(stop, 0));
_CUDA(cudaEventSynchronize(stop));
_CUDA(cudaEventElapsedTime(&elapsed_time, start, stop));
tic_called = 0;
return elapsed_time;
} else {
printf("WARNING: toc() called without a timer running!\n");
return -2;
}
}
void stop_tictoc()
{
if (timer_running == 1){
_CUDA(cudaEventDestroy(start));
_CUDA(cudaEventDestroy(stop));
timer_running = 0;
} else{
printf("WARNING: stop_tictoc() called without a timer running!\n");
}
}
and my main file (main.cu) is the following:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <assert.h>
#include "cublas_v2.h"
#include <math.h>
#include <curand.h>
#include <stdbool.h>
#include "mv.cuh"
#include "cuda_timer.cuh"
#include "error_handles.cuh"
typedef float real_t;
#define _CUDA(x) do { if((x)!=cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(EXIT_FAILURE);}} while(0)
#define _CUBLAS(x) do { if((x) != CUBLAS_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(EXIT_FAILURE);}} while(0)
#define _CURAND(x) do { if((x) != CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(EXIT_FAILURE);}} while(0)
#define TEST_COLUMNS 1
#define TEST_ROWS 0
/**
* If `TEST_WRT_` is set to `TEST_COLUMNS`, then a benchmark
* will be performed with respect to columns (with a fixed
* number of rows). If it is set to `TEST_ROWS`, then a benchmark will
* run with respect to rows (fixed number of columns).
*/
#define TEST_WRT_ TEST_ROWS
#define CONSTANT_COLS 300
#define CONSTANT_ROWS 256
/**
* In order to estimate the execution time, every
* kernel is run `RUNS` times and the average is taken.
*/
#define RUNS 50
void compare_results(real_t *dev_y_cublas, real_t * dev_y,unsigned int nrows)
{
real_t * hst_y_cublas;
real_t * hst_y;
const size_t s = nrows * sizeof(real_t);
hst_y_cublas = (real_t*) malloc(s);
hst_y = (real_t*) malloc(s);
_CUDA(cudaMemcpy(hst_y, dev_y, s, cudaMemcpyDeviceToHost));
_CUDA(cudaMemcpy(hst_y_cublas, dev_y_cublas, s, cudaMemcpyDeviceToHost));
for (unsigned int i = 0; i < nrows; ++i) {
if (fabsf(hst_y_cublas[i] - hst_y[i]) > 0.001) {
printf("ERROR ------ %f\n", fabsf(hst_y_cublas[i] - hst_y[i]));
exit(EXIT_FAILURE);
}
}
if (hst_y_cublas) free(hst_y_cublas);
if (hst_y) free(hst_y);
}
void do_benchmark() {
curandGenerator_t gen;
real_t *dev_rand_data = NULL; // Random data will be allocated here!
real_t *dev_y = NULL;
real_t *dev_y_cublas = NULL;
real_t t;
real_t t_cublas;
const size_t n_rows_max = 1500;
const size_t n_cols_max = 300;
const size_t ntot = n_cols_max * (1 + n_rows_max);
const size_t size_tot = sizeof(real_t) * ntot;
float alpha = 1.0, beta = 0.0; // beta was initially set to 1.0 by mistake
cublasHandle_t handle;
_CUBLAS(cublasCreate(&handle));
start_tictoc();
_CUDA(cudaMalloc((void** )&dev_rand_data, size_tot));
_CUDA(cudaMalloc((void** )&dev_y, n_rows_max * sizeof(real_t)));
_CUDA(cudaMalloc((void** )&dev_y_cublas, n_rows_max * sizeof(real_t)));
_CURAND(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
_CURAND(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
tic();
_CURAND(curandGenerateUniform(gen, dev_rand_data, ntot));
t = toc();
printf("RNG in %f ms\n", t);
_CURAND(curandDestroyGenerator(gen));
size_t ncols = CONSTANT_COLS;
size_t nrows = CONSTANT_ROWS;
size_t runs = RUNS;
cudaMemset(dev_y_cublas, 0, n_rows_max * sizeof(real_t));
matvec<real_t>(dev_rand_data + ncols, dev_rand_data, dev_y, nrows, ncols);
_CUBLAS(cublasSgemv(handle, CUBLAS_OP_N, nrows, ncols, &alpha, dev_rand_data + ncols,
nrows, dev_rand_data, 1, &beta, dev_y_cublas, 1));
/* Compare results */
compare_results(dev_y_cublas,dev_y, nrows);
FILE * pFile;
char filename[50];
#if (TEST_WRT_ == TEST_COLUMNS)
sprintf(filename, "times_rows%lu_cols.txt", nrows);
#else
sprintf(filename, "times_cols%lu_rows.txt", ncols);
#endif
printf("Logging to : '%s'\n", filename);
pFile = fopen(filename, "w");
if (pFile == NULL) {
perror("Error opening file.");
exit(79);
}
#if (TEST_WRT_ == TEST_COLUMNS)
fprintf(pFile, "0, %lu, 0, 0\n", nrows);
for (ncols = 32; ncols < n_cols_max; ncols += 32) {
#else
fprintf(pFile, "1, %lu, 0, 0\n", ncols);
for (nrows = 32; nrows < n_rows_max; nrows += 32) {
#endif
tic();
for (short i = 0; i < runs; i++) {
matvec<real_t>(dev_rand_data + ncols, dev_rand_data, dev_y, nrows,
ncols);
}
t = toc() / runs;
tic();
for (short i = 0; i < runs; i++) {
_CUBLAS(cublasSgemv(handle, CUBLAS_OP_N, nrows, ncols, &alpha, dev_rand_data + ncols,
nrows, dev_rand_data, 1, &beta, dev_y_cublas, 1));
}
t_cublas = toc() / runs;
#if (TEST_WRT_ == TEST_COLUMNS)
fprintf(pFile, "%lu, %f, %f\n", ncols, t, t_cublas);
#else
fprintf(pFile, "%lu, %f, %f\n", nrows, t, t_cublas);
#endif
}
_CUBLAS(cublasDestroy(handle));
fclose(pFile);
if (dev_rand_data != NULL)
_CUDA(cudaFree(dev_rand_data));
stop_tictoc();
}
int main(void)
{
do_benchmark();
return EXIT_SUCCESS;
}
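For reference, here is a compile line that should work for this benchmark on the Jetson (a sketch, assuming all the .cuh files sit next to main.cu and targeting the Tegra K1's compute capability 3.2):
nvcc -O3 -arch=sm_32 main.cu -o mv -lcublas -lcurand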
Finally, this is a MATLAB script I'm using to plot the execution times:
fetch_this = 'times_cols512_rows.txt';
username = 'ubuntu';
target_hostname = 'jetson';
% Do not modify below this line
eval_this=['! scp ' username '@' target_hostname ':~/mv/Debug/' fetch_this ' .'];
eval(eval_this)
set(0, 'DefaultAxesFontSize', 14);
r = csvread(fetch_this);
r_header = r(1,:);
plot(r(2:end,1), r(2:end,2)*1000, '-');
hold on
plot(r(2:end,1), r(2:end,3)*1000, '-r');
grid on;
fig_title = 'Matvec on Tegra K1 - %d %s';
if (r_header(1)==1),
xlabel('Number of rows');
title(sprintf(fig_title, r_header(2),'columns'));
else
xlabel('Number of columns');
title(sprintf(fig_title, r_header(2),'rows'));
end
ylabel('Computation time [us]');
legend('Kernel', 'cuBLAS');
axis tight
I am concerned about the performance and scalability of my kernel, so first I would like to know how to improve the scalability with respect to the number of rows of matrix A. Second, I know that branch divergence is not good practice (and my code has it), and I would like some hints on how to reduce it.
UPDATE :
Thanks to all your comments and suggestions, I reached the conclusion that cudaDeviceSynchronize() caused, in the first place, some peculiarities with my timing, so my initial measurements were inaccurate. Row-major ordering leads to worse results. The block size is an important tuning parameter: changing it from 16 to 32 or 64 improves the execution time. Further benchmarking is necessary to choose the block size. To this end, one may use the following API for the kernel:
template<typename T, const uint_t blk>
__global__ void matvec_kernel(const T * RESTRICT dA, const T * RESTRICT dx,
T * RESTRICT dy, const uint_t nRows, const uint_t nx);
and call it like this from the host:
template<typename T>
__host__ void matvec(const T * RESTRICT dA, const T * RESTRICT dx,
T * RESTRICT dy, const uint_t nRows, const uint_t nx) {
uint_t blk_size_opt = 64;
/* Add code to decide the value of `blk_size_opt` */
if (blk_size_opt == 32) {
matvec_engine<T, 32>(dA, dx, dy, nRows, nx);
} else if (blk_size_opt == 64) {
matvec_engine<T, 64>(dA, dx, dy, nRows, nx);
} else if (blk_size_opt == 128) {
matvec_engine<T, 128>(dA, dx, dy, nRows, nx);
} else if (blk_size_opt == 256) {
matvec_engine<T, 256>(dA, dx, dy, nRows, nx);
}
}
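Here matvec_engine is a thin launcher that fixes the block size at compile time. Its body is not shown above, so the following is just a minimal sketch of what it could look like, mirroring the grid computation of the original matvec wrapper:
template<typename T, const uint_t blk>
__host__ void matvec_engine(const T * RESTRICT dA, const T * RESTRICT dx,
        T * RESTRICT dy, const uint_t nRows, const uint_t nx) {
    dim3 dim_grid((nRows + blk - 1) / blk);
    dim3 dim_block(blk);
    matvec_kernel<T, blk> <<<dim_grid, dim_block>>>(dA, dx, dy, nRows, nx);
}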
Let me provide some benchmarking results. First a comparison with cublasSgemv:
and the effect of block size on the execution time:

First, let me write down the full working Matrix-Vector multiplication kernel employing shared memory:
template<typename T>
__global__ void matvec_kernel(const T * __restrict__ dA, const T * __restrict__ dx, T * __restrict__ dy, const unsigned int nRows, const unsigned int nCols)
{
const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
__shared__ T x_shared[BLOCK_SIZE];
T y_val = 0.0;
#pragma unroll
for (unsigned int m = 0; m < ((nCols + BLOCK_SIZE - 1)/ BLOCK_SIZE); ++m)
{
if ((m * BLOCK_SIZE + threadIdx.x) < nCols) x_shared[threadIdx.x] = dx[threadIdx.x + m * BLOCK_SIZE];
else x_shared[threadIdx.x] = 0.f;
__syncthreads();
#pragma unroll
for (unsigned int e = 0; e < BLOCK_SIZE; ++e) {
// --- Column-major ordering - faster
y_val += dA[tid + (e + BLOCK_SIZE * m) * nRows] * x_shared[e];
// --- Row-major ordering - slower
//y_val += dA[tid * nCols + (e + BLOCK_SIZE * m)] * x_shared[e];
}
__syncthreads();
}
if (tid < nRows) dy[tid] = y_val;
}
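A possible launch configuration for this kernel (a sketch, mirroring the grid computation of the host wrapper in the question) is:
dim3 dim_grid((nRows + BLOCK_SIZE - 1) / BLOCK_SIZE);
dim3 dim_block(BLOCK_SIZE);
matvec_kernel<float> <<<dim_grid, dim_block>>>(dA, dx, dy, nRows, nCols);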
Unless otherwise specified, all tests were run on a GT540M card.
A first parameter to be optimized is the BLOCK_SIZE. Changing the BLOCK_SIZE changes the algorithm performance, as witnessed by the following graph:
The following graph compares row-major ordering vs. column-major ordering. The latter is faster:
Another optimization you may wish to try is exploiting more Instruction Level Parallelism (ILP), as in this modified kernel employing ILP = 2:
template<typename T>
__global__ void matvec_kernel_ILP2(const T * __restrict__ dA, const T * __restrict__ dx, T * __restrict__ dy, const unsigned int nRows, const unsigned int nCols)
{
const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
__shared__ T x_shared[BLOCK_SIZE];
T y_val1 = 0.0;
T y_val2 = 0.0;
#pragma unroll
for (unsigned int m = 0; m < ((nCols + BLOCK_SIZE - 1)/ BLOCK_SIZE); ++m)
{
if ((m * BLOCK_SIZE + threadIdx.x) < nCols) x_shared[threadIdx.x] = dx[threadIdx.x + m * BLOCK_SIZE];
else x_shared[threadIdx.x] = 0.f;
__syncthreads();
#pragma unroll
for (unsigned int e = 0; e < BLOCK_SIZE; ++e) {
y_val1 += dA[tid + (e + BLOCK_SIZE * m) * nRows] * x_shared[e];
y_val2 += dA[tid + gridDim.x * BLOCK_SIZE + (e + BLOCK_SIZE * m) * nRows] * x_shared[e];
}
__syncthreads();
}
if (tid < nRows) dy[tid] = y_val1;
if ((tid + gridDim.x * BLOCK_SIZE) < nRows) dy[tid + gridDim.x * BLOCK_SIZE] = y_val2;
}
This kernel should be launched with half as many threads, as in:
dim3 dim_grid((nRows/2 + BLOCK_SIZE -1)/ BLOCK_SIZE);
dim3 dim_block(BLOCK_SIZE);
matvec_kernel_ILP2<T> <<<dim_grid, dim_block>>>(dA, dx, dy, nRows, nx);
Finally, since you are using a device with compute capability 3.2, you can try using shuffle operations. I'm providing here the kernel using shuffle operations instead of shared memory. In this case, you should set BLOCK_SIZE = 32:
template<typename T>
__global__ void matvec_kernel_shfl(const T * __restrict__ dA, const T * __restrict__ dx, T * __restrict__ dy, const unsigned int nRows, const unsigned int nCols)
{
const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
T x_shfl_src, x_shfl_dest;
T y_val = 0.0;
#pragma unroll
for (unsigned int m = 0; m < ((nCols + BLOCK_SIZE - 1)/ BLOCK_SIZE); ++m)
{
if ((m * BLOCK_SIZE + threadIdx.x) < nCols) x_shfl_src = dx[threadIdx.x + m * BLOCK_SIZE];
else x_shfl_src = 0.f;
__syncthreads();
// #pragma unroll
for (int e = 0; e < 32; ++e) {
// --- Column-major ordering - faster
x_shfl_dest = __shfl(x_shfl_src, e);
y_val += dA[tid + (e + BLOCK_SIZE * m) * nRows] * x_shfl_dest;
// --- Row-major ordering - slower
//y_val += dA[tid * nCols + (e + BLOCK_SIZE * m)] * x_shared[e];
}
__syncthreads();
}
if (tid < nRows) dy[tid] = y_val;
}
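Note that __shfl() as used above is the pre-CUDA 9 intrinsic; on CUDA 9 and later it is deprecated in favour of the *_sync variants, so the inner loop would become something like (a sketch, assuming the full-warp mask):
for (int e = 0; e < 32; ++e) {
    x_shfl_dest = __shfl_sync(0xFFFFFFFF, x_shfl_src, e);
    y_val += dA[tid + (e + BLOCK_SIZE * m) * nRows] * x_shfl_dest;
}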
Shuffle operations improve the performance over shared memory for BLOCK_SIZE = 32 on a Kepler K20c as shown by the graph below:

Looking at your code I think that the way you traverse the elements of A may be the problem:
for (unsigned int e = 0; e < n_star; ++e) {
y_val += Asub[row + e * nRows] * x_shared[e];
}
So, when nRows becomes large, you actually read from global memory (which is where A is stored) with a large stride. In particular, this happens in every block: threads inside the same block will read from global memory in a non-consecutive fashion. This could be improved by storing the values of A row-by-row from the beginning (i.e., using row-major order). This is just a guess and I would have written it as a comment, but that requires a higher reputation on Stack Overflow...

Related

Solve the Poisson equation using FFT with CUDA

I'm following a tutorial on using the cuFFT library here: http://gpgpu.org/static/sc2007/SC07_CUDA_3_Libraries.pdf
After following its code line by line, I'm getting really strange results.
I have input data that is an NxN array of floats. The program does a forward FFT, solves Poisson's equation, and then does an inverse FFT. The input data (and output data) is referred to as a square image with sidelength N. When I comment out solve_poisson <<<dimGrid, dimBlock>>> (r_complex_d, kx_d, ky_d, N);, it correctly forward transforms the data and then performs an inverse transform, which causes the output data to be the same as the input data. This is supposed to happen.
Here is the output without calling the solve_poisson method.
0 r_initial: 0.00125126 r: 0.00125132
1 r_initial: 0.563585 r: 0.563585
2 r_initial: 0.193304 r: 0.193304
3 r_initial: 0.80874 r: 0.80874
4 r_initial: 0.585009 r: 0.585009
5 r_initial: 0.479873 r: 0.479873
6 r_initial: 0.350291 r: 0.350291
7 r_initial: 0.895962 r: 0.895962
8 r_initial: 0.82284 r: 0.82284
9 r_initial: 0.746605 r: 0.746605
10 r_initial: 0.174108 r: 0.174108
11 r_initial: 0.858943 r: 0.858943
12 r_initial: 0.710501 r: 0.710502
13 r_initial: 0.513535 r: 0.513535
14 r_initial: 0.303995 r: 0.303995
15 r_initial: 0.0149846 r: 0.0149846
Press any key to continue . . .
However, when I uncomment the solve_poisson call, the output data is inf or nan, which leads me to believe that the scale variable was somehow close to zero in the solve_poisson method.
So I changed float scale = -(kx[idx] * kx[idx] + ky[idy] * ky[idy]); to float scale = -(kx[idx] * kx[idx] + ky[idy] * ky[idy]) + 0.00001f. This change is not in the original tutorial. The results computed here are not supposed to have extreme positive or negative values.
0 r_initial: 0.00125126 r: -11448.1
1 r_initial: 0.563585 r: 11449.3
2 r_initial: 0.193304 r: -11448.3
3 r_initial: 0.80874 r: 11449.2
4 r_initial: 0.585009 r: 11449.4
5 r_initial: 0.479873 r: -11448.4
6 r_initial: 0.350291 r: 11449.5
7 r_initial: 0.895962 r: -11448.6
8 r_initial: 0.82284 r: -11448.5
9 r_initial: 0.746605 r: 11449.4
10 r_initial: 0.174108 r: -11448.3
11 r_initial: 0.858943 r: 11449.3
12 r_initial: 0.710501 r: 11449.2
13 r_initial: 0.513535 r: -11448.4
14 r_initial: 0.303995 r: 11449.3
15 r_initial: 0.0149846 r: -11448.1
Press any key to continue . . .
In the tutorial, a sample calculation on slide 43 on page 22 is computed=0.975879 reference=0.975882, yet my results are completely different and really large.
The following code is what I used.
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cufft.h>
#include <stdlib.h>
#include <iostream>
#define N 4 //4 X 4 // N is the sidelength of the image -> 16 pixels in entire image
#define block_size_x 2
#define block_size_y 2
__global__ void real2complex(cufftComplex *c, float *a, int n);
__global__ void complex2real_scaled(float *a, cufftComplex *c, float scale, int n);
__global__ void solve_poisson(cufftComplex *c, float *kx, float *ky, int n);
int main()
{
float *kx, *ky, *r;
kx = (float *)malloc(sizeof(float) * N);
ky = (float *)malloc(sizeof(float) * N);
r = (float *)malloc(sizeof(float) * N * N);
float *kx_d, *ky_d, *r_d;
cufftComplex *r_complex_d;
cudaMalloc((void **)&kx_d, sizeof(float) * N);
cudaMalloc((void **)&ky_d, sizeof(float) * N);
cudaMalloc((void **)&r_d, sizeof(float) * N * N);
cudaMalloc((void **)&r_complex_d, sizeof(cufftComplex) * N * N);
for (int y = 0; y < N; y++)
for (int x = 0; x < N; x++)
r[x + y * N] = rand() / (float)RAND_MAX;
//r[x + y * N] = sin(exp(-((x - N / 2.0f) * (x - N / 2.0f) + (N / 2.0f - y) * (N / 2.0f - y)) / (20 * 20))) * 255 / sin(1); //Here is sample data that will high values at the center of the image and low values as you go farther and farther away from the center.
float* r_inital = (float *)malloc(sizeof(float) * N * N);
for (int i = 0; i < N * N; i++)
r_inital[i] = r[i];
for (int i = 0; i < N; i++)
{
kx[i] = i - N / 2.0f; //centers kx values to be at center of image
ky[i] = N / 2.0f - i; //centers ky values to be at center of image
}
cudaMemcpy(kx_d, kx, sizeof(float) * N, cudaMemcpyHostToDevice);
cudaMemcpy(ky_d, ky, sizeof(float) * N, cudaMemcpyHostToDevice);
cudaMemcpy(r_d, r, sizeof(float) * N * N, cudaMemcpyHostToDevice);
cufftHandle plan;
cufftPlan2d(&plan, N, N, CUFFT_C2C);
/* Compute the execution configuration, block_size_x*block_size_y = number of threads */
dim3 dimBlock(block_size_x, block_size_y);
dim3 dimGrid(N / dimBlock.x, N / dimBlock.y);
/* Handle N not multiple of block_size_x or block_size_y */
if (N % block_size_x != 0) dimGrid.x += 1;
if (N % block_size_y != 0) dimGrid.y += 1;
real2complex << < dimGrid, dimBlock >> > (r_complex_d, r_d, N);
cufftExecC2C(plan, r_complex_d, r_complex_d, CUFFT_FORWARD);
solve_poisson << <dimGrid, dimBlock >> > (r_complex_d, kx_d, ky_d, N);
cufftExecC2C(plan, r_complex_d, r_complex_d, CUFFT_INVERSE);
float scale = 1.0f / (N * N);
complex2real_scaled << <dimGrid, dimBlock >> > (r_d, r_complex_d, scale, N);
cudaMemcpy(r, r_d, sizeof(float) * N * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N * N; i++)
std::cout << i << "\tr_initial: " << r_inital[i] << "\tr: " << r[i] << std::endl;
system("pause");
/* Destroy plan and clean up memory on device*/
free(kx);
free(ky);
free(r);
free(r_inital);
cufftDestroy(plan);
cudaFree(r_complex_d);
cudaFree(kx_d);
}
__global__ void real2complex(cufftComplex *c, float *a, int n)
{
/* compute idx and idy, the location of the element in the original NxN array */
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
if (idx < n && idy < n)
{
int index = idx + idy * n;
c[index].x = a[index];
c[index].y = 0.0f;
}
}
__global__ void complex2real_scaled(float *a, cufftComplex *c, float scale, int n)
{
/* compute idx and idy, the location of the element in the original NxN array */
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
if (idx < n && idy < n)
{
int index = idx + idy * n;
a[index] = scale * c[index].x;
}
}
__global__ void solve_poisson(cufftComplex *c, float *kx, float *ky, int n)
{
/* compute idx and idy, the location of the element in the original NxN array */
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
if (idx < n && idy < n)
{
int index = idx + idy * n;
float scale = -(kx[idx] * kx[idx] + ky[idy] * ky[idy]) + 0.00001f;
if (idx == 0 && idy == 0) scale = 1.0f;
scale = 1.0f / scale;
c[index].x *= scale;
c[index].y *= scale;
}
}
Is there anything I messed up on? I would really appreciate if anyone could help me out.
Although the poster found the mistake himself, I want to share my own implementation of the 2D Poisson equation solver.
The implementation slightly differs from the one linked to by the poster.
The theory is reported at Solve Poisson Equation Using FFT.
MATLAB VERSION
I first report the Matlab version for reference:
clear all
close all
clc
M = 64; % --- Number of Fourier harmonics along x (should be a multiple of 2)
N = 32; % --- Number of Fourier harmonics along y (should be a multiple of 2)
Lx = 3; % --- Domain size along x
Ly = 1.5; % --- Domain size along y
sigma = 0.1; % --- Characteristic width of f (make << 1)
% --- Wavenumbers
kx = (2 * pi / Lx) * [0 : (M / 2 - 1) (- M / 2) : (-1)]; % --- Wavenumbers along x
ky = (2 * pi / Ly) * [0 : (N / 2 - 1) (- N / 2) : (-1)]; % --- Wavenumbers along y
[Kx, Ky] = meshgrid(kx, ky);
% --- Right-hand side of differential equation
hx = Lx / M; % --- Grid spacing along x
hy = Ly / N; % --- Grid spacing along y
x = (0 : (M - 1)) * hx;
y = (0 : (N - 1)) * hy;
[X, Y] = meshgrid(x, y);
rSquared = (X - 0.5 * Lx).^2 + (Y - 0.5 * Ly).^2;
sigmaSquared = sigma^2;
f = exp(-rSquared / (2 * sigmaSquared)) .* (rSquared - 2 * sigmaSquared) / (sigmaSquared^2);
fHat = fft2(f);
% --- Denominator of the unknown spectrum
den = -(Kx.^2 + Ky.^2);
den(1, 1) = 1; % --- Avoid division by zero at wavenumber (0, 0)
% --- Unknown determination
uHat = ifft2(fHat ./ den);
% uHat(1, 1) = 0; % --- Force the unknown spectrum at (0, 0) to be zero
u = real(uHat);
u = u - u(1,1); % --- Force arbitrary constant to be zero by forcing u(1, 1) = 0
% --- Plots
uRef = exp(-rSquared / (2 * sigmaSquared));
err = 100 * sqrt(sum(sum(abs(u - uRef).^2)) / sum(sum(abs(uRef).^2)));
errMax = norm(u(:)-uRef(:),inf)
fprintf('Percentage root mean square error = %f\n', err);
fprintf('Maximum error = %f\n', errMax);
surf(X, Y, u)
xlabel('x')
ylabel('y')
zlabel('u')
title('Solution of 2D Poisson equation by spectral method')
CUDA VERSION
Here is the corresponding CUDA version:
#include <stdio.h>
#include <fstream>
#include <iomanip>
// --- Greek pi
#define _USE_MATH_DEFINES
#include <math.h>
#include <cufft.h>
#define BLOCKSIZEX 16
#define BLOCKSIZEY 16
#define prec_save 10
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/**************************************************/
/* COMPUTE RIGHT HAND SIDE OF 2D POISSON EQUATION */
/**************************************************/
__global__ void computeRHS(const float * __restrict__ d_x, const float * __restrict__ d_y,
float2 * __restrict__ d_r, const float Lx, const float Ly, const float sigma,
const int M, const int N) {
const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
const int tidy = threadIdx.y + blockIdx.y * blockDim.y;
if ((tidx >= M) || (tidy >= N)) return;
const float sigmaSquared = sigma * sigma;
const float rSquared = (d_x[tidx] - 0.5f * Lx) * (d_x[tidx] - 0.5f * Lx) +
(d_y[tidy] - 0.5f * Ly) * (d_y[tidy] - 0.5f * Ly);
d_r[tidy * M + tidx].x = expf(-rSquared / (2.f * sigmaSquared)) * (rSquared - 2.f * sigmaSquared) / (sigmaSquared * sigmaSquared);
d_r[tidy * M + tidx].y = 0.f;
}
/****************************************************/
/* SOLVE 2D POISSON EQUATION IN THE SPECTRAL DOMAIN */
/****************************************************/
__global__ void solvePoisson(const float * __restrict__ d_kx, const float * __restrict__ d_ky,
float2 * __restrict__ d_r, const int M, const int N)
{
const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
const int tidy = threadIdx.y + blockIdx.y * blockDim.y;
if ((tidx >= M) || (tidy >= N)) return;
float scale = -(d_kx[tidx] * d_kx[tidx] + d_ky[tidy] * d_ky[tidy]);
if (tidx == 0 && tidy == 0) scale = 1.f;
scale = 1.f / scale;
d_r[M * tidy + tidx].x *= scale;
d_r[M * tidy + tidx].y *= scale;
}
/****************************************************************************/
/* SOLVE 2D POISSON EQUATION IN THE SPECTRAL DOMAIN - SHARED MEMORY VERSION */
/****************************************************************************/
__global__ void solvePoissonShared(const float * __restrict__ d_kx, const float * __restrict__ d_ky,
float2 * __restrict__ d_r, const int M, const int N)
{
const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
const int tidy = threadIdx.y + blockIdx.y * blockDim.y;
if ((tidx >= M) || (tidy >= N)) return;
// --- Use shared memory to minimize multiple access to same spectral coordinate values
__shared__ float kx_s[BLOCKSIZEX], ky_s[BLOCKSIZEY];
kx_s[threadIdx.x] = d_kx[tidx];
ky_s[threadIdx.y] = d_ky[tidy];
__syncthreads();
float scale = -(kx_s[threadIdx.x] * kx_s[threadIdx.x] + ky_s[threadIdx.y] * ky_s[threadIdx.y]);
if (tidx == 0 && tidy == 0) scale = 1.f;
scale = 1.f / scale;
d_r[M * tidy + tidx].x *= scale;
d_r[M * tidy + tidx].y *= scale;
}
/******************************/
/* COMPLEX2REAL SCALED KERNEL */
/******************************/
__global__ void complex2RealScaled(float2 * __restrict__ d_r, float * __restrict__ d_result, const int M, const int N, float scale)
{
const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
const int tidy = threadIdx.y + blockIdx.y * blockDim.y;
if ((tidx >= M) || (tidy >= N)) return;
d_result[tidy * M + tidx] = scale * (d_r[tidy * M + tidx].x - d_r[0].x);
}
/******************************************/
/* COMPLEX2REAL SCALED KERNEL - OPTIMIZED */
/******************************************/
__global__ void complex2RealScaledOptimized(float2 * __restrict__ d_r, float * __restrict__ d_result, const int M, const int N, float scale)
{
const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
const int tidy = threadIdx.y + blockIdx.y * blockDim.y;
if ((tidx >= M) || (tidy >= N)) return;
__shared__ float d_r_0[1];
if (threadIdx.x == 0) d_r_0[0] = d_r[0].x;
volatile float2 c;
c.x = d_r[tidy * M + tidx].x;
c.y = d_r[tidy * M + tidx].y;
d_result[tidy * M + tidx] = scale * (c.x - d_r_0[0]);
}
/**************************************/
/* SAVE FLOAT2 ARRAY FROM GPU TO FILE */
/**************************************/
void saveGPUcomplextxt(const float2 * d_in, const char *filename, const int M) {
float2 *h_in = (float2 *)malloc(M * sizeof(float2));
gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(float2), cudaMemcpyDeviceToHost));
std::ofstream outfile;
outfile.open(filename);
for (int i = 0; i < M; i++) {
//printf("%f %f\n", h_in[i].c.x, h_in[i].c.y);
outfile << std::setprecision(prec_save) << h_in[i].x << "\n"; outfile << std::setprecision(prec_save) << h_in[i].y << "\n";
}
outfile.close();
}
/*************************************/
/* SAVE FLOAT ARRAY FROM GPU TO FILE */
/*************************************/
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {
T *h_in = (T *)malloc(M * sizeof(T));
gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));
std::ofstream outfile;
outfile.open(filename);
for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
outfile.close();
}
/********/
/* MAIN */
/********/
int main()
{
const int M = 64; // --- Number of Fourier harmonics along x (should be a multiple of 2)
const int N = 32; // --- Number of Fourier harmonics along y(should be a multiple of 2)
const float Lx = 3.f; // --- Domain size along x
const float Ly = 1.5f; // --- Domain size along y
const float sigma = 0.1f; // --- Characteristic width of f(make << 1)
// --- Wavenumbers on the host
float *h_kx = (float *)malloc(M * sizeof(float));
float *h_ky = (float *)malloc(N * sizeof(float));
for (int k = 0; k < M / 2; k++) h_kx[k] = (2.f * M_PI / Lx) * k;
for (int k = -M / 2; k < 0; k++) h_kx[k + M] = (2.f * M_PI / Lx) * k;
for (int k = 0; k < N / 2; k++) h_ky[k] = (2.f * M_PI / Ly) * k;
for (int k = -N / 2; k < 0; k++) h_ky[k + N] = (2.f * M_PI / Ly) * k;
// --- Wavenumbers on the device
float *d_kx; gpuErrchk(cudaMalloc(&d_kx, M * sizeof(float)));
float *d_ky; gpuErrchk(cudaMalloc(&d_ky, N * sizeof(float)));
gpuErrchk(cudaMemcpy(d_kx, h_kx, M * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_ky, h_ky, N * sizeof(float), cudaMemcpyHostToDevice));
// --- Domain discretization on the host
float *h_x = (float *)malloc(M * sizeof(float));
float *h_y = (float *)malloc(N * sizeof(float));
for (int k = 0; k < M; k++) h_x[k] = Lx / (float)M * k;
for (int k = 0; k < N; k++) h_y[k] = Ly / (float)N * k;
// --- Domain discretization on the device
float *d_x; gpuErrchk(cudaMalloc(&d_x, M * sizeof(float)));
float *d_y; gpuErrchk(cudaMalloc(&d_y, N * sizeof(float)));
gpuErrchk(cudaMemcpy(d_x, h_x, M * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_y, h_y, N * sizeof(float), cudaMemcpyHostToDevice));
// --- Compute right-hand side of the equation on the device
float2 *d_r; gpuErrchk(cudaMalloc(&d_r, M * N * sizeof(float2)));
dim3 dimBlock(BLOCKSIZEX, BLOCKSIZEY);
dim3 dimGrid(iDivUp(M, BLOCKSIZEX), iDivUp(N, BLOCKSIZEY));
computeRHS << <dimGrid, dimBlock >> >(d_x, d_y, d_r, Lx, Ly, sigma, M, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
// --- Create plan for CUDA FFT
cufftHandle plan;
cufftPlan2d(&plan, N, M, CUFFT_C2C);
// --- Compute in place forward FFT of right-hand side
cufftExecC2C(plan, d_r, d_r, CUFFT_FORWARD);
// --- Solve Poisson equation in Fourier space
//solvePoisson << <dimGrid, dimBlock >> > (d_kx, d_ky, d_r, M, N);
solvePoissonShared << <dimGrid, dimBlock >> > (d_kx, d_ky, d_r, M, N);
// --- Compute in place inverse FFT
cufftExecC2C(plan, d_r, d_r, CUFFT_INVERSE);
//saveGPUcomplextxt(d_r, "D:\\Project\\poisson2DFFT\\poisson2DFFT\\d_r.txt", M * N);
// --- With cuFFT, an FFT followed by an IFFT will return the same array times the size of the transform
// --- Accordingly, we need to scale the result.
const float scale = 1.f / ((float)M * (float)N);
float *d_result; gpuErrchk(cudaMalloc(&d_result, M * N * sizeof(float)));
//complex2RealScaled << <dimGrid, dimBlock >> > (d_r, d_result, M, N, scale);
complex2RealScaledOptimized << <dimGrid, dimBlock >> > (d_r, d_result, M, N, scale);
//saveGPUrealtxt(d_result, "D:\\Project\\poisson2DFFT\\poisson2DFFT\\d_result.txt", M * N);
// --- Transfer data from device to host
float *h_result = (float *)malloc(M * N * sizeof(float));
gpuErrchk(cudaMemcpy(h_result, d_result, M * N * sizeof(float), cudaMemcpyDeviceToHost));
return 0;
}
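The code above only needs the cuFFT library at link time; a possible compile line (a sketch, assuming the file is saved as poisson2DFFT.cu) is:
nvcc -O3 -o poisson2DFFT poisson2DFFT.cu -lcufft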
As the Matlab implementation on slide 33 (page 17) of the tutorial shows, the Poisson calculations take the top left corner of the screen as the origin. The x and y data values are then x = (0:(N-1))*h; and y = (0:(N-1))*h;, which is why the meshgrid created from these x and y values starts from 0 and increases along both axes, as shown on the graph's x and y axes on slide 31 (page 16). In this case, where the image length was 1 (I refer to the input data, the NxN float array or the meshgrid, as an image), the center of the image is actually at (0.5, 0.5). I wanted to translate these points so that the center point would instead be at (0, 0), following the typical representation of the Cartesian plane.
So in my code, instead of the Matlab code of
x = (0:(N-1))*h;
y = (0:(N-1))*h;
which could be implemented as
for (int i = 0; i < N; i++)
{
kx[i] = i;
ky[i] = i;
}
I replaced it with
for (int i = 0; i < N; i++)
{
kx[i] = i - N / 2.0f; //centers kx values to be at center of image
ky[i] = N / 2.0f - i; //centers ky values to be at center of image
}
However, I had forgotten to change the Poisson calculation so that it treats the center of the image as the origin instead of the top left corner. So, as Mr. Robert Crovella said, I would have to change this line: if (idx == 0 && idy == 0) scale = 1.0f; to this: if (idx == 2 && idy == 2) scale = 1.0f; for the case where the image length, or N, is 4.
To generalize this for any image length, this line of code can then be changed to if (idx == n/2 && idy == n/2) scale = 1.0f;
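Putting it together, a sketch of the corrected solve_poisson kernel with the wavenumbers centred on the image looks like this (only the location of the special-cased zero wavenumber changes; the +0.00001f workaround should then no longer be needed, since the only zero of scale is handled explicitly):
__global__ void solve_poisson(cufftComplex *c, float *kx, float *ky, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;
    if (idx < n && idy < n)
    {
        int index = idx + idy * n;
        float scale = -(kx[idx] * kx[idx] + ky[idy] * ky[idy]);
        // --- With centred wavenumbers, (kx, ky) = (0, 0) sits at (n/2, n/2)
        if (idx == n / 2 && idy == n / 2) scale = 1.0f;
        scale = 1.0f / scale;
        c[index].x *= scale;
        c[index].y *= scale;
    }
}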

Solving dense linear systems AX = B with CUDA

Can I use the new cuSOLVER library (CUDA 7) to solve linear systems of the form
AX = B
where A, X and B are NxN dense matrices ?
Yes.
Approach nr. 1
In the framework of cuSOLVER you can use QR decomposition, see QR decomposition to solve linear systems in CUDA.
Approach nr. 2
Alternatively, you can calculate the matrix inverse by successive invocation of
cublas<t>getrfBatched()
which calculates the LU decomposition of a matrix, and
cublas<t>getriBatched()
which calculates the inverse of the matrix starting from its LU decomposition.
Approach nr. 3
A final possibility is using
cublas<t>getrfBatched()
followed by a twofold invocation of
cublas<t>trsm()
which solves upper or lower triangular linear systems.
As pointed out by Robert Crovella, the answer may vary depending on the size and type of the matrices involved.
Code for approach nr. 1
Please, see QR decomposition to solve linear systems in CUDA.
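Incidentally, since the question is explicitly about cuSOLVER: besides the QR route of the link above, cuSOLVER's dense LU routines (cusolverDn<t>getrf / cusolverDn<t>getrs) also solve AX = B directly. A minimal sketch for double precision and a single right-hand side (error checking omitted; the function name solveLU is just for illustration):
#include <cusolverDn.h>
// --- A is NxN, column-major, on the device; B is Nx1 on the device. Both are overwritten.
void solveLU(double *d_A, double *d_B, const int N) {
    cusolverDnHandle_t handle; cusolverDnCreate(&handle);
    int lwork = 0;
    cusolverDnDgetrf_bufferSize(handle, N, N, d_A, N, &lwork);
    double *d_work; cudaMalloc(&d_work, lwork * sizeof(double));
    int *d_ipiv; cudaMalloc(&d_ipiv, N * sizeof(int));
    int *d_info; cudaMalloc(&d_info, sizeof(int));
    // --- In-place LU factorization with partial pivoting: P * A = L * U
    cusolverDnDgetrf(handle, N, N, d_A, N, d_work, d_ipiv, d_info);
    // --- Solve A * X = B using the factors; the solution overwrites d_B
    cusolverDnDgetrs(handle, CUBLAS_OP_N, N, 1, d_A, N, d_ipiv, d_B, N, d_info);
    cudaFree(d_work); cudaFree(d_ipiv); cudaFree(d_info);
    cusolverDnDestroy(handle);
}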
Code for approaches nr. 2 and nr. 3
Below, I'm reporting a worked example of the implementation of approaches nr. 2 and 3. Hankel matrices are used to feed the approaches with well-conditioned, invertible matrices. Please note that approach nr. 3 requires permuting (rearranging) the vector of system coefficients according to the pivot array obtained from the invocation of cublas<t>getrfBatched(). This permutation can be conveniently done on the CPU.
#include <stdio.h>
#include <fstream>
#include <iomanip>
#include <stdlib.h> /* srand, rand */
#include <time.h> /* time */
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include "Utilities.cuh"
#include "TimingGPU.cuh"
#define prec_save 10
#define BLOCKSIZE 256
#define BLOCKSIZEX 16
#define BLOCKSIZEY 16
/************************************/
/* SAVE REAL ARRAY FROM CPU TO FILE */
/************************************/
template <class T>
void saveCPUrealtxt(const T * h_in, const char *filename, const int M) {
std::ofstream outfile;
outfile.open(filename);
for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
outfile.close();
}
/************************************/
/* SAVE REAL ARRAY FROM GPU TO FILE */
/************************************/
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {
T *h_in = (T *)malloc(M * sizeof(T));
gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));
std::ofstream outfile;
outfile.open(filename);
for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
outfile.close();
}
/***************************************************/
/* FUNCTION TO SET THE VALUES OF THE HANKEL MATRIX */
/***************************************************/
// --- https://en.wikipedia.org/wiki/Hankel_matrix
void setHankelMatrix(double * __restrict h_A, const int N) {
double *h_atemp = (double *)malloc((2 * N - 1) * sizeof(double));
// --- Initialize random seed
srand(time(NULL));
// --- Generate random numbers
for (int k = 0; k < 2 * N - 1; k++) h_atemp[k] = rand();
// --- Fill the Hankel matrix. The Hankel matrix is symmetric, so filling by row or column is equivalent.
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
h_A[i * N + j] = h_atemp[(i + 1) + (j + 1) - 2];
free(h_atemp);
}
/***********************************************/
/* FUNCTION TO COMPUTE THE COEFFICIENTS VECTOR */
/***********************************************/
void computeCoefficientsVector(const double * __restrict h_A, const double * __restrict h_xref,
double * __restrict h_y, const int N) {
for (int k = 0; k < N; k++) h_y[k] = 0.f;
for (int m = 0; m < N; m++)
for (int n = 0; n < N; n++)
h_y[m] = h_y[m] + h_A[n * N + m] * h_xref[n];
}
/************************************/
/* COEFFICIENT REARRANGING FUNCTION */
/************************************/
void rearrange(double *vec, int *pivotArray, int N){
for (int i = 0; i < N; i++) {
double temp = vec[i];
vec[i] = vec[pivotArray[i] - 1];
vec[pivotArray[i] - 1] = temp;
}
}
/********/
/* MAIN */
/********/
int main() {
const unsigned int N = 1000;
const unsigned int Nmatrices = 1;
// --- CUBLAS initialization
cublasHandle_t cublas_handle;
cublasSafeCall(cublasCreate(&cublas_handle));
TimingGPU timerLU, timerApproach1, timerApproach2;
double timingLU, timingApproach1, timingApproach2;
/***********************/
/* SETTING THE PROBLEM */
/***********************/
// --- Matrices to be inverted (only one in this example)
double *h_A = (double *)malloc(N * N * Nmatrices * sizeof(double));
// --- Setting the Hankel matrix
setHankelMatrix(h_A, N);
// --- Defining the solution
double *h_xref = (double *)malloc(N * sizeof(double));
for (int k = 0; k < N; k++) h_xref[k] = 1.f;
// --- Coefficient vectors (only one in this example)
double *h_y = (double *)malloc(N * sizeof(double));
computeCoefficientsVector(h_A, h_xref, h_y, N);
// --- Result (only one in this example)
double *h_x = (double *)malloc(N * sizeof(double));
// --- Allocate device space for the input matrices
double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * Nmatrices * sizeof(double)));
double *d_y; gpuErrchk(cudaMalloc(&d_y, N * sizeof(double)));
double *d_x; gpuErrchk(cudaMalloc(&d_x, N * sizeof(double)));
// --- Move the relevant matrices from host to device
gpuErrchk(cudaMemcpy(d_A, h_A, N * N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_y, h_y, N * sizeof(double), cudaMemcpyHostToDevice));
/**********************************/
/* COMPUTING THE LU DECOMPOSITION */
/**********************************/
timerLU.StartCounter();
// --- Creating the array of pointers needed as input/output to the batched getrf
double **h_inout_pointers = (double **)malloc(Nmatrices * sizeof(double *));
for (int i = 0; i < Nmatrices; i++) h_inout_pointers[i] = d_A + i * N * N;
double **d_inout_pointers;
gpuErrchk(cudaMalloc(&d_inout_pointers, Nmatrices * sizeof(double *)));
gpuErrchk(cudaMemcpy(d_inout_pointers, h_inout_pointers, Nmatrices * sizeof(double *), cudaMemcpyHostToDevice));
free(h_inout_pointers);
int *d_pivotArray; gpuErrchk(cudaMalloc(&d_pivotArray, N * Nmatrices * sizeof(int)));
int *d_InfoArray; gpuErrchk(cudaMalloc(&d_InfoArray, Nmatrices * sizeof(int)));
int *h_InfoArray = (int *)malloc(Nmatrices * sizeof(int));
cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, d_pivotArray, d_InfoArray, Nmatrices));
//cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, NULL, d_InfoArray, Nmatrices));
gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));
for (int i = 0; i < Nmatrices; i++)
if (h_InfoArray[i] != 0) {
fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
cudaDeviceReset();
exit(EXIT_FAILURE);
}
timingLU = timerLU.GetCounter();
printf("Timing LU decomposition %f [ms]\n", timingLU);
/*********************************/
/* CHECKING THE LU DECOMPOSITION */
/*********************************/
saveCPUrealtxt(h_A, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\A.txt", N * N);
saveCPUrealtxt(h_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\y.txt", N);
saveGPUrealtxt(d_A, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\Adecomposed.txt", N * N);
saveGPUrealtxt(d_pivotArray, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\pivotArray.txt", N);
/******************************************************************************/
/* APPROACH NR.1: COMPUTE THE INVERSE OF A STARTING FROM ITS LU DECOMPOSITION */
/******************************************************************************/
timerApproach1.StartCounter();
// --- Allocate device space for the inverted matrices
double *d_Ainv; gpuErrchk(cudaMalloc(&d_Ainv, N * N * Nmatrices * sizeof(double)));
// --- Creating the array of pointers needed as output to the batched getri
double **h_out_pointers = (double **)malloc(Nmatrices * sizeof(double *));
for (int i = 0; i < Nmatrices; i++) h_out_pointers[i] = (double *)((char*)d_Ainv + i * ((size_t)N * N) * sizeof(double));
double **d_out_pointers;
gpuErrchk(cudaMalloc(&d_out_pointers, Nmatrices*sizeof(double *)));
gpuErrchk(cudaMemcpy(d_out_pointers, h_out_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice));
free(h_out_pointers);
cublasSafeCall(cublasDgetriBatched(cublas_handle, N, (const double **)d_inout_pointers, N, d_pivotArray, d_out_pointers, N, d_InfoArray, Nmatrices));
gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));
for (int i = 0; i < Nmatrices; i++)
if (h_InfoArray[i] != 0) {
fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singular\n", i);
cudaDeviceReset();
exit(EXIT_FAILURE);
}
double alpha1 = 1.f;
double beta1 = 0.f;
cublasSafeCall(cublasDgemv(cublas_handle, CUBLAS_OP_N, N, N, &alpha1, d_Ainv, N, d_y, 1, &beta1, d_x, 1));
timingApproach1 = timingLU + timerApproach1.GetCounter();
printf("Timing approach 1 %f [ms]\n", timingApproach1);
/**************************/
/* CHECKING APPROACH NR.1 */
/**************************/
saveGPUrealtxt(d_x, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach1.txt", N);
/*************************************************************/
/* APPROACH NR.2: INVERT UPPER AND LOWER TRIANGULAR MATRICES */
/*************************************************************/
timerApproach2.StartCounter();
double *d_P; gpuErrchk(cudaMalloc(&d_P, N * N * sizeof(double)));
gpuErrchk(cudaMemcpy(h_y, d_y, N * Nmatrices * sizeof(double), cudaMemcpyDeviceToHost));
int *h_pivotArray = (int *)malloc(N * Nmatrices*sizeof(int));
gpuErrchk(cudaMemcpy(h_pivotArray, d_pivotArray, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));
rearrange(h_y, h_pivotArray, N);
gpuErrchk(cudaMemcpy(d_y, h_y, N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));
// --- Now P*A=L*U
// Linear system A*x=y => P.'*L*U*x=y => L*U*x=P*y
// --- 1st phase - solve Ly = b
const double alpha = 1.f;
// --- Function solves the triangular linear system with multiple right hand sides, function overrides b as a result
// --- Lower triangular part
cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, d_A, N, d_y, N));
// --- Upper triangular part
cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, d_A, N, d_y, N));
timingApproach2 = timingLU + timerApproach2.GetCounter();
printf("Timing approach 2 %f [ms]\n", timingApproach2);
/**************************/
/* CHECKING APPROACH NR.2 */
/**************************/
saveGPUrealtxt(d_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach2.txt", N);
return 0;
}
The Utilities.cu and Utilities.cuh files needed to run such an example are maintained at this github page. The TimingGPU.cu and TimingGPU.cuh files are maintained at this github page.
Some useful references on the third approach:
NAG Fortran Library Routine Document
Scientific Computing Software Library (SCSL) User’s Guide
https://www.cs.drexel.edu/~jjohnson/2010-11/summer/cs680/programs/lapack/Danh/verify_sequential.c
EDIT
Timings (in ms) for approaches nr. 2 and 3 (tests performed on a GTX960 card, cc. 5.2).
N       LU decomposition    Approach nr. 2    Approach nr. 3
100     1.08                2.75              1.28
500     45.4                161               45.7
1000    302                 1053              303
As the table shows, approach nr. 3 is more convenient, and its cost is essentially that of computing the LU factorization. Furthermore:
Solving linear systems by LU decomposition is faster than using QR decomposition (see QR decomposition to solve linear systems in CUDA);
LU decomposition is limited to square linear systems, while QR decomposition helps in case of non-square linear systems.
The below Matlab code can be used for checking the results
clear all
close all
clc
warning off
N = 1000;
% --- Setting the problem solution
x = ones(N, 1);
%%%%%%%%%%%%%%%%%%%%%
% NxN HANKEL MATRIX %
%%%%%%%%%%%%%%%%%%%%%
% --- https://en.wikipedia.org/wiki/Hankel_matrix
load A.txt
load y.txt
A = reshape(A, N, N);
yMatlab = A * x;
fprintf('Percentage rms between coefficients vectors in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(yMatlab - y).^2)) / sum(sum(abs(yMatlab).^2))));
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% COMPUTATION OF THE LU DECOMPOSITION %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
[Lmatlab, Umatlab] = lu(A);
load Adecomposed.txt
Adecomposed = reshape(Adecomposed, N, N);
L = eye(N);
for k = 1 : N
L(k + 1 : N, k) = Adecomposed(k + 1 : N, k);
end
U = zeros(N);
for k = 1 : N
U(k, k : N) = Adecomposed(k, k : N);
end
load pivotArray.txt
Pj = eye(N);
for j = 1 : N
tempVector = Pj(j, :);
Pj(j, :) = Pj(pivotArray(j), :);
Pj(pivotArray(j), :) = tempVector;
end
fprintf('Percentage rms between Pj * A and L * U in CUDA %f\n', 100 * sqrt(sum(sum(abs(Pj * A - L * U).^2)) / sum(sum(abs(Pj * A).^2))));
xprime = inv(Lmatlab) * yMatlab;
xMatlab = inv(Umatlab) * xprime;
fprintf('Percentage rms between reference solution and solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));
load xApproach1.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.1 %f\n', 100 * sqrt(sum(sum(abs(xApproach1 - x).^2)) / sum(sum(abs(x).^2))));
load xApproach2.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.2 %f\n', 100 * sqrt(sum(sum(abs(xApproach2 - x).^2)) / sum(sum(abs(x).^2))));

Unable to execute device kernel in CUDA

I am trying to call a device function from within a global kernel. My global kernel is a matrix multiplication and my device function finds the maximum value and its index in each column of the product matrix. Following is the code:
__device__ void MaxFunction(float* Pd, float* max)
{
int x = (threadIdx.x + blockIdx.x * blockDim.x);
int y = (threadIdx.y + blockIdx.y * blockDim.y);
int k = 0;
int temp = 0; int temp_idx = 0;
for (k = 0; k < wB; ++k) {
if(Pd[x*wB + y] > temp){
temp = Pd[x*wB + y];
temp_idx = x*wB + y;
}
max[y*2 + 0] = temp;
max[y*2 + 1] = temp_idx;
}
}
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, float* max)
{
// declare cache in the shared memory
__shared__ float Mds[blockD][blockD];
__shared__ float Nds[blockD][blockD];
float Pvalue = 0;
// Loop over the Md and Nd block dimension required to compute the Pd element
for (int m = (wA * blockD * blockIdx.y), n = (blockD * blockIdx.x);
m < ((wA * blockD * blockIdx.y)+wA-1);
m += blockD, n += (blockD*hB)){
// collaboratively loading of Md and Nd blocks into shared memory
Mds[threadIdx.y][threadIdx.x] = Md[m + wA * threadIdx.y + threadIdx.x];
Nds[threadIdx.y][threadIdx.x] = Nd[n + wA * threadIdx.y + threadIdx.x];
__syncthreads();
// keep track of the running sum
for (int k = 0; k < blockD; k++)
Pvalue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];
__syncthreads();
}
// write back to the global memory
int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
__syncthreads();
MaxFunction(Pd, max);
}
The Main code :
#include<stdio.h>
#include "cuda.h"
#include<stdlib.h>
#define blockD 32
const int wA = 128;
const int hA = 1024;
const int wB = 128;
const int hB = wA;
int main(void){
void MatrixMultiplication(float *, float *, float *, float *);
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *M, *N, *P, *C;
// allocate memory on the CPU
M = (float*)malloc(size_A);
N = (float*)malloc(size_B);
P = (float*)malloc(size_max);
C = (float*)malloc(size_C);
// initialize the matrices
for (int y=0; y < hA; y++) {
for (int x=0; x < wA; x++){
M[y*wA + x] = x;
}
}
for (int y=0; y<hB; y++) {
for (int x=0; x<wB; x++){
N[y*wB + x] = x;
}
}
MatrixMultiplication(M, N, P, C);
//Write
FILE *f1;
int i, j;
f1 = fopen("max_val.txt","w");
for(i=0; i < (wB * 2); i+=2){
fprintf(f1,"%d\t%d\n",int(P[i]),int(P[i+1]));
}
fclose(f1);
f1 = fopen("Prod_mat.txt","w");
for(i=0; i < 2; i++){
for(j=0; j < wB; j++){
fprintf(f1,"%d\t",int(C[i*wB + j]));
}
fprintf(f1,"\n");
}
fclose(f1);
free( M );
free( N );
free( P );
free( C );
cudaDeviceReset();
return 0;
}
void MatrixMultiplication(float *M, float *N, float *P, float *C) {
int size_A = wA * hA * sizeof(float);
int size_B = wB * hB * sizeof(float);
int size_C = wB * hA * sizeof(float);
int size_max = 2 * wB * sizeof(float);
float *Md, *Nd, *Pd, *max;
// allocate memory on the GPU
cudaMalloc((void**)&Md, size_A);
cudaMalloc((void**)&Nd, size_B);
cudaMalloc((void**)&Pd, size_C);
cudaMalloc((void**)&max, size_max);
// transfer M and N to device memory
cudaMemcpy(Md, M, size_A, cudaMemcpyHostToDevice);
cudaMemcpy(Nd, N, size_B, cudaMemcpyHostToDevice);
// kernel invocation code
dim3 dimBlock(blockD, blockD);
dim3 dimGrid(wA/blockD, hB/blockD);
//Execute Kernel
MatrixMulKernel<<<dimGrid, dimBlock>>>( Md, Nd, Pd, max);
// transfer P from device
cudaMemcpy(P, max, size_max, cudaMemcpyDeviceToHost);
cudaMemcpy(C, Pd, size_C, cudaMemcpyDeviceToHost);
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
cudaFree(max);
}
The matrix multiplication result is fine (verified using Matlab), but I am not able to get the max values and their corresponding indices. I would appreciate it if anyone could kindly point out what I am doing wrong. The max variable contains only garbage when I run the above code.
Apparently you are attempting to find the maximum value in each column, as well as the offset to that value.
But all of your threads along x are hammering on the same location for the max value (max[y*2 + 0]). This isn't recommended, as there is no way to sort out the resulting race condition. You should use atomic operations, or other methods (e.g. a reduction), to handle multiple threads updating a single max value this way.
Since you need to update two values atomically (the max value and its location), it's not a simple matter of replacing your plain access with a standard atomic function. However, since you are dealing with two adjacent 32-bit quantities, you may be interested in my answer here.
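To illustrate the reduction alternative, here is a minimal sketch of my own (not the code from the linked answer): a separate kernel that finds the maximum and its position for one column per block, using a shared-memory tree reduction. It assumes the product matrix Pd has hA rows and wB columns in row-major order, as in the question, and that the block size is 256:
// --- Launch as ColumnMaxKernel<<<wB, 256>>>(Pd, d_max, hA, wB); requires <float.h> for FLT_MAX.
// --- d_max[col*2] receives the maximum of column `col`, d_max[col*2+1] its flat offset into Pd.
__global__ void ColumnMaxKernel(const float* Pd, float* d_max, int hA, int wB)
{
    __shared__ float s_val[256];
    __shared__ int   s_idx[256];
    int col = blockIdx.x;
    float best = -FLT_MAX;
    int best_row = 0;
    // --- Each thread scans a strided subset of the rows of this column
    for (int row = threadIdx.x; row < hA; row += blockDim.x) {
        float v = Pd[row * wB + col];
        if (v > best) { best = v; best_row = row; }
    }
    s_val[threadIdx.x] = best;
    s_idx[threadIdx.x] = best_row;
    __syncthreads();
    // --- Tree reduction in shared memory (blockDim.x assumed to be a power of two)
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s && s_val[threadIdx.x + s] > s_val[threadIdx.x]) {
            s_val[threadIdx.x] = s_val[threadIdx.x + s];
            s_idx[threadIdx.x] = s_idx[threadIdx.x + s];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        d_max[col * 2 + 0] = s_val[0];
        d_max[col * 2 + 1] = (float)(s_idx[0] * wB + col); // flat offset, as in the original code
    }
}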
By the way, I think MATLAB's native matrix multiply on gpuArray should be faster than any matrix multiply code you write, but it would require the Parallel Computing Toolbox.

solve Ax=b using CUDA, works for matrix size less than 128x128

I wrote code that uses the GPU to do the forward solve (obtain the U matrix from A = LU, where here the diagonal entries of U are not set to unity). My code works fine for matrices of dimension less than 128x128 that are a multiple of the block size, which is 16. I am puzzled why larger matrices give me the wrong result. Something goes wrong in the last row of blocks.
------------ gaussian.h --------------------
// Thread block size
#define BLOCK_SIZE 16 //for forward solve A = LU
#define BLOCK_SIZE2 32 //for finding the pivot factor
// A[i,j] = A[i,j] - A[k,j] / A[k,k] * A[i,k]
// pivot factor = A[k,j] / A[k,k]
////////////////////////////////////////////////////////////////////////////////////////
------------ gaussian.cu --------------------
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <cuda.h>
#include "gaussian_kernel.cu"
#define OUTPUT
void runTest(int argc, char** argv);
double gettime() {
struct timeval t;
gettimeofday(&t,NULL);
return t.tv_sec+t.tv_usec*1e-6;
}
int main(int argc, char** argv)
{
runTest(argc, argv);
}
void runTest(int argc, char** argv)
{
int dim;
if (argc == 2)
{
dim = atoi(argv[1]);
}
else{
printf("Wrong Usage\n");
exit(1);
}
// Note: A is a square matrix of size NxN where N % BLOCK_SIZE = 0
// Every row of A has a pivot column
// It is known that all the arithmetic operations involved for
// Gaussian Elimination are of type int (not float)
// allocate host memory for matrix A augmented with vector b (Ax=b)
unsigned int size_A = dim * (dim + BLOCK_SIZE);
unsigned int mem_size_A = sizeof(int) * size_A;
int* h_A = (int*) malloc(mem_size_A);
// initialize host memory, generate a test case such as below
// augmented matrix A with padding to make the size evenly divisible by BLOCK_SIZE
// ----A----- | b --padding--
// 1 1 1 1 .. | 0 * * * ..
// 1 2 2 2 .. | 1 * * * ..
// 1 2 3 3 .. | 2 * * * ..
// 1 2 3 4 .. | 3 * * * ..
// .......... | ...
// * means uninitialized entry
// A is size NxN
// Augmented matrix with padding is size Nx(N+BLOCK_SIZE)
int dimRow = dim;
int dimCol = dim + BLOCK_SIZE;
for(int i = 0; i < dim; i++)
{
h_A[(i + 1) * dimCol - BLOCK_SIZE] = i; // b vector stored in (N+1)th column
// of augmented matrix
for (int j = 0; j < dimRow - i; j++)
{
h_A[j + i + i * dimCol] = i + 1; // A[i][j] entry
h_A[j * dimCol + i + i * dimCol] = i + 1; // A[j][i] entry
}
}
//display the test case [ A | b ]
for ( int m = 0 ; m < dimRow; m++)
{
for ( int n = 0 ; n <= dimRow; n++)
//for ( int n = 0 ; n < dimCol; n++)
{
printf("%d ", h_A[m * dimCol + n]);
}
printf("\n");
}
// allocate device memory for the augmented matrix A
int* d_A;
cudaMalloc(&d_A, mem_size_A);
// start timer
double timer1 = gettime();
// copy host memory to device
cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
// setup execution parameters for obtaining the pivot factor
int gridP = (dimRow / BLOCK_SIZE2) + ( (dimRow % BLOCK_SIZE2) == 0 ? 0 : 1 );
// add 1 if dimRow/BLOCK_SIZE2 is not evenly divisible
// setup execution parameters for the forward solve (Gaussian Elimination)
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(dimCol / threads.x, dimRow / threads.y);
// execute the kernel
for ( int i = 0 ; i < dim; i++)
{
Gaussian_Pivot_CUDA<<< gridP, BLOCK_SIZE2 >>>(d_A, dimCol, i);
Gaussian_Elim_CUDA<<< grid, threads >>>(d_A, dimCol, i);
}
// copy result from device to host
cudaMemcpy(h_A, d_A, mem_size_A, cudaMemcpyDeviceToHost);
// stop timer
double timer2 = gettime();
printf("GPU time = %lf\n",(timer2-timer1)*1000);
// Back substitution
int X[dimRow];
for (int row = dimRow - 1; row >= 0; row--)
{
X[row] = h_A[(row + 1) * dimCol - BLOCK_SIZE]; // b[row] entry
for (int col = dimRow - 1; col > row; col--)
{
X[row] -= h_A[row * dimCol + col] * X[col];
}
X[row] /= h_A[row * dimCol + row];
printf("X[%d] = %d\t",row,X[row]);
}
printf("\n");
#ifdef OUTPUT
// result of Gaussian Elimination
// ----A----- | b --padding--
// 1 1 1 1 .. | 0 * * * ..
// 0 1 1 1 .. | 1 * * * ..
// 0 0 1 1 .. | 1 * * * ..
// 0 0 0 1 .. | 1 * * * ..
// .......... | ...
// * means garbage entry
for ( int m = 0 ; m < dimRow; m++)
{
for ( int n = 0 ; n <= dimRow; n++)
{
printf("%d ", h_A[m * dimCol + n]);
}
printf("\n");
}
#endif
free(h_A);
cudaFree(d_A);
}
///////////////////////////////////////////////////////////////////////////////
------------ gaussian_kernel.cu --------------------
#include "gaussian.h"
__global__
void Gaussian_Pivot_CUDA(int* A, int widthA, int Pcol)
{
// find the pivot factor
// A[i,j] = A[i,j] - A[k,j] / A[k,k] * A[i,k]
// pivot factor = A[k,j] / A[k,k]
int bx = blockIdx.x;
int tx = threadIdx.x;
int index_row = BLOCK_SIZE2 * bx + tx;
int index_rowA = index_row * widthA;
__shared__ int A_RowRow;
if (tx == 0)
{
A_RowRow = A[Pcol * widthA + Pcol]; // get A[k,k] where k is the pivot column
// for this problem pivot column = pivot row
}
__syncthreads();
A[index_rowA - 1] = A[index_rowA + Pcol] / A_RowRow;
// INVALID WRITE HERE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// store pivot factor at A[dimCol - 1][row]
}
__global__
void Gaussian_Elim_CUDA(int* A, int widthA, int Pcol)
{
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
int index_col = BLOCK_SIZE * bx + tx;
int index_row = BLOCK_SIZE * by + ty;
int index = widthA * index_row + index_col;
// d_A[index] "=" d_A[index_row][index_col]
// store pivot factor for rows that we are working on in this block
__shared__ int pivotFactor[BLOCK_SIZE];
if ( ty == 0 ) // use ty instead of tx for coalesced accesses
{
pivotFactor[tx] = A[(index_row + tx) * widthA - 1];
// INVALID WRITE HERE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
}
__syncthreads();
// implement the gaussian elimination for the current row
if ( index_row > Pcol )
{
A[index] -= A[Pcol * widthA + index_col] * pivotFactor[ty];
// A[i,j] = A[i,j] - A[k,j] / A[k,k] * A[i,k]
// pivot factor = A[k,j] / A[k,k]
}
}
cuda-memcheck told me that I was reading and writing stuff out of bounds.
My indexing was off. In Gaussian_Pivot_CUDA, the last line should be A[widthA*(index_row+1)-1] = ...
In Gaussian_Elim_CUDA, when writing pivotFactor I have to change ty to tx and read A[widthA*(index_row+1)-1].
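For reference, a sketch of what those corrected lines might look like, based on the description above (the bounds guard is my own addition and may not be needed):
// In Gaussian_Pivot_CUDA: store the pivot factor in the last (padding) column
// of row `index_row`, i.e. at A[widthA*(index_row+1)-1], not one row earlier.
if (index_row < widthA - BLOCK_SIZE)   // bounds guard (my addition)
    A[widthA * (index_row + 1) - 1] = A[index_rowA + Pcol] / A_RowRow;
// In Gaussian_Elim_CUDA: the thread with ty == 0 loads the factor for the row
// that lane tx of this block works on, from that row's last column.
if (ty == 0) {
    int factorRow = BLOCK_SIZE * by + tx;
    pivotFactor[tx] = A[widthA * (factorRow + 1) - 1];
}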
Thank you #Eugene !!!
If sizeof(int)==2, then this array index will overflow when dim is 128:
h_A[j * dimCol + i + i * dimCol] = i + 1; // A[j][i] entry
(127*144)+127+(127*144) = 36703 > 32767.

Optimizing the solution of the 2D diffusion (heat) equation in CUDA

I have already checked earlier questions on SO about this issue but am not able to see how they relate here.
I am solving the 2D diffusion equation with CUDA, and it turns out that my GPU code is slower than its CPU counterpart.
Here is my code:
//kernel definition
__global__ void diffusionSolver(double* A, int n_x,int n_y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if(i<n_x && j <n_y && i*(n_x-i-1)*j*(n_y-j-1)!=0)
A[i+n_y*j] = A[i+n_y*j] + (A[i-1+n_y*j]+A[i+1+n_y*j]+A[i+(j-1)*n_y]+A[i+(j+1)*n_y] -4.0*A[i+n_y*j])/40.0;
}
The main function:
int main()
{
int n_x = 200 ;
int n_y = 200 ;
double *phi;
double *dummy;
double *phi_old;
int i,j ;
phi = (double *) malloc( n_x*n_y* sizeof(double));
phi_old = (double *) malloc( n_x*n_y* sizeof(double));
dummy = (double *) malloc( n_x*n_y* sizeof(double));
int iterationMax =200;
for(j=0;j<n_y ;j++)
{
for(i=0;i<n_x;i++)
{
if((.4*n_x-i)*(.6*n_x-i)<0)
phi[i+n_y*j] = -1;
else
phi[i+n_y*j] = 1;
}
}
double *dev_phi;
cudaMalloc((void **) &dev_phi, n_x*n_y*sizeof(double));
cudaMemcpy(dev_phi, phi, n_x*n_y*sizeof(double),
cudaMemcpyHostToDevice);
dim3 threadsPerBlock(10,100);
dim3 numBlocks(n_x*n_y / threadsPerBlock.x, n_x*n_y / threadsPerBlock.y);
for(int z=0; z<iterationMax; z++)
{
if(z%100==0)
cout <<z/100 <<"\n";;
diffusionSolver<<<numBlocks, threadsPerBlock>>>(dev_phi, n_x,n_y);
}
cudaMemcpy(phi, dev_phi,n_x*n_y*sizeof(double), cudaMemcpyDeviceToHost);
cudaFree(dev_phi);
return 0;
}
The problem with this code is that it runs slower than a simple CPU-only iterative method. I don't know much about the profiler; so far I have only tried cuda-memcheck, which reports 0 errors.
How can I find out which part of the code is slow and speed it up? I am working in a Linux environment. Thanks in advance for any help.
The worst problem I see is that you are launching far too many blocks for the size of the input array. At the moment you are computing the grid size as:
dim3 numBlocks(n_x*n_y / threadsPerBlock.x, n_x*n_y / threadsPerBlock.y);
which yields a grid of (4000, 400) blocks for an input array of only 200x200. That is clearly incorrect. The calculation should be something like:
int nbx = (n_x / threadsPerBlock.x) + (((n_x % threadsPerBlock.x) == 0) ? 0 : 1);
int nby = (n_y / threadsPerBlock.y) + (((n_y % threadsPerBlock.y) == 0) ? 0 : 1);
dim3 numBlocks(nbx,nby);
which would yield a grid of (20, 2) blocks, i.e. 40000 times fewer than you are currently launching.
There are other optimisations which you could consider making to the kernel, but those pale into insignificance compared with mistakes of this magnitude.
You are doing a lot of integer multiplication and have a lot of global memory reads, both of which are slow in CUDA. I also imagine that there are not a lot of coalesced global memory reads.
The only way to speed up your kernel is to stage coalesced memory reads through shared memory and/or re-arrange your data so that you can index it without using lots of integer multiplication.
I don't have a great grasp of diffusion equations, but I don't think there is a lot of naive parallelism to be exploited. Take a look at the CUDA Programming Guide and the Best Practices Guide and maybe you'll get some ideas about how to improve your algorithm.
In case anybody is interested, I'm posting below a fully worked example concerning the optimization of the solution of the 2D heat equation.
Five approaches are considered, using:
Global memory, essentially the OP's approach;
Shared memory of size BLOCK_SIZE_X x BLOCK_SIZE_Y not loading the halo regions;
Shared memory of size BLOCK_SIZE_X x BLOCK_SIZE_Y loading the halo regions;
Shared memory of size (BLOCK_SIZE_X + 2) x (BLOCK_SIZE_Y + 2) loading the halo regions;
Texture memory.
Anyone can run the code and check which approach is fastest on their own GPU architecture.
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "Utilities.cuh"
#include "InputOutput.cuh"
#include "TimingGPU.cuh"
#define BLOCK_SIZE_X 16
#define BLOCK_SIZE_Y 16
#define DEBUG
texture<float, 2, cudaReadModeElementType> tex_T;
texture<float, 2, cudaReadModeElementType> tex_T_old;
/***********************************/
/* JACOBI ITERATION FUNCTION - GPU */
/***********************************/
__global__ void Jacobi_Iterator_GPU(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY)
{
const int i = blockIdx.x * blockDim.x + threadIdx.x ;
const int j = blockIdx.y * blockDim.y + threadIdx.y ;
// N
int P = i + j*NX; // node (i,j) |
int N = i + (j+1)*NX; // node (i,j+1) |
int S = i + (j-1)*NX; // node (i,j-1) W ---- P ---- E
int E = (i+1) + j*NX; // node (i+1,j) |
int W = (i-1) + j*NX; // node (i-1,j) |
// S
// --- Only update "interior" (not boundary) node points
if (i>0 && i<NX-1 && j>0 && j<NY-1) T_new[P] = 0.25 * (T_old[E] + T_old[W] + T_old[N] + T_old[S]);
}
/******************************************************/
/* JACOBI ITERATION FUNCTION - GPU - SHARED MEMORY V1 */
/******************************************************/
__global__ void Jacobi_Iterator_GPU_shared_v1(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY)
{
const int i = blockIdx.x * blockDim.x + threadIdx.x ;
const int j = blockIdx.y * blockDim.y + threadIdx.y ;
// N
int P = i + j*NX; // node (i,j) |
int N = i + (j+1)*NX; // node (i,j+1) |
int S = i + (j-1)*NX; // node (i,j-1) W ---- P ---- E
int E = (i+1) + j*NX; // node (i+1,j) |
int W = (i-1) + j*NX; // node (i-1,j) |
// S
__shared__ float T_sh[BLOCK_SIZE_X][BLOCK_SIZE_Y];
// --- Load data to shared memory. Halo regions are NOT loaded.
T_sh[threadIdx.x][threadIdx.y] = T_old[P];
__syncthreads();
if ((threadIdx.x > 0) && (threadIdx.x < (BLOCK_SIZE_X - 1)) && (threadIdx.y > 0) && (threadIdx.y < (BLOCK_SIZE_Y - 1)))
// --- If we do not need halo region elements, then use shared memory.
T_new[P] = 0.25 * (T_sh[threadIdx.x][threadIdx.y - 1] + T_sh[threadIdx.x][threadIdx.y + 1] + T_sh[threadIdx.x - 1][threadIdx.y] + T_sh[threadIdx.x + 1][threadIdx.y]);
else if (i>0 && i<NX-1 && j>0 && j<NY-1) // --- Only update "interior" (not boundary) node points
// --- If we need halo region elements, then use global memory.
T_new[P] = 0.25 * (T_old[E] + T_old[W] + T_old[N] + T_old[S]);
}
/******************************************************/
/* JACOBI ITERATION FUNCTION - GPU - SHARED MEMORY V2 */
/******************************************************/
__global__ void Jacobi_Iterator_GPU_shared_v2(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY)
{
const int i = blockIdx.x * (BLOCK_SIZE_X - 2) + threadIdx.x ;
const int j = blockIdx.y * (BLOCK_SIZE_Y - 2) + threadIdx.y ;
int P = i + j*NX;
if ((i >= NX) || (j >= NY)) return;
__shared__ float T_sh[BLOCK_SIZE_X][BLOCK_SIZE_Y];
// --- Load data to shared memory. Halo regions ARE loaded.
T_sh[threadIdx.x][threadIdx.y] = T_old[P];
__syncthreads();
if (((threadIdx.x > 0) && (threadIdx.x < (BLOCK_SIZE_X - 1)) && (threadIdx.y > 0) && (threadIdx.y < (BLOCK_SIZE_Y - 1))) &&
(i>0 && i<NX-1 && j>0 && j<NY-1))
T_new[P] = 0.25 * (T_sh[threadIdx.x][threadIdx.y - 1] + T_sh[threadIdx.x][threadIdx.y + 1] + T_sh[threadIdx.x - 1][threadIdx.y] + T_sh[threadIdx.x + 1][threadIdx.y]);
}
/******************************************************/
/* JACOBI ITERATION FUNCTION - GPU - SHARED MEMORY V3 */
/******************************************************/
__global__ void Jacobi_Iterator_GPU_shared_v3(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY)
{
const int i = blockIdx.x * blockDim.x + threadIdx.x ;
const int j = blockIdx.y * blockDim.y + threadIdx.y ;
const int tid_block = threadIdx.y * BLOCK_SIZE_X + threadIdx.x; // --- Flattened thread index within a block
const int i1 = tid_block % (BLOCK_SIZE_X + 2);
const int j1 = tid_block / (BLOCK_SIZE_Y + 2);
const int i2 = (BLOCK_SIZE_X * BLOCK_SIZE_Y + tid_block) % (BLOCK_SIZE_X + 2);
const int j2 = (BLOCK_SIZE_X * BLOCK_SIZE_Y + tid_block) / (BLOCK_SIZE_Y + 2);
int P = i + j * NX;
if ((i >= NX) || (j >= NY)) return;
__shared__ float T_sh[BLOCK_SIZE_X + 2][BLOCK_SIZE_Y + 2];
if (((blockIdx.x * BLOCK_SIZE_X - 1 + i1) < NX) && ((blockIdx.y * BLOCK_SIZE_Y - 1 + j1) < NY))
T_sh[i1][j1] = T_old[(blockIdx.x * BLOCK_SIZE_X - 1 + i1) + (blockIdx.y * BLOCK_SIZE_Y - 1 + j1) * NX];
if (((i2 < (BLOCK_SIZE_X + 2)) && (j2 < (BLOCK_SIZE_Y + 2))) && (((blockIdx.x * BLOCK_SIZE_X - 1 + i2) < NX) && ((blockIdx.y * BLOCK_SIZE_Y - 1 + j2) < NY)))
T_sh[i2][j2] = T_old[(blockIdx.x * BLOCK_SIZE_X - 1 + i2) + (blockIdx.y * BLOCK_SIZE_Y - 1 + j2) * NX];
__syncthreads();
if ((threadIdx.x <= (BLOCK_SIZE_X - 1) && (threadIdx.y <= (BLOCK_SIZE_Y - 1))) && (i>0 && i<NX-1 && j>0 && j<NY-1))
T_new[P] = 0.25 * (T_sh[threadIdx.x + 1][threadIdx.y] + T_sh[threadIdx.x + 1][threadIdx.y + 2] + T_sh[threadIdx.x][threadIdx.y + 1] + T_sh[threadIdx.x + 2][threadIdx.y + 1]);
}
/*********************************************/
/* JACOBI ITERATION FUNCTION - GPU - TEXTURE */
/*********************************************/
__global__ void Jacobi_Iterator_GPU_texture(float * __restrict__ T_new, const bool flag, const int NX, const int NY) {
const int i = blockIdx.x * blockDim.x + threadIdx.x ;
const int j = blockIdx.y * blockDim.y + threadIdx.y ;
float P, N, S, E, W;
if (flag) {
// N
P = tex2D(tex_T_old, i, j); // node (i,j) |
N = tex2D(tex_T_old, i, j + 1); // node (i,j+1) |
S = tex2D(tex_T_old, i, j - 1); // node (i,j-1) W ---- P ---- E
E = tex2D(tex_T_old, i + 1, j); // node (i+1,j) |
W = tex2D(tex_T_old, i - 1, j); // node (i-1,j) |
// S
} else {
// N
P = tex2D(tex_T, i, j); // node (i,j) |
N = tex2D(tex_T, i, j + 1); // node (i,j+1) |
S = tex2D(tex_T, i, j - 1); // node (i,j-1) W ---- P ---- E
E = tex2D(tex_T, i + 1, j); // node (i+1,j) |
W = tex2D(tex_T, i - 1, j); // node (i-1,j) |
// S
}
// --- Only update "interior" (not boundary) node points
if (i>0 && i<NX-1 && j>0 && j<NY-1) T_new[i + j*NX] = 0.25 * (E + W + N + S);
}
/***********************************/
/* JACOBI ITERATION FUNCTION - CPU */
/***********************************/
void Jacobi_Iterator_CPU(float * __restrict T, float * __restrict T_new, const int NX, const int NY, const int MAX_ITER)
{
for(int iter=0; iter<MAX_ITER; iter=iter+2)
{
// --- Only update "interior" (not boundary) node points
for(int j=1; j<NY-1; j++)
for(int i=1; i<NX-1; i++) {
float T_E = T[(i+1) + NX*j];
float T_W = T[(i-1) + NX*j];
float T_N = T[i + NX*(j+1)];
float T_S = T[i + NX*(j-1)];
T_new[i+NX*j] = 0.25*(T_E + T_W + T_N + T_S);
}
for(int j=1; j<NY-1; j++)
for(int i=1; i<NX-1; i++) {
float T_E = T_new[(i+1) + NX*j];
float T_W = T_new[(i-1) + NX*j];
float T_N = T_new[i + NX*(j+1)];
float T_S = T_new[i + NX*(j-1)];
T[i+NX*j] = 0.25*(T_E + T_W + T_N + T_S);
}
}
}
/******************************/
/* TEMPERATURE INITIALIZATION */
/******************************/
void Initialize(float * __restrict h_T, const int NX, const int NY)
{
// --- Set left wall to 1
for(int j=0; j<NY; j++) h_T[j * NX] = 1.0;
}
/********/
/* MAIN */
/********/
int main()
{
const int NX = 256; // --- Number of discretization points along the x axis
const int NY = 256; // --- Number of discretization points along the y axis
const int MAX_ITER = 100; // --- Number of Jacobi iterations
// --- CPU temperature distributions
float *h_T = (float *)calloc(NX * NY, sizeof(float));
float *h_T_old = (float *)calloc(NX * NY, sizeof(float));
Initialize(h_T, NX, NY);
Initialize(h_T_old, NX, NY);
float *h_T_GPU_result = (float *)malloc(NX * NY * sizeof(float));
float *h_T_GPU_tex_result = (float *)malloc(NX * NY * sizeof(float));
float *h_T_GPU_sh1_result = (float *)malloc(NX * NY * sizeof(float));
float *h_T_GPU_sh2_result = (float *)malloc(NX * NY * sizeof(float));
float *h_T_GPU_sh3_result = (float *)malloc(NX * NY * sizeof(float));
// --- GPU temperature distribution
float *d_T; gpuErrchk(cudaMalloc((void**)&d_T, NX * NY * sizeof(float)));
float *d_T_old; gpuErrchk(cudaMalloc((void**)&d_T_old, NX * NY * sizeof(float)));
float *d_T_tex; gpuErrchk(cudaMalloc((void**)&d_T_tex, NX * NY * sizeof(float)));
float *d_T_old_tex; gpuErrchk(cudaMalloc((void**)&d_T_old_tex, NX * NY * sizeof(float)));
float *d_T_sh1; gpuErrchk(cudaMalloc((void**)&d_T_sh1, NX * NY * sizeof(float)));
float *d_T_old_sh1; gpuErrchk(cudaMalloc((void**)&d_T_old_sh1, NX * NY * sizeof(float)));
float *d_T_sh2; gpuErrchk(cudaMalloc((void**)&d_T_sh2, NX * NY * sizeof(float)));
float *d_T_old_sh2; gpuErrchk(cudaMalloc((void**)&d_T_old_sh2, NX * NY * sizeof(float)));
float *d_T_sh3; gpuErrchk(cudaMalloc((void**)&d_T_sh3, NX * NY * sizeof(float)));
float *d_T_old_sh3; gpuErrchk(cudaMalloc((void**)&d_T_old_sh3, NX * NY * sizeof(float)));
gpuErrchk(cudaMemcpy(d_T, h_T, NX * NY * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_T_tex, h_T, NX * NY * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_T_sh1, h_T, NX * NY * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_T_sh2, h_T, NX * NY * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_T_sh3, h_T, NX * NY * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_T_old, d_T, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice));
gpuErrchk(cudaMemcpy(d_T_old_tex, d_T_tex, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice));
gpuErrchk(cudaMemcpy(d_T_old_sh1, d_T_sh1, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice));
gpuErrchk(cudaMemcpy(d_T_old_sh2, d_T_sh2, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice));
gpuErrchk(cudaMemcpy(d_T_old_sh3, d_T_sh3, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice));
//cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
cudaChannelFormatDesc desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
gpuErrchk(cudaBindTexture2D(NULL, &tex_T, d_T_tex, &desc, NX, NY, sizeof(float) * NX));
gpuErrchk(cudaBindTexture2D(NULL, &tex_T_old, d_T_old_tex, &desc, NX, NY, sizeof(float) * NX));
tex_T.addressMode[0] = cudaAddressModeWrap;
tex_T.addressMode[1] = cudaAddressModeWrap;
tex_T.filterMode = cudaFilterModePoint;
tex_T.normalized = false;
tex_T_old.addressMode[0] = cudaAddressModeWrap;
tex_T_old.addressMode[1] = cudaAddressModeWrap;
tex_T_old.filterMode = cudaFilterModePoint;
tex_T_old.normalized = false;
// --- Grid size
dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y);
dim3 dimGrid (iDivUp(NX, BLOCK_SIZE_X), iDivUp(NY, BLOCK_SIZE_Y));
// --- Jacobi iterations on the host
Jacobi_Iterator_CPU(h_T, h_T_old, NX, NY, MAX_ITER);
// --- Jacobi iterations on the device
TimingGPU timerGPU;
timerGPU.StartCounter();
for (int k=0; k<MAX_ITER; k=k+2) {
Jacobi_Iterator_GPU<<<dimGrid, dimBlock>>>(d_T, d_T_old, NX, NY); // --- Update d_T_old starting from data stored in d_T
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
Jacobi_Iterator_GPU<<<dimGrid, dimBlock>>>(d_T_old, d_T , NX, NY); // --- Update d_T starting from data stored in d_T_old
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
printf("Timing = %f ms\n", timerGPU.GetCounter());
// --- Jacobi iterations on the device - shared memory v1
timerGPU.StartCounter();
for (int k=0; k<MAX_ITER; k=k+2) {
Jacobi_Iterator_GPU_shared_v1<<<dimGrid, dimBlock>>>(d_T_sh1, d_T_old_sh1, NX, NY); // --- Update d_T_old starting from data stored in d_T
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
Jacobi_Iterator_GPU_shared_v1<<<dimGrid, dimBlock>>>(d_T_old_sh1, d_T_sh1 , NX, NY); // --- Update d_T starting from data stored in d_T_old
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
printf("Timing with shared memory v1 = %f ms\n", timerGPU.GetCounter());
// --- Jacobi iterations on the device - shared memory v2
dim3 dimBlock2(BLOCK_SIZE_X, BLOCK_SIZE_Y);
dim3 dimGrid2 (iDivUp(NX, BLOCK_SIZE_X - 2), iDivUp(NY, BLOCK_SIZE_Y - 2));
timerGPU.StartCounter();
for (int k=0; k<MAX_ITER; k=k+2) {
Jacobi_Iterator_GPU_shared_v2<<<dimGrid2, dimBlock>>>(d_T_sh2, d_T_old_sh2, NX, NY); // --- Update d_T_old starting from data stored in d_T
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
Jacobi_Iterator_GPU_shared_v2<<<dimGrid2, dimBlock>>>(d_T_old_sh2, d_T_sh2 , NX, NY); // --- Update d_T starting from data stored in d_T_old
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
printf("Timing with shared memory v2 = %f ms\n", timerGPU.GetCounter());
// --- Jacobi iterations on the device - shared memory v3
timerGPU.StartCounter();
for (int k=0; k<MAX_ITER; k=k+2) {
Jacobi_Iterator_GPU_shared_v3<<<dimGrid, dimBlock>>>(d_T_sh3, d_T_old_sh3, NX, NY); // --- Update d_T_old starting from data stored in d_T
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
Jacobi_Iterator_GPU_shared_v3<<<dimGrid, dimBlock>>>(d_T_old_sh3, d_T_sh3 , NX, NY); // --- Update d_T starting from data stored in d_T_old
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
printf("Timing with shared memory v3 = %f ms\n", timerGPU.GetCounter());
// --- Jacobi iterations on the device - texture case
timerGPU.StartCounter();
for (int k=0; k<MAX_ITER; k=k+2) {
Jacobi_Iterator_GPU_texture<<<dimGrid, dimBlock>>>(d_T_old_tex, 0, NX, NY); // --- Update d_T_tex starting from data stored in d_T_old_tex
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
Jacobi_Iterator_GPU_texture<<<dimGrid, dimBlock>>>(d_T_tex, 1, NX, NY); // --- Update d_T_old_tex starting from data stored in d_T_tex
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
printf("Timing with texture = %f ms\n", timerGPU.GetCounter());
saveCPUrealtxt(h_T, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\CPU_result.txt", NX * NY);
saveGPUrealtxt(d_T_tex, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result_tex.txt", NX * NY);
saveGPUrealtxt(d_T, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result.txt", NX * NY);
saveGPUrealtxt(d_T_sh1, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result_sh1.txt", NX * NY);
saveGPUrealtxt(d_T_sh2, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result_sh2.txt", NX * NY);
saveGPUrealtxt(d_T_sh3, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result_sh3.txt", NX * NY);
// --- Copy results from device to host
gpuErrchk(cudaMemcpy(h_T_GPU_result, d_T, NX * NY * sizeof(float), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_T_GPU_tex_result, d_T_tex, NX * NY * sizeof(float), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_T_GPU_sh1_result, d_T_sh1, NX * NY * sizeof(float), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_T_GPU_sh2_result, d_T_sh2, NX * NY * sizeof(float), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_T_GPU_sh3_result, d_T_sh3, NX * NY * sizeof(float), cudaMemcpyDeviceToHost));
// --- Calculate percentage root mean square error between host and device results
float sum = 0.f, sum_tex = 0.f, sum_ref = 0.f, sum_sh1 = 0.f, sum_sh2 = 0.f, sum_sh3 = 0.f;
for (int j=0; j<NY; j++)
for (int i=0; i<NX; i++) {
sum = sum + (h_T_GPU_result [j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_result [j * NX + i] - h_T[j * NX + i]);
sum_tex = sum_tex + (h_T_GPU_tex_result[j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_tex_result[j * NX + i] - h_T[j * NX + i]);
sum_sh1 = sum_sh1 + (h_T_GPU_sh1_result[j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_sh1_result[j * NX + i] - h_T[j * NX + i]);
sum_sh2 = sum_sh2 + (h_T_GPU_sh2_result[j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_sh2_result[j * NX + i] - h_T[j * NX + i]);
sum_sh3 = sum_sh3 + (h_T_GPU_sh3_result[j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_sh3_result[j * NX + i] - h_T[j * NX + i]);
sum_ref = sum_ref + h_T[j * NX + i] * h_T[j * NX + i];
}
printf("Percentage root mean square error = %f\n", 100.*sqrt(sum / sum_ref));
printf("Percentage root mean square error texture = %f\n", 100.*sqrt(sum_tex / sum_ref));
printf("Percentage root mean square error shared v1 = %f\n", 100.*sqrt(sum_sh1 / sum_ref));
printf("Percentage root mean square error shared v2 = %f\n", 100.*sqrt(sum_sh2 / sum_ref));
printf("Percentage root mean square error shared v3 = %f\n", 100.*sqrt(sum_sh3 / sum_ref));
return 0;
}
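Note that the code above relies on gpuErrchk, iDivUp, TimingGPU and the save*realtxt routines from the included headers, which are not reproduced here. The first two are commonly defined along the following lines (a plausible sketch, not the actual contents of Utilities.cuh):
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
// Round-up integer division, used above to size the launch grid.
__host__ __device__ inline int iDivUp(int a, int b) { return (a + b - 1) / b; }
// Abort with file/line information if a CUDA runtime call fails.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}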