How to use register memory for each thread in CUDA?

How to use register memory for each thread in CUDA? - cuda

I am trying to flip upside down the array which size is big.(ex. 4096x8192)
At first, I tried with two array for input and output and It works!.
(I will say input is original and output is flipped array)
But I thought it will be easier and much efficient if each thread can hold input elements.
Then I can only use one array!
Could you guys share your knowledge or introduce any documents that help this problem?
Thanks and here is my code.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define ThreadPB 32 // optimal size
dim3 threadsPerBlock(ThreadPB, ThreadPB);
__global__ void initKernel(int *input, int nx, int ny)
{
int idx_x = blockDim.x * blockIdx.x + threadIdx.x;
int idx_y = blockDim.y * blockIdx.y + threadIdx.y;
int idx = idx_y * nx + idx_x;
if (idx_x < nx && idx_y < ny) {
input[idx] = idx_y;
}
}
__global__ void flipKernel(int *output, int *input, int nx, int ny)
{
int idx_x = blockDim.x * blockIdx.x + threadIdx.x;
int idx_y = blockDim.y * blockIdx.y + threadIdx.y;
int idx = idx_y * nx + idx_x;
// is it possible to use only one array?
if (idx_x < nx && idx_y < ny) {
output[(ny - idx_y - 1) * nx + idx_x] = input[idx_y * nx + idx_x];
}
}
int main()
{
// time check
cudaEvent_t start, stop, start_temp, stop_temp;
cudaEvent_t start_temp2, stop_temp2;
float elapsedTime, elapsedTime_temp, elapsedTime_temp2;
cudaEventCreate(&start); cudaEventCreate(&stop);
cudaEventCreate(&start_temp); cudaEventCreate(&stop_temp);
cudaEventCreate(&start_temp2); cudaEventCreate(&stop_temp2);
const int num_x = 4096;
const int num_y = 8192;
const int arraySize = num_x * num_y;
int *orig, *flip;
orig = (int *)malloc(sizeof(int) * arraySize);
flip = (int *)malloc(sizeof(int) * arraySize);
int *dev_orig = 0;
int *dev_flip = 0;
cudaMalloc((void**)&dev_orig, arraySize * sizeof(int));
cudaMalloc((void**)&dev_flip, arraySize * sizeof(int));
cudaMemcpy(dev_orig, orig, arraySize * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_flip, flip, arraySize * sizeof(int), cudaMemcpyHostToDevice);
dim3 blocksFlip((num_x + threadsPerBlock.x - 1) / threadsPerBlock.x, (num_y + threadsPerBlock.y - 1) / threadsPerBlock.y);
initKernel << <blocksFlip, threadsPerBlock >> > (dev_orig, num_x, num_y);
cudaEventRecord(start, 0);
flipKernel << <blocksFlip, threadsPerBlock >> > (dev_flip, dev_orig, num_x, num_y);
// time check end
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTime, start, stop); printf("flip 1024x2048 처리 시간 = %f ms.\n", elapsedTime);
cudaMemcpy(orig, dev_orig, arraySize * sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(flip, dev_flip, arraySize * sizeof(int), cudaMemcpyDeviceToHost);
// check flip works
printf("FLIP this array { 0, 1, 2, 3, 4 , 5, 6, 7, 8, 9...} \n= { %d, %d, %d, %d, %d, %d, %d, %d, %d, %d...}\n",
flip[num_x * 0], flip[num_x * 1], flip[num_x * 2], flip[num_x * 3], flip[num_x * 4],
flip[num_x * 5], flip[num_x * 6], flip[num_x * 7], flip[num_x * 8], flip[num_x * 9]);
return 0;
}

For an even number of rows in the array, you should be able to do something like this:
__global__ void flipKernel(int *input, int nx, int ny)
{
int idx_x = blockDim.x * blockIdx.x + threadIdx.x;
int idx_y = blockDim.y * blockIdx.y + threadIdx.y;
int idx = idx_y * nx + idx_x;
if (idx_x < nx && idx_y < ny/2) {
int output_temp = input[(ny - idx_y - 1) * nx + idx_x];
input[(ny - idx_y - 1) * nx + idx_x] = input[idx_y * nx + idx_x];
input[idx_y * nx + idx_x] = output_temp;
}
}
You would only need to launch this kernel with half as many threads in y (half as many rows in y). Each thread is updating two values in the matrix.
Rather than thinking about things like "register" or imagining that CUDA is some kind of weird language, if you have C or C++ programming ability, I would encourage you to think about how you might solve the problem if it were framed as an ordinary C or C++ programming challenge. Your intuition from that will often work very well in CUDA.
The core of the routine above is just a swap. The thing you are referring to as a "register" is just an ordinary local variable in C or C++. There is a register keyword in C++, but it serves essentially no purpose in CUDA, and is not needed here anyway.
You can handle an odd number of rows by simply leaving the middle row as-is, and swapping the remaining rows. This would require just a slight change to the indexing calculations.

Related

Why PyCUDA is faster than C CUDA in this example

I am exploring to move from OpenCL to CUDA, and did a few tests to benchmark the speed of CUDA in various implementations. To my surprise, in the examples below, the PyCUDA implementation is about 20% faster than the C CUDA example.
I read many posts talking about "release build" of C CUDA code. I did try having -Xptxas -O3 in the makefile and that really did not make a difference. I also tried to adjust the block size, with which the kernel was executed. Unfortunately, it did not help improve the speed, either.
My questions here are:
What could be the reasons leading to the speed difference between C CUDA and PYCUDA?
If the "advanced" (lack of a better word) compiling in PYCUDA is one of reasons, how can I optimize the compiling of my C CUDA code?
Are there any other ways to improve the speed of C CUDA in this case?
While I appreciate general comments, I am looking for actionable suggestions that I can validate on my machine. Thanks!
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule
import time
mod = SourceModule(
"""
__global__ void saxpy(int n, const float a, float *x, float *y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n){
y[i] = a * x[i] + y[i];
}
}
"""
)
saxpy = mod.get_function("saxpy")
N = 1 << 25
time_elapse = 0.0
for i in range(100):
# print(i)
# print(N)
x = np.ones(N).astype(np.float32)
y = 2 * np.ones(N).astype(np.float32)
start = time.time()
saxpy(
np.int32(N),
np.float32(2.0),
drv.In(x),
drv.InOut(y),
block=(512, 1, 1),
grid=(int(N / 512) + 1, 1),
)
time_elapse += (time.time() - start)
print(time_elapse )
print(y[-100:-1])
print(y.sum())
print(N * 4.0)
#include <stdio.h>
#include <time.h>
#define DIM 512
__global__ void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n)
y[i] = a * x[i] + y[i];
}
int main(int num_iterations)
{
double start;
double cputime;
int N = 1 << 25;
float *x, *y, *d_x, *d_y;
int i, j;
for (j = 0; j < num_iterations; j++)
{
x = (float *)malloc(N * sizeof(float));
y = (float *)malloc(N * sizeof(float));
cudaMalloc(&d_x, N * sizeof(float));
cudaMalloc(&d_y, N * sizeof(float));
for (i = 0; i < N; i++)
{
x[i] = 1.0f;
y[i] = 2.0f;
}
cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
// Perform SAXPY on 1M elements
start = clock();
saxpy<<<(N + DIM) / DIM, DIM>>>(N, 2.0f, d_x, d_y);
cputime += ((double)(clock() - start) / CLOCKS_PER_SEC);
cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
// float maxError = 0.0f;
// for (int i = 0; i < N; i++){
// maxError = max(maxError, abs(y[i] - 4.0f));
// //printf("y[%d]: %f\n", i,y[i]);
// }
// printf("Max error: %f\n", maxError);
cudaFree(d_x);
cudaFree(d_y);
free(x);
free(y);
}
printf("cpu time is %f\n", cputime);
return 0;
}
I saved the above file as cuda_example.cu and compile it with the following commands in a makefile:
nvcc -arch=sm_61 -Xptxas -O3,-v -o main cuda_example.cu

If I execute your CUDA-C code as is, and set num_iterations to 300 like this:
int num_iterations =300;
then the execution of your program takes about 60s on a Geforce GTX 1650. Your code is extremely inefficient, as you copy data back and forth between GPU and device at every iteration.
So, lets restrict the loop to just the kernel execution:
#include <stdio.h>
#include <time.h>
#define DIM 512
__global__ void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n)
y[i] = a * x[i] + y[i];
}
int main()
{
double start = clock();
int N = 1 << 25;
float *x, *y, *d_x, *d_y;
int i, j;
int num_iterations = 300;
x = (float *)malloc(N * sizeof(float));
y = (float *)malloc(N * sizeof(float));
cudaMalloc(&d_x, N * sizeof(float));
cudaMalloc(&d_y, N * sizeof(float));
for (i = 0; i < N; i++)
{
x[i] = 1.0f;
y[i] = 2.0f;
}
cudaMemcpy(d_x, x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, N * sizeof(float), cudaMemcpyHostToDevice);
for (j = 0; j < num_iterations; j++){
saxpy<<<(N + DIM) / DIM, DIM>>>(N, 2.0f, d_x, d_y);
cudaDeviceSynchronize();
}
cudaMemcpy(y, d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_x);
cudaFree(d_y);
free(x);
free(y);
double cputime = ((double)(clock() - start) / CLOCKS_PER_SEC);
printf("cpu time is %f\n", cputime);
return 0;
}
If I do that, then the execution time becomes 1.36 seconds. Doing sth similar to the PyCUDA code I got about 19s of execution time.

Calculating indices for nested loops in CUDA

I'm trying to learn CUDA and I'm a bit confused about calculating thread indices. Let's say I have this loop I'm trying to parallelize:
...
for(int x = 0; x < DIM_x; x++){
for(int y = 0; y < DIM_y; y++){
for(int dx = 0; dx < psize; dx++){
array[y*DIM_x + x + dx] += 1;
}
}
}
In PyCUDA, I set:
block = (8, 8, 8)
grid = (96, 96, 16)
Most of the examples I've seen for parallelizing loops calculate thread indices like this:
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int dx = blockIdx.z * blockDim.z + threadIdx.z;
if (x >= DIM_x || y >= DIM_y || dx >= psize)
return;
atomicAdd(&array[y*DIM_x + x + dx], 1)
DIM_x = 580, DIM_y = 550, psize = 50
However, if I print x, I see that multiple threads with the same thread Id are created, and the final result is wrong.
Instead, if I use this (3D grid of 3D blocks):
int blockId = blockIdx.x + blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
int x = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x) + threadIdx.x;
It fixes the multiple same thread Ids problem for x, but I'm not sure how I'd parallelize y and dx.
If anyone could help me understand where I'm going wrong, and show me the right way to parallelize the loops, I'd really appreciate it.

However, if I print x, I see that multiple threads with the same
thread Id are created, and the final result is wrong.
It would be normal for you to see multiple threads with the same x thread ID in a multi-dimensional grid, as it would also be normal to observe many iterations of the loops in your host code with the same x value. If the result is wrong, it has nothing to do with any of the code you have shown, viz:
#include <vector>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <assert.h>
void host(int* array, int DIM_x, int DIM_y, int psize)
{
for(int x = 0; x < DIM_x; x++){
for(int y = 0; y < DIM_y; y++){
for(int dx = 0; dx < psize; dx++){
array[y*DIM_x + x + dx] += 1;
}
}
}
}
__global__
void kernel(int* array, int DIM_x, int DIM_y, int psize)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int dx = blockIdx.z * blockDim.z + threadIdx.z;
if (x >= DIM_x || y >= DIM_y || dx >= psize)
return;
atomicAdd(&array[y*DIM_x + x + dx], 1);
}
int main()
{
dim3 block(8, 8, 8);
dim3 grid(96, 96, 16);
int DIM_x = 580, DIM_y = 550, psize = 50;
std::vector<int> array_h(DIM_x * DIM_y * psize, 0);
std::vector<int> array_hd(DIM_x * DIM_y * psize, 0);
thrust::device_vector<int> array_d(DIM_x * DIM_y * psize, 0);
kernel<<<grid, block>>>(thrust::raw_pointer_cast(array_d.data()), DIM_x, DIM_y, psize);
host(&array_h[0], DIM_x, DIM_y, psize);
thrust::copy(array_d.begin(), array_d.end(), array_hd.begin());
cudaDeviceSynchronize();
for(int i=0; i<DIM_x * DIM_y * psize; i++) {
assert( array_h[i] == array_hd[i] );
}
return 0;
}
which when compiled and run
$ nvcc -arch=sm_52 -std=c++11 -o looploop loop_the_loop.cu
$ cuda-memcheck ./looploop
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
emits no errors and passes the check of all elements against the host code in your question.
If you are getting incorrect results, it is likely that you have a problem with initialization of the device memory before running the kernel. Otherwise I fail to see how incorrect results could be emitted by the code you have shown.
In general, performing a large number of atomic memory transactions, as your code does, is not the optimal way to perform computation on the GPU. Using non-atomic transactions would probably need to rely on other a priori information about the structure of the problem (such as a graph decomposition or a precise description of the write patterns of the problem).

In a 3D grid with 3D blocks, the thread ID is:
unsigned long blockId = blockIdx.x
+ blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
unsigned long threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x)
+ threadIdx.x;
Not the x you computed. The x is only the x index of that 3D matrix.
There is a nice cheatsheet in this blog

Dot product in Cuda by example does not work for me

I'm starting to read "Cuda By Example" Book and I've been a problem with the dot example using "shared memory". I copy-paste the example from the book and I set: N = x * 1024; threadsPerBlock = 32; blocksPerGrid = 8. Where I test the "x" values with 2, 3, 4, 5.
If I set x = 3, the result is bad, but when I used x = 2,4,5 all is ok. I don't understand where is the problem. The code is:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define imin(a, b) (a<b?a:b)
#define sum_squares(x) (x*(x+1)*(2*x+1)/6)
const int x = 3;
const int N = 3 * 1024;
const int threadsPerBlock = 32;
const int blocksPerGrid = 8;
__global__ void dot(float *a, float *b, float *c)
{
__shared__ float cache[threadsPerBlock];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int cacheIndex = threadIdx.x;
float temp = 0;
while (tid < N)
{
temp += a[tid] * b[tid];
tid += blockDim.x * gridDim.x;
}
cache[cacheIndex] = temp;
__syncthreads();
int i = blockDim.x / 2;
while (i != 0)
{
if (cacheIndex < i)
cache[cacheIndex] += cache[cacheIndex + i];
__syncthreads();
i /= 2;
}
if (cacheIndex == 0)
c[blockIdx.x] = cache[0];
}
int main()
{
float *a, *b, *partial_c, result;
float *d_a, *d_b, *d_partial_c;
a = (float *)malloc(N * sizeof(float));
b = (float *)malloc(N * sizeof(float));
partial_c = (float *)malloc(blocksPerGrid * sizeof(float));
cudaMalloc((void **)&d_a, N * sizeof(float));
cudaMalloc((void **)&d_b, N * sizeof(float));
cudaMalloc((void **)&d_partial_c, blocksPerGrid * sizeof(float));
for (int i = 0; i < N; i++)
{
a[i] = i;
b[i] = 2 * i;
}
cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, N * sizeof(float), cudaMemcpyHostToDevice);
dot << <blocksPerGrid, threadsPerBlock >> >(d_a, d_b, d_partial_c);
cudaMemcpy(partial_c, d_partial_c, blocksPerGrid * sizeof(float), cudaMemcpyDeviceToHost);
result = 0;
for (int i = 0; i < blocksPerGrid; i++)
result += partial_c[i];
if (2 * sum_squares((float)(N - 1)) == result)
printf(":)\n");
else
printf(":(\n");
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_partial_c);
free(a);
free(b);
free(partial_c);
getchar();
return 0;
}

Because float does not have enough precision, which is ~7 decimal digits only. But for x=3; your expected result is
19317916672
containing 11 digits.
for x=4,5, the results are bad on my machine too.

Matrix-vector multiplication in CUDA: benchmarking & performance

I'm updating my question with some new benchmarking results (I also reformulated the question to be more specific and I updated the code)...
I implemented a kernel for matrix-vector multiplication in CUDA C following the CUDA C Programming Guide using shared memory. Let me first present some benchmarking results which I did on a Jetson TK1 (GPU: Tegra K1, compute capability 3.2) and a comparison with cuBLAS:
Here I guess cuBLAS does some magic since it seems that its execution is not affected by the number of columns of A, which, in turn, implies that there is some sort of parallelisation along the columns of A.
Now, here is the source code of my kernel and a host function to call it (file: mv.cuh):
#include <cuda_runtime.h>
#define BLOCK_SIZE 16
/* Set to __restric__ */
#define RESTRICT
/**
* Performs matrix-vector multiplication on the device.
*
* #param dA Address of matrix `A` on the device
* #param dx Address of vector `x` on the device
* #param dev_ptr_y Address of result y = A*x
* #param nRows Number of rows of `A`
* #param nx Size of `x` (number of columns of `A`)
*
* #tparam T Data type
*
*/
template<typename T>
__global__ void matvec_kernel(
const T * RESTRICT dA,
const T * RESTRICT dx,
T * RESTRICT dy,
const unsigned int nRows,
const unsigned int nx);
/**
* Host-side wrapper for #matvec_kernel.
*
* #param dA Address of matrix `A` on the device
* #param dx Address of vector `x` on the device
* #param dev_ptr_y Address of result y = A*x
* #param nRows Number of rows of `A`
* #param nx Size of `x` (number of columns of `A`)
* #param elapsed_time Time for the kernel to complete the execution in `ms`.
* If NULL is passed to this argument, the elapsed time
* will not be computed.
*
* #tparam T Data type for `A` and `x`
*/
template<typename T>
__host__ void matvec(
const T * RESTRICT dA,
const T * RESTRICT dx,
T * RESTRICT dy,
const unsigned int nRows,
const unsigned int nx);
/* -------------------------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------------------------- */
template<typename T>
__global__ void matvec_kernel(const T * RESTRICT dA, const T * RESTRICT dx,
T * RESTRICT dy,
const unsigned int nRows, const unsigned int nx)
{
unsigned int bid = blockIdx.x;
unsigned int row = threadIdx.x;
const unsigned int block_size = blockDim.x;
const unsigned int num_hor_blocks = ((nx + block_size - 1)/ block_size);
unsigned int n_star;
unsigned int idx_x;
unsigned int idx_Asub;
unsigned int idx_y;
const T * Asub;
const T * xsub;
/* Only `x` is copied to shared memory */
__shared__ T x_shared[BLOCK_SIZE];
idx_y = bid * block_size;
T * y_sub = dy + idx_y;
T y_val = 0.0;
#pragma unroll
for (unsigned int m = 0; m < num_hor_blocks; ++m)
{
idx_Asub = block_size * (bid + m * nRows);
idx_x = m * block_size;
Asub = dA + idx_Asub;
xsub = dx + idx_x;
if (idx_x + row < nx) {
x_shared[row] = xsub[row];
}
__syncthreads();
/* If the tiling is exact */
if ( (nRows % block_size == 0 && nx % block_size == 0 ) ||
(m != block_size - 1 || bid != gridDim.x - 1)) {
y_val += Asub[row] * x_shared[0];
y_val += Asub[row + nRows] * x_shared[1];
y_val += Asub[row + 2 * nRows] * x_shared[2];
y_val += Asub[row + 3 * nRows] * x_shared[3];
y_val += Asub[row + 4 * nRows] * x_shared[4];
y_val += Asub[row + 5 * nRows] * x_shared[5];
y_val += Asub[row + 6 * nRows] * x_shared[6];
y_val += Asub[row + 7 * nRows] * x_shared[7];
y_val += Asub[row + 8 * nRows] * x_shared[8];
y_val += Asub[row + 9 * nRows] * x_shared[9];
y_val += Asub[row + 10 * nRows] * x_shared[10];
y_val += Asub[row + 11 * nRows] * x_shared[11];
y_val += Asub[row + 12 * nRows] * x_shared[12];
y_val += Asub[row + 13 * nRows] * x_shared[13];
y_val += Asub[row + 14 * nRows] * x_shared[14];
y_val += Asub[row + 15 * nRows] * x_shared[15];
} else {
n_star = min(BLOCK_SIZE, nx - idx_x);
#pragma unroll
for (unsigned int e = 0; e < n_star; ++e) {
y_val += Asub[row + e * nRows] * x_shared[e];
}
}
__syncthreads();
}
if (row + idx_y < nRows)
y_sub[row] = y_val;
}
template<typename T>
__host__ void matvec(
const T * RESTRICT dA,
const T * RESTRICT dx,
T * RESTRICT dy,
const unsigned int nRows,
const unsigned int nx)
{
dim3 dim_grid( (nRows + BLOCK_SIZE -1)/ BLOCK_SIZE );
dim3 dim_block(BLOCK_SIZE);
matvec_kernel<T> <<<dim_grid, dim_block>>>(dA, dx, dy, nRows, nx);
}
I'm using this to time my execution (file: cuda_timer.cuh):
#include <cuda_runtime.h>
#include "error_handles.cuh"
static cudaEvent_t start;
static cudaEvent_t stop;
static short timer_running = 0;
static short tic_called = 0;
/**
* Sets up the timer.
*
* Must be called before any invocation to
* tic() or toc(), preferrably at the beginning of your
* application.
*/
void start_tictoc();
/**
* Starts the timer.
*
* Use `toc()` to get the elapsed time; `tic()` must
* be called before a `toc()`.
*/
void tic();
/**
* Returns the elapsed time between its invocation
* and a previous invocation of `toc()`. Returns `-1`
* and prints a warning message if `toc()` was not
* previously called. Returns `-2` and prints and error
* message if `start_tictoc()` has not been called.
*
* #return Elapsed time between `tic()` and `toc()` in milliseconds
* with a resolution of `0.5` microseconds.
*/
float toc();
/**
* This function should be called when the
* time will not be being used any more. It destroys
* the events used to time CUDA kernels. If the timer
* is not running, this function does nothing and
* prints a warning message.
*/
void stop_tictoc();
void start_tictoc() {
_CUDA(cudaEventCreate(&start));
_CUDA(cudaEventCreate(&stop));
timer_running = 1;
}
void tic() {
if (timer_running) {
_CUDA(cudaEventRecord(start, 0));
tic_called = 1;
} else {
printf("WARNING: tic() called without a timer running!\n");
}
}
float toc() {
float elapsed_time;
if (tic_called == 0) {
printf("WARNING: toc() called without a previous tic()!\n");
return -1;
}
if (timer_running == 1) {
// _CUDA(cudaDeviceSynchronize()); // Removed! (See discussion below)
_CUDA(cudaEventRecord(stop, 0));
_CUDA(cudaEventSynchronize(stop));
_CUDA(cudaEventElapsedTime(&elapsed_time, start, stop));
tic_called = 0;
return elapsed_time;
} else {
printf("WARNING: toc() called without a timer running!\n");
return -2;
}
}
void stop_tictoc()
{
if (timer_running == 1){
_CUDA(cudaEventDestroy(start));
_CUDA(cudaEventDestroy(stop));
timer_running = 0;
} else{
printf("WARNING: stop_tictoc() called without a timer running!\n");
}
}
and my main file (main.cu) is the following:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <assert.h>
#include "cublas_v2.h"
#include <math.h>
#include <curand.h>
#include <stdbool.h>
#include "mv.cuh"
#include "cuda_timer.cuh"
#include "error_handles.cuh"
typedef float real_t;
#define _CUDA(x) do { if((x)!=cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(EXIT_FAILURE);}} while(0)
#define _CUBLAS(x) do { if((x) != CUBLAS_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(EXIT_FAILURE);}} while(0)
#define _CURAND(x) do { if((x) != CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(EXIT_FAILURE);}} while(0)
#define TEST_COLUMNS 1
#define TEST_ROWS 0
/**
* If `TEST_WRT_` is set to `TEST_COLUMNS`, then a benchmark
* will be performed with respect to columns (with a fixed
* number of rows). If it is set to `TEST_ROWS`, then a benchmark will
* run with respect to rows (fixed number of columns).
*/
#define TEST_WRT_ TEST_ROWS
#define CONSTANT_COLS 300
#define CONSTANT_ROWS 256
/**
* In order to estimate the execution time, every
* kernel is run `RUNS` times and the average is taken.
*/
#define RUNS 50
void compare_results(real_t *dev_y_cublas, real_t * dev_y,unsigned int nrows)
{
real_t * hst_y_cublas;
real_t * hst_y;
const size_t s = nrows * sizeof(real_t);
hst_y_cublas = (real_t*) malloc(s);
hst_y = (real_t*) malloc(s);
_CUDA(cudaMemcpy(hst_y, dev_y, s, cudaMemcpyDeviceToHost));
_CUDA(cudaMemcpy(hst_y_cublas, dev_y_cublas, s, cudaMemcpyDeviceToHost));
for (unsigned int i = 0; i < nrows; ++i) {
if (fabsf(hst_y_cublas[i] - hst_y[i]) > 0.001) {
printf("ERROR ------ %f\n", fabsf(hst_y_cublas[i] - hst_y[i]));
exit(EXIT_FAILURE);
}
}
if (hst_y_cublas) free(hst_y_cublas);
if (hst_y) free(hst_y);
}
void do_benchmark() {
curandGenerator_t gen;
real_t *dev_rand_data = NULL; // Random data will be allocated here!
real_t *dev_y = NULL;
real_t *dev_y_cublas = NULL;
real_t t;
real_t t_cublas;
const size_t n_rows_max = 1500;
const size_t n_cols_max = 300;
const size_t ntot = n_cols_max * (1 + n_rows_max);
const size_t size_tot = sizeof(real_t) * ntot;
float alpha = 1.0, beta = 0.0; // beta was initially set to 1.0 by mistake
cublasHandle_t handle;
_CUBLAS(cublasCreate(&handle));
start_tictoc();
_CUDA(cudaMalloc((void** )&dev_rand_data, size_tot));
_CUDA(cudaMalloc((void** )&dev_y, n_rows_max * sizeof(real_t)));
_CUDA(cudaMalloc((void** )&dev_y_cublas, n_rows_max * sizeof(real_t)));
_CURAND(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
_CURAND(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
tic();
_CURAND(curandGenerateUniform(gen, dev_rand_data, ntot));
t = toc();
printf("RNG in %f ms\n", t);
_CURAND(curandDestroyGenerator(gen));
size_t ncols = CONSTANT_COLS;
size_t nrows = CONSTANT_ROWS;
size_t runs = RUNS;
cudaMemset(dev_y_cublas, 0, n_rows_max * sizeof(real_t));
matvec<real_t>(dev_rand_data + ncols, dev_rand_data, dev_y, nrows, ncols);
_CUBLAS(cublasSgemv(handle, CUBLAS_OP_N, nrows, ncols, &alpha, dev_rand_data + ncols,
nrows, dev_rand_data, 1, &beta, dev_y_cublas, 1));
/* Compare results */
compare_results(dev_y_cublas,dev_y, nrows);
FILE * pFile;
char filename[50];
#if (TEST_WRT_ == TEST_COLUMNS)
sprintf(filename, "times_rows%lu_cols.txt", nrows);
#else
sprintf(filename, "times_cols%lu_rows.txt", ncols);
#endif
printf("Logging to : '%s'\n", filename);
pFile = fopen(filename, "w");
if (pFile == NULL) {
perror("Error opening file.");
exit(79);
}
#if (TEST_WRT_ == TEST_COLUMNS)
fprintf(pFile, "0, %lu, 0, 0\n", nrows);
for (ncols = 32; ncols < n_cols_max; ncols += 32) {
#else
fprintf(pFile, "1, %lu, 0, 0\n", ncols);
for (nrows = 32; nrows < n_rows_max; nrows += 32) {
#endif
tic();
for (short i = 0; i < runs; i++) {
matvec<real_t>(dev_rand_data + ncols, dev_rand_data, dev_y, nrows,
ncols);
}
t = toc() / runs;
tic();
for (short i = 0; i < runs; i++) {
_CUBLAS(cublasSgemv(handle, CUBLAS_OP_N, nrows, ncols, &alpha, dev_rand_data + ncols,
nrows, dev_rand_data, 1, &beta, dev_y_cublas, 1));
}
t_cublas = toc() / runs;
#if (TEST_WRT_ == TEST_COLUMNS)
fprintf(pFile, "%lu, %f, %f\n", ncols, t, t_cublas);
#else
fprintf(pFile, "%lu, %f, %f\n", nrows, t, t_cublas);
#endif
}
_CUBLAS(cublasDestroy(handle));
fclose(pFile);
if (dev_rand_data != NULL)
_CUDA(cudaFree(dev_rand_data));
stop_tictoc();
}
int main(void)
{
do_benchmark();
return EXIT_SUCCESS;
}
Finally, this is a MATLAB script I'm using to plot the execution times:
fetch_this = 'times_cols512_rows.txt';
username = 'ubuntu';
target_hostname = 'jetson';
% Do not modify below this line
eval_this=['! scp ' username '#' target_hostname ':~/mv/Debug/' fetch_this ' .'];
eval(eval_this)
set(0, 'DefaultAxesFontSize', 14);
r = csvread(fetch_this);
r_header = r(1,:);
plot(r(2:end,1), r(2:end,2)*1000, '-');
hold on
plot(r(2:end,1), r(2:end,3)*1000, '-r');
grid on;
fig_title = 'Matvec on Tegra K1 - %d %s';
if (r_header(1)==1),
xlabel('Number of rows');
title(sprintf(fig_title, r_header(2),'columns'));
else
xlabel('Number of columns');
title(sprintf(fig_title, r_header(2),'rows'));
end
ylabel('Computation time [us]');
legend('Kernel', 'cuBLAS');
axis tight
I am concerned about the performance and the scalability of my kernel, so first I would like to know how to improve the scalability with respect to the number of rows of matrix A. Second, I know that it is not very good practice to have branch divergence (and my code has), but I'm feeling I want some hints to improve it.
UPDATE :
Thanks to all your comments and suggestions, I reached the conclusion that cudaDeviceSynchronized() caused, in the first place, some peculiarities with my timing so my initial measurements were inaccurate. Row-major ordering leads to worse results. The size of the blocks is an important tuning parameter and changing from 16 to 32 or 64 improves the execution time. Further benchmarking is necessary to choose the block size. To this end, one may use the following API for the kernel:
template<typename T, const uint_t blk>
__global__ void matvec_kernel(const T * RESTRICT dA, const T * RESTRICT dx,
T * RESTRICT dy, const uint_t nRows, const uint_t nx);
and call it like this from the host:
template<typename T>
__host__ void matvec(const T * RESTRICT dA, const T * RESTRICT dx,
T * RESTRICT dy, const uint_t nRows, const uint_t nx) {
uint_t blk_size_opt = 64;
/* Add code to decide the value of `blk_size_opt` */
if (blk_size_opt == 32) {
matvec_engine<T, 32>(dA, dx, dy, nRows, nx);
} else if (blk_size_opt == 64) {
matvec_engine<T, 64>(dA, dx, dy, nRows, nx);
} else if (blk_size_opt == 128) {
matvec_engine<T, 128>(dA, dx, dy, nRows, nx);
} else if (blk_size_opt == 256) {
matvec_engine<T, 256>(dA, dx, dy, nRows, nx);
}
}
Let me provide some benchmarking results. First a comparison with cublasSgemv:
and the effect of block size on the execution time:

First, let me write down the full working Matrix-Vector multiplication kernel employing shared memory:
template<typename T>
__global__ void matvec_kernel(const T * __restrict__ dA, const T * __restrict__ dx, T * __restrict__ dy, const unsigned int nRows, const unsigned int nCols)
{
const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
__shared__ T x_shared[BLOCK_SIZE];
T y_val = 0.0;
#pragma unroll
for (unsigned int m = 0; m < ((nCols + BLOCK_SIZE - 1)/ BLOCK_SIZE); ++m)
{
if ((m * BLOCK_SIZE + threadIdx.x) < nCols) x_shared[threadIdx.x] = dx[threadIdx.x + m * BLOCK_SIZE];
else x_shared[threadIdx.x] = 0.f;
__syncthreads();
#pragma unroll
for (unsigned int e = 0; e < BLOCK_SIZE; ++e) {
// --- Column-major ordering - faster
y_val += dA[tid + (e + BLOCK_SIZE * m) * nRows] * x_shared[e];
// --- Row-major ordering - slower
//y_val += dA[tid * nCols + (e + BLOCK_SIZE * m)] * x_shared[e];
}
__syncthreads();
}
if (tid < nRows) dy[tid] = y_val;
}
Unless differently specified, all the tests will be done on a GT540M card.
A first parameter to be optimized is the BLOCK_SIZE. Changing the BLOCK_SIZE changes the algorithm performance, as witnessed by the following graph:
The following graphs compares row-major ordering vs. column-major ordering. The latter is faster:
Another optimization you may wish to try is using more Instruction Level Parallelism (ILP) by this modified kernel employing ILP = 2
template<typename T>
__global__ void matvec_kernel_ILP2(const T * __restrict__ dA, const T * __restrict__ dx, T * __restrict__ dy, const unsigned int nRows, const unsigned int nCols)
{
const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
__shared__ T x_shared[BLOCK_SIZE];
T y_val1 = 0.0;
T y_val2 = 0.0;
#pragma unroll
for (unsigned int m = 0; m < ((nCols + BLOCK_SIZE - 1)/ BLOCK_SIZE); ++m)
{
if ((m * BLOCK_SIZE + threadIdx.x) < nCols) x_shared[threadIdx.x] = dx[threadIdx.x + m * BLOCK_SIZE];
else x_shared[threadIdx.x] = 0.f;
__syncthreads();
#pragma unroll
for (unsigned int e = 0; e < BLOCK_SIZE; ++e) {
y_val1 += dA[tid + (e + BLOCK_SIZE * m) * nRows] * x_shared[e];
y_val2 += dA[tid + gridDim.x * BLOCK_SIZE + (e + BLOCK_SIZE * m) * nRows] * x_shared[e];
}
__syncthreads();
}
if (tid < nRows) dy[tid] = y_val1;
if ((tid + gridDim.x * BLOCK_SIZE) < nRows) dy[tid + gridDim.x * BLOCK_SIZE] = y_val2;
}
This kernel should be called with half of the threads, as
dim3 dim_grid((nRows/2 + BLOCK_SIZE -1)/ BLOCK_SIZE);
dim3 dim_block(BLOCK_SIZE);
matvec_kernel_ILP2<T> <<<dim_grid, dim_block>>>(dA, dx, dy, nRows, nx);
Finally, since you are using a device with compute capability 3.2, you can try using shuffle operations. I'm providing here the kernel using shuffle operations instead of shared memory. In this case, you should set BLOCK_SIZE = 32:
template<typename T>
__global__ void matvec_kernel_shfl(const T * __restrict__ dA, const T * __restrict__ dx, T * __restrict__ dy, const unsigned int nRows, const unsigned int nCols)
{
const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
T x_shfl_src, x_shfl_dest;
T y_val = 0.0;
#pragma unroll
for (unsigned int m = 0; m < ((nCols + BLOCK_SIZE - 1)/ BLOCK_SIZE); ++m)
{
if ((m * BLOCK_SIZE + threadIdx.x) < nCols) x_shfl_src = dx[threadIdx.x + m * BLOCK_SIZE];
else x_shfl_src = 0.f;
__syncthreads();
// #pragma unroll
for (int e = 0; e < 32; ++e) {
// --- Column-major ordering - faster
x_shfl_dest = __shfl(x_shfl_src, e);
y_val += dA[tid + (e + BLOCK_SIZE * m) * nRows] * x_shfl_dest;
// --- Row-major ordering - slower
//y_val += dA[tid * nCols + (e + BLOCK_SIZE * m)] * x_shared[e];
}
__syncthreads();
}
if (tid < nRows) dy[tid] = y_val;
}
Shuffle operations improve the performance over shared memory for BLOCK_SIZE = 32 on a Kepler K20c as shown by the graph below:

Looking at your code I think that the way you traverse the elements of A may be the problem:
for (unsigned int e = 0; e < n_star; ++e) {
y_val += Asub[row + e * nRows] * x_shared[e];
}
So, when nRows becomes large, you actually read from the global memory (that is where A is stored) with a large stride. In particular this happens in every block: threads inside the same block will read from the global memory in a non-consecutive fashion. This can be improved if you consider storing from the beginning the values of A row-by-row (i.e., using row-major order). This is just a guess and I would have written a comment, but it requires a higher score on Stackoverflow...

Optimizing the solution of the 2D diffusion (heat) equation in CUDA

I have already checked earlier questions on SO about this issue but not able to see how it relates here.
I am solving 2d diffusion equation with CUDA and it turns out that my GPU code is slower than its CPU counterpart.
Here is my code:
//kernel definition
__global__ void diffusionSolver(double* A, int n_x,int n_y)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if(i<n_x && j <n_y && i*(n_x-i-1)*j*(n_y-j-1)!=0)
A[i+n_y*j] = A[i+n_y*j] + (A[i-1+n_y*j]+A[i+1+n_y*j]+A[i+(j-1)*n_y]+A[i+(j+1)*n_y] -4.0*A[i+n_y*j])/40.0;
}
int main function
int main()
{
int n_x = 200 ;
int n_y = 200 ;
double *phi;
double *dummy;
double *phi_old;
int i,j ;
phi = (double *) malloc( n_x*n_y* sizeof(double));
phi_old = (double *) malloc( n_x*n_y* sizeof(double));
dummy = (double *) malloc( n_x*n_y* sizeof(double));
int iterationMax =200;
for(j=0;j<n_y ;j++)
{
for(i=0;i<n_x;i++)
{
if((.4*n_x-i)*(.6*n_x-i)<0)
phi[i+n_y*j] = -1;
else
phi[i+n_y*j] = 1;
}
}
double *dev_phi;
cudaMalloc((void **) &dev_phi, n_x*n_y*sizeof(double));
cudaMemcpy(dev_phi, phi, n_x*n_y*sizeof(double),
cudaMemcpyHostToDevice);
dim3 threadsPerBlock(10,100);
dim3 numBlocks(n_x*n_y / threadsPerBlock.x, n_x*n_y / threadsPerBlock.y);
for(int z=0; z<iterationMax; z++)
{
if(z%100==0)
cout <<z/100 <<"\n";;
diffusionSolver<<<numBlocks, threadsPerBlock>>>(dev_phi, n_x,n_y);
}
cudaMemcpy(phi, dev_phi,n_x*n_y*sizeof(double), cudaMemcpyDeviceToHost);
cudaFree(dev_phi);
return 0;
}
The problem with this code is that it runs slower than a simple CPU-only iterative method. I don't know much about profiler and until now I tried with cuda-memcheck which gives 0 errors.
How can I know which portion of the code is performing slowly and speed that up? I am working on a Linux environment. Thanks in advance for any help.

The worst problem I see is that you are launching far too many blocks for the size of the input array. At the moment you are computing the grid size as:
dim3 numBlocks(n_x*n_y / threadsPerBlock.x, n_x*n_y / threadsPerBlock.y);
which should yield a grid size of (400,4000) blocks for an input array of only 200x200. That is clearly incorrect. The calculation should be something like:
int nbx = (n_x / threadsPerBlock.x) + (((n_x % threadsPerBlock.x) == 0) ? 0 : 1);
int nby = (n_y / threadsPerBlock.y) + (((n_y % threadsPerBlock.y) == 0) ? 0 : 1);
dim3 numBlocks(nbx,nby);
which would yield a grid size of (2,20) blocks, or 40000 times fewer than you are currently launching.
There are other optimisations which you could consider making to the kernel, but those pale into insignificance compared with mistakes of this magnitude.

You are doing a lot of integer multiplication and have a lot of global memory reads, both of which are slow in CUDA. I also imagine that there are not a lot of coalesced global memory reads.
The only way to speed up your kernel is to stage coalesced memory reads through shared memory and/or re-arrange your data so that you can index it without using lots of integer multiplication.
I don't have a great grasp of diffusion equations, but I don't think there is a lot of naive parallelism to be exploited. Take a look at the CUDA Programming Guide and the Best Practices Guide and maybe you'll get some ideas about how to improve your algorithm.

In case anybody is interested, I'm posting below a fully worked code concerning the optimization of the solution approach for the 2D heat equation.
Five approaches are considered, using:
Global memory, essentially the OP's approach;
Shared memory of size BLOCK_SIZE_X x BLOCK_SIZE_Y not loading the halo regions;
Shared memory of size BLOCK_SIZE_X x BLOCK_SIZE_Y loading the halo regions;
Shared memory of size (BLOCK_SIZE_X + 2) x (BLOCK_SIZE_Y + 2) loading the halo regions;
Texture memory.
Everybody can run the code and check out which approach is faster for his own GPU architecture.
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "Utilities.cuh"
#include "InputOutput.cuh"
#include "TimingGPU.cuh"
#define BLOCK_SIZE_X 16
#define BLOCK_SIZE_Y 16
#define DEBUG
texture<float, 2, cudaReadModeElementType> tex_T;
texture<float, 2, cudaReadModeElementType> tex_T_old;
/***********************************/
/* JACOBI ITERATION FUNCTION - GPU */
/***********************************/
__global__ void Jacobi_Iterator_GPU(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY)
{
const int i = blockIdx.x * blockDim.x + threadIdx.x ;
const int j = blockIdx.y * blockDim.y + threadIdx.y ;
// N
int P = i + j*NX; // node (i,j) |
int N = i + (j+1)*NX; // node (i,j+1) |
int S = i + (j-1)*NX; // node (i,j-1) W ---- P ---- E
int E = (i+1) + j*NX; // node (i+1,j) |
int W = (i-1) + j*NX; // node (i-1,j) |
// S
// --- Only update "interior" (not boundary) node points
if (i>0 && i<NX-1 && j>0 && j<NY-1) T_new[P] = 0.25 * (T_old[E] + T_old[W] + T_old[N] + T_old[S]);
}
/******************************************************/
/* JACOBI ITERATION FUNCTION - GPU - SHARED MEMORY V1 */
/******************************************************/
__global__ void Jacobi_Iterator_GPU_shared_v1(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY)
{
const int i = blockIdx.x * blockDim.x + threadIdx.x ;
const int j = blockIdx.y * blockDim.y + threadIdx.y ;
// N
int P = i + j*NX; // node (i,j) |
int N = i + (j+1)*NX; // node (i,j+1) |
int S = i + (j-1)*NX; // node (i,j-1) W ---- P ---- E
int E = (i+1) + j*NX; // node (i+1,j) |
int W = (i-1) + j*NX; // node (i-1,j) |
// S
__shared__ float T_sh[BLOCK_SIZE_X][BLOCK_SIZE_Y];
// --- Load data to shared memory. Halo regions are NOT loaded.
T_sh[threadIdx.x][threadIdx.y] = T_old[P];
__syncthreads();
if ((threadIdx.x > 0) && (threadIdx.x < (BLOCK_SIZE_X - 1)) && (threadIdx.y > 0) && (threadIdx.y < (BLOCK_SIZE_Y ‐ 1)))
// --- If we do not need halo region elements, then use shared memory.
T_new[P] = 0.25 * (T_sh[threadIdx.x][threadIdx.y - 1] + T_sh[threadIdx.x][threadIdx.y + 1] + T_sh[threadIdx.x - 1][threadIdx.y] + T_sh[threadIdx.x + 1][threadIdx.y]);
else if (i>0 && i<NX-1 && j>0 && j<NY-1) // --- Only update "interior" (not boundary) node points
// --- If we need halo region elements, then use global memory.
T_new[P] = 0.25 * (T_old[E] + T_old[W] + T_old[N] + T_old[S]);
}
/******************************************************/
/* JACOBI ITERATION FUNCTION - GPU - SHARED MEMORY V2 */
/******************************************************/
__global__ void Jacobi_Iterator_GPU_shared_v2(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY)
{
const int i = blockIdx.x * (BLOCK_SIZE_X - 2) + threadIdx.x ;
const int j = blockIdx.y * (BLOCK_SIZE_Y - 2) + threadIdx.y ;
int P = i + j*NX;
if ((i >= NX) || (j >= NY)) return;
__shared__ float T_sh[BLOCK_SIZE_X][BLOCK_SIZE_Y];
// --- Load data to shared memory. Halo regions ARE loaded.
T_sh[threadIdx.x][threadIdx.y] = T_old[P];
__syncthreads();
if (((threadIdx.x > 0) && (threadIdx.x < (BLOCK_SIZE_X - 1)) && (threadIdx.y > 0) && (threadIdx.y < (BLOCK_SIZE_Y ‐ 1))) &&
(i>0 && i<NX-1 && j>0 && j<NY-1))
T_new[P] = 0.25 * (T_sh[threadIdx.x][threadIdx.y - 1] + T_sh[threadIdx.x][threadIdx.y + 1] + T_sh[threadIdx.x - 1][threadIdx.y] + T_sh[threadIdx.x + 1][threadIdx.y]);
}
/******************************************************/
/* JACOBI ITERATION FUNCTION - GPU - SHARED MEMORY V2 */
/******************************************************/
__global__ void Jacobi_Iterator_GPU_shared_v3(const float * __restrict__ T_old, float * __restrict__ T_new, const int NX, const int NY)
{
const int i = blockIdx.x * blockDim.x + threadIdx.x ;
const int j = blockIdx.y * blockDim.y + threadIdx.y ;
const int tid_block = threadIdx.y * BLOCK_SIZE_X + threadIdx.x; // --- Flattened thread index within a block
const int i1 = tid_block % (BLOCK_SIZE_X + 2);
const int j1 = tid_block / (BLOCK_SIZE_Y + 2);
const int i2 = (BLOCK_SIZE_X * BLOCK_SIZE_Y + tid_block) % (BLOCK_SIZE_X + 2);
const int j2 = (BLOCK_SIZE_X * BLOCK_SIZE_Y + tid_block) / (BLOCK_SIZE_Y + 2);
int P = i + j * NX;
if ((i >= NX) || (j >= NY)) return;
__shared__ float T_sh[BLOCK_SIZE_X + 2][BLOCK_SIZE_Y + 2];
if (((blockIdx.x * BLOCK_SIZE_X - 1 + i1) < NX) && ((blockIdx.y * BLOCK_SIZE_Y - 1 + j1) < NY))
T_sh[i1][j1] = T_old[(blockIdx.x * BLOCK_SIZE_X - 1 + i1) + (blockIdx.y * BLOCK_SIZE_Y - 1 + j1) * NX];
if (((i2 < (BLOCK_SIZE_X + 2)) && (j2 < (BLOCK_SIZE_Y + 2))) && (((blockIdx.x * BLOCK_SIZE_X - 1 + i2) < NX) && ((blockIdx.y * BLOCK_SIZE_Y - 1 + j2) < NY)))
T_sh[i2][j2] = T_old[(blockIdx.x * BLOCK_SIZE_X - 1 + i2) + (blockIdx.y * BLOCK_SIZE_Y - 1 + j2) * NX];
__syncthreads();
if ((threadIdx.x <= (BLOCK_SIZE_X - 1) && (threadIdx.y <= (BLOCK_SIZE_Y ‐ 1))) && (i>0 && i<NX-1 && j>0 && j<NY-1))
T_new[P] = 0.25 * (T_sh[threadIdx.x + 1][threadIdx.y] + T_sh[threadIdx.x + 1][threadIdx.y + 2] + T_sh[threadIdx.x][threadIdx.y + 1] + T_sh[threadIdx.x + 2][threadIdx.y + 1]);
}
/*********************************************/
/* JACOBI ITERATION FUNCTION - GPU - TEXTURE */
/*********************************************/
__global__ void Jacobi_Iterator_GPU_texture(float * __restrict__ T_new, const bool flag, const int NX, const int NY) {
const int i = blockIdx.x * blockDim.x + threadIdx.x ;
const int j = blockIdx.y * blockDim.y + threadIdx.y ;
float P, N, S, E, W;
if (flag) {
// N
P = tex2D(tex_T_old, i, j); // node (i,j) |
N = tex2D(tex_T_old, i, j + 1); // node (i,j+1) |
S = tex2D(tex_T_old, i, j - 1); // node (i,j-1) W ---- P ---- E
E = tex2D(tex_T_old, i + 1, j); // node (i+1,j) |
W = tex2D(tex_T_old, i - 1, j); // node (i-1,j) |
// S
} else {
// N
P = tex2D(tex_T, i, j); // node (i,j) |
N = tex2D(tex_T, i, j + 1); // node (i,j+1) |
S = tex2D(tex_T, i, j - 1); // node (i,j-1) W ---- P ---- E
E = tex2D(tex_T, i + 1, j); // node (i+1,j) |
W = tex2D(tex_T, i - 1, j); // node (i-1,j) |
// S
}
// --- Only update "interior" (not boundary) node points
if (i>0 && i<NX-1 && j>0 && j<NY-1) T_new[i + j*NX] = 0.25 * (E + W + N + S);
}
/***********************************/
/* JACOBI ITERATION FUNCTION - CPU */
/***********************************/
void Jacobi_Iterator_CPU(float * __restrict T, float * __restrict T_new, const int NX, const int NY, const int MAX_ITER)
{
for(int iter=0; iter<MAX_ITER; iter=iter+2)
{
// --- Only update "interior" (not boundary) node points
for(int j=1; j<NY-1; j++)
for(int i=1; i<NX-1; i++) {
float T_E = T[(i+1) + NX*j];
float T_W = T[(i-1) + NX*j];
float T_N = T[i + NX*(j+1)];
float T_S = T[i + NX*(j-1)];
T_new[i+NX*j] = 0.25*(T_E + T_W + T_N + T_S);
}
for(int j=1; j<NY-1; j++)
for(int i=1; i<NX-1; i++) {
float T_E = T_new[(i+1) + NX*j];
float T_W = T_new[(i-1) + NX*j];
float T_N = T_new[i + NX*(j+1)];
float T_S = T_new[i + NX*(j-1)];
T[i+NX*j] = 0.25*(T_E + T_W + T_N + T_S);
}
}
}
/******************************/
/* TEMPERATURE INITIALIZATION */
/******************************/
void Initialize(float * __restrict h_T, const int NX, const int NY)
{
// --- Set left wall to 1
for(int j=0; j<NY; j++) h_T[j * NX] = 1.0;
}
/********/
/* MAIN */
/********/
int main()
{
const int NX = 256; // --- Number of discretization points along the x axis
const int NY = 256; // --- Number of discretization points along the y axis
const int MAX_ITER = 100; // --- Number of Jacobi iterations
// --- CPU temperature distributions
float *h_T = (float *)calloc(NX * NY, sizeof(float));
float *h_T_old = (float *)calloc(NX * NY, sizeof(float));
Initialize(h_T, NX, NY);
Initialize(h_T_old, NX, NY);
float *h_T_GPU_result = (float *)malloc(NX * NY * sizeof(float));
float *h_T_GPU_tex_result = (float *)malloc(NX * NY * sizeof(float));
float *h_T_GPU_sh1_result = (float *)malloc(NX * NY * sizeof(float));
float *h_T_GPU_sh2_result = (float *)malloc(NX * NY * sizeof(float));
float *h_T_GPU_sh3_result = (float *)malloc(NX * NY * sizeof(float));
// --- GPU temperature distribution
float *d_T; gpuErrchk(cudaMalloc((void**)&d_T, NX * NY * sizeof(float)));
float *d_T_old; gpuErrchk(cudaMalloc((void**)&d_T_old, NX * NY * sizeof(float)));
float *d_T_tex; gpuErrchk(cudaMalloc((void**)&d_T_tex, NX * NY * sizeof(float)));
float *d_T_old_tex; gpuErrchk(cudaMalloc((void**)&d_T_old_tex, NX * NY * sizeof(float)));
float *d_T_sh1; gpuErrchk(cudaMalloc((void**)&d_T_sh1, NX * NY * sizeof(float)));
float *d_T_old_sh1; gpuErrchk(cudaMalloc((void**)&d_T_old_sh1, NX * NY * sizeof(float)));
float *d_T_sh2; gpuErrchk(cudaMalloc((void**)&d_T_sh2, NX * NY * sizeof(float)));
float *d_T_old_sh2; gpuErrchk(cudaMalloc((void**)&d_T_old_sh2, NX * NY * sizeof(float)));
float *d_T_sh3; gpuErrchk(cudaMalloc((void**)&d_T_sh3, NX * NY * sizeof(float)));
float *d_T_old_sh3; gpuErrchk(cudaMalloc((void**)&d_T_old_sh3, NX * NY * sizeof(float)));
gpuErrchk(cudaMemcpy(d_T, h_T, NX * NY * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_T_tex, h_T, NX * NY * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_T_sh1, h_T, NX * NY * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_T_sh2, h_T, NX * NY * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_T_sh3, h_T, NX * NY * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_T_old, d_T, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice));
gpuErrchk(cudaMemcpy(d_T_old_tex, d_T_tex, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice));
gpuErrchk(cudaMemcpy(d_T_old_sh1, d_T_sh1, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice));
gpuErrchk(cudaMemcpy(d_T_old_sh2, d_T_sh2, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice));
gpuErrchk(cudaMemcpy(d_T_old_sh3, d_T_sh3, NX * NY * sizeof(float), cudaMemcpyDeviceToDevice));
//cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
cudaChannelFormatDesc desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
gpuErrchk(cudaBindTexture2D(NULL, &tex_T, d_T_tex, &desc, NX, NY, sizeof(float) * NX));
gpuErrchk(cudaBindTexture2D(NULL, &tex_T_old, d_T_old_tex, &desc, NX, NY, sizeof(float) * NX));
tex_T.addressMode[0] = cudaAddressModeWrap;
tex_T.addressMode[1] = cudaAddressModeWrap;
tex_T.filterMode = cudaFilterModePoint;
tex_T.normalized = false;
tex_T_old.addressMode[0] = cudaAddressModeWrap;
tex_T_old.addressMode[1] = cudaAddressModeWrap;
tex_T_old.filterMode = cudaFilterModePoint;
tex_T_old.normalized = false;
// --- Grid size
dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y);
dim3 dimGrid (iDivUp(NX, BLOCK_SIZE_X), iDivUp(NY, BLOCK_SIZE_Y));
// --- Jacobi iterations on the host
Jacobi_Iterator_CPU(h_T, h_T_old, NX, NY, MAX_ITER);
// --- Jacobi iterations on the device
TimingGPU timerGPU;
timerGPU.StartCounter();
for (int k=0; k<MAX_ITER; k=k+2) {
Jacobi_Iterator_GPU<<<dimGrid, dimBlock>>>(d_T, d_T_old, NX, NY); // --- Update d_T_old starting from data stored in d_T
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
Jacobi_Iterator_GPU<<<dimGrid, dimBlock>>>(d_T_old, d_T , NX, NY); // --- Update d_T starting from data stored in d_T_old
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
printf("Timing = %f ms\n", timerGPU.GetCounter());
// --- Jacobi iterations on the device - shared memory v1
timerGPU.StartCounter();
for (int k=0; k<MAX_ITER; k=k+2) {
Jacobi_Iterator_GPU_shared_v1<<<dimGrid, dimBlock>>>(d_T_sh1, d_T_old_sh1, NX, NY); // --- Update d_T_old starting from data stored in d_T
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
Jacobi_Iterator_GPU_shared_v1<<<dimGrid, dimBlock>>>(d_T_old_sh1, d_T_sh1 , NX, NY); // --- Update d_T starting from data stored in d_T_old
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
printf("Timing with shared memory v1 = %f ms\n", timerGPU.GetCounter());
// --- Jacobi iterations on the device - shared memory v2
dim3 dimBlock2(BLOCK_SIZE_X, BLOCK_SIZE_Y);
dim3 dimGrid2 (iDivUp(NX, BLOCK_SIZE_X - 2), iDivUp(NY, BLOCK_SIZE_Y - 2));
timerGPU.StartCounter();
for (int k=0; k<MAX_ITER; k=k+2) {
Jacobi_Iterator_GPU_shared_v2<<<dimGrid2, dimBlock>>>(d_T_sh2, d_T_old_sh2, NX, NY); // --- Update d_T_old starting from data stored in d_T
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
Jacobi_Iterator_GPU_shared_v2<<<dimGrid2, dimBlock>>>(d_T_old_sh2, d_T_sh2 , NX, NY); // --- Update d_T starting from data stored in d_T_old
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
printf("Timing with shared memory v2 = %f ms\n", timerGPU.GetCounter());
// --- Jacobi iterations on the device - shared memory v3
timerGPU.StartCounter();
for (int k=0; k<MAX_ITER; k=k+2) {
Jacobi_Iterator_GPU_shared_v3<<<dimGrid, dimBlock>>>(d_T_sh3, d_T_old_sh3, NX, NY); // --- Update d_T_old starting from data stored in d_T
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
Jacobi_Iterator_GPU_shared_v3<<<dimGrid, dimBlock>>>(d_T_old_sh3, d_T_sh3 , NX, NY); // --- Update d_T starting from data stored in d_T_old
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
printf("Timing with shared memory v3 = %f ms\n", timerGPU.GetCounter());
// --- Jacobi iterations on the device - texture case
timerGPU.StartCounter();
for (int k=0; k<MAX_ITER; k=k+2) {
Jacobi_Iterator_GPU_texture<<<dimGrid, dimBlock>>>(d_T_old_tex, 0, NX, NY); // --- Update d_T_tex starting from data stored in d_T_old_tex
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
Jacobi_Iterator_GPU_texture<<<dimGrid, dimBlock>>>(d_T_tex, 1, NX, NY); // --- Update d_T_old_tex starting from data stored in d_T_tex
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
}
printf("Timing with texture = %f ms\n", timerGPU.GetCounter());
saveCPUrealtxt(h_T, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\CPU_result.txt", NX * NY);
saveGPUrealtxt(d_T_tex, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result_tex.txt", NX * NY);
saveGPUrealtxt(d_T, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result.txt", NX * NY);
saveGPUrealtxt(d_T_sh1, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result_sh1.txt", NX * NY);
saveGPUrealtxt(d_T_sh2, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result_sh2.txt", NX * NY);
saveGPUrealtxt(d_T_sh3, "C:\\Users\\Documents\\Project\\Differential_Equations\\Heat_Equation\\2D\\DiffusionEquationJacobi\\DiffusionEquation\\GPU_result_sh3.txt", NX * NY);
// --- Copy results from device to host
gpuErrchk(cudaMemcpy(h_T_GPU_result, d_T, NX * NY * sizeof(float), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_T_GPU_tex_result, d_T_tex, NX * NY * sizeof(float), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_T_GPU_sh1_result, d_T_sh1, NX * NY * sizeof(float), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_T_GPU_sh2_result, d_T_sh2, NX * NY * sizeof(float), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_T_GPU_sh3_result, d_T_sh3, NX * NY * sizeof(float), cudaMemcpyDeviceToHost));
// --- Calculate percentage root mean square error between host and device results
float sum = 0.f, sum_tex = 0.f, sum_ref = 0.f, sum_sh1 = 0.f, sum_sh2 = 0.f, sum_sh3 = 0.f;
for (int j=0; j<NY; j++)
for (int i=0; i<NX; i++) {
sum = sum + (h_T_GPU_result [j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_result [j * NX + i] - h_T[j * NX + i]);
sum_tex = sum_tex + (h_T_GPU_tex_result[j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_tex_result[j * NX + i] - h_T[j * NX + i]);
sum_sh1 = sum_sh1 + (h_T_GPU_sh1_result[j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_sh1_result[j * NX + i] - h_T[j * NX + i]);
sum_sh2 = sum_sh2 + (h_T_GPU_sh2_result[j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_sh2_result[j * NX + i] - h_T[j * NX + i]);
sum_sh3 = sum_sh3 + (h_T_GPU_sh3_result[j * NX + i] - h_T[j * NX + i]) * (h_T_GPU_sh3_result[j * NX + i] - h_T[j * NX + i]);
sum_ref = sum_ref + h_T[j * NX + i] * h_T[j * NX + i];
}
printf("Percentage root mean square error = %f\n", 100.*sqrt(sum / sum_ref));
printf("Percentage root mean square error texture = %f\n", 100.*sqrt(sum_tex / sum_ref));
printf("Percentage root mean square error shared v1 = %f\n", 100.*sqrt(sum_sh1 / sum_ref));
printf("Percentage root mean square error shared v2 = %f\n", 100.*sqrt(sum_sh2 / sum_ref));
printf("Percentage root mean square error shared v3 = %f\n", 100.*sqrt(sum_sh3 / sum_ref));
return 0;
}

We Keep Coding

html mysql json google-apps-script actionscript-3 ms-access google-chrome google-maps reporting-services sql-server-2008

How to use register memory for each thread in CUDA? - cuda

Related

Why PyCUDA is faster than C CUDA in this example

Calculating indices for nested loops in CUDA

Dot product in Cuda by example does not work for me

Matrix-vector multiplication in CUDA: benchmarking & performance

Optimizing the solution of the 2D diffusion (heat) equation in CUDA

Categories

Resources