CUDA Jacobi Relaxation

I am in the process of mapping this sequential computation to a CUDA computation. The computation is a 2-dimensional Jacobi relaxation on an NxN grid, where N is not known in advance but is evenly divisible by 32.
void Jacobi(float **a, float **b, int N)
{
    for (int i = 1; i < N + 1; i++) {
        for (int j = 1; j < N + 1; j++) {
            a[i][j] = 0.8 * (b[i+1][j] + b[i+1][j] + b[i][j+1] + b[i][j+1]);
        }
    }
}
I'm parallelizing the outer two loops, and each thread should compute just one element. The goal is to use a cyclic distribution in the x and y dimensions. Can someone help me implement a Jacobi_GPU kernel with the appropriate indexing so that it produces the desired distribution under the following launch configuration?
dim3 dimGrid(N/32,N/32);
dim3 dimBlock(32,32);
Jacobi_GPU<<<dimGrid,dimBlock>>>(A,B,N);

This is the simple implementation. You could also apply a shared-memory optimization to this kernel:
__global__ void jacobi(float* a, const float* b, const int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)
    {
        // Same stencil as the sequential code, with the grid flattened
        // to a 1D array in row-major order.
        a[j * N + i] = 0.8f * (2 * b[(i + 1) + j * N] + 2 * b[i + N * (j + 1)]);
    }
}

Or, if you want to use "arrays of arrays" rather than flat arrays:
__global__ void Jacobi(float** a, const float** b, const int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)
    {
        a[i][j] = 0.8f * (b[i+1][j] + b[i+1][j] + b[i][j+1] + b[i][j+1]);
    }
}
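Note that both kernels above use the standard blocked mapping, where each block covers a contiguous 32x32 tile. If you specifically want the cyclic distribution the question asks about, one common approach is to swap the roles of blockIdx and threadIdx so that consecutive elements land in consecutive blocks. A minimal sketch, keeping the same stencil:
__global__ void Jacobi_GPU(float* a, const float* b, const int N)
{
    // Cyclic distribution: block (bx,by) handles every gridDim.x-th column
    // and every gridDim.y-th row, rather than a contiguous tile. With
    // dimGrid(N/32,N/32) and dimBlock(32,32) this covers 0..N-1 exactly once.
    int i = threadIdx.x * gridDim.x + blockIdx.x;
    int j = threadIdx.y * gridDim.y + blockIdx.y;
    if (i < N && j < N)
        a[j * N + i] = 0.8f * (2 * b[(i + 1) + j * N] + 2 * b[i + N * (j + 1)]);
}
Be aware that this mapping makes accesses within a warp strided by gridDim.x, so the blocked mapping above will usually be faster; the cyclic form mainly satisfies the stated distribution requirement.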

Related

cudaMallocManaged for 2D and 3D array

If one wants to copy arrays from host to device, one normally calls cudaMalloc and then cudaMemcpy. To lessen the hassle, one can instead call cudaMallocManaged and skip both of those steps; life was never simpler.
The code looks like this (more or less):
__global__ void convert(float kelvin[], float celsius[]) // arrays can be passed to a kernel
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        kelvin[i] = celsius[i] + 273.15;
}
int main()
{
    // cudaMallocManaged replaces the host-side malloc entirely; calling
    // malloc first would only leak memory.
    float *celsius, *kelvin;
    cudaMallocManaged(&celsius, N*sizeof(float));
    cudaMallocManaged(&kelvin, N*sizeof(float));
    // init celsius here
    dim3 blocksPerGrid(1, 1, 1);   // use only one block
    dim3 threadsPerBlock(N, 1, 1); // use N threads in the block
    convert<<<blocksPerGrid, threadsPerBlock>>>(kelvin, celsius);
    cudaDeviceSynchronize();
    // do stuff with the output here
    return 0;
}
The previous example seems clear to me. But how do I use cudaMallocManaged for 2D and 3D arrays? I've been trying:
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)
        C[i][j] = A[i][j] + B[i][j];
}
int main()
{   // I think 2D arrays can be passed as pointers to pointers
    float **A = (float **)malloc(N*N*sizeof(float));
    float **B = (float **)malloc(N*N*sizeof(float));
    float **C = (float **)malloc(N*N*sizeof(float));
    cudaMallocManaged(&A, N*N*sizeof(float));
    cudaMallocManaged(&B, N*N*sizeof(float));
    cudaMallocManaged(&C, N*N*sizeof(float));
    A[N][N] = {{1,0,0},{0,1,0},{0,0,1}};
    B[N][N] = {{1,0,0},{0,1,0},{0,0,1}};
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
    MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
    // outputs and all
}
But it shows the following errors:
matrix_add.cu(22): error: too many initializer values
matrix_add.cu(25): error: argument of type "float **" is incompatible with parameter of type "float (*)[3]"
Your help is highly appreciated.
You got a lot wrong in your attempt, so much so that it was faster to write a working version than to list out all the individual problems in your code. So here is a working version of what it appears you were trying to do:
#include <algorithm>
#include <iostream>

const int N = 3;

__global__ void MatAdd(float A[][N], float B[][N], float C[][N])
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)
        C[i][j] = A[i][j] + B[i][j];
}

int main()
{
    // One linear managed allocation per matrix, visible to host and device.
    float* A; cudaMallocManaged(&A, N*N*sizeof(float));
    float* B; cudaMallocManaged(&B, N*N*sizeof(float));
    float* C; cudaMallocManaged(&C, N*N*sizeof(float));

    const float A_vals[N][N] = {{1,0,0},{0,1,0},{0,0,1}};
    const float B_vals[N][N] = {{1,0,0},{0,1,0},{0,0,1}};

    // View the linear allocation as an N-column 2D array for [][] access.
    float (*C_vals)[N] = reinterpret_cast<float (*)[N]>(C);

    std::copy(&A_vals[0][0], &A_vals[0][0] + N*N, A);
    std::copy(&B_vals[0][0], &B_vals[0][0] + N*N, B);

    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks(1, 1);
    MatAdd<<<numBlocks, threadsPerBlock>>>(reinterpret_cast<float (*)[N]>(A),
                                           reinterpret_cast<float (*)[N]>(B),
                                           C_vals);
    cudaDeviceSynchronize();

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            std::cout << C_vals[i][j] << " ";
        }
        std::cout << std::endl;
    }
    return 0;
}
Some important points:
Managed memory allocation replaces standard host memory allocation and produces memory which is directly accessible on both the host and the device.
All arrays decay to a pointer when passed as arguments to a function by value. That decay is not recursive.
You can (and will need to) cast in order to use the [][] access syntax on linear memory allocated dynamically at runtime (this applies to malloc, new, or any of the CUDA host memory allocation APIs).
Initialization syntax and assignment syntax for arrays are not interchangeable.
All I can suggest is that you study it thoroughly until you understand how it works.
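The same pattern extends to 3D: allocate one linear managed block and cast it to a pointer to a 2D array type so that [][][]-indexing works. A minimal sketch, assuming a cubic N x N x N array (only the cast type changes):
float* raw;
cudaMallocManaged(&raw, N * N * N * sizeof(float));
// View the linear block as an array of N x N "slabs".
float (*A3)[N][N] = reinterpret_cast<float (*)[N][N]>(raw);
A3[1][2][0] = 42.0f; // directly accessible on the host with managed memory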

correctly computing gridDim for CUDA kernel

I expected to see numbers from 0.0 to 999.0, but instead I am getting some very weird, long numbers at some of the indices with the code below:
__global__ void kernel(double *res, int N)
{
    int i = (gridDim.y * blockIdx.y + blockIdx.x) * blockDim.x * blockDim.y +
            blockDim.y * threadIdx.y + threadIdx.x;
    if (i < N) res[i] = i;
}
void callGPU(int N)
{
    dim3 dimBlock(8, 8);
    dim3 dimGrid(2, 8);
    ...
    kernel<<<dimGrid, dimBlock>>>(res, N);
    ...
}
The same happens if I change dimGrid to (8,2) or (1,16), but if I change it to (16,1) then I get the indices right. Can you please show how to correctly compute the index for this case, if possible for arbitrary N? Many thanks!
Your indexing pattern is wrong.
First, you should compute the index in the x and y dimensions:
int i_x = blockIdx.x * blockDim.x + threadIdx.x;
int i_y = blockIdx.y * blockDim.y + threadIdx.y;
Then you should compute the pitch as the total number of threads in the x dimension:
int pitch = gridDim.x * blockDim.x;
Finally, you can compute your 1D index from the 2D grid:
int i = i_y * pitch + i_x;
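Putting those three pieces together, the corrected kernel would look like this (a sketch assembled from the snippets above; it works for any grid/block shape and arbitrary N):
__global__ void kernel(double *res, int N)
{
    int i_x = blockIdx.x * blockDim.x + threadIdx.x; // global x index
    int i_y = blockIdx.y * blockDim.y + threadIdx.y; // global y index
    int pitch = gridDim.x * blockDim.x;              // total threads along x
    int i = i_y * pitch + i_x;                       // flattened 1D index
    if (i < N) res[i] = i;
}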

How to use 2D Arrays in CUDA?

How do I allocate a 2D array of size MxN, and how do I traverse that array in CUDA?
__global__ void test(int A[BLOCK_SIZE][BLOCK_SIZE], int B[BLOCK_SIZE][BLOCK_SIZE], int C[BLOCK_SIZE][BLOCK_SIZE])
{
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < BLOCK_SIZE && j < BLOCK_SIZE)
        C[i][j] = A[i][j] + B[i][j];
}
int main()
{
    int d_A[BLOCK_SIZE][BLOCK_SIZE];
    int d_B[BLOCK_SIZE][BLOCK_SIZE];
    int d_C[BLOCK_SIZE][BLOCK_SIZE];
    int C[BLOCK_SIZE][BLOCK_SIZE];
    for (int i = 0; i < BLOCK_SIZE; i++)
        for (int j = 0; j < BLOCK_SIZE; j++)
        {
            d_A[i][j] = i + j;
            d_B[i][j] = i + j;
        }
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(GRID_SIZE, GRID_SIZE);
    test<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    cudaMemcpy(C, d_C, BLOCK_SIZE*BLOCK_SIZE*sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < BLOCK_SIZE; i++)
        for (int j = 0; j < BLOCK_SIZE; j++)
        {
            printf("%d\n", C[i][j]);
        }
}
How to allocate a 2D array:
int main(){
#define BLOCK_SIZE 16
#define GRID_SIZE 1
    // The arrays must live in device memory; allocate them with cudaMalloc
    // and view each as a 2D array of static width BLOCK_SIZE.
    int (*d_A)[BLOCK_SIZE];
    int (*d_B)[BLOCK_SIZE];
    cudaMalloc(&d_A, BLOCK_SIZE * BLOCK_SIZE * sizeof(int));
    cudaMalloc(&d_B, BLOCK_SIZE * BLOCK_SIZE * sizeof(int));
    /* d_A initialization, e.g. cudaMemcpy from a host array */
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); // BLOCK_SIZE*BLOCK_SIZE threads per block, 256 in this case
    dim3 dimGrid(GRID_SIZE, GRID_SIZE);    // 1*1 blocks in a grid
    YourKernel<<<dimGrid, dimBlock>>>(d_A, d_B); // kernel invocation
}
How to traverse that array:
__global__ void YourKernel(int d_A[BLOCK_SIZE][BLOCK_SIZE], int d_B[BLOCK_SIZE][BLOCK_SIZE]){
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= BLOCK_SIZE || col >= BLOCK_SIZE) return; // bounds check
    /* whatever you want to do with d_A[][] and d_B[][] */
}
I hope this is helpful. You can also refer to the CUDA Programming Guide (page 22) about matrix multiplication.
The best way is to store the two-dimensional array A in its flattened vector form.
For example, for a matrix A of size nxm (n rows, m columns), its (i,j) element in pointer-to-pointer representation is
A[i][j] (with i=0..n-1 and j=0..m-1).
In flattened (row-major) vector form this becomes
A[i*m+j] (with i=0..n-1 and j=0..m-1).
Using a one-dimensional array in this case simplifies the copy process, which becomes simply:
double *A, *dev_A; // A - host pointer, dev_A - device pointer
A = (double*)malloc(n*m*sizeof(double));
cudaMalloc((void**)&dev_A, n*m*sizeof(double));
cudaMemcpy(dev_A, A, n*m*sizeof(double), cudaMemcpyHostToDevice); // copy the host array A to the device
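A kernel then traverses the flattened array with the same indexing. A minimal sketch, assuming one thread per element (the kernel name and operation are illustrative):
__global__ void addOne(double *dev_A, int n, int m)
{
    int i = blockIdx.y * blockDim.y + threadIdx.y; // row
    int j = blockIdx.x * blockDim.x + threadIdx.x; // column
    if (i < n && j < m)
        dev_A[i*m + j] += 1.0; // row-major: element (i,j)
}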

How to write CUDA global function for this?

I want to convert the following function into CUDA.
void fun()
{
    for (int i = 0; i < terrainGridLength; i++)
    {
        for (int j = 0; j < terrainGridWidth; j++)
        {
            // CODE of function
        }
    }
}
I wrote the function like this:
__global__ void fun()
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if ((i < terrainGridLength) && (j < terrainGridWidth))
    {
        // CODE of function
    }
}
I declared both terrainGridLength and terrainGridWidth as constants and assigned the value 120 to both. I am calling the function like this:
fun<<<30,500>>>();
But I am not getting correct output.
Is the code I wrote correct? I haven't understood much about the parallel execution of the code. Please explain how the code will work, and correct me if I made any mistakes.
You use the y dimension, which means you are using a 2D arrangement of threads, so you cannot invoke the kernel with only scalar launch parameters:
int numBlock = 30;
int numThreadsPerBlock = 500;
fun<<<numBlock,numThreadsPerBlock>>>();
The invocation should be (note that the blocks now contain a 2D arrangement of threads):
dim3 dimGrid(GRID_SIZE, GRID_SIZE);     // 2D grid with GRID_SIZE*GRID_SIZE blocks
dim3 dimBlocks(BLOCK_SIZE, BLOCK_SIZE); // 2D blocks with BLOCK_SIZE*BLOCK_SIZE threads
fun<<<dimGrid, dimBlocks>>>();
Refer to the CUDA Programming Guide for further information; also, if you want to work with 2D or 3D arrays, you are better off using cudaMallocPitch or cudaMalloc3D.
As for your code, I think something like this would work (I haven't tried it, but I hope you can grab the idea):
// main
// Note: a block is limited to 1024 threads, so a Width x Height block
// (120*120 = 14400 threads) cannot launch; use fixed-size blocks and
// round the grid size up instead.
dim3 dimBlocks(16, 16); // 2D blocks with 16*16 = 256 threads each
dim3 dimGrid((Width + dimBlocks.x - 1) / dimBlocks.x,
             (Height + dimBlocks.y - 1) / dimBlocks.y);
fun<<<dimGrid, dimBlocks>>>(Width, Height);

// kernel
__global__ void fun(int Width, int Height)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if ((i < Width) && (j < Height))
    {
        // CODE of function
    }
}
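Regarding the cudaMallocPitch suggestion above, a minimal allocation sketch for a Width x Height float grid (identifiers are illustrative):
float *d_terrain;
size_t pitchBytes; // row stride in bytes, chosen by the runtime for alignment
cudaMallocPitch(&d_terrain, &pitchBytes, Width * sizeof(float), Height);
// Inside a kernel, row j then starts at:
// float *row = (float*)((char*)d_terrain + j * pitchBytes);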

My kernel only works in block (0,0)

I am trying to write a simple matrix multiplication application that multiplies two square matrices using CUDA. I am having a problem where my kernel only computes correctly in block (0,0) of the grid.
This is my invocation code:
dim3 dimBlock(4,4,1);
dim3 dimGrid(4,4,1);
//Launch the kernel;
MatrixMulKernel<<<dimGrid,dimBlock>>>(Md,Nd,Pd,Width);
This is my kernel function:
__global__ void MatrixMulKernel(int* Md, int* Nd, int* Pd, int Width)
{
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int bx = blockIdx.x;
    const int by = blockIdx.y;
    const int row = (by * blockDim.y + ty);
    const int col = (bx * blockDim.x + tx);
    // Pvalue stores the Pd element that is computed by the thread
    int Pvalue = 0;
    for (int k = 0; k < Width; k++)
    {
        Pvalue += Md[row * Width + k] * Nd[k * Width + col];
    }
    __syncthreads();
    // Write the matrix to device memory; each thread writes one element
    Pd[row * Width + col] = Pvalue;
}
I think the problem may have something to do with memory, but I'm a bit lost. What should I do to make this code work across several blocks?
The problem was with my CUDA kernel invocation. The grid was far too small for the matrices being processed.
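For reference, a sketch of a launch configuration sized to the full matrix, assuming Width is the matrix dimension (the ceil-division also covers Widths that are not a multiple of the block size, in which case the kernel would additionally need a row/col bounds check before the final write):
dim3 dimBlock(4, 4, 1);
// Round up so the whole Width x Width matrix is covered by the grid.
dim3 dimGrid((Width + dimBlock.x - 1) / dimBlock.x,
             (Width + dimBlock.y - 1) / dimBlock.y, 1);
MatrixMulKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, Width);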