How to use 2D Arrays in CUDA?

How do I allocate a 2D array of size M×N, and how do I traverse that array in CUDA? Here is my attempt:
__global__ void test(int A[BLOCK_SIZE][BLOCK_SIZE], int B[BLOCK_SIZE][BLOCK_SIZE], int C[BLOCK_SIZE][BLOCK_SIZE])
{
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < BLOCK_SIZE && j < BLOCK_SIZE)
        C[i][j] = A[i][j] + B[i][j];
}

int main()
{
    int d_A[BLOCK_SIZE][BLOCK_SIZE];
    int d_B[BLOCK_SIZE][BLOCK_SIZE];
    int d_C[BLOCK_SIZE][BLOCK_SIZE];
    int C[BLOCK_SIZE][BLOCK_SIZE];

    for (int i = 0; i < BLOCK_SIZE; i++)
        for (int j = 0; j < BLOCK_SIZE; j++) {
            d_A[i][j] = i + j;
            d_B[i][j] = i + j;
        }

    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(GRID_SIZE, GRID_SIZE);
    test<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    cudaMemcpy(C, d_C, BLOCK_SIZE*BLOCK_SIZE*sizeof(int), cudaMemcpyDeviceToHost); // size in bytes

    for (int i = 0; i < BLOCK_SIZE; i++)
        for (int j = 0; j < BLOCK_SIZE; j++)
            printf("%d\n", C[i][j]);
}

How to allocate the 2D array:
int main() {
    #define BLOCK_SIZE 16
    #define GRID_SIZE 1
    int d_A[BLOCK_SIZE][BLOCK_SIZE];
    int d_B[BLOCK_SIZE][BLOCK_SIZE];
    /* d_A initialization */
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); // BLOCK_SIZE*BLOCK_SIZE threads per block, 256 in this case
    dim3 dimGrid(GRID_SIZE, GRID_SIZE);    // 1*1 blocks in the grid
    YourKernel<<<dimGrid, dimBlock>>>(d_A, d_B); // kernel invocation
}
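Note that for this to actually work, d_A and d_B must refer to device-accessible memory; a plain host stack array cannot be dereferenced from a kernel. A minimal sketch of the usual allocation and copy step (h_A here is a hypothetical host-side array, not part of the original answer):

int h_A[BLOCK_SIZE][BLOCK_SIZE];  // hypothetical host-side source data
int (*d_A)[BLOCK_SIZE];           // device pointer to rows of BLOCK_SIZE ints
cudaMalloc((void**)&d_A, BLOCK_SIZE*BLOCK_SIZE*sizeof(int));
cudaMemcpy(d_A, h_A, BLOCK_SIZE*BLOCK_SIZE*sizeof(int), cudaMemcpyHostToDevice);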
How to traverse that array:
__global__ void YourKernel(int d_A[BLOCK_SIZE][BLOCK_SIZE], int d_B[BLOCK_SIZE][BLOCK_SIZE]) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= h || col >= w) return; // h and w are the matrix height and width
    /* whatever you want to do with d_A[][] and d_B[][] */
}
I hope this is helpful. You can also refer to the CUDA Programming Guide, page 22, about matrix multiplication.

The best way is to store the two-dimensional array A in its flattened vector form.
For example, for a matrix A of size n×m, its (i,j) element in pointer-to-pointer representation is
A[i][j] (with i = 0..n-1 and j = 0..m-1),
and in row-major vector form the same element is
A[i*m + j] (with i = 0..n-1 and j = 0..m-1).
Using a one-dimensional array in this case simplifies the copy process, which becomes:
double *A, *dev_A; // A - host pointer, dev_A - device pointer
A = (double*)malloc(n*m*sizeof(double));
cudaMalloc((void**)&dev_A, n*m*sizeof(double));
cudaMemcpy(dev_A, A, n*m*sizeof(double), cudaMemcpyHostToDevice); // in case A holds doubles
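For illustration, a minimal sketch of a kernel that indexes the flattened matrix using the A[i*m + j] convention (this kernel is an assumption added for illustration, not part of the original answer):

__global__ void scale(double *A, int n, int m, double factor)
{
    int i = blockIdx.y * blockDim.y + threadIdx.y; // row
    int j = blockIdx.x * blockDim.x + threadIdx.x; // column
    if (i < n && j < m)
        A[i*m + j] *= factor; // the same element as A[i][j] in 2D notation
}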


cudaMallocManaged for 2D and 3D array

To copy arrays from host to device one normally calls cudaMalloc and cudaMemcpy. To lessen the hassle, one can instead call cudaMallocManaged and skip those two steps entirely; life was never simpler.
The code looks like this (more or less):
__global__ void convert(float kelvin[], float celsius[]) // arrays can be passed to a kernel
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        kelvin[i] = celsius[i] + 273.15;
}

int main()
{
    float *celsius, *kelvin; // no host-side malloc needed with managed memory
    cudaMallocManaged(&celsius, N*sizeof(float));
    cudaMallocManaged(&kelvin, N*sizeof(float));
    // init celsius here
    dim3 blocksPerGrid(1, 1, 1);   // use only one block
    dim3 threadsPerBlock(N, 1, 1); // use N threads in the block
    convert<<<blocksPerGrid, threadsPerBlock>>>(kelvin, celsius);
    cudaDeviceSynchronize();
    // do stuff with the output here
    return 0;
}
The previous example seems clear to me. But how do I do cudaMallocManaged for 2D and 3D arrays? I've been trying:
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)
        C[i][j] = A[i][j] + B[i][j];
}

int main()
{   // I think 2D arrays can be passed as pointer to pointers
    float **A = (float **)malloc(N*N*sizeof(float));
    float **B = (float **)malloc(N*N*sizeof(float));
    float **C = (float **)malloc(N*N*sizeof(float));
    cudaMallocManaged(&A, N*N*sizeof(float));
    cudaMallocManaged(&B, N*N*sizeof(float));
    cudaMallocManaged(&C, N*N*sizeof(float));
    A[N][N] = {{1,0,0},{0,1,0},{0,0,1}};
    B[N][N] = {{1,0,0},{0,1,0},{0,0,1}};
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
    MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
    // outputs and all
}
But it shows the following errors:
matrix_add.cu(22): error: too many initializer values
matrix_add.cu(25): error: argument of type "float **" is incompatible with parameter of type "float (*)[3]"
Your help is highly appreciated.
You got a lot wrong in your attempt, so much that it was faster to write a working version than list out all the individual problems in the code in your question. So here is a working version of what it appears you were trying to do:
#include <algorithm>
#include <iostream>

const int N = 3;

__global__ void MatAdd(float A[][N], float B[][N], float C[][N])
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)
        C[i][j] = A[i][j] + B[i][j];
}

int main()
{
    float* A; cudaMallocManaged(&A, N*N*sizeof(float));
    float* B; cudaMallocManaged(&B, N*N*sizeof(float));
    float* C; cudaMallocManaged(&C, N*N*sizeof(float));

    const float A_vals[N][N] = {{1,0,0},{0,1,0},{0,0,1}};
    const float B_vals[N][N] = {{1,0,0},{0,1,0},{0,0,1}};
    float (*C_vals)[N] = reinterpret_cast<float (*)[N]>(C);

    std::copy(&A_vals[0][0], &A_vals[0][0] + N*N, A);
    std::copy(&B_vals[0][0], &B_vals[0][0] + N*N, B);

    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks(1, 1);
    MatAdd<<<numBlocks, threadsPerBlock>>>(reinterpret_cast<float (*)[N]>(A),
                                           reinterpret_cast<float (*)[N]>(B),
                                           C_vals);
    cudaDeviceSynchronize();

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            std::cout << C_vals[i][j] << " ";
        }
        std::cout << std::endl;
    }
    return 0;
}
Some important points:
Managed memory allocation replaces standard host memory allocation and produces memory that is directly accessible on both the host and the device.
All arrays decay to a pointer when passed as arguments to a function by value, and that decay is not recursive: a float** is not interchangeable with a float (*)[N].
You can (and will need to) cast in order to use the [][] access syntax on linear memory allocated dynamically at runtime; this applies to malloc, new, and any of the CUDA host memory allocation APIs.
Initialization syntax and assignment syntax for arrays are not interchangeable: a braced initializer list is only valid at the point of definition.
All I can suggest is that you study it thoroughly until you understand how it works.
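Since the question also asks about 3D: the same pattern extends directly. A minimal sketch under the same compile-time-N assumption (this extension is an illustration, not part of the original answer):

const int N = 3;

__global__ void Scale3D(float A[][N][N], float s)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    int k = blockIdx.z * blockDim.z + threadIdx.z;
    if (i < N && j < N && k < N)
        A[i][j][k] *= s; // [][][] indexing works after the cast below
}

int main()
{
    float* A; cudaMallocManaged(&A, N*N*N*sizeof(float));
    // ... fill A as a linear array of N*N*N floats ...
    dim3 threadsPerBlock(4, 4, 4); // 64 threads cover the 3x3x3 domain
    Scale3D<<<1, threadsPerBlock>>>(reinterpret_cast<float (*)[N][N]>(A), 2.0f);
    cudaDeviceSynchronize();
    return 0;
}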

Create 2D Array with CUDA

In the CUDA C Programming Guide there is a sample that shows a 2D array:
// Kernel definition
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)
        C[i][j] = A[i][j] + B[i][j];
}

int main()
{
    ...
    // Kernel invocation
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
    MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
    ...
}
I use a 2D array in the form below and it works correctly:
dim3 grid[COLUMNS][ROWS];
kernel_Matrix<<<grid,1>>>(dev_strA, dev_strB, dev_Matrix);

__global__ void add(int *a, int *b, int *c)
{
    int x = blockIdx.x;
    int y = blockIdx.y;
    int i = (COLUMNS*y) + x;
    c[i] = a[i] + b[i];
}
Is there a way to implement a 2D array with the [][] style of definition? I tested this way but it does not work.
dim3 is not an array but a structure defined in a CUDA header file (vector_types.h). This structure is used to specify the dimensions of the GRID in the execution configuration of global functions, i.e. in <<< >>>. It doesn't hold the 'real' blocks; it just configures the number of blocks that will be executed.
The only two ways (to my knowledge) to initialize this structure are:
1. dim3 grid(x, y, z);
2. dim3 grid = {x, y, z};
EDIT:
Host code with dim3 initialization, and with the arrays passed to the kernel function in a way that lets you access their elements via [][]:
float A[N][N];
float B[N][N];
float C[N][N];

float (*d_A)[N]; // pointers to arrays of dimension N
float (*d_B)[N];
float (*d_C)[N];

for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
        A[i][j] = i;
        B[i][j] = j;
    }
}

// allocation
cudaMalloc((void**)&d_A, (N*N)*sizeof(float));
cudaMalloc((void**)&d_B, (N*N)*sizeof(float));
cudaMalloc((void**)&d_C, (N*N)*sizeof(float));

// copying from host to device
cudaMemcpy(d_A, A, (N*N)*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, (N*N)*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_C, C, (N*N)*sizeof(float), cudaMemcpyHostToDevice);

// Kernel invocation
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
MatAdd<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C);

// copying from device to host
cudaMemcpy(A, d_A, (N*N)*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(B, d_B, (N*N)*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(C, d_C, (N*N)*sizeof(float), cudaMemcpyDeviceToHost);
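This works because a kernel parameter of type float A[N][N], as in the MatAdd kernel from the programming-guide sample above, decays to exactly float (*)[N], so d_A, d_B, and d_C can be passed straight in. Freeing the allocations afterwards (added here for completeness, not shown in the original answer) is the usual:

cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);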

Allocate 2D array with cudaMallocPitch and copying with cudaMemcpy2D

I'm new to CUDA; I appreciate your help and hope you can help me.
I need to store multiple elements of a 2D array into a vector and then work with the vector, but my code does not work well. When I debug, I find a mistake in the allocation of the 2D array on the device with cudaMallocPitch and in the copy to that array with cudaMemcpy2D. This is my code:
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cmath>

#define maxThreads 96

__global__ void extract(int mSize, float* dev_vector, float* dev_matrix, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    while (idx < N)
    {
        dev_vector[idx] = *(dev_matrix + (mSize*idx + N));
        idx += blockDim.x * gridDim.x;
    }
}

int main()
{
    // CPU variables
    int mSize = 5;
    float* matrix;
    int N = 4; // vector size
    int i, j;
    float* vector;
    int blocks, threads;
    float* dev_matrix;
    float* dev_vector;

    blocks  = 1 + ((N-1) / maxThreads);
    threads = 1 + ((N-1) / blocks);

    unsigned long int pitch;
    unsigned long int memsize_vector = N*sizeof(float);
    unsigned long int memsize_matrix = mSize*sizeof(float);

    matrix = new float[memsize_matrix*memsize_matrix];
    vector = new float[memsize_vector];

    // create 2D array
    for (i = 0; i < mSize; i++)
        for (j = 0; j < mSize; j++)
        {
            matrix[i + mSize*j] = ((i+1) + (j+1));
        }

    printf("\n");
    for (i = 0; i < mSize; i++) {
        for (j = 0; j < mSize; j++) {
            printf("% 1.5f ", matrix[i + mSize*j]);
        }
        printf("\n");
    }
    printf("\n");

    cudaMallocPitch((void **)&dev_matrix, &pitch, memsize_matrix, mSize);
    cudaMalloc((void **)&dev_vector, memsize_vector);
    cudaMemcpy2D(dev_matrix, pitch, matrix, memsize_matrix, memsize_matrix, mSize,
                 cudaMemcpyHostToDevice);

    extract<<<blocks,threads>>>(mSize, dev_vector, dev_matrix, N);
    cudaDeviceSynchronize();
    cudaMemcpy(vector, dev_vector, memsize_vector, cudaMemcpyDeviceToHost);

    printf("Vector values are:\n");
    for (i = 0; i < N; i++)
        printf(" % 1.5f ", vector[i]);
    printf("\n");

    cudaFree(dev_matrix);
    cudaFree(dev_vector);
}
There are lots of problems in this code, including but not limited to: using array sizes in bytes and in words interchangeably in several places, using incorrect types (note that size_t exists for a very good reason), and potential truncation and type-casting problems.
But the core problem is the addressing of pitched memory inside the kernel, to which you are never even passing the pitch value. Reading the documentation for cudaMallocPitch will give you the correct method for addressing pitched memory inside a kernel. Your kernel might then look like this:
__global__ void extract(size_t mpitch, float* dev_vector, float* dev_matrix, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    while (idx < N)
    {
        dev_vector[idx] = *(float *)( ((char*)dev_matrix + idx * mpitch) + N );
        idx += stride;
    }
}
[disclaimer: never compiled or tested, use at own risk].
You will then have to fix all the problems in the host code to reflect whatever kernel changes you make.
Thanks to all. Alex, I had not seen that; I fixed it, thanks.
talonmies, thank you, my code works with your suggestions. Thanks a lot; this is finally my kernel:
__global__ void sumreduct(size_t pitch, float* dev_vector, float* dev_matrix, int columns, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    while (idx < N)
    {
        dev_vector[idx] = *(float *)( ((char*)dev_matrix + idx * pitch) + columns );
        idx += stride;
    }
}
About "size_t", I was using "Unsigned int" because Nsight show me the next warning:
Type 'size_t' could not be resolved
Thanks
Did you really mean to declare a source matrix of length [memsize_matrix*memsize_matrix]?
That allocates 400 floats, or 1600 bytes, which means your source pitch is off and the cudaMemcpy2D call is failing.
I'm assuming you meant to say
matrix = new float[mSize*mSize];
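Putting both fixes together, a minimal sketch of the corrected host-side allocation, copy, and launch (this consolidated version is an illustration, not code posted in the thread):

size_t pitch;
size_t width_in_bytes = mSize * sizeof(float); // width of one matrix row

matrix = new float[mSize*mSize]; // mSize*mSize elements, not bytes
vector = new float[N];

cudaMallocPitch((void **)&dev_matrix, &pitch, width_in_bytes, mSize);
cudaMalloc((void **)&dev_vector, N*sizeof(float));
cudaMemcpy2D(dev_matrix, pitch, matrix, width_in_bytes, width_in_bytes, mSize,
             cudaMemcpyHostToDevice);

sumreduct<<<blocks, threads>>>(pitch, dev_vector, dev_matrix, columns, N);
cudaDeviceSynchronize();
cudaMemcpy(vector, dev_vector, N*sizeof(float), cudaMemcpyDeviceToHost);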

Get an int from a threadId in CUDA

I am pretty new to CUDA. I need to use a thread id in a computation, but it doesn't work: rem is always 0. I need the index of the thread to compute indices into arrays, so I can't convert them to floats to do the computations.
The kernel is as follows:
__global__ void initializationCubes(float* dVer, float* dCub, int gridSize, float* test)
{
    int index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index < (gridSize*gridSize*gridSize))
    {
        // conversion index -> i,j,k
        int rem = index;
        int qot = (rem/gridSize);
        int i = rem - (qot*gridSize);
        rem = (rem)/(gridSize);
        qot = (rem/gridSize);
        int j = rem - (qot*gridSize);
        rem = (rem)/(gridSize);
        qot = (rem/gridSize);
        int k = rem - (qot*gridSize);
        for (int x = 0; x < 7; x++) {
            // these first three are used to test
            dCub[index*56+0+x]  = index;
            dCub[index*56+7+x]  = rem;
            dCub[index*56+14+x] = k;
            dCub[index*56+21+x] = dVer[((i*(gridSize+1)+(j+1))*(gridSize+1)+k)*7+x];
            dCub[index*56+28+x] = dVer[(((i+1)*(gridSize+1)+(j))*(gridSize+1)+k)*7+x];
            dCub[index*56+35+x] = dVer[(((i+1)*(gridSize+1)+(j))*(gridSize+1)+k+1)*7+x];
            dCub[index*56+42+x] = dVer[(((i+1)*(gridSize+1)+(j+1))*(gridSize+1)+k+1)*7+x];
            dCub[index*56+49+x] = dVer[(((i+1)*(gridSize+1)+(j+1))*(gridSize+1)+k)*7+x];
        }
    }
}
__global__ void initializationVertices(float* dVer, int gridSize)
{
    int currentVertex = 0;
    for (int i = 0; i < gridSize+1; i++)
    {
        for (int j = 0; j < gridSize+1; j++)
        {
            for (int k = 0; k < gridSize+1; k++)
            {
                dVer[currentVertex+0] = ((i*2.0f)/(gridSize)-1.0f)*2.0f;
                dVer[currentVertex+1] = ((j*2.0f)/(gridSize)-1.0f)*2.0f;
                dVer[currentVertex+2] = ((k*2.0f)/(gridSize)-1.0f)*2.0f;
                currentVertex += 7;
            }
        }
    }
}
extern "C"
void initializationCUDA1( const int verticesAtEndsOfEdges[24], const int eTable[256], int gSize, int numberParticles ) {
numParticles=numberParticles;
gridSize=gSize;
numVertices=(gridSize+1)*(gridSize+1)*(gridSize+1);
numCubes=(gridSize)*(gridSize)*(gridSize);
size_t pitchv=7;
cudaMallocPitch((void**)&dVer, &pitchv, 7 * sizeof(float), (gridSize+1)*(gridSize+1)*(gridSize+1));
size_t pitchc=7;
cudaMallocPitch((void**)&dCub, &pitchc, 7 * sizeof(float), (gridSize)*(gridSize)*(gridSize)*8);
cudaMalloc((void **)&verticesAtEnds, 24*sizeof(int));
cudaMalloc((void **)&dedgeTable, 256*sizeof(int));
cudaMalloc((void **)&dtriTable, 256*16*sizeof(int));
cudaMalloc((void **)&ballPoint, 3*sizeof(float));
cudaMalloc((void **)&dpositions, 3*numberParticles*sizeof(float));
cudaMalloc((void **)&dedgeVertices, numCubes*6*12*sizeof(float));
cudaMalloc((void **)&result, numCubes*18*sizeof(float));
output=(float*)malloc(numCubes*18*sizeof(float));
cudaMalloc((void **)&numFaces, 10*sizeof(int));
cudaMalloc((void **)&test, sizeof(float));
initializationVertices<<<1,1>>>(dVer, gridSize);
initializationCubes<<<128,256>>>( dVer, dCub, gridSize, test);
float* tmp =(float*)malloc(numCubes*56*(sizeof(float)));
cudaMemcpy(tmp, dCub, numCubes*56*sizeof(float), cudaMemcpyDeviceToHost);
for(int a=0;a<100;a++){
printf("%f\n",tmp[a]);
}
}
EDIT
gridSize is 40, so the thread index runs from 0 to 64000,
and when I print the values outside of my function, rem, i, j and k are all equal to 0.
size_t pitchv = 7;
cudaMallocPitch((void**)&dVer, &pitchv, 7 * sizeof(float), (gridSize+1)*(gridSize+1)*(gridSize+1));
size_t pitchc = 7;
cudaMallocPitch((void**)&dCub, &pitchc, 7 * sizeof(float), (gridSize)*(gridSize)*(gridSize)*8);
initializationCubes<<<1,1>>>(dVer, dCub, gridSize, test);
If gridSize is the size of the grid, as the name suggests, both rem and qot will always be zero after execution of your code because they get divided by a value larger than themselves.
If you are looking for indices into a three-dimensional grid, that is exactly why threadIdx and blockIdx have three components. No expensive division is required at all, just use this standard code snippet:
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int k = blockIdx.z * blockDim.z + threadIdx.z;

if (i < myBlockSize.x && j < myBlockSize.y && k < myBlockSize.z) {
    // your kernel code...
}
and launch your kernel with appropriate values for the y and z components of block- and gridsize, as well as a parameter or global variable myBlockSize set to the desired grid size (in case it cannot be factored into integer block- and grid dimensions).
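For concreteness, a sketch of a matching launch for gridSize = 40, assuming the kernel has been rewritten to use the three-component indices above (the 8x8x8 block shape is an arbitrary choice, not from the original answer):

dim3 block(8, 8, 8); // 512 threads per block
dim3 grid((gridSize + block.x - 1) / block.x,  // = 5 for gridSize = 40
          (gridSize + block.y - 1) / block.y,
          (gridSize + block.z - 1) / block.z);
initializationCubes<<<grid, block>>>(dVer, dCub, gridSize, test);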

CUDA Jacobian Relaxation

I am in the process of mapping this sequential computation to a CUDA computation. It is a 2-dimensional Jacobi relaxation on an N×N grid, where N is unknown at compile time but evenly divisible by 32.
Jacobi(float *a, float *b, int N)
{
    for (i = 1; i < N+1; i++) {
        for (j = 1; j < N+1; j++) {
            a[i][j] = 0.8 * (b[i+1][j] + b[i+1][j] + b[i][j+1] + b[i][j+1]);
        }
    }
}
I'm parallelizing the two outer loops, and each thread should compute just one element. The goal is a cyclic distribution in the x and y dimensions. Can someone help me implement a Jacobi_GPU kernel with the appropriate indexing functions that produces the following distribution?
dim3 dimGrid(N/32, N/32);
dim3 dimBlock(32, 32);
Jacobi_GPU<<<dimGrid, dimBlock>>>(A, B, N);
This is the simple implementation; you can also apply a shared-memory optimization to this kernel function:
__global__ void jacobi(int* a, const int* b, const int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)
    {
        a[j*N+i] = 0.8 * (2*b[(i+1)+j*N] + 2*b[i+N*(j+1)]);
    }
}
Or, if you want to use "arrays of arrays" rather than arrays:
__global__ void Jacobi(int** a, const int** b, const int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)
    {
        a[i][j] = 0.8 * (b[i+1][j] + b[i+1][j] + b[i][j+1] + b[i][j+1]);
    }
}
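For completeness, a minimal sketch of the host side for the flattened version (an illustration, not from the original answer; the buffers are padded because the kernel reads b at i+1 and j+1 on the boundary):

int N = 1024; // divisible by 32, as the question requires
int *A, *B;
cudaMalloc((void**)&A, (N+1)*(N+1)*sizeof(int));
cudaMalloc((void**)&B, (N+1)*(N+1)*sizeof(int));
// ... initialize B on the host and cudaMemcpy it to the device ...

dim3 dimBlock(32, 32);    // 32*32 = 1024 threads per block
dim3 dimGrid(N/32, N/32); // one thread per grid element
jacobi<<<dimGrid, dimBlock>>>(A, B, N);
cudaDeviceSynchronize();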