In the CUDA C Programming Guide there is a sample that shows a 2D array:
// Kernel definition
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < N && j < N)
C[i][j] = A[i][j] + B[i][j];
}
int main()
{
...
// Kernel invocation
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
...
}
I use a 2D array in the form below and it works correctly:
dim3 grid[COLUMNS][ROWS];
kernel_Matrix<<<grid,1>>>(dev_strA, dev_strB, dev_Matrix);
__global__ void add(int *a, int *b, int *c)
{
int x = blockIdx.x;
int y = blockIdx.y;
int i = (COLUMNS*y) + x;
c[i] = a[i] + b[i];
}
Is there a way to implement a 2D array with the [][] definition? I tested this way but it doesn't work.
dim3 is not an array but a structure defined in a CUDA header file (vector_types.h). This structure is used to specify the dimensions of the grid in the execution configuration of global functions, i.e. in <<< >>>. It doesn't hold the 'real' blocks; it just configures the number of blocks that will be executed.
The only two ways (to my knowledge) to initialize this structure are:
1. dim3 grid(x, y, z);
2. dim3 grid = {x, y, z};
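For instance, reusing the names from your launch (just a sketch, not your original code), a two-dimensional grid of blocks would be configured as:
dim3 grid(COLUMNS, ROWS); // grid.x = COLUMNS, grid.y = ROWS, grid.z defaults to 1
kernel_Matrix<<<grid, 1>>>(dev_strA, dev_strB, dev_Matrix);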
EDIT:
Host code with dim3 initialization, passing the arrays to the kernel function in a way that lets you access their elements via [][]:
float A[N][N];
float B[N][N];
float C[N][N];
float (*d_A)[N]; //pointers to arrays of dimension N
float (*d_B)[N];
float (*d_C)[N];
for(int i = 0; i < N; i++) {
for(int j = 0; j < N; j++) {
A[i][j] = i;
B[i][j] = j;
}
}
//allocation
cudaMalloc((void**)&d_A, (N*N)*sizeof(float));
cudaMalloc((void**)&d_B, (N*N)*sizeof(float));
cudaMalloc((void**)&d_C, (N*N)*sizeof(float));
//copying from host to device
cudaMemcpy(d_A, A, (N*N)*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, (N*N)*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_C, C, (N*N)*sizeof(float), cudaMemcpyHostToDevice);
// Kernel invocation
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
MatAdd<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C);
//copying from device to host
cudaMemcpy(A, (d_A), (N*N)*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(B, (d_B), (N*N)*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(C, (d_C), (N*N)*sizeof(float), cudaMemcpyDeviceToHost);
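Note that in a parameter list float A[N][N] decays to a pointer to an array of N floats, so the kernel above could equivalently be declared with the same type as the d_A, d_B, d_C pointers:
__global__ void MatAdd(float (*A)[N], float (*B)[N], float (*C)[N]);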
I'm trying out the difference between a tiled and a naive matrix multiplication implementation in CUDA C++. I expect to see a performance gap between these variants because of the data reuse through shared memory. However, the tiled version was only about twice as fast (naive ~12 ms and tiled ~6 ms). Here are the code snippets:
#include <iostream>
#include <assert.h>
using namespace std;
#define N 1024
#define THREADS 16
// Parenthesize the macro arguments so that expressions like IDX(blk + tx, y, n) expand as intended.
#define IDX(x, y, s) ((x)*(s) + (y))
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
void init_values(int *a, int *b, int sz) {
for(int i=0; i<sz; i++) {
a[i] = rand()%513 - 256;
b[i] = rand()%513 - 256;
}
}
__global__
void matmul(int *a, int *b, int *c, int n) {
// perform parallel matmul
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int t = 0;
for(int i=0; i<n; i++) {
t += (a[IDX(x, i, n)] * b[IDX(i, y, n)]);
}
c[IDX(x, y, n)] = t;
}
void matmul_verify(int *a, int *b, int *c, int n) {
for(int i=0; i<n; i++) {
for(int j=0; j<n; j++) {
int t = 0;
for(int k=0; k<n; k++)
t += a[IDX(i, k, n)] * b[IDX(k, j, n)];
// cout << i << " " << j << " " << c[IDX(i, j, n)] << " " << t << endl;
assert(c[IDX(i, j, n)] == t);
}
}
}
int main()
{
int *a, *b, *c;
int *da, *db, *dc;
size_t sz = N * N * sizeof(int);
a = (int*)malloc(sz);
b = (int*)malloc(sz);
c = (int*)malloc(sz);
init_values(a, b, N*N);
gpuErrchk(cudaMalloc((void**)&da, sz));
gpuErrchk(cudaMalloc((void**)&db, sz));
gpuErrchk(cudaMalloc((void**)&dc, sz));
gpuErrchk(cudaMemcpy(da, a, sz, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(db, b, sz, cudaMemcpyHostToDevice));
// init grid size
dim3 grids(N/THREADS, N/THREADS);
dim3 blocks(THREADS, THREADS);
// time it
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
matmul<<<grids, blocks>>>(da, db, dc, N);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
cout << "Took " << milliseconds << " milliseconds.\n";
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(c, dc, sz, cudaMemcpyDeviceToHost));
matmul_verify(a, b, c, N);
cudaFree(da);
cudaFree(db);
cudaFree(dc);
free(a);
free(b);
free(c);
cudaEventDestroy(start);
cudaEventDestroy(stop);
return 0;
}
and for the tiled implementation, I change the kernel to:
__global__
void matmul(int *a, int *b, int *c, int n) {
// perform parallel matmul
int ty = threadIdx.y, by = blockIdx.y;
int tx = threadIdx.x, bx = blockIdx.x;
int x = bx * blockDim.x + tx;
int y = by * blockDim.y + ty;
// block IDs tell us which block to solve for
// (bx, by) --> (bx: bx + tx, by:by + ty)
__shared__ int A[SHMEM_SIZE]; // SHMEM_SIZE is assumed to be THREADS * THREADS (one tile of elements)
__shared__ int B[SHMEM_SIZE];
const int tile_size = THREADS;
// to get value of tile [tx, ty] in block [bx, by], we need blocks A[bx, *] and blocks B[*, by]
int res = 0;
for(int blk=0; blk < n; blk+=tile_size) {
// block index
A[IDX(tx, ty, tile_size)] = a[IDX(x, blk + ty, n)];
B[IDX(tx, ty, tile_size)] = b[IDX(blk + tx, y, n)];
__syncthreads();
for(int k=0; k<tile_size; k++) {
res += (A[IDX(tx, k, tile_size)] * B[IDX(k, ty, tile_size)]);
}
__syncthreads();
}
// for(int k=0; k<n; k++)
// res += a[IDX(x, k, n)] * b[IDX(k, y, n)];
c[IDX(x, y, n)] = res;
}
Nothing else really changes. However, in the tiled implementation, if I simply change
int ty = threadIdx.x, by = blockIdx.x;
int tx = threadIdx.y, bx = blockIdx.y;
for the initialization of the thread and block indices, I get roughly a 1 ms runtime (a ~12x speedup). How is this happening? I read in the book "CUDA By Example" that the thread and block indices in two dimensions are just for programmer convenience and do not reflect any difference in performance. This seems to be false. Any clarification is really appreciated.
CUDA thread blocks are partitioned into warps of 32 threads. Ideally the neighboring lanes of a warp should always load neighboring elements from global memory. This is called coalescing and allows for maximum memory bandwidth. In hardware all the coalesced loads from a warp will be bundled into a minimal number of memory transactions.
Other factors that can deteriorate memory bandwidth are the size of the load (one can try to use the builtin vector types to get bigger loads for optimization, e.g. int2, int4, float2, etc.) and alignment.
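For example (just a sketch, assuming the data pointer is 16-byte aligned and the length is a multiple of 4), a vectorized load with int4 bundles four 4-byte loads into a single 16-byte transaction per thread:
// idx indexes groups of four consecutive ints; a must be 16-byte aligned.
const int4 *a4 = reinterpret_cast<const int4 *>(a);
int4 v = a4[idx];
int partial = v.x + v.y + v.z + v.w;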
The mapping from the 3D threadIdx to warp lanes always takes the first dimension .x as the fastest-varying one, i.e. a block of dimensions (32, 2, 1) will have one warp with threadIdx.y == 0 and one warp with threadIdx.y == 1, where the lanes of each warp correspond to threadIdx.x.
Therefore to allow for coalescing, you have to access memory as
A[ty * s + tx] // coalesced access
instead of
A[tx * s + ty] // strided access
to achieve optimal performance.
What is probably meant in the book you mentioned is that there shouldn't be a performance difference between launching a grid of (32, 2, 1) blocks and a grid of (64, 1, 1) blocks while manually computing ty = threadIdx.x / 32 and tx = threadIdx.x % 32. These divisions probably happen internally anyway when the block is not flat in the first place.
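As a rough illustration of that last point (variable names here are illustrative, not from your code):
// Block launched as (64, 1, 1); recover 2D tile coordinates manually.
int tx = threadIdx.x % 32; // fast-varying lane index -> use as the coalesced dimension
int ty = threadIdx.x / 32; // slow-varying index
// This behaves like a (32, 2, 1) block with tx = threadIdx.x and ty = threadIdx.y.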
This code prints "Result is 0"; it seems not to be copying the results back from the device. Why is this?
__global__ void add_arrays(int *a, int *b, int *c, int size){
int index = threadIdx.x + blockIdx.x * blockDim.x;
//avoid accessing beyond the end of the array
if (index < size)
{ c[index] = a[index] + b[index]; }
}
#define N (2048 * 2048)
#define THREADS_PER_BLOCK 512
int main()
{
int *a, *b, *c; // local (host) copy
int *d_a, *d_b, *d_c; // device copy
int size = N * sizeof(int);
//allocating space for device copies ON THE DEVICE
CUDA_CHECK_RETURN(cudaMalloc((void **)&d_a, size));
CUDA_CHECK_RETURN(cudaMalloc((void **)&d_b, size));
CUDA_CHECK_RETURN(cudaMalloc((void **)&d_c, size));
//allocating space for local copies ON THE HOST and initialize: a, b
a = (int *)malloc(size); random_ints(a, N);
b = (int *)malloc(size); random_ints(b, N);
c = (int *)malloc(size); //random_ints(c, N);
//copy inputs to device
CUDA_CHECK_RETURN(cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice));
CUDA_CHECK_RETURN(cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice));
int num_block = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
add_arrays<<<num_block, THREADS_PER_BLOCK>>>(d_a, d_b, d_c, N);
// Copy result back from Device to Host
CUDA_CHECK_RETURN(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));
for (int i =0; i<N; ++i) {
printf("Result is %d \n", *(c+i));
}
You may want to consider posting a complete reproducible example with your question so that it contains your definition of CUDA_CHECK_RETURN, random_ints, included headers, etc. As mentioned in the comments, though, the obvious problem in your code is that
cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice)
should be
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice)
and similarly for b, since a and b are already pointers.
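For reference, since CUDA_CHECK_RETURN is not shown in the question, a typical definition of such a macro might look like the following (only an assumption, not the poster's actual definition):
// Hypothetical stand-in for the poster's CUDA_CHECK_RETURN macro; requires <cstdio> and <cstdlib>.
#define CUDA_CHECK_RETURN(call) \
do { \
cudaError_t err = (call); \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error %s at %s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
} while (0)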
If one wants to copy arrays from the host to the device, one uses cudaMalloc and cudaMemcpy. But to lessen the hassle, one can just use cudaMallocManaged without those two steps, and life was never simpler.
The code looks like this (more or less):
__global__ void convert(float kelvin[], float celsius[]) // arrays can be passed to a kernel
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i<N)
kelvin[i]=celsius[i]+273.15;
}
int main()
{
float *celsius =(float *)malloc(N*sizeof(float));
float *kelvin =(float *)malloc(N*sizeof(float));
cudaMallocManaged(&celsius, N*sizeof(float));
cudaMallocManaged(&kelvin, N*sizeof(float));
// init celsius here
dim3 blocksPerGrid(1,1,1); //use only one block
dim3 threadsPerBlock(N,1,1); //use N threads in the block
convert<<<blocksPerGrid, threadsPerBlock>>>(kelvin,celsius);
cudaDeviceSynchronize();
//Doing stuff with the output here
return 0;
}
The previous example seems clear to me. But how do I do cudaMallocManaged for 2D and 3D arrays? I've been trying:
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < N && j < N)
C[i][j] = A[i][j] + B[i][j];
}
int main()
{ // I think 2D arrays can be passed as pointers to pointers
float **A = (float **)malloc(N*N*sizeof(float));
float **B = (float **)malloc(N*N*sizeof(float));
float **C = (float **)malloc(N*N*sizeof(float));
cudaMallocManaged(&A, N*N*sizeof(float));
cudaMallocManaged(&B, N*N*sizeof(float));
cudaMallocManaged(&C, N*N*sizeof(float));
A[N][N]={{1,0,0},{0,1,0},{0,0,1}};
B[N][N]={{1,0,0},{0,1,0},{0,0,1}};
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
//outputs and all
}
But it shows the following errors:
matrix_add.cu(22): error: too many initializer values
matrix_add.cu(25): error: argument of type "float **" is incompatible with parameter of type "float (*)[3]"
Your help is highly appreciated.
You got a lot wrong in your attempt, so much that it was faster to write a working version than list out all the individual problems in the code in your question. So here is a working version of what it appears you were trying to do:
#include <algorithm>
#include <iostream>
const int N = 3;
__global__ void MatAdd(float A[][N], float B[][N], float C[][N])
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < N && j < N)
C[i][j] = A[i][j] + B[i][j];
}
int main()
{
float* A; cudaMallocManaged(&A, N*N*sizeof(float));
float* B; cudaMallocManaged(&B, N*N*sizeof(float));
float* C; cudaMallocManaged(&C, N*N*sizeof(float));
const float A_vals[N][N]={{1,0,0},{0,1,0},{0,0,1}};
const float B_vals[N][N]={{1,0,0},{0,1,0},{0,0,1}};
float (*C_vals)[N] = reinterpret_cast<float (*)[N]>(C);
std::copy(&A_vals[0][0], &A_vals[0][0] + N*N, A);
std::copy(&B_vals[0][0], &B_vals[0][0] + N*N, B);
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(1, 1);
MatAdd<<<numBlocks, threadsPerBlock>>>( reinterpret_cast<float (*)[N]>(A),
reinterpret_cast<float (*)[N]>(B),
C_vals );
cudaDeviceSynchronize();
for(int i=0; i<N; i++) {
for(int j=0; j<N; j++) {
std::cout << C_vals[i][j] << " ";
}
std::cout << std::endl;
}
return 0;
}
Some important points:
Managed memory allocation replaces standard host memory allocation and produces memory which is directly accessible on both the host and the device.
All arrays decay to a pointer when passed as arguments to a function by value. That decay is not recursive. See here for more details.
You can (and will need to) cast in order to use the [][] access syntax on linear memory allocated dynamically at runtime (this applies to malloc, new, or any of the CUDA host memory allocation APIs; see here for more details).
Initialization syntax and assignment syntax for arrays are not interchangeable.
All I can suggest is that you study it thoroughly until you understand how it works.
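One concrete illustration of the first point: calling malloc and then cudaMallocManaged on the same pointer, as in the question, leaks the host allocation because the managed allocation simply overwrites the pointer. A minimal sketch of the difference:
// Wrong: the malloc'd buffer is leaked when cudaMallocManaged overwrites the pointer.
float *celsius = (float *)malloc(N * sizeof(float));
cudaMallocManaged(&celsius, N * sizeof(float));
// Right: allocate once with cudaMallocManaged, use it on host and device, free with cudaFree.
float *kelvin = nullptr;
cudaMallocManaged(&kelvin, N * sizeof(float));
// ... launch kernels, read results on the host ...
cudaFree(kelvin);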
I have a CUDA kernel that copies from the (i+1)-th location to the i-th location in a device array. The copying is not done from locations whose index values are multiples of 32: [32]->[31] is not copied, [64]->[63] is not copied. This happens irrespective of the block size. How can this be resolved?
Here is the full program. There are no calls to __syncthreads(), yet the problem still exists.
#include <cstdio>
struct SodA { float *df0; size_t pitch; };
__global__ void stream_kernel (SodA dA1, SodA dA2, int M, int N);
int main(int argc, char **argv){
int i, M=32, N=32;float *f0;
SodA dA1, dA2;
dim3 blockSize = dim3(32,32);
dim3 gridSize = dim3(1,1);
f0 = (float *)malloc(M*N*sizeof(float));
cudaMallocPitch((void **)&dA1.df0, &dA1.pitch, sizeof(float)*M, N);
cudaMallocPitch((void **)&dA2.df0, &dA2.pitch, sizeof(float)*M, N);
for (i=0; i<M*N; i++) f0[i] = (float)rand()/RAND_MAX;
cudaMemcpy2D((void *)dA1.df0, dA1.pitch, (void *)f0, sizeof(float)*M, sizeof(float)*M, N, cudaMemcpyHostToDevice);
printf("\n");
for(int i=28;i<70; i++)
printf("%5d ", i);
printf("\n\n");
printf("\n");
for(int i=28;i<70; i++)
printf("%.3f ", f0[i]);
printf("\n\n");
stream_kernel<<<gridSize, blockSize>>>(dA1, dA2, M, N);
cudaMemcpy2D( (void *)f0, sizeof(float)*M, (void *)dA2.df0, dA2.pitch,sizeof(float)*M, N, cudaMemcpyDeviceToHost);
printf("\n");
for(int i=28;i<70; i++)
printf("%.3f ", f0[i]);
printf("\n\n");
free(f0);cudaFree(dA2.df0);
cudaFree(dA1.df0);
printf("\n\n");
return 0;
}
__global__ void stream_kernel (SodA dA1, SodA dA2, int M, int N)
{
int i, j, i2d;
i = blockIdx.x * blockDim.x + threadIdx.x;
j = blockIdx.y * blockDim.y + threadIdx.y;
i2d = i + j * M;
if (i2d>0) { dA2.df0[i2d-1] = dA1.df0[i2d];}
}
The output
28 29 30 31 32 33 ....
0.999 0.218 0.513 0.839 0.613 0.296 0.638....
0.218 0.513 0.839 0.198 0.296 0.638 ....
Thanks for the comments. In a 2D array stored in row-major order, this kernel moves the (i,j)-th element to the previous position. Since the array is pitched, as mentioned in the comments, the element preceding the first element of a row cannot be found using a -1 offset. This special case is handled by computing the address of the last element of the previous row instead. I got the answer. Thanks.
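For reference, the usual way to address an element (row j, column i) of a pitched allocation is through a byte offset, since the pitch is given in bytes. A small helper (a sketch using the SodA struct from the question, not the poster's final code) would be:
// The pitch is in bytes, so the start of row j is found through a char* cast.
__device__ float *pitched_elem(SodA dA, int j, int i)
{
return (float *)((char *)dA.df0 + (size_t)j * dA.pitch) + i;
}
The cross-row special case described above (element 0 of a row mapping to the last element of the previous row) would then be handled on top of this addressing.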
I am pretty new to CUDA. I need to use the thread id in a computation, but it doesn't work: rem is always 0. I need the index of the thread to compute indices into arrays, so I can't convert them to floats to do the computations.
The kernel is as follows:
__global__ void initializationCubes(float* dVer, float* dCub, int gridSize, float* test)
{
int index=blockIdx.x*blockDim.x+threadIdx.x;
if(index<(gridSize*gridSize*gridSize))
{
// conversion index -> i,j,k
int rem=index;
int qot=(rem/gridSize);
int i=rem-(qot*gridSize);
rem=(rem)/(gridSize);
qot=(rem/gridSize);
int j=rem-(qot*gridSize);
rem=(rem)/(gridSize);
qot=(rem/gridSize);
int k=rem-(qot*gridSize);
for(int x=0;x<7;x++){
// these first three are used to test
dCub[index*56+0+x] =index;
dCub[index*56+7+x] =rem;
dCub[index*56+14+x]=k;
dCub[index*56+21+x]=dVer[((i*(gridSize+1)+(j+1))*(gridSize+1)+k)*7+x];
dCub[index*56+28+x]=dVer[(((i+1)*(gridSize+1)+(j))*(gridSize+1)+k)*7+x];
dCub[index*56+35+x]=dVer[(((i+1)*(gridSize+1)+(j))*(gridSize+1)+k+1)*7+x];
dCub[index*56+42+x]=dVer[(((i+1)*(gridSize+1)+(j+1))*(gridSize+1)+k+1)*7+x];
dCub[index*56+49+x]=dVer[(((i+1)*(gridSize+1)+(j+1))*(gridSize+1)+k)*7+x];
}
}
}
__global__ void initializationVertices(float* dVer, int gridSize){
int currentVertex=0;
for(int i=0; i<gridSize+1; i++)
{
for(int j=0; j<gridSize+1; j++)
{
for(int k=0; k<gridSize+1; k++)
{
dVer[currentVertex+0]=((i*2.0f)/(gridSize)-1.0f)*2.0f;
dVer[currentVertex+1]=((j*2.0f)/(gridSize)-1.0f)*2.0f;
dVer[currentVertex+2]=((k*2.0f)/(gridSize)-1.0f)*2.0f;
currentVertex+=7;
}
}
}
}
extern "C"
void initializationCUDA1( const int verticesAtEndsOfEdges[24], const int eTable[256], int gSize, int numberParticles ) {
numParticles=numberParticles;
gridSize=gSize;
numVertices=(gridSize+1)*(gridSize+1)*(gridSize+1);
numCubes=(gridSize)*(gridSize)*(gridSize);
size_t pitchv=7;
cudaMallocPitch((void**)&dVer, &pitchv, 7 * sizeof(float), (gridSize+1)*(gridSize+1)*(gridSize+1));
size_t pitchc=7;
cudaMallocPitch((void**)&dCub, &pitchc, 7 * sizeof(float), (gridSize)*(gridSize)*(gridSize)*8);
cudaMalloc((void **)&verticesAtEnds, 24*sizeof(int));
cudaMalloc((void **)&dedgeTable, 256*sizeof(int));
cudaMalloc((void **)&dtriTable, 256*16*sizeof(int));
cudaMalloc((void **)&ballPoint, 3*sizeof(float));
cudaMalloc((void **)&dpositions, 3*numberParticles*sizeof(float));
cudaMalloc((void **)&dedgeVertices, numCubes*6*12*sizeof(float));
cudaMalloc((void **)&result, numCubes*18*sizeof(float));
output=(float*)malloc(numCubes*18*sizeof(float));
cudaMalloc((void **)&numFaces, 10*sizeof(int));
cudaMalloc((void **)&test, sizeof(float));
initializationVertices<<<1,1>>>(dVer, gridSize);
initializationCubes<<<128,256>>>( dVer, dCub, gridSize, test);
float* tmp =(float*)malloc(numCubes*56*(sizeof(float)));
cudaMemcpy(tmp, dCub, numCubes*56*sizeof(float), cudaMemcpyDeviceToHost);
for(int a=0;a<100;a++){
printf("%f\n",tmp[a]);
}
}
EDIT
gridSize is 40, so the thread index iterates from 0 to 64000,
and when I print the values outside of my function, rem, i, j and k are all equal to 0.
size_t pitchv=7;
cudaMallocPitch((void**)&dVer, &pitchv, 7 * sizeof(float), (gridSize+1)*(gridSize+1)*(gridSize+1));
size_t pitchc=7;
cudaMallocPitch((void**)&dCub, &pitchc, 7 * sizeof(float), (gridSize)*(gridSize)*(gridSize)*8);
initializationCubes<<<1,1>>>( dVer, dCub, gridSize, test);
If gridSize is the size of the grid, as the name suggests, both rem and qot will always be zero after execution of your code because they get divided by a value larger than themselves.
If you are looking for indices into a three-dimensional grid, that is exactly why threadIdx and blockIdx have three components. No expensive division is required at all, just use this standard code snippet:
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int k = blockIdx.z * blockDim.z + threadIdx.z;
if (i < myBlockSize.x && j < myBlockSize.y && k < myBlockSize.z) {
// your kernel code...
}
and launch your kernel with appropriate values for the y and z components of block- and gridsize, as well as a parameter or global variable myBlockSize set to the desired grid size (in case it cannot be factored into integer block- and grid dimensions).
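As a rough sketch of such a launch (assuming the kernel body is rewritten to use the i/j/k snippet above, with the bounds check comparing each index against gridSize):
// Cover a gridSize x gridSize x gridSize domain with a 3D launch (8x8x8 = 512 threads per block).
dim3 threads(8, 8, 8);
dim3 blocks((gridSize + threads.x - 1) / threads.x,
(gridSize + threads.y - 1) / threads.y,
(gridSize + threads.z - 1) / threads.z);
initializationCubes<<<blocks, threads>>>(dVer, dCub, gridSize, test);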