Vector addition in CUDA

I am trying to learn CUDA programming. When I execute the vector addition program from the tutorial here, it does not work as expected: no error is thrown, but the answer isn't correct.
#include<stdio.h>
#include<stdlib.h>
#define N 20
__global__ void add(int *a, int *b, int *c){
c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}
int main(void) {
int *a, *b, *c; // host copies of a, b, c
int *d_a, *d_b, *d_c; // device copies of a, b, c
int size = N * sizeof(int), i;
// Alloc space for device copies of a, b, c
cudaMalloc((void **)&d_a, size);
cudaMalloc((void **)&d_b, size);
cudaMalloc((void **)&d_c, size);
// Alloc space for host copies of a, b, c and setup input values
a = (int *)malloc(size);
b = (int *)malloc(size);
c = (int *)malloc(size);
for(i = 0; i< N ; i++){
a[i] = rand() % 100;
b[i] = rand() % 50;
}
// Copy inputs to device
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
// Launch add() kernel on GPU with N blocks
add<<<N,1>>>(d_a, d_b, d_c);
cudaDeviceSynchronize();
// Copy result back to host
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
// Cleanup
for(i = 0; i< N; i++)
printf("%d + %d = %d\n", a[i], b[i], c[i]);
free(a); free(b); free(c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
The result is always shown as -1. I don't know why; can someone help me out here?

I just compiled and ran the program on my GPU and it works as expected. You should check the error codes returned by cudaMalloc and cudaMemcpy and make sure none of them report an error.
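For example, a minimal sketch of that kind of checking, dropped into the program above (the helper macro name checkCuda is illustrative, not from the tutorial):

#define checkCuda(call) do {                                        \
    cudaError_t err = (call);                                       \
    if (err != cudaSuccess) {                                       \
        fprintf(stderr, "CUDA error %s at %s:%d\n",                 \
                cudaGetErrorString(err), __FILE__, __LINE__);       \
        exit(EXIT_FAILURE);                                         \
    }                                                               \
} while (0)

// in main(), wrap every CUDA call:
checkCuda( cudaMalloc((void **)&d_a, size) );
checkCuda( cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice) );
add<<<N,1>>>(d_a, d_b, d_c);
checkCuda( cudaGetLastError() );       // reports kernel launch errors
checkCuda( cudaDeviceSynchronize() );  // reports errors during kernel execution
checkCuda( cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost) );

If any of these calls fails (for example because no usable CUDA device is present), c is never filled with valid results and the program just prints whatever happens to be in the host buffer.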

Related

Why is this CUDA program printing 0?

This code prints "Result is 0"; it seems the results are not being copied back from the device. Why is this?
__global__ void add_arrays(int *a, int *b, int *c, int size){
int index = threadIdx.x + blockIdx.x * blockDim.x;
//avoid accessing beyond the end of the array
if (index < size)
{ c[index] = a[index] + b[index]; }
}
#define N (2048 * 2048)
#define THREADS_PER_BLOCK 512
int main()
{
int *a, *b, *c; // local (host) copy
int *d_a, *d_b, *d_c; // device copy
int size = N * sizeof(int);
//allocating space for device copies ON THE DEVICE
CUDA_CHECK_RETURN(cudaMalloc((void **)&d_a, size));
CUDA_CHECK_RETURN(cudaMalloc((void **)&d_b, size));
CUDA_CHECK_RETURN(cudaMalloc((void **)&d_c, size));
//allocating space for local copies ON THE HOST and initialize: a, b
a = (int *)malloc(size); random_ints(a, N);
b = (int *)malloc(size); random_ints(b, N);
c = (int *)malloc(size); //random_ints(c, N);
//copy inputs to device
CUDA_CHECK_RETURN(cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice));
CUDA_CHECK_RETURN(cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice));
int num_block = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
add_arrays<<<num_block, THREADS_PER_BLOCK>>>(d_a, d_b, d_c, N);
// Copy result back from Device to Host
CUDA_CHECK_RETURN(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));
for (int i =0; i<N; ++i) {
printf("Result is %d \n", *(c+i));
}
You may want to consider posting a complete reproducible example with your question so that it contains your definition of CUDA_CHECK_RETURN, random_ints, included headers, etc. As mentioned in the comments, the obvious problem in your code though is that
cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice)
should be
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice)
and similarly for b, since a and b are already pointers.
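For reference, a common definition of CUDA_CHECK_RETURN (for example the one generated by the Nsight Eclipse project template) looks roughly like this; since your definition isn't shown, this is only an assumption about what yours does:

#define CUDA_CHECK_RETURN(value) {                                    \
    cudaError_t _m_cudaStat = value;                                  \
    if (_m_cudaStat != cudaSuccess) {                                 \
        fprintf(stderr, "Error %s at line %d in file %s\n",           \
                cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \
        exit(1);                                                      \
    } }

Note that a macro like this only catches errors returned directly by the wrapped call; to catch kernel launch errors you still need to check cudaGetLastError() (and cudaDeviceSynchronize() for execution errors).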

Wrong results using CUDA streams and cudaMemcpyAsync; they become correct after adding cudaDeviceSynchronize

I'm working on a CUDA matrix multiplication, and I made some modifications to observe how they affect performance.
I want to observe the behavior and performance of a matrix multiplication kernel while making some changes. I'm measuring the elapsed time with GPU events, and I'm testing two specific conditions:
I have a number of matrices (say matN) for A, B and C; I transfer (H2D) one matrix for A and one for B, multiply them, and transfer back (D2H) one C;
I have matN matrices for A, B and C, but I transfer more than one (say chunk) matrix for A and for B, compute exactly chunk multiplications, and transfer back chunk result matrices C.
In the first case (chunk = 1) everything works as expected, but in the second case (chunk > 1) some of the Cs are correct, while others are not.
But if I put a cudaDeviceSynchronize() after the cudaMemcpyAsync, I get correct results.
Here's the code doing what I've just described:
/**** main.cpp ****/
int chunk = matN/iters;
#ifdef LOWPAR
GRIDx= 1;
GRIDy= 1;
label="LOW";
#else
int sizeX = M;
int sizeY = N;
GRIDx = ceil((sizeX)/BLOCK);
GRIDy = ceil((sizeY)/BLOCK);
label="";
#endif
const int bytesA = M*K*sizeof(float);
const int bytesB = K*N*sizeof(float);
const int bytesC = M*N*sizeof(float);
//device mem allocation
float *Ad, *Bd, *Cd;
gpuErrchk( cudaMalloc((void **)&Ad, bytesA*chunk) );
gpuErrchk( cudaMalloc((void **)&Bd, bytesB*chunk) );
gpuErrchk( cudaMalloc((void **)&Cd, bytesC*chunk) );
//host pinned mem allocation
float *A, *B, *C;
gpuErrchk( cudaMallocHost((void **)&A, bytesA*matN) );
gpuErrchk( cudaMallocHost((void **)&B, bytesB*matN) );
gpuErrchk( cudaMallocHost((void **)&C, bytesC*matN) );
//host data init
for(int i=0; i<matN; ++i){
randomMatrix(M, K, A+(i*M*K));
randomMatrix(K, N, B+(i*K*N));
}
//event start
createAndStartEvent(&startEvent, &stopEvent);
if (square)
{
label += "SQUARE";
int size = N*N;
for (int i = 0; i < iters; ++i) {
int j = i%nStream;
int idx = i*size*chunk;
newSquareMatMulKer(A+idx, B+idx, C+idx, Ad, Bd, Cd, N, chunk, stream[j]);
}
}
else {
...
}
msTot = endEvent(&startEvent, &stopEvent);
#ifdef MEASURES
printMeasures(square, label, msTot, millis.count(), matN, iters, devId);
#else
float *_A, *_B, *_C, *tmpC;
tmpC = (float *)calloc(1,bytesC*chunk);
for (int s=0; s<matN; ++s)
{
_A = A+(s*M*K);
_B = B+(s*K*N);
_C = C+(s*M*N);
memset(tmpC, 0, bytesC*chunk);
hostMatMul(_A, _B, tmpC, M, K, N);
checkMatEquality(_C, tmpC, M, N);
}
#endif
/**** matmul.cu ****/
__global__ void squareMatMulKernel(float* A, float* B, float* C, int N, int chunk) {
int ROW = blockIdx.x*blockDim.x+threadIdx.x;
int COL = blockIdx.y*blockDim.y+threadIdx.y;
if (ROW<N && COL<N) {
int size=N*N;
int offs = 0;
float tmpSum=0.0f;
for (int s=0; s<chunk; ++s)
{
offs = s*size;
tmpSum = 0.0f;
for (int i = 0; i < N; ++i) {
tmpSum += A[offs+(ROW*N)+i] * B[offs+(i*N)+COL];
}
C[offs+(ROW*N)+COL] = tmpSum;
}
}
return ;
}
void newSquareMatMulKer(float *A, float *B, float *C, float *Ad, float *Bd, float *Cd,
int n, int chunk, cudaStream_t strm)
{
int size = n*n;
int bytesMat = size*sizeof(float);
dim3 dimBlock(BLOCK,BLOCK,1);
dim3 dimGrid(GRIDx, GRIDy,1);
gpuErrchk( cudaMemcpyAsync(Ad, A, bytesMat*chunk, cudaMemcpyHostToDevice, strm) );
gpuErrchk( cudaMemcpyAsync(Bd, B, bytesMat*chunk, cudaMemcpyHostToDevice, strm) );
#ifdef LOWPAR
squareMatMulGridStrideKer<<<dimGrid, dimBlock, 0, strm>>>(Ad, Bd, Cd, n, chunk);
#else
squareMatMulKernel<<<dimGrid, dimBlock, 0, strm>>>(Ad, Bd, Cd, n, chunk);
#endif
gpuErrchk( cudaMemcpyAsync( C, Cd, bytesMat*chunk, cudaMemcpyDeviceToHost, strm) );
cudaDeviceSynchronize(); // <-- adding this call is what makes the results correct
}
I tried to debug with cuda-gdb but nothing strange showed up, and gpuErrchk doesn't report errors from any of the CUDA API calls.
I ran the code under cuda-memcheck too, both with and without cudaDeviceSynchronize, and got no errors.
I think it may be a synchronization issue, but I can't understand the reason behind it.
Can someone spot where I'm going wrong?
Other code advice is really appreciated too.
If you are using multiple streams, you may overwrite Ad and Bd before they have been used.
Example with iters = 2 and nStream = 2:
for (int i = 0; i < iters; ++i) {
int j = i%nStream;
int idx = i*size*chunk;
newSquareMatMulKer(A+idx, B+idx, C+idx, Ad, Bd, Cd, N, chunk, stream[j]);
}
From this loop, you will call
newSquareMatMulKer(A, B, C, Ad, Bd, Cd, N, chunk, stream[0]); // call 0
newSquareMatMulKer(A+idx, B+idx, C+idx, Ad, Bd, Cd, N, chunk, stream[1]); // call 1
As you are using the same memory area on the device for both calls, you may have several synchronization issues:
call 1 starts copying A and B to the device before call 0's squareMatMulKernel ends, so you may use incorrect values of A and/or B to compute your first iteration.
call 1's squareMatMulKernel starts before you retrieve the values of C from call 0, so you may overwrite C with values from call 1.
To fix this problem, I see two approaches:
Use synchronization, as in your example with cudaDeviceSynchronize().
Allocate more memory on the device side (one workspace per stream), for example:
//device mem allocation
float *Ad, *Bd, *Cd;
gpuErrchk( cudaMalloc((void **)&Ad, bytesA*chunk*nStream) );
gpuErrchk( cudaMalloc((void **)&Bd, bytesB*chunk*nStream) );
gpuErrchk( cudaMalloc((void **)&Cd, bytesC*chunk*nStream) );
/* code here */
for (int i = 0; i < iters; ++i) {
int j = i%nStream;
int idx = i*size*chunk;
int offset_stream = j*size*chunk;
newSquareMatMulKer(A+idx, B+idx, C+idx,
Ad + offset_stream ,
Bd + offset_stream ,
Cd + offset_stream , N, chunk, stream[j]);
}
In this case you don't need any synchronization until the end of the loop.
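With one workspace per stream, a minimal sketch of waiting for all outstanding work once the loop has finished (using the stream[] array from the question; a single cudaDeviceSynchronize() would also do):

// After the loop: wait for every stream to finish its pending copies and kernels.
for (int j = 0; j < nStream; ++j)
    gpuErrchk( cudaStreamSynchronize(stream[j]) );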

Loop unrolling with dynamic parallelism decreases the time performance

I have a simple program that squares the elements of an array. Loop unrolling was done as follows:
loop unrolling
#include <stdio.h>
#include <cuda.h>
__global__ void square(float *a, int N,int idx);
// Kernel that executes on the CUDA device
__global__ void first(float *arr, int N)
{
int idx = 2*(blockIdx.x * blockDim.x + threadIdx.x);
int n=N;
//printf("%d\n",n);
for(int q=0;q<2;q++)
{
if(N<2000)
{
arr[idx+q] = arr[idx+q] * arr[idx+q];
}
}
}
// main routine that executes on the host
int main(void)
{
clock_t start = clock(),diff;
float *a_h, *a_d; // Pointer to host & device arrays
const int N = 1000; // Number of elements in arrays
size_t size = N * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++) a_h[i] = (float)i;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = 4;
//int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
first <<< 4, 128 >>> (a_d, N);
//cudaThreadSynchronize();
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
// Cleanup
free(a_h); cudaFree(a_d);
diff = clock() - start;
int msec = diff * 1000 / CLOCKS_PER_SEC;
printf("Time taken %d seconds %d milliseconds\n", msec/1000, msec%1000);
}
Then, realizing that the loop calculation can be minimized with dynamic parallelism, unrolling with dynamic parallelism was implemented as follows:
unrolling with dynamic parallelism
#include <stdio.h>
#include <cuda.h>
__global__ void square(float *a, int N,int idx);
// Kernel that executes on the CUDA device
__global__ void first(float *arr, int N)
{
int idx = 2*(blockIdx.x * blockDim.x + threadIdx.x);
int n=N;
square <<< 1,2 >>> (arr, n,idx);
}
__global__ void square(float *a, int N,int idx)
{
int tdx = blockIdx.x * blockDim.x + threadIdx.x;
printf("%d\n",N);
if(N<2000)
{
a[tdx+idx] = a[tdx+idx] * a[tdx+idx];
}
}
// main routine that executes on the host
int main(void)
{
clock_t start = clock(),diff;
float *a_h, *a_d; // Pointer to host & device arrays
const int N = 1000; // Number of elements in arrays
size_t size = N * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++) a_h[i] = (float)i;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = 4;
//int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
first <<< 4, 128 >>> (a_d, N);
//cudaThreadSynchronize();
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
// Cleanup
free(a_h); cudaFree(a_d);
diff = clock() - start;
int msec = diff * 1000 / CLOCKS_PER_SEC;
printf("Time taken %d seconds %d milliseconds\n", msec/1000, msec%1000);
}
The implementation of dynamic parallelism with unrolling takes more time to execute than unrolling alone. Aren't we supposed to improve execution time with dynamic parallelism in such a case?
Dynamic parallelism is mainly useful in cases where you have parallelism that is dynamic; that is, cases where you don't know how much parallelism you're going to need until you've done some calculation. Rather than transferring data back to the host, which would then immediately be fed into parameterising another launch, you launch from within the kernel. In this pattern, with memcpys between kernel launches avoided, you'll see a speedup.
In your example above this is not the case. You could have just launched twice as many threads from the host. There's nothing dynamic required as there's no parallelism available there that you didn't know about at the time of the first kernel launch.
Furthermore, performance requirements for kernels launched using dynamic parallelism are similar to those of kernels launched from the host. You have to launch a reasonable amount of work or the launch latency will dominate your computation time.
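As a concrete illustration of the point about launching twice as many threads from the host, here is a minimal sketch of the non-dynamic version for this example (the kernel name square_all and the launch configuration are illustrative, not taken from the original code):

// One element per thread; no unrolling and no child kernel launch needed.
__global__ void square_all(float *arr, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N)
        arr[idx] = arr[idx] * arr[idx];
}

// Host side: cover all N elements with 128-thread blocks.
square_all <<< (N + 127) / 128, 128 >>> (a_d, N);

All of the parallelism is known before the first launch, so a single flat launch like this avoids both the per-thread loop and the per-thread child launches.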

Create 2D Array with CUDA

In the CUDA C Programming Guide there is a sample that shows a 2D array:
// Kernel definition
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < N && j < N)
C[i][j] = A[i][j] + B[i][j];
}
int main()
{
...
// Kernel invocation
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
...
}
I use a 2D array in the form below and it works correctly:
dim3 grid[COLUMNS][ROWS];
kernel_Matrix<<<grid,1>>>(dev_strA, dev_strB, dev_Matrix);
__global__ void add(int *a, int *b, int *c)
{
int x = blockIdx.x;
int y = blockIdx.y;
int i = (COLUMNS*y) + x;
c[i] = a[i] + b[i];
}
Is there a way to implement a 2D array with the [][] definition? I tested this way but it does not work.
dim3 is not an array but a structure defined in a CUDA header file (vector_types.h). This structure is used to specify the dimensions of the grid in the execution configuration of global functions, i.e. in <<< >>>. It doesn't hold the 'real' blocks; it just configures the number of blocks that will be executed.
The only two ways (to my knowledge) to initialize this structure are:
1. dim3 grid(x, y, z);
2. dim3 grid = {x, y, z};
EDIT:
Host code with dim3 initialization and with the arrays passed to the kernel function in a way that lets you access their elements via [][]:
float A[N][N];
float B[N][N];
float C[N][N];
float (*d_A)[N]; //pointers to arrays of dimension N
float (*d_B)[N];
float (*d_C)[N];
for(int i = 0; i < N; i++) {
for(int j = 0; j < N; j++) {
A[i][j] = i;
B[i][j] = j;
}
}
//allocation
cudaMalloc((void**)&d_A, (N*N)*sizeof(float));
cudaMalloc((void**)&d_B, (N*N)*sizeof(float));
cudaMalloc((void**)&d_C, (N*N)*sizeof(float));
//copying from host to device
cudaMemcpy(d_A, A, (N*N)*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, (N*N)*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_C, C, (N*N)*sizeof(float), cudaMemcpyHostToDevice);
// Kernel invocation
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
MatAdd<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C);
//copying from device to host
cudaMemcpy(A, (d_A), (N*N)*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(B, (d_B), (N*N)*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(C, (d_C), (N*N)*sizeof(float), cudaMemcpyDeviceToHost);
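For completeness, the MatAdd kernel from the guide can keep its [][] parameters with this host code, because an array parameter such as float A[N][N] decays to a pointer to arrays of N floats, which is exactly the type of d_A above (a sketch; N must be a compile-time constant):

// These two parameter lists are equivalent as function declarations:
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N]);
__global__ void MatAdd(float (*A)[N], float (*B)[N], float (*C)[N]);

Inside the kernel, C[i][j] then indexes the same contiguous N*N buffer that was allocated with cudaMalloc.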

The kernel delay increases when increasing blocksPerGrid and threadsPerBlock in the CUDA VecAdd example; what does it mean?

When I tested the following example, I found that increasing blocksPerGrid and threadsPerBlock increases the kernel delay,
such that if
int threadsPerBlock = 1;
int blocksPerGrid = 1;
i.e. blocksPerGrid and threadsPerBlock both equal 1, the delay of the kernel = 0.0072 ms,
but when I make it the following, the delay becomes higher = 0.049 ms:
int threadsPerBlock = 1024;
int blocksPerGrid = (N+threadsPerBlock-1) / threadsPerBlock;
where
N = 50000; //the no. of array elements
Below is the complete VecAdd example; you can test it.
// Includes
#include <stdio.h>
#include <cutil_inline.h>
#include <shrQATest.h>
// Variables
float* h_A;
float* h_B;
float* h_C;
float* d_A;
float* d_B;
float* d_C;
bool noprompt = false;
// Functions
void CleanupResources(void);
void RandomInit(float*, int);
void ParseArguments(int, char**);
// Device code
__global__ void VecAdd(const float* A, const float* B, float* C, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N)
C[i] = A[i] + B[i];
}
// Host code
int main(int argc, char** argv)
{
shrQAStart(argc, argv);
cudaEvent_t event1, event2;
cudaEventCreate(&event1);
cudaEventCreate(&event2);
printf("Vector Addition\n");
int N = 50000;
size_t size = N * sizeof(float);
ParseArguments(argc, argv);
// Allocate input vectors h_A and h_B in host memory
h_A = (float*)malloc(size);
if (h_A == 0) CleanupResources();
h_B = (float*)malloc(size);
if (h_B == 0) CleanupResources();
h_C = (float*)malloc(size);
if (h_C == 0) CleanupResources();
// Initialize input vectors
RandomInit(h_A, N);
RandomInit(h_B, N);
// Allocate vectors in device memory
cutilSafeCall( cudaMalloc((void**)&d_A, size) );
cutilSafeCall( cudaMalloc((void**)&d_B, size) );
cutilSafeCall( cudaMalloc((void**)&d_C, size) );
// Copy vectors from host memory to device memory
cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice) );
// Invoke kernel
int threadsPerBlock = 1024;
int blocksPerGrid = (N+threadsPerBlock-1) / threadsPerBlock;
cudaEventRecord(event1, 0);
VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
cudaEventRecord(event2, 0);
cudaEventSynchronize(event1); //optional
cudaEventSynchronize(event2);
float dt_ms;
cudaEventElapsedTime(&dt_ms, event1, event2);
printf("delay_time = %f\n", dt_ms);
cutilCheckMsg("kernel launch failure");
#ifdef _DEBUG
cutilSafeCall( cutilDeviceSynchronize() );
#endif
// Copy result from device memory to host memory
// h_C contains the result in host memory
cutilSafeCall( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) );
// Verify result
int i;
for (i = 0; i < N; ++i) {
float sum = h_A[i] + h_B[i];
if (fabs(h_C[i] - sum) > 1e-5)
break;
}
CleanupResources();
shrQAFinishExit(argc, (const char **)argv, (i==N) ? QA_PASSED : QA_FAILED);
}
void CleanupResources(void)
{
// Free device memory
if (d_A)
cudaFree(d_A);
if (d_B)
cudaFree(d_B);
if (d_C)
cudaFree(d_C);
// Free host memory
if (h_A)
free(h_A);
if (h_B)
free(h_B);
if (h_C)
free(h_C);
cutilDeviceReset();
}
// Allocates an array with random float entries.
void RandomInit(float* data, int n)
{
for (int i = 0; i < n; ++i)
data[i] = rand() / (float)RAND_MAX;
}
// Parse program arguments
void ParseArguments(int argc, char** argv)
{
for (int i = 0; i < argc; ++i) {
if (strcmp(argv[i], "--noprompt") == 0 ||
strcmp(argv[i], "-noprompt") == 0)
{
noprompt = true;
break;
}
}
}
Can anyone explain to me what this means?
In case 1, a kernel of 1 thread is launched and performs 2 reads and 1 write. In case 2, a kernel of 50,176 threads is launched and performs 100,000 reads and 50,000 writes. Increasing the workload by a factor of roughly 50,000 increased the execution time by only ~7x. The work done by the two launches is significantly different.
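Put roughly in per-operation terms using the timings from the question (back-of-the-envelope numbers that ignore other differences between the two launches):

case 1: 0.0072 ms / 3 operations        ≈ 2.4 µs per memory operation
case 2: 0.049 ms  / 150,000 operations  ≈ 0.33 ns per memory operation

So although the launch took about 7x longer, the cost per operation dropped by a factor of several thousand; most of the time in case 1 is fixed launch overhead rather than useful work.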