Why is this cuda program printing 0? - cuda

This code prints "Result is 0"; it seems the results are not being copied back from the device. Why is this?
__global__ void add_arrays(int *a, int *b, int *c, int size){
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    //avoid accessing beyond the end of the array
    if (index < size)
    { c[index] = a[index] + b[index]; }
}
#define N (2048 * 2048)
#define THREADS_PER_BLOCK 512
int main()
{
    int *a, *b, *c;       // local (host) copy
    int *d_a, *d_b, *d_c; // device copy
    int size = N * sizeof(int);
    //allocating space for device copies ON THE DEVICE
    CUDA_CHECK_RETURN(cudaMalloc((void **)&d_a, size));
    CUDA_CHECK_RETURN(cudaMalloc((void **)&d_b, size));
    CUDA_CHECK_RETURN(cudaMalloc((void **)&d_c, size));
    //allocating space for local copies ON THE HOST and initialize: a, b
    a = (int *)malloc(size); random_ints(a, N);
    b = (int *)malloc(size); random_ints(b, N);
    c = (int *)malloc(size); //random_ints(c, N);
    //copy inputs to device
    CUDA_CHECK_RETURN(cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK_RETURN(cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice));
    int num_block = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    add_arrays<<<num_block, THREADS_PER_BLOCK>>>(d_a, d_b, d_c, N);
    // Copy result back from Device to Host
    CUDA_CHECK_RETURN(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));
    for (int i = 0; i < N; ++i) {
        printf("Result is %d \n", *(c+i));
    }
}

You may want to consider posting a complete reproducible example with your question, so that it contains your definition of CUDA_CHECK_RETURN, random_ints, the included headers, etc. As mentioned in the comments, though, the obvious problem in your code is that
cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice)
should be
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice)
and similarly for b, since a and b are already pointers.
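For reference, the corrected copy-and-launch section would look like the sketch below (assuming your CUDA_CHECK_RETURN macro is a typical error-checking wrapper):
//copy inputs to device: pass the pointers themselves, not their addresses
CUDA_CHECK_RETURN(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
CUDA_CHECK_RETURN(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

int num_block = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
add_arrays<<<num_block, THREADS_PER_BLOCK>>>(d_a, d_b, d_c, N);
//check the launch itself, since kernel launches do not return an error code directly
CUDA_CHECK_RETURN(cudaGetLastError());

// Copy result back from Device to Host (this also synchronizes with the kernel)
CUDA_CHECK_RETURN(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));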

Related

Profiling a CUDA matrix addition code, using nvprof: the code API profiles, the kernel does not

I am using a remote workstation with an NVIDIA GeForce GPU. After compiling and executing, when I try to profile the program with nvprof I get the warnings shown after the code below (the nvidia-smi output is omitted here).
#include <stdio.h>
#include <cuda.h>
#include <math.h>

__global__ void matrixInit(double *matrix, int width, int height, double value){
    for(int i = (threadIdx.x + blockIdx.x * blockDim.x); i<width; i+=(blockDim.x * gridDim.x)){
        for(int j = (threadIdx.y + blockIdx.y * blockDim.y); j<height; j+=(blockDim.y * gridDim.y)){
            matrix[j * width +i] = value;
        }
    }
}

__global__ void matrixAdd(double *d_A, double *d_B, double *d_C, int width, int height){
    int ix = threadIdx.x + blockIdx.x * blockDim.x;
    int iy = threadIdx.y + blockIdx.y * blockDim.y;
    int stride_x = blockDim.x * gridDim.x;
    int stride_y = blockDim.y * gridDim.y;
    for(int j=iy; j<height; j+=stride_y){
        for(int i=ix; i<width; i+=stride_x){
            int index = j * width +i;
            d_C[index] = d_A[index-1] + d_B[index];
        }
    }
}

int main(){
    int Nx = 1<<12;
    int Ny = 1<<15;
    size_t size = Nx*Ny*sizeof(double);

    // host memory pointers
    double *A, *B, *C;
    // device memory pointers
    double *d_A, *d_B, *d_C;

    // allocate host memory
    A = (double*)malloc(size);
    B = (double*)malloc(size);
    C = (double*)malloc(size);

    // kernel call
    int thread = 32;
    int block_x = ceil(Nx + thread -1)/thread;
    int block_y = ceil(Ny + thread -1)/thread;
    dim3 THREADS(thread,thread);
    dim3 BLOCKS(block_y,block_x);

    // initialize variables
    matrixInit<<<BLOCKS,THREADS>>>(A, Nx, Ny, 1.0);
    matrixInit<<<BLOCKS,THREADS>>>(B, Nx, Ny, 2.0);
    matrixInit<<<BLOCKS,THREADS>>>(C, Nx, Ny, 0.0);

    //allocated device memory
    cudaMalloc(&d_A, size);
    cudaMalloc(&d_B, size);
    cudaMalloc(&d_C, size);

    //copy to device
    cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice);

    // Add matrix at GPU
    matrixAdd<<<BLOCKS,THREADS>>>(A, B, C, Nx, Ny);

    //copy back to host
    cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost);

    cudaFree(A);
    cudaFree(B);
    cudaFree(C);
    return 0;
}
This is my code. In summary, the profiler output shows these two warning messages:
==525867== Warning: 4 records have invalid timestamps due to insufficient device buffer space. You can configure the buffer space using the option --device-buffer-size.
==525867== Warning: 1 records have invalid timestamps due to insufficient semaphore pool size. You can configure the pool size using the option --profiling-semaphore-pool-size.
==525867== Profiling result: No kernels were profiled.
matrixInit<<<BLOCKS,THREADS>>>(A, Nx, Ny, 1.0);
matrixInit<<<BLOCKS,THREADS>>>(B, Nx, Ny, 2.0);
matrixInit<<<BLOCKS,THREADS>>>(C, Nx, Ny, 0.0);
You are writing to host memory here, which is not allowed.
Instead you can do matrixInit() directly on the device arrays d_A, d_B and d_C, after the allocations.
Another mistake here:
cudaFree(A);
cudaFree(B);
cudaFree(C);
Those should be d_A, d_B and d_C. Use regular free() for A, B and C.
Your kernels are also not doing what you want: you launch them with one thread per matrix entry, which means there should be no for() loops inside the kernels.
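Putting those fixes together, a sketch of the corrected portion of main() could look like the following (it keeps your launch configuration and also launches matrixAdd on the device pointers; the host-to-device copies become unnecessary once the initialization happens on the device):
// allocate device memory first
cudaMalloc(&d_A, size);
cudaMalloc(&d_B, size);
cudaMalloc(&d_C, size);

// initialize directly on the device arrays
matrixInit<<<BLOCKS,THREADS>>>(d_A, Nx, Ny, 1.0);
matrixInit<<<BLOCKS,THREADS>>>(d_B, Nx, Ny, 2.0);
matrixInit<<<BLOCKS,THREADS>>>(d_C, Nx, Ny, 0.0);

// add the matrices on the device arrays as well
matrixAdd<<<BLOCKS,THREADS>>>(d_A, d_B, d_C, Nx, Ny);

// copy the result back to the host
cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost);

// free device memory with cudaFree, host memory with free
cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
free(A); free(B); free(C);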

Vector addition in cuda

I am trying to learn CUDA programming, and when I execute the vector addition program from this tutorial, it does not work as expected: no error is thrown, but the answer isn't correct.
#include<stdio.h>
#include<stdlib.h>

#define N 20

__global__ void add(int *a, int *b, int *c){
    c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}

int main(void) {
    int *a, *b, *c;       // host copies of a, b, c
    int *d_a, *d_b, *d_c; // device copies of a, b, c
    int size = N * sizeof(int), i;
    // Alloc space for device copies of a, b, c
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);
    // Alloc space for host copies of a, b, c and setup input values
    a = (int *)malloc(size);
    b = (int *)malloc(size);
    c = (int *)malloc(size);
    for(i = 0; i < N; i++){
        a[i] = rand() % 100;
        b[i] = rand() % 50;
    }
    // Copy inputs to device
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
    // Launch add() kernel on GPU with N blocks
    add<<<N,1>>>(d_a, d_b, d_c);
    cudaDeviceSynchronize();
    // Copy result back to host
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
    // Cleanup
    for(i = 0; i < N; i++)
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}
The result is always shown as -1. I don't know why; can someone help me out here?
I just used my GPU to compile and run the program and it is working as expected. You have to check the error codes returned by cudaMalloc and cudaMemcpy and make sure they are not returning any errors.
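As an illustration, here is a minimal sketch of that kind of error checking; the CHECK_CUDA name is just an example and not part of the CUDA API:
// wrap every CUDA runtime call; print the error string and abort on failure
#define CHECK_CUDA(call) do {                                   \
    cudaError_t err = (call);                                   \
    if (err != cudaSuccess) {                                   \
        fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                cudaGetErrorString(err), __FILE__, __LINE__);   \
        exit(EXIT_FAILURE);                                     \
    }                                                           \
} while (0)

// usage:
CHECK_CUDA(cudaMalloc((void **)&d_a, size));
CHECK_CUDA(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
add<<<N,1>>>(d_a, d_b, d_c);
CHECK_CUDA(cudaGetLastError());        // catches launch errors
CHECK_CUDA(cudaDeviceSynchronize());   // catches errors during kernel execution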

Using thrust::device_vector in device code [duplicate]

I am a newbie to Thrust. I see that all Thrust presentations and examples only show host code.
I would like to know if I can pass a device_vector to my own kernel? How?
If yes, what are the operations permitted on it inside kernel/device code?
As it was originally written, Thrust is purely a host-side abstraction. It cannot be used inside kernels. You can pass the device memory encapsulated inside a thrust::device_vector to your own kernel like this:
thrust::device_vector< Foo > fooVector;
// Do something thrust-y with fooVector
Foo* fooArray = thrust::raw_pointer_cast( fooVector.data() );
// Pass raw array and its size to kernel
someKernelCall<<< x, y >>>( fooArray, fooVector.size() );
and you can also use device memory not allocated by thrust within thrust algorithms by instantiating a thrust::device_ptr with the bare cuda device memory pointer.
Edited four and a half years later to add that, as per @JackOLantern's answer, Thrust 1.8 adds a sequential execution policy, which means you can run single-threaded versions of Thrust's algorithms on the device. Note that it still isn't possible to directly pass a thrust device vector to a kernel, and device vectors can't be directly used in device code.
Note that it is also possible to use the thrust::device execution policy in some cases to have parallel thrust execution launched by a kernel as a child grid. This requires separate compilation/device linkage and hardware which supports dynamic parallelism. I am not certain whether this is supported in all thrust algorithms or not, but it certainly works with some.
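As a small sketch of the reverse direction mentioned above (using device memory not allocated by thrust within thrust algorithms), wrapping a bare CUDA pointer in a thrust::device_ptr might look like this; the array size and fill value are arbitrary for illustration:
#include <cstdio>
#include <cuda_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <thrust/reduce.h>

int main()
{
    const int n = 1024;
    float *raw_ptr;
    cudaMalloc((void **)&raw_ptr, n * sizeof(float));

    // wrap the bare CUDA pointer so that thrust algorithms can consume it
    thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(raw_ptr);
    thrust::fill(dev_ptr, dev_ptr + n, 1.0f);
    float sum = thrust::reduce(dev_ptr, dev_ptr + n);
    printf("sum = %f\n", sum);   // expect 1024.0

    cudaFree(raw_ptr);
    return 0;
}
If you instead launch thrust algorithms from device code with the thrust::device policy as described above, that typically requires compiling with relocatable device code (e.g. nvcc -rdc=true) and linking against cudadevrt.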
This is an update to my previous answer.
Starting from Thrust 1.8.1, CUDA Thrust primitives can be combined with the thrust::device execution policy to run in parallel within a single CUDA thread, exploiting CUDA dynamic parallelism. An example is reported below.
#include <stdio.h>

#include <thrust/reduce.h>
#include <thrust/execution_policy.h>

#include "TimingGPU.cuh"
#include "Utilities.cuh"

#define BLOCKSIZE_1D 256
#define BLOCKSIZE_2D_X 32
#define BLOCKSIZE_2D_Y 32

/*************************/
/* TEST KERNEL FUNCTIONS */
/*************************/
__global__ void test1(const float * __restrict__ d_data, float * __restrict__ d_results, const int Nrows, const int Ncols) {
    const unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid < Nrows) d_results[tid] = thrust::reduce(thrust::seq, d_data + tid * Ncols, d_data + (tid + 1) * Ncols);
}

__global__ void test2(const float * __restrict__ d_data, float * __restrict__ d_results, const int Nrows, const int Ncols) {
    const unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if (tid < Nrows) d_results[tid] = thrust::reduce(thrust::device, d_data + tid * Ncols, d_data + (tid + 1) * Ncols);
}

/********/
/* MAIN */
/********/
int main() {

    const int Nrows = 64;
    const int Ncols = 2048;

    gpuErrchk(cudaFree(0));
    // size_t DevQueue;
    // gpuErrchk(cudaDeviceGetLimit(&DevQueue, cudaLimitDevRuntimePendingLaunchCount));
    // DevQueue *= 128;
    // gpuErrchk(cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, DevQueue));

    float *h_data     = (float *)malloc(Nrows * Ncols * sizeof(float));
    float *h_results  = (float *)malloc(Nrows * sizeof(float));
    float *h_results1 = (float *)malloc(Nrows * sizeof(float));
    float *h_results2 = (float *)malloc(Nrows * sizeof(float));
    float sum = 0.f;
    for (int i=0; i<Nrows; i++) {
        h_results[i] = 0.f;
        for (int j=0; j<Ncols; j++) {
            h_data[i*Ncols+j] = i;
            h_results[i] = h_results[i] + h_data[i*Ncols+j];
        }
    }

    TimingGPU timerGPU;

    float *d_data;     gpuErrchk(cudaMalloc((void**)&d_data, Nrows * Ncols * sizeof(float)));
    float *d_results1; gpuErrchk(cudaMalloc((void**)&d_results1, Nrows * sizeof(float)));
    float *d_results2; gpuErrchk(cudaMalloc((void**)&d_results2, Nrows * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_data, h_data, Nrows * Ncols * sizeof(float), cudaMemcpyHostToDevice));

    timerGPU.StartCounter();
    test1<<<iDivUp(Nrows, BLOCKSIZE_1D), BLOCKSIZE_1D>>>(d_data, d_results1, Nrows, Ncols);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    printf("Timing approach nr. 1 = %f\n", timerGPU.GetCounter());

    gpuErrchk(cudaMemcpy(h_results1, d_results1, Nrows * sizeof(float), cudaMemcpyDeviceToHost));
    for (int i=0; i<Nrows; i++) {
        if (h_results1[i] != h_results[i]) {
            printf("Approach nr. 1; Error at i = %i; h_results1 = %f; h_results = %f", i, h_results1[i], h_results[i]);
            return 0;
        }
    }

    timerGPU.StartCounter();
    test2<<<iDivUp(Nrows, BLOCKSIZE_1D), BLOCKSIZE_1D>>>(d_data, d_results1, Nrows, Ncols);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    printf("Timing approach nr. 2 = %f\n", timerGPU.GetCounter());

    gpuErrchk(cudaMemcpy(h_results1, d_results1, Nrows * sizeof(float), cudaMemcpyDeviceToHost));
    for (int i=0; i<Nrows; i++) {
        if (h_results1[i] != h_results[i]) {
            printf("Approach nr. 2; Error at i = %i; h_results1 = %f; h_results = %f", i, h_results1[i], h_results[i]);
            return 0;
        }
    }

    printf("Test passed!\n");
}
The above example performs reductions of the rows of a matrix in the same sense as Reduce matrix rows with CUDA, but it is done differently from the above post, namely, by calling CUDA Thrust primitives directly from user-written kernels. Also, the above example serves to compare the performance of the same operations when done with the two execution policies, namely thrust::seq and thrust::device. Graphs showing the difference in performance are omitted here.
The performance has been evaluated on a Kepler K20c and on a Maxwell GeForce GTX 850M.
I would like to provide an updated answer to this question.
Starting from Thrust 1.8, CUDA Thrust primitives can be combined with the thrust::seq execution policy to run sequentially within a single CUDA thread (or sequentially within a single CPU thread). An example is reported below.
If you want parallel execution within a thread, then you may consider using CUB, which provides reduction routines that can be called from within a thread block, provided that your card enables dynamic parallelism.
Here is the example with Thrust:
#include <stdio.h>

#include <thrust/reduce.h>
#include <thrust/execution_policy.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void test(float *d_A, int N) {
    float sum = thrust::reduce(thrust::seq, d_A, d_A + N);
    printf("Device side result = %f\n", sum);
}

int main() {

    const int N = 16;

    float *h_A = (float*)malloc(N * sizeof(float));
    float sum = 0.f;
    for (int i=0; i<N; i++) {
        h_A[i] = i;
        sum = sum + h_A[i];
    }
    printf("Host side result = %f\n", sum);

    float *d_A; gpuErrchk(cudaMalloc((void**)&d_A, N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, N * sizeof(float), cudaMemcpyHostToDevice));

    test<<<1,1>>>(d_A, N);
}
If you mean using data allocated/processed by Thrust, yes you can; just get the raw pointer of the allocated data.
int * raw_ptr = thrust::raw_pointer_cast(dev_ptr);
If you want to allocate Thrust vectors in the kernel, I never tried, but I don't think it will work; and even if it does, I don't think it will provide any benefit.

Create 2D Array with CUDA

In the CUDA C Programming Guide there is a sample that shows a 2D array:
// Kernel definition
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < N && j < N)
        C[i][j] = A[i][j] + B[i][j];
}

int main()
{
    ...
    // Kernel invocation
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
    MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
    ...
}
I use a 2D array in the form below and it works correctly:
dim3 grid[COLUMNS][ROWS];
kernel_Matrix<<<grid,1>>>(dev_strA, dev_strB, dev_Matrix);
__global__ void add(int *a, int *b, int *c)
{
    int x = blockIdx.x;
    int y = blockIdx.y;
    int i = (COLUMNS*y) + x;
    c[i] = a[i] + b[i];
}
Is there a way to implement a 2D array with the [][] definition? I tested this way but it does not work.
dim3 is not an array but a structure defined in a CUDA header file (vector_types.h). This structure is used to specify the dimensions of the grid in the execution configuration of global functions, i.e. in <<< >>>. It doesn't hold the 'real' blocks; it just configures the number of blocks that will be executed.
The only two ways (to my knowledge) to initialize this structure are:
1. dim3 grid(x, y, z);
2. dim3 grid = {x, y, z};
EDIT:
Host code with dim3 initialization, and with the arrays passed to the kernel function in such a way that you can access their elements via [][] (the device pointers are declared as pointers to arrays of N floats, which matches the float A[N][N] kernel parameters):
float A[N][N];
float B[N][N];
float C[N][N];
float (*d_A)[N]; //pointers to arrays of dimension N
float (*d_B)[N];
float (*d_C)[N];
for(int i = 0; i < N; i++) {
    for(int j = 0; j < N; j++) {
        A[i][j] = i;
        B[i][j] = j;
    }
}
//allocation
cudaMalloc((void**)&d_A, (N*N)*sizeof(float));
cudaMalloc((void**)&d_B, (N*N)*sizeof(float));
cudaMalloc((void**)&d_C, (N*N)*sizeof(float));
//copying from host to device
cudaMemcpy(d_A, A, (N*N)*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, (N*N)*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_C, C, (N*N)*sizeof(float), cudaMemcpyHostToDevice);
// Kernel invocation
dim3 threadsPerBlock(16, 16);
dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
MatAdd<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C);
//copying from device to host
cudaMemcpy(A, (d_A), (N*N)*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(B, (d_B), (N*N)*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(C, (d_C), (N*N)*sizeof(float), cudaMemcpyDeviceToHost);

The kernel delay increases when increasing blocksPerGrid and threadsPerBlock in the CUDA VecAdd example. What does it mean?

When I tested the following example, I found that increasing blocksPerGrid and threadsPerBlock increases the kernel delay, such that if
int threadsPerBlock = 1;
int blocksPerGrid = 1;
(blocksPerGrid and threadsPerBlock both equal to 1), the delay of the kernel is 0.0072 ms,
but when I make the following change, the delay becomes higher: 0.049 ms
int threadsPerBlock = 1024;
int blocksPerGrid = (N+threadsPerBlock-1) / threadsPerBlock;
where
N = 50000; //the no. of array elements
The complete VecAdd example follows; you can test it.
// Includes
#include <stdio.h>
#include <cutil_inline.h>
#include <shrQATest.h>

// Variables
float* h_A;
float* h_B;
float* h_C;
float* d_A;
float* d_B;
float* d_C;
bool noprompt = false;

// Functions
void CleanupResources(void);
void RandomInit(float*, int);
void ParseArguments(int, char**);

// Device code
__global__ void VecAdd(const float* A, const float* B, float* C, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N)
        C[i] = A[i] + B[i];
}

// Host code
int main(int argc, char** argv)
{
    shrQAStart(argc, argv);

    cudaEvent_t event1, event2;
    cudaEventCreate(&event1);
    cudaEventCreate(&event2);

    printf("Vector Addition\n");
    int N = 50000;
    size_t size = N * sizeof(float);
    ParseArguments(argc, argv);

    // Allocate input vectors h_A and h_B in host memory
    h_A = (float*)malloc(size);
    if (h_A == 0) CleanupResources();
    h_B = (float*)malloc(size);
    if (h_B == 0) CleanupResources();
    h_C = (float*)malloc(size);
    if (h_C == 0) CleanupResources();

    // Initialize input vectors
    RandomInit(h_A, N);
    RandomInit(h_B, N);

    // Allocate vectors in device memory
    cutilSafeCall( cudaMalloc((void**)&d_A, size) );
    cutilSafeCall( cudaMalloc((void**)&d_B, size) );
    cutilSafeCall( cudaMalloc((void**)&d_C, size) );

    // Copy vectors from host memory to device memory
    cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );
    cutilSafeCall( cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice) );

    // Invoke kernel
    int threadsPerBlock = 1024;
    int blocksPerGrid = (N+threadsPerBlock-1) / threadsPerBlock;

    cudaEventRecord(event1, 0);
    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    cudaEventRecord(event2, 0);

    cudaEventSynchronize(event1); //optional
    cudaEventSynchronize(event2);

    float dt_ms;
    cudaEventElapsedTime(&dt_ms, event1, event2);
    printf("delay_time = %f\n", dt_ms);

    cutilCheckMsg("kernel launch failure");
#ifdef _DEBUG
    cutilSafeCall( cutilDeviceSynchronize() );
#endif

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    cutilSafeCall( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) );

    // Verify result
    int i;
    for (i = 0; i < N; ++i) {
        float sum = h_A[i] + h_B[i];
        if (fabs(h_C[i] - sum) > 1e-5)
            break;
    }

    CleanupResources();
    shrQAFinishExit(argc, (const char **)argv, (i==N) ? QA_PASSED : QA_FAILED);
}

void CleanupResources(void)
{
    // Free device memory
    if (d_A)
        cudaFree(d_A);
    if (d_B)
        cudaFree(d_B);
    if (d_C)
        cudaFree(d_C);

    // Free host memory
    if (h_A)
        free(h_A);
    if (h_B)
        free(h_B);
    if (h_C)
        free(h_C);

    cutilDeviceReset();
}

// Allocates an array with random float entries.
void RandomInit(float* data, int n)
{
    for (int i = 0; i < n; ++i)
        data[i] = rand() / (float)RAND_MAX;
}

// Parse program arguments
void ParseArguments(int argc, char** argv)
{
    for (int i = 0; i < argc; ++i) {
        if (strcmp(argv[i], "--noprompt") == 0 ||
            strcmp(argv[i], "-noprompt") == 0)
        {
            noprompt = true;
            break;
        }
    }
}
Can anyone explain to me what this means?
In case 1, a kernel of 1 thread is launched and performs 2 read and 1 write operations. In case 2, a kernel of 50,176 threads is launched and performs 100,000 read and 50,000 write operations. Increasing the workload by a factor of 50,000 increased the execution time by only ~7x. The work done by the two launches is significantly different.
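To put the timings in perspective (a rough back-of-the-envelope calculation from the numbers quoted above): case 1 takes about 0.0072 ms for a single element, i.e. roughly 7.2 µs, which is essentially fixed kernel-launch overhead; case 2 takes about 0.049 ms for 50,000 elements, i.e. roughly 1 ns per element. The total time grows by about 7x, but the time per element drops by a factor of roughly 7,000, which is exactly why you launch many threads instead of one.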