I'm using the following code to learn how to use CUDA graphs. The parameter NSTEP is set to 1000, and the parameter NKERNEL is set to 20. The kernel function shortKernel takes three parameters and performs a simple calculation.
#include <cstdio>
#include <cuda_runtime.h>
#include <iostream>

#define N 131072 // tuned such that kernel takes a few microseconds
#define NSTEP 1000
#define NKERNEL 20
#define BLOCKS 256
#define THREADS 512

#define CHECK(call) \
do { \
    const cudaError_t error_code = call; \
    if (error_code != cudaSuccess) { \
        printf("CUDA Error\n"); \
        printf("    File:       %s\n", __FILE__); \
        printf("    Line:       %d\n", __LINE__); \
        printf("    Error code: %d\n", error_code); \
        printf("    Error text: %s\n", cudaGetErrorString(error_code)); \
        exit(1); \
    } \
} while (0)

__global__ void shortKernel(float *out_d, float *in_d, int i){
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx < N) out_d[idx] = 1.23*in_d[idx] + i;
}
void test2() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaSetDevice(0);

    float x_host[N], y_host[N];
    // initialize x and y arrays on the host
    for (int i = 0; i < N; i++) {
        x_host[i] = 2.0f;
        y_host[i] = 2.0f;
    }

    float *x, *y, *z;
    CHECK(cudaMalloc((void**)&x, N*sizeof(float)));
    CHECK(cudaMalloc((void**)&y, N*sizeof(float)));
    CHECK(cudaMalloc((void**)&z, N*sizeof(float)));
    cudaMemcpy(x, x_host, sizeof(float) * N, cudaMemcpyHostToDevice);

    cudaEvent_t begin, end;
    CHECK(cudaEventCreate(&begin));
    CHECK(cudaEventCreate(&end));
    // start recording
    cudaEventRecord(begin, stream);

    bool graphCreated = false;
    cudaGraph_t graph;
    cudaGraphExec_t instance;
    // Run graphs
    for (int istep = 0; istep < NSTEP; istep++) {
        if (!graphCreated) {
            cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
            for (int ikrnl = 0; ikrnl < NKERNEL; ikrnl++) {
                shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, ikrnl);
            }
            cudaStreamEndCapture(stream, &graph);

            cudaGraphNode_t *nodes = NULL;
            size_t num_nodes = 0;
            CHECK(cudaGraphGetNodes(graph, nodes, &num_nodes));
            std::cout << "Num of nodes in the graph: " << num_nodes << std::endl;

            CHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
            graphCreated = true;
        }
        CHECK(cudaGraphLaunch(instance, stream));
        cudaStreamSynchronize(stream);
    } // End run graphs

    cudaEventRecord(end, stream);
    cudaEventSynchronize(end);
    float time_ms = 0;
    cudaEventElapsedTime(&time_ms, begin, end);
    std::cout << "CUDA Graph - CUDA Kernel overall time: " << time_ms << " ms" << std::endl;

    cudaMemcpy(y_host, y, sizeof(float) * N, cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; i++) {
        std::cout << "res " << y_host[i] << std::endl;
    }

    // Free memory
    cudaFree(x);
    cudaFree(y);
    cudaFree(z);
}

int main() {
    test2();
    std::cout << "end" << std::endl;
    return 0;
}
My expected results are as follows:
res 2.46
res 3.46
res 4.46
res 5.46
res 6.46
...
However, the actual results look like this:
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
...
It seems that every kernel's parameter i is set to NKERNEL-1. I am very confused about this; could someone offer an explanation? Thanks!
I changed the for loop as follows:
// Run graphs
for (int istep = 0; istep < NSTEP; istep++) {
    if (!graphCreated) {
        cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
        for (int ikrnl = 0; ikrnl < NKERNEL; ikrnl++) {
            if (ikrnl == 0)
                shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, 0);
            else if (ikrnl == 1)
                shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, 1);
            else if (ikrnl == 2)
                shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, 2);
            else
                shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, ikrnl);
        }
        cudaStreamEndCapture(stream, &graph);

        cudaGraphNode_t *nodes = NULL;
        size_t num_nodes = 0;
        CHECK(cudaGraphGetNodes(graph, nodes, &num_nodes));
        std::cout << "Num of nodes in the graph: " << num_nodes << std::endl;

        CHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
        graphCreated = true;
    }
    CHECK(cudaGraphLaunch(instance, stream));
    cudaStreamSynchronize(stream);
} // End run graphs
However, the results are still the same:
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
...
The results are expected and correct.
Every time you run the graph, this entire for-loop gets executed:
for (int ikrnl = 0; ikrnl < NKERNEL; ikrnl++) {
    shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, ikrnl);
}
After the first iteration of that for-loop (ikrnl = 0), the results will all be 2.46 (1.23 * 2.0 + 0); after the second iteration they will all be 3.46; and after the 20th iteration (ikrnl = 19) they will all be 21.46 (1.23 * 2.0 + 19).
Every time you run the graph, you will get that same result.
Expecting any kind of variation in the result, such as:
res 2.46
res 3.46
res 4.46
res 5.46
res 6.46
is completely illogical, because every thread is doing precisely the same thing. Every thread starts with the same value in x and does the same calculation on it. There is no reason to expect any difference between y[0] and y[1], for example.
Rather than trying to wade through CUDA graphs, since it's clear you don't yet have a good grasp of what the kernel is doing, my suggestion would be to write an ordinary CUDA program that calls that kernel just once, without any CUDA graph usage, and study the output. After that, put a for-loop around the kernel and watch the result after every iteration of the loop. You don't need CUDA graphs to understand what is going on here.
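For instance, here is a minimal sketch of that exercise (my own illustration, reusing the question's N, BLOCKS, THREADS, and shortKernel), which prints one representative element after every launch:

#include <cstdio>
#include <cuda_runtime.h>

#define N 131072
#define BLOCKS 256
#define THREADS 512
#define NKERNEL 20

__global__ void shortKernel(float *out_d, float *in_d, int i){
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx < N) out_d[idx] = 1.23*in_d[idx] + i;
}

int main() {
    float *x, *y;
    cudaMalloc((void**)&x, N*sizeof(float));
    cudaMalloc((void**)&y, N*sizeof(float));
    // fill x with 2.0f on the host, then copy it to the device
    float *x_host = new float[N];
    for (int i = 0; i < N; i++) x_host[i] = 2.0f;
    cudaMemcpy(x, x_host, N*sizeof(float), cudaMemcpyHostToDevice);
    // launch the kernel in a plain loop, printing one element after each launch;
    // every element of y is identical, so y[0] is representative
    for (int ikrnl = 0; ikrnl < NKERNEL; ikrnl++) {
        shortKernel<<<BLOCKS, THREADS>>>(y, x, ikrnl);
        float y0;
        cudaMemcpy(&y0, y, sizeof(float), cudaMemcpyDeviceToHost);
        printf("after launch %d: y[0] = %f\n", ikrnl, y0);
    }
    delete[] x_host;
    cudaFree(x);
    cudaFree(y);
    return 0;
}

Running this, the printed value goes 2.46, 3.46, ... up to 21.46 after the last launch (1.23 * 2.0 + 19), which is exactly the value the graph version leaves in y on every replay.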
A lot of CUDA samples show that you have to put data from global memory into shared memory before using it.
For example, consider a function that sums values in 5x5 squares. The profiler shows that the version with no shared memory runs about 20% faster.
Do I have to put my data into shared memory, or will Maxwell put the data into the L1 cache automatically?
Shared memory is still a useful optimization for many codes, even on Maxwell.
If you have a 2D stencil code (which appears to be what you are describing), I would certainly expect the version that runs out of shared memory to perform faster, assuming you are doing the shared memory adaptation/usage correctly.
Here's a fully worked example of a 2D stencil code, in both shared memory and non-shared-memory versions, running on a GTX 960. The shared memory version runs about 33% faster:
non-shared memory version:
$ cat example3a_imp.cu
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// these are just for timing measurements
#include <time.h>

// Code that reads values from a 2D grid and for each node in the grid finds the minimum
// value among all values stored in cells sharing that node, and stores the minimum
// value in that node.

// define the window size (square window) and the data set size
#define WSIZE 16
#define DATAHSIZE 8000
#define DATAWSIZE 16000
#define CHECK_VAL 1
#define MIN(X,Y) ((X<Y)?X:Y)
#define BLKWSIZE 32
#define BLKHSIZE 32

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

typedef int oArray[DATAHSIZE];
typedef int iArray[DATAHSIZE+WSIZE];

__global__ void cmp_win(oArray *output, const iArray *input)
{
    int tempout, i, j;
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    int idy = blockIdx.y*blockDim.y + threadIdx.y;
    if ((idx < DATAHSIZE) && (idy < DATAWSIZE)){
        tempout = output[idy][idx];
        #pragma unroll
        for (i=0; i<WSIZE; i++)
            #pragma unroll
            for (j=0; j<WSIZE; j++)
                if (input[idy + i][idx + j] < tempout)
                    tempout = input[idy + i][idx + j];
        output[idy][idx] = tempout;
    }
}

int main(int argc, char *argv[])
{
    int i, j;
    const dim3 blockSize(BLKHSIZE, BLKWSIZE, 1);
    const dim3 gridSize(((DATAHSIZE+BLKHSIZE-1)/BLKHSIZE), ((DATAWSIZE+BLKWSIZE-1)/BLKWSIZE), 1);
    // these are just for timing
    clock_t t0, t1, t2;
    double t1sum=0.0;
    double t2sum=0.0;
    // overall data set sizes
    const int nr = DATAHSIZE;
    const int nc = DATAWSIZE;
    // window dimensions
    const int wr = WSIZE;
    const int wc = WSIZE;
    // pointers for data set storage via malloc
    iArray *h_in, *d_in;
    oArray *h_out, *d_out;
    // start timing
    t0 = clock();
    // allocate storage for data set
    if ((h_in = (iArray *)malloc(((nr+wr)*(nc+wc))*sizeof(int))) == 0) {printf("malloc Fail \n"); exit(1);}
    if ((h_out = (oArray *)malloc((nr*nc)*sizeof(int))) == 0) {printf("malloc Fail \n"); exit(1);}
    // synthesize data
    printf("Begin init\n");
    memset(h_in, 0x7F, (nr+wr)*(nc+wc)*sizeof(int));
    memset(h_out, 0x7F, (nr*nc)*sizeof(int));
    for (i=0; i<nc+wc; i+=wc)
        for (j=0; j<nr+wr; j+=wr)
            h_in[i][j] = CHECK_VAL;
    t1 = clock();
    t1sum = ((double)(t1-t0))/CLOCKS_PER_SEC;
    printf("Init took %f seconds. Begin compute\n", t1sum);
    // allocate GPU device buffers
    cudaMalloc((void **) &d_in, (((nr+wr)*(nc+wc))*sizeof(int)));
    cudaCheckErrors("Failed to allocate device buffer");
    cudaMalloc((void **) &d_out, ((nr*nc)*sizeof(int)));
    cudaCheckErrors("Failed to allocate device buffer2");
    // copy data to GPU
    cudaMemcpy(d_out, h_out, ((nr*nc)*sizeof(int)), cudaMemcpyHostToDevice);
    cudaCheckErrors("CUDA memcpy failure");
    cudaMemcpy(d_in, h_in, (((nr+wr)*(nc+wc))*sizeof(int)), cudaMemcpyHostToDevice);
    cudaCheckErrors("CUDA memcpy2 failure");
    cmp_win<<<gridSize,blockSize>>>(d_out, d_in);
    cudaCheckErrors("Kernel launch failure");
    // copy output data back to host
    cudaMemcpy(h_out, d_out, ((nr*nc)*sizeof(int)), cudaMemcpyDeviceToHost);
    cudaCheckErrors("CUDA memcpy3 failure");
    t2 = clock();
    t2sum = ((double)(t2-t1))/CLOCKS_PER_SEC;
    printf("Done. Compute took %f seconds\n", t2sum);
    for (i=0; i<nc; i++)
        for (j=0; j<nr; j++)
            if (h_out[i][j] != CHECK_VAL) {printf("mismatch at %d,%d, was: %d should be: %d\n", i, j, h_out[i][j], CHECK_VAL); return 1;}
    printf("Results pass\n");
    return 0;
}
shared memory version:
$ cat example3b_imp.cu
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// these are just for timing measurements
#include <time.h>

// Code that reads values from a 2D grid and for each node in the grid finds the minimum
// value among all values stored in cells sharing that node, and stores the minimum
// value in that node.

// define the window size (square window) and the data set size
#define WSIZE 16
#define DATAHSIZE 8000
#define DATAWSIZE 16000
#define CHECK_VAL 1
#define MIN(X,Y) ((X<Y)?X:Y)
#define BLKWSIZE 32
#define BLKHSIZE 32

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

typedef int oArray[DATAHSIZE];
typedef int iArray[DATAHSIZE+WSIZE];

__global__ void cmp_win(oArray *output, const iArray *input)
{
    __shared__ int smem[(BLKHSIZE + (WSIZE-1))][(BLKWSIZE + (WSIZE-1))];
    int tempout, i, j;
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    int idy = blockIdx.y*blockDim.y + threadIdx.y;
    if ((idx < DATAHSIZE) && (idy < DATAWSIZE)){
        smem[threadIdx.y][threadIdx.x] = input[idy][idx];
        if (threadIdx.y > (BLKWSIZE - WSIZE))
            smem[threadIdx.y + (WSIZE-1)][threadIdx.x] = input[idy+(WSIZE-1)][idx];
        if (threadIdx.x > (BLKHSIZE - WSIZE))
            smem[threadIdx.y][threadIdx.x + (WSIZE-1)] = input[idy][idx+(WSIZE-1)];
        if ((threadIdx.x > (BLKHSIZE - WSIZE)) && (threadIdx.y > (BLKWSIZE - WSIZE)))
            smem[threadIdx.y + (WSIZE-1)][threadIdx.x + (WSIZE-1)] = input[idy+(WSIZE-1)][idx+(WSIZE-1)];
        __syncthreads();
        tempout = output[idy][idx];
        for (i=0; i<WSIZE; i++)
            for (j=0; j<WSIZE; j++)
                if (smem[threadIdx.y + i][threadIdx.x + j] < tempout)
                    tempout = smem[threadIdx.y + i][threadIdx.x + j];
        output[idy][idx] = tempout;
    }
}

int main(int argc, char *argv[])
{
    int i, j;
    const dim3 blockSize(BLKHSIZE, BLKWSIZE, 1);
    const dim3 gridSize(((DATAHSIZE+BLKHSIZE-1)/BLKHSIZE), ((DATAWSIZE+BLKWSIZE-1)/BLKWSIZE), 1);
    // these are just for timing
    clock_t t0, t1, t2;
    double t1sum=0.0;
    double t2sum=0.0;
    // overall data set sizes
    const int nr = DATAHSIZE;
    const int nc = DATAWSIZE;
    // window dimensions
    const int wr = WSIZE;
    const int wc = WSIZE;
    // pointers for data set storage via malloc
    iArray *h_in, *d_in;
    oArray *h_out, *d_out;
    // start timing
    t0 = clock();
    // allocate storage for data set
    if ((h_in = (iArray *)malloc(((nr+wr)*(nc+wc))*sizeof(int))) == 0) {printf("malloc Fail \n"); exit(1);}
    if ((h_out = (oArray *)malloc((nr*nc)*sizeof(int))) == 0) {printf("malloc Fail \n"); exit(1);}
    // synthesize data
    printf("Begin init\n");
    memset(h_in, 0x7F, (nr+wr)*(nc+wc)*sizeof(int));
    memset(h_out, 0x7F, (nr*nc)*sizeof(int));
    for (i=0; i<nc+wc; i+=wc)
        for (j=0; j<nr+wr; j+=wr)
            h_in[i][j] = CHECK_VAL;
    t1 = clock();
    t1sum = ((double)(t1-t0))/CLOCKS_PER_SEC;
    printf("Init took %f seconds. Begin compute\n", t1sum);
    // allocate GPU device buffers
    cudaMalloc((void **) &d_in, (((nr+wr)*(nc+wc))*sizeof(int)));
    cudaCheckErrors("Failed to allocate device buffer");
    cudaMalloc((void **) &d_out, ((nr*nc)*sizeof(int)));
    cudaCheckErrors("Failed to allocate device buffer2");
    // copy data to GPU
    cudaMemcpy(d_out, h_out, ((nr*nc)*sizeof(int)), cudaMemcpyHostToDevice);
    cudaCheckErrors("CUDA memcpy failure");
    cudaMemcpy(d_in, h_in, (((nr+wr)*(nc+wc))*sizeof(int)), cudaMemcpyHostToDevice);
    cudaCheckErrors("CUDA memcpy2 failure");
    cmp_win<<<gridSize,blockSize>>>(d_out, d_in);
    cudaCheckErrors("Kernel launch failure");
    // copy output data back to host
    cudaMemcpy(h_out, d_out, ((nr*nc)*sizeof(int)), cudaMemcpyDeviceToHost);
    cudaCheckErrors("CUDA memcpy3 failure");
    t2 = clock();
    t2sum = ((double)(t2-t1))/CLOCKS_PER_SEC;
    printf("Done. Compute took %f seconds\n", t2sum);
    for (i=0; i<nc; i++)
        for (j=0; j<nr; j++)
            if (h_out[i][j] != CHECK_VAL) {printf("mismatch at %d,%d, was: %d should be: %d\n", i, j, h_out[i][j], CHECK_VAL); return 1;}
    printf("Results pass\n");
    return 0;
}
test:
$ nvcc -O3 -arch=sm_52 example3a_imp.cu -o ex3
$ nvcc -O3 -arch=sm_52 example3b_imp.cu -o ex3_shared
$ ./ex3
Begin init
Init took 0.986819 seconds. Begin compute
Done. Compute took 2.162276 seconds
Results pass
$ ./ex3_shared
Begin init
Init took 0.987281 seconds. Begin compute
Done. Compute took 1.522475 seconds
Results pass
$
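A closing note on why the shared memory version wins here: in the naive version, each input element is re-read from global memory by up to WSIZE x WSIZE = 256 different threads, whereas the shared memory version stages each element of a block's tile into on-chip storage once and then reuses it from there. Maxwell's caches presumably capture a good deal of this reuse automatically, which is why the naive version is not dramatically slower, but explicitly managed shared memory still delivers the roughly 33% improvement measured above for this access pattern.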
I am new to CUDA programming. I was working on sample code that multiplies a matrix by a vector and prints the result. I am using the cuBLAS Dgemv API to do the multiplication. On running the program under cuda-memcheck I get the following error:
Error: process didn't terminate successfully
========= The application may have hit an error when dereferencing Unified Memory from the host. Please rerun the application under cuda-gdb or Nsight Eclipse Edition to catch host side errors.
========= Internal error (20)
========= No CUDA-MEMCHECK results found
The minimal complete code is here:
#include <thrust/device_vector.h>
#include <cublas_v2.h>
#include <iostream>

int main(void)
{
    int rowDimension = 3;    // number of rows
    int columnDimension = 6; // number of columns

    // initialize data
    thrust::device_vector<double> weightMatrix;
    weightMatrix.resize(rowDimension * columnDimension);

    thrust::device_vector<double> inputVector;
    inputVector.resize(columnDimension);

    thrust::device_vector<double> F;
    F.resize(rowDimension);

    for (size_t i = 0; i < rowDimension; i++)
        for (size_t j = 0; j < columnDimension; j++)
            weightMatrix[j * rowDimension + i] = i;

    for (size_t j = 0; j < columnDimension; j++)
        inputVector[j] = j;

    for (size_t i = 0; i < rowDimension; i++)
        F[i] = 0;

    /* Initialize CUBLAS */
    cublasHandle_t handle;
    cublasStatus_t status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS)
        std::cerr << "!!!! CUBLAS initialization error\n";

    double alpha = 1.0f;
    // cudaDeviceSynchronize();
    status = cublasDgemv(handle, CUBLAS_OP_N, rowDimension, columnDimension, &alpha,
                         thrust::raw_pointer_cast(weightMatrix.data()), rowDimension,
                         thrust::raw_pointer_cast(inputVector.data()), 1, 0,
                         thrust::raw_pointer_cast(F.data()), 1);
    // cudaDeviceSynchronize();
    if (status != CUBLAS_STATUS_SUCCESS)
        std::cerr << "!!!! kernel execution error.\n";

    for (size_t j = 0; j < rowDimension; j++)
        std::cout << F[j] << " ";

    status = cublasDestroy(handle);
    if (status != CUBLAS_STATUS_SUCCESS)
        std::cerr << "!!!! shutdown error (A)\n";

    return 0;
}
The above program produces a segmentation fault at the cublasDgemv call. On running cuda-memcheck I get the message reported above. Googling didn't turn up much help.
Can someone please help me resolve this issue?
Have a look at the documentation of cublasDgemv.
The signature is:
cublasStatus_t cublasDgemv(cublasHandle_t handle,
                           cublasOperation_t trans,
                           int m,
                           int n,
                           const double *alpha,
                           const double *A,
                           int lda,
                           const double *x,
                           int incx,
                           const double *beta,
                           double *y,
                           int incy)
beta has to be supplied as a pointer, but you pass the literal 0 (a null pointer) instead of a pointer to a value of 0.
So the following will fix your problem:
double alpha = 1.0;
double beta = 0;
status = cublasDgemv(handle,
                     CUBLAS_OP_N,
                     rowDimension,
                     columnDimension,
                     &alpha,
                     thrust::raw_pointer_cast(weightMatrix.data()),
                     rowDimension,
                     thrust::raw_pointer_cast(inputVector.data()),
                     1,
                     &beta, // note the change here!
                     thrust::raw_pointer_cast(F.data()),
                     1);
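As a side note on why a pointer is required: cuBLAS can read scalars such as alpha and beta either from host or from device memory, controlled by the handle's pointer mode (host mode is the default). A small illustrative snippet, not required for the fix above:

// Host pointer mode is the default; shown only for clarity.
// Use CUBLAS_POINTER_MODE_DEVICE instead to pass device-resident scalars.
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);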
I want to calculate the average of the values over a whole image in CUDA. To test how reduction over a 2D array works, I wrote the kernel below. The final output o should be the sum of all the image values. The input g is a 2D array with the value 1 in every pixel, yet the program reports 0 as the sum, which seems weird to me.
I imitated the 1D-array reduction in this tutorial http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf and wrote this 2D form. I am new to CUDA, and suggestions about potential bugs and improvements are welcome!
Just to add one comment: I know it makes sense to calculate the average over a 1D array, but I want to explore and test more complicated reduction behaviours. It might not be the right approach, but it is just a test; I hope anyone can give me more suggestions about common reduction practices.
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

cudaEvent_t start, stop;
float elapsedTime;

__global__ void
reduce(float *g, float *o, const int dimx, const int dimy)
{
    extern __shared__ float sdata[];

    unsigned int tid_x = threadIdx.x;
    unsigned int tid_y = threadIdx.y;

    unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;

    if (i >= dimx || j >= dimy)
        return;

    sdata[tid_x*blockDim.y + tid_y] = g[i*dimy + j];
    __syncthreads();

    for (unsigned int s_y = blockDim.y/2; s_y > 0; s_y >>= 1)
    {
        if (tid_y < s_y)
        {
            sdata[tid_x * dimy + tid_y] += sdata[tid_x * dimy + tid_y + s_y];
        }
        __syncthreads();
    }

    for (unsigned int s_x = blockDim.x/2; s_x > 0; s_x >>= 1)
    {
        if (tid_x < s_x)
        {
            sdata[tid_x * dimy] += sdata[(tid_x + s_x) * dimy];
        }
        __syncthreads();
    }

    float sum;
    if (tid_x == 0 && tid_y == 0)
    {
        sum = sdata[0];
        atomicAdd(o, sum); // The result should be the sum of all pixel values. But the program produces 0
    }

    //if (tid_x == 0 && tid_y == 0)
    //    o[blockIdx.x] = sdata[0];
}

int
main()
{
    int dimx = 320;
    int dimy = 160;

    int num_bytes = dimx*dimy*sizeof(float);

    float *d_a, *h_a, // device and host pointers
          *d_o=0, *h_o=0;

    h_a = (float*)malloc(num_bytes);
    h_o = (float*)malloc(sizeof(float));

    srand(time(NULL));
    for (int i=0; i < dimx; i++)
    {
        for (int j=0; j < dimy; j++)
        {
            h_a[i*dimy + j] = 1;
        }
    }

    cudaMalloc( (void**)&d_a, num_bytes );
    cudaMalloc( (void**)&d_o, sizeof(int) );

    cudaMemcpy( d_a, h_a, num_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy( d_o, h_o, sizeof(int), cudaMemcpyHostToDevice);

    dim3 grid, block;
    block.x = 4;
    block.y = 4;
    grid.x = dimx / block.x;
    grid.y = dimy / block.y;

    cudaEventCreate(&start);
    cudaEventRecord(start, 0);

    int sizeofSharedMemory = dimx*dimy*sizeof(float);
    reduce<<<grid, block, sizeofSharedMemory>>> (d_a, d_o, block.x, block.y);

    cudaEventCreate(&stop);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    std::cout << "This kernel runs: " << elapsedTime << "ms" << std::endl;
    std::cout << block.x << " " << block.y << std::endl;
    std::cout << grid.x << " " << grid.y << std::endl;
    std::cout << dimx << " " << dimy << " " << dimx*dimy << std::endl;

    std::cout << "The sum is:" << *h_o << std::endl;

    cudaMemcpy( h_a, d_a, num_bytes, cudaMemcpyDeviceToHost );
    cudaMemcpy( h_o, d_o, sizeof(int), cudaMemcpyDeviceToHost );

    free(h_a);
    free(h_o);
    cudaFree(d_a);
    cudaFree(d_o);
}
If you do basic CUDA error checking you will discover that your reduce kernel is not even running. The reason is as follows:

int dimx = 320;
int dimy = 160;
...
int sizeofSharedMemory = dimx*dimy*sizeof(float); // = 204800

reduce<<<grid, block, sizeofSharedMemory>>> (d_a, d_o, block.x, block.y);
                      ^
                      |
               204800 is illegal here
You cannot request 204800 bytes of shared memory dynamically (or any other way). The maximum is slightly less than 48K bytes.
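As an aside, you can query the per-block limit on your device at runtime; a small sketch of my own using the standard runtime API:

int dev = 0, maxSmem = 0;
cudaGetDevice(&dev);
cudaDeviceGetAttribute(&maxSmem, cudaDevAttrMaxSharedMemoryPerBlock, dev);
printf("max shared memory per block: %d bytes\n", maxSmem); // commonly 49152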
If you had done proper CUDA error checking, you would have discovered that your kernel is not running, and you would have gotten an instructive error message indicating that the launch configuration (the numbers between the <<< ... >>>) is invalid. Shared memory is requested on a per-block basis, and it's probably not sensible to request enough shared memory to cover your entire 2D data set when each block only consists of a 4x4 thread array. You probably just need enough for the data that will be accessed by each 4x4 thread array, as sketched below.
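For example (my suggested correction, sized on the assumption that each block only needs one float per thread, which is how your kernel indexes sdata):

// request shared memory per block: one float per thread in the 4x4 block
int sizeofSharedMemory = block.x * block.y * sizeof(float); // 64 bytes
reduce<<<grid, block, sizeofSharedMemory>>> (d_a, d_o, block.x, block.y);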
After you have properly instrumented your code with CUDA error checking, and detected and corrected all the errors, then run your code with cuda-memcheck. This will do an additional level of error checking to point out any kernel access errors. You may also use cuda-memcheck if you are getting an unspecified launch failure; it may help pinpoint the issue.
After you have done these basic troubleshooting steps, then it might make sense to ask others for help. But use the power of the tools you have been given first.
I also want to point out one other error before you come back and post this code again, asking for help.
This will not be useful:
std::cout << "The sum is:" << *h_o << std::endl;
cudaMemcpy( h_a, d_a, num_bytes, cudaMemcpyDeviceToHost );
cudaMemcpy( h_o, d_o, sizeof(int), cudaMemcpyDeviceToHost );
You are printing out the sum before you have copied the sum from the device to the host.
Reverse the order of these steps:
cudaMemcpy( h_a, d_a, num_bytes, cudaMemcpyDeviceToHost );
cudaMemcpy( h_o, d_o, sizeof(int), cudaMemcpyDeviceToHost );
std::cout << "The sum is:" << *h_o << std::endl;
I wrote my first program using CUDA + cuBLAS. It just uses the cublasDgemm function to compute a product of two N*N matrices.
However, every time I launched the program, it kept producing the same wrong answer (e.g., when multiplying a 1*1 matrix containing 5 as its single element by a 1*1 matrix containing 6, it always said the result was 36, not 30).
I checked the program several times with no success. But when I came back to it the next day (i.e., after a reboot), it worked just fine. I don't remember whether I recompiled it or not, but it is the same VS project, same code, and the same computer with its GPU.
So, can anyone explain why that could have happened? And should I expect the same strange behaviour in the future?
Here is the code I was launching:
#include <iostream>
#include <string>
#include <iomanip>
#include <cuda_runtime.h>
#include <cublas_v2.h>

const int N = 5;
#define IDX2F(i,j) ((i) * N + j)

void fail(const cudaError_t& cudaStatus, const std::string& errorMessage) {
    if (cudaStatus != cudaSuccess) {
        std::cerr << errorMessage << std::endl;
        exit(EXIT_FAILURE);
    }
}

void fail(const cublasStatus_t& status, const std::string& errorMessage) {
    if (status != CUBLAS_STATUS_SUCCESS) {
        std::cerr << errorMessage << std::endl;
        exit(EXIT_FAILURE);
    }
}

void printMatrix(const double *C) {
    for (int i=0; i<N; i++) {
        for (int j=0; j<N; j++) {
            std::cout << std::fixed << std::setprecision(2) << C[IDX2F(i,j)] << ' ';
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

int main(int argc, char **argv) {
    cudaError_t cudaStatus;
    cublasStatus_t status;
    cublasHandle_t handle;

    double *A = new double[N*N];
    double *devPtrA;
    double *B = new double[N*N];
    double *devPtrB;
    double *C = new double[N*N];
    double *devPtrC;

    for (int i=0; i<N; i++)
        for (int j=0; j<N; j++)
            A[IDX2F(i,j)] = i + j;

    for (int i=0; i<N; i++)
        for (int j=0; j<N; j++)
            B[IDX2F(i,j)] = i + j * 0.5;

    // do not have to set anything into matrix C, because beta = 0

    // allocate memory on GPU
    cudaStatus = cudaMalloc((void**)&devPtrC, N*N*sizeof(*C));
    fail(cudaStatus, "device memory allocation failed");
    cudaStatus = cudaMalloc((void**)&devPtrA, N*N*sizeof(*A));
    fail(cudaStatus, "device memory allocation failed");
    cudaStatus = cudaMalloc((void**)&devPtrB, N*N*sizeof(*B));
    fail(cudaStatus, "device memory allocation failed");

    // create GPU handle
    status = cublasCreate(&handle);
    fail(status, "CUBLAS initialization failed");

    // copy matrices from host to GPU
    status = cublasSetMatrix(N, N, sizeof(*B), B, N, devPtrB, N);
    fail(status, "failed to load data from host to GPU");
    status = cublasSetMatrix(N, N, sizeof(*A), A, N, devPtrA, N);
    fail(status, "failed to load data from host to GPU");

    const double ONE  = 1;
    const double ZERO = 0;

    printMatrix(A);
    printMatrix(B);

    status = cublasDgemm(handle,
                         CUBLAS_OP_N, CUBLAS_OP_N,
                         N, N, N,
                         &ONE,
                         devPtrA, N,
                         devPtrB, N,
                         &ZERO,
                         devPtrC, N);
    fail(status, "error cublasDgemm");

    status = cublasGetMatrix(N, N, sizeof(*C), devPtrC, N, C, N);
    fail(status, "could not load result back from GPU to host");

    printMatrix(C);

    status = cublasDestroy(handle);
    fail(status, "could not destroy CUBLAS handle");

    cudaStatus = cudaFree(devPtrC);
    fail(cudaStatus, "device memory freeing failed");
    cudaStatus = cudaFree(devPtrB);
    fail(cudaStatus, "device memory freeing failed");
    cudaStatus = cudaFree(devPtrA);
    fail(cudaStatus, "device memory freeing failed");

    delete[] C;
    delete[] B;
    delete[] A;
    return EXIT_SUCCESS;
}
op(B) must be CUBLAS_OP_T:

...
status = cublasDgemm( handle,
                      CUBLAS_OP_N, CUBLAS_OP_T,
                      N, N, N,
                      &ONE,
                      devPtrA, N,
                      devPtrB, N,
                      &ZERO,
                      devPtrC, N);
...

The definition is C = α·op(A)·op(B) + β·C; see http://docs.nvidia.com/cuda/cublas/index.html#topic_8_1. (Presumably the root of the confusion is that the IDX2F macro stores the matrices row-major, while cuBLAS assumes column-major storage, so the operands as seen by cuBLAS are effectively transposed.)