difference between elapsed time in matrix multiplication with and without using shared memory in cuda - cuda

I am new in cuda programming. My program is matrix multiplication with and without shared memory . I use the codes in Cuda_C_Programming_Guide ebook . In book we see the program which uses shared memory has less elapsed time than the elapse time in none_shared program. But when I run the programs it’s vice versa. Does anyone know why? Or am I wrong?
none-shared memory :
#include <stdio.h>
#include <stdlib.h>
#include <conio.h>
#include <iostream>
#include <thrust/system_error.h>
#include <thrust/system/cuda_error.h>
#include <sstream>
typedef struct _Matrix
{
int height;//number of rows
int width;//number of columns
float *elements;
}Matrix;
#define BLOCK_SIZE 20
__global__ void add_matrix_kernel(const Matrix a,const Matrix b,Matrix c)
{
int N=a.width;
int row=blockIdx.y * blockDim.y + threadIdx.y;
int col=blockIdx.x * blockDim.x+threadIdx.x;
c.elements[row * N + col]=a.elements[row * N + col]+b.elements[row * N + col];
}
__global__ void simpleMultiply(const Matrix a,const Matrix b, Matrix c)
{
int N=a.width;
int TILE_DIM=a.width;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int sum = 0;
for (int i = 0; i < TILE_DIM; i++)
{
sum += a.elements[row*TILE_DIM+i] * b.elements[i*N+col];
}
c.elements[row*N+col] = sum;
}
void add_matrix(const Matrix A,const Matrix B,Matrix C)
{
// Load A and B to device memory
Matrix d_A;
Matrix d_B;
Matrix d_C;
d_A.width = A.width; d_A.height = A.height;
d_B.width = B.width; d_B.height = B.height;
d_C.width = C.width; d_C.height = C.height;
size_t sizeA = A.width * A.height * sizeof(float);
size_t sizeB = B.width * B.height * sizeof(float);
size_t sizeC = C.width * C.height * sizeof(float);
//allocate space for device copies of A,B,C
cudaMalloc((void **)&d_A.elements, sizeA);
//gpuErrchk( cudaMalloc(&a_d, size*sizeof(int)) );
cudaMalloc((void **)&d_B.elements, sizeB);
cudaMalloc((void **)&d_C.elements, sizeC);
//copy inputs to device
cudaMemcpy(d_A.elements, A.elements, sizeA,cudaMemcpyHostToDevice);
cudaMemcpy(d_B.elements, B.elements, sizeA,cudaMemcpyHostToDevice);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid(B.width/dimBlock.x, A.height/dimBlock.y);
//add_matrix_kernel<<<grid_size,block_size>>>(d_A, d_B, d_C);
simpleMultiply<<<dimGrid,dimBlock>>>(d_A,d_B,d_C);
// Read C from device memory
cudaMemcpy(C.elements, d_C.elements, sizeA,cudaMemcpyDeviceToHost);
// Free device memory
cudaFree(d_A.elements);
cudaFree(d_B.elements);
cudaFree(d_C.elements);
}
//
//void print_matrix(int *c,int row,int col)
//{
// for (int i = 0; i < row; ++i){
// for (int j = 0; j < col; ++j)
// printf("%d ",c[col*i+j]);
// printf("\n\n");
// }
//}
void print_matrix(Matrix A){
printf("Matrix:\n");
int i;
int rows=0;
//printf("row %d\n",rows);
for(i=0; i<A.width*A.height; i++){
if(i%A.width==0){ printf("\n");printf("row %d\n",rows);rows++;}
printf("%6.4f\t",A.elements[i]);
}
printf("\n");
}
void throw_on_cuda_error(cudaError_t code, const char *file, int line)
{
if(code != cudaSuccess)
{
std::stringstream ss;
ss << file << "(" << line << ")";
std::string file_and_line;
ss >> file_and_line;
throw thrust::system_error(code, thrust::cuda_category(), file_and_line);
}
}
int main()
{
cudaEvent_t start,stop;
try{
int i,j;
Matrix A,B;
Matrix C;
A.width=1200;
A.height=1200;
B.width=1200;
B.height=1200;
C.width=B.width;
C.height=A.height;
size_t sizeA = A.width * A.height * sizeof(float);
A.elements = (float *)malloc(sizeA);
//random_init(A.elements,A.width * A.height );
size_t sizeB = B.width * B.height * sizeof(float);
B.elements= (float *)malloc(sizeB);
//random_init(B.elements,B.width * B.height);
size_t sizeC = C.width * C.height * sizeof(float);
C.elements= (float *)malloc(sizeC);
for(i=0;i<A.width*A.height;i++)
A.elements[i]=1;
for(int i=0;i<B.width*B.height;i++)
B.elements[i]=1;
printf("matrix A(%d,%d) & matrix B(%d,%d) & matrix C(%d,%d)\n",A.height,A.width,B.height,B.width,C.height,C.width);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
add_matrix(A,B,C);
cudaPeekAtLastError() ;
cudaDeviceSynchronize() ;
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime,start,stop);
printf("Time to genreat : %3.5f ms\n",elapsedTime);
cudaEventDestroy(start);
cudaEventDestroy(stop);
/*printf("\nA\n");
print_matrix(A.elements,A.height,A.width);
printf("\nB\n");
print_matrix(B.elements,B.height,B.width);*/
printf("\nC\n");
// print_matrix(C.elements,C.height,C.width);
// print_matrix(C);
printf("C[%d] = %f\n",0,C.elements[0]);
printf("C[%d] = %f\n",(C.width)-1,C.elements[(C.width)-1]);
printf("C[%d] = %f\n",(C.width)*(C.height)-1,C.elements[(C.width)*(C.height)-1]);
free(A.elements);
free(B.elements);
free(C.elements);
getchar();
throw_on_cuda_error(cudaSetDevice(-1), __FILE__, __LINE__);
}
catch(thrust::system_error &e)
{
std::cerr << "CUDA error after cudaSetDevice: " << e.what() << std::endl;
// oops, recover
cudaSetDevice(0);
}
return 0;
}
using shared memory:
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
#include <stdio.h>
#include <iostream>
#include <thrust/system_error.h>
#include <thrust/system/cuda_error.h>
#include <sstream>
#define BLOCK_SIZE 20
typedef struct {
int width;
int height;
int stride;
float* elements;
} Matrix;
// Get a matrix element
__device__ float GetElement(const Matrix A, int row, int col)
{
return A.elements[row * A.stride + col];
}
// Set a matrix element
__device__ void SetElement(Matrix A, int row, int col,
float value)
{
A.elements[row * A.stride + col] = value;
}
// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
Matrix Asub;
Asub.width = BLOCK_SIZE;
Asub.height = BLOCK_SIZE;
Asub.stride = A.stride;
Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row+ BLOCK_SIZE * col];
return Asub;
}
// Thread block size
// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);
// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
// Load A and B to device memory
Matrix d_A;
d_A.width = d_A.stride = A.width; d_A.height = A.height;
size_t size = A.width * A.height * sizeof(float);
cudaMalloc(&d_A.elements, size);
cudaMemcpy(d_A.elements, A.elements, size,
cudaMemcpyHostToDevice);
Matrix d_B;
d_B.width = d_B.stride = B.width; d_B.height = B.height;
size = B.width * B.height * sizeof(float);
cudaMalloc(&d_B.elements, size);
cudaMemcpy(d_B.elements, B.elements, size,
cudaMemcpyHostToDevice);
// Allocate C in device memory
Matrix d_C;
d_C.width = d_C.stride = C.width; d_C.height = C.height;
size = C.width * C.height * sizeof(float);
cudaMalloc(&d_C.elements, size);
// Invoke kernel
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
//dim3 dimBlock(C.height, C.width);
dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
//dim3 dimGrid((B.width+dimBlock.x-1) / dimBlock.x, (A.height+dimBlock.y-1) / dimBlock.y);
MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
// Read C from device memory
cudaMemcpy(C.elements, d_C.elements, size,
cudaMemcpyDeviceToHost);
// Free device memory
cudaFree(d_A.elements);
cudaFree(d_B.elements);
cudaFree(d_C.elements);
}
// Matrix multiplication kernel called by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
// Block row and column
int blockRow = blockIdx.y;
int blockCol = blockIdx.x;
// Each thread block computes one sub-matrix Csub of C
Matrix Csub = GetSubMatrix(C, blockRow, blockCol);
// Each thread computes one element of Csub
// by accumulating results into Cvalue
float Cvalue = 0;
// Thread row and column within Csub
int row = threadIdx.y;
int col = threadIdx.x;
// Loop over all the sub-matrices of A and B that are
// required to compute Csub
// Multiply each pair of sub-matrices together
// and accumulate the results
for (int m = 0; m < (A.width / BLOCK_SIZE); ++m) {
// Get sub-matrix Asub of A
Matrix Asub = GetSubMatrix(A, blockRow, m);
// Get sub-matrix Bsub of B
Matrix Bsub = GetSubMatrix(B, m, blockCol);
// Shared memory used to store Asub and Bsub respectively
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load Asub and Bsub from device memory to shared memory
// Each thread loads one element of each sub-matrix
As[row][col] = GetElement(Asub, row, col);
Bs[row][col] = GetElement(Bsub, row, col);
// Synchronize to make sure the sub-matrices are loaded
// before starting the computation
__syncthreads();
// Multiply Asub and Bsub together
for (int e = 0; e < BLOCK_SIZE; ++e)
Cvalue += As[row][e] * Bs[e][col];
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write Csub to device memory
// Each thread writes one element
SetElement(Csub, row, col, Cvalue);
}
//////////////////////////////////////////////////////////
/// print_matrix function ///////////////////////////
////////////////////////////////////////////////////////
void print_matrix(float *c,int row,int col){
for (int i = 0; i < row; ++i){
for (int j = 0; j < col; ++j)
printf("%f ",c[col*i +j]);
printf("\n\n");
}
}
//////////////////////////////////////////////////////////
/// random_init function ///////////////////////////
////////////////////////////////////////////////////////
void random_init(float *a,int size){
for(int i=0;i<size;i++)
a[i]=rand()%10;
}
////////////////////////////////////////////////////////
void throw_on_cuda_error(cudaError_t code, const char *file, int line)
{
if(code != cudaSuccess)
{
std::stringstream ss;
ss << file << "(" << line << ")";
std::string file_and_line;
ss >> file_and_line;
throw thrust::system_error(code, thrust::cuda_category(), file_and_line);
}
}
int main(void){
cudaEvent_t start,stop;
try{
Matrix A,B,C;
A.width=1200;
A.height=1200;/////
B.width=1200;/////
B.height=1200;
C.width=B.width;
C.height=A.height;
size_t size = A.width * A.height * sizeof(float);
A.elements = (float *)malloc(size);
//random_init(A.elements,A.width * A.height );
size = B.width * B.height * sizeof(float);
B.elements= (float *)malloc(size);
//random_init(B.elements,B.width * B.height);
size = C.width * C.height * sizeof(float);
C.elements= (float *)malloc(size);
for(int i=0;i<A.width*A.height;i++)
A.elements[i]=1;
for(int i=0;i<B.width*B.height;i++)
B.elements[i]=1;
printf("matrix A(%d,%d) & matrix B(%d,%d) & matrix C(%d,%d)\n",A.width,A.height,B.width,
B.height,C.width,C.height);
//////////////////////////////////////////////////////\|/
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
MatMul(A,B,C);
cudaPeekAtLastError() ;
cudaDeviceSynchronize() ;
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime,start,stop);
printf("Time to genreat : %4.5f ms\n",elapsedTime);
//////////////////////////////////////////////////////\|/
printf("%s\n", cudaGetErrorString(cudaGetLastError()));
//printf("\nA\n");
//print_matrix(A.elements,A.height,A.width);
//printf("\nB\n");
//print_matrix(B.elements,B.height,B.width);
printf("\nC\n");
//print_matrix(C.elements,C.height,C.width);
printf("C[%d]=%f\n",0,C.elements[0]);
printf("C[%d]=%f\n",C.width -1,C.elements[C.width-1]);
printf("C[%d]=%f\n",(C.width * C.height)-1,C.elements[(C.width * C.height)-1]);
getchar();
throw_on_cuda_error(cudaSetDevice(-1), __FILE__, __LINE__);
}
catch(thrust::system_error &e)
{
std::cerr << "CUDA error after cudaSetDevice: " << e.what() << std::endl;
// oops, recover
cudaSetDevice(0);
}
return(0);
}
I run the debug . and the output window when my program is running is:
'GPU_Matrix.exe': Loaded 'E:\FarnAz\Cuda Project\Projects\GPU_Matrix\Debug\GPU_Matrix.exe', Symbols loaded.
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\ntdll.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\kernel32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\KernelBase.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.2\bin\cudart32_42_9.dll', Binary was not built with debug information.
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\msvcp100d.dll', Symbols loaded.
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\msvcr100d.dll', Symbols loaded.
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\nvcuda.dll', Binary was not built with debug information.
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\user32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\gdi32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\lpk.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\usp10.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\msvcrt.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\advapi32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\sechost.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\rpcrt4.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\sspicli.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\cryptbase.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\setupapi.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\cfgmgr32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\oleaut32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\ole32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\devobj.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\shell32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\shlwapi.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\ws2_32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\nsi.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\imm32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\msctf.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\ProgramData\Wincert\win32cert.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\nvinit.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Program Files (x86)\NVIDIA Corporation\coprocmanager\detoured.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Program Files (x86)\NVIDIA Corporation\coprocmanager\Nvd3d9wrap.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Program Files (x86)\NVIDIA Corporation\coprocmanager\nvdxgiwrap.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Unloaded 'C:\ProgramData\Wincert\win32cert.dll'
The thread 'Win32 Thread' (0x1214) has exited with code 1849301074 (0x6e3a1852).
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\dwmapi.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Unloaded 'C:\Windows\SysWOW64\dwmapi.dll'
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\nvapi.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\version.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\wintrust.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\crypt32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\msasn1.dll', Cannot find or open the PDB file
And the result for example for matrices 1000*1000 is about 1219 ms for the non-shared code and about 1770 ms for the shared memory code.
and when I run the release project the program doesn't run successfully and shows some errors in error list.but I don't know why !!
and the output window in release mode is:
1>------ Build started: Project: GPU_Matrix, Configuration: Release Win32 ------
1>Build started 11/13/2013 10:39:47 AM.
1>InitializeBuildStatus:
1> Touching "Release\GPU_Matrix.unsuccessfulbuild".
1>AddCudaCompilePropsDeps:
1>Skipping target "AddCudaCompilePropsDeps" because all output files are up-to-date with respect to the input files.
1>CudaBuild:
1> Compiling CUDA source file main.cu...
1>
1> E:\FarnAz\Cuda Project\Projects\GPU_Matrix\GPU_Matrix>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.2\bin\nvcc.exe" -gencode=arch=compute_10,code=\"sm_10,compute_10\" --use-local-env --cl-version 2010 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.2\include" --keep-dir "Release" -maxrregcount=0 --machine 32 --compile -Xcompiler "/EHsc /nologo /Od /Zi /MD " -o "Release\main.cu.obj" "E:\FarnAz\Cuda Project\Projects\GPU_Matrix\GPU_Matrix\main.cu"
1> main.cu
1> tmpxft_00001c70_00000000-0_main.cudafe1.gpu
1> tmpxft_00001c70_00000000-5_main.cudafe2.gpu
1> main.cu
1> tmpxft_00001c70_00000000-0_main.cudafe1.cpp
1> tmpxft_00001c70_00000000-11_main.ii
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaFree#4
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaConfigureCall#32
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaMemcpy#16
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaMalloc#8
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaGetErrorString#4
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaSetDevice#4
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaEventDestroy#4
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaEventElapsedTime#12
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaEventSynchronize#4
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaDeviceSynchronize#0
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaPeekAtLastError#0
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaEventRecord#8
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaEventCreate#4
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaSetupArgument#12
1>main.cu.obj : error LNK2001: unresolved external symbol ___cudaRegisterFunction#40
1>main.cu.obj : error LNK2001: unresolved external symbol ___cudaRegisterFatBinary#4
1>main.cu.obj : error LNK2001: unresolved external symbol ___cudaUnregisterFatBinary#4
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaLaunch#4
1>E:\FarnAz\Cuda Project\Projects\GPU_Matrix\Release\GPU_Matrix.exe : fatal error LNK1120: 18 unresolved externals
1>
1>Build FAILED.
1>
1>Time Elapsed 00:00:08.43
========== Build: 0 succeeded, 1 failed, 0 up-to-date, 0 skipped ==========
I ran vectorAdd in both mode. Then I paste my code in that project. In debug mode it has no problem and the result in non-shared is about 1372 ms and in shared memory is about 1842 ms. But in release mode however it shows a new window that tells : “Debugging information for ‘vectorAdd.exe’ cannot be found or does not match. Binary was not built with debug information. Do you want to continue debugging ?” , when I click “yes” it continues and runs with no error. And the result in non-shared is about 645 ms and in shared-memory is about 183 ms. I don’t understand why the results are vice versa in release mode and which one is true? Does result in release mode is true for every project or in debug mode?

You're getting this message:
“Debugging information for ‘vectorAdd.exe’ cannot be found or does not match. Binary was not built with debug information. Do you want to continue debugging ?” ,
Because of the way you are starting the executable in Visual Studio. When you build the release project you should just run it, rather than "start debugging". You'll need to explore visual studio a bit more.
The results you get seem correct in Release mode. The shared memory code runs faster, as expected. When building a "Debug" project in visual studio, the -G switch will normally get passed to the nvcc compiler driver which has a significant affect on code generation. It does more than just allow for debugging by adding symbols. It disables many optimizations that the compiler might make, so as to make source debugging easier.
You should not evaluate CUDA code performance in "Debug" mode or by passing the -G switch to nvcc.

Related

Can I update or change a cuda kernel without stopping the process? [duplicate]

I have an application which generates CUDA C++ source code, compiles it into PTX at runtime using NVRTC, and then creates CUDA modules from it using the CUDA driver API.
If I debug this application using cuda-gdb, it displays the kernel (where an error occured) in the backtrace, but does not show the line number.
I export the generated source code into a file, and give the directory to cuda-gdb using the --directory option. I also tried passing its file name to nvrtcCreateProgram() (name argument). I use the compile options --device-debug and --generate-line-info with NVRTC.
Is there a way to let cuda-gdb know the location of the generated source code file, and display the line number information in its backtrace?
For those who may not be familiar with nvrtc, it is a CUDA facility that allows runtime-compilation of CUDA C++ device code. As a result, device code generated at runtime (including modifications) can be used on a CUDA GPU. There is documentation for nvrtc and there are various CUDA sample codes for nvrtc, most or all of which have _nvrtc in the file name.
I was able to do kernel source-level debugging on a nvrtc-generated kernel with cuda-gdb as follows:
start with vectorAdd_nvrtc sample code
modify the compileFileToPTX routine (provided by nvrtc_helper.h) to add the --device-debug switch during the compile-cu-to-ptx step.
modify the loadPTX routine (provided by nvrtc_helper.h) to add the CU_JIT_GENERATE_DEBUG_INFO option (set to 1) for the cuModuleLoadDataEx load/JIT PTX-to-binary step.
compile the main function (vectorAdd.cpp) with -g option.
Here is a complete test case/session. I'm only showing the vectorAdd.cpp file from the project because that is the only file I modified. Other project file(s) are identical to what is in the sample project:
$ cat vectorAdd.cpp
/**
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/**
* Vector addition: C = A + B.
*
* This sample is a very basic sample that implements element by element
* vector addition. It is the same as the sample illustrating Chapter 2
* of the programming guide with some additions like error checking.
*/
#include <stdio.h>
#include <cmath>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda.h>
#include <cuda_runtime.h>
// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <nvrtc_helper.h>
#include <iostream>
#include <fstream>
/**
* Host main routine
*/
void my_compileFileToPTX(char *filename, int argc, char **argv, char **ptxResult,
size_t *ptxResultSize, int requiresCGheaders) {
std::ifstream inputFile(filename,
std::ios::in | std::ios::binary | std::ios::ate);
if (!inputFile.is_open()) {
std::cerr << "\nerror: unable to open " << filename << " for reading!\n";
exit(1);
}
std::streampos pos = inputFile.tellg();
size_t inputSize = (size_t)pos;
char *memBlock = new char[inputSize + 1];
inputFile.seekg(0, std::ios::beg);
inputFile.read(memBlock, inputSize);
inputFile.close();
memBlock[inputSize] = '\x0';
int numCompileOptions = 0;
char *compileParams[2];
std::string compileOptions;
if (requiresCGheaders) {
char HeaderNames[256];
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(HeaderNames, sizeof(HeaderNames), "%s", "cooperative_groups.h");
#else
snprintf(HeaderNames, sizeof(HeaderNames), "%s", "cooperative_groups.h");
#endif
compileOptions = "--include-path=";
std::string path = sdkFindFilePath(HeaderNames, argv[0]);
if (!path.empty()) {
std::size_t found = path.find(HeaderNames);
path.erase(found);
} else {
printf(
"\nCooperativeGroups headers not found, please install it in %s "
"sample directory..\n Exiting..\n",
argv[0]);
}
compileOptions += path.c_str();
compileParams[0] = reinterpret_cast<char *>(
malloc(sizeof(char) * (compileOptions.length() + 1)));
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(compileParams[0], sizeof(char) * (compileOptions.length() + 1),
"%s", compileOptions.c_str());
#else
snprintf(compileParams[0], compileOptions.size(), "%s",
compileOptions.c_str());
#endif
numCompileOptions++;
}
compileOptions = "--device-debug ";
compileParams[numCompileOptions] = reinterpret_cast<char *>(malloc(sizeof(char) * (compileOptions.length() + 1)));
snprintf(compileParams[numCompileOptions], compileOptions.size(), "%s", compileOptions.c_str());
numCompileOptions++;
// compile
nvrtcProgram prog;
NVRTC_SAFE_CALL("nvrtcCreateProgram",
nvrtcCreateProgram(&prog, memBlock, filename, 0, NULL, NULL));
nvrtcResult res = nvrtcCompileProgram(prog, numCompileOptions, compileParams);
// dump log
size_t logSize;
NVRTC_SAFE_CALL("nvrtcGetProgramLogSize",
nvrtcGetProgramLogSize(prog, &logSize));
char *log = reinterpret_cast<char *>(malloc(sizeof(char) * logSize + 1));
NVRTC_SAFE_CALL("nvrtcGetProgramLog", nvrtcGetProgramLog(prog, log));
log[logSize] = '\x0';
if (strlen(log) >= 2) {
std::cerr << "\n compilation log ---\n";
std::cerr << log;
std::cerr << "\n end log ---\n";
}
free(log);
NVRTC_SAFE_CALL("nvrtcCompileProgram", res);
// fetch PTX
size_t ptxSize;
NVRTC_SAFE_CALL("nvrtcGetPTXSize", nvrtcGetPTXSize(prog, &ptxSize));
char *ptx = reinterpret_cast<char *>(malloc(sizeof(char) * ptxSize));
NVRTC_SAFE_CALL("nvrtcGetPTX", nvrtcGetPTX(prog, ptx));
NVRTC_SAFE_CALL("nvrtcDestroyProgram", nvrtcDestroyProgram(&prog));
*ptxResult = ptx;
*ptxResultSize = ptxSize;
#ifdef DUMP_PTX
std::ofstream my_f;
my_f.open("vectorAdd.ptx");
for (int i = 0; i < ptxSize; i++)
my_f << ptx[i];
my_f.close();
#endif
if (requiresCGheaders) free(compileParams[0]);
}
CUmodule my_loadPTX(char *ptx, int argc, char **argv) {
CUmodule module;
CUcontext context;
int major = 0, minor = 0;
char deviceName[256];
// Picks the best CUDA device available
CUdevice cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
// get compute capabilities and the devicename
checkCudaErrors(cuDeviceGetAttribute(
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice));
printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
checkCudaErrors(cuInit(0));
checkCudaErrors(cuDeviceGet(&cuDevice, 0));
checkCudaErrors(cuCtxCreate(&context, 0, cuDevice));
CUjit_option opt[1];
opt[0] = CU_JIT_GENERATE_DEBUG_INFO;
void **vals = new void *[1];
vals[0] = (void *)(size_t)1;
checkCudaErrors(cuModuleLoadDataEx(&module, ptx, 1, opt, vals));
free(ptx);
return module;
}
int main(int argc, char **argv) {
char *ptx, *kernel_file;
size_t ptxSize;
kernel_file = sdkFindFilePath("vectorAdd_kernel.cu", argv[0]);
my_compileFileToPTX(kernel_file, argc, argv, &ptx, &ptxSize, 0);
CUmodule module = my_loadPTX(ptx, argc, argv);
CUfunction kernel_addr;
checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "vectorAdd"));
// Print the vector length to be used, and compute its size
int numElements = 50000;
size_t size = numElements * sizeof(float);
printf("[Vector addition of %d elements]\n", numElements);
// Allocate the host input vector A
float *h_A = reinterpret_cast<float *>(malloc(size));
// Allocate the host input vector B
float *h_B = reinterpret_cast<float *>(malloc(size));
// Allocate the host output vector C
float *h_C = reinterpret_cast<float *>(malloc(size));
// Verify that allocations succeeded
if (h_A == NULL || h_B == NULL || h_C == NULL) {
fprintf(stderr, "Failed to allocate host vectors!\n");
exit(EXIT_FAILURE);
}
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i) {
h_A[i] = rand() / static_cast<float>(RAND_MAX);
h_B[i] = rand() / static_cast<float>(RAND_MAX);
}
// Allocate the device input vector A
CUdeviceptr d_A;
checkCudaErrors(cuMemAlloc(&d_A, size));
// Allocate the device input vector B
CUdeviceptr d_B;
checkCudaErrors(cuMemAlloc(&d_B, size));
// Allocate the device output vector C
CUdeviceptr d_C;
checkCudaErrors(cuMemAlloc(&d_C, size));
// Copy the host input vectors A and B in host memory to the device input
// vectors in device memory
printf("Copy input data from the host memory to the CUDA device\n");
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size));
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size));
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
threadsPerBlock);
dim3 cudaBlockSize(threadsPerBlock, 1, 1);
dim3 cudaGridSize(blocksPerGrid, 1, 1);
void *arr[] = {reinterpret_cast<void *>(&d_A), reinterpret_cast<void *>(&d_B),
reinterpret_cast<void *>(&d_C),
reinterpret_cast<void *>(&numElements)};
checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y,
cudaGridSize.z, /* grid dim */
cudaBlockSize.x, cudaBlockSize.y,
cudaBlockSize.z, /* block dim */
0, 0, /* shared mem, stream */
&arr[0], /* arguments */
0));
checkCudaErrors(cuCtxSynchronize());
// Copy the device result vector in device memory to the host result vector
// in host memory.
printf("Copy output data from the CUDA device to the host memory\n");
checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size));
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i) {
if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
printf("Test PASSED\n");
// Free device global memory
checkCudaErrors(cuMemFree(d_A));
checkCudaErrors(cuMemFree(d_B));
checkCudaErrors(cuMemFree(d_C));
// Free host memory
free(h_A);
free(h_B);
free(h_C);
printf("Done\n");
return 0;
}
$ nvcc -g -I/usr/local/cuda/samples/common/inc -o test vectorAdd.cpp -lnvrtc -lcuda
$ cuda-gdb ./test
NVIDIA (R) CUDA Debugger
10.0 release
Portions Copyright (C) 2007-2018 NVIDIA Corporation
GNU gdb (GDB) 7.12
Copyright (C) 2016 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-pc-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ./test...done.
(cuda-gdb) break vectorAdd
Function "vectorAdd" not defined.
Make breakpoint pending on future shared library load? (y or [n]) y
Breakpoint 1 (vectorAdd) pending.
(cuda-gdb) r
Starting program: /home/user2/misc/junk/vectorAdd_nvrtc/test
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
[New Thread 0x7fffedc00700 (LWP 16789)]
> Using CUDA Device [1]: Tesla K40m
> GPU Device has SM 3.5 compute capability
[New Thread 0x7fffed3ff700 (LWP 16790)]
[Vector addition of 50000 elements]
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 196 blocks of 256 threads
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
Thread 1 "test" hit Breakpoint 1, vectorAdd<<<(196,1,1),(256,1,1)>>> (A=0x7fffce800000, B=0x7fffce830e00, C=0x7fffce861c00, numElements=50000) at ./vectorAdd_kernel.cu:21
21 int i = blockDim.x * blockIdx.x + threadIdx.x;
(cuda-gdb) step
23 if (i < numElements) {
(cuda-gdb) step
24 C[i] = A[i] + B[i];
(cuda-gdb) step
26 }
(cuda-gdb) quit
A debugging session is active.
Inferior 1 [process 16777] will be killed.
Quit anyway? (y or n) y
$

Using CUDA-gdb with NVRTC

I have an application which generates CUDA C++ source code, compiles it into PTX at runtime using NVRTC, and then creates CUDA modules from it using the CUDA driver API.
If I debug this application using cuda-gdb, it displays the kernel (where an error occured) in the backtrace, but does not show the line number.
I export the generated source code into a file, and give the directory to cuda-gdb using the --directory option. I also tried passing its file name to nvrtcCreateProgram() (name argument). I use the compile options --device-debug and --generate-line-info with NVRTC.
Is there a way to let cuda-gdb know the location of the generated source code file, and display the line number information in its backtrace?
For those who may not be familiar with nvrtc, it is a CUDA facility that allows runtime-compilation of CUDA C++ device code. As a result, device code generated at runtime (including modifications) can be used on a CUDA GPU. There is documentation for nvrtc and there are various CUDA sample codes for nvrtc, most or all of which have _nvrtc in the file name.
I was able to do kernel source-level debugging on a nvrtc-generated kernel with cuda-gdb as follows:
start with vectorAdd_nvrtc sample code
modify the compileFileToPTX routine (provided by nvrtc_helper.h) to add the --device-debug switch during the compile-cu-to-ptx step.
modify the loadPTX routine (provided by nvrtc_helper.h) to add the CU_JIT_GENERATE_DEBUG_INFO option (set to 1) for the cuModuleLoadDataEx load/JIT PTX-to-binary step.
compile the main function (vectorAdd.cpp) with -g option.
Here is a complete test case/session. I'm only showing the vectorAdd.cpp file from the project because that is the only file I modified. Other project file(s) are identical to what is in the sample project:
$ cat vectorAdd.cpp
/**
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/**
* Vector addition: C = A + B.
*
* This sample is a very basic sample that implements element by element
* vector addition. It is the same as the sample illustrating Chapter 2
* of the programming guide with some additions like error checking.
*/
#include <stdio.h>
#include <cmath>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda.h>
#include <cuda_runtime.h>
// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <nvrtc_helper.h>
#include <iostream>
#include <fstream>
/**
* Host main routine
*/
void my_compileFileToPTX(char *filename, int argc, char **argv, char **ptxResult,
size_t *ptxResultSize, int requiresCGheaders) {
std::ifstream inputFile(filename,
std::ios::in | std::ios::binary | std::ios::ate);
if (!inputFile.is_open()) {
std::cerr << "\nerror: unable to open " << filename << " for reading!\n";
exit(1);
}
std::streampos pos = inputFile.tellg();
size_t inputSize = (size_t)pos;
char *memBlock = new char[inputSize + 1];
inputFile.seekg(0, std::ios::beg);
inputFile.read(memBlock, inputSize);
inputFile.close();
memBlock[inputSize] = '\x0';
int numCompileOptions = 0;
char *compileParams[2];
std::string compileOptions;
if (requiresCGheaders) {
char HeaderNames[256];
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(HeaderNames, sizeof(HeaderNames), "%s", "cooperative_groups.h");
#else
snprintf(HeaderNames, sizeof(HeaderNames), "%s", "cooperative_groups.h");
#endif
compileOptions = "--include-path=";
std::string path = sdkFindFilePath(HeaderNames, argv[0]);
if (!path.empty()) {
std::size_t found = path.find(HeaderNames);
path.erase(found);
} else {
printf(
"\nCooperativeGroups headers not found, please install it in %s "
"sample directory..\n Exiting..\n",
argv[0]);
}
compileOptions += path.c_str();
compileParams[0] = reinterpret_cast<char *>(
malloc(sizeof(char) * (compileOptions.length() + 1)));
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(compileParams[0], sizeof(char) * (compileOptions.length() + 1),
"%s", compileOptions.c_str());
#else
snprintf(compileParams[0], compileOptions.size(), "%s",
compileOptions.c_str());
#endif
numCompileOptions++;
}
compileOptions = "--device-debug ";
compileParams[numCompileOptions] = reinterpret_cast<char *>(malloc(sizeof(char) * (compileOptions.length() + 1)));
snprintf(compileParams[numCompileOptions], compileOptions.size(), "%s", compileOptions.c_str());
numCompileOptions++;
// compile
nvrtcProgram prog;
NVRTC_SAFE_CALL("nvrtcCreateProgram",
nvrtcCreateProgram(&prog, memBlock, filename, 0, NULL, NULL));
nvrtcResult res = nvrtcCompileProgram(prog, numCompileOptions, compileParams);
// dump log
size_t logSize;
NVRTC_SAFE_CALL("nvrtcGetProgramLogSize",
nvrtcGetProgramLogSize(prog, &logSize));
char *log = reinterpret_cast<char *>(malloc(sizeof(char) * logSize + 1));
NVRTC_SAFE_CALL("nvrtcGetProgramLog", nvrtcGetProgramLog(prog, log));
log[logSize] = '\x0';
if (strlen(log) >= 2) {
std::cerr << "\n compilation log ---\n";
std::cerr << log;
std::cerr << "\n end log ---\n";
}
free(log);
NVRTC_SAFE_CALL("nvrtcCompileProgram", res);
// fetch PTX
size_t ptxSize;
NVRTC_SAFE_CALL("nvrtcGetPTXSize", nvrtcGetPTXSize(prog, &ptxSize));
char *ptx = reinterpret_cast<char *>(malloc(sizeof(char) * ptxSize));
NVRTC_SAFE_CALL("nvrtcGetPTX", nvrtcGetPTX(prog, ptx));
NVRTC_SAFE_CALL("nvrtcDestroyProgram", nvrtcDestroyProgram(&prog));
*ptxResult = ptx;
*ptxResultSize = ptxSize;
#ifdef DUMP_PTX
std::ofstream my_f;
my_f.open("vectorAdd.ptx");
for (int i = 0; i < ptxSize; i++)
my_f << ptx[i];
my_f.close();
#endif
if (requiresCGheaders) free(compileParams[0]);
}
CUmodule my_loadPTX(char *ptx, int argc, char **argv) {
CUmodule module;
CUcontext context;
int major = 0, minor = 0;
char deviceName[256];
// Picks the best CUDA device available
CUdevice cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
// get compute capabilities and the devicename
checkCudaErrors(cuDeviceGetAttribute(
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice));
printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
checkCudaErrors(cuInit(0));
checkCudaErrors(cuDeviceGet(&cuDevice, 0));
checkCudaErrors(cuCtxCreate(&context, 0, cuDevice));
CUjit_option opt[1];
opt[0] = CU_JIT_GENERATE_DEBUG_INFO;
void **vals = new void *[1];
vals[0] = (void *)(size_t)1;
checkCudaErrors(cuModuleLoadDataEx(&module, ptx, 1, opt, vals));
free(ptx);
return module;
}
int main(int argc, char **argv) {
char *ptx, *kernel_file;
size_t ptxSize;
kernel_file = sdkFindFilePath("vectorAdd_kernel.cu", argv[0]);
my_compileFileToPTX(kernel_file, argc, argv, &ptx, &ptxSize, 0);
CUmodule module = my_loadPTX(ptx, argc, argv);
CUfunction kernel_addr;
checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "vectorAdd"));
// Print the vector length to be used, and compute its size
int numElements = 50000;
size_t size = numElements * sizeof(float);
printf("[Vector addition of %d elements]\n", numElements);
// Allocate the host input vector A
float *h_A = reinterpret_cast<float *>(malloc(size));
// Allocate the host input vector B
float *h_B = reinterpret_cast<float *>(malloc(size));
// Allocate the host output vector C
float *h_C = reinterpret_cast<float *>(malloc(size));
// Verify that allocations succeeded
if (h_A == NULL || h_B == NULL || h_C == NULL) {
fprintf(stderr, "Failed to allocate host vectors!\n");
exit(EXIT_FAILURE);
}
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i) {
h_A[i] = rand() / static_cast<float>(RAND_MAX);
h_B[i] = rand() / static_cast<float>(RAND_MAX);
}
// Allocate the device input vector A
CUdeviceptr d_A;
checkCudaErrors(cuMemAlloc(&d_A, size));
// Allocate the device input vector B
CUdeviceptr d_B;
checkCudaErrors(cuMemAlloc(&d_B, size));
// Allocate the device output vector C
CUdeviceptr d_C;
checkCudaErrors(cuMemAlloc(&d_C, size));
// Copy the host input vectors A and B in host memory to the device input
// vectors in device memory
printf("Copy input data from the host memory to the CUDA device\n");
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size));
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size));
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
threadsPerBlock);
dim3 cudaBlockSize(threadsPerBlock, 1, 1);
dim3 cudaGridSize(blocksPerGrid, 1, 1);
void *arr[] = {reinterpret_cast<void *>(&d_A), reinterpret_cast<void *>(&d_B),
reinterpret_cast<void *>(&d_C),
reinterpret_cast<void *>(&numElements)};
checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y,
cudaGridSize.z, /* grid dim */
cudaBlockSize.x, cudaBlockSize.y,
cudaBlockSize.z, /* block dim */
0, 0, /* shared mem, stream */
&arr[0], /* arguments */
0));
checkCudaErrors(cuCtxSynchronize());
// Copy the device result vector in device memory to the host result vector
// in host memory.
printf("Copy output data from the CUDA device to the host memory\n");
checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size));
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i) {
if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
printf("Test PASSED\n");
// Free device global memory
checkCudaErrors(cuMemFree(d_A));
checkCudaErrors(cuMemFree(d_B));
checkCudaErrors(cuMemFree(d_C));
// Free host memory
free(h_A);
free(h_B);
free(h_C);
printf("Done\n");
return 0;
}
$ nvcc -g -I/usr/local/cuda/samples/common/inc -o test vectorAdd.cpp -lnvrtc -lcuda
$ cuda-gdb ./test
NVIDIA (R) CUDA Debugger
10.0 release
Portions Copyright (C) 2007-2018 NVIDIA Corporation
GNU gdb (GDB) 7.12
Copyright (C) 2016 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-pc-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ./test...done.
(cuda-gdb) break vectorAdd
Function "vectorAdd" not defined.
Make breakpoint pending on future shared library load? (y or [n]) y
Breakpoint 1 (vectorAdd) pending.
(cuda-gdb) r
Starting program: /home/user2/misc/junk/vectorAdd_nvrtc/test
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
[New Thread 0x7fffedc00700 (LWP 16789)]
> Using CUDA Device [1]: Tesla K40m
> GPU Device has SM 3.5 compute capability
[New Thread 0x7fffed3ff700 (LWP 16790)]
[Vector addition of 50000 elements]
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 196 blocks of 256 threads
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
Thread 1 "test" hit Breakpoint 1, vectorAdd<<<(196,1,1),(256,1,1)>>> (A=0x7fffce800000, B=0x7fffce830e00, C=0x7fffce861c00, numElements=50000) at ./vectorAdd_kernel.cu:21
21 int i = blockDim.x * blockIdx.x + threadIdx.x;
(cuda-gdb) step
23 if (i < numElements) {
(cuda-gdb) step
24 C[i] = A[i] + B[i];
(cuda-gdb) step
26 }
(cuda-gdb) quit
A debugging session is active.
Inferior 1 [process 16777] will be killed.
Quit anyway? (y or n) y
$

-ta=tesla:managed:cuda8 but cuMemAllocManaged returned error 2: Out of memory

I'm new to OpenACC. I like it very much so far as I'm familiar with OpenMP.
I have 2 1080Ti cards each with 9GB and I've 128GB of RAM. I'm trying a very basic test to allocate an array, initialize it, then sum it up in parallel. This works for 8 GB but when I increase to 10 GB I get out-of-memory error. My understanding was that with unified memory of Pascal (which these card are) and CUDA 8, I could allocate an array larger than the GPU's memory and the hardware will page in and page out on demand.
Here's my full C code test :
$ cat firstAcc.c
#include <stdio.h>
#include <openacc.h>
#include <stdlib.h>
#define GB 10
int main()
{
float *a;
size_t n = GB*1024*1024*1024/sizeof(float);
size_t s = n * sizeof(float);
a = (float *)malloc(s);
if (!a) { printf("Failed to malloc.\n"); return 1; }
printf("Initializing ... ");
for (int i = 0; i < n; ++i) {
a[i] = 0.1f;
}
printf("done\n");
float sum=0.0;
#pragma acc loop reduction (+:sum)
for (int i = 0; i < n; ++i) {
sum+=a[i];
}
printf("Sum is %f\n", sum);
free(a);
return 0;
}
As per the "Enable Unified Memory" section of this article I compile it with :
$ pgcc -acc -fast -ta=tesla:managed:cuda8 -Minfo firstAcc.c
main:
20, Loop not fused: function call before adjacent loop
Generated vector simd code for the loop
28, Loop not fused: function call before adjacent loop
Generated vector simd code for the loop containing reductions
Generated a prefetch instruction for the loop
I need to understand those messages but for now I don't think they are relevant. Then I run it :
$ ./a.out
malloc: call to cuMemAllocManaged returned error 2: Out of memory
Aborted (core dumped)
This works fine if I change GB to 8. I expected 10GB to work (despite the GPU card having 9GB) thanks to Pascal 1080Ti and CUDA 8.
Have I misunderstand, or what am I doing wrong? Thanks in advance.
$ pgcc -V
pgcc 17.4-0 64-bit target on x86-64 Linux -tp haswell
PGI Compilers and Tools
Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
$ cat /usr/local/cuda-8.0/version.txt
CUDA Version 8.0.61
Besides what Bob mentioned, I made a few more fixes.
First, you're not actually generating an OpenACC compute region since you only have a "#pragma acc loop" directive. This should be "#pragma acc parallel loop". You can see this in the compiler feedback messages where it's only showing host code optimizations.
Second, the "i" index should be declared as a "long". Otherwise, you'll overflow the index.
Finally, you need to add "cc60" to your target accelerator options to tell the compiler to target a Pascal based GPU.
% cat mi.c
#include <stdio.h>
#include <openacc.h>
#include <stdlib.h>
#define GB 20ULL
int main()
{
float *a;
size_t n = GB*1024ULL*1024ULL*1024ULL/sizeof(float);
size_t s = n * sizeof(float);
printf("n = %lu, s = %lu\n", n, s);
a = (float *)malloc(s);
if (!a) { printf("Failed to malloc.\n"); return 1; }
printf("Initializing ... ");
for (int i = 0; i < n; ++i) {
a[i] = 0.1f;
}
printf("done\n");
double sum=0.0;
#pragma acc parallel loop reduction (+:sum)
for (long i = 0; i < n; ++i) {
sum+=a[i];
}
printf("Sum is %f\n", sum);
free(a);
return 0;
}
% pgcc -fast -acc -ta=tesla:managed,cuda8.0,cc60 -Minfo=accel mi.c
main:
21, Accelerator kernel generated
Generating Tesla code
21, Generating reduction(+:sum)
22, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
21, Generating implicit copyin(a[:5368709120])
% ./a.out
n = 5368709120, s = 21474836480
Initializing ... done
Sum is 536870920.000000
I believe a problem is here:
size_t n = GB*1024*1024*1024/sizeof(float);
when I compile that line of code with g++, I get a warning about integer overflow. For some reason the PGI compiler is not warning, but the same badness is occurring under the hood. After the declarations of s, and n, if I add a printout like this:
size_t n = GB*1024*1024*1024/sizeof(float);
size_t s = n * sizeof(float);
printf("n = %lu, s = %lu\n", n, s); // add this line
and compile with PGI 17.04, and run (on a P100, with 16GB) I get output like this:
$ pgcc -acc -fast -ta=tesla:managed:cuda8 -Minfo m1.c
main:
16, Loop not fused: function call before adjacent loop
Generated vector simd code for the loop
22, Loop not fused: function call before adjacent loop
Generated vector simd code for the loop containing reductions
Generated a prefetch instruction for the loop
$ ./a.out
n = 4611686017890516992, s = 18446744071562067968
malloc: call to cuMemAllocManaged returned error 2: Out of memory
Aborted
$
so it's evident that n and s are not what you intended.
We can fix this by marking all of those constants with ULL, and then things seem to work correctly for me:
$ cat m1.c
#include <stdio.h>
#include <openacc.h>
#include <stdlib.h>
#define GB 20ULL
int main()
{
float *a;
size_t n = GB*1024ULL*1024ULL*1024ULL/sizeof(float);
size_t s = n * sizeof(float);
printf("n = %lu, s = %lu\n", n, s);
a = (float *)malloc(s);
if (!a) { printf("Failed to malloc.\n"); return 1; }
printf("Initializing ... ");
for (int i = 0; i < n; ++i) {
a[i] = 0.1f;
}
printf("done\n");
double sum=0.0;
#pragma acc loop reduction (+:sum)
for (int i = 0; i < n; ++i) {
sum+=a[i];
}
printf("Sum is %f\n", sum);
free(a);
return 0;
}
$ pgcc -acc -fast -ta=tesla:managed:cuda8 -Minfo m1.c
main:
16, Loop not fused: function call before adjacent loop
Generated vector simd code for the loop
22, Loop not fused: function call before adjacent loop
Generated vector simd code for the loop containing reductions
Generated a prefetch instruction for the loop
$ ./a.out
n = 5368709120, s = 21474836480
Initializing ... done
Sum is 536870920.000000
$
Note that I've made another change above as well. I changed the sum accumulation variable from float to double. This is necessary to preserve somewhat "sensible" results when doing a very large reduction across very small quantities.
And, as #MatColgrove pointed out in his answer, I missed a few other things as well.

execute CUDA kernel few times

I have problem with execute CUDA kernel few times. Somethings is wrong with environment in my code. First time code works properly, second time during clean up environemnt before third call there are random crashes.
I think for some reason I have memory corruption. Crashes occurs sometimes in CUDA driver, sometimes simple printf crashes or cheap, kernel32.dll. I suppose that I have problem with memory management in my code.
What should be done before again kernel execution ?
This code works when I execute one time.
I'm using CURAND to initialize random generators.
Here is my code:
#define GRID_BLOCK 64
#define GRID_THREAD 8
#define CITIES 100
#define CIPOW2 101
int lenghtPaths = GRID_BLOCK*GRID_THREAD;
int cities = CITIES;
//prepare CURAND
curandState *devStates;
CUDA_CALL(cudaMalloc((void **)&devStates, GRID_BLOCK*GRID_THREAD*sizeof(curandState)));
/* Setup prng states */
setup_kernel<<<GRID_BLOCK ,GRID_THREAD>>>(devStates);
CUDA_CALL(cudaDeviceSynchronize());
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
fprintf(stderr, "CURAND preparation failed: %s\n", cudaGetErrorString(cudaStatus));
//copy distance grid to constant memory
cudaMemcpyToSymbol(cdist, dist, sizeof(int) *CIPOW2*CIPOW2);
CUDA_CALL(cudaMalloc((void**)&dev_pathsForThreads, lenghtPaths * cities * sizeof(int)));
CUDA_CALL(cudaMalloc((void**)&d_results, GRID_BLOCK*GRID_THREAD * sizeof(int)));
for (int k = 0; k < 5; k++){
int* pathsForThreads;
pathsForThreads = (int*)malloc(lenghtPaths * cities * sizeof(int));
pathsForThreads = PreaparePaths(Path, lenghtPaths, cities);
CUDA_CALL(cudaMemcpy(dev_pathsForThreads, pathsForThreads, lenghtPaths *cities*sizeof(int), cudaMemcpyHostToDevice));
GPUAnnealing<<<GRID_BLOCK ,GRID_THREAD >>>(dev_pathsForThreads, devStates, iterationLimit,temperature, coolingRate, absoluteTemperature, cities,d_results);
CUDA_CALL(cudaDeviceSynchronize());
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
fprintf(stderr, "GPUAnnealing launch failed: %s\n", cudaGetErrorString(cudaStatus));
h_results = (int*) malloc(GRID_BLOCK*GRID_THREAD * sizeof(int));
//Copy lenght of each path to CPU
CUDA_CALL(cudaMemcpy(h_results, d_results, GRID_BLOCK*GRID_THREAD * sizeof(int),cudaMemcpyDeviceToHost));
//Copy paths to CPU
CUDA_CALL(cudaMemcpy(pathsForThreads, dev_pathsForThreads, lenghtPaths *cities*sizeof(int), cudaMemcpyDeviceToHost));
//check the shortest path
shortestPath = FindTheShortestPath(h_results);
fprintf (stdout, "Shortest path on index = %d value = %d \n", shortestPath, h_results[shortestPath]);
for (int i = 0; i < GRID_BLOCK*GRID_BLOCK ; i++)
Path[i] = pathsForThreads[shortestPath*CITIES +i];
free(pathsForThreads);
free(h_results);
}
CUDA_CALL(cudaFree(dev_pathsForThreads));
CUDA_CALL(cudaFree(d_results));
CUDA_CALL(cudaFree(devStates));
CUDA_CALL(cudaDeviceReset());
This is a bad idea:
pathsForThreads = (int*)malloc(lenghtPaths * cities * sizeof(int));
pathsForThreads = PreaparePaths(Path, lenghtPaths, cities);
If the call to PreaparePaths assigns some other value to pathsForThreads than what was assigned to it by the malloc operation, then later when you do this:
free(pathsForThreads);
You're going to get unpredictable results.
You should not reassign a pointer that you're subsequently going to pass to free to some other value. The man page for free indicates:
free() frees the memory space pointed to by ptr, which must have been
returned by a previous call to malloc(), calloc() or realloc().
So reassigning the pointer to something else is not allowed if you intend to pass it to free

Link .ll files generated by compiling .cu file with clang

I am compiling the following code using clang with:
clang++ -std=c++11 -emit-llvm -c -S $1 --cuda-gpu-arch=sm_30. This generates vectoradd-cuda-nvptx64-nvidia-cuda-sm_30.ll and vectoradd.ll files. The goal to run some LLVM analysis passes on the kernel, which would possibly instrument it. So i would like to link the post-analysis IR to an executable but i am not sure how. When i try to link the .ll files with llvm-link i am getting the error Linking globals named '_Z9vectoraddPiS_S_i': symbol multiply defined!. I am not really sure how to achieve this, so any help is appreciated.
#define THREADS_PER_BLOCK 512
__global__ void vectoradd(int *A, int *B, int *C, int N) {
int gi = threadIdx.x + blockIdx.x * blockDim.x;
if ( gi < N) {
C[gi] = A[gi] + B[gi];
}
}
int main(int argc, char **argv) {
int N = 10000, *d_A, *d_B, *d_C;
/// allocate host memory
std::vector<int> A(N);
std::vector<int> B(N);
std::vector<int> C(N);
/// allocate device memory
cudaMalloc((void **) &d_A, N * sizeof(int));
cudaMalloc((void **) &d_B, N * sizeof(int));
cudaMalloc((void **) &d_C, N * sizeof(int));
/// populate host data
for ( size_t i = 0; i < N; ++i) {
A[i] = i; B[i] = i;
}
/// copy to device
cudaMemcpy(d_A, &A[0], N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, &B[0], N * sizeof(int), cudaMemcpyHostToDevice);
dim3 block(THREADS_PER_BLOCK, 1, 1);
dim3 grid((N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, 1, 1);
vectoradd<<<grid,block>>>(d_A, d_B, d_C, N);
cudaDeviceSynchronize();
cudaMemcpy(&C[0], d_C, N * sizeof(int), cudaMemcpyDeviceToHost);
return 0;
}
The CUDA compilation trajectory in Clang is rather complicated (as it is in the NVIDIA toolchain) and what you are trying to do cannot work. The LLVM IR from each branch of the compilation process must remain separate until directly linkable objects are available. As a result, there are many intermediate steps which you will need to perform manually.
LLVM IR code for the GPU must be compiled firstly to PTX code, and then assembled to a binary payload which can be linked against host object files.
So in your example, you first do something like:
clang++ -std=c++11 -emit-llvm -c -S test.cu --cuda-gpu-arch=sm_52
which emits two llvm IR files test-cuda-nvptx64-nvidia-cuda-sm_52.ll and test.ll. The GPU code then needs to be compiled to PTX (see more about the nvptx backend here):
llc -mcpu=sm_52 test-cuda-nvptx64-nvidia-cuda-sm_52.ll -o test.ptx
Now the PTX code can be assembled into an ELF file which can later be linked by nvcc (or the host linker with an couple of additional steps) in the normal way:
ptxas --gpu-name=sm_52 test.ptx -o test.ptx.o
fatbinary --cuda -64 --create test.fatbin --image=profile=sm_52,file=test.ptx.o
For the host code you do something like
llc test.ll
clang -m64 -c test.s
to produce assembler output from the LLVM IR and then assemble that to an object file.
Now with a fatbin file containing CUDA the compiled code, and an object file containing the compiled host code, you can perform linkage. I have not been able to test linking a host object file with a fatbinary using clang, that is something you will need to work out yourself. It will be instructive to study both the verbose output of clang during a CUDA compilation call, and also the nvcc documentation to get a better feel for how the device code build system works.