While playing with the CUBLAS matrix multiplication sample I realised that the nvprof profiler shows an extra cudaMemcpy Host-to-Device call.
While 2 appear in the source code, 3 actual calls are issued.
Why would that be? Is it an intrinsic effect of using CUBLAS?
Code from the CUDA CUBLAS sample, compiled with the flags: -lcublas -I/usr/local/cuda-7.5/samples/common/inc
//////////////////////////////////////////////////////////////////////////
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA Samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// CUDA and CUBLAS functions
#include <helper_functions.h>
#include <helper_cuda.h>
#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif
typedef struct _matrixSize // Optional Command-line multiplier for matrix sizes
{
unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
} sMatrixSize;
////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set matrix multiply on CPU
//! C = A * B
//! @param C reference data, computed but preallocated
//! @param A matrix A as provided to device
//! @param B matrix B as provided to device
//! @param hA height of matrix A
//! @param wB width of matrix B
////////////////////////////////////////////////////////////////////////////////
void
matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
{
for (unsigned int i = 0; i < hA; ++i)
for (unsigned int j = 0; j < wB; ++j)
{
double sum = 0;
for (unsigned int k = 0; k < wA; ++k)
{
double a = A[i * wA + k];
double b = B[k * wB + j];
sum += a * b;
}
C[i * wB + j] = (float)sum;
}
}
// Allocates a matrix with random float entries.
void randomInit(float *data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = rand() / (float)RAND_MAX;
}
void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol)
{
printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
int i,j,k;
int error_count=0;
for (j = 0; j < height; j++)
{
if (error_count < iListLength)
{
printf("\n Row %d:\n", j);
}
for (i = 0; i < width; i++)
{
k = j * width + i;
float fDiff = fabs(data1[k] - data2[k]);
if (fDiff > fListTol)
{
if (error_count < iListLength)
{
printf(" Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff);
}
error_count++;
}
}
}
printf(" \n Total Errors = %d\n", error_count);
}
void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
{
// By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
cudaError_t error;
devID = 0;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
if (error != cudaSuccess)
{
printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
}
// get number of SMs on this GPU
error = cudaGetDevice(&devID);
if (error != cudaSuccess)
{
printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
if (checkCmdLineFlag(argc, (const char **)argv, "sizemult"))
{
iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
}
iSizeMultiple = min(iSizeMultiple, 10);
iSizeMultiple = max(iSizeMultiple, 1);
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;
matrix_size.uiWA = 3 * block_size * iSizeMultiple;
matrix_size.uiHA = 4 * block_size * iSizeMultiple;
matrix_size.uiWB = 2 * block_size * iSizeMultiple;
matrix_size.uiHB = 3 * block_size * iSizeMultiple;
matrix_size.uiWC = 2 * block_size * iSizeMultiple;
matrix_size.uiHC = 4 * block_size * iSizeMultiple;
printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n",
matrix_size.uiHA, matrix_size.uiWA,
matrix_size.uiHB, matrix_size.uiWB,
matrix_size.uiHC, matrix_size.uiWC);
if( matrix_size.uiWA != matrix_size.uiHB ||
matrix_size.uiHA != matrix_size.uiHC ||
matrix_size.uiWB != matrix_size.uiWC)
{
printf("ERROR: Matrix sizes do not match!\n");
exit(-1);
}
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
{
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;
// set seed for rand()
srand(2006);
// allocate host memory for matrices A and B
unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = (float *)malloc(mem_size_A);
unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = (float *)malloc(mem_size_B);
// set seed for rand()
srand(2006);
// initialize host memory
randomInit(h_A, size_A);
randomInit(h_B, size_B);
// allocate device memory
float *d_A, *d_B, *d_C;
unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
unsigned int mem_size_C = sizeof(float) * size_C;
// allocate host memory for the result
float *h_C = (float *) malloc(mem_size_C);
float *h_CUBLAS = (float *) malloc(mem_size_C);
checkCudaErrors(cudaMalloc((void **) &d_A, mem_size_A));
checkCudaErrors(cudaMalloc((void **) &d_B, mem_size_B));
checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMalloc((void **) &d_C, mem_size_C));
// setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);
// create and start timer
printf("Computing result using CUBLAS...");
// execute the kernel
int nIter = 30;
// CUBLAS version 2.0
{
const float alpha = 1.0f;
const float beta = 0.0f;
cublasHandle_t handle;
cudaEvent_t start, stop;
checkCudaErrors(cublasCreate(&handle));
//Perform warmup operation with cublas
checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));
// Allocate CUDA events that we'll use for timing
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
// Record the start event
checkCudaErrors(cudaEventRecord(start, NULL));
for (int j = 0; j < nIter; j++)
{
//note cublas is column primary!
//need to transpose the order
checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));
}
printf("done.\n");
// Record the stop event
checkCudaErrors(cudaEventRecord(stop, NULL));
// Wait for the stop event to complete
checkCudaErrors(cudaEventSynchronize(stop));
float msecTotal = 0.0f;
checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
// Compute and print the performance
float msecPerMatrixMul = msecTotal / nIter;
double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC * (double)matrix_size.uiWC * (double)matrix_size.uiHB;
double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
printf(
"Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
gigaFlops,
msecPerMatrixMul,
flopsPerMatrixMul);
// copy result from device to host
checkCudaErrors(cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost));
// Destroy the handle
checkCudaErrors(cublasDestroy(handle));
}
// compute reference solution
printf("Computing result using host CPU...");
float *reference = (float *)malloc(mem_size_C);
matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
printf("done.\n");
// check result (CUBLAS)
bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);
if (resCUBLAS != true)
{
printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f);
}
printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL");
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
// clean up memory
free(h_A);
free(h_B);
free(h_C);
free(reference);
checkCudaErrors(cudaFree(d_A));
checkCudaErrors(cudaFree(d_B));
checkCudaErrors(cudaFree(d_C));
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
cudaDeviceReset();
if (resCUBLAS == true)
{
return EXIT_SUCCESS; // EXIT_SUCCESS == 0
}
else
{
return EXIT_FAILURE; // EXIT_FAILURE != 0
}
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("[Matrix Multiply CUBLAS] - Starting...\n");
int devID = 0, sizeMult = 5;
sMatrixSize matrix_size;
initializeCUDA(argc, argv, devID, sizeMult, matrix_size);
int matrix_result = matrixMultiply(argc, argv, devID, matrix_size);
return matrix_result;
}
The additional memory transfer appears to be caused by the CUBLAS library itself and is triggered by the call to cublasCreate. You can confirm this by profiling the following code:
#include <cublas_v2.h>
int main()
{
cublasHandle_t handle;
cublasCreate(&handle);
cudaDeviceReset();
return 0;
}
which nvprof reports as calling cudaMemcpy:
$ nvprof ./a.out
==9536== NVPROF is profiling process 9536, command: ./a.out
==9536== Profiling application: ./a.out
==9536== Profiling result:
Time(%) Time Calls Avg Min Max Name
100.00% 1.1190us 1 1.1190us 1.1190us 1.1190us [CUDA memcpy HtoD]
==9536== API calls:
Time(%) Time Calls Avg Min Max Name
76.51% 348.53ms 1 348.53ms 348.53ms 348.53ms cudaFree
23.26% 105.97ms 1 105.97ms 105.97ms 105.97ms cudaDeviceReset
0.09% 420.25us 178 2.3600us 125ns 103.52us cuDeviceGetAttribute
0.08% 349.37us 2 174.69us 110.59us 238.78us cuDeviceTotalMem
0.04% 202.10us 3 67.366us 9.3750us 109.43us cudaMalloc
0.01% 55.217us 2 27.608us 24.529us 30.688us cuDeviceGetName
0.00% 14.365us 1 14.365us 14.365us 14.365us cudaMemcpy
0.00% 10.016us 16 626ns 434ns 2.0440us cudaEventCreateWithFlags
0.00% 4.5000us 11 409ns 271ns 1.2730us cudaDeviceGetAttribute
0.00% 3.4510us 4 862ns 251ns 2.3370us cuDeviceGetCount
0.00% 2.3200us 4 580ns 281ns 1.0350us cuDeviceGet
0.00% 1.3600us 1 1.3600us 1.3600us 1.3600us cudaGetDevice
0.00% 630ns 1 630ns 630ns 630ns cuInit
0.00% 339ns 1 339ns 339ns 339ns cuDriverGetVersion
I doubt that anyone without access to the current CUBLAS source can explain why initialising the CUBLAS library triggers a host-to-device transfer, but that seems to be the cause of what you are observing.
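If that one-off transfer from library initialization gets in the way of your own measurements, one workaround (just a sketch, assuming you only care about your own copies and kernels) is to bracket the region of interest with cudaProfilerStart/cudaProfilerStop from cuda_profiler_api.h and run nvprof with --profile-from-start off, so whatever cublasCreate does internally falls outside the profiled range:
#include <cublas_v2.h>
#include <cuda_profiler_api.h>

int main()
{
    cublasHandle_t handle;
    cublasCreate(&handle);      // library initialization (and its internal copy) happens here

    cudaProfilerStart();        // with --profile-from-start off, profiling begins here
    // ... the cudaMemcpy and cublasSgemm calls you actually want to measure ...
    cudaProfilerStop();         // profiling ends here

    cublasDestroy(handle);
    cudaDeviceReset();
    return 0;
}
// invoke as: nvprof --profile-from-start off ./a.out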
I want to perform 441 2D, 32-by-32 FFTs using the batched method provided by the cuFFT library. The parameters of the transform are the following:
int n[2] = {32,32};
int inembed[] = {32,32};
int onembed[] = {32,32/2+1};
cufftPlanMany(&plan,2,n,inembed,1,32*32,onembed,1,32*(32/2+1),CUFFT_D2Z,441);
cufftPlanMany(&inverse_plan,2,n,onembed,1,32*32,inembed,1,32*32,CUFFT_Z2D,441);
After performing the forward and inverse FFTs using the above plans, I could not get the original data back.
Can anyone advise me how to set the parameters correctly for cufftPlanMany? Many thanks in advance.
By the way, is cufftPlanMany the best approach for my situation?
Here is a full example of how to use cufftPlanMany to perform batched forward and inverse transformations in CUDA. The example performs float to cufftComplex transformations and back. The final result of the forward+inverse transformation is correct up to a multiplicative constant equal to the overall number of matrix elements, nRows*nCols.
#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <assert.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getchar(); exit(code); }
}
}
/*********************/
/* CUFFT ERROR CHECK */
/*********************/
static const char *_cudaGetErrorEnum(cufftResult error)
{
switch (error)
{
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";
case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";
case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";
case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";
case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";
case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";
case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";
case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";
case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";
case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
}
return "<unknown>";
}
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error in file '%s', line %d\nerror %d: %s\nterminating!\n", file, line, err, _cudaGetErrorEnum(err));
cudaDeviceReset(); assert(0);
}
}
/********/
/* MAIN */
/********/
int main() {
cufftHandle forward_plan, inverse_plan;
int batch = 3;
int rank = 2;
int nRows = 5;
int nCols = 5;
int n[2] = {nRows, nCols};
int idist = nRows*nCols;
int odist = nRows*(nCols/2+1);
int inembed[] = {nRows, nCols};
int onembed[] = {nRows, nCols/2+1};
int istride = 1;
int ostride = 1;
cufftSafeCall(cufftPlanMany(&forward_plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, batch));
float *h_in = (float*)malloc(sizeof(float)*nRows*nCols*batch);
for(int i=0; i<nRows*nCols*batch; i++) h_in[i] = 1.f;
float2* h_freq = (float2*)malloc(sizeof(float2)*nRows*(nCols/2+1)*batch);
float* d_in; gpuErrchk(cudaMalloc(&d_in, sizeof(float)*nRows*nCols*batch));
float2* d_freq; gpuErrchk(cudaMalloc(&d_freq, sizeof(float2)*nRows*(nCols/2+1)*batch));
gpuErrchk(cudaMemcpy(d_in,h_in,sizeof(float)*nRows*nCols*batch,cudaMemcpyHostToDevice));
cufftSafeCall(cufftExecR2C(forward_plan, d_in, d_freq));
gpuErrchk(cudaMemcpy(h_freq,d_freq,sizeof(float2)*nRows*(nCols/2+1)*batch,cudaMemcpyDeviceToHost));
for(int i=0; i<nRows*(nCols/2+1)*batch; i++) printf("Direct transform: %i %f %f\n",i,h_freq[i].x,h_freq[i].y);
cufftSafeCall(cufftPlanMany(&inverse_plan, rank, n, onembed, ostride, odist, inembed, istride, idist, CUFFT_C2R, batch));
cufftSafeCall(cufftExecC2R(inverse_plan, d_freq, d_in));
gpuErrchk(cudaMemcpy(h_in,d_in,sizeof(float)*nRows*nCols*batch,cudaMemcpyDeviceToHost));
for(int i=0; i<nRows*nCols*batch; i++) printf("Inverse transform: %i %f \n",i,h_in[i]);
getchar();
return 0;
}
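As noted above, cuFFT does not normalize the inverse transform, so the inverse output is the original data scaled by nRows*nCols. If you want the original values back exactly, divide by that factor afterwards; a minimal sketch (scaleKernel is a hypothetical helper, not part of cuFFT):
// Hypothetical helper: multiply every element by 'factor' to undo the
// nRows*nCols scaling introduced by the unnormalized C2R transform.
__global__ void scaleKernel(float *data, int n, float factor)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= factor;
}

// Usage, placed right after cufftExecC2R(inverse_plan, d_freq, d_in):
// int total = nRows * nCols * batch;
// scaleKernel<<<(total + 255) / 256, 256>>>(d_in, total, 1.f / (nRows * nCols));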
I now only need to show the intermediate progress of a matrix multiplication.
for(unsigned int col=0; col<mtxSize; col++) {
unsigned tmp = 0;
for(unsigned int row=0; row<mtxSize; row++) {
for(unsigned int idx=0; idx<mtxSize; idx++) {
tmp += h_A[col*mtxSize+idx] * h_B[idx*mtxSize+row];
}
h_Rs[col*mtxSize+row] = tmp;
tmp = 0;
int rate_tmp = (col*mtxSize + (row+1))*100;
// Maybe like this...
fprintf(stdout, "Progress : %d.%d %%\r", rate_tmp/actMtxSize, rate_tmp%actMtxSize);
fflush(stdout);
}
}
In the case of the host code (using the CPU), it is very easy because it processes sequentially, so we can check progress very easily.
But in the case of the GPU, which processes in parallel, what should I do?
Once the kernel is running, it does not return until the kernel execution finishes.
So I can't check intermediate data during the kernel execution time.
I think I need to use an asynchronous kernel call, but I do not know it well.
And even if an asynchronous kernel call is used, to gather the data from all of the blocks across the processors, do I have to use atomicAdd() (in other words, global memory access), which includes some overhead?
Give me some advice or hints.
And I want to know how to do this in the case of CUDA.
Here is a code which demonstrates how to check progress from a matrix multiply kernel:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define TIME_INC 100000000
#define INCS 10
#define USE_PROGRESS 1
#define MAT_DIMX 4000
#define MAT_DIMY MAT_DIMX
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void mykernel(volatile int *data){
unsigned long time;
for (int i = 0; i < INCS; i++){
atomicAdd((int *)data,1);
__threadfence_system();
time = clock64();
while((clock64() - time)<TIME_INC) {};
}
printf("progress check finished\n");
}
__global__ void matmult(float *a, float *b, float *c, unsigned int rowA, unsigned int colA, unsigned int colB, volatile int *progress){
unsigned int row = threadIdx.x+blockDim.x*blockIdx.x;
unsigned int col = threadIdx.y+blockDim.y*blockIdx.y;
if ((row < rowA) && (col < colB)){
float temp = 0.0f;
for (unsigned int k = 0; k < colA; k++)
temp += a[(row*colA)+k] * b[(k*colB) + col];
c[(row*colB)+col] = temp;
#if USE_PROGRESS
if (!(threadIdx.x || threadIdx.y)){
atomicAdd((int *)progress, 1);
__threadfence_system();
}
#endif
}
}
int main(){
// simple test to demonstrate reading progress data from kernel
volatile int *d_data, *h_data;
cudaSetDeviceFlags(cudaDeviceMapHost);
cudaCheckErrors("cudaSetDeviceFlags error");
cudaHostAlloc((void **)&h_data, sizeof(int), cudaHostAllocMapped);
cudaCheckErrors("cudaHostAlloc error");
cudaHostGetDevicePointer((int **)&d_data, (int *)h_data, 0);
cudaCheckErrors("cudaHostGetDevicePointer error");
*h_data = 0;
printf("kernel starting\n");
mykernel<<<1,1>>>(d_data);
cudaCheckErrors("kernel fail");
int value = 0;
do{
int value1 = *h_data;
if (value1 > value){
printf("h_data = %d\n", value1);
value = value1;}}
while (value < (INCS-1));
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail 2");
// now try matrix multiply with progress
float *h_c, *d_a, *d_b, *d_c;
h_c = (float *)malloc(MAT_DIMX*MAT_DIMY*sizeof(float));
if (h_c == NULL) {printf("malloc fail\n"); return 1;}
cudaMalloc((void **)&d_a, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc a fail");
cudaMalloc((void **)&d_b, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc b fail");
cudaMalloc((void **)&d_c, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc c fail");
for (int i = 0; i < MAT_DIMX*MAT_DIMY; i++) h_c[i] = rand()/(float)RAND_MAX;
cudaMemcpy(d_a, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy a fail");
cudaMemcpy(d_b, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy b fail");
cudaEvent_t start, stop;
cudaEventCreate(&start); cudaEventCreate(&stop);
*h_data=0;
dim3 block(16,16);
dim3 grid(((MAT_DIMX+block.x-1)/block.x), ((MAT_DIMY+block.y-1)/block.y));
printf("matrix multiply kernel starting\n");
cudaEventRecord(start);
matmult<<<grid,block>>>(d_a, d_b, d_c, MAT_DIMY, MAT_DIMX, MAT_DIMX, d_data);
cudaEventRecord(stop);
#if USE_PROGRESS
unsigned int num_blocks = grid.x*grid.y;
float my_progress = 0.0f;
value = 0;
printf("Progress:\n");
do{
cudaEventQuery(stop); // may help WDDM scenario
int value1 = *h_data;
float kern_progress = (float)value1/(float)num_blocks;
if ((kern_progress - my_progress)> 0.1f) {
printf("percent complete = %2.1f\n", (kern_progress*100));
my_progress = kern_progress;}}
while (my_progress < 0.9f);
printf("\n");
#endif
cudaEventSynchronize(stop);
cudaCheckErrors("event sync fail");
float et;
cudaEventElapsedTime(&et, start, stop);
cudaCheckErrors("event elapsed time fail");
cudaDeviceSynchronize();
cudaCheckErrors("mat mult kernel fail");
printf("matrix multiply finished. elapsed time = %f milliseconds\n", et);
return 0;
}
The code associated with the first kernel call is just to demonstrate the basic idea of having a kernel report its progress back.
The second part of the code shows a sample, naive matrix multiply on the GPU, with the GPU reporting its progress back. I have included the ability to remove the progress check code via a preprocessor macro, as well as the ability to time the matrix multiply kernel. For the case I have here, there was no discernible difference in timing with or without the progress code. So while the progress reporting code probably does add some overhead, compared to the scope of a reasonably sized matrix multiply kernel it adds no significant time that I can see.
Some other uses of signalling are discussed here
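If the busy-wait in the host-side polling loop is a concern, the same mechanism works with a short back-off between reads of the mapped counter. A minimal sketch, assuming POSIX usleep and the same h_data and num_blocks as in the code above:
#include <stdio.h>
#include <unistd.h>

// Poll the zero-copy progress counter written by the kernel, sleeping
// briefly between reads instead of spinning at full speed.
void pollProgress(volatile int *h_data, unsigned int num_blocks)
{
    float reported = 0.0f;
    while (reported < 0.9f) {
        float progress = (float)(*h_data) / (float)num_blocks;
        if ((progress - reported) > 0.1f) {
            printf("percent complete = %2.1f\n", progress * 100.0f);
            reported = progress;
        }
        usleep(1000);   // back off for ~1 ms before the next read
    }
}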
This function performs a symmetric matrix-matrix multiplication using CUDA. Although I succeeded in using the nonsymmetric version cublas{t}gemm(), I couldn't use the cublas{t}symm() function properly.
I know that the CUBLAS library uses column-major matrix storage. I am using row-major C/C++ matrices, and I know how to handle this issue for cublas{t}gemm() by swapping the input matrices, etc. However, I couldn't solve it for the symmetric case. The problem is that even if I use column-major matrix storage I get unexpected results. The matrices contain complex floats (cuComplex). I assume I have row-major matrices. Here is the code and the output:
// Matrix multiplication: C = A * B.
// Host code.
//
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA SDK samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions (in addition to helper_cuda.h)
void inline checkError(cublasStatus_t status, const char *msg)
{
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("%s", msg);
exit(EXIT_FAILURE);
}
}
// end of CUDA Helper Functions
// Allocates a matrix with random float entries.
void randomCmplxInit(cuComplex *data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND_MAX);
}
//void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
void initializeCUDA(int argc, char **argv, int &devID)
{
// By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
cudaError_t error;
devID = 0;
int m,n,k;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
if (error != cudaSuccess)
{
printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
}
// get number of SMs on this GPU
error = cudaGetDevice(&devID);
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID)
{
int i,j;
unsigned int m,n,k;
cudaDeviceProp deviceProp;
cudaError_t error;
error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;
m=3; //number of rows of matrix op(A) and C. A--> (m x k)
n=2; //number of columns of matrix op(B) and C. B--> (k x n)
k=m; //number of columns of op(A) and rows of op(B). C--> (m x n)
// I want to compute C = A*B in row-major format,
//so I must find C(T)=B(T)A(T) = C(T)A in column-major format
// allocate host memory for matrices A and B
unsigned int size_A = m*(m+1)/2; //size of a symmetric matrix
unsigned int mem_size_A = sizeof(cuComplex) * size_A;
cuComplex *h_A = (cuComplex *)malloc(mem_size_A);
unsigned int size_B = m*n;
unsigned int mem_size_B = sizeof(cuComplex) * size_B;
cuComplex *h_B = (cuComplex *)malloc(mem_size_B);
// initialize host memory
for (i = 0; i < size_A; ++i)
h_A[i] = make_cuComplex( (float)(i+1),(float)0);
for (i = 0; i < size_B; ++i)
h_B[i] = make_cuComplex((float)(i+2), (float)0);
// allocate device memory
cuComplex *d_A, *d_B, *d_C;
unsigned int size_C = m*n;
unsigned int mem_size_C = sizeof(cuComplex) * size_C;
// allocate host memory for the result
cuComplex *h_C = (cuComplex *) malloc(mem_size_C);
cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C);
error = cudaMalloc((void **) &d_A, mem_size_A);
error = cudaMalloc((void **) &d_B, mem_size_B);
// copy host memory to device
error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
error = cudaMalloc((void **) &d_C, mem_size_C);
// setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(n / threads.x, m / threads.y);
// create and start timer
printf("Computing result using CUBLAS...");
// CUBLAS version 2.0
{
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
const cuComplex alpha = make_cuComplex(1.0f,0.0f);
const cuComplex beta = make_cuComplex(0.0f,0.0f);
//Perform operation with cublas
ret = cublasCsymm(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, n,m,&alpha,d_A,m,d_B,m,&beta,d_C,m);
// copy result from device to host
error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);
checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
}
printf ("\nComputations completed.\n\n");
printf (" symm matrix A: \n");
int s=0;
for (i=0; i<min(m,4); i++) {
for (j=0; j<=i; j++) {
//printf ("%7.5G + j(%7.5G)", h_A[j+i*k].x,h_A[j+i*k].y);
printf ("%7.5G", h_A[s].x);
s++;
}
printf ("\n");
}
printf ("\n matrix B: \n");
for (i=0; i<min(k,4); i++) {
for (j=0; j<min(n,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_B[j+i*n].x,h_B[j+i*n].y);
printf ("%7.5G", h_B[j+i*n].x);
}
printf ("\n");
}
printf ("\n matrix C=A*B: \n");
for (i=0; i<min(m,4); i++) {
for (j=0; j<min(n,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_CUBLAS[j+i*n].x,h_CUBLAS[j+i*n].y);
printf ("%7.5G", h_CUBLAS[j+i*n].x);
}
printf ("\n");
}
// clean up memory
free(h_A);
free(h_B);
free(h_C);
//free(reference);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaDeviceReset();
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("[Matrix Multiply CUBLAS] - Starting...\n");
int devID = 0, sizeMult = 5;
initializeCUDA(argc, argv, devID);
int matrix_result = matrixMultiply(argc, argv, devID);
}
Suppose that I have the following matrices for the multiplication:
A =
1 2 4
2 3 5
4 5 6
B =
2 3
4 5
6 7
and expect to obtain
A*B =
34 41
46 56
64 79
But the obtained OUTPUT is as follows:
symm matrix A:
1
2 3
4 5 6
matrix B:
2 3
4 5
6 7
matrix C=A*B:
78 90
74 97
114 146
What am I missing in this code? Probably the arguments of the cublasCsymm function are wrong.
Thanks,
Kagan
EDIT:
Based on questions posed below, I elected to re-work my answer and example code.
You can handle row-major storage without transposing, at least for these operations. This is further facilitated by the fact that the symm function does not use packed storage.
So to answer the additional questions:
the cublasCsymm function does not use a packed storage format (like some other functions such as cublasCspmv for example), because the cublasCsymm function is intended to duplicate the functionality of the corresponding netlib function, which also does not use a packed storage format. Based on my review of the cublas API, I don't see a symmetric-packed-storage matrix-matrix multiply function available.
You can use row-major storage (e.g. C-style) with cublas, without transposing, at least for these operations (matrix-matrix multiply, without packed storage) by following the advice given here.
What follows is a re-worked version of my previous example, that incorporates the information in item 2 above.
// Matrix multiplication: C = A * B.
// Host code.
//
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA SDK samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions (in addition to helper_cuda.h)
void inline checkError(cublasStatus_t status, const char *msg)
{
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("%s", msg);
exit(EXIT_FAILURE);
}
}
// end of CUDA Helper Functions
// Allocates a matrix with random float entries.
void randomCmplxInit(cuComplex *data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND_MAX);
}
//void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
void initializeCUDA(int argc, char **argv, int &devID)
{
// By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
cudaError_t error;
devID = 0;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
if (error != cudaSuccess)
{
printf("cudaSetDevice returned error code %d, line(%d)\n", error, __
LINE__);
exit(EXIT_FAILURE);
}
}
// get number of SMs on this GPU
error = cudaGetDevice(&devID);
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, dev
iceProp.name, deviceProp.major, deviceProp.minor);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID)
{
int i,j;
unsigned int m,n,k;
cudaDeviceProp deviceProp;
cudaError_t error;
error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
// use a larger block size for Fermi and above
m=3; //number of rows of matrix op(A) and C. A--> (m x k)
n=2; //number of columns of matrix op(B) and C. B--> (k x n)
k=m; //number of columns of op(A) and rows of op(B). C--> (m x n)
// I want to compute C = A*B in row-major format,
//so I must find C(T)=B(T)A(T) = C(T)A in column-major format
// allocate host memory for matrices A and B
unsigned int size_A = m*m; //size of a symmetric matrix
printf("size_A = %d\n", size_A);
unsigned int mem_size_A = sizeof(cuComplex) * size_A;
cuComplex *h_A = (cuComplex *)malloc(mem_size_A);
unsigned int size_B = m*n;
unsigned int mem_size_B = sizeof(cuComplex) * size_B;
cuComplex *h_B = (cuComplex *)malloc(mem_size_B);
// initialize host memory
// for (i = 0; i < size_A; ++i)
// h_A[i] = make_cuComplex( (float)(i+1),(float)0);
h_A[0] = make_cuComplex((float)1, (float)0);
h_A[1] = make_cuComplex((float)2, (float)0);
h_A[2] = make_cuComplex((float)4, (float)0);
h_A[3] = make_cuComplex((float)0, (float)0);
h_A[4] = make_cuComplex((float)3, (float)0);
h_A[5] = make_cuComplex((float)5, (float)0);
h_A[6] = make_cuComplex((float)0, (float)0);
h_A[7] = make_cuComplex((float)0, (float)0);
h_A[8] = make_cuComplex((float)6, (float)0);
// for (i = 0; i < size_B; ++i)
// h_B[i] = make_cuComplex((float)(i+2), (float)0);
h_B[0] = make_cuComplex((float)2, (float)0);
h_B[1] = make_cuComplex((float)3, (float)0);
h_B[2] = make_cuComplex((float)4, (float)0);
h_B[3] = make_cuComplex((float)5, (float)0);
h_B[4] = make_cuComplex((float)6, (float)0);
h_B[5] = make_cuComplex((float)7, (float)0);
// allocate device memory
cuComplex *d_A, *d_B, *d_C;
unsigned int size_C = m*n;
unsigned int mem_size_C = sizeof(cuComplex) * size_C;
// allocate host memory for the result
cuComplex *h_C = (cuComplex *) malloc(mem_size_C);
cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C);
error = cudaMalloc((void **) &d_A, mem_size_A);
error = cudaMalloc((void **) &d_B, mem_size_B);
// copy host memory to device
error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
error = cudaMalloc((void **) &d_C, mem_size_C);
// create and start timer
printf("Computing result using CUBLAS...");
// CUBLAS version 2.0
{
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
const cuComplex alpha = make_cuComplex(1.0f,0.0f);
const cuComplex beta = make_cuComplex(0.0f,0.0f);
//Perform operation with cublas
ret = cublasCsymm(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, n,m,&alpha,d_A,m,d_B,n,&beta,d_C,n);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCsymm returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
// copy result from device to host
error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);
checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
}
printf ("\nComputations completed.\n\n");
printf (" symm matrix A: \n");
// int s=0;
for (i=0; i<min(m,4); i++) {
for (j=0; j<min(m,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_A[j+i*k].x,h_A[j+i*k].y);
// printf ("%7.5G", h_A[s].x);
printf ("%7.5G", h_A[j+(i*m)].x);
// s++;
}
printf ("\n");
}
printf ("\n matrix B: \n");
for (i=0; i<min(k,4); i++) {
for (j=0; j<min(n,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_B[j+i*n].x,h_B[j+i*n].y);
printf ("%7.5G", h_B[j+(i*n)].x);
}
printf ("\n");
}
printf ("\n matrix C=A*B: \n");
for (i=0; i<min(m,4); i++) {
for (j=0; j<min(n,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_CUBLAS[j+i*n].x,h_CUBLAS[j+i*n].y);
printf ("%7.5G", h_CUBLAS[j+(i*n)].x);
}
printf ("\n");
}
// clean up memory
free(h_A);
free(h_B);
free(h_C);
//free(reference);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaDeviceReset();
return 0;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("[Matrix Multiply CUBLAS] - Starting...\n");
int devID = 0;
initializeCUDA(argc, argv, devID);
int matrix_result = matrixMultiply(argc, argv, devID);
cudaCheckErrors("some error");
return 0;
}
$ ./t213
[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "Tesla M2070" with compute capability 2.0
size_A = 9
Computing result using CUBLAS...
Computations completed.
symm matrix A:
1 2 4
0 3 5
0 0 6
matrix B:
2 3
4 5
6 7
matrix C=A*B:
34 41
46 56
64 79
$
ORIGINAL RESPONSE:
Several problems:
When I run your code as you have it posted right now, I don't get the
results that you show. Here's what I get:
[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "Tesla M2070" with compute capability 2.0
Computing result using CUBLAS...
Computations completed.
symm matrix A:
1
2 3
4 5 6
matrix B:
2 3
4 5
6 7
matrix C=A*B:
-131 -128
260 -122
-115 266
The code compiles with a number of warnings, and you're not doing proper error checking (for example, you're not checking the return value from cublasCsymm).
You want to multiply C = A*B. This means A is on the LEFT, but you are passing CUBLAS_SIDE_RIGHT to cublasCsymm. Several other cublasCsymm parameters were wrong as well. I think maybe you thought you could do A*B as (B(T)*A(T)), but that only works for square matrices. Not sure what you were thinking, exactly.
You have row-major storage on your matrices and are passing them to CUBLAS, which interprets them in column-major order. For the following matrix:
1 2
3 4
row-major storage looks like this:
1 2 3 4
column-major storage looks like this:
1 3 2 4
You can transpose these matrices if you wish, using cublasCgeam (a sketch follows below), or you can manually modify your storage.
You're making an assumption about some kind of compressed storage format for the symmetric matrix A, which is not correct.
Read carefully the definition of the storage type.
It doesn't say the portion of the matrix that is "supplied" or "present"; it says the portion of the matrix that is filled.
Here is a complete code that has the above problems fixed:
// Matrix multiplication: C = A * B.
// Host code.
//
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA SDK samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions (in addition to helper_cuda.h)
void inline checkError(cublasStatus_t status, const char *msg)
{
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("%s", msg);
exit(EXIT_FAILURE);
}
}
// end of CUDA Helper Functions
// Allocates a matrix with random float entries.
void randomCmplxInit(cuComplex *data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND_MAX);
}
//void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
void initializeCUDA(int argc, char **argv, int &devID)
{
// By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
cudaError_t error;
devID = 0;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
if (error != cudaSuccess)
{
printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
}
// get number of SMs on this GPU
error = cudaGetDevice(&devID);
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID)
{
int i,j;
unsigned int m,n,k;
cudaDeviceProp deviceProp;
cudaError_t error;
error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
// use a larger block size for Fermi and above
m=3; //number of rows of matrix op(A) and C. A--> (m x k)
n=2; //number of columns of matrix op(B) and C. B--> (k x n)
k=m; //number of columns of op(A) and rows of op(B). C--> (m x n)
// I want to compute C = A*B in row-major format,
//so I must find C(T)=B(T)A(T) = C(T)A in column-major format
// allocate host memory for matrices A and B
unsigned int size_A = m*m; //size of a symmetric matrix
printf("size_A = %d\n", size_A);
unsigned int mem_size_A = sizeof(cuComplex) * size_A;
cuComplex *h_A = (cuComplex *)malloc(mem_size_A);
unsigned int size_B = m*n;
unsigned int mem_size_B = sizeof(cuComplex) * size_B;
cuComplex *h_B = (cuComplex *)malloc(mem_size_B);
// initialize host memory
// for (i = 0; i < size_A; ++i)
// h_A[i] = make_cuComplex( (float)(i+1),(float)0);
h_A[0] = make_cuComplex((float)1, (float)0);
h_A[1] = make_cuComplex((float)2, (float)0);
h_A[2] = make_cuComplex((float)4, (float)0);
h_A[3] = make_cuComplex((float)0, (float)0);
h_A[4] = make_cuComplex((float)3, (float)0);
h_A[5] = make_cuComplex((float)5, (float)0);
h_A[6] = make_cuComplex((float)0, (float)0);
h_A[7] = make_cuComplex((float)0, (float)0);
h_A[8] = make_cuComplex((float)6, (float)0);
// for (i = 0; i < size_B; ++i)
// h_B[i] = make_cuComplex((float)(i+2), (float)0);
h_B[0] = make_cuComplex((float)2, (float)0);
h_B[1] = make_cuComplex((float)4, (float)0);
h_B[2] = make_cuComplex((float)6, (float)0);
h_B[3] = make_cuComplex((float)3, (float)0);
h_B[4] = make_cuComplex((float)5, (float)0);
h_B[5] = make_cuComplex((float)7, (float)0);
// allocate device memory
cuComplex *d_A, *d_B, *d_C;
unsigned int size_C = m*n;
unsigned int mem_size_C = sizeof(cuComplex) * size_C;
// allocate host memory for the result
cuComplex *h_C = (cuComplex *) malloc(mem_size_C);
cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C);
error = cudaMalloc((void **) &d_A, mem_size_A);
error = cudaMalloc((void **) &d_B, mem_size_B);
// copy host memory to device
error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
error = cudaMalloc((void **) &d_C, mem_size_C);
// create and start timer
printf("Computing result using CUBLAS...");
// CUBLAS version 2.0
{
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
const cuComplex alpha = make_cuComplex(1.0f,0.0f);
const cuComplex beta = make_cuComplex(0.0f,0.0f);
//Perform operation with cublas
ret = cublasCsymm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, m,n,&alpha,d_A,m,d_B,m,&beta,d_C,m);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCsymm returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
Here is the output:
[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "Tesla M2070" with compute capability 2.0
size_A = 9
Computing result using CUBLAS...
Computations completed.
symm matrix A:
1 0 0
2 3 0
4 5 6
matrix B:
2 3
4 5
6 7
matrix C=A*B:
34 41
46 56
64 79
I am calling cudaEventQuery() in a periodic ITIMER callback registered by the main program.
The main thread is at cudaDeviceSynchronize(), waiting for a GPU kernel to finish.
I am seeing that cudaEventQuery() does not return and gets blocked.
I have attached the program and the call stack from the point where cudaEventQuery() gets stuck.
I would appreciate any information/help on eliminating this problem/bug.
Configuration
CUDA 4.1 on an Nvidia Tesla 2070 GPU.
My program
#include <stdio.h>
#include <cuda.h>
#include <unistd.h>
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#define CHECK_CU_ERROR(err, cufunc) \
if (err != CUDA_SUCCESS) \
{ \
printf ("Error %d for CUDA Driver API function '%s'.\n", \
err, cufunc); \
exit(-1); \
}
#define N 100000
static CUcontext context;
static CUdevice device;
cudaEvent_t event;
void event_handler(int signum)
{
printf("\n Timer triggered!");
if (cudaEventQuery(event) == cudaSuccess) {
printf("\n Event finished");
fflush(stdout);
} else {
printf("\n Event NOT finished");
fflush(stdout);
}
}
// Device code
__global__ void VecAdd(const int *A, const int *B, int *C, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
for (long long m = 0; m < 10; m++)
for (long long n = 0; n < 100000; n++)
if (i < size)
C[i] = A[i] + B[i];
}
static void initVec(int *vec, int n)
{
for (int i = 0; i < n; i++)
vec[i] = i;
}
int *d_A;
int *d_B;
int *d_C;
static void *compute(void *ip)
{
size_t size = N * sizeof(int);
int threadsPerBlock = 0;
int blocksPerGrid = 0;
int *h_A, *h_B, *h_C;
//int id = (int) pthread_self() + 1;
// Allocate input vectors h_A and h_B in host memory
h_A = (int *) malloc(size);
h_B = (int *) malloc(size);
h_C = (int *) malloc(size);
// Initialize input vectors
initVec(h_A, N);
initVec(h_B, N);
memset(h_C, 0, size);
// Allocate vectors in device memory
cudaMalloc((void **) &d_A, size);
cudaMalloc((void **) &d_B, size);
cudaMalloc((void **) &d_C, size);
// Copy vectors from host memory to device memory
cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice);
threadsPerBlock = 256;
blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
VecAdd<<<blocksPerGrid, threadsPerBlock,0>>>(d_A, d_B, d_C, N);
cudaEventCreate(&event);
cudaEventRecord(event);
printf("\n Record");
fflush(stdout);
struct sigaction sa;
struct itimerval timer;
memset(&sa, 0, sizeof(sa));
sa.sa_handler = &event_handler;
sigaction(SIGALRM, &sa, NULL);
timer.it_value.tv_sec = 0;
timer.it_value.tv_usec = 250;
timer.it_interval.tv_sec = 1;
timer.it_interval.tv_usec = 250;
setitimer(ITIMER_REAL, &timer, NULL);
return 0;
}
int main(int argc, char *argv[])
{
CUresult err;
int deviceNum = 0;
int deviceCount = 0;
#if 0
// Try different flags
if (cudaSetDeviceFlags(cudaDeviceScheduleSpin) != cudaSuccess) {
printf("\n failed cudaSetDeviceFlags");
exit(-1);
}
#endif
err = cuInit(0);
CHECK_CU_ERROR(err, "cuInit");
err = cuDeviceGetCount(&deviceCount);
CHECK_CU_ERROR(err, "cuDeviceGetCount");
if (deviceCount == 0) {
printf("There is no device supporting CUDA.\n");
exit(-1);
}
err = cuDeviceGet(&device, deviceNum);
CHECK_CU_ERROR(err, "cuDeviceGet");
err = cuCtxCreate(&context, 0, device);
CHECK_CU_ERROR(err, "cuCtxCreate");
compute(0);
cudaDeviceSynchronize();
printf("\n SYNCed");
while (1);
return 0;
}
CALLSTACK where cudaEventQuery gets blocked
(gdb) bt
#0 0x00000037f520e034 in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00000037f5209345 in _L_lock_868 () from /lib64/libpthread.so.0
#2 0x00000037f5209217 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x00007f7bb6fd75b7 in ?? () from /usr/lib64/libcuda.so.1
#4 0x00007f7bb6fd575a in ?? () from /usr/lib64/libcuda.so.1
#5 0x00007f7bb70062e3 in ?? () from /usr/lib64/libcuda.so.1
#6 0x00007f7bb700c3ec in ?? () from /usr/lib64/libcuda.so.1
#7 0x00007f7bb6fc95d8 in ?? () from /usr/lib64/libcuda.so.1
#8 0x00007f7bb6fb9c35 in ?? () from /usr/lib64/libcuda.so.1
#9 0x00007f7bb6a5ad57 in ?? () from /usr/local/cuda/lib64/libcudart.so.4
#10 0x00007f7bb6a8c4f2 in cudaEventQuery () from /usr/local/cuda/lib64/libcudart.so.4
#11 0x0000000000400e8d in event_handler (signum=14) at event_sampling.cu:40
#12 <signal handler called>
#13 0x00007f7bb7003791 in ?? () from /usr/lib64/libcuda.so.1
#14 0x00007f7bb6fd5786 in ?? () from /usr/lib64/libcuda.so.1
#15 0x00007f7bb70062e3 in ?? () from /usr/lib64/libcuda.so.1
#16 0x00007f7bb7006646 in ?? () from /usr/lib64/libcuda.so.1
#17 0x00007f7bb6fd5839 in ?? () from /usr/lib64/libcuda.so.1
#18 0x00007f7bb6fc86e0 in ?? () from /usr/lib64/libcuda.so.1
#19 0x00007f7bb6fa7d62 in ?? () from /usr/lib64/libcuda.so.1
#20 0x00007f7bb6a5e9d3 in ?? () from /usr/local/cuda/lib64/libcudart.so.4
#21 0x00007f7bb6a9318c in cudaDeviceSynchronize () from /usr/local/cuda/lib64/libcudart.so.4
#22 0x00000000004012b3 in main (argc=1, argv=0x7fff20bff048) at event_sampling.cu:157
(gdb)
Here is the same code after removing the Driver API calls:
/*
* Copyright 2011 NVIDIA Corporation. All rights reserved
*
* Sample app to demonstrate use of CUPTI library to obtain profiler
* event values by sampling.
*/
#include <stdio.h>
#include <cuda.h>
#include <unistd.h>
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#define CHECK_CU_ERROR(err, cufunc) \
if (err != CUDA_SUCCESS) \
{ \
printf ("Error %d for CUDA Driver API function '%s'.\n", \
err, cufunc); \
exit(-1); \
}
#define N 100000
cudaEvent_t event;
void event_handler(int signum)
{
printf("\n Timer triggered!");
if (cudaEventQuery(event) == cudaSuccess) {
printf("\n Event finished");
fflush(stdout);
} else {
printf("\n Event NOT finished");
fflush(stdout);
}
}
// Device code
__global__ void VecAdd(const int *A, const int *B, int *C, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
for (long long m = 0; m < 10; m++)
for (long long n = 0; n < 100000; n++)
if (i < size)
C[i] = A[i] + B[i];
}
static void initVec(int *vec, int n)
{
for (int i = 0; i < n; i++)
vec[i] = i;
}
int *d_A;
int *d_B;
int *d_C;
static void *compute(void *ip)
{
size_t size = N * sizeof(int);
int threadsPerBlock = 0;
int blocksPerGrid = 0;
int *h_A, *h_B, *h_C;
//int id = (int) pthread_self() + 1;
// Allocate input vectors h_A and h_B in host memory
h_A = (int *) malloc(size);
h_B = (int *) malloc(size);
h_C = (int *) malloc(size);
// Initialize input vectors
initVec(h_A, N);
initVec(h_B, N);
memset(h_C, 0, size);
// Allocate vectors in device memory
cudaMalloc((void **) &d_A, size);
cudaMalloc((void **) &d_B, size);
cudaMalloc((void **) &d_C, size);
// Copy vectors from host memory to device memory
cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice);
threadsPerBlock = 256;
blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
VecAdd<<<blocksPerGrid, threadsPerBlock,0>>>(d_A, d_B, d_C, N);
cudaEventCreate(&event);
cudaEventRecord(event);
printf("\n Record");
fflush(stdout);
struct sigaction sa;
struct itimerval timer;
memset(&sa, 0, sizeof(sa));
sa.sa_handler = &event_handler;
sigaction(SIGALRM, &sa, NULL);
timer.it_value.tv_sec = 0;
timer.it_value.tv_usec = 250;
timer.it_interval.tv_sec = 1;
timer.it_interval.tv_usec = 250;
setitimer(ITIMER_REAL, &timer, NULL);
return 0;
}
int main(int argc, char *argv[])
{
CUresult err;
#if 0
// Try different flags
if (cudaSetDeviceFlags(cudaDeviceScheduleSpin) != cudaSuccess) {
printf("\n failed cudaSetDeviceFlags");
exit(-1);
}
#endif
compute(0);
cudaDeviceSynchronize();
printf("\n SYNCed");
fflush(stdout);
while (1)sleep(10);
return 0;
}
You have a single thread which launches a kernel, records an event after the kernel, and then calls cudaDeviceSynchronize(). When your signal handler is invoked, it tries to make another CUDA API call, which blocks.
You've ignored one of the basic principles of signal handlers, which is to do as little as possible!
The real question is what you are actually trying to achieve. You could just wait on the event (cudaEventSynchronize()) for what you're doing here, but if your goal is something more complex you should think in more detail about how to achieve it; signal handlers are not the right way.
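For example, if all you want is a periodic check on whether the kernel has finished, you could poll the event from the launching thread instead of from a signal handler. A minimal sketch, assuming <unistd.h> for usleep and the same event recorded in compute():
// In main(), instead of installing the ITIMER handler and then blocking in
// cudaDeviceSynchronize(), poll the recorded event from the same thread.
while (cudaEventQuery(event) == cudaErrorNotReady) {
    printf("\n Event NOT finished");
    fflush(stdout);
    usleep(250000);            // check roughly four times per second
}
printf("\n Event finished");
fflush(stdout);
cudaDeviceSynchronize();       // the kernel is done, so this returns promptly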