How to configure cublas{t}symm() function arguments - cuda

This function performs the symmetric matrix-matrix multiplication using CUDA. Although, I succeeded in using the nonsymmetric version "cublas{t}gemm()" I couldn't use the "cublas{t}symm()" function properly.
I know that CUBLAS library uses column-major matrix storage. I am using row-major C/C++ matrix and I know how to solve this issue for "cublas{t}gemm()" by replacing the input matrices and etc. However, I couldn't solve it for the symmetric case. The problem is even if I use column-major matrix storage I find unexpectable results. Matrices contain complex floats (cuComplex). I assume I have row-major matrices. Here is the code and the output:
// Matrix multiplication: C = A * B.
// Host code.
//
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA SDK samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions (in addition to helper_cuda.h)
void inline checkError(cublasStatus_t status, const char *msg)
{
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("%s", msg);
exit(EXIT_FAILURE);
}
}
// end of CUDA Helper Functions
// Allocates a matrix with random float entries.
void randomCmplxInit(cuComplex *data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND_MAX);
}
//void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
void initializeCUDA(int argc, char **argv, int &devID)
{
// By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
cudaError_t error;
devID = 0;
int m,n,k;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
if (error != cudaSuccess)
{
printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
}
// get number of SMs on this GPU
error = cudaGetDevice(&devID);
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID)
{
int i,j;
unsigned int m,n,k;
cudaDeviceProp deviceProp;
cudaError_t error;
error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;
m=3; //number of rows of matrix op(A) and C. A--> (m x k)
n=2; //number of columns of matrix op(B) and C. B--> (k x n)
k=m; //number of columns of op(A) and rows of op(B). C--> (m x n)
// I want to compute C = A*B in row-major format,
//so I must find C(T)=B(T)A(T) = C(T)A in column-major format
// allocate host memory for matrices A and B
unsigned int size_A = m*(m+1)/2; //size of a symmetric matrix
unsigned int mem_size_A = sizeof(cuComplex) * size_A;
cuComplex *h_A = (cuComplex *)malloc(mem_size_A);
unsigned int size_B = m*n;
unsigned int mem_size_B = sizeof(cuComplex) * size_B;
cuComplex *h_B = (cuComplex *)malloc(mem_size_B);
// initialize host memory
for (i = 0; i < size_A; ++i)
h_A[i] = make_cuComplex( (float)(i+1),(float)0);
for (i = 0; i < size_B; ++i)
h_B[i] = make_cuComplex((float)(i+2), (float)0);
// allocate device memory
cuComplex *d_A, *d_B, *d_C;
unsigned int size_C = m*n;
unsigned int mem_size_C = sizeof(cuComplex) * size_C;
// allocate host memory for the result
cuComplex *h_C = (cuComplex *) malloc(mem_size_C);
cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C);
error = cudaMalloc((void **) &d_A, mem_size_A);
error = cudaMalloc((void **) &d_B, mem_size_B);
// copy host memory to device
error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
error = cudaMalloc((void **) &d_C, mem_size_C);
// setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(n / threads.x, m / threads.y);
// create and start timer
printf("Computing result using CUBLAS...");
// CUBLAS version 2.0
{
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
const cuComplex alpha = make_cuComplex(1.0f,0.0f);
const cuComplex beta = make_cuComplex(0.0f,0.0f);
//Perform operation with cublas
ret = cublasCsymm(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, n,m,&alpha,d_A,m,d_B,m,&beta,d_C,m);
// copy result from device to host
error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);
checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
}
printf ("\nComputations completed.\n\n");
printf (" symm matrix A: \n");
int s=0;
for (i=0; i<min(m,4); i++) {
for (j=0; j<=i; j++) {
//printf ("%7.5G + j(%7.5G)", h_A[j+i*k].x,h_A[j+i*k].y);
printf ("%7.5G", h_A[s].x);
s++;
}
printf ("\n");
}
printf ("\n matrix B: \n");
for (i=0; i<min(k,4); i++) {
for (j=0; j<min(n,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_B[j+i*n].x,h_B[j+i*n].y);
printf ("%7.5G", h_B[j+i*n].x);
}
printf ("\n");
}
printf ("\n matrix C=A*B: \n");
for (i=0; i<min(m,4); i++) {
for (j=0; j<min(n,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_CUBLAS[j+i*n].x,h_CUBLAS[j+i*n].y);
printf ("%7.5G", h_CUBLAS[j+i*n].x);
}
printf ("\n");
}
// clean up memory
free(h_A);
free(h_B);
free(h_C);
//free(reference);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaDeviceReset();
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("[Matrix Multiply CUBLAS] - Starting...\n");
int devID = 0, sizeMult = 5;
initializeCUDA(argc, argv, devID);
int matrix_result = matrixMultiply(argc, argv, devID);
}
I suppose that I have the following matrices for the multiplication:
A =
1 2 4
2 3 5
4 5 6
B =
2 3
4 5
6 7
and expect to obtain
A*B =
34 41
46 56
64 79
But the obtained OUTPUT is as follows:
symm matrix A:
1
2 3
4 5 6
matrix B:
2 3
4 5
6 7
matrix C=A*B:
78 90
74 97
114 146
What am I missing in this code ? Probably the arguments of "cublasCsymm" function are wrong.
Thanks,
Kagan

EDIT:
Based on questions posed below, I elected to re-work my answer and example code.
You can handle row-major storage without transpose at least for these operations. And this observation is further facilitated by the fact that the symm function does not used the packed storage.
So to answer the additional questions:
the cublasCsymm function does not use a packed storage format (like some other functions such as cublasCspmv for example), because the cublasCsymm function is intended to duplicate the functionality of the corresponding netlib function, which also does not use a packed storage format. Based on my review of the cublas API, I don't see a symmetric-packed-storage matrix-matrix multiply function available.
You can use row-major storage (e.g. C-style) with cublas, without transposing, at least for these operations (matrix-matrix multiply, without packed storage) by following the advice given here.
What follows is a re-worked version of my previous example, that incorporates the information in item 2 above.
// Matrix multiplication: C = A * B.
// Host code.
//
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA SDK sa
mples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions (in addition to helper_cuda.h)
void inline checkError(cublasStatus_t status, const char *msg)
{
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("%s", msg);
exit(EXIT_FAILURE);
}
}
// end of CUDA Helper Functions
// Allocates a matrix with random float entries.
void randomCmplxInit(cuComplex *data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND
_MAX);
}
//void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMa
trixSize &matrix_size)
void initializeCUDA(int argc, char **argv, int &devID)
{
// By default, we use device 0, otherwise we override the device ID based on
what is provided at the command line
cudaError_t error;
devID = 0;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
if (error != cudaSuccess)
{
printf("cudaSetDevice returned error code %d, line(%d)\n", error, __
LINE__);
exit(EXIT_FAILURE);
}
}
// get number of SMs on this GPU
error = cudaGetDevice(&devID);
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, dev
iceProp.name, deviceProp.major, deviceProp.minor);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID)
{
int i,j;
unsigned int m,n,k;
cudaDeviceProp deviceProp;
cudaError_t error;
error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
// use a larger block size for Fermi and above
m=3; //number of rows of matrix op(A) and C. A--> (m x k)
n=2; //number of columns of matrix op(B) and C. B--> (k x n)
k=m; //number of columns of op(A) and rows of op(B). C--> (m x n)
// I want to compute C = A*B in row-major format,
//so I must find C(T)=B(T)A(T) = C(T)A in column-major format
// allocate host memory for matrices A and B
unsigned int size_A = m*m; //size of a symmetric matrix
printf("size_A = %d\n", size_A);
unsigned int mem_size_A = sizeof(cuComplex) * size_A;
cuComplex *h_A = (cuComplex *)malloc(mem_size_A);
unsigned int size_B = m*n;
unsigned int mem_size_B = sizeof(cuComplex) * size_B;
cuComplex *h_B = (cuComplex *)malloc(mem_size_B);
// initialize host memory
// for (i = 0; i < size_A; ++i)
// h_A[i] = make_cuComplex( (float)(i+1),(float)0);
h_A[0] = make_cuComplex((float)1, (float)0);
h_A[1] = make_cuComplex((float)2, (float)0);
h_A[2] = make_cuComplex((float)4, (float)0);
h_A[3] = make_cuComplex((float)0, (float)0);
h_A[4] = make_cuComplex((float)3, (float)0);
h_A[5] = make_cuComplex((float)5, (float)0);
h_A[6] = make_cuComplex((float)0, (float)0);
h_A[7] = make_cuComplex((float)0, (float)0);
h_A[8] = make_cuComplex((float)6, (float)0);
// for (i = 0; i < size_B; ++i)
// h_B[i] = make_cuComplex((float)(i+2), (float)0);
h_B[0] = make_cuComplex((float)2, (float)0);
h_B[1] = make_cuComplex((float)3, (float)0);
h_B[2] = make_cuComplex((float)4, (float)0);
h_B[3] = make_cuComplex((float)5, (float)0);
h_B[4] = make_cuComplex((float)6, (float)0);
h_B[5] = make_cuComplex((float)7, (float)0);
// allocate device memory
cuComplex *d_A, *d_B, *d_C;
unsigned int size_C = m*n;
unsigned int mem_size_C = sizeof(cuComplex) * size_C;
// allocate host memory for the result
cuComplex *h_C = (cuComplex *) malloc(mem_size_C);
cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C);
error = cudaMalloc((void **) &d_A, mem_size_A);
error = cudaMalloc((void **) &d_B, mem_size_B);
// copy host memory to device
error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
error = cudaMalloc((void **) &d_C, mem_size_C);
// create and start timer
printf("Computing result using CUBLAS...");
// CUBLAS version 2.0
{
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
const cuComplex alpha = make_cuComplex(1.0f,0.0f);
const cuComplex beta = make_cuComplex(0.0f,0.0f);
//Perform operation with cublas
ret = cublasCsymm(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, n,m,&alpha,d_A,m,d_B,n,&beta,d_C,n);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCsymm returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
// copy result from device to host
error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);
checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
}
printf ("\nComputations completed.\n\n");
printf (" symm matrix A: \n");
// int s=0;
for (i=0; i<min(m,4); i++) {
for (j=0; j<min(m,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_A[j+i*k].x,h_A[j+i*k].y);
// printf ("%7.5G", h_A[s].x);
printf ("%7.5G", h_A[j+(i*m)].x);
// s++;
}
printf ("\n");
}
printf ("\n matrix B: \n");
for (i=0; i<min(k,4); i++) {
for (j=0; j<min(n,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_B[j+i*n].x,h_B[j+i*n].y);
printf ("%7.5G", h_B[j+(i*n)].x);
}
printf ("\n");
}
printf ("\n matrix C=A*B: \n");
for (i=0; i<min(m,4); i++) {
for (j=0; j<min(n,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_CUBLAS[j+i*n].x,h_CUBLAS[j+i*n].y);
printf ("%7.5G", h_CUBLAS[j+(i*n)].x);
}
printf ("\n");
}
// clean up memory
free(h_A);
free(h_B);
free(h_C);
//free(reference);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaDeviceReset();
return 0;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("[Matrix Multiply CUBLAS] - Starting...\n");
int devID = 0;
initializeCUDA(argc, argv, devID);
int matrix_result = matrixMultiply(argc, argv, devID);
cudaCheckErrors("some error");
return 0;
}
$ ./t213
[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "Tesla M2070" with compute capability 2.0
size_A = 9
Computing result using CUBLAS...
Computations completed.
symm matrix A:
1 2 4
0 3 5
0 0 6
matrix B:
2 3
4 5
6 7
matrix C=A*B:
34 41
46 56
64 79
$
ORIGINAL RESPONSE:
Several problems:
When I run your code as you have it posted right now, I don't get the
results that you show. Here's what I get:
[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "Tesla M2070" with compute capability 2.0
Computing result using CUBLAS...
Computations completed.
symm matrix A:
1
2 3
4 5 6
matrix B:
2 3
4 5
6 7
matrix C=A*B:
-131 -128
260 -122
-115 266
The code compiles with a number of warnings and also you're not doing proper error checking (for example you're not checking the return value from cublasCsymm
You are wanting to multiply C = A*B This means A is on the LEFT,
but you are passing CUBLAS_SIDE_RIGHT to cublasCsymm Several other cublasCsymm parameters were wrong as well. I think maybe you thought you could do A*B as (B(T)*A(T)) but that only works for square matrices. Not sure what you were thinking, exactly.
You having row-major storage on your matrices and passing them to cublas which interprets them in column-major order. For the following matrix:
1 2
3 4
row-major storage looks like this:
1 2 3 4
column-major storage looks like this:
1 3 2 4
You can transpose these matrices if you wish, using cublasCgeam or you can manually modify your storage.
You're making some sort of assumption about some kind of compressed
storage format for the symmetric matrix A which is not correct.
Read carefully the defintion of the storage
type.
It doesn't say the portion of the matrix that is "supplied" or
"present" it says the portion of the matrix that is filled.
Here is a complete code that has the above problems fixed:
// Matrix multiplication: C = A * B.
// Host code.
//
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA SDK sa
mples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions (in addition to helper_cuda.h)
void inline checkError(cublasStatus_t status, const char *msg)
{
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("%s", msg);
exit(EXIT_FAILURE);
}
}
// end of CUDA Helper Functions
// Allocates a matrix with random float entries.
void randomCmplxInit(cuComplex *data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND_MAX);
}
//void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
void initializeCUDA(int argc, char **argv, int &devID)
{
// By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
cudaError_t error;
devID = 0;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
if (error != cudaSuccess)
{
printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
}
// get number of SMs on this GPU
error = cudaGetDevice(&devID);
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID)
{
int i,j;
unsigned int m,n,k;
cudaDeviceProp deviceProp;
cudaError_t error;
error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
// use a larger block size for Fermi and above
m=3; //number of rows of matrix op(A) and C. A--> (m x k)
n=2; //number of columns of matrix op(B) and C. B--> (k x n)
k=m; //number of columns of op(A) and rows of op(B). C--> (m x n)
// I want to compute C = A*B in row-major format,
//so I must find C(T)=B(T)A(T) = C(T)A in column-major format
// allocate host memory for matrices A and B
unsigned int size_A = m*m; //size of a symmetric matrix
printf("size_A = %d\n", size_A);
unsigned int mem_size_A = sizeof(cuComplex) * size_A;
cuComplex *h_A = (cuComplex *)malloc(mem_size_A);
unsigned int size_B = m*n;
unsigned int mem_size_B = sizeof(cuComplex) * size_B;
cuComplex *h_B = (cuComplex *)malloc(mem_size_B);
// initialize host memory
// for (i = 0; i < size_A; ++i)
// h_A[i] = make_cuComplex( (float)(i+1),(float)0);
h_A[0] = make_cuComplex((float)1, (float)0);
h_A[1] = make_cuComplex((float)2, (float)0);
h_A[2] = make_cuComplex((float)4, (float)0);
h_A[3] = make_cuComplex((float)0, (float)0);
h_A[4] = make_cuComplex((float)3, (float)0);
h_A[5] = make_cuComplex((float)5, (float)0);
h_A[6] = make_cuComplex((float)0, (float)0);
h_A[7] = make_cuComplex((float)0, (float)0);
h_A[8] = make_cuComplex((float)6, (float)0);
// for (i = 0; i < size_B; ++i)
// h_B[i] = make_cuComplex((float)(i+2), (float)0);
h_B[0] = make_cuComplex((float)2, (float)0);
h_B[1] = make_cuComplex((float)4, (float)0);
h_B[2] = make_cuComplex((float)6, (float)0);
h_B[3] = make_cuComplex((float)3, (float)0);
h_B[4] = make_cuComplex((float)5, (float)0);
h_B[5] = make_cuComplex((float)7, (float)0);
// allocate device memory
cuComplex *d_A, *d_B, *d_C;
unsigned int size_C = m*n;
unsigned int mem_size_C = sizeof(cuComplex) * size_C;
// allocate host memory for the result
cuComplex *h_C = (cuComplex *) malloc(mem_size_C);
cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C);
error = cudaMalloc((void **) &d_A, mem_size_A);
error = cudaMalloc((void **) &d_B, mem_size_B);
// copy host memory to device
error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
error = cudaMalloc((void **) &d_C, mem_size_C);
// create and start timer
printf("Computing result using CUBLAS...");
// CUBLAS version 2.0
{
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
const cuComplex alpha = make_cuComplex(1.0f,0.0f);
const cuComplex beta = make_cuComplex(0.0f,0.0f);
//Perform operation with cublas
ret = cublasCsymm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, m,n,&alpha,d_A,m,d_B,m,&beta,d_C,m);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCsymm returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
Here is the output:
[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "Tesla M2070" with compute capability 2.0
size_A = 9
Computing result using CUBLAS...
Computations completed.
symm matrix A:
1 0 0
2 3 0
4 5 6
matrix B:
2 3
4 5
6 7
matrix C=A*B:
34 41
46 56
64 79

Related

CUDA matrixMulCUBLAS extra memcpy call

While playing with CUBLAS matrix multiplication sample I realised that nvprof profiler shows an extra call of cudaMemcpy Host to Device.
While 2 appear in source code, 3 actual calls are issued.
Why would that be? Is it an intrinsic effect of using CUBLAS?
Code from CUDA CUBLAS sample:
compiled with flags: -lcublas -I/usr/local/cuda-7.5/samples/common/inc
//////////////////////////////////////////////////////////////////////////
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA Samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// CUDA and CUBLAS functions
#include <helper_functions.h>
#include <helper_cuda.h>
#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif
typedef struct _matrixSize // Optional Command-line multiplier for matrix sizes
{
unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
} sMatrixSize;
////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set matrix multiply on CPU
//! C = A * B
//! #param C reference data, computed but preallocated
//! #param A matrix A as provided to device
//! #param B matrix B as provided to device
//! #param hA height of matrix A
//! #param wB width of matrix B
////////////////////////////////////////////////////////////////////////////////
void
matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
{
for (unsigned int i = 0; i < hA; ++i)
for (unsigned int j = 0; j < wB; ++j)
{
double sum = 0;
for (unsigned int k = 0; k < wA; ++k)
{
double a = A[i * wA + k];
double b = B[k * wB + j];
sum += a * b;
}
C[i * wB + j] = (float)sum;
}
}
// Allocates a matrix with random float entries.
void randomInit(float *data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = rand() / (float)RAND_MAX;
}
void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol)
{
printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
int i,j,k;
int error_count=0;
for (j = 0; j < height; j++)
{
if (error_count < iListLength)
{
printf("\n Row %d:\n", j);
}
for (i = 0; i < width; i++)
{
k = j * width + i;
float fDiff = fabs(data1[k] - data2[k]);
if (fDiff > fListTol)
{
if (error_count < iListLength)
{
printf(" Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff);
}
error_count++;
}
}
}
printf(" \n Total Errors = %d\n", error_count);
}
void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
{
// By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
cudaError_t error;
devID = 0;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
if (error != cudaSuccess)
{
printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
}
// get number of SMs on this GPU
error = cudaGetDevice(&devID);
if (error != cudaSuccess)
{
printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
if (checkCmdLineFlag(argc, (const char **)argv, "sizemult"))
{
iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
}
iSizeMultiple = min(iSizeMultiple, 10);
iSizeMultiple = max(iSizeMultiple, 1);
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;
matrix_size.uiWA = 3 * block_size * iSizeMultiple;
matrix_size.uiHA = 4 * block_size * iSizeMultiple;
matrix_size.uiWB = 2 * block_size * iSizeMultiple;
matrix_size.uiHB = 3 * block_size * iSizeMultiple;
matrix_size.uiWC = 2 * block_size * iSizeMultiple;
matrix_size.uiHC = 4 * block_size * iSizeMultiple;
printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n",
matrix_size.uiHA, matrix_size.uiWA,
matrix_size.uiHB, matrix_size.uiWB,
matrix_size.uiHC, matrix_size.uiWC);
if( matrix_size.uiWA != matrix_size.uiHB ||
matrix_size.uiHA != matrix_size.uiHC ||
matrix_size.uiWB != matrix_size.uiWC)
{
printf("ERROR: Matrix sizes do not match!\n");
exit(-1);
}
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
{
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;
// set seed for rand()
srand(2006);
// allocate host memory for matrices A and B
unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = (float *)malloc(mem_size_A);
unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = (float *)malloc(mem_size_B);
// set seed for rand()
srand(2006);
// initialize host memory
randomInit(h_A, size_A);
randomInit(h_B, size_B);
// allocate device memory
float *d_A, *d_B, *d_C;
unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
unsigned int mem_size_C = sizeof(float) * size_C;
// allocate host memory for the result
float *h_C = (float *) malloc(mem_size_C);
float *h_CUBLAS = (float *) malloc(mem_size_C);
checkCudaErrors(cudaMalloc((void **) &d_A, mem_size_A));
checkCudaErrors(cudaMalloc((void **) &d_B, mem_size_B));
checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMalloc((void **) &d_C, mem_size_C));
// setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);
// create and start timer
printf("Computing result using CUBLAS...");
// execute the kernel
int nIter = 30;
// CUBLAS version 2.0
{
const float alpha = 1.0f;
const float beta = 0.0f;
cublasHandle_t handle;
cudaEvent_t start, stop;
checkCudaErrors(cublasCreate(&handle));
//Perform warmup operation with cublas
checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));
// Allocate CUDA events that we'll use for timing
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
// Record the start event
checkCudaErrors(cudaEventRecord(start, NULL));
for (int j = 0; j < nIter; j++)
{
//note cublas is column primary!
//need to transpose the order
checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));
}
printf("done.\n");
// Record the stop event
checkCudaErrors(cudaEventRecord(stop, NULL));
// Wait for the stop event to complete
checkCudaErrors(cudaEventSynchronize(stop));
float msecTotal = 0.0f;
checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
// Compute and print the performance
float msecPerMatrixMul = msecTotal / nIter;
double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC * (double)matrix_size.uiWC * (double)matrix_size.uiHB;
double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
printf(
"Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
gigaFlops,
msecPerMatrixMul,
flopsPerMatrixMul);
// copy result from device to host
checkCudaErrors(cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost));
// Destroy the handle
checkCudaErrors(cublasDestroy(handle));
}
// compute reference solution
printf("Computing result using host CPU...");
float *reference = (float *)malloc(mem_size_C);
matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
printf("done.\n");
// check result (CUBLAS)
bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);
if (resCUBLAS != true)
{
printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f);
}
printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL");
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
// clean up memory
free(h_A);
free(h_B);
free(h_C);
free(reference);
checkCudaErrors(cudaFree(d_A));
checkCudaErrors(cudaFree(d_B));
checkCudaErrors(cudaFree(d_C));
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
cudaDeviceReset();
if (resCUBLAS == true)
{
return EXIT_SUCCESS; // return value = 1
}
else
{
return EXIT_FAILURE; // return value = 0
}
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("[Matrix Multiply CUBLAS] - Starting...\n");
int devID = 0, sizeMult = 5;
sMatrixSize matrix_size;
initializeCUDA(argc, argv, devID, sizeMult, matrix_size);
int matrix_result = matrixMultiply(argc, argv, devID, matrix_size);
return matrix_result;
}
The additional memory transfer seems to be caused by the CUBLAS library and is triggered by a call to cublasInit. You can confirm this by profiling the following code:
#include <cublas_v2.h>
int main()
{
cublasHandle_t handle;
cublasCreate(&handle);
cudaDeviceReset();
return 0;
}
which nvprof reports as calling cudaMemcpy:
$ nvprof ./a.out
==9536== NVPROF is profiling process 9536, command: ./a.out
==9536== Profiling application: ./a.out
==9536== Profiling result:
Time(%) Time Calls Avg Min Max Name
100.00% 1.1190us 1 1.1190us 1.1190us 1.1190us [CUDA memcpy HtoD]
==9536== API calls:
Time(%) Time Calls Avg Min Max Name
76.51% 348.53ms 1 348.53ms 348.53ms 348.53ms cudaFree
23.26% 105.97ms 1 105.97ms 105.97ms 105.97ms cudaDeviceReset
0.09% 420.25us 178 2.3600us 125ns 103.52us cuDeviceGetAttribute
0.08% 349.37us 2 174.69us 110.59us 238.78us cuDeviceTotalMem
0.04% 202.10us 3 67.366us 9.3750us 109.43us cudaMalloc
0.01% 55.217us 2 27.608us 24.529us 30.688us cuDeviceGetName
0.00% 14.365us 1 14.365us 14.365us 14.365us cudaMemcpy
0.00% 10.016us 16 626ns 434ns 2.0440us cudaEventCreateWithFlags
0.00% 4.5000us 11 409ns 271ns 1.2730us cudaDeviceGetAttribute
0.00% 3.4510us 4 862ns 251ns 2.3370us cuDeviceGetCount
0.00% 2.3200us 4 580ns 281ns 1.0350us cuDeviceGet
0.00% 1.3600us 1 1.3600us 1.3600us 1.3600us cudaGetDevice
0.00% 630ns 1 630ns 630ns 630ns cuInit
0.00% 339ns 1 339ns 339ns 339ns cuDriverGetVersion
I doubt that anyone without access to the current CUBLAS source will be able to explain why initialising the CUBLAS library triggers a host to device transfer, but that seems to be the cause of your observation.

cublassgemm for row-major matrix

I really tried to implement a function in C to multiply to row-major matrix in cublas. I don't know where I mistaking.
In the function below A, B and C are pointers to an row matrix correctly
allocated.
I'd like to keep the option of translate a matrix before perform the product.
The function below is not working.
void matrixMul(cublasHandle_t handle,float *A,float *B,float *C, int m,int n,int k,int transA,int transB){
cublasStatus_t stat ; // CUBLAS functions status
float alfa = 1;
float beta = 0;
int
ma = transA ? n:m,
na = transA ? m:n,
nb = transB ? k:n,
mb = transB ? n:k;
if(na!=mb){
puts("Something wrong");
}
//(mb,nb)(ma,na) = (mb,na)
stat= cublasSgemm_v2(handle, (cublasOperation_t) transB, (cublasOperation_t)transA,
nb,ma,mb,&alfa,
B,k,
A,n,&beta,
C,m);
switch (stat) {
case CUBLAS_STATUS_SUCCESS:
puts("Sucess");
break;
default:
printf(">>>>ERRO %d<<<<\n",stat);
break;
}
}
The entire source code
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA Samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// CUDA and CUBLAS functions
#include <helper_functions.h>
void getFromDevice(float *h_A,float *d_A,int size){
//printf("Copy input data from the host memory to the CUDA device\n");
cudaError_t err = cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
//A = (m,n)
//B = (n,k)
//C = (m,k)
void matrixMul(cublasHandle_t handle,float *A,float *B,float *C, int m,int n,int k,int transA,int transB){
cublasStatus_t stat ; // CUBLAS functions status
float alfa = 1;
float beta = 0;
int
ma = transA ? n:m,
na = transA ? m:n,
nb = transB ? k:n,
mb = transB ? n:k;
if(na!=mb){
puts("Something wrong");
}
//(mb,nb)(ma,na) = (mb,na)
stat= cublasSgemm_v2(handle, (cublasOperation_t) transB, (cublasOperation_t)transA,
nb,ma,mb,&alfa,
B,k,
A,n,&beta,
C,m);
switch (stat) {
case CUBLAS_STATUS_SUCCESS:
puts("Sucess");
break;
default:
printf(">>>>ERRO %d<<<<\n",stat);
break;
}
}
float *mallocfDevice(int size){
float *d_C = NULL;
cudaError_t err = cudaMalloc((void **)&d_C, size * sizeof(float));
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}else{
size_t freeM, total;
cudaMemGetInfo ( &freeM, &total);
printf("MEM:%.3f\n",freeM,total,100 - ((double)freeM/total)*100 );
}
return d_C;
}
void printHostMatrix(int nl, int nc, float *h_s){
for(int j = 0; j < nl ; j++) {
for(int i = 0; i < (nc) ; i++){
int idx = j*nc + i;
printf("%.2f ", h_s[idx]);
}
printf("\n");
}
}
void printfDeviceMatrix(float *d_s,int m, int p){
float *h_s =(float*) malloc(sizeof(float)*m*p);
getFromDevice(h_s,d_s,sizeof(float)*m*p);
printHostMatrix(m,p,h_s);
free(h_s);
}
void sendTofDevice(float *h_A,float *d_A,int size){
//printf("Copy input data from the host memory to the CUDA device\n");
cudaError_t err = cudaMemcpy(d_A, h_A, size*sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
int main(int argc,char **argv){
int ma = 2,
na = 3,
mb = 3,
nb = 2;
float A[] = { 1,2,3,
4,5,6};
float B[] = {7, 8,
9,10,
11,12};
float *C = new float[ma*nb];
float *d_a = mallocfDevice(ma*mb),
*d_b = mallocfDevice(mb*nb),
*d_c = mallocfDevice(ma*nb);
sendTofDevice(A,d_a,ma*na);
sendTofDevice(B,d_b,mb*nb);
cublasHandle_t handle ; // CUBLAS context
cublasCreate (&handle );
puts("A");
printfDeviceMatrix(d_a,ma,na);
puts("B");
printfDeviceMatrix(d_b,mb,nb);
matrixMul(handle, d_a,d_b,d_c,
ma,na,nb,0,0);
puts("AB=C");
printfDeviceMatrix(d_c,ma,nb);
}
CUBLAS assumes that the matrix in the device is stored in column major:
"
where α and β are scalars, and A , B and C are matrices stored in column-major format with dimensions op ( A ) m × k , op ( B ) k × n and C m × n , respectively. Also, for matrix A
Read more at: http://docs.nvidia.com/cuda/cublas/index.html#ixzz3mSDJTWrM "
That means the matrix needs to be treated as differently on the device than on the host.

Solving general sparse linear systems in CUDA

I am currently working on CUDA and trying to solve Ax = b using cuBLAS and cuSPARSE library. I looked through the sample codes including conjugateGradient & conjugateGradientPrecond provided by NVIDIA. However, the conjugate gradient method only works for positive definite matrix and it is an iterative method. Now, I have some general sparse matrices and I think I should take advantage of cuSPARSE library. Does anyone know how can I solve Ax = b using cuSPARSE and cuBLAS libraries? I could not find useful APIs for me. Generally, the matrices are expected to be at least 1000x1000 and in some cases it would go up to 100000x100000. Should I do this using a direct method?
One possibility to solve general sparse linear systems in CUDA is using cuSOLVER.
cuSOLVER has three useful routines:
cusolverSpDcsrlsvlu, which works for square linear systems (number of unknowns equal to the number of equations) and internally uses sparse LU factorization with partial pivoting;
cusolverSpDcsrlsvqr, which works for square linear systems (number of unknowns equal to the number of equations) and internally uses sparse QR factorization;
cusolverSpDcsrlsqvqr, which works for rectangular linear systems (number of unknowns different to the number of equations) and internally solves a least square problem.
For ALL the above routines, the supported matrix type is CUSPARSE_MATRIX_TYPE_GENERAL. If A is symmetric/Hermitian and only lower/upper part is used or meaningful, then its missing upper/lower part must be extended.
NOTES ON cusolverSpDcsrlsvlu
Attention should be paid to two input parameters: tol and reorder. Concerning the former, if the system matrix A is singular, then some diagonal elements of the matrix U of the LU decomposition are zero. The algorithm decides for zero if |U(j,j)|<tol. Concerning the latter, cuSOLVER provides a reordering to reduce
zero fill-in which dramactically affects the performance of LU factorization. reorder toggles between reordering (reorder=1) or not reordering (reorder=0).
Attention should be paid also to an output parameter: singularity. It is -1 if A is invertible, otherwise it provides the first index j such that U(j,j)=0.
NOTES ON cusolverSpDcsrlsvqr
Attention should be paid to the same input/output parameters are before. In particular, tol is used to decide for singularity, reorder has no effect and singularity is -1 if A is invertible, otherwise it returns the first index j such that R(j,j)=0.
NOTES ON cusolverSpDcsrlsqvqr
Attention should be paid to the input parameter tol, which is used to decide the rank of A.
Attention should be also paid to the output parameters rankA, which represents the numerical rank of A, p, a permutation vector of length equal to the number of columns of A (please, see the documentation for further details) and min_norm, which is the norm of the residual ||Ax - b||.
Currently, as of CUDA 10.0, the above three functions are for the host channel only, which means that they do not yet run on GPU. They must be called as:
cusolverSpDcsrlsvluHost;
cusolverSpDcsrlsvqrHost;
cusolverSpDcsrlsqvqrHost,
and the input argument should all reside on the host.
Below, please find a fully worked example using all the above three possibilities:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cusparse.h>
#include <cusolverSp.h>
/*******************/
/* iDivUp FUNCTION */
/*******************/
//extern "C" int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
__host__ __device__ int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
static const char *_cusolverGetErrorEnum(cusolverStatus_t error)
{
switch (error)
{
case CUSOLVER_STATUS_SUCCESS:
return "CUSOLVER_SUCCESS";
case CUSOLVER_STATUS_NOT_INITIALIZED:
return "CUSOLVER_STATUS_NOT_INITIALIZED";
case CUSOLVER_STATUS_ALLOC_FAILED:
return "CUSOLVER_STATUS_ALLOC_FAILED";
case CUSOLVER_STATUS_INVALID_VALUE:
return "CUSOLVER_STATUS_INVALID_VALUE";
case CUSOLVER_STATUS_ARCH_MISMATCH:
return "CUSOLVER_STATUS_ARCH_MISMATCH";
case CUSOLVER_STATUS_EXECUTION_FAILED:
return "CUSOLVER_STATUS_EXECUTION_FAILED";
case CUSOLVER_STATUS_INTERNAL_ERROR:
return "CUSOLVER_STATUS_INTERNAL_ERROR";
case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
}
return "<unknown>";
}
inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
if (CUSOLVER_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSOLVE error in file '%s', line %d, error: %s \nterminating!\n", __FILE__, __LINE__, \
_cusolverGetErrorEnum(err)); \
assert(0); \
}
}
extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
if (CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %Ndims\Nobjs %s\nerror %Ndims: %s\nterminating!\Nobjs", __FILE__, __LINE__, err, \
_cusparseGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
const int Nrows = 4; // --- Number of rows
const int Ncols = 4; // --- Number of columns
const int N = Nrows;
// --- Host side dense matrix
double *h_A_dense = (double*)malloc(Nrows*Ncols*sizeof(*h_A_dense));
// --- Column-major ordering
h_A_dense[0] = 1.0f; h_A_dense[4] = 4.0f; h_A_dense[8] = 0.0f; h_A_dense[12] = 0.0f;
h_A_dense[1] = 0.0f; h_A_dense[5] = 2.0f; h_A_dense[9] = 3.0f; h_A_dense[13] = 0.0f;
h_A_dense[2] = 5.0f; h_A_dense[6] = 0.0f; h_A_dense[10] = 0.0f; h_A_dense[14] = 7.0f;
h_A_dense[3] = 0.0f; h_A_dense[7] = 0.0f; h_A_dense[11] = 9.0f; h_A_dense[15] = 0.0f;
//create device array and copy host to it
double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(*d_A_dense)));
gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));
// --- Descriptor for sparse matrix A
cusparseMatDescr_t descrA; cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
int nnz = 0; // --- Number of nonzero elements in dense matrix
const int lda = Nrows; // --- Leading dimension of dense matrix
// --- Device side number of nonzero elements per row
int *d_nnzPerVector; gpuErrchk(cudaMalloc(&d_nnzPerVector, Nrows * sizeof(*d_nnzPerVector)));
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, &nnz));
// --- Host side number of nonzero elements per row
int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(*h_nnzPerVector));
gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(*h_nnzPerVector), cudaMemcpyDeviceToHost));
printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
printf("\n");
// --- Device side dense matrix
double *d_A; gpuErrchk(cudaMalloc(&d_A, nnz * sizeof(*d_A)));
int *d_A_RowIndices; gpuErrchk(cudaMalloc(&d_A_RowIndices, (Nrows + 1) * sizeof(*d_A_RowIndices)));
int *d_A_ColIndices; gpuErrchk(cudaMalloc(&d_A_ColIndices, nnz * sizeof(*d_A_ColIndices)));
cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, d_A, d_A_RowIndices, d_A_ColIndices));
// --- Host side dense matrix
double *h_A = (double *)malloc(nnz * sizeof(*h_A));
int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(*h_A_RowIndices));
int *h_A_ColIndices = (int *)malloc(nnz * sizeof(*h_A_ColIndices));
gpuErrchk(cudaMemcpy(h_A, d_A, nnz*sizeof(*h_A), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));
for (int i = 0; i < nnz; ++i) printf("A[%i] = %.0f ", i, h_A[i]); printf("\n");
for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");
for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);
// --- Allocating and defining dense host and device data vectors
double *h_y = (double *)malloc(Nrows * sizeof(double));
h_y[0] = 100.0; h_y[1] = 200.0; h_y[2] = 400.0; h_y[3] = 500.0;
double *d_y; gpuErrchk(cudaMalloc(&d_y, Nrows * sizeof(double)));
gpuErrchk(cudaMemcpy(d_y, h_y, Nrows * sizeof(double), cudaMemcpyHostToDevice));
// --- Allocating the host and device side result vector
double *h_x = (double *)malloc(Ncols * sizeof(double));
double *d_x; gpuErrchk(cudaMalloc(&d_x, Ncols * sizeof(double)));
// --- CUDA solver initialization
cusolverSpHandle_t solver_handle;
cusolverSpCreate(&solver_handle);
// --- Using LU factorization
int singularity;
cusolveSafeCall(cusolverSpDcsrlsvluHost(solver_handle, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, 0.000001, 0, h_x, &singularity));
// --- Using QR factorization
//cusolveSafeCall(cusolverSpDcsrlsvqrHost(solver_handle, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, 0.000001, 0, h_x, &singularity));
//int rankA;
//int *p = (int *)malloc(N * sizeof(int));
//double min_norm;
//cusolveSafeCall(cusolverSpDcsrlsqvqrHost(solver_handle, N, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, 0.000001, &rankA, h_x, p, &min_norm));
printf("Showing the results...\n");
for (int i = 0; i < N; i++) printf("%f\n", h_x[i]);
}

How can I check the progress of matrix multiplication?

I'm now only need to show an intermediate progress of matrix multiplication.
for(unsigned int col=0; col<mtxSize; col++) {
unsigned tmp = 0;
for(unsigned int row=0; row<mtxSize; row++) {
for(unsigned int idx=0; idx<mtxSize; idx++) {
tmp += h_A[col*mtxSize+idx] * h_B[idx*mtxSize+row];
}
h_Rs[col*mtxSize+row] = tmp;
tmp = 0;
int rate_tmp = (col*mtxSize + (row+1))*100;
// Maybe like this...
fprintf(stdout, "Progress : %d.%d %%\r", rate_tmp/actMtxSize, rate_tmp%actMtxSize);
fflush(stdout);
}
}
In the case of the host code(use CPU), it is very easy beacause it process sequentially so we can check very easily.
But in the case of the GPU which process in parallel, what should I do?
Once the kernel is running, it does not return until finish the kernel execution.
So I can't check mid-data during the kernel execution time.
I think I need to use asynchronous kernel call, but I do not know well.
And even if the asynchronous kernel call is used, to see all of the data into several blocks over processors, do I have to write atomicAdd() (in other words, global memory access) function which is including some overhead?
Give me some advice or hint.
And I want to know in the case of CUDA.
Here is a code which demonstrates how to check progress from a matrix multiply kernel:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define TIME_INC 100000000
#define INCS 10
#define USE_PROGRESS 1
#define MAT_DIMX 4000
#define MAT_DIMY MAT_DIMX
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void mykernel(volatile int *data){
unsigned long time;
for (int i = 0; i < INCS; i++){
atomicAdd((int *)data,1);
__threadfence_system();
time = clock64();
while((clock64() - time)<TIME_INC) {};
}
printf("progress check finished\n");
}
__global__ void matmult(float *a, float *b, float *c, unsigned int rowA, unsigned int colA, unsigned int colB, volatile int *progress){
unsigned int row = threadIdx.x+blockDim.x*blockIdx.x;
unsigned int col = threadIdx.y+blockDim.y*blockIdx.y;
if ((row < rowA) && (col < colB)){
float temp = 0.0f;
for (unsigned int k = 0; k < colA; k++)
temp += a[(row*colA)+k] * b[(k*colB) + col];
c[(row*colB)+col] = temp;
#if USE_PROGRESS
if (!(threadIdx.x || threadIdx.y)){
atomicAdd((int *)progress, 1);
__threadfence_system();
}
#endif
}
}
int main(){
// simple test to demonstrate reading progress data from kernel
volatile int *d_data, *h_data;
cudaSetDeviceFlags(cudaDeviceMapHost);
cudaCheckErrors("cudaSetDeviceFlags error");
cudaHostAlloc((void **)&h_data, sizeof(int), cudaHostAllocMapped);
cudaCheckErrors("cudaHostAlloc error");
cudaHostGetDevicePointer((int **)&d_data, (int *)h_data, 0);
cudaCheckErrors("cudaHostGetDevicePointer error");
*h_data = 0;
printf("kernel starting\n");
mykernel<<<1,1>>>(d_data);
cudaCheckErrors("kernel fail");
int value = 0;
do{
int value1 = *h_data;
if (value1 > value){
printf("h_data = %d\n", value1);
value = value1;}}
while (value < (INCS-1));
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail 2");
// now try matrix multiply with progress
float *h_c, *d_a, *d_b, *d_c;
h_c = (float *)malloc(MAT_DIMX*MAT_DIMY*sizeof(float));
if (h_c == NULL) {printf("malloc fail\n"); return 1;}
cudaMalloc((void **)&d_a, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc a fail");
cudaMalloc((void **)&d_b, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc b fail");
cudaMalloc((void **)&d_c, MAT_DIMX*MAT_DIMY*sizeof(float));
cudaCheckErrors("cudaMalloc c fail");
for (int i = 0; i < MAT_DIMX*MAT_DIMY; i++) h_c[i] = rand()/(float)RAND_MAX;
cudaMemcpy(d_a, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy a fail");
cudaMemcpy(d_b, h_c, MAT_DIMX*MAT_DIMY*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy b fail");
cudaEvent_t start, stop;
cudaEventCreate(&start); cudaEventCreate(&stop);
*h_data=0;
dim3 block(16,16);
dim3 grid(((MAT_DIMX+block.x-1)/block.x), ((MAT_DIMY+block.y-1)/block.y));
printf("matrix multiply kernel starting\n");
cudaEventRecord(start);
matmult<<<grid,block>>>(d_a, d_b, d_c, MAT_DIMY, MAT_DIMX, MAT_DIMX, d_data);
cudaEventRecord(stop);
#if USE_PROGRESS
unsigned int num_blocks = grid.x*grid.y;
float my_progress = 0.0f;
value = 0;
printf("Progress:\n");
do{
cudaEventQuery(stop); // may help WDDM scenario
int value1 = *h_data;
float kern_progress = (float)value1/(float)num_blocks;
if ((kern_progress - my_progress)> 0.1f) {
printf("percent complete = %2.1f\n", (kern_progress*100));
my_progress = kern_progress;}}
while (my_progress < 0.9f);
printf("\n");
#endif
cudaEventSynchronize(stop);
cudaCheckErrors("event sync fail");
float et;
cudaEventElapsedTime(&et, start, stop);
cudaCheckErrors("event elapsed time fail");
cudaDeviceSynchronize();
cudaCheckErrors("mat mult kernel fail");
printf("matrix multiply finished. elapsed time = %f milliseconds\n", et);
return 0;
}
The code associated with the first kernel call is just to demonstrate the basic idea of having a kernel report it's progress back.
The second part of the code shows a sample, naive matrix multiply on the GPU, with the GPU reporting it's progress back. I have included the ability to remove the progress check code via a preprocessor macro, as well as the ability to time the matrix multiply kernel. For the case I have here, there was no discernible difference in timing with or without the progress code. So while the progress reporting code probably does add some overhead, when compared to the scope of a reasonable sized matrix multiply kernel, it adds no significant time that I can see.
Some other uses of signalling are discussed here

Changing from for loop to multithreading in kernel

I'm currently working on interpolation of a grid and having some problems regarding multithreading. The code is suppose to read a map represented by a 2x2 matrix, and then interpolate it to increase the number of points by a factor of 100. When using for loops in the kernel, it works great.
Before interpolation: http://bildr.no/view/OWV1UDRO
After interpolation: http://bildr.no/view/eTlmNmpo
When I tried to change the for loops with threads, it produced some weird result. In stead of numbers, it filled the resulting matrix with -1.#QNAN
Here's my working code with for loops in the kernel
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <fstream>
#include "cuda.h"
using namespace std;
float Z[41][41];
// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call) \
do { \
cudaError_t err = call; \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR() \
do { \
/* Check synchronous errors, i.e. pre-launch */ \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
/* Check asynchronous errors, i.e. kernel failed (ULF) */ \
err = cudaThreadSynchronize(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
texture<float, 2, cudaReadModeElementType> tex;
__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare)
{
int k = sqrt(numberOfInterpolationsPerSquare);
for (float i=0; i<n*k; i++)
{
for (float j=0; j<m*k; j++)
{
f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f);
}
}
}
int main (void)
{
// Start timer
clock_t tStart = clock();
// Size of map
int n=41;
int m=41;
int g = 0;
float numberOfInterpolationsPerSquare = 100;
float numberOfElements = pow(sqrt(numberOfInterpolationsPerSquare)*n,2);
size_t pitch, tex_ofs;
float *f;
float *r;
float *map_d = 0;
// Build read-Streams
ifstream map;
//Create and open a txt file for MATLAB
ofstream file;
// Open data
map.open("Map.txt", ios_base::in);
file.open("Bilinear.txt");
// Store the map in a 2D array
for (int i=0; i<n; i++)
{
for (int j=0; j<m; j++)
{
map >> Z[i][j];
}
}
// Allocate memory on host and device
CUDA_SAFE_CALL(cudaMallocPitch((void**)&map_d,&pitch,n*sizeof(*map_d),m));
CUDA_SAFE_CALL(cudaMalloc((void**)&f, numberOfElements*sizeof(float)));
r = (float*)malloc(numberOfElements*sizeof(float));
// Copy map from host to device
CUDA_SAFE_CALL(cudaMemcpy2D(map_d, pitch, Z, n*sizeof(Z[0][0]), n*sizeof(Z[0][0]),m,cudaMemcpyHostToDevice));
// Set texture mode to bilinear interpolation
tex.normalized = false;
tex.filterMode = cudaFilterModeLinear;
// Bind the map to texture
CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, map_d, &tex.channelDesc, n, m, pitch));
// Checking for offset
if (tex_ofs !=0) {
printf ("tex_ofs = %zu\n", tex_ofs);
return EXIT_FAILURE;
}
// Launch Kernel
kernel <<< 1,1 >>> (m, n, f, numberOfInterpolationsPerSquare);
CHECK_LAUNCH_ERROR();
CUDA_SAFE_CALL (cudaDeviceSynchronize());
// Copy result from device to host
cudaMemcpy(r, f, numberOfElements*sizeof(float), cudaMemcpyDeviceToHost);
// Write results to file
for(int h=0;h<numberOfElements;h++)
{
if(g==sqrt(numberOfElements))
{
file << endl;
g=0;
}
file << r[h] << " ";
g++;
}
// Free memory
CUDA_SAFE_CALL (cudaUnbindTexture (tex));
CUDA_SAFE_CALL (cudaFree (map_d));
CUDA_SAFE_CALL (cudaFree (f));
free( r );
// Print out execution time
printf("Time taken: %.3fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
return EXIT_SUCCESS;
}
Here's the kernel with multithreading, which doesn't work
__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare)
{
int k = sqrt(numberOfInterpolationsPerSquare);
int i= blockIdx.x * blockDim.x + threadIdx.x;
int j= blockIdx.y * blockDim.y + threadIdx.y;
if(i>=n*k || j>=m*k)
return;
f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f);
}
Does anyone know why the multithread version doesn't work?
Regards
Sondre
In the second kernel, i and j are int instead of float. So j/k and i/k in tex2D will result in integer division. Declare k as float to avoid integer division.
Initially, the kernel was launched with the following configuration:
//Find number of blocks
int nthreads = 1024;
int blocksize = 512;
int nblocks = ceil( (n*m*numberOfInterpolationsPerSquare) / nthreads);
// Launch Kernel
kernel <<< nblocks,blocksize >>> (m, n, f, numberOfInterpolationsPerSquare);
The problem with the above code is that it would launch a 1D grid of 1D blocks, but inside the kernel, 2D indexing is used. A 2D grid/block configuration is required for the kernel to work correctly. From the looks of the kernel code, following grid/block configuration should work:
float k = sqrt(numberOfInterpolationsPerSquare);
const int threads_x = (int)ceil(n * k);
const int threads_y = (int)ceil(m * k);
const dim3 dimBlock(16,16);
dim3 dimGrid;
dimGrid.x = (threads_x + dimBlock.x - 1)/dimBlock.x;
dimGrid.y = (threads_y + dimBlock.y - 1)/dimBlock.y;
kernel<<<dimGrid,dimBlock>>>(m, n, f, numberOfInterpolationsPerSquare);