CUBLAS: Incorrect inversion for matrix with zero pivot

Since CUDA 5.5, the CUBLAS library contains routines for batched matrix factorization and inversion (cublas<t>getrfBatched and cublas<t>getriBatched respectively).
Taking guidance from the documentation, I wrote test code for inversion of an N x N matrix using these routines. The code gives correct output only if the matrix has all non-zero pivots; setting any pivot to zero produces incorrect results. I have verified the results using MATLAB.
I realize that I am providing row-major matrices as input while CUBLAS expects column-major matrices, but that shouldn't matter, as it would only transpose the result. To be sure, I also tested with column-major input, but I get the same behavior.
I am confused because cublas<t>getriBatched expects the pivot exchange information array P as input, which is the output of cublas<t>getrfBatched. So, if any zero pivots are eliminated by row exchanges, the inversion routine should handle them automatically.
How can I invert matrices that contain a zero pivot using CUBLAS?
Following is a self-contained, compilable example with different test cases:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define cudacall(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) \
{ \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
\
} \
while(0)
void invert_device(float* src_d, float* dst_d, int n)
{
cublasHandle_t handle;
cublascall(cublasCreate_v2(&handle));
int batchSize = 1;
int *P, *INFO;
cudacall(cudaMalloc<int>(&P,n * batchSize * sizeof(int)));
cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int)));
int lda = n;
float *A[] = { src_d };
float** A_d;
cudacall(cudaMalloc<float*>(&A_d,sizeof(A)));
cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));
cublascall(cublasSgetrfBatched(handle,n,A_d,lda,P,INFO,batchSize));
int INFOh = 0;
cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
if(INFOh == n)
{
fprintf(stderr, "Factorization Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
float* C[] = { dst_d };
float** C_d;
cudacall(cudaMalloc<float*>(&C_d,sizeof(C)));
cudacall(cudaMemcpy(C_d,C,sizeof(C),cudaMemcpyHostToDevice));
cublascall(cublasSgetriBatched(handle,n,A_d,lda,P,C_d,lda,INFO,batchSize));
cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
if(INFOh != 0)
{
fprintf(stderr, "Inversion Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
cudaFree(P), cudaFree(INFO), cublasDestroy_v2(handle);
}
void invert(float* src, float* dst, int n)
{
float* src_d, *dst_d;
cudacall(cudaMalloc<float>(&src_d,n * n * sizeof(float)));
cudacall(cudaMemcpy(src_d,src,n * n * sizeof(float),cudaMemcpyHostToDevice));
cudacall(cudaMalloc<float>(&dst_d,n * n * sizeof(float)));
invert_device(src_d,dst_d,n);
cudacall(cudaMemcpy(dst,dst_d,n * n * sizeof(float),cudaMemcpyDeviceToHost));
cudaFree(src_d), cudaFree(dst_d);
}
void test_invert()
{
const int n = 3;
//Random matrix with full pivots
float full_pivots[n*n] = { 0.5, 3, 4,
1, 3, 10,
4 , 9, 16 };
//Almost same as above matrix with first pivot zero
float zero_pivot[n*n] = { 0, 3, 4,
1, 3, 10,
4 , 9, 16 };
float zero_pivot_col_major[n*n] = { 0, 1, 4,
3, 3, 9,
4 , 10, 16 };
float another_zero_pivot[n*n] = { 0, 3, 4,
1, 5, 6,
9, 8, 2 };
float another_full_pivot[n * n] = { 22, 3, 4,
1, 5, 6,
9, 8, 2 };
float singular[n*n] = {1,2,3,
4,5,6,
7,8,9};
//Select matrix by setting "a"
float* a = zero_pivot;
fprintf(stdout, "Input:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
fprintf(stdout,"%f\t",a[i*n+j]);
fprintf(stdout,"\n");
}
fprintf(stdout,"\n\n");
invert(a,a,n);
fprintf(stdout, "Inverse:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
fprintf(stdout,"%f\t",a[i*n+j]);
fprintf(stdout,"\n");
}
}
int main()
{
test_invert();
int n; scanf("%d",&n);
return 0;
}

There seems to be a bug in the current CUBLAS library implementation of cublas<t>getrfBatched for matrices of dimension (n) such that 3<=n<=16, when there is a "zero pivot" as you say.
A possible workaround is to "identity-extend" your A matrix to be inverted, when n<17, to a size of 17x17 (using MATLAB nomenclature):
LU = getrf( [A 0 ; 0 I]);
Continuing, you can then use cublas<t>getriBatched in an "ordinary" fashion:
invA = getri( LU(1:3,1:3) )
(You can also leave everything at n=17, call getri that way, and then extract the result as the first 3x3 rows and columns of invA.)
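The worked example below hard-codes the padded 17x17 matrix; for a general n < 17 you could also build the identity-extension on the host before copying it to the device. A minimal sketch (the helper name pad_identity_17 and the row-major layout are my own choices, not part of CUBLAS):

// Hypothetical helper: copy an n x n (row-major) matrix A into the top-left
// corner of a 17 x 17 buffer, put ones on the rest of the diagonal and zeros
// elsewhere. The padded buffer is what gets factored and inverted.
void pad_identity_17(const float *A, float *Apad, int n)
{
    for (int i = 0; i < 17; i++)
        for (int j = 0; j < 17; j++)
            Apad[i*17 + j] = (i < n && j < n) ? A[i*n + j]
                             : ((i == j) ? 1.0f : 0.0f);
}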
Here is a fully worked example, borrowing from the code you supplied, showing the inversion of your supplied 3x3 zero_pivot matrix, using the zero_pivot_war matrix as an "identity-extended" workaround:
$ cat t340.cu
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define cudacall(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) \
{ \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
\
} \
while(0)
void invert_device(float* src_d, float* dst_d, int n)
{
cublasHandle_t handle;
cublascall(cublasCreate_v2(&handle));
int batchSize = 1;
int *P, *INFO;
cudacall(cudaMalloc<int>(&P,17 * batchSize * sizeof(int)));
cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int)));
int lda = 17;
float *A[] = { src_d };
float** A_d;
cudacall(cudaMalloc<float*>(&A_d,sizeof(A)));
cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));
cublascall(cublasSgetrfBatched(handle,17,A_d,lda,P,INFO,batchSize));
int INFOh = 0;
cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
if(INFOh == 17)
{
fprintf(stderr, "Factorization Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
float* C[] = { dst_d };
float** C_d;
cudacall(cudaMalloc<float*>(&C_d,sizeof(C)));
cudacall(cudaMemcpy(C_d,C,sizeof(C),cudaMemcpyHostToDevice));
cublascall(cublasSgetriBatched(handle,n,A_d,lda,P,C_d,n,INFO,batchSize));
cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
if(INFOh != 0)
{
fprintf(stderr, "Inversion Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
cudaFree(P), cudaFree(INFO), cublasDestroy_v2(handle);
}
void invert(float* src, float* dst, int n)
{
float* src_d, *dst_d;
cudacall(cudaMalloc<float>(&src_d,17 * 17 * sizeof(float)));
cudacall(cudaMemcpy(src_d,src,17 * 17 * sizeof(float),cudaMemcpyHostToDevice));
cudacall(cudaMalloc<float>(&dst_d,n * n * sizeof(float)));
invert_device(src_d,dst_d,n);
cudacall(cudaMemcpy(dst,dst_d,n * n * sizeof(float),cudaMemcpyDeviceToHost));
cudaFree(src_d), cudaFree(dst_d);
}
void test_invert()
{
const int n = 3;
//Random matrix with full pivots
/* float full_pivots[n*n] = { 0.5, 3, 4,
1, 3, 10,
4 , 9, 16 };
//Almost same as above matrix with first pivot zero
float zero_pivot[n*n] = { 0, 3, 4,
1, 3, 10,
4 , 9, 16 };
float zero_pivot_col_major[n*n] = { 0, 1, 4,
3, 3, 9,
4 , 10, 16 };
*/
float zero_pivot_war[17*17] = {
0,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
4,9,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 };
/*
float another_zero_pivot[n*n] = { 0, 3, 4,
1, 5, 6,
9, 8, 2 };
float another_full_pivot[n * n] = { 22, 3, 4,
1, 5, 6,
9, 8, 2 };
float singular[n*n] = {1,2,3,
4,5,6,
7,8,9};
*/
float result[n*n];
//Select matrix by setting "a"
float* a = zero_pivot_war;
fprintf(stdout, "Input:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
fprintf(stdout,"%f\t",a[i*17+j]);
fprintf(stdout,"\n");
}
fprintf(stdout,"\n\n");
invert(a,result,n);
fprintf(stdout, "Inverse:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
fprintf(stdout,"%f\t",result[i*n+j]);
fprintf(stdout,"\n");
}
}
int main()
{
test_invert();
// int n; scanf("%d",&n);
return 0;
}
$ nvcc -arch=sm_20 -o t340 t340.cu -lcublas
$ cuda-memcheck ./t340
========= CUDA-MEMCHECK
Input:
0.000000 3.000000 4.000000
1.000000 3.000000 10.000000
4.000000 9.000000 16.000000
Inverse:
-0.700000 -0.200000 0.300000
0.400000 -0.266667 0.066667
-0.050000 0.200000 -0.050000
========= ERROR SUMMARY: 0 errors
$
The above result appears to me to be correct based on a simple test elsewhere.
I don't have any further technical details about the nature of the possible bug in CUBLAS. From what I can tell, it is present in both CUDA 5.5 and CUDA 6.0 RC. Detailed bug discussions for NVIDIA-supplied assets (e.g. CUBLAS library) should be taken up on the NVIDIA developer forums or directly at the bug filing portal on developer.nvidia.com (you must be a registered developer to file a bug).

Related

How can I profile a kernel over time with CUPTI?

I am going to profile some of the benchmark kernels in the CUDA sample SDK (for example matrixMul and dxtc and ...) over time with the help of the CUPTI profiler. But the CUPTI profiler returns a value of zero. Is it because the kernels are small? When I use a larger kernel, it returns some non-zero values for, say, IPC. I set the time interval between two samples to 70 milliseconds.
The question is: can a small or medium kernel be profiled over time with CUPTI? If yes, how?
CUPTI includes a number of sample codes. One that appears fairly similar to your request ("for, say, IPC") is callback_metric:
This sample shows how to use both the callback and metric APIs to record the metric's events during the execution of a simple kernel, and then use those events to calculate the metric value.
On a typical linux install, this sample code would be located at: /usr/local/cuda/extras/CUPTI/samples/callback_metric/
and the sample includes just a single source file and a Makefile.
As it happens, this sample code actually computes the IPC metric (by default, if no command line arguments are specified), and does it on a very short/simple kernel:
__global__ void VecAdd(const int* A, const int* B, int* C, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N)
C[i] = A[i] + B[i];
}
And here is the output, on a V100:
$ LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/extras/CUPTI/lib64 ./callback_metric
Usage: ./callback_metric [device_num] [metric_name]
CUDA Device Number: 0
CUDA Device Name: Tesla V100-PCIE-32GB
Compute Capability of Device: 7.0
Launching kernel: blocks 196, thread/block 256
Pass 0
Launching kernel: blocks 196, thread/block 256
inst_executed = 25043 (384, 256, 384, 256, 384, 256, 384, 256, 384, 256, 384, 256, 384, 256, 384, 256, 384, 256, 339, 256, 384, 256, 384, 256, 384, 256, 384, 256, 384, 256, 384, 256, 256, 256, 384, 256, 384, 256, 384, 256, 384, 256, 384, 256, 384, 256, 256, 256, 384, 256, 384, 256, 384, 256, 384, 256, 384, 256, 384, 256, 384, 256, 384, 256, 384, 256, 256, 256, 384, 256, 384, 256, 384, 256, 384, 256, 384, 256, 256, 256)
inst_executed (normalized) (25043 * 80) / 80 = 25043
active_cycles = 126682 (1670, 1502, 1665, 1543, 1615, 1457, 1654, 1573, 1632, 1533, 1661, 1540, 1583, 1482, 1667, 1522, 1652, 1523, 1607, 1477, 1681, 1576, 1636, 1564, 1657, 1553, 1621, 1506, 1690, 1548, 1636, 1544, 1564, 1455, 1691, 1644, 1594, 1501, 1700, 1573, 1647, 1455, 1677, 1553, 1638, 1497, 1516, 1429, 1694, 1637, 1670, 1487, 1688, 1555, 1692, 1503, 1669, 1551, 1614, 1523, 1699, 1599, 1647, 1505, 1692, 1556, 1599, 1498, 1641, 1535, 1616, 1475, 1659, 1591, 1614, 1419, 1631, 1513, 1559, 1447)
active_cycles (normalized) (126682 * 80) / 80 = 126682
Metric ipc = 0.197684
$
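As a quick sanity check, the printed metric is consistent with simply dividing the normalized instruction count by the normalized active cycle count:

ipc = inst_executed / active_cycles = 25043 / 126682 ≈ 0.197684

which matches the "Metric ipc = 0.197684" line above.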
For reference, here is the full sample code:
/*
* Copyright 2011-2017 NVIDIA Corporation. All rights reserved
*
* Sample app to demonstrate use of CUPTI library to obtain metric values
* using callbacks for CUDA runtime APIs
*
*/
#include <stdio.h>
#include <cuda.h>
#include <cupti.h>
#define METRIC_NAME "ipc"
#define DRIVER_API_CALL(apiFuncCall) \
do { \
CUresult _status = apiFuncCall; \
if (_status != CUDA_SUCCESS) { \
fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \
__FILE__, __LINE__, #apiFuncCall, _status); \
exit(-1); \
} \
} while (0)
#define RUNTIME_API_CALL(apiFuncCall) \
do { \
cudaError_t _status = apiFuncCall; \
if (_status != cudaSuccess) { \
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
__FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\
exit(-1); \
} \
} while (0)
#define CUPTI_CALL(call) \
do { \
CUptiResult _status = call; \
if (_status != CUPTI_SUCCESS) { \
const char *errstr; \
cuptiGetResultString(_status, &errstr); \
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
__FILE__, __LINE__, #call, errstr); \
exit(-1); \
} \
} while (0)
#define ALIGN_SIZE (8)
#define ALIGN_BUFFER(buffer, align) \
(((uintptr_t) (buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t) (buffer) & ((align)-1))) : (buffer))
// User data for event collection callback
typedef struct MetricData_st {
// the device where metric is being collected
CUdevice device;
// the set of event groups to collect for a pass
CUpti_EventGroupSet *eventGroups;
// the current number of events collected in eventIdArray and
// eventValueArray
uint32_t eventIdx;
// the number of entries in eventIdArray and eventValueArray
uint32_t numEvents;
// array of event ids
CUpti_EventID *eventIdArray;
// array of event values
uint64_t *eventValueArray;
} MetricData_t;
static uint64_t kernelDuration;
// Device code
__global__ void VecAdd(const int* A, const int* B, int* C, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N)
C[i] = A[i] + B[i];
}
static void
initVec(int *vec, int n)
{
for (int i=0; i< n; i++)
vec[i] = i;
}
void CUPTIAPI
getMetricValueCallback(void *userdata, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid, const CUpti_CallbackData *cbInfo)
{
MetricData_t *metricData = (MetricData_t*)userdata;
unsigned int i, j, k;
// This callback is enabled only for launch so we shouldn't see
// anything else.
if ((cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) &&
(cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000))
{
printf("%s:%d: unexpected cbid %d\n", __FILE__, __LINE__, cbid);
exit(-1);
}
// on entry, enable all the event groups being collected this pass,
// for metrics we collect for all instances of the event
if (cbInfo->callbackSite == CUPTI_API_ENTER) {
cudaDeviceSynchronize();
CUPTI_CALL(cuptiSetEventCollectionMode(cbInfo->context,
CUPTI_EVENT_COLLECTION_MODE_KERNEL));
for (i = 0; i < metricData->eventGroups->numEventGroups; i++) {
uint32_t all = 1;
CUPTI_CALL(cuptiEventGroupSetAttribute(metricData->eventGroups->eventGroups[i],
CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES,
sizeof(all), &all));
CUPTI_CALL(cuptiEventGroupEnable(metricData->eventGroups->eventGroups[i]));
}
}
// on exit, read and record event values
if (cbInfo->callbackSite == CUPTI_API_EXIT) {
cudaDeviceSynchronize();
// for each group, read the event values from the group and record
// in metricData
for (i = 0; i < metricData->eventGroups->numEventGroups; i++) {
CUpti_EventGroup group = metricData->eventGroups->eventGroups[i];
CUpti_EventDomainID groupDomain;
uint32_t numEvents, numInstances, numTotalInstances;
CUpti_EventID *eventIds;
size_t groupDomainSize = sizeof(groupDomain);
size_t numEventsSize = sizeof(numEvents);
size_t numInstancesSize = sizeof(numInstances);
size_t numTotalInstancesSize = sizeof(numTotalInstances);
uint64_t *values, normalized, *sum;
size_t valuesSize, eventIdsSize;
size_t numCountersRead = 0;
CUPTI_CALL(cuptiEventGroupGetAttribute(group,
CUPTI_EVENT_GROUP_ATTR_EVENT_DOMAIN_ID,
&groupDomainSize, &groupDomain));
CUPTI_CALL(cuptiDeviceGetEventDomainAttribute(metricData->device, groupDomain,
CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT,
&numTotalInstancesSize, &numTotalInstances));
CUPTI_CALL(cuptiEventGroupGetAttribute(group,
CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT,
&numInstancesSize, &numInstances));
CUPTI_CALL(cuptiEventGroupGetAttribute(group,
CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS,
&numEventsSize, &numEvents));
eventIdsSize = numEvents * sizeof(CUpti_EventID);
eventIds = (CUpti_EventID *)malloc(eventIdsSize);
CUPTI_CALL(cuptiEventGroupGetAttribute(group,
CUPTI_EVENT_GROUP_ATTR_EVENTS,
&eventIdsSize, eventIds));
valuesSize = sizeof(uint64_t) * numInstances * numEvents;
values = (uint64_t *)malloc(valuesSize);
CUPTI_CALL(cuptiEventGroupReadAllEvents(group,
CUPTI_EVENT_READ_FLAG_NONE,
&valuesSize,
values,
&eventIdsSize,
eventIds,
&numCountersRead));
if (metricData->eventIdx >= metricData->numEvents) {
fprintf(stderr, "error: too many events collected, metric expects only %d\n",
(int)metricData->numEvents);
exit(-1);
}
sum = (uint64_t *)calloc(sizeof(uint64_t), numEvents);
// sum collect event values from all instances
for (k = 0; k < numInstances; k++) {
for (j = 0; j < numEvents; j++) {
sum[j] += values[(k * numEvents) + j];
}
}
for (j = 0; j < numEvents; j++) {
// normalize the event value to represent the total number of
// domain instances on the device
normalized = (sum[j] * numTotalInstances) / numInstances;
metricData->eventIdArray[metricData->eventIdx] = eventIds[j];
metricData->eventValueArray[metricData->eventIdx] = normalized;
metricData->eventIdx++;
// print collected value
{
char eventName[128];
size_t eventNameSize = sizeof(eventName) - 1;
CUPTI_CALL(cuptiEventGetAttribute(eventIds[j], CUPTI_EVENT_ATTR_NAME,
&eventNameSize, eventName));
eventName[127] = '\0';
printf("\t%s = %llu (", eventName, (unsigned long long)sum[j]);
if (numInstances > 1) {
for (k = 0; k < numInstances; k++) {
if (k != 0)
printf(", ");
printf("%llu", (unsigned long long)values[(k * numEvents) + j]);
}
}
printf(")\n");
printf("\t%s (normalized) (%llu * %u) / %u = %llu\n",
eventName, (unsigned long long)sum[j],
numTotalInstances, numInstances,
(unsigned long long)normalized);
}
}
free(values);
free(sum);
}
for (i = 0; i < metricData->eventGroups->numEventGroups; i++)
CUPTI_CALL(cuptiEventGroupDisable(metricData->eventGroups->eventGroups[i]));
}
}
static void
cleanUp(int *h_A, int *h_B, int *h_C, int *d_A, int *d_B, int *d_C)
{
if (d_A)
cudaFree(d_A);
if (d_B)
cudaFree(d_B);
if (d_C)
cudaFree(d_C);
// Free host memory
if (h_A)
free(h_A);
if (h_B)
free(h_B);
if (h_C)
free(h_C);
}
static void
runPass()
{
int N = 50000;
size_t size = N * sizeof(int);
int threadsPerBlock = 0;
int blocksPerGrid = 0;
int *h_A, *h_B, *h_C;
int *d_A, *d_B, *d_C;
int i, sum;
// Allocate input vectors h_A and h_B in host memory
h_A = (int*)malloc(size);
h_B = (int*)malloc(size);
h_C = (int*)malloc(size);
// Initialize input vectors
initVec(h_A, N);
initVec(h_B, N);
memset(h_C, 0, size);
// Allocate vectors in device memory
cudaMalloc((void**)&d_A, size);
cudaMalloc((void**)&d_B, size);
cudaMalloc((void**)&d_C, size);
// Copy vectors from host memory to device memory
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
// Invoke kernel
threadsPerBlock = 256;
blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
printf("Launching kernel: blocks %d, thread/block %d\n",
blocksPerGrid, threadsPerBlock);
VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
// Copy result from device memory to host memory
// h_C contains the result in host memory
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
// Verify result
for (i = 0; i < N; ++i) {
sum = h_A[i] + h_B[i];
if (h_C[i] != sum) {
fprintf(stderr, "error: result verification failed\n");
exit(-1);
}
}
cleanUp(h_A, h_B, h_C, d_A, d_B, d_C);
}
static void CUPTIAPI
bufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords)
{
uint8_t *rawBuffer;
*size = 16 * 1024;
rawBuffer = (uint8_t *)malloc(*size + ALIGN_SIZE);
*buffer = ALIGN_BUFFER(rawBuffer, ALIGN_SIZE);
*maxNumRecords = 0;
if (*buffer == NULL) {
printf("Error: out of memory\n");
exit(-1);
}
}
static void CUPTIAPI
bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize)
{
CUpti_Activity *record = NULL;
CUpti_ActivityKernel6 *kernel;
//since we launched only 1 kernel, we should have only 1 kernel record
CUPTI_CALL(cuptiActivityGetNextRecord(buffer, validSize, &record));
kernel = (CUpti_ActivityKernel6 *)record;
if (kernel->kind != CUPTI_ACTIVITY_KIND_KERNEL) {
fprintf(stderr, "Error: expected kernel activity record, got %d\n", (int)kernel->kind);
exit(-1);
}
kernelDuration = kernel->end - kernel->start;
free(buffer);
}
int
main(int argc, char *argv[])
{
CUpti_SubscriberHandle subscriber;
CUcontext context = 0;
CUdevice device = 0;
int deviceNum;
int deviceCount;
char deviceName[32];
const char *metricName;
CUpti_MetricID metricId;
CUpti_EventGroupSets *passData;
MetricData_t metricData;
unsigned int pass;
CUpti_MetricValue metricValue;
printf("Usage: %s [device_num] [metric_name]\n", argv[0]);
// make sure activity is enabled before any CUDA API
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
DRIVER_API_CALL(cuInit(0));
DRIVER_API_CALL(cuDeviceGetCount(&deviceCount));
if (deviceCount == 0) {
printf("There is no device supporting CUDA.\n");
return -2;
}
if (argc > 1)
deviceNum = atoi(argv[1]);
else
deviceNum = 0;
printf("CUDA Device Number: %d\n", deviceNum);
DRIVER_API_CALL(cuDeviceGet(&device, deviceNum));
DRIVER_API_CALL(cuDeviceGetName(deviceName, 32, device));
printf("CUDA Device Name: %s\n", deviceName);
int major, minor;
DRIVER_API_CALL(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
DRIVER_API_CALL(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
int deviceComputeCapability = 10 * major + minor;
printf("Compute Capability of Device: %d.%d\n", major,minor);
if(deviceComputeCapability > 72) {
printf("Sample unsupported on Device with compute capability > 7.2\n");
return -2;
}
DRIVER_API_CALL(cuCtxCreate(&context, 0, device));
// Get the name of the metric to collect
if (argc > 2)
metricName = argv[2];
else {
metricName = METRIC_NAME;
}
// need to collect duration of kernel execution without any event
// collection enabled (some metrics need kernel duration as part of
// calculation). The only accurate way to do this is by using the
// activity API.
{
CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted));
runPass();
cudaDeviceSynchronize();
CUPTI_CALL(cuptiActivityFlushAll(0));
}
// setup launch callback for event collection
CUPTI_CALL(cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)getMetricValueCallback, &metricData));
CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020));
CUPTI_CALL(cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000));
// allocate space to hold all the events needed for the metric
CUPTI_CALL(cuptiMetricGetIdFromName(device, metricName, &metricId));
CUPTI_CALL(cuptiMetricGetNumEvents(metricId, &metricData.numEvents));
metricData.device = device;
metricData.eventIdArray = (CUpti_EventID *)malloc(metricData.numEvents * sizeof(CUpti_EventID));
metricData.eventValueArray = (uint64_t *)malloc(metricData.numEvents * sizeof(uint64_t));
metricData.eventIdx = 0;
// get the number of passes required to collect all the events
// needed for the metric and the event groups for each pass
CUPTI_CALL(cuptiMetricCreateEventGroupSets(context, sizeof(metricId), &metricId, &passData));
for (pass = 0; pass < passData->numSets; pass++) {
printf("Pass %u\n", pass);
metricData.eventGroups = passData->sets + pass;
runPass();
}
if (metricData.eventIdx != metricData.numEvents) {
fprintf(stderr, "error: expected %u metric events, got %u\n",
metricData.numEvents, metricData.eventIdx);
exit(-1);
}
// use all the collected events to calculate the metric value
CUPTI_CALL(cuptiMetricGetValue(device, metricId,
metricData.numEvents * sizeof(CUpti_EventID),
metricData.eventIdArray,
metricData.numEvents * sizeof(uint64_t),
metricData.eventValueArray,
kernelDuration, &metricValue));
// print metric value, we format based on the value kind
{
CUpti_MetricValueKind valueKind;
size_t valueKindSize = sizeof(valueKind);
CUPTI_CALL(cuptiMetricGetAttribute(metricId, CUPTI_METRIC_ATTR_VALUE_KIND,
&valueKindSize, &valueKind));
switch (valueKind) {
case CUPTI_METRIC_VALUE_KIND_DOUBLE:
printf("Metric %s = %f\n", metricName, metricValue.metricValueDouble);
break;
case CUPTI_METRIC_VALUE_KIND_UINT64:
printf("Metric %s = %llu\n", metricName,
(unsigned long long)metricValue.metricValueUint64);
break;
case CUPTI_METRIC_VALUE_KIND_INT64:
printf("Metric %s = %lld\n", metricName,
(long long)metricValue.metricValueInt64);
break;
case CUPTI_METRIC_VALUE_KIND_PERCENT:
printf("Metric %s = %f%%\n", metricName, metricValue.metricValuePercent);
break;
case CUPTI_METRIC_VALUE_KIND_THROUGHPUT:
printf("Metric %s = %llu bytes/sec\n", metricName,
(unsigned long long)metricValue.metricValueThroughput);
break;
case CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL:
printf("Metric %s = utilization level %u\n", metricName,
(unsigned int)metricValue.metricValueUtilizationLevel);
break;
default:
fprintf(stderr, "error: unknown value kind\n");
exit(-1);
}
}
CUPTI_CALL(cuptiUnsubscribe(subscriber));
return 0;
}

Is there a function in CUBLAS that can apply the sigmoid function to a vector?

As the title says, I want to apply a function element-wise to a vector. I wonder, is there any function in the CUBLAS library to do that?
I am not aware of a suitable CUBLAS function that can assist in the task. However, you can easily write your own code that applies the sigmoid function, or any other single-argument function for that matter, element-wise to a vector. Note that such code would be memory-bound rather than compute-bound in most circumstances. See the CUDA program below for a worked example, in particular sigmoid_kernel(). The output of the program should look something like this:
source[0]= 0.0000000000000000e+000 source[99999]= 9.9999000000000005e-001
result[0]= 5.0000000000000000e-001 result[99999]= 7.3105661250612963e-001
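(These two values are just the sigmoid of the two endpoints of the test data: 1/(1+e^0) = 0.5 for source[0] = 0, and 1/(1+e^-0.99999) ≈ 0.73105661 for source[99999] = 0.99999.)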
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#define DEFAULT_LEN 100000
// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call) \
do { \
cudaError_t err = call; \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR() \
do { \
/* Check synchronous errors, i.e. pre-launch */ \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
/* Check asynchronous errors, i.e. kernel failed (ULF) */ \
err = cudaThreadSynchronize(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
__device__ __forceinline__ double sigmoid (double a)
{
return 1.0 / (1.0 + exp (-a));
}
__global__ void sigmoid_kernel (const double * __restrict__ src,
double * __restrict__ dst, int len)
{
int stride = gridDim.x * blockDim.x;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = tid; i < len; i += stride) {
dst[i] = sigmoid (src[i]);
}
}
int main (void)
{
double *source, *result;
double *d_a = 0, *d_b = 0;
int len = DEFAULT_LEN;
/* Allocate memory on host */
source = (double *)malloc (len * sizeof (source[0]));
if (!source) return EXIT_FAILURE;
result = (double *)malloc (len * sizeof (result[0]));
if (!result) return EXIT_FAILURE;
/* create source data */
for (int i = 0; i < len; i++) source [i] = i * 1e-5;
/* spot check of source data */
printf ("source[0]=% 23.16e source[%d]=% 23.16e\n",
source[0], len-1, source[len-1]);
/* Allocate memory on device */
CUDA_SAFE_CALL (cudaMalloc((void**)&d_a, sizeof(d_a[0]) * len));
CUDA_SAFE_CALL (cudaMalloc((void**)&d_b, sizeof(d_b[0]) * len));
/* Push source data to device */
CUDA_SAFE_CALL (cudaMemcpy (d_a, source, sizeof(d_a[0]) * len,
cudaMemcpyHostToDevice));
/* Compute execution configuration */
dim3 dimBlock(256);
int threadBlocks = (len + (dimBlock.x - 1)) / dimBlock.x;
if (threadBlocks > 65520) threadBlocks = 65520;
dim3 dimGrid(threadBlocks);
sigmoid_kernel<<<dimGrid,dimBlock>>>(d_a, d_b, len);
CHECK_LAUNCH_ERROR();
/* retrieve results from device */
CUDA_SAFE_CALL (cudaMemcpy (result, d_b, sizeof (result[0]) * len,
cudaMemcpyDeviceToHost));
/* spot check of results */
printf ("result[0]=% 23.16e result[%d]=% 23.16e\n",
result[0], len-1, result[len-1]);
/* free memory on host and device */
CUDA_SAFE_CALL (cudaFree(d_a));
CUDA_SAFE_CALL (cudaFree(d_b));
free (result);
free (source);
return EXIT_SUCCESS;
}

How to configure cublas{t}symm() function arguments

This function performs symmetric matrix-matrix multiplication using CUDA. Although I succeeded in using the nonsymmetric version, "cublas{t}gemm()", I couldn't use the "cublas{t}symm()" function properly.
I know that the CUBLAS library uses column-major matrix storage. I am using row-major C/C++ matrices, and I know how to handle this for "cublas{t}gemm()" by swapping the input matrices, etc. However, I couldn't solve it for the symmetric case. The problem is that even if I use column-major matrix storage, I get unexpected results. The matrices contain complex floats (cuComplex). I assume I have row-major matrices. Here is the code and the output:
// Matrix multiplication: C = A * B.
// Host code.
//
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA SDK samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions (in addition to helper_cuda.h)
void inline checkError(cublasStatus_t status, const char *msg)
{
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("%s", msg);
exit(EXIT_FAILURE);
}
}
// end of CUDA Helper Functions
// Allocates a matrix with random float entries.
void randomCmplxInit(cuComplex *data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND_MAX);
}
//void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
void initializeCUDA(int argc, char **argv, int &devID)
{
// By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
cudaError_t error;
devID = 0;
int m,n,k;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
if (error != cudaSuccess)
{
printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
}
// get number of SMs on this GPU
error = cudaGetDevice(&devID);
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID)
{
int i,j;
unsigned int m,n,k;
cudaDeviceProp deviceProp;
cudaError_t error;
error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;
m=3; //number of rows of matrix op(A) and C. A--> (m x k)
n=2; //number of columns of matrix op(B) and C. B--> (k x n)
k=m; //number of columns of op(A) and rows of op(B). C--> (m x n)
// I want to compute C = A*B in row-major format,
//so I must find C(T)=B(T)A(T) = C(T)A in column-major format
// allocate host memory for matrices A and B
unsigned int size_A = m*(m+1)/2; //size of a symmetric matrix
unsigned int mem_size_A = sizeof(cuComplex) * size_A;
cuComplex *h_A = (cuComplex *)malloc(mem_size_A);
unsigned int size_B = m*n;
unsigned int mem_size_B = sizeof(cuComplex) * size_B;
cuComplex *h_B = (cuComplex *)malloc(mem_size_B);
// initialize host memory
for (i = 0; i < size_A; ++i)
h_A[i] = make_cuComplex( (float)(i+1),(float)0);
for (i = 0; i < size_B; ++i)
h_B[i] = make_cuComplex((float)(i+2), (float)0);
// allocate device memory
cuComplex *d_A, *d_B, *d_C;
unsigned int size_C = m*n;
unsigned int mem_size_C = sizeof(cuComplex) * size_C;
// allocate host memory for the result
cuComplex *h_C = (cuComplex *) malloc(mem_size_C);
cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C);
error = cudaMalloc((void **) &d_A, mem_size_A);
error = cudaMalloc((void **) &d_B, mem_size_B);
// copy host memory to device
error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
error = cudaMalloc((void **) &d_C, mem_size_C);
// setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(n / threads.x, m / threads.y);
// create and start timer
printf("Computing result using CUBLAS...");
// CUBLAS version 2.0
{
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
const cuComplex alpha = make_cuComplex(1.0f,0.0f);
const cuComplex beta = make_cuComplex(0.0f,0.0f);
//Perform operation with cublas
ret = cublasCsymm(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, n,m,&alpha,d_A,m,d_B,m,&beta,d_C,m);
// copy result from device to host
error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);
checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
}
printf ("\nComputations completed.\n\n");
printf (" symm matrix A: \n");
int s=0;
for (i=0; i<min(m,4); i++) {
for (j=0; j<=i; j++) {
//printf ("%7.5G + j(%7.5G)", h_A[j+i*k].x,h_A[j+i*k].y);
printf ("%7.5G", h_A[s].x);
s++;
}
printf ("\n");
}
printf ("\n matrix B: \n");
for (i=0; i<min(k,4); i++) {
for (j=0; j<min(n,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_B[j+i*n].x,h_B[j+i*n].y);
printf ("%7.5G", h_B[j+i*n].x);
}
printf ("\n");
}
printf ("\n matrix C=A*B: \n");
for (i=0; i<min(m,4); i++) {
for (j=0; j<min(n,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_CUBLAS[j+i*n].x,h_CUBLAS[j+i*n].y);
printf ("%7.5G", h_CUBLAS[j+i*n].x);
}
printf ("\n");
}
// clean up memory
free(h_A);
free(h_B);
free(h_C);
//free(reference);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaDeviceReset();
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("[Matrix Multiply CUBLAS] - Starting...\n");
int devID = 0, sizeMult = 5;
initializeCUDA(argc, argv, devID);
int matrix_result = matrixMultiply(argc, argv, devID);
}
I suppose that I have the following matrices for the multiplication:
A =
1 2 4
2 3 5
4 5 6
B =
2 3
4 5
6 7
and expect to obtain
A*B =
34 41
46 56
64 79
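(For example, the (1,1) entry is the dot product of the first row of A with the first column of B: 1*2 + 2*4 + 4*6 = 34, and the (3,2) entry is 4*3 + 5*5 + 6*7 = 79.)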
But the obtained OUTPUT is as follows:
symm matrix A:
1
2 3
4 5 6
matrix B:
2 3
4 5
6 7
matrix C=A*B:
78 90
74 97
114 146
What am I missing in this code? Probably the arguments of the "cublasCsymm" function are wrong.
Thanks,
Kagan
EDIT:
Based on questions posed below, I elected to re-work my answer and example code.
You can handle row-major storage without transposing, at least for these operations. This observation is further facilitated by the fact that the symm function does not use packed storage.
So to answer the additional questions:
The cublasCsymm function does not use a packed storage format (unlike some other functions such as cublasCspmv, for example), because the cublasCsymm function is intended to duplicate the functionality of the corresponding netlib function, which also does not use a packed storage format. Based on my review of the CUBLAS API, I don't see a symmetric-packed-storage matrix-matrix multiply function available.
You can use row-major storage (e.g. C-style) with CUBLAS, without transposing, at least for these operations (matrix-matrix multiply, without packed storage) by following the advice given here.
What follows is a re-worked version of my previous example, that incorporates the information in item 2 above.
// Matrix multiplication: C = A * B.
// Host code.
//
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA SDK samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions (in addition to helper_cuda.h)
void inline checkError(cublasStatus_t status, const char *msg)
{
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("%s", msg);
exit(EXIT_FAILURE);
}
}
// end of CUDA Helper Functions
// Allocates a matrix with random float entries.
void randomCmplxInit(cuComplex *data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND_MAX);
}
//void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
void initializeCUDA(int argc, char **argv, int &devID)
{
// By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
cudaError_t error;
devID = 0;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
if (error != cudaSuccess)
{
printf("cudaSetDevice returned error code %d, line(%d)\n", error, __
LINE__);
exit(EXIT_FAILURE);
}
}
// get number of SMs on this GPU
error = cudaGetDevice(&devID);
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, dev
iceProp.name, deviceProp.major, deviceProp.minor);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID)
{
int i,j;
unsigned int m,n,k;
cudaDeviceProp deviceProp;
cudaError_t error;
error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
// use a larger block size for Fermi and above
m=3; //number of rows of matrix op(A) and C. A--> (m x k)
n=2; //number of columns of matrix op(B) and C. B--> (k x n)
k=m; //number of columns of op(A) and rows of op(B). C--> (m x n)
// I want to compute C = A*B in row-major format,
//so I must find C(T)=B(T)A(T) = C(T)A in column-major format
// allocate host memory for matrices A and B
unsigned int size_A = m*m; //size of a symmetric matrix
printf("size_A = %d\n", size_A);
unsigned int mem_size_A = sizeof(cuComplex) * size_A;
cuComplex *h_A = (cuComplex *)malloc(mem_size_A);
unsigned int size_B = m*n;
unsigned int mem_size_B = sizeof(cuComplex) * size_B;
cuComplex *h_B = (cuComplex *)malloc(mem_size_B);
// initialize host memory
// for (i = 0; i < size_A; ++i)
// h_A[i] = make_cuComplex( (float)(i+1),(float)0);
h_A[0] = make_cuComplex((float)1, (float)0);
h_A[1] = make_cuComplex((float)2, (float)0);
h_A[2] = make_cuComplex((float)4, (float)0);
h_A[3] = make_cuComplex((float)0, (float)0);
h_A[4] = make_cuComplex((float)3, (float)0);
h_A[5] = make_cuComplex((float)5, (float)0);
h_A[6] = make_cuComplex((float)0, (float)0);
h_A[7] = make_cuComplex((float)0, (float)0);
h_A[8] = make_cuComplex((float)6, (float)0);
// for (i = 0; i < size_B; ++i)
// h_B[i] = make_cuComplex((float)(i+2), (float)0);
h_B[0] = make_cuComplex((float)2, (float)0);
h_B[1] = make_cuComplex((float)3, (float)0);
h_B[2] = make_cuComplex((float)4, (float)0);
h_B[3] = make_cuComplex((float)5, (float)0);
h_B[4] = make_cuComplex((float)6, (float)0);
h_B[5] = make_cuComplex((float)7, (float)0);
// allocate device memory
cuComplex *d_A, *d_B, *d_C;
unsigned int size_C = m*n;
unsigned int mem_size_C = sizeof(cuComplex) * size_C;
// allocate host memory for the result
cuComplex *h_C = (cuComplex *) malloc(mem_size_C);
cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C);
error = cudaMalloc((void **) &d_A, mem_size_A);
error = cudaMalloc((void **) &d_B, mem_size_B);
// copy host memory to device
error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
error = cudaMalloc((void **) &d_C, mem_size_C);
// create and start timer
printf("Computing result using CUBLAS...");
// CUBLAS version 2.0
{
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
const cuComplex alpha = make_cuComplex(1.0f,0.0f);
const cuComplex beta = make_cuComplex(0.0f,0.0f);
//Perform operation with cublas
ret = cublasCsymm(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, n,m,&alpha,d_A,m,d_B,n,&beta,d_C,n);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCsymm returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
// copy result from device to host
error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);
checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
}
printf ("\nComputations completed.\n\n");
printf (" symm matrix A: \n");
// int s=0;
for (i=0; i<min(m,4); i++) {
for (j=0; j<min(m,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_A[j+i*k].x,h_A[j+i*k].y);
// printf ("%7.5G", h_A[s].x);
printf ("%7.5G", h_A[j+(i*m)].x);
// s++;
}
printf ("\n");
}
printf ("\n matrix B: \n");
for (i=0; i<min(k,4); i++) {
for (j=0; j<min(n,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_B[j+i*n].x,h_B[j+i*n].y);
printf ("%7.5G", h_B[j+(i*n)].x);
}
printf ("\n");
}
printf ("\n matrix C=A*B: \n");
for (i=0; i<min(m,4); i++) {
for (j=0; j<min(n,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_CUBLAS[j+i*n].x,h_CUBLAS[j+i*n].y);
printf ("%7.5G", h_CUBLAS[j+(i*n)].x);
}
printf ("\n");
}
// clean up memory
free(h_A);
free(h_B);
free(h_C);
//free(reference);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaDeviceReset();
return 0;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("[Matrix Multiply CUBLAS] - Starting...\n");
int devID = 0;
initializeCUDA(argc, argv, devID);
int matrix_result = matrixMultiply(argc, argv, devID);
cudaCheckErrors("some error");
return 0;
}
$ ./t213
[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "Tesla M2070" with compute capability 2.0
size_A = 9
Computing result using CUBLAS...
Computations completed.
symm matrix A:
1 2 4
0 3 5
0 0 6
matrix B:
2 3
4 5
6 7
matrix C=A*B:
34 41
46 56
64 79
$
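For what it's worth, the reason the re-worked call gives the row-major product directly is the usual transpose identity. In column-major terms, cublasCsymm with CUBLAS_SIDE_RIGHT computes C' = alpha*B'*A + beta*C', where B' and C' are the n x m column-major views of the row-major B and C buffers, i.e. B' = B(T) and C' = C(T); likewise, CUBLAS_FILL_MODE_LOWER reads the lower triangle of the column-major view, which is exactly where the row-major upper-triangle data of A lands. Since A is symmetric (A = A(T)):

C(T) = B(T)*A = B(T)*A(T) = (A*B)(T)

so the bytes written to d_C are the row-major C = A*B, with no explicit transposes needed.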
ORIGINAL RESPONSE:
Several problems:
When I run your code as you have it posted right now, I don't get the results that you show. Here's what I get:
[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "Tesla M2070" with compute capability 2.0
Computing result using CUBLAS...
Computations completed.
symm matrix A:
1
2 3
4 5 6
matrix B:
2 3
4 5
6 7
matrix C=A*B:
-131 -128
260 -122
-115 266
The code compiles with a number of warnings, and you're also not doing proper error checking (for example, you're not checking the return value from cublasCsymm).
You want to multiply C = A*B. This means A is on the LEFT, but you are passing CUBLAS_SIDE_RIGHT to cublasCsymm. Several other cublasCsymm parameters were wrong as well. I think maybe you thought you could do A*B as (B(T)*A(T)), but that only works for square matrices. Not sure what you were thinking, exactly.
You have row-major storage on your matrices and are passing them to CUBLAS, which interprets them in column-major order. For the following matrix:
1 2
3 4
row-major storage looks like this:
1 2 3 4
column-major storage looks like this:
1 3 2 4
You can transpose these matrices if you wish, using cublasCgeam or you can manually modify your storage.
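If you do go the cublasCgeam route, a minimal sketch of an out-of-place transpose might look like this (the helper name transpose_cgeam is mine; it assumes d_in is an m x n column-major matrix with leading dimension m, and d_out an n x m buffer with leading dimension n):

// Out-of-place transpose d_out = d_in(T), using C = alpha*op(A) + beta*op(B).
// With beta = 0 the B operand contributes nothing; d_out is passed there only
// to supply a valid pointer (B == C with transb == N and ldb == ldc is an
// allowed in-place mode of cublasCgeam).
void transpose_cgeam(cublasHandle_t handle, const cuComplex *d_in,
                     cuComplex *d_out, int m, int n)
{
    const cuComplex one  = make_cuComplex(1.0f, 0.0f);
    const cuComplex zero = make_cuComplex(0.0f, 0.0f);
    cublasCgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                n, m,               // rows and cols of the result d_out
                &one,  d_in,  m,    // A and its leading dimension
                &zero, d_out, n,    // B and ldb (ignored since beta == 0)
                d_out, n);          // C and ldc
}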
You're making some sort of assumption about some kind of compressed storage format for the symmetric matrix A, which is not correct. Read carefully the definition of the storage type. It doesn't say the portion of the matrix that is "supplied" or "present"; it says the portion of the matrix that is filled.
Here is a complete code that has the above problems fixed:
// Matrix multiplication: C = A * B.
// Host code.
//
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA SDK samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions (in addition to helper_cuda.h)
void inline checkError(cublasStatus_t status, const char *msg)
{
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("%s", msg);
exit(EXIT_FAILURE);
}
}
// end of CUDA Helper Functions
// Allocates a matrix with random float entries.
void randomCmplxInit(cuComplex *data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND_MAX);
}
//void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
void initializeCUDA(int argc, char **argv, int &devID)
{
// By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
cudaError_t error;
devID = 0;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
if (error != cudaSuccess)
{
printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
}
// get number of SMs on this GPU
error = cudaGetDevice(&devID);
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID)
{
int i,j;
unsigned int m,n,k;
cudaDeviceProp deviceProp;
cudaError_t error;
error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
// use a larger block size for Fermi and above
m=3; //number of rows of matrix op(A) and C. A--> (m x k)
n=2; //number of columns of matrix op(B) and C. B--> (k x n)
k=m; //number of columns of op(A) and rows of op(B). C--> (m x n)
// I want to compute C = A*B in row-major format,
// so I must find C(T) = B(T)*A(T) = B(T)*A in column-major format
// allocate host memory for matrices A and B
unsigned int size_A = m*m; //size of a symmetric matrix
printf("size_A = %d\n", size_A);
unsigned int mem_size_A = sizeof(cuComplex) * size_A;
cuComplex *h_A = (cuComplex *)malloc(mem_size_A);
unsigned int size_B = m*n;
unsigned int mem_size_B = sizeof(cuComplex) * size_B;
cuComplex *h_B = (cuComplex *)malloc(mem_size_B);
// initialize host memory
// for (i = 0; i < size_A; ++i)
// h_A[i] = make_cuComplex( (float)(i+1),(float)0);
h_A[0] = make_cuComplex((float)1, (float)0);
h_A[1] = make_cuComplex((float)2, (float)0);
h_A[2] = make_cuComplex((float)4, (float)0);
h_A[3] = make_cuComplex((float)0, (float)0);
h_A[4] = make_cuComplex((float)3, (float)0);
h_A[5] = make_cuComplex((float)5, (float)0);
h_A[6] = make_cuComplex((float)0, (float)0);
h_A[7] = make_cuComplex((float)0, (float)0);
h_A[8] = make_cuComplex((float)6, (float)0);
// for (i = 0; i < size_B; ++i)
// h_B[i] = make_cuComplex((float)(i+2), (float)0);
h_B[0] = make_cuComplex((float)2, (float)0);
h_B[1] = make_cuComplex((float)4, (float)0);
h_B[2] = make_cuComplex((float)6, (float)0);
h_B[3] = make_cuComplex((float)3, (float)0);
h_B[4] = make_cuComplex((float)5, (float)0);
h_B[5] = make_cuComplex((float)7, (float)0);
// allocate device memory
cuComplex *d_A, *d_B, *d_C;
unsigned int size_C = m*n;
unsigned int mem_size_C = sizeof(cuComplex) * size_C;
// allocate host memory for the result
cuComplex *h_C = (cuComplex *) malloc(mem_size_C);
cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C);
error = cudaMalloc((void **) &d_A, mem_size_A);
error = cudaMalloc((void **) &d_B, mem_size_B);
// copy host memory to device
error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
error = cudaMalloc((void **) &d_C, mem_size_C);
// create and start timer
printf("Computing result using CUBLAS...");
// CUBLAS version 2.0
{
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
const cuComplex alpha = make_cuComplex(1.0f,0.0f);
const cuComplex beta = make_cuComplex(0.0f,0.0f);
//Perform operation with cublas
ret = cublasCsymm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, m,n,&alpha,d_A,m,d_B,m,&beta,d_C,m);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCsymm returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
Here is the output:
[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "Tesla M2070" with compute capability 2.0
size_A = 9
Computing result using CUBLAS...
Computations completed.
symm matrix A:
1 0 0
2 3 0
4 5 6
matrix B:
2 3
4 5
6 7
matrix C=A*B:
34 41
46 56
64 79

Changing from for loop to multithreading in kernel

I'm currently working on interpolation of a grid and having some problems regarding multithreading. The code is supposed to read a map represented by a 2D matrix, and then interpolate it to increase the number of points by a factor of 100. When using for loops in the kernel, it works great.
Before interpolation: http://bildr.no/view/OWV1UDRO
After interpolation: http://bildr.no/view/eTlmNmpo
When I tried to replace the for loops with threads, it produced some weird results. Instead of numbers, it filled the resulting matrix with -1.#QNAN.
Here's my working code with for loops in the kernel:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <fstream>
#include "cuda.h"
using namespace std;
float Z[41][41];
// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call) \
do { \
cudaError_t err = call; \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR() \
do { \
/* Check synchronous errors, i.e. pre-launch */ \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
/* Check asynchronous errors, i.e. kernel failed (ULF) */ \
err = cudaThreadSynchronize(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
texture<float, 2, cudaReadModeElementType> tex;
__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare)
{
int k = sqrt(numberOfInterpolationsPerSquare);
for (float i=0; i<n*k; i++)
{
for (float j=0; j<m*k; j++)
{
f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f);
}
}
}
int main (void)
{
// Start timer
clock_t tStart = clock();
// Size of map
int n=41;
int m=41;
int g = 0;
float numberOfInterpolationsPerSquare = 100;
float numberOfElements = pow(sqrt(numberOfInterpolationsPerSquare)*n,2);
size_t pitch, tex_ofs;
float *f;
float *r;
float *map_d = 0;
// Build read-Streams
ifstream map;
//Create and open a txt file for MATLAB
ofstream file;
// Open data
map.open("Map.txt", ios_base::in);
file.open("Bilinear.txt");
// Store the map in a 2D array
for (int i=0; i<n; i++)
{
for (int j=0; j<m; j++)
{
map >> Z[i][j];
}
}
// Allocate memory on host and device
CUDA_SAFE_CALL(cudaMallocPitch((void**)&map_d,&pitch,n*sizeof(*map_d),m));
CUDA_SAFE_CALL(cudaMalloc((void**)&f, numberOfElements*sizeof(float)));
r = (float*)malloc(numberOfElements*sizeof(float));
// Copy map from host to device
CUDA_SAFE_CALL(cudaMemcpy2D(map_d, pitch, Z, n*sizeof(Z[0][0]), n*sizeof(Z[0][0]),m,cudaMemcpyHostToDevice));
// Set texture mode to bilinear interpolation
tex.normalized = false;
tex.filterMode = cudaFilterModeLinear;
// Bind the map to texture
CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, map_d, &tex.channelDesc, n, m, pitch));
// Checking for offset
if (tex_ofs !=0) {
printf ("tex_ofs = %zu\n", tex_ofs);
return EXIT_FAILURE;
}
// Launch Kernel
kernel <<< 1,1 >>> (m, n, f, numberOfInterpolationsPerSquare);
CHECK_LAUNCH_ERROR();
CUDA_SAFE_CALL (cudaDeviceSynchronize());
// Copy result from device to host
cudaMemcpy(r, f, numberOfElements*sizeof(float), cudaMemcpyDeviceToHost);
// Write results to file
for(int h=0;h<numberOfElements;h++)
{
if(g==sqrt(numberOfElements))
{
file << endl;
g=0;
}
file << r[h] << " ";
g++;
}
// Free memory
CUDA_SAFE_CALL (cudaUnbindTexture (tex));
CUDA_SAFE_CALL (cudaFree (map_d));
CUDA_SAFE_CALL (cudaFree (f));
free( r );
// Print out execution time
printf("Time taken: %.3fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
return EXIT_SUCCESS;
}
Here's the kernel with multithreading, which doesn't work:
__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare)
{
int k = sqrt(numberOfInterpolationsPerSquare);
int i= blockIdx.x * blockDim.x + threadIdx.x;
int j= blockIdx.y * blockDim.y + threadIdx.y;
if(i>=n*k || j>=m*k)
return;
f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f);
}
Does anyone know why the multithread version doesn't work?
Regards
Sondre
In the second kernel, i and j are int instead of float. So j/k and i/k in tex2D will result in integer division. Declare k as float to avoid integer division.
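A minimal sketch of that fix applied to the posted kernel (only the type of k changes; everything else is as in the question):
__global__ void kernel (int m, int n, float *f, float numberOfInterpolationsPerSquare)
{
    // k as float so that j/k and i/k below are floating-point divisions
    float k = sqrtf(numberOfInterpolationsPerSquare);
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i >= n*k || j >= m*k)
        return;
    f[(int)(j+(m*k*i))] = tex2D (tex, j/k+0.5f, i/k+0.5f);
}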
Initially, the kernel was launched with the following configuration:
//Find number of blocks
int nthreads = 1024;
int blocksize = 512;
int nblocks = ceil( (n*m*numberOfInterpolationsPerSquare) / nthreads);
// Launch Kernel
kernel <<< nblocks,blocksize >>> (m, n, f, numberOfInterpolationsPerSquare);
The problem with the above code is that it would launch a 1D grid of 1D blocks, but inside the kernel, 2D indexing is used. A 2D grid/block configuration is required for the kernel to work correctly. From the looks of the kernel code, the following grid/block configuration should work; the grid dimensions are rounded up so the whole (n*k) x (m*k) output is covered, and the bounds check in the kernel discards the extra threads:
float k = sqrt(numberOfInterpolationsPerSquare);
const int threads_x = (int)ceil(n * k);
const int threads_y = (int)ceil(m * k);
const dim3 dimBlock(16,16);
dim3 dimGrid;
dimGrid.x = (threads_x + dimBlock.x - 1)/dimBlock.x;
dimGrid.y = (threads_y + dimBlock.y - 1)/dimBlock.y;
kernel<<<dimGrid,dimBlock>>>(m, n, f, numberOfInterpolationsPerSquare);