I'm attempting to compile a basic CUDA matrix multiplication program, but I'm running into this error:
nvcc -I. -I/usr/local/cuda/include -c matrixMult1.cu -o matrixMult1.o
make: nvcc: Command not found
make: *** [matrixMult1.o] Error 127
I was getting another error originally and it was recommended that I use nvcc, the only catch being that I know absolutely nothing about nvcc. Anyone have an idea? Thanks in advance!
Makefile:
GCC = nvcc
CUDA_INSTALL_PATH := /usr/local/cuda
INCLUDES := -I. -I$(CUDA_INSTALL_PATH)/include
CUDA_LIBS := -L$(CUDA_INSTALL_PATH)/lib -lcudart
matrixMult1.o: matrixMult1.cu
$(GCC) $(INCLUDES) -c matrixMult1.cu -o $#
matrixMult1: matrixMult1.o
$(GCC) -o $# matrixMult1.o $(CUDA_LIBS)
clean:
$(RM) *.o *~
Kernel:
//********************************************************************
// matrixMul_kernel.cu
//
// Kernel for a basic matrix multiplication program.
//********************************************************************
#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_
#include <stdio.h>
/* Thread block size */
#define BLOCK_SIZE 3
#define WA 3
#define HA 3
#define WB 3
#define HB WA
#define WC WB
#define HC HA
/* CUDA Kernel */
__global__ void matrixMul (float * C, float * A, float * B, int wA,
int wB) {
/* Two dimensional thread ID */
int tx = threadIdx.x;
int ty = threadIdx.y;
/* Computation holder variable */
float value = 0;
/* Loop through row of A and column of B to compute cell of C */
for (int i = 0; i < wA; ++i) {
float elementA = A[ty * wA + i];
float elementB = B[i * wB + tx];
value += elementA * elementB;
}
/* Write the result to C */
C[ty * wA + tx] = value;
}
#endif
Main Program:
//********************************************************************
// matrixMult1.c
//
// A basic matrix multiplication program.
//********************************************************************
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <matrixMul_kernel.cu>
#define WA 3
#define HA 3
#define WB 3
#define HB WA
#define WC WB
#define HC HA
void initMatrix(float * matrix, int numIndices);
//*************
// Main Program
//*************
int main(int argc, char** argv) {
/* Set random seed */
srand(2013);
/* Compute memory sizes for matrices A, B, and C */
unsigned int sizeA = WA * HA;
unsigned int sizeB = WB * HB;
unsigned int sizeC = WC * HC;
unsigned int memoryA = sizeof(float) * sizeA;
unsigned int memoryB = sizeof(float) * sizeB;
unsigned int memoryC = sizeof(float) * sizeC;
/* Allocate memory for matrices A, B, and C */
float * matrixA = (float *) malloc(memoryA);
float * matrixB = (float *) malloc(memoryB);
float * matrixC = (float *) malloc(memoryC);
/* Initialize matrices A and B */
initMatrix(matrixA, sizeA);
initMatrix(matrixB, sizeB);
/* Print matrix A */
printf("\nMatrix A:\n");
for (int i = 0; i < sizeA; i++) {
printf("%f ", matrixA[i]);
if (((i + 1) % WA) == 0) {
printf("\n");
} else {
printf(" | ");
}
}
/* Print matrix B */
printf("\nMatrix B:\n");
for (int i = 0; i < sizeB; i++) {
printf("%f ", matrixB[i]);
if (((i + 1) % WA) == 0) {
printf("\n");
} else {
printf(" | ");
}
}
/* Allocate device memory */
float* deviceMemA;
float* deviceMemB;
float* deviceMemC;
cudaMalloc((void**) &deviceMemA, memoryA);
cudaMalloc((void**) &deviceMemB, memoryB);
cudaMalloc((void**) &deviceMemC, memoryC);
/* Copy host memory to device */
cudaMemcpy(deviceMemA, matrixA, memoryA,
cudaMemcpyHostToDevice);
cudaMemcpy(deviceMemB, matrixB, memoryB,
cudaMemcpyHostToDevice);
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(WC / threads.x, HC / threads.y);
/* Execute kernel */
matrixMul<<< grid, threads >>>(deviceMemC, deviceMemA,
deviceMemB, WA, WB);
cudaMemcpy(deviceMemC, matrixC, memoryC,
cudaMemcpyHostToDevice);
/* Print matrix C */
printf("\nMatrix C:\n");
for (int i = 0; i < sizeC; i++) {
printf("%f ", matrixC[i]);
if (((i + 1) % WC) == 0) {
printf("\n");
} else {
printf(" | ");
}
}
printf("\n");
/* Free up memory */
free(matrixA);
free(matrixB);
free(matrixC);
cudaFree(deviceMemA);
cudaFree(deviceMemB);
cudaFree(deviceMemC);
}
//--------------------------------------------------------------------
// initMatrix - Assigns a random float value to each indice of the
// matrix.
//
// PRE: matrix is a pointer to a block of bytes in memory; numIndices
// is the number of indicies in the matrix being instantiated.
// POST: Each index of the matrix has been instantiated with a random
// float value.
//--------------------------------------------------------------------
void initMatrix(float * matrix, int numIndices) {
/*
Loop through the block of bytes, assigning a random float
for each index of the matrix
*/
for (int i = 0; i < numIndices; ++i) {
/* Assign a random float between 0 and 1 at this byte */
matrix[i] = rand() / (float)RAND_MAX;
}
}
This error:
nvcc: Command not found
indicates that nvcc is not in your shell's PATH.
To fix it, assuming it's bash or similar:
PATH=$PATH:/usr/local/cuda/bin
make
...or add it to the system or your user's profile.
Related
I am trying to implement Cholesky decomposition using the cuSOLVER library. I am a beginner CUDA programmer and I have always specified block-sizes and grid-sizes, but I am not able to find out how this can be set explicitly by the programmer with cuSOLVER functions.
Here is the documentation: http://docs.nvidia.com/cuda/cusolver/index.html#introduction
The QR decomposition is implemented using the cuSOLVER library (see the example here: http://docs.nvidia.com/cuda/cusolver/index.html#ormqr-example1) and even there the above two parameters are not set.
To summarize, I have the following questions
How can the parameters: block-size and grid-size can be set with the cuSOLVER library?
How is the same being done with the mentioned QR example code in the NVIDIA documentation?
Robert Crovella has already answered this question. Here, I'm just providing a full example showing how Cholesky decomposition can be easily performed using the potrf function provided by the cuSOLVER library.
The Utilities.cu and Utilities.cuh files are mantained at this page and omitted here. The example implements the CPU as well as the GPU approach.
#include "cuda_runtime.h"
#include "device_launch_paraMeters.h"
#include<iostream>
#include <fstream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>
#include <cusolverDn.h>
#include <cublas_v2.h>
#include <cuda_runtime_api.h>
#include "Utilities.cuh"
#define prec_save 10
/******************************************/
/* SET HERMITIAN POSITIVE DEFINITE MATRIX */
/******************************************/
// --- Credit to: https://math.stackexchange.com/questions/357980/how-to-generate-random-symmetric-positive-definite-matrices-using-matlab
void setPDMatrix(double * __restrict h_A, const int N) {
// --- Initialize random seed
srand(time(NULL));
double *h_A_temp = (double *)malloc(N * N * sizeof(double));
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
h_A_temp[i * N + j] = (float)rand() / (float)RAND_MAX;
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
h_A[i * N + j] = 0.5 * (h_A_temp[i * N + j] + h_A_temp[j * N + i]);
for (int i = 0; i < N; i++) h_A[i * N + i] = h_A[i * N + i] + N;
}
/************************************/
/* SAVE REAL ARRAY FROM CPU TO FILE */
/************************************/
template <class T>
void saveCPUrealtxt(const T * h_in, const char *filename, const int M) {
std::ofstream outfile;
outfile.open(filename);
for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
outfile.close();
}
/************************************/
/* SAVE REAL ARRAY FROM GPU TO FILE */
/************************************/
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {
T *h_in = (T *)malloc(M * sizeof(T));
gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));
std::ofstream outfile;
outfile.open(filename);
for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
outfile.close();
}
/********/
/* MAIN */
/********/
int main(){
const int N = 1000;
// --- CUDA solver initialization
cusolverDnHandle_t solver_handle;
cusolveSafeCall(cusolverDnCreate(&solver_handle));
// --- CUBLAS initialization
cublasHandle_t cublas_handle;
cublasSafeCall(cublasCreate(&cublas_handle));
/***********************/
/* SETTING THE PROBLEM */
/***********************/
// --- Setting the host, N x N matrix
double *h_A = (double *)malloc(N * N * sizeof(double));
setPDMatrix(h_A, N);
// --- Allocate device space for the input matrix
double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * sizeof(double)));
// --- Move the relevant matrix from host to device
gpuErrchk(cudaMemcpy(d_A, h_A, N * N * sizeof(double), cudaMemcpyHostToDevice));
/****************************************/
/* COMPUTING THE CHOLESKY DECOMPOSITION */
/****************************************/
// --- cuSOLVE input/output parameters/arrays
int work_size = 0;
int *devInfo; gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
// --- CUDA CHOLESKY initialization
cusolveSafeCall(cusolverDnDpotrf_bufferSize(solver_handle, CUBLAS_FILL_MODE_LOWER, N, d_A, N, &work_size));
// --- CUDA POTRF execution
double *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));
cusolveSafeCall(cusolverDnDpotrf(solver_handle, CUBLAS_FILL_MODE_LOWER, N, d_A, N, work, work_size, devInfo));
int devInfo_h = 0; gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
if (devInfo_h != 0) std::cout << "Unsuccessful potrf execution\n\n" << "devInfo = " << devInfo_h << "\n\n";
// --- At this point, the lower triangular part of A contains the elements of L.
/***************************************/
/* CHECKING THE CHOLESKY DECOMPOSITION */
/***************************************/
saveCPUrealtxt(h_A, "D:\\Project\\solveSquareLinearSystemCholeskyCUDA\\solveSquareLinearSystemCholeskyCUDA\\h_A.txt", N * N);
saveGPUrealtxt(d_A, "D:\\Project\\solveSquareLinearSystemCholeskyCUDA\\solveSquareLinearSystemCholeskyCUDA\\d_A.txt", N * N);
cusolveSafeCall(cusolverDnDestroy(solver_handle));
return 0;
}
EDIT
Cholesky decomposition requires that the relevant matrix is Hermitian and positive definite. Symmetric and positive definite matrices can be generated by the approach in How to generate random symmetric positive definite matrices using MATLAB?.
The following Matlab code can be used for checking the results
clear all
close all
clc
warning off
N = 1000;
% --- Setting the problem solution
x = ones(N, 1);
load h_A.txt
A = reshape(h_A, N, N);
yMatlab = A * x;
Lmatlab = chol(A, 'lower');
xprime = inv(Lmatlab) * yMatlab;
xMatlab = inv(Lmatlab') * xprime;
fprintf('Percentage rms of solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));
load d_A.txt
LCUDA = tril(reshape(d_A, N, N));
fprintf('Percentage rms of Cholesky decompositions in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(Lmatlab - LCUDA).^2)) / sum(sum(abs(Lmatlab).^2))));
load xCUDA.txt
fprintf('Percentage rms of solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xCUDA - x).^2)) / sum(sum(abs(x).^2))));
You don't set block sizes (or grid sizes) explicitly when using a library like cusolver, or cublas, or cusparse.
The library chooses these, when it actually runs device code internal to the library.
I've spent several hours struggling with unspecified launch failure.
I've come up with a tiny task for myself in order to understand how shared memory works.
Task is to divide array [1, 2, 3, ... , N] into K group of (N / K) elements and find the sum of each group. (Difference between current and previous element of the array equals 1)
I was planning to use N threads in grid divided between K blocks. So every threadblock contains (N / K) threads. Thus one threadblock could be used to compute sum of one group. Also I wanted to dynamically allocate shared memory.
When I start program I got unspecified launch failure after cudaDeviceSynchronize() call. But when I try step-through debugging everthing is ok and works fine.
What am I doing wrong? (Visual Studio 2012 Professional, Compute Capability 2.1) I would very appreciate any help.
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
printf("%s\n",cudaGetErrorString(x)); \
system("pause"); \
return EXIT_FAILURE;}} while(0)
extern __shared__ double shrd[];
__global__ void kernel(double * a){
size_t threadID_block = blockDim.x * threadIdx.y + threadIdx.x;
size_t blockID_global = (gridDim.x * blockIdx.y + blockIdx.x );
size_t threadID_global = blockID_global * blockDim.x * blockDim.y + threadID_block;
double * temp = &shrd[blockID_global * blockDim.x * blockDim.y];
temp[threadID_block] = static_cast<double>(threadID_global);
__syncthreads();
if (threadID_block == 0){
a[blockID_global] = 0.0;
for (size_t index = 0; index < blockDim.x * blockDim.y; index++){
a[blockID_global] += temp[index];
}
}
}
int main(){
int devNum = 0;
CUDA_CALL(cudaGetDevice(&devNum));
CUDA_CALL(cudaSetDevice(devNum));
dim3 gridSize(2,2,1);
dim3 blockSize(4,4,1);
double * dev_a = NULL;
size_t length = gridSize.x * gridSize.y ;
size_t byteSize = length * sizeof(double);
CUDA_CALL(cudaMalloc(&dev_a,byteSize));
size_t shmem_perBlock = blockSize.x * blockSize.y * sizeof(double);
kernel <<< gridSize, blockSize, shmem_perBlock >>> (dev_a);
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaDeviceSynchronize());
double * a = new double [length];
CUDA_CALL(cudaMemcpy(a,dev_a,byteSize,cudaMemcpyDeviceToHost));
for (size_t index = 0; index < length; index++){
printf("%.3f\n",a[index]);
}
printf("\n");
CUDA_CALL(cudaFree(dev_a));
CUDA_CALL(cudaDeviceReset());
delete[]a;
system("pause");
return 0;
}
If you are on kepler or later first read this:
http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
Otherwise if you are pre-kepler read this:
http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf
There are some fundamentals you are missing in terms of CUDA programming. I have given you a template of your code below. It is for clarification of some of these fundamentals. Do not expect this to be optimized as I am expecting you to program the parallel reduction. This will get you started with an understanding on how to use shared memory.
Good Luck!
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N 10000
#define K 100
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
printf("%s\n",cudaGetErrorString(x)); \
system("pause"); \
return EXIT_FAILURE;}} while(0)
__global__ void kernel(double* a, double* results){
extern __shared__ double shared[];
size_t tid, tid_local, stride;
tid = blockDim.x*blockIdx.x+threadIdx.x; //thread id within all blocks
tid_local = threadIdx.x; //thread id within a block
stride = blockDim.x*gridDim.x; //total number of threads
double *start = &a[K*blockIdx.x]; //each block will get K of a block.
shared[tid_local]=start[tid_local]; //copy K elements into shared memory
__syncthreads();
//Perform Parallel reduction, you will have to implement this
//After parallel reduction, result should be in shared[0]
//for demonstration I made the code serial for each block on thread 0.
//This is for demonstration only.
double sum=0;
if(tid_local==0){
for(int i=0; i<K; i++){
sum+=shared[i];
}
a[blockIdx.x]=sum;
}
}
int main(){
int devNum = 0;
CUDA_CALL(cudaGetDevice(&devNum));
CUDA_CALL(cudaSetDevice(devNum));
double * dev_a = NULL;
double * dev_results=NULL;
CUDA_CALL(cudaMalloc(&dev_a, N*sizeof(double) ));
CUDA_CALL(cudaMalloc(&dev_results, (N/K)*sizeof(double)));
//copy dev_a onto GPU (this is the array you are summing).
dim3 block_size(K, 1, 1);
dim3 grid_size (N/K, 1, 1);
size_t shmem_perBlock = K * sizeof(double);
kernel <<< grid_size, block_size, shmem_perBlock >>> (dev_a, dev_results);
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaDeviceSynchronize());
//copy dev_results back to CPU, this is your result.
CUDA_CALL(cudaFree(dev_a));
CUDA_CALL(cudaFree(dev_results));
system("pause");
return 0;
}
I want to use Scsrmv cusparse function.
Reading the documentation from here, I can't figure how to define csrRowPtrA and csrColIndA:
csrRowPtrA : integer array of m+1 elements that contains the start of
every row and the end of the last row plus one.
csrColIndA : integer array of nnz ( = csrRowPtrA(m) - csrRowPtrA(0) )
column indices of the nonzero elements of matrix A.
So , for example:
float *devRow;
cudaMalloc((void **)&devRow, (m+1)*sizeof(float));
and if A is the matrix, then:
for (int i=0; i<m; i+= n) //m is rows , n is columns
devRow[i] = A[i];
This is for the start of every row. For the last row and plus 1? This confused me.
And for columns? Something like:
for (int i=0;i<nnz;i++)
devCol = devRow[m] - devRow[0];
Following Robert Crovella's answer, this is a fully worked example on how converting a sparse matrix stored in dense format into a CSR (compressed row storage) format. I hope it will be useful to other users.
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusparse_v2.h>
#include "Utilities.cuh"
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
if(CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %Ndims\Nobjs %s\nerror %Ndims: %s\nterminating!\Nobjs",__FILE__, __LINE__,err, \
_cusparseGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
const int Nrows = 4; // --- Number of rows
const int Ncols = 5; // --- Number of columns
// --- Host side dense matrix
double *h_A_dense = (double*)malloc(Nrows*Ncols*sizeof(*h_A_dense));
// --- Column-major ordering
h_A_dense[0] = 1.0f; h_A_dense[4] = 4.0f; h_A_dense[8] = 0.0f; h_A_dense[12] = 0.0f; h_A_dense[16] = 0.0f;
h_A_dense[1] = 0.0f; h_A_dense[5] = 2.0f; h_A_dense[9] = 3.0f; h_A_dense[13] = 0.0f; h_A_dense[17] = 0.0f;
h_A_dense[2] = 5.0f; h_A_dense[6] = 0.0f; h_A_dense[10] = 0.0f; h_A_dense[14] = 7.0f; h_A_dense[18] = 8.0f;
h_A_dense[3] = 0.0f; h_A_dense[7] = 0.0f; h_A_dense[11] = 9.0f; h_A_dense[15] = 0.0f; h_A_dense[19] = 6.0f;
//create device array and copy host to it
double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(*d_A_dense)));
gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));
// --- Descriptor for sparse matrix A
cusparseMatDescr_t descrA; cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSetMatType (descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase (descrA, CUSPARSE_INDEX_BASE_ZERO);
int nnz = 0; // --- Number of nonzero elements in dense matrix
const int lda = Nrows; // --- Leading dimension of dense matrix
// --- Device side number of nonzero elements per row
int *d_nnzPerVector; gpuErrchk(cudaMalloc(&d_nnzPerVector, Nrows * sizeof(*d_nnzPerVector)));
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, &nnz));
// --- Host side number of nonzero elements per row
int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(*h_nnzPerVector));
gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(*h_nnzPerVector), cudaMemcpyDeviceToHost));
printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
printf("\n");
// --- Device side dense matrix
double *d_A; gpuErrchk(cudaMalloc(&d_A, nnz * sizeof(*d_A)));
int *d_A_RowIndices; gpuErrchk(cudaMalloc(&d_A_RowIndices, (Nrows + 1) * sizeof(*d_A_RowIndices)));
int *d_A_ColIndices; gpuErrchk(cudaMalloc(&d_A_ColIndices, nnz * sizeof(*d_A_ColIndices)));
cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, d_A, d_A_RowIndices, d_A_ColIndices));
// --- Host side dense matrix
double *h_A = (double *)malloc(nnz * sizeof(*h_A));
int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(*h_A_RowIndices));
int *h_A_ColIndices = (int *)malloc(nnz * sizeof(*h_A_ColIndices));
gpuErrchk(cudaMemcpy(h_A, d_A, nnz*sizeof(*h_A), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));
for (int i = 0; i < nnz; ++i) printf("A[%i] = %.0f ", i, h_A[i]); printf("\n");
for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");
for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);
}
You can convert a dense matrix to sparse with code you write yourself. For the CSR (compressed-sparse-row) formulation, you could also use the CUSPARSE function for this.
The general format of a CSR sparse matrix representation is documented in many places, including the CUSPARSE manual.
The following program tests the dense to sparse conversion using cuSPARSE. It produces garbage in the first several lines of output. But if I move the lines marked with (2) to the place after the lines marked with (1), the program works fine. Can someone tell me what could be the reason?
EDIT:
To make the presentation clearer, I rewrote the program with thrust, the same issue persists.
EDIT:
As suggested by Robert, I changed it back to the version without thrust and added api level error check code.
#include <iostream>
#include <cusparse_v2.h>
using std::cerr;
using std::cout;
using std::endl;
#define WRAP(x) do {x} while (0)
#define CHKcusparse(x) WRAP( \
cusparseStatus_t err = (x); \
if (err != CUSPARSE_STATUS_SUCCESS) { \
cerr << "Cusparse Error #" << int(err) << "\"TODO\" at Line " \
<< __LINE__ << " of " << __FILE__ << ": " << #x << endl; \
exit(1); \
} \
)
#define CHKcuda(x) WRAP( \
cudaError_t err = (x); \
if (err != cudaSuccess) { \
cerr << "Cuda Error #" << int(err) << ", \"" \
<< cudaGetErrorString(err) << "\" at Line " << __LINE__ \
<< " of " << __FILE__ << ": " << #x << endl; \
exit(1); \
} \
)
#define ALLOC(X, T, N) do { \
h##X = (T*) malloc(sizeof(T) * (N)); \
CHKcuda(cudaMalloc((void**)&d##X, sizeof(T) * (N))); \
} while(0)
int main() {
srand(100);
cusparseHandle_t g_cusparse_handle;
CHKcusparse(cusparseCreate(&g_cusparse_handle));
const int n = 100, in_degree = 10;
int nnz = n * in_degree, nn = n * n;
int *dnnz, *dridx, *dcols;
int *hnnz, *hridx, *hcols;
float *dvals, *dmat;
float *hvals, *hmat;
// (1) The number of non-zeros in each column.
ALLOC(nnz, int, n);
// The dense matrix.
ALLOC(mat, float, nn);
// The values in sparse matrix.
ALLOC(vals, float, nnz);
// (2) The row indices of the sparse matrix.
ALLOC(ridx, int, nnz);
// The column offsets of the sparse matrix.
ALLOC(cols, int, n+1);
// Fill and copy dense matrix and number of non-zeros.
for (int i = 0; i < nn; i++) {hmat[i] = rand();}
for (int i = 0; i < n; i++) {hnnz[i] = in_degree;}
CHKcuda(cudaMemcpyAsync(dnnz, hnnz, sizeof(int) * n, cudaMemcpyHostToDevice));
CHKcuda(cudaMemcpyAsync(dmat, hmat, sizeof(float) * nn, cudaMemcpyHostToDevice));
CHKcuda(cudaDeviceSynchronize());
// Perform dense to CSC format
cusparseMatDescr_t cspMatDesc;
CHKcusparse(cusparseCreateMatDescr(&cspMatDesc));
CHKcusparse(cusparseSdense2csc(
g_cusparse_handle, n, n, cspMatDesc, dmat, n,
dnnz, dvals, dridx, dcols
));
// Copy row indices back.
CHKcuda(cudaMemcpyAsync(hridx, dridx, sizeof(int) * nnz, cudaMemcpyDeviceToHost));
CHKcuda(cudaDeviceSynchronize());
CHKcusparse(cusparseDestroyMatDescr(cspMatDesc));
// Display row indices.
for (int i = 0; i < n; i++) {
for (int j = 0; j < in_degree; j++) {
std::cout << hridx[i * in_degree + j] << ", ";
}
std::cout << std::endl;
}
CHKcuda(cudaFree(dnnz));
CHKcuda(cudaFree(dvals));
CHKcuda(cudaFree(dridx));
CHKcuda(cudaFree(dcols));
CHKcuda(cudaFree(dmat));
free(hnnz);
free(hmat);
free(hvals);
free(hridx);
free(hcols);
return 0;
}
The basic problem is that you are passing internally inconsistent data to the dense-to-sparse routine. You are passing a dense matrix which has 100 non-zero elements per column, but you are telling cusparse that there are only 10 non-zero elements per column.
If you run your code with cuda-memcheck, you will see that there are errors coming out of cusparse.
For this code, you can fix the issue by changing your in_degree variable to 100.
For the general case, cusparse provides a convenient routine to populate the number of non-zero elements per column correctly.
As already underlined by Robert Crovella, passing from dense to sparse can be effectively performed using cuSPARSE by the cusparse<t>nnz() and cusparse<t>dense2csr() routines. The vice versa can be done by the cusparse<t>csr2dense() routine. Below, there is a fully worked out example showing how passing from dense to sparse and vice versa using cuSPARSE in CSR format.
cuSparseUtilities.cuh
#ifndef CUSPARSEUTILITIES_CUH
#define CUSPARSEUTILITIES_CUH
#include "cusparse_v2.h"
void setUpDescriptor(cusparseMatDescr_t &, cusparseMatrixType_t, cusparseIndexBase_t);
void dense2SparseD(const double * __restrict__ d_A_dense, int **d_nnzPerVector, double **d_A,
int **d_A_RowIndices, int **d_A_ColIndices, int &nnz, cusparseMatDescr_t descrA,
const cusparseHandle_t handle, const int Nrows, const int Ncols);
#endif
cuSparseUtilities.cu
#include "cuSparseUtilities.cuh"
#include "Utilities.cuh"
/*****************************/
/* SETUP DESCRIPTOR FUNCTION */
/*****************************/
void setUpDescriptor(cusparseMatDescr_t &descrA, cusparseMatrixType_t matrixType, cusparseIndexBase_t indexBase) {
cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSafeCall(cusparseSetMatType(descrA, matrixType));
cusparseSafeCall(cusparseSetMatIndexBase(descrA, indexBase));
}
/********************************************************/
/* DENSE TO SPARSE CONVERSION FOR REAL DOUBLE PRECISION */
/********************************************************/
void dense2SparseD(const double * __restrict__ d_A_dense, int **d_nnzPerVector, double **d_A,
int **d_A_RowIndices, int **d_A_ColIndices, int &nnz, cusparseMatDescr_t descrA,
const cusparseHandle_t handle, const int Nrows, const int Ncols) {
const int lda = Nrows; // --- Leading dimension of dense matrix
gpuErrchk(cudaMalloc(&d_nnzPerVector[0], Nrows * sizeof(int)));
// --- Compute the number of nonzero elements per row and the total number of nonzero elements in the dense d_A_dense
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector[0], &nnz));
// --- Device side sparse matrix
gpuErrchk(cudaMalloc(&d_A[0], nnz * sizeof(double)));
gpuErrchk(cudaMalloc(&d_A_RowIndices[0], (Nrows + 1) * sizeof(int)));
gpuErrchk(cudaMalloc(&d_A_ColIndices[0], nnz * sizeof(int)));
cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector[0], d_A[0], d_A_RowIndices[0], d_A_ColIndices[0]));
}
kernel.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cusparse_v2.h>
#include "cuSparseUtilities.cuh"
#include "Utilities.cuh"
/********/
/* MAIN */
/********/
int main() {
cusparseHandle_t handle;
// --- Initialize cuSPARSE
cusparseSafeCall(cusparseCreate(&handle));
cusparseMatDescr_t descrA = 0;
/**************************/
/* SETTING UP THE PROBLEM */
/**************************/
const int Nrows = 5; // --- Number of rows
const int Ncols = 4; // --- Number of columns
const int N = Nrows;
// --- Host side dense matrix
double *h_A_dense = (double*)malloc(Nrows * Ncols * sizeof(*h_A_dense));
// --- Column-major storage
h_A_dense[ 0] = 0.4612f; h_A_dense[ 5] = -0.0006f; h_A_dense[10] = 1.3f; h_A_dense[15] = 0.0f;
h_A_dense[ 1] = 0.0f; h_A_dense[ 6] = 1.443f; h_A_dense[11] = 0.0f; h_A_dense[16] = 0.0f;
h_A_dense[ 2] = -0.0006f; h_A_dense[ 7] = 0.4640f; h_A_dense[12] = 0.0723f; h_A_dense[17] = 0.0f;
h_A_dense[ 3] = 0.3566f; h_A_dense[ 8] = 0.0723f; h_A_dense[13] = 0.7543f; h_A_dense[18] = 0.0f;
h_A_dense[ 4] = 0.f; h_A_dense[ 9] = 0.0f; h_A_dense[14] = 0.0f; h_A_dense[19] = 0.1f;
// --- Create device array and copy host array to it
double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(double)));
gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));
/*******************************/
/* FROM DENSE TO SPARSE MATRIX */
/*******************************/
// --- Descriptor for sparse matrix A
setUpDescriptor(descrA, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ONE);
int nnz = 0; // --- Number of nonzero elements in dense matrix
int *d_nnzPerVector; // --- Device side number of nonzero elements per row
double *d_A; // --- Sparse matrix values - array of size nnz
int *d_A_RowIndices; // --- "Row indices"
int *d_A_ColIndices; // --- "Column indices"
dense2SparseD(d_A_dense, &d_nnzPerVector, &d_A, &d_A_RowIndices, &d_A_ColIndices, nnz, descrA, handle, Nrows, Ncols);
/*******************************************************/
/* CHECKING THE RESULTS FOR DENSE TO SPARSE CONVERSION */
/*******************************************************/
// --- Host side number of nonzero elements per row
int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(int));
gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(int), cudaMemcpyDeviceToHost));
printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
printf("\n");
// --- Host side sparse matrix
double *h_A = (double *)malloc(nnz * sizeof(double));
int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(int));
int *h_A_ColIndices = (int *)malloc(nnz * sizeof(int));
gpuErrchk(cudaMemcpy(h_A, d_A, nnz * sizeof(double), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(int), cudaMemcpyDeviceToHost));
printf("\nOriginal matrix in CSR format\n\n");
for (int i = 0; i < nnz; ++i) printf("A[%i] = %f\n", i, h_A[i]); printf("\n");
printf("\n");
for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");
for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);
/*******************************/
/* FROM SPARSE TO DENSE MATRIX */
/*******************************/
double *d_A_denseReconstructed; gpuErrchk(cudaMalloc(&d_A_denseReconstructed, Nrows * Ncols * sizeof(double)));
cusparseSafeCall(cusparseDcsr2dense(handle, Nrows, Ncols, descrA, d_A, d_A_RowIndices, d_A_ColIndices,
d_A_denseReconstructed, Nrows));
/*******************************************************/
/* CHECKING THE RESULTS FOR SPARSE TO DENSE CONVERSION */
/*******************************************************/
double *h_A_denseReconstructed = (double *)malloc(Nrows * Ncols * sizeof(double));
gpuErrchk(cudaMemcpy(h_A_denseReconstructed, d_A_denseReconstructed, Nrows * Ncols * sizeof(double), cudaMemcpyDeviceToHost));
printf("\nReconstructed dense matrix \n");
for (int m = 0; m < Nrows; m++) {
for (int n = 0; n < Ncols; n++)
printf("%f\t", h_A_denseReconstructed[n * Nrows + m]);
printf("\n");
}
return 0;
}
I'm attempting to write a simple matrix multplication program that continually adds the product of two matrices to a third result matrix (I'm essentially giving the GPU a workout while I measure power consumption with a separate apparatus).
My problem occurs when I specify a large number of iterations. I've tried this with several combinations of BLOCK_SIZE and matrix dimension values, and I've noted that the number of iterations can be increased with smaller matrix dimensions, but the BLOCK_SIZE must be the square root of the matrix dimensions (square matrices).
The resulting error in this case is a 39 second (regardless of iteration value, as long as it is 'too much') freeze followed by all zero matrix output. Interestingly, I ran this once with an iteration of 20000 and it worked fine. I ran it again and got the freeze error.
Any ideas? Thanks in advance!
Kernel:
//********************************************************************
// matrixMultiplication_kernel.cu
//
// Kernel for a basic CUDA matrix multiplication program.
//********************************************************************
#ifndef MATRIXMULTIPLICATION_KERNEL
#define MATRIXMULTIPLICATION_KERNEL
#define BLOCK_SIZE 16 // Set thread block size
#define colsA 256 // Set matrix A column dimension
#define rowsA 256 // Set matrix A row dimension
#define colsB 256 // Set matrix B column dimension
#define rowsB colsA // Set matrix B row dimension
#define colsC colsB // Set matrix C column dimension
#define rowsC rowsA // Set matrix C row dimension
//--------------------------------------------------------------------
// matrixMultiplication() - Multiplies matrixA and matrixB, storing
// the result in device memory for matrixC.
//
// PRE: matrixA, matrixB, and matrixC are float pointers; numColsA
// numColsB are integers.
// POST: The result of multiplying matrixA and matrixB is stored in
// matrixC.
//--------------------------------------------------------------------
__global__ void matrixMultiplication(float * matrixA, float * matrixB,
float * matrixC, int numColsA,
int numColsB) {
/* Declare matrix-multplication holder value ouside of for loop */
float val;
/* Set block and thread index positions */
int blockX = blockIdx.x;
int blockY = blockIdx.y;
int threadX = threadIdx.x;
int threadY = threadIdx.y;
/*
Set starting and ending indices of the first sub-matrix of A
and sub-matrix size for matrix A
*/
int startA = numColsA * BLOCK_SIZE * blockY;
int endA = startA + numColsA - 1;
int subSizeA = BLOCK_SIZE;
/*
Set starting index of the first sub-matrix of B and sub-matrix
size for matrix B
*/
int startB = BLOCK_SIZE * blockX;
int subSizeB = BLOCK_SIZE * colsB;
/* Perform matrix multiplication 20000 times */
for (int iteration = 0; iteration < 20000; iteration++) {
/* Loop through matrix A and matrix B's sub-matrices */
for (int i = startA, j = startB; i <= endA; i += subSizeA,
j += subSizeB) {
/*
Declare shared memory arrays for matrix A and B
sub-matrices
*/
__shared__ float subA[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float subB[BLOCK_SIZE][BLOCK_SIZE];
/* Fill sub-matrices */
subA[threadY][threadX] =
matrixA[i + colsA * threadY + threadX];
subB[threadY][threadX] =
matrixB[j + colsB * threadY + threadX];
/* Ensure that the matrices are loaded */
__syncthreads();
/* Loop through the block */
for (int k = 0; k < BLOCK_SIZE; ++k) {
/* Compute product of two matrix indices */
val += subA[threadY][k] * subB[k][threadX];
}
/*
Ensure completion before the next set of sub-matrices
begin computation
*/
__syncthreads();
}
/* Set device memory for this sub-matrix */
int position = colsB * BLOCK_SIZE * blockY + BLOCK_SIZE * blockX;
matrixC[position + colsB * threadY + threadX] = val;
}
}
#endif
Host:
//********************************************************************
// matrixMultiplication.cu
//
// A basic CUDA matrix multiplication program.
//********************************************************************
/* Include necessary libraries and kernel */
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <matrixMultiplication_kernel.cu>
/* Function declarations */
void fillMatrix(float * matrix, int numIndices);
//*************
// Main Program
//*************
int main(int argc, char** argv) {
/* Declare device memory */
float * deviceA;
float * deviceB;
float * deviceC;
srand(2013); // Set random seed
/* Determine total number of indices in each matrix */
unsigned int numIndicesA = colsA * rowsA;
unsigned int numIndicesB = colsB * rowsB;
unsigned int numIndicesC = colsC * rowsC;
/* Determine memory size of each matrix */
unsigned int memoryA = sizeof(float) * numIndicesA;
unsigned int memoryB = sizeof(float) * numIndicesB;
unsigned int memoryC = sizeof(float) * numIndicesC;
/* Allocate memory for each matrix */
float * matrixA = (float *) malloc(memoryA);
float * matrixB = (float *) malloc(memoryB);
float * matrixC = (float *) malloc(memoryC);
/* Set contents of matrices A and B (matrix C is all zeros) */
fillMatrix(matrixA, numIndicesA);
fillMatrix(matrixB, numIndicesB);
/* Allocate device memory for each matrix */
cudaMalloc((void **) &deviceA, memoryA);
cudaMalloc((void **) &deviceB, memoryB);
cudaMalloc((void **) &deviceC, memoryC);
/* Copy host memory to device memory for matrices A and B */
cudaMemcpy(deviceA, matrixA, memoryA, cudaMemcpyHostToDevice);
cudaMemcpy(deviceB, matrixB, memoryB, cudaMemcpyHostToDevice);
/* Set thread count to BLOCK_SIZE x BLOCK_SIZE */
dim3 tCount(BLOCK_SIZE, BLOCK_SIZE);
/* Set thread block count */
dim3 tbCount((colsC / tCount.x), (rowsC / tCount.y));
/* Run kernel */
matrixMultiplication <<< tbCount, tCount >>> (deviceA, deviceB,
deviceC, colsA,
colsB);
/* Copy device memory to host memory for matrix C */
cudaMemcpy(matrixC, deviceC, memoryC, cudaMemcpyDeviceToHost);
for(int i = 0; i < 256; i++) {
printf("%f ", matrixC[i]);
}
printf("\n");
/* Free up host and device memory for each matrix */
free(matrixA);
free(matrixB);
free(matrixC);
cudaFree(deviceA);
cudaFree(deviceB);
cudaFree(deviceC);
}
//--------------------------------------------------------------------
// fillMatrix - Assigns a random float value to each indice of the
// matrix.
//
// PRE: matrix is a pointer to a block of bytes in memory; numIndices
// is the number of indicies in the matrix being instantiated.
// POST: Each index of the matrix has been filled with random float
// values.
//--------------------------------------------------------------------
void fillMatrix(float * matrix, int numIndices) {
/* Loop through each index of the matrix */
for (int i = 0; i < numIndices; ++i) {
/*
Assign a random float between 0 and 1 for this index of
the matrix
*/
matrix[i] = rand() / (float)RAND_MAX;
}
}
Makefile:
GCC = nvcc
CUDA_INSTALL_PATH := /usr/local/cuda
INCLUDES := -I. -I$(CUDA_INSTALL_PATH)/include
CUDA_LIBS := -L$(CUDA_INSTALL_PATH)/lib -lcudart
matrixMultiplication.o: matrixMultiplication.cu
$(GCC) $(INCLUDES) -c matrixMultiplication.cu -o $#
matrixMultiplication: matrixMultiplication.o
$(GCC) -o $# matrixMultiplication.o $(CUDA_LIBS)
clean:
$(RM) *.o *~
Problem solved! It was a system timeout issue due to the long duration of the kernel. By switching terminal only mode, I was able to circumvent the issue.
Thanks for all the help guys!