I am trying to use Unified Memory with cudaMallocManaged() with the cuBLAS library. I am performing a simple matrix to vector multiplication as a simple example, and storing the result in an array results. However when printing the results array, I get back all 0's, instead of the results of multiplying the matrix mat by the vector vec.
The flow I am using is:
allocating memory with cudaMallocManaged()
Initializing the arrays with data
Allocating the cuBLAS handle
Calling cublasDgemv to perform the multiplication storing the results in results
When using new and then cublasSetMatrix() or cublasSetVector() this works fine.
How do I use Unified Memory with cuBLAS?
Here are minimum working examples:
Unified Memory Attempt (this gives back all 0's in results):
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <ctime>
#include "cublas_v2.h"
// Wraps a CUDA runtime call and reports a failure together with the file and
// line of the call site. The do { } while (0) form makes the macro expand to a
// single statement, so it stays correct inside an unbraced if/else.
#define cudaErrChk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic to stderr when `code` is not cudaSuccess.
//   code  - status returned by a CUDA runtime call
//   file  - source file to report
//   line  - source line to report
//   abort - when true (the default) the process exits with `code` as its status
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
        return;
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
// Translates a cuBLAS status code into the name of its enumerator.
// Despite its name this function checks nothing and never aborts: it only
// returns a string, so a caller that discards the return value has not
// verified the call. Returns "<unknown>" for values not listed below.
static const char *cublasErrChk(cublasStatus_t error)
{
    switch (error)
    {
        case CUBLAS_STATUS_SUCCESS:
            return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED:
            return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED:
            return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE:
            return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH:
            return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR:
            return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED:
            return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR:
            return "CUBLAS_STATUS_INTERNAL_ERROR";
        case CUBLAS_STATUS_NOT_SUPPORTED:
            return "CUBLAS_STATUS_NOT_SUPPORTED";
        case CUBLAS_STATUS_LICENSE_ERROR:
            return "CUBLAS_STATUS_LICENSE_ERROR";
    }
    return "<unknown>";
}
// Multiplies a dims x dims matrix by a vector with cublasDgemv, with all three
// buffers allocated in managed (unified) memory.
//
// NOTE: this listing intentionally still reads `results` on the host without
// synchronizing after cublasDgemv -- that is the defect this question is about
// (the call returns before the GPU has finished, so stale values are printed).
// Unrelated defects are fixed: the buffers are filled with 0-based indices
// (indexing 1..dims writes one element past each allocation), beta is 0 so the
// uninitialized `results` buffer is never read, and cuBLAS statuses are checked.
int main() {
    const size_t dims = 4;
    const int n = (int)dims;   // cuBLAS takes its dimensions as int
    double *vec, *mat, *results;
    cudaErrChk( cudaMallocManaged(&vec, dims * sizeof(double)) );
    cudaErrChk( cudaMallocManaged(&mat, dims * dims * sizeof(double)) );
    cudaErrChk( cudaMallocManaged(&results, dims * sizeof(double)) );

    // The stored values are still 0.5, 1.0, 1.5, ...; only the index is 0-based.
    printf("Vector:\n");
    for (size_t i = 0; i < dims; i++) {
        vec[i] = 0.5 * (i + 1);
        printf("%.2lf ", vec[i]);
    }
    // cuBLAS interprets mat as column-major, so each line printed below is a
    // column of the matrix that is actually multiplied.
    printf("\n\nMatrix:\n");
    for (size_t i = 0; i < dims * dims; i++) {
        mat[i] = 1.0 * (i + 1);
        printf("%.2lf ", mat[i]);
        if ((i + 1) % dims == 0)
            printf("\n");
    }
    printf("\n");

    cublasHandle_t handle;
    cublasStatus_t status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cublasCreate failed: %s\n", cublasErrChk(status));
        return 1;
    }

    // results = alpha * mat * vec + beta * results
    double alpha = 1.0, beta = 0.0;
    status = cublasDgemv(handle, CUBLAS_OP_N,
                         n, n,
                         &alpha,
                         mat, n,
                         vec, 1,
                         &beta,
                         results, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cublasDgemv failed: %s\n", cublasErrChk(status));
        return 1;
    }

    // Deliberately no synchronization here (see the note at the top).
    for (size_t i = 0; i < dims; i++)
        printf("%.2lf ", results[i]);
    printf("\n");

    status = cublasDestroy(handle);
    if (status != CUBLAS_STATUS_SUCCESS)
        fprintf(stderr, "cublasDestroy failed: %s\n", cublasErrChk(status));
    cudaErrChk( cudaFree(vec) );
    cudaErrChk( cudaFree(mat) );
    cudaErrChk( cudaFree(results) );
    return 0;
}
Regular malloc/setMatrix() Attempt:
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <ctime>
#include "cublas_v2.h"
// Wraps a CUDA runtime call and reports a failure together with the file and
// line of the call site. The do { } while (0) form makes the macro expand to a
// single statement, so it stays correct inside an unbraced if/else.
#define cudaErrChk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic to stderr when `code` is not cudaSuccess.
//   code  - status returned by a CUDA runtime call
//   file  - source file to report
//   line  - source line to report
//   abort - when true (the default) the process exits with `code` as its status
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
        return;
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
// Translates a cuBLAS status code into the name of its enumerator.
// Despite its name this function checks nothing and never aborts: it only
// returns a string, so a caller that discards the return value has not
// verified the call. Returns "<unknown>" for values not listed below.
static const char *cublasErrChk(cublasStatus_t error)
{
    switch (error)
    {
        case CUBLAS_STATUS_SUCCESS:
            return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED:
            return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED:
            return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE:
            return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH:
            return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR:
            return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED:
            return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR:
            return "CUBLAS_STATUS_INTERNAL_ERROR";
        case CUBLAS_STATUS_NOT_SUPPORTED:
            return "CUBLAS_STATUS_NOT_SUPPORTED";
        case CUBLAS_STATUS_LICENSE_ERROR:
            return "CUBLAS_STATUS_LICENSE_ERROR";
    }
    return "<unknown>";
}
// Multiplies a dims x dims matrix by a vector with cublasDgemv, staging the
// data through explicit host buffers and cublasSet*/cublasGet* copies.
int main() {
    const size_t dims = 4;
    const int n = (int)dims;   // cuBLAS takes its dimensions as int
    double *h_vec = new double[dims];
    double *h_mat = new double[dims * dims];
    double *h_results = new double[dims];

    // 0-based indices: indexing 1..dims would write one element past each
    // allocation and leave element 0 unset. The stored values are unchanged.
    printf("Vector:\n");
    for (size_t i = 0; i < dims; i++) {
        h_vec[i] = 0.5 * (i + 1);
        printf("%.2lf ", h_vec[i]);
    }
    // cuBLAS interprets the matrix as column-major, so each line printed below
    // is a column of the matrix that is actually multiplied.
    printf("\n\nMatrix:\n");
    for (size_t i = 0; i < dims * dims; i++) {
        h_mat[i] = 1.0 * (i + 1);
        printf("%.2lf ", h_mat[i]);
        if ((i + 1) % dims == 0)
            printf("\n");
    }
    printf("\n");

    double *d_vec, *d_mat, *d_results;
    cudaErrChk( cudaMalloc(&d_vec, dims * sizeof(double)) );
    cudaErrChk( cudaMalloc(&d_mat, dims * dims * sizeof(double)) );
    cudaErrChk( cudaMalloc(&d_results, dims * sizeof(double)) );

    // d_results = alpha * d_mat * d_vec + beta * d_results; beta is 0 so the
    // uninitialized device buffer never enters the sum.
    double alpha = 1.0, beta = 0.0;

    // Each step runs only if every previous one succeeded; the first failing
    // status is kept and reported below.
    cublasHandle_t handle;
    cublasStatus_t status = cublasCreate(&handle);
    if (status == CUBLAS_STATUS_SUCCESS)   // copy the data manually to the GPU
        status = cublasSetVector(n, sizeof(*d_vec), h_vec, 1, d_vec, 1);
    if (status == CUBLAS_STATUS_SUCCESS)
        status = cublasSetMatrix(n, n, sizeof(double), h_mat, n, d_mat, n);
    if (status == CUBLAS_STATUS_SUCCESS)   // multiply mat by vec to get results
        status = cublasDgemv(handle, CUBLAS_OP_N,
                             n, n,
                             &alpha,
                             d_mat, n,
                             d_vec, 1,
                             &beta,
                             d_results, 1);
    if (status == CUBLAS_STATUS_SUCCESS)
        status = cublasGetVector(n, sizeof(*h_results), d_results, 1, h_results, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cuBLAS call failed: %s\n", cublasErrChk(status));
        return 1;
    }

    for (size_t i = 0; i < dims; i++)
        printf("%.2lf ", h_results[i]);
    printf("\n");

    status = cublasDestroy(handle);
    if (status != CUBLAS_STATUS_SUCCESS)
        fprintf(stderr, "cublasDestroy failed: %s\n", cublasErrChk(status));
    cudaErrChk( cudaFree(d_vec) );
    cudaErrChk( cudaFree(d_mat) );
    cudaErrChk( cudaFree(d_results) );
    delete [] h_vec;
    delete [] h_mat;
    delete [] h_results;
    return 0;
}
Compile with
nvcc -o main main.cu -lcublas
As @talonmies pointed out, the problem was that cublasDgemv() is asynchronous: it returns before the GPU has finished, so the host was reading results before they had been written. This is fixed by adding cudaDeviceSynchronize() after the cublasDgemv() call:
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <ctime>
#include "cublas_v2.h"
// Wraps a CUDA runtime call and reports a failure together with the file and
// line of the call site. The do { } while (0) form makes the macro expand to a
// single statement, so it stays correct inside an unbraced if/else.
#define cudaErrChk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic to stderr when `code` is not cudaSuccess.
//   code  - status returned by a CUDA runtime call
//   file  - source file to report
//   line  - source line to report
//   abort - when true (the default) the process exits with `code` as its status
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
        return;
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
// Translates a cuBLAS status code into the name of its enumerator.
// Despite its name this function checks nothing and never aborts: it only
// returns a string, so a caller that discards the return value has not
// verified the call. Returns "<unknown>" for values not listed below.
static const char *cublasErrChk(cublasStatus_t error)
{
    switch (error)
    {
        case CUBLAS_STATUS_SUCCESS:
            return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED:
            return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED:
            return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE:
            return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH:
            return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR:
            return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED:
            return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR:
            return "CUBLAS_STATUS_INTERNAL_ERROR";
        case CUBLAS_STATUS_NOT_SUPPORTED:
            return "CUBLAS_STATUS_NOT_SUPPORTED";
        case CUBLAS_STATUS_LICENSE_ERROR:
            return "CUBLAS_STATUS_LICENSE_ERROR";
    }
    return "<unknown>";
}
// Multiplies a dims x dims matrix by a vector with cublasDgemv, with all three
// buffers in managed (unified) memory, and synchronizes with the device before
// the host reads the result.
int main() {
    const size_t dims = 4;
    const int n = (int)dims;   // cuBLAS takes its dimensions as int
    double *vec, *mat, *results;
    cudaErrChk( cudaMallocManaged(&vec, dims * sizeof(double)) );
    cudaErrChk( cudaMallocManaged(&mat, dims * dims * sizeof(double)) );
    cudaErrChk( cudaMallocManaged(&results, dims * sizeof(double)) );

    // 0-based indices: indexing 1..dims would write one element past each
    // allocation and leave element 0 unset. The stored values are unchanged.
    printf("Vector:\n");
    for (size_t i = 0; i < dims; i++) {
        vec[i] = 0.5 * (i + 1);
        printf("%.2lf ", vec[i]);
    }
    // cuBLAS interprets mat as column-major, so each line printed below is a
    // column of the matrix that is actually multiplied.
    printf("\n\nMatrix:\n");
    for (size_t i = 0; i < dims * dims; i++) {
        mat[i] = 1.0 * (i + 1);
        printf("%.2lf ", mat[i]);
        if ((i + 1) % dims == 0)
            printf("\n");
    }
    printf("\n");

    cublasHandle_t handle;
    cublasStatus_t status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cublasCreate failed: %s\n", cublasErrChk(status));
        return 1;
    }

    // results = alpha * mat * vec + beta * results; beta is 0 so the
    // uninitialized contents of results never enter the sum.
    double alpha = 1.0, beta = 0.0;
    status = cublasDgemv(handle, CUBLAS_OP_N,
                         n, n,
                         &alpha,
                         mat, n,
                         vec, 1,
                         &beta,
                         results, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cublasDgemv failed: %s\n", cublasErrChk(status));
        return 1;
    }

    // cublasDgemv returns before the GPU work is done; wait for it (and surface
    // any execution error) before touching the managed buffer from the host.
    cudaErrChk( cudaDeviceSynchronize() );

    for (size_t i = 0; i < dims; i++)
        printf("%.2lf ", results[i]);
    printf("\n");

    status = cublasDestroy(handle);
    if (status != CUBLAS_STATUS_SUCCESS)
        fprintf(stderr, "cublasDestroy failed: %s\n", cublasErrChk(status));
    cudaErrChk( cudaFree(vec) );
    cudaErrChk( cudaFree(mat) );
    cudaErrChk( cudaFree(results) );
    return 0;
}
Related
I am currently working on CUDA and trying to solve Ax = b using cuBLAS and cuSPARSE library. I looked through the sample codes including conjugateGradient & conjugateGradientPrecond provided by NVIDIA. However, the conjugate gradient method only works for positive definite matrix and it is an iterative method. Now, I have some general sparse matrices and I think I should take advantage of cuSPARSE library. Does anyone know how can I solve Ax = b using cuSPARSE and cuBLAS libraries? I could not find useful APIs for me. Generally, the matrices are expected to be at least 1000x1000 and in some cases it would go up to 100000x100000. Should I do this using a direct method?
One possibility to solve general sparse linear systems in CUDA is using cuSOLVER.
cuSOLVER has three useful routines:
cusolverSpDcsrlsvlu, which works for square linear systems (number of unknowns equal to the number of equations) and internally uses sparse LU factorization with partial pivoting;
cusolverSpDcsrlsvqr, which works for square linear systems (number of unknowns equal to the number of equations) and internally uses sparse QR factorization;
cusolverSpDcsrlsqvqr, which works for rectangular linear systems (number of unknowns different to the number of equations) and internally solves a least square problem.
For ALL the above routines, the supported matrix type is CUSPARSE_MATRIX_TYPE_GENERAL. If A is symmetric/Hermitian and only lower/upper part is used or meaningful, then its missing upper/lower part must be extended.
NOTES ON cusolverSpDcsrlsvlu
Attention should be paid to two input parameters: tol and reorder. Concerning the former, if the system matrix A is singular, then some diagonal elements of the matrix U of the LU decomposition are zero. The algorithm treats U(j,j) as zero if |U(j,j)|<tol. Concerning the latter, cuSOLVER provides a reordering to reduce
zero fill-in, which dramatically affects the performance of LU factorization. reorder toggles between reordering (reorder=1) and not reordering (reorder=0).
Attention should be paid also to an output parameter: singularity. It is -1 if A is invertible, otherwise it provides the first index j such that U(j,j)=0.
NOTES ON cusolverSpDcsrlsvqr
Attention should be paid to the same input/output parameters as before. In particular, tol is used to decide for singularity, reorder has no effect and singularity is -1 if A is invertible, otherwise it returns the first index j such that R(j,j)=0.
NOTES ON cusolverSpDcsrlsqvqr
Attention should be paid to the input parameter tol, which is used to decide the rank of A.
Attention should be also paid to the output parameters rankA, which represents the numerical rank of A, p, a permutation vector of length equal to the number of columns of A (please, see the documentation for further details) and min_norm, which is the norm of the residual ||Ax - b||.
Currently, as of CUDA 10.0, the above three functions are for the host channel only, which means that they do not yet run on GPU. They must be called as:
cusolverSpDcsrlsvluHost;
cusolverSpDcsrlsvqrHost;
cusolverSpDcsrlsqvqrHost,
and the input arguments should all reside on the host.
Below, please find a fully worked example using all the above three possibilities:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cusparse.h>
#include <cusolverSp.h>
/*******************/
/* iDivUp FUNCTION */
/*******************/
//extern "C" int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
// Integer division of a by b, rounded up by one whenever the division leaves
// a remainder. Callable from both host and device code.
__host__ __device__ int iDivUp(int a, int b)
{
    const int quotient = a / b;
    if (a % b == 0)
        return quotient;
    return quotient + 1;
}
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
// Reports a failed CUDA runtime call on stderr; a cudaSuccess status is a no-op.
//   code  - status returned by the CUDA runtime call
//   file  - source file to report
//   line  - source line to report
//   abort - when true (the default) the process exits with `code` as its status
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (!abort)
        return;
    exit(code);
}

// C-linkage convenience wrapper around gpuAssert. Because this is a function
// and not a macro, the file/line it reports are those of this wrapper rather
// than of the caller.
extern "C" void gpuErrchk(cudaError_t ans)
{
    gpuAssert(ans, __FILE__, __LINE__);
}
/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
// Translates a cuSOLVER status code into the name of its enumerator.
// Returns "<unknown>" for values not listed below.
static const char *_cusolverGetErrorEnum(cusolverStatus_t error)
{
    switch (error)
    {
        case CUSOLVER_STATUS_SUCCESS:
            return "CUSOLVER_STATUS_SUCCESS";
        case CUSOLVER_STATUS_NOT_INITIALIZED:
            return "CUSOLVER_STATUS_NOT_INITIALIZED";
        case CUSOLVER_STATUS_ALLOC_FAILED:
            return "CUSOLVER_STATUS_ALLOC_FAILED";
        case CUSOLVER_STATUS_INVALID_VALUE:
            return "CUSOLVER_STATUS_INVALID_VALUE";
        case CUSOLVER_STATUS_ARCH_MISMATCH:
            return "CUSOLVER_STATUS_ARCH_MISMATCH";
        case CUSOLVER_STATUS_MAPPING_ERROR:
            return "CUSOLVER_STATUS_MAPPING_ERROR";
        case CUSOLVER_STATUS_EXECUTION_FAILED:
            return "CUSOLVER_STATUS_EXECUTION_FAILED";
        case CUSOLVER_STATUS_INTERNAL_ERROR:
            return "CUSOLVER_STATUS_INTERNAL_ERROR";
        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
        case CUSOLVER_STATUS_NOT_SUPPORTED:
            return "CUSOLVER_STATUS_NOT_SUPPORTED";
        case CUSOLVER_STATUS_ZERO_PIVOT:
            return "CUSOLVER_STATUS_ZERO_PIVOT";
        case CUSOLVER_STATUS_INVALID_LICENSE:
            return "CUSOLVER_STATUS_INVALID_LICENSE";
    }
    return "<unknown>";
}
// Prints a diagnostic and terminates the process when a cuSOLVER call did not
// return CUSOLVER_STATUS_SUCCESS.
//   err  - status returned by the cuSOLVER call
//   file - source file to report
//   line - source line to report
inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
    if (CUSOLVER_STATUS_SUCCESS == err)
        return;

    // Report the location passed in by the caller, not the location of this helper.
    fprintf(stderr, "CUSOLVE error in file '%s', line %d, error: %s \nterminating!\n", file, line,
            _cusolverGetErrorEnum(err));
    assert(0);
    // assert() expands to nothing under NDEBUG; make sure we still stop.
    exit(EXIT_FAILURE);
}

// C-linkage convenience wrapper. Because this is a function and not a macro,
// the file/line it reports are those of this wrapper rather than of the caller.
extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
// Translates a cuSPARSE status code into the name of its enumerator.
// Returns "<unknown>" for values not listed below.
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
// Each entry returns the enumerator's own spelling as a string.
#define CUSPARSE_STATUS_NAME(status) case status: return #status
    switch (error)
    {
        CUSPARSE_STATUS_NAME(CUSPARSE_STATUS_SUCCESS);
        CUSPARSE_STATUS_NAME(CUSPARSE_STATUS_NOT_INITIALIZED);
        CUSPARSE_STATUS_NAME(CUSPARSE_STATUS_ALLOC_FAILED);
        CUSPARSE_STATUS_NAME(CUSPARSE_STATUS_INVALID_VALUE);
        CUSPARSE_STATUS_NAME(CUSPARSE_STATUS_ARCH_MISMATCH);
        CUSPARSE_STATUS_NAME(CUSPARSE_STATUS_MAPPING_ERROR);
        CUSPARSE_STATUS_NAME(CUSPARSE_STATUS_EXECUTION_FAILED);
        CUSPARSE_STATUS_NAME(CUSPARSE_STATUS_INTERNAL_ERROR);
        CUSPARSE_STATUS_NAME(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED);
        CUSPARSE_STATUS_NAME(CUSPARSE_STATUS_ZERO_PIVOT);
    }
#undef CUSPARSE_STATUS_NAME
    return "<unknown>";
}
// Prints a diagnostic, resets the device and terminates the process when a
// cuSPARSE call did not return CUSPARSE_STATUS_SUCCESS.
//   err  - status returned by the cuSPARSE call
//   file - source file to report
//   line - source line to report
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
    if (CUSPARSE_STATUS_SUCCESS == err)
        return;

    // One conversion specifier per argument: file, line, numeric status, status name.
    fprintf(stderr, "CUSPARSE error in file '%s', line %d\nerror %d: %s\nterminating!\n",
            file, line, (int)err, _cusparseGetErrorEnum(err));
    cudaDeviceReset();
    assert(0);
    // assert() expands to nothing under NDEBUG; make sure we still stop.
    exit(EXIT_FAILURE);
}

// C-linkage convenience wrapper. Because this is a function and not a macro,
// the file/line it reports are those of this wrapper rather than of the caller.
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
// Builds a small dense matrix, converts it to CSR with cuSPARSE on the device,
// copies the CSR arrays back to the host and solves A x = y with the host-side
// cuSOLVER sparse LU routine.
int main()
{
    // --- Initialize cuSPARSE
    cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));

    const int Nrows = 4; // --- Number of rows
    const int Ncols = 4; // --- Number of columns
    const int N = Nrows;

    // --- Host side dense matrix
    double *h_A_dense = (double*)malloc(Nrows*Ncols*sizeof(*h_A_dense));
    // --- Column-major ordering
    h_A_dense[0] = 1.0; h_A_dense[4] = 4.0; h_A_dense[8] = 0.0; h_A_dense[12] = 0.0;
    h_A_dense[1] = 0.0; h_A_dense[5] = 2.0; h_A_dense[9] = 3.0; h_A_dense[13] = 0.0;
    h_A_dense[2] = 5.0; h_A_dense[6] = 0.0; h_A_dense[10] = 0.0; h_A_dense[14] = 7.0;
    h_A_dense[3] = 0.0; h_A_dense[7] = 0.0; h_A_dense[11] = 9.0; h_A_dense[15] = 0.0;

    // --- Create device array and copy host to it
    double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(*d_A_dense)));
    gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));

    // --- Descriptor for sparse matrix A
    cusparseMatDescr_t descrA; cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSafeCall(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
    cusparseSafeCall(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO));

    int nnz = 0; // --- Number of nonzero elements in dense matrix
    const int lda = Nrows; // --- Leading dimension of dense matrix

    // --- Device side number of nonzero elements per row
    int *d_nnzPerVector; gpuErrchk(cudaMalloc(&d_nnzPerVector, Nrows * sizeof(*d_nnzPerVector)));
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, &nnz));

    // --- Host side number of nonzero elements per row
    int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(*h_nnzPerVector));
    gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(*h_nnzPerVector), cudaMemcpyDeviceToHost));

    printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
    for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
    printf("\n");

    // --- Device side sparse (CSR) matrix
    double *d_A; gpuErrchk(cudaMalloc(&d_A, nnz * sizeof(*d_A)));
    int *d_A_RowIndices; gpuErrchk(cudaMalloc(&d_A_RowIndices, (Nrows + 1) * sizeof(*d_A_RowIndices)));
    int *d_A_ColIndices; gpuErrchk(cudaMalloc(&d_A_ColIndices, nnz * sizeof(*d_A_ColIndices)));

    cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, d_A, d_A_RowIndices, d_A_ColIndices));

    // --- Host side sparse (CSR) matrix
    double *h_A = (double *)malloc(nnz * sizeof(*h_A));
    int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(*h_A_RowIndices));
    int *h_A_ColIndices = (int *)malloc(nnz * sizeof(*h_A_ColIndices));
    gpuErrchk(cudaMemcpy(h_A, d_A, nnz*sizeof(*h_A), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));

    for (int i = 0; i < nnz; ++i) printf("A[%i] = %.0f ", i, h_A[i]);
    printf("\n");

    for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]);
    printf("\n");

    for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);

    // --- Host side right-hand side; the *Host solvers take host pointers only,
    //     so no device copies of y or x are needed
    double *h_y = (double *)malloc(Nrows * sizeof(double));
    h_y[0] = 100.0; h_y[1] = 200.0; h_y[2] = 400.0; h_y[3] = 500.0;

    // --- Host side result vector
    double *h_x = (double *)malloc(Ncols * sizeof(double));

    // --- CUDA solver initialization
    cusolverSpHandle_t solver_handle;
    cusolveSafeCall(cusolverSpCreate(&solver_handle));

    // --- Using LU factorization
    int singularity;
    cusolveSafeCall(cusolverSpDcsrlsvluHost(solver_handle, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, 0.000001, 0, h_x, &singularity));
    // --- Using QR factorization
    //cusolveSafeCall(cusolverSpDcsrlsvqrHost(solver_handle, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, 0.000001, 0, h_x, &singularity));

    //int rankA;
    //int *p = (int *)malloc(N * sizeof(int));
    //double min_norm;
    //cusolveSafeCall(cusolverSpDcsrlsqvqrHost(solver_handle, N, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, 0.000001, &rankA, h_x, p, &min_norm));

    // singularity is -1 for an invertible matrix; otherwise it is the first
    // index j at which the factor's diagonal entry was judged to be zero.
    if (singularity >= 0)
        printf("WARNING: matrix is singular at index %d; the solution below is not reliable\n", singularity);

    printf("Showing the results...\n");
    for (int i = 0; i < N; i++) printf("%f\n", h_x[i]);

    // --- Release library handles, device memory and host memory
    cusolveSafeCall(cusolverSpDestroy(solver_handle));
    cusparseSafeCall(cusparseDestroyMatDescr(descrA));
    cusparseSafeCall(cusparseDestroy(handle));
    gpuErrchk(cudaFree(d_A_dense));
    gpuErrchk(cudaFree(d_nnzPerVector));
    gpuErrchk(cudaFree(d_A));
    gpuErrchk(cudaFree(d_A_RowIndices));
    gpuErrchk(cudaFree(d_A_ColIndices));
    free(h_A_dense);
    free(h_nnzPerVector);
    free(h_A);
    free(h_A_RowIndices);
    free(h_A_ColIndices);
    free(h_y);
    free(h_x);
    return 0;
}
I have an array of 300,000 points and I want the fft of every 600 points. I'm attempting to use cufftPlanMany to execute, but I'm getting an unknown error here:
cufftSafeCall(cufftPlanMany(&plan, rank, n, NULL, istride, idist, NULL, 1,1, CUFFT_C2C, 500));
retrevialfft.cu(82) : cufftSafeCall() CUFFT error: <unknown>
Here's the code in context
// Fragment from the question: allocates a host signal, creates a batched 1D
// C2C plan and runs it in place on a device buffer. SIGNAL_SIZE and
// cufftSafeCall are defined outside this fragment.
cudaSetDevice(0);
// Allocate host memory for the signal
cufftComplex* h_signal=(cufftComplex*)malloc(sizeof(cufftComplex) * SIGNAL_SIZE);
// Initialize the memory for the signal
for (unsigned int i = 0; i < SIGNAL_SIZE; ++i) {
h_signal[i].x = rand() / (float)RAND_MAX;
h_signal[i].y = 0;
// printf("Orignal: %f %f \n", h_signal[i].x, h_signal[i].y);
}
int mem_size = sizeof(cufftComplex) * SIGNAL_SIZE;
// Allocate device memory for signal
// NOTE(review): nothing in this fragment copies h_signal into d_signal before
// the transform runs, and the cudaMalloc result is not checked.
cufftComplex* d_signal;
cudaMalloc((void**)&d_signal, mem_size);
int rank = 1; //1d plan
// NOTE(review): with n = {300000} and a batch of 500 this plan describes 500
// transforms of 300000 points each, while the text above asks for 600-point
// transforms over 300,000 points in total -- presumably the transform size
// (and idist) should be 600; confirm against the intended layout.
int numCols = 300000;
int n[] = {numCols};
// NOTE(review): batch and ostride are declared but the plan call below passes
// the literals 500 and 1 instead.
int batch = 500;
int istride = 1;
int ostride = 1;
int idist = numCols;
// CUFFT plan
cufftHandle plan;
cufftSafeCall(cufftPlanMany(&plan, rank, n, NULL, istride, idist, NULL, 1,1, CUFFT_C2C, 500));
// Transform signal
printf("Transforming signal cufftExecC2C\n");
cufftSafeCall(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD));
// Copy device memory to host
cufftComplex* h_transformed = (cufftComplex*)malloc(sizeof(cufftComplex) * SIGNAL_SIZE);;
cudaMemcpy(h_transformed, d_signal, mem_size,
cudaMemcpyDeviceToHost);
//Destroy CUFFT context
cufftDestroy(plan);
// cleanup memory
free(h_signal);
free(h_transformed);
cudaFree(d_signal);
cudaDeviceReset();
Any idea of what the error actually is?
You decided not to show any more detail on your question. Below, I'm providing a full working code using cufftPlanMany() to execute batched 1D FFTs. I hope it helps.
#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <assert.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
// Wraps a CUDA runtime call and reports a failure together with the file and
// line of the call site. The do { } while (0) form makes the macro expand to a
// single statement, so it stays correct inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic to stderr when `code` is not cudaSuccess.
//   code  - status returned by a CUDA runtime call
//   file  - source file to report (const: callers pass the __FILE__ literal)
//   line  - source line to report
//   abort - when true (the default) the function waits for a key press and
//           then exits with `code` as the process status
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
        return;
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) { getchar(); exit(code); }
}
/*********************/
/* CUFFT ERROR CHECK */
/*********************/
// Translates a cuFFT result code into the name of its enumerator.
// Returns "<unknown>" for values not listed below.
static const char *_cudaGetErrorEnum(cufftResult error)
{
    switch (error)
    {
        case CUFFT_SUCCESS:
            return "CUFFT_SUCCESS";
        case CUFFT_INVALID_PLAN:
            return "CUFFT_INVALID_PLAN";
        case CUFFT_ALLOC_FAILED:
            return "CUFFT_ALLOC_FAILED";
        case CUFFT_INVALID_TYPE:
            return "CUFFT_INVALID_TYPE";
        case CUFFT_INVALID_VALUE:
            return "CUFFT_INVALID_VALUE";
        case CUFFT_INTERNAL_ERROR:
            return "CUFFT_INTERNAL_ERROR";
        case CUFFT_EXEC_FAILED:
            return "CUFFT_EXEC_FAILED";
        case CUFFT_SETUP_FAILED:
            return "CUFFT_SETUP_FAILED";
        case CUFFT_INVALID_SIZE:
            return "CUFFT_INVALID_SIZE";
        case CUFFT_UNALIGNED_DATA:
            return "CUFFT_UNALIGNED_DATA";
        case CUFFT_INVALID_DEVICE:
            return "CUFFT_INVALID_DEVICE";
        case CUFFT_NO_WORKSPACE:
            return "CUFFT_NO_WORKSPACE";
        case CUFFT_NOT_IMPLEMENTED:
            return "CUFFT_NOT_IMPLEMENTED";
        case CUFFT_NOT_SUPPORTED:
            return "CUFFT_NOT_SUPPORTED";
        default:
            // Other result codes exist in some cuFFT versions; they fall
            // through to the generic name below.
            break;
    }
    return "<unknown>";
}
// Checks the result of a cuFFT call, reporting the file and line of the call site.
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)

// Prints a diagnostic, resets the device and terminates the process when a
// cuFFT call did not return CUFFT_SUCCESS.
//   err  - result returned by the cuFFT call
//   file - source file to report
//   line - source line to report
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
    if (CUFFT_SUCCESS == err)
        return;

    // One conversion specifier per argument: file, line, numeric result, result name.
    fprintf(stderr, "CUFFT error in file '%s', line %d\nerror %d: %s\nterminating!\n",
            file, line, (int)err, _cudaGetErrorEnum(err));
    cudaDeviceReset();
    assert(0);
    // assert() expands to nothing under NDEBUG; make sure we still stop.
    exit(EXIT_FAILURE);
}
/********/
/* MAIN */
/********/
// Runs `batch` independent in-place 1D complex-to-complex FFTs of `numCols`
// points each with a single cufftPlanMany plan and prints the spectra.
int main() {
    const int batch = 3; // --- How many transforms to be performed
    const int numCols = 16; // --- Size of each transform
    const int SIGNAL_SIZE = batch * numCols; // --- Overall size for all the signals

    // --- Allocate host memory for all the signals
    cufftComplex* h_signal=(cufftComplex*)malloc(sizeof(cufftComplex) * SIGNAL_SIZE);
    if (h_signal == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }

    // --- Initialize host memory for all the signals
    for (int i = 0; i < SIGNAL_SIZE; ++i) {
        h_signal[i].x = 1.f;
        h_signal[i].y = 0.f;
    }

    // --- Allocate device memory for all the signals
    cufftComplex* d_signal; gpuErrchk(cudaMalloc((void**)&d_signal, sizeof(cufftComplex) * SIGNAL_SIZE));

    // --- Host to Device memcopy
    gpuErrchk(cudaMemcpy(d_signal, h_signal, sizeof(cufftComplex) * SIGNAL_SIZE, cudaMemcpyHostToDevice));

    int rank = 1; // --- 1d plan
    int n[] = {numCols};
    int istride = 1;
    int ostride = 1;
    int idist = numCols;
    int odist = numCols;

    // --- CUFFT plan. The batch count must match the number of transforms the
    //     buffers actually hold; a larger count makes cuFFT read and write
    //     past the end of d_signal.
    cufftHandle plan;
    cufftSafeCall(cufftPlanMany(&plan, rank, n, NULL, istride, idist, NULL, ostride, odist, CUFFT_C2C, batch));

    // --- Signals transformations
    cufftSafeCall(cufftExecC2C(plan, (cufftComplex*)d_signal, (cufftComplex*)d_signal, CUFFT_FORWARD));

    // --- Device to Host memcopy
    gpuErrchk(cudaMemcpy(h_signal, d_signal, sizeof(cufftComplex) * SIGNAL_SIZE, cudaMemcpyDeviceToHost));

    for (int i = 0; i < SIGNAL_SIZE; ++i) printf("Real part = %f; Imaginar part = %f\n", h_signal[i].x, h_signal[i].y);

    // --- Destroy CUFFT context
    cufftSafeCall(cufftDestroy(plan));

    // --- Memory cleanup
    free(h_signal);
    gpuErrchk(cudaFree(d_signal));
    cudaDeviceReset();
    return 0;
}
I want to perform 441 2D, 32-by-32 FFTs using the batched method provided by the cuFFT library. The parameters of the transform are the following:
// Fragment from the question: forward (D2Z) and inverse (Z2D) plans for a
// batch of 441 transforms of size 32 x 32.
int n[2] = {32,32};
int inembed[] = {32,32};
int onembed[] = {32,32/2+1};
cufftPlanMany(&plan,2,n,inembed,1,32*32,onembed,1,32*(32/2+1),CUFFT_D2Z,441);
// NOTE(review): the inverse plan's input is the complex data described by
// onembed, yet its input distance is 32*32 while the forward plan wrote
// batches 32*(32/2+1) elements apart -- presumably this should match; confirm.
cufftPlanMany(&inverse_plan,2,n,onembed,1,32*32,inembed,1,32*32,CUFFT_Z2D,441);
After I did the forward and inverse FFTs using the above plans, I could not get the original data back.
Can anyone advise me how to set the parameters correctly for cufftPlanMany? Many thanks in advance.
By the way, is cufftPlanMany the best approach for my situation?
Here is a full example on how using cufftPlanMany to perform batched direct and inverse transformations in CUDA. The example refers to float to cufftComplex transformations and back. The final result of the direct+inverse transformation is correct but for a multiplicative constant equal to the overall number of matrix elements nRows*nCols.
#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <assert.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
// Wraps a CUDA runtime call and reports a failure together with the file and
// line of the call site. The do { } while (0) form makes the macro expand to a
// single statement, so it stays correct inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic to stderr when `code` is not cudaSuccess.
//   code  - status returned by a CUDA runtime call
//   file  - source file to report (const: callers pass the __FILE__ literal)
//   line  - source line to report
//   abort - when true (the default) the function waits for a key press and
//           then exits with `code` as the process status
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
        return;
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) { getchar(); exit(code); }
}
/*********************/
/* CUFFT ERROR CHECK */
/*********************/
// Translates a cuFFT result code into the name of its enumerator.
// Returns "<unknown>" for values not listed below.
static const char *_cudaGetErrorEnum(cufftResult error)
{
    switch (error)
    {
        case CUFFT_SUCCESS:
            return "CUFFT_SUCCESS";
        case CUFFT_INVALID_PLAN:
            return "CUFFT_INVALID_PLAN";
        case CUFFT_ALLOC_FAILED:
            return "CUFFT_ALLOC_FAILED";
        case CUFFT_INVALID_TYPE:
            return "CUFFT_INVALID_TYPE";
        case CUFFT_INVALID_VALUE:
            return "CUFFT_INVALID_VALUE";
        case CUFFT_INTERNAL_ERROR:
            return "CUFFT_INTERNAL_ERROR";
        case CUFFT_EXEC_FAILED:
            return "CUFFT_EXEC_FAILED";
        case CUFFT_SETUP_FAILED:
            return "CUFFT_SETUP_FAILED";
        case CUFFT_INVALID_SIZE:
            return "CUFFT_INVALID_SIZE";
        case CUFFT_UNALIGNED_DATA:
            return "CUFFT_UNALIGNED_DATA";
        case CUFFT_INVALID_DEVICE:
            return "CUFFT_INVALID_DEVICE";
        case CUFFT_NO_WORKSPACE:
            return "CUFFT_NO_WORKSPACE";
        case CUFFT_NOT_IMPLEMENTED:
            return "CUFFT_NOT_IMPLEMENTED";
        case CUFFT_NOT_SUPPORTED:
            return "CUFFT_NOT_SUPPORTED";
        default:
            // Other result codes exist in some cuFFT versions; they fall
            // through to the generic name below.
            break;
    }
    return "<unknown>";
}
// Checks the result of a cuFFT call, reporting the file and line of the call site.
#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)

// Prints a diagnostic, resets the device and terminates the process when a
// cuFFT call did not return CUFFT_SUCCESS.
//   err  - result returned by the cuFFT call
//   file - source file to report
//   line - source line to report
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
    if (CUFFT_SUCCESS == err)
        return;

    // One conversion specifier per argument: file, line, numeric result, result name.
    fprintf(stderr, "CUFFT error in file '%s', line %d\nerror %d: %s\nterminating!\n",
            file, line, (int)err, _cudaGetErrorEnum(err));
    cudaDeviceReset();
    assert(0);
    // assert() expands to nothing under NDEBUG; make sure we still stop.
    exit(EXIT_FAILURE);
}
/********/
/* MAIN */
/********/
// Runs a batch of 2D real-to-complex FFTs followed by the matching
// complex-to-real inverse transforms and prints both results. cuFFT transforms
// are unnormalized, so the round trip returns the input scaled by nRows*nCols.
int main() {
    cufftHandle forward_plan, inverse_plan;

    const int batch = 3;
    const int rank = 2;
    const int nRows = 5;
    const int nCols = 5;
    const int nReal = nRows*nCols*batch;            // real samples over all batches
    const int nFreq = nRows*(nCols/2+1)*batch;      // complex samples over all batches

    int n[2] = {nRows, nCols};
    int idist = nRows*nCols;
    int odist = nRows*(nCols/2+1);
    int inembed[] = {nRows, nCols};
    int onembed[] = {nRows, nCols/2+1};
    int istride = 1;
    int ostride = 1;

    cufftSafeCall(cufftPlanMany(&forward_plan, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, batch));

    float *h_in = (float*)malloc(sizeof(float)*nReal);
    float2* h_freq = (float2*)malloc(sizeof(float2)*nFreq);
    if (h_in == NULL || h_freq == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }
    for(int i=0; i<nReal; i++) h_in[i] = 1.f;

    float* d_in; gpuErrchk(cudaMalloc(&d_in, sizeof(float)*nReal));
    float2* d_freq; gpuErrchk(cudaMalloc(&d_freq, sizeof(float2)*nFreq));

    gpuErrchk(cudaMemcpy(d_in,h_in,sizeof(float)*nReal,cudaMemcpyHostToDevice));
    cufftSafeCall(cufftExecR2C(forward_plan, d_in, d_freq));
    gpuErrchk(cudaMemcpy(h_freq,d_freq,sizeof(float2)*nFreq,cudaMemcpyDeviceToHost));
    for(int i=0; i<nFreq; i++) printf("Direct transform: %i %f %f\n",i,h_freq[i].x,h_freq[i].y);

    // The inverse plan swaps the roles of the two layouts: the complex layout
    // (onembed/ostride/odist) is now the input and the real layout the output.
    cufftSafeCall(cufftPlanMany(&inverse_plan, rank, n, onembed, ostride, odist, inembed, istride, idist, CUFFT_C2R, batch));
    cufftSafeCall(cufftExecC2R(inverse_plan, d_freq, d_in));
    gpuErrchk(cudaMemcpy(h_in,d_in,sizeof(float)*nReal,cudaMemcpyDeviceToHost));
    for(int i=0; i<nReal; i++) printf("Inverse transform: %i %f \n",i,h_in[i]);

    // --- Release plans, device memory and host memory
    cufftSafeCall(cufftDestroy(forward_plan));
    cufftSafeCall(cufftDestroy(inverse_plan));
    gpuErrchk(cudaFree(d_in));
    gpuErrchk(cudaFree(d_freq));
    free(h_in);
    free(h_freq);

    getchar();
    return 0;
}
I'm trying to bind a 2D array to a texture and to interpolate between the data. My problem is that when I bind my array to the texture, the values I access are total nonsense. Even when I try to access the first value (tex2D(tex,0.0f,0.0f)) it doesn't make sense. So I guess I'm binding it wrong or my memcopy is wrong. Any ideas where my mistake is?
Here is the Code
#include <stdio.h>
#include <iostream>
#include "cuda.h"
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "HelloWorld.h"
#include "linearInterpolation_kernel.cu"
#include "linearInterpolation_kernel2.cu"
#include "linearInterpolation_kernel3.cu"
using namespace std;
using std::cout;
const int blocksize = 16;
// Each thread adds the entry of b at its own threadIdx.x to the entry of a at
// the same position. Only threadIdx.x is used for indexing.
__global__
void hello(char *a, int *b)
{
    const unsigned int lane = threadIdx.x;
    a[lane] = a[lane] + b[lane];
}
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
// Checks the status of a CUDA runtime call, reporting the call site's file and line.
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)

// Prints the numeric code and message for any status other than cudaSuccess.
// The error is only reported: execution continues afterwards.
inline void __checkCudaErrors( cudaError err, const char *file, const int line )
{
    if (err == cudaSuccess)
        return;

    const char *message = cudaGetErrorString( err );
    printf("%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, message );
}
// This will output the proper error string when calling cudaGetLastError
// Reports the most recent CUDA runtime error, tagged with `msg` and the call site.
#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)

// Fetches the last error via cudaGetLastError() and, if there is one, prints
// it together with the caller-supplied message. The error is only reported:
// execution continues afterwards.
inline void __getLastCudaError( const char *errorMessage, const char *file, const int line )
{
    const cudaError_t lastError = cudaGetLastError();
    if (lastError == cudaSuccess)
        return;

    printf("%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", file, line, errorMessage, (int)lastError, cudaGetErrorString( lastError ) );
}
// Fills two host arrays with random values, asks ipLinearTexture to blend them
// and prints element 3 next to the same blend computed on the host.
int main()
{
    const int N = 200;
    float *A = (float *) malloc(N*sizeof(float));
    float *B = (float *) malloc(N*sizeof(float));
    float *result = (float *) malloc(N*sizeof(float));
    if (A == NULL || B == NULL || result == NULL) {
        printf("Host allocation failed\n");
        free(A);
        free(B);
        free(result);
        return 1;
    }

    float angle = 0.5f;
    for(int i = 0; i < N; i++){
        A[i] = (float)rand();
        B[i] = (float)rand();
    }
    cout << A[3] << endl;
    cout << B[3] << endl;

    ipLinearTexture(A,B,result,angle,N);

    // Host-side reference for element 3
    float result2;
    result2 = (angle)*A[3] + (1-angle)*B[3];
    printf(" A %f B %f Result %f\n", A[3], B[3], result[3]);
    cout << result2 << endl;

    free(A);
    free(B);
    free(result);
    // 0 is the conventional "success" exit status
    return 0;
}
// Packs A and B into a host table, uploads it to a CUDA array, binds that array
// to the texture reference `tex` (declared outside this function), launches
// transformKernel3 and copies N floats of output back into `result`.
//   A, B   - host input arrays, read at indices 0 .. N-1
//   result - host output array, written at indices 0 .. N-1
//   angle  - forwarded to the kernel as its last argument
//   N      - number of elements in A, B and result
void ipLinearTexture(float *A, float* B, float* result, float angle, int N)
{
// NOTE(review): cuTime is never assigned before it is read near the end of
// this function.
float cuTime;
// NOTE(review): N2 is not used anywhere below.
const int N2 = N;
float *dev_result;
// AB is a table of N row pointers, each row allocated separately below; the
// float values are therefore not contiguous in memory.
float **AB;
AB = (float **) malloc( N * sizeof(float *));
if(AB)
{
for(int i = 0; i < N; i++)
{
// NOTE(review): each row holds 2 floats but is sized with sizeof(float *).
AB[i] = (float *) calloc( 2 , sizeof(float *));
}
}
// NOTE(review): this loop dereferences AB even when the malloc above failed.
for (int i = 0; i < N; i++)
{
AB[i][0] = A[i];
AB[i][1] = B[i];
}
cudaMalloc(&dev_result, N * sizeof(float));
unsigned int size = N * 2 * sizeof(float);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaArray* cu_array;
// CUDA array of width N and height 2.
checkCudaErrors(cudaMallocArray( &cu_array, &channelDesc,N,2 ));
// NOTE(review): the source pointer is AB itself, i.e. the table of row
// pointers, so `size` bytes of pointer values (not the floats stored in the
// rows) are copied -- this looks like the cause of the "nonsense" texture
// values; a single contiguous float buffer is presumably what was intended.
checkCudaErrors(cudaMemcpyToArray( cu_array, 0, 0, AB, size, cudaMemcpyHostToDevice));
tex.addressMode[0] = cudaAddressModeClamp;
tex.addressMode[1] = cudaAddressModeClamp;
tex.filterMode = cudaFilterModeLinear;
tex.normalized = false; // false: texture coordinates are NOT normalized
// NOTE(review): the array allocated above is two-dimensional; confirm that the
// declaration of `tex` has a matching dimensionality.
checkCudaErrors(cudaBindTextureToArray( tex, cu_array, channelDesc));
dim3 dimBlock(10, 1, 1);
dim3 dimGrid((int)ceil((double)N*2/dimBlock.x), 1, 1);
// NOTE(review): the kernel receives width N and height 2; if it writes one
// output per (width*height) element, dev_result (N floats) is too small --
// verify against the kernel. The launch itself is not checked for errors.
transformKernel3<<< dimGrid, dimBlock, 0 >>>( dev_result, N, 2, angle);
checkCudaErrors(cudaUnbindTexture(tex));
cudaMemcpy(result, dev_result, N * sizeof(float), cudaMemcpyKind::cudaMemcpyDeviceToHost);
// NOTE(review): overwrites the first result with the unassigned cuTime.
result[0] = (float)cuTime;
cout << "==================================================" << endl;
for (int i = 0 ; i < N ;i++)
{
cout << result[i] << endl;
}
cout << "==================================================" << endl;
// NOTE(review): AB and its rows are never freed.
cudaFree(dev_result);
cudaFreeArray(cu_array);
}
Here is the code inside the Kernel
#ifndef _SIMPLETEXTURE_KERNEL3_H_
#define _SIMPLETEXTURE_KERNEL3_H_
// declare texture reference for 2D float texture
texture<float, 1> tex;
////////////////////////////////////////////////////////////////////////////////
//! Transform an image using texture lookups
//! @param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
// Writes one texture sample per output element.
//   g_odata - output buffer in global memory; must hold width*height floats
//   width, height - their product bounds the number of elements written
//   theta   - not used by this kernel
// Each thread derives a flat index from its block and thread x-coordinates, so
// a 1D launch covering at least width*height threads is expected.
__global__ void
transformKernel3( float* g_odata, int width, int height, float theta)
{
    unsigned int id = blockIdx.x*blockDim.x + threadIdx.x;
    if (id < width*height)
    {
        // The earlier spelling `xid` names no declared variable; the thread's
        // own flat index is the only candidate in scope.
        // NOTE(review): confirm that id * 2 + 0.5f is the intended coordinate.
        g_odata[id] = tex1D(tex, id * 2 + 0.5f);
    }
}
#endif // #ifndef _SIMPLETEXTURE_KERNEL_H_
Like the concept in OpenGL, you could think a 2D texture is a rectangle field. The center point of each small rectangle is your array data. So, tex2D(tex, 0.5f/width, 0.5f/height) will be exactly your first value of array data. (width & height is the width and height of 2D array data)
I want to be able to sort an array out using function pointers in polymorphism. Not to mention, am only doing this to see how things work and so forth.
Here's a simple generic sorting interface, an insertion sort implemented through that interface, and some test code that demonstrates its use:
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
// Describes a sortable collection to a sorting routine without exposing how
// the elements are stored: the routine only ever refers to elements by index
// and manipulates them through the two callbacks.
struct sort_interface {
// number of elements in the collection; valid indices are 0 .. nmemb-1
size_t nmemb;
// opaque user pointer, passed unchanged as 'arg' to compare() and swap()
void *arg;
// compares the elements at indices 'i' and 'j'; a result <= 0 means they are
// already in order, a result > 0 means they must be exchanged
int (*compare)(void *arg, size_t i, size_t j);
// exchanges the elements at indices 'i' and 'j'
void (*swap)(void *arg, size_t i, size_t j);
};
// Insertion sort driven entirely by the callbacks in 'iface'.
// Elements 0 .. sorted-1 are kept in order; each new element is moved towards
// the front, one swap at a time, until compare() reports that it is in order
// with its predecessor.
static void insertion_sort (struct sort_interface iface)
{
    for (size_t sorted = 0; sorted < iface.nmemb; sorted++) {
        for (size_t pos = sorted;
             pos > 0 && iface.compare(iface.arg, pos - 1, pos) > 0;
             pos--) {
            iface.swap(iface.arg, pos - 1, pos);
        }
    }
}
// compare() callback for an int array passed through 'arg'.
// Returns -1 if element i is smaller than element j, 1 if it is larger and
// 0 if both are equal.
static int func_comparator (void *arg, size_t i, size_t j)
{
    const int *values = arg;
    const int lhs = values[i];
    const int rhs = values[j];

    // Each comparison yields 0 or 1, so the difference is -1, 0 or 1.
    return (lhs > rhs) - (lhs < rhs);
}
// swap() callback for an int array passed through 'arg': exchanges the
// elements at indices i and j.
static void func_swap (void *arg, size_t i, size_t j)
{
    int *base = arg;
    int *first = base + i;
    int *second = base + j;

    const int held = *first;
    *first = *second;
    *second = held;
}
int main (int argc, char *argv[])
{
int arr[] = {7, 6, 8, 2, 9, 1, 156, 1, 62, 1671, 15};
size_t count = sizeof(arr) / sizeof(arr[0]);
struct sort_interface iface;
iface.nmemb = count;
iface.arg = arr;
iface.compare = func_comparator;
iface.swap = func_swap;
insertion_sort(iface);
for (size_t i = 0; i < count; i++) {
printf("%d ", arr[i]);
}
printf("\n");
return 0;
}
You might also want to take a look at the qsort() function of the C standard library, which also uses a function pointer comparator, but is somewhat limited compared to the above. In particular, it assumes you're sorting a contiguous array, and if you have pointers to elements or their members, those will be broken (but the above interface allows you to fix pointers in swap()).
Here's an example for how to use the qsort() interface, and also an insertion sort implementation that uses the same interface as qsort():
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
// Insertion sort with the same interface as qsort().
//   base   - first element of a contiguous array
//   nmemb  - number of elements
//   size   - size of one element in bytes
//   compar - returns <0, 0 or >0 when its first argument sorts before, equal
//            to or after its second
// Elements are exchanged byte by byte, so no temporary buffer is needed: this
// avoids a variable-length array, which is undefined for size == 0 and can
// exhaust the stack for very large elements.
static void insertion_sort (void *base, size_t nmemb, size_t size, int(*compar)(const void *, const void *))
{
    char *bytes = (char *)base;

    // Element 0 on its own is already sorted, so start with element 1.
    for (size_t i = 1; i < nmemb; i++) {
        for (size_t j = i; j > 0; j--) {
            char *x = bytes + (j - 1) * size;
            char *y = bytes + j * size;

            if (compar(x, y) <= 0) {
                break;
            }

            // x and y are adjacent elements and never overlap.
            for (size_t k = 0; k < size; k++) {
                const char held = x[k];
                x[k] = y[k];
                y[k] = held;
            }
        }
    }
}
// qsort()-style comparator for int elements.
// Returns -1 if *ve1 is smaller than *ve2, 1 if it is larger and 0 if both
// are equal.
static int int_comparator (const void *ve1, const void *ve2)
{
    const int lhs = *(const int *)ve1;
    const int rhs = *(const int *)ve2;

    // Each comparison yields 0 or 1, so the difference is -1, 0 or 1.
    return (lhs > rhs) - (lhs < rhs);
}
// Sorts a small int array with the standard library's qsort() and prints the
// result on one line.
int main (int argc, char *argv[])
{
    int arr[] = {7, 6, 8, 2, 9, 1, 156, 1, 62, 1671, 15};
    const size_t count = sizeof(arr) / sizeof(arr[0]);

    qsort(arr, count, sizeof(arr[0]), int_comparator); // or insertion_sort()

    for (const int *p = arr; p != arr + count; p++) {
        printf("%d ", *p);
    }
    printf("\n");

    return 0;
}