Mathematical function in device to improve results (CUDA)

Is it worthwhile to execute the mathematical function pow() on the device (GPU) in order to improve the execution time of the code?
I found the function __powf() in the CUDA Toolkit documentation:
http://docs.nvidia.com/cuda/cuda-c-programming-guide/#intrinsic-functions
So I replaced the pow() function calls with __powf() and used the -use_fast_math compiler option, but I got "nan" results instead of double-precision numbers. What should I change in my code to achieve this?
The libraries included by my code.cu:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/time.h> // for gettimeofday()
#include <time.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cublas.h>
#include "magma.h"
Part of my code.cu:
void function(double *cx, double *cy, double *R, int var, double pts[][2], int e) {
    magma_trans_t my_trans = MagmaNoTrans;
    magma_int_t info;
    magma_int_t M, C;
    magma_int_t ldda, lddb;
    C = 3;
    M = var;
    int i;
    double Q[M];
    double a[3];
    int ret;
    double A[3][M];
    double pts_x[M], pts_y[M];
    double *dev_pts_x, *dev_pts_y, *devA, *devB, *pWork, lWorkQuery[1];
    /* Allocate device memory for the matrix (column-major) */
    ldda = ((M + 31) / 32) * 32;
    lddb = ldda;
    cudaMalloc((void **)&devA, (ldda * C) * sizeof(double));
    cudaMalloc((void **)&devB, (M) * sizeof(double));
    for (i = 0; i < M; i++) {
        pts_x[i] = pts[i][0];
        pts_y[i] = pts[i][1];
        A[0][i] = pts[i][0];
        A[1][i] = pts[i][1];
        A[2][i] = 1.0;
    }
    cudaMalloc((void **)&dev_pts_x, (M) * sizeof(double));
    cudaMemcpy(dev_pts_x, pts_x, M * sizeof(double), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&dev_pts_y, (M) * sizeof(double));
    cudaMemcpy(dev_pts_y, pts_y, M * sizeof(double), cudaMemcpyHostToDevice);
    // Kernel invocation
    dim3 threadsPerBlock(1, 1);
    dim3 numBlocks(M / threadsPerBlock.x, M / threadsPerBlock.y);
    call<<<numBlocks, threadsPerBlock>>>(var, dev_pts_x, dev_pts_y, devB);
    cublasSetMatrix(M, C, sizeof(double), A, M, devA, ldda);
    // cublasSetMatrix(M, 1, sizeof(double), B, M, devB, M);
    /* Resolve the LLSP using MAGMA */
    ret = magma_dgels_gpu(my_trans, M, C, 1, devA, ldda, devB, M, lWorkQuery, -1, &info);
    int lwork = (int)lWorkQuery[0];
    //printf("Optimal work space %d\n", lwork);
    pWork = (double *)malloc(lwork * sizeof(double));
    ret = magma_dgels_gpu(my_trans, M, C, 1, devA, ldda, devB, M, pWork, lwork, &info);
    magma_dgetmatrix(M, 1, devB, lddb, Q, M);
    a[2] = Q[2];
    *cx = Q[0];
    *cy = Q[1];
    *R = sqrt((pow(*cx, 2) + pow(*cy, 2)) - a[2]);
}

__global__ void call(int v, double *pts_x, double *pts_y, double *B) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < v) {
        B[i] = -(pow(pts_x[i], 2.0) + pow(pts_y[i], 2.0));
    }
}

You use pow() to square numbers, which is very inefficient. Use multiplication with an inline function instead:
static inline double square(double x) { return x * x; }
You might be getting NaN values because the number passed to the function is negative. For the standard double-precision pow() that should not be a problem (a negative base with an integer exponent is well defined), but __powf() is a single-precision fast-math intrinsic computed via a logarithm, so a negative base yields NaN, and it cannot produce double-precision results in any case.
Also note that computing the Euclidean distance between two points can be done more directly with the hypot() function:
double hypot(double x, double y);
Finally, as Weather Vane pointed out, you might not need to take the square root at all if all you are interested in is comparing against another distance computed the same way.
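Putting the suggestions together, here is a minimal sketch of what the kernel could look like (my adaptation of the code above, not a tested drop-in). The square() helper is marked __host__ __device__ so the same function can be used in the kernel and in the host-side radius computation:
__host__ __device__ static inline double square(double x) { return x * x; }

__global__ void call(int v, double *pts_x, double *pts_y, double *B)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < v) {
        // Plain multiplication stays in double precision, so no single-precision
        // fast-math intrinsic is involved and no NaN is produced for negative inputs.
        B[i] = -(square(pts_x[i]) + square(pts_y[i]));
    }
}
The host side can be rewritten the same way, *R = sqrt(square(*cx) + square(*cy) - a[2]);, and the -use_fast_math option is then no longer needed for this.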


CUDA: Pass device function as an argument to global function

How to make something like this work?
#define Eval(x, y, func) {y = func(x);}

__global__ void Evaluate(double *xs, double *ys, int N, double f(double))
{
    int tid = threadIdx.x;
    if (tid < N)
        Eval(xs[tid], ys[tid], f);
}
And then inside the main function I have
double *xs_d, *ys_d;
double *xs_h, *ys_h;
xs_h = (double *) malloc(sizeof(double) * 256);
ys_h = (double *) malloc(sizeof(double) * 256);
cudaMalloc((void **)&xs_d, sizeof(double) * 256);
cudaMalloc((void **)&ys_d, sizeof(double) * 256);
for (int i = 0; i < 256; i++)
{
    xs_h[i] = (double)i;
}
HANDLE_ERROR(cudaMemcpy(xs_d, xs_h, 256*sizeof(double), cudaMemcpyHostToDevice));
Evaluate<<<1,256>>>(xs_d, ys_d, 256, Sin);
cudaDeviceSynchronize();
HANDLE_ERROR(cudaMemcpy(ys_h, ys_d, 256*sizeof(double), cudaMemcpyDeviceToHost));
It fails at the last line. So far I have seen solutions like How to pass device function as an input argument to host-side function?, but there they use __device__ function pointers, which cannot be set or accessed by host code (main, for example). For example, I cannot put __device__ int *fptrf1 = (int *)f1; (taken from that example) inside main. Is it possible to somehow have this flexibility?
One possible approach is to use a lambda:
$ cat t151.cu
#define Eval(x, y, func) {y = func(x);}

template <typename F>
__global__ void Evaluate(double *xs, double *ys, int N, F f)
{
    int tid = threadIdx.x;
    if (tid < N)
        Eval(xs[tid], ys[tid], f);
}

int main(){
    double *xs_d, *ys_d;
    double *xs_h, *ys_h;
    xs_h = (double *) malloc(sizeof(double) * 256);
    ys_h = (double *) malloc(sizeof(double) * 256);
    cudaMalloc((void **)&xs_d, sizeof(double) * 256);
    cudaMalloc((void **)&ys_d, sizeof(double) * 256);
    for (int i = 0; i < 256; i++)
    {
        xs_h[i] = (double)i;
    }
    cudaMemcpy(xs_d, xs_h, 256*sizeof(double), cudaMemcpyHostToDevice);
    auto Sinlambda = [] __host__ __device__ (double v) {return sin(v);};
    Evaluate<<<1,256>>>(xs_d, ys_d, 256, Sinlambda);
    cudaDeviceSynchronize();
    cudaMemcpy(ys_h, ys_d, 256*sizeof(double), cudaMemcpyDeviceToHost);
}
$ nvcc -o t151 t151.cu -std=c++11 --extended-lambda
$ cuda-memcheck ./t151
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
(CUDA 11.3)
For a variety of device function pointer uses, this answer links to a number of examples.
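For context, here is a hedged, self-contained sketch of the __device__ function pointer pattern those examples describe (the names Sin_dev, d_sin_ptr, and h_sin_ptr are mine, introduced for illustration). The function's address is stored in a __device__ symbol, copied to the host with cudaMemcpyFromSymbol, and then passed back in as an ordinary kernel argument:
#include <math.h>

__device__ double Sin_dev(double v) { return sin(v); }
// Device-side symbol initialized with the device function's address
__device__ double (*d_sin_ptr)(double) = Sin_dev;

__global__ void Evaluate(double *xs, double *ys, int N, double (*f)(double))
{
    int tid = threadIdx.x;
    if (tid < N) ys[tid] = f(xs[tid]);
}

int main()
{
    double xs_h[256], *xs_d, *ys_d;
    for (int i = 0; i < 256; i++) xs_h[i] = (double)i;
    cudaMalloc(&xs_d, 256 * sizeof(double));
    cudaMalloc(&ys_d, 256 * sizeof(double));
    cudaMemcpy(xs_d, xs_h, 256 * sizeof(double), cudaMemcpyHostToDevice);
    // The pointer value is only callable in device code, but it can be fetched
    // on the host and handed to the kernel like any other argument.
    double (*h_sin_ptr)(double);
    cudaMemcpyFromSymbol(&h_sin_ptr, d_sin_ptr, sizeof(h_sin_ptr));
    Evaluate<<<1, 256>>>(xs_d, ys_d, 256, h_sin_ptr);
    cudaDeviceSynchronize();
    return 0;
}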

Random permutation on GPU using Thrust

I'm trying to write code that will permute a vector on a GPU, but I'm having great difficulty getting Thrust to cooperate. At the moment the code below compiles fine, but does nothing to the order of the vector r. Please help. Thanks!
void rng_permutation<float>(const int n, float* r){
    float* order;
    cudaMalloc((void**)&order, n * sizeof(float));
    /*
    some lines of code that generate uniform random floats between 0 and 1 that I know work
    */
    thrust::device_ptr<float> order_(order);
    thrust::device_vector<float> order__(order_, order_ + n);
    thrust::device_ptr<float> r_(r);
    thrust::device_vector<float> r__(r_, r_ + n);
    thrust::sort_by_key(order__.begin(), order__.end(), r__.begin());
    thrust::copy(order_, order_ + n, order__.begin());
    thrust::copy(r_, r_ + n, r__.begin());
    cudaFree(order);
    order__.clear();
    r__.clear();
    thrust::device_vector<float>().swap(order__);
    thrust::device_vector<float>().swap(r__);
}
You've got your sources and destinations backwards here:
thrust::copy(order_, order_ + n, order__.begin());
thrust::copy(r_, r_ + n, r__.begin());
The previous line of code just sorted what is in order__. You are then copying the contents of order_ over the top of that (the first parameters to thrust::copy are the source, the last is the destination), which makes no sense. Instead, reverse it:
thrust::copy(order__.begin(), order__.end(), order_);
thrust::copy(r__.begin(), r__.end(), r_);
And you will get sensible results:
$ cat t312.cu
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <iostream>
#include <thrust/sequence.h>

template <typename T>
void caffe_gpu_rng_uniform(int n, T lo, T hi, T *o)
{
    T *d = (T *)malloc(n * sizeof(T));
    for (int i = 0; i < n; i++) d[i] = (rand()/(float)RAND_MAX)*(hi-lo) + lo;
    cudaMemcpy(o, d, n * sizeof(T), cudaMemcpyHostToDevice);
    free(d);
};

template <typename T>
void print_gpu_array_entries(T *o, int x, int y, int n){
    thrust::copy_n(thrust::device_pointer_cast<T>(o), x, std::ostream_iterator<T>(std::cout, ","));
    std::cout << std::endl;
}

void rng_permutation(const int n, float* r){
    float* order;
    cudaMalloc((void**)&order, n * sizeof(float));
    caffe_gpu_rng_uniform<float>(n, (float)0.0, (float)1.0, order);
    print_gpu_array_entries<float>(order, 10, 1, n);
    print_gpu_array_entries<float>(r, 10, 1, n);
    thrust::device_ptr<float> order_(order);
    thrust::device_vector<float> order__(order_, order_ + n);
    thrust::device_ptr<float> r_(r);
    thrust::device_vector<float> r__(r_, r_ + n);
    thrust::sort_by_key(order__.begin(), order__.end(), r__.begin());
    thrust::copy(order__.begin(), order__.end(), order_);
    thrust::copy(r__.begin(), r__.end(), r_);
    print_gpu_array_entries<float>(order, 10, 1, n);
    print_gpu_array_entries<float>(r, 10, 1, n);
    cudaFree(order);
    order__.clear();
    r__.clear();
    thrust::device_vector<float>().swap(order__);
    thrust::device_vector<float>().swap(r__);
}

int main(){
    thrust::device_vector<float> data(10);
    thrust::sequence(data.begin(), data.end());
    rng_permutation(10, thrust::raw_pointer_cast(data.data()));
}
$ nvcc -o t312 t312.cu
$ ./t312
0.840188,0.394383,0.783099,0.79844,0.911647,0.197551,0.335223,0.76823,0.277775,0.55397,
0,1,2,3,4,5,6,7,8,9,
0.197551,0.277775,0.335223,0.394383,0.55397,0.76823,0.783099,0.79844,0.840188,0.911647,
5,8,6,1,9,7,2,3,0,4,
$
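As an aside (this is an assumption about your toolkit, not part of the answer above): if your Thrust version is recent enough to provide thrust::shuffle (added around Thrust 1.9.6), the random permutation can be produced directly, without generating keys and sorting. A minimal sketch:
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/shuffle.h>
#include <thrust/random.h>

int main()
{
    thrust::device_vector<float> data(10);
    thrust::sequence(data.begin(), data.end());    // 0,1,...,9
    thrust::default_random_engine g(1234);         // fixed seed for reproducibility
    thrust::shuffle(data.begin(), data.end(), g);  // permute in place on the device
    return 0;
}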

Dynamic 2D array using double pointer in CUDA [duplicate]

I'm new to CUDA. I want to add two 2D arrays into a third array.
I use following code:
cudaMallocPitch((void**)&device_a, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_b, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_c, &pitch, 2*sizeof(int),2);
Now my problem is that I don't want to use these arrays as flattened 2D arrays. All I want to do in my kernel code is use two for loops and put the result in the third array, like this:
__global__ void add(int *dev_a, int *dev_b, int *dev_c)
{
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            dev_c[i][j] = dev_a[i][j] + dev_b[i][j];
        }
    }
}
How can I do this in CUDA?
Please tell me how to use a 2D array in this way.
What should the kernel call be for using a 2D array?
If possible, please explain using code samples.
The short answer is, you can't. The cudaMallocPitch() function does exactly what its name implies: it allocates pitched linear memory, where the pitch is chosen to be optimal for the GPU memory controller and texture hardware.
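For reference, here is a minimal sketch (mine, not part of the original answer) of how such a pitched allocation is indexed inside a kernel; the pitch returned by cudaMallocPitch() is a row stride in bytes:
// Hypothetical kernel indexing the 2x2 pitched allocations from the question.
// Rows are located by offsetting the base pointer by i * pitch bytes.
__global__ void add_pitched(int *dev_a, int *dev_b, int *dev_c, size_t pitch)
{
    for (int i = 0; i < 2; i++) {
        int *row_a = (int *)((char *)dev_a + i * pitch);
        int *row_b = (int *)((char *)dev_b + i * pitch);
        int *row_c = (int *)((char *)dev_c + i * pitch);
        for (int j = 0; j < 2; j++) {
            row_c[j] = row_a[j] + row_b[j];
        }
    }
}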
If you wanted to use arrays of pointers in the kernel, the kernel code would have to look like this:
__global__ void add(int *dev_a[], int *dev_b[], int *dev_c[])
{
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            dev_c[i][j] = dev_a[i][j] + dev_b[i][j];
        }
    }
}
and then you would need nested cudaMalloc calls on the host side to construct the array of pointers and copy it to device memory. For your rather trivial 2x2 example, the code to allocate a single array would look like this:
int ** h_a = (int **)malloc(2 * sizeof(int *));
cudaMalloc((void**)&h_a[0], 2*sizeof(int));
cudaMalloc((void**)&h_a[1], 2*sizeof(int));
int **d_a;
cudaMalloc((void ***)&d_a, 2 * sizeof(int *));
cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice);
Which would leave the allocated device array of pointers in d_a, and you would pass that to your kernel.
For code complexity and performance reasons, you really don't want to do that, using arrays of pointers in CUDA code is both harder and slower than the alternative using linear memory.
To show what folly using arrays of pointers is in CUDA, here is a complete working example of your sample problem which combines the two ideas above:
#include <cstdio>

__global__ void add(int * dev_a[], int * dev_b[], int * dev_c[])
{
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            dev_c[i][j] = dev_a[i][j] + dev_b[i][j];
        }
    }
}

inline void GPUassert(cudaError_t code, char * file, int line, bool Abort=true)
{
    if (code != 0) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (Abort) exit(code);
    }
}

#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }

int main(void)
{
    const int aa[2][2] = {{1,2},{3,4}};
    const int bb[2][2] = {{5,6},{7,8}};
    int cc[2][2];

    int ** h_a = (int **)malloc(2 * sizeof(int *));
    for (int i = 0; i < 2; i++){
        GPUerrchk(cudaMalloc((void**)&h_a[i], 2*sizeof(int)));
        GPUerrchk(cudaMemcpy(h_a[i], &aa[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
    }
    int **d_a;
    GPUerrchk(cudaMalloc((void ***)&d_a, 2 * sizeof(int *)));
    GPUerrchk(cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice));

    int ** h_b = (int **)malloc(2 * sizeof(int *));
    for (int i = 0; i < 2; i++){
        GPUerrchk(cudaMalloc((void**)&h_b[i], 2*sizeof(int)));
        GPUerrchk(cudaMemcpy(h_b[i], &bb[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
    }
    int ** d_b;
    GPUerrchk(cudaMalloc((void ***)&d_b, 2 * sizeof(int *)));
    GPUerrchk(cudaMemcpy(d_b, h_b, 2*sizeof(int *), cudaMemcpyHostToDevice));

    int ** h_c = (int **)malloc(2 * sizeof(int *));
    for (int i = 0; i < 2; i++){
        GPUerrchk(cudaMalloc((void**)&h_c[i], 2*sizeof(int)));
    }
    int ** d_c;
    GPUerrchk(cudaMalloc((void ***)&d_c, 2 * sizeof(int *)));
    GPUerrchk(cudaMemcpy(d_c, h_c, 2*sizeof(int *), cudaMemcpyHostToDevice));

    add<<<1,1>>>(d_a, d_b, d_c);
    GPUerrchk(cudaPeekAtLastError());

    for (int i = 0; i < 2; i++){
        GPUerrchk(cudaMemcpy(&cc[i][0], h_c[i], 2*sizeof(int), cudaMemcpyDeviceToHost));
    }
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            printf("(%d,%d):%d\n", i, j, cc[i][j]);
        }
    }
    return cudaThreadExit();
}
I recommend you study it until you understand what it does, and why it is such a poor idea compared to using linear memory.
You don't need to use for loops inside the device code; each thread can handle one element. Try this code.
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#include <time.h>

#define N 800

__global__ void matrixAdd(float* A, float* B, float* C){
    int i = threadIdx.x;
    int j = blockIdx.x;
    C[N*j+i] = A[N*j+i] + B[N*j+i];
}

int main (void) {
    clock_t start = clock();
    float a[N][N], b[N][N], c[N][N];
    float *dev_a, *dev_b, *dev_c;
    cudaMalloc((void **)&dev_a, N * N * sizeof(float));
    cudaMalloc((void **)&dev_b, N * N * sizeof(float));
    cudaMalloc((void **)&dev_c, N * N * sizeof(float));
    for (int i = 0; i < N; i++){
        for (int j = 0; j < N; j++){
            a[i][j] = rand() % 10;
            b[i][j] = rand() % 10;
        }
    }
    cudaMemcpy(dev_a, a, N * N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, N * N * sizeof(float), cudaMemcpyHostToDevice);
    matrixAdd<<<N,N>>>(dev_a, dev_b, dev_c);
    cudaMemcpy(c, dev_c, N * N * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; i++){
        for (int j = 0; j < N; j++){
            printf("[%d, %d ]= %f + %f = %f\n", i, j, a[i][j], b[i][j], c[i][j]);
        }
    }
    printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return 0;
}

Solving general sparse linear systems in CUDA

I am currently working in CUDA and trying to solve Ax = b using the cuBLAS and cuSPARSE libraries. I looked through the sample codes, including conjugateGradient and conjugateGradientPrecond provided by NVIDIA. However, the conjugate gradient method only works for positive definite matrices, and it is an iterative method. Now, I have some general sparse matrices and I think I should take advantage of the cuSPARSE library. Does anyone know how I can solve Ax = b using the cuSPARSE and cuBLAS libraries? I could not find useful APIs for this. Generally, the matrices are expected to be at least 1000x1000, and in some cases up to 100000x100000. Should I do this using a direct method?
One possibility to solve general sparse linear systems in CUDA is using cuSOLVER.
cuSOLVER has three useful routines:
cusolverSpDcsrlsvlu, which works for square linear systems (number of unknowns equal to the number of equations) and internally uses sparse LU factorization with partial pivoting;
cusolverSpDcsrlsvqr, which works for square linear systems (number of unknowns equal to the number of equations) and internally uses sparse QR factorization;
cusolverSpDcsrlsqvqr, which works for rectangular linear systems (number of unknowns different from the number of equations) and internally solves a least squares problem.
For ALL the above routines, the supported matrix type is CUSPARSE_MATRIX_TYPE_GENERAL. If A is symmetric/Hermitian and only lower/upper part is used or meaningful, then its missing upper/lower part must be extended.
NOTES ON cusolverSpDcsrlsvlu
Attention should be paid to two input parameters: tol and reorder. Concerning the former, if the system matrix A is singular, then some diagonal elements of the matrix U of the LU decomposition are zero. The algorithm decides for zero if |U(j,j)|<tol. Concerning the latter, cuSOLVER provides a reordering to reduce zero fill-in, which dramatically affects the performance of the LU factorization. reorder toggles between reordering (reorder=1) and not reordering (reorder=0).
Attention should be paid also to an output parameter: singularity. It is -1 if A is invertible, otherwise it provides the first index j such that U(j,j)=0.
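To illustrate (a sketch of mine; the variable names follow the complete example further below), the singularity output can be checked like this after the call:
int singularity;
cusolveSafeCall(cusolverSpDcsrlsvluHost(solver_handle, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, 0.000001, 0, h_x, &singularity));
if (singularity == -1)
    printf("A is invertible\n");
else
    printf("A is singular: first zero pivot at U(%d,%d)\n", singularity, singularity);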
NOTES ON cusolverSpDcsrlsvqr
Attention should be paid to the same input/output parameters as before. In particular, tol is used to decide on singularity, reorder has no effect, and singularity is -1 if A is invertible; otherwise it returns the first index j such that R(j,j)=0.
NOTES ON cusolverSpDcsrlsqvqr
Attention should be paid to the input parameter tol, which is used to decide the rank of A.
Attention should also be paid to the output parameters: rankA, which represents the numerical rank of A; p, a permutation vector of length equal to the number of columns of A (please see the documentation for further details); and min_norm, which is the norm of the residual ||Ax - b||.
Currently, as of CUDA 10.0, the above three functions are for the host channel only, which means that they do not yet run on the GPU. They must be called as:
cusolverSpDcsrlsvluHost;
cusolverSpDcsrlsvqrHost;
cusolverSpDcsrlsqvqrHost,
and the input arguments should all reside on the host.
Below, please find a fully worked example using all the above three possibilities:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cusparse.h>
#include <cusolverSp.h>

/*******************/
/* iDivUp FUNCTION */
/*******************/
//extern "C" int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
__host__ __device__ int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}

extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }

/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
static const char *_cusolverGetErrorEnum(cusolverStatus_t error)
{
    switch (error)
    {
    case CUSOLVER_STATUS_SUCCESS:
        return "CUSOLVER_SUCCESS";
    case CUSOLVER_STATUS_NOT_INITIALIZED:
        return "CUSOLVER_STATUS_NOT_INITIALIZED";
    case CUSOLVER_STATUS_ALLOC_FAILED:
        return "CUSOLVER_STATUS_ALLOC_FAILED";
    case CUSOLVER_STATUS_INVALID_VALUE:
        return "CUSOLVER_STATUS_INVALID_VALUE";
    case CUSOLVER_STATUS_ARCH_MISMATCH:
        return "CUSOLVER_STATUS_ARCH_MISMATCH";
    case CUSOLVER_STATUS_EXECUTION_FAILED:
        return "CUSOLVER_STATUS_EXECUTION_FAILED";
    case CUSOLVER_STATUS_INTERNAL_ERROR:
        return "CUSOLVER_STATUS_INTERNAL_ERROR";
    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
        return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
    }
    return "<unknown>";
}

inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
    if (CUSOLVER_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUSOLVE error in file '%s', line %d, error: %s\nterminating!\n", __FILE__, __LINE__, _cusolverGetErrorEnum(err));
        assert(0);
    }
}

extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
    switch (error)
    {
    case CUSPARSE_STATUS_SUCCESS:
        return "CUSPARSE_STATUS_SUCCESS";
    case CUSPARSE_STATUS_NOT_INITIALIZED:
        return "CUSPARSE_STATUS_NOT_INITIALIZED";
    case CUSPARSE_STATUS_ALLOC_FAILED:
        return "CUSPARSE_STATUS_ALLOC_FAILED";
    case CUSPARSE_STATUS_INVALID_VALUE:
        return "CUSPARSE_STATUS_INVALID_VALUE";
    case CUSPARSE_STATUS_ARCH_MISMATCH:
        return "CUSPARSE_STATUS_ARCH_MISMATCH";
    case CUSPARSE_STATUS_MAPPING_ERROR:
        return "CUSPARSE_STATUS_MAPPING_ERROR";
    case CUSPARSE_STATUS_EXECUTION_FAILED:
        return "CUSPARSE_STATUS_EXECUTION_FAILED";
    case CUSPARSE_STATUS_INTERNAL_ERROR:
        return "CUSPARSE_STATUS_INTERNAL_ERROR";
    case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
        return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
    case CUSPARSE_STATUS_ZERO_PIVOT:
        return "CUSPARSE_STATUS_ZERO_PIVOT";
    }
    return "<unknown>";
}

inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
    if (CUSPARSE_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUSPARSE error in file '%s', line %d, error %d: %s\nterminating!\n", __FILE__, __LINE__, err, _cusparseGetErrorEnum(err));
        cudaDeviceReset(); assert(0);
    }
}

extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main()
{
    // --- Initialize cuSPARSE
    cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));

    const int Nrows = 4;        // --- Number of rows
    const int Ncols = 4;        // --- Number of columns
    const int N = Nrows;

    // --- Host side dense matrix
    double *h_A_dense = (double*)malloc(Nrows * Ncols * sizeof(*h_A_dense));

    // --- Column-major ordering
    h_A_dense[0] = 1.0; h_A_dense[4] = 4.0; h_A_dense[8]  = 0.0; h_A_dense[12] = 0.0;
    h_A_dense[1] = 0.0; h_A_dense[5] = 2.0; h_A_dense[9]  = 3.0; h_A_dense[13] = 0.0;
    h_A_dense[2] = 5.0; h_A_dense[6] = 0.0; h_A_dense[10] = 0.0; h_A_dense[14] = 7.0;
    h_A_dense[3] = 0.0; h_A_dense[7] = 0.0; h_A_dense[11] = 9.0; h_A_dense[15] = 0.0;

    // --- Create device array and copy host array to it
    double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(*d_A_dense)));
    gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));

    // --- Descriptor for sparse matrix A
    cusparseMatDescr_t descrA; cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);

    int nnz = 0;                // --- Number of nonzero elements in dense matrix
    const int lda = Nrows;      // --- Leading dimension of dense matrix
    // --- Device side number of nonzero elements per row
    int *d_nnzPerVector; gpuErrchk(cudaMalloc(&d_nnzPerVector, Nrows * sizeof(*d_nnzPerVector)));
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, &nnz));
    // --- Host side number of nonzero elements per row
    int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(*h_nnzPerVector));
    gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(*h_nnzPerVector), cudaMemcpyDeviceToHost));

    printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
    for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
    printf("\n");

    // --- Device side sparse matrix (CSR format)
    double *d_A; gpuErrchk(cudaMalloc(&d_A, nnz * sizeof(*d_A)));
    int *d_A_RowIndices; gpuErrchk(cudaMalloc(&d_A_RowIndices, (Nrows + 1) * sizeof(*d_A_RowIndices)));
    int *d_A_ColIndices; gpuErrchk(cudaMalloc(&d_A_ColIndices, nnz * sizeof(*d_A_ColIndices)));
    cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, d_A, d_A_RowIndices, d_A_ColIndices));

    // --- Host side sparse matrix (CSR format)
    double *h_A = (double *)malloc(nnz * sizeof(*h_A));
    int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(*h_A_RowIndices));
    int *h_A_ColIndices = (int *)malloc(nnz * sizeof(*h_A_ColIndices));
    gpuErrchk(cudaMemcpy(h_A, d_A, nnz * sizeof(*h_A), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));

    for (int i = 0; i < nnz; ++i) printf("A[%i] = %.0f ", i, h_A[i]); printf("\n");
    for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");
    for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);

    // --- Allocating and defining dense host and device data vectors
    double *h_y = (double *)malloc(Nrows * sizeof(double));
    h_y[0] = 100.0; h_y[1] = 200.0; h_y[2] = 400.0; h_y[3] = 500.0;
    double *d_y; gpuErrchk(cudaMalloc(&d_y, Nrows * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_y, h_y, Nrows * sizeof(double), cudaMemcpyHostToDevice));

    // --- Allocating the host and device side result vector
    double *h_x = (double *)malloc(Ncols * sizeof(double));
    double *d_x; gpuErrchk(cudaMalloc(&d_x, Ncols * sizeof(double)));

    // --- CUDA solver initialization
    cusolverSpHandle_t solver_handle;
    cusolverSpCreate(&solver_handle);

    // --- Using LU factorization
    int singularity;
    cusolveSafeCall(cusolverSpDcsrlsvluHost(solver_handle, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, 0.000001, 0, h_x, &singularity));
    // --- Using QR factorization
    //cusolveSafeCall(cusolverSpDcsrlsvqrHost(solver_handle, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, 0.000001, 0, h_x, &singularity));
    // --- Using least squares QR
    //int rankA;
    //int *p = (int *)malloc(N * sizeof(int));
    //double min_norm;
    //cusolveSafeCall(cusolverSpDcsrlsqvqrHost(solver_handle, N, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, 0.000001, &rankA, h_x, p, &min_norm));

    printf("Showing the results...\n");
    for (int i = 0; i < N; i++) printf("%f\n", h_x[i]);
}

Replicate a vector multiple times using CUDA Thrust

I am trying to solve a problem using CUDA Thrust.
I have a host array with 3 elements. Is it possible, using Thrust, to create a device array of 384 elements in which the 3 elements of my host array are repeated 128 times (128 x 3 = 384)?
Generally speaking, starting from an array of 3 elements, how can I use Thrust to generate a device array of size X, where X = Y x 3 and Y is the number of repetitions?
One possible approach:
create a device vector of appropriate size
create 3 strided ranges, one for each of the element positions {1, 2, 3} in the final output (device) vector
use thrust::fill to fill each of the 3 strided ranges with the appropriate (host vector) element {1, 2, 3}
This code is a trivial modification of the strided range example to demonstrate this. You can change the REPS define to 128 to see the full expansion to 384 output elements:
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/functional.h>
#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
// for printing
#include <thrust/copy.h>
#include <iostream>
#include <iterator> // for std::ostream_iterator
#define STRIDE 3
#define REPS 15 // change to 128 if you like
#define DSIZE (STRIDE*REPS)
// this example illustrates how to make strided access to a range of values
// examples:
// strided_range([0, 1, 2, 3, 4, 5, 6], 1) -> [0, 1, 2, 3, 4, 5, 6]
// strided_range([0, 1, 2, 3, 4, 5, 6], 2) -> [0, 2, 4, 6]
// strided_range([0, 1, 2, 3, 4, 5, 6], 3) -> [0, 3, 6]
// ...
template <typename Iterator>
class strided_range
{
    public:
    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct stride_functor : public thrust::unary_function<difference_type,difference_type>
    {
        difference_type stride;

        stride_functor(difference_type stride)
            : stride(stride) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        {
            return stride * i;
        }
    };

    typedef typename thrust::counting_iterator<difference_type> CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_range(Iterator first, Iterator last, difference_type stride)
        : first(first), last(last), stride(stride) {}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
    }

    iterator end(void) const
    {
        return begin() + ((last - first) + (stride - 1)) / stride;
    }

    protected:
    Iterator first;
    Iterator last;
    difference_type stride;
};

int main(void)
{
    thrust::host_vector<int> h_data(STRIDE);
    h_data[0] = 1;
    h_data[1] = 2;
    h_data[2] = 3;
    thrust::device_vector<int> data(DSIZE);
    typedef thrust::device_vector<int>::iterator Iterator;
    strided_range<Iterator> pos1(data.begin(),   data.end(), STRIDE);
    strided_range<Iterator> pos2(data.begin()+1, data.end(), STRIDE);
    strided_range<Iterator> pos3(data.begin()+2, data.end(), STRIDE);
    thrust::fill(pos1.begin(), pos1.end(), h_data[0]);
    thrust::fill(pos2.begin(), pos2.end(), h_data[1]);
    thrust::fill(pos3.begin(), pos3.end(), h_data[2]);
    // print the generated data
    std::cout << "data: ";
    thrust::copy(data.begin(), data.end(), std::ostream_iterator<int>(std::cout, " ")); std::cout << std::endl;
    return 0;
}
Robert Crovella has already answered this question using strided ranges. He has also pointed out the possibility of using the expand operator.
Below, I'm providing a worked example using the expand operator. In contrast to the strided range approach, it avoids the need for for loops.
#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <thrust/scatter.h>
#include <thrust/scan.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>
#include <stdio.h>
using namespace thrust::placeholders;
/*************************************/
/* CONVERT LINEAR INDEX TO ROW INDEX */
/*************************************/
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T> {

    T Ncols; // --- Number of columns

    __host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}

    __host__ __device__ T operator()(T i) { return i / Ncols; }
};

/*******************/
/* EXPAND OPERATOR */
/*******************/
template <typename InputIterator1, typename InputIterator2, typename OutputIterator>
OutputIterator expand(InputIterator1 first1,
                      InputIterator1 last1,
                      InputIterator2 first2,
                      OutputIterator output)
{
    typedef typename thrust::iterator_difference<InputIterator1>::type difference_type;

    difference_type input_size  = thrust::distance(first1, last1);
    difference_type output_size = thrust::reduce(first1, last1);

    // scan the counts to obtain output offsets for each input element
    thrust::device_vector<difference_type> output_offsets(input_size, 0);
    thrust::exclusive_scan(first1, last1, output_offsets.begin());

    // scatter the nonzero counts into their corresponding output positions
    thrust::device_vector<difference_type> output_indices(output_size, 0);
    thrust::scatter_if(thrust::counting_iterator<difference_type>(0), thrust::counting_iterator<difference_type>(input_size),
                       output_offsets.begin(), first1, output_indices.begin());

    // compute max-scan over the output indices, filling in the holes
    thrust::inclusive_scan(output_indices.begin(), output_indices.end(), output_indices.begin(), thrust::maximum<difference_type>());

    // gather input values according to index array (output = first2[output_indices])
    OutputIterator output_end = output; thrust::advance(output_end, output_size);
    thrust::gather(output_indices.begin(), output_indices.end(), first2, output);

    // return output + output_size
    thrust::advance(output, output_size);
    return output;
}

/**************************/
/* STRIDED RANGE OPERATOR */
/**************************/
template <typename Iterator>
class strided_range
{
    public:
    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct stride_functor : public thrust::unary_function<difference_type,difference_type>
    {
        difference_type stride;

        stride_functor(difference_type stride)
            : stride(stride) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        {
            return stride * i;
        }
    };

    typedef typename thrust::counting_iterator<difference_type> CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_range(Iterator first, Iterator last, difference_type stride)
        : first(first), last(last), stride(stride) {}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
    }

    iterator end(void) const
    {
        return begin() + ((last - first) + (stride - 1)) / stride;
    }

    protected:
    Iterator first;
    Iterator last;
    difference_type stride;
};

/********/
/* MAIN */
/********/
int main(){

    /**************************/
    /* SETTING UP THE PROBLEM */
    /**************************/
    const int Nrows = 10; // --- Number of objects
    const int Ncols = 3;  // --- Number of centroids

    thrust::device_vector<int> d_sequence(Nrows * Ncols);
    thrust::device_vector<int> d_counts(Ncols, Nrows);
    thrust::sequence(d_sequence.begin(), d_sequence.begin() + Ncols);

    expand(d_counts.begin(), d_counts.end(), d_sequence.begin(),
           thrust::make_permutation_iterator(
               d_sequence.begin(),
               thrust::make_transform_iterator(thrust::make_counting_iterator(0), (_1 % Nrows) * Ncols + _1 / Nrows)));

    printf("\n\nCentroid indices\n");
    for (int i = 0; i < Nrows; i++) {
        std::cout << " [ ";
        for (int j = 0; j < Ncols; j++)
            std::cout << d_sequence[i * Ncols + j] << " ";
        std::cout << "]\n";
    }

    return 0;
}
As an apparently simpler alternative to using CUDA Thrust, I'm posting below a worked example implementing the classic Matlab meshgrid function in CUDA.
In Matlab
x = [1 2 3];
y = [4 5 6 7];
[X, Y] = meshgrid(x, y);
produces
X =
1 2 3
1 2 3
1 2 3
1 2 3
and
Y =
4 4 4
5 5 5
6 6 6
7 7 7
X is exactly the four-fold replication of the x array, which is what the OP asked about and the first guess of Robert Crovella's answer, while Y is the three-fold consecutive replication of each element of the y array, the second guess of Robert Crovella's answer.
Here is the code:
#include <cstdio>
#include <thrust/pair.h>
#include "Utilities.cuh"   // provides gpuErrchk() and iDivUp()

#define BLOCKSIZE_MESHGRID_X 16
#define BLOCKSIZE_MESHGRID_Y 16

#define DEBUG

/*******************/
/* MESHGRID KERNEL */
/*******************/
template <class T>
__global__ void meshgrid_kernel(const T * __restrict__ x, size_t Nx, const T * __restrict__ y, size_t Ny, T * __restrict__ X, T * __restrict__ Y)
{
    unsigned int tidx = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int tidy = blockIdx.y * blockDim.y + threadIdx.y;
    if ((tidx < Nx) && (tidy < Ny)) {
        X[tidy * Nx + tidx] = x[tidx];
        Y[tidy * Nx + tidx] = y[tidy];
    }
}

/************/
/* MESHGRID */
/************/
template <class T>
thrust::pair<T *,T *> meshgrid(const T *x, const unsigned int Nx, const T *y, const unsigned int Ny) {
    T *X; gpuErrchk(cudaMalloc((void**)&X, Nx * Ny * sizeof(T)));
    T *Y; gpuErrchk(cudaMalloc((void**)&Y, Nx * Ny * sizeof(T)));
    dim3 BlockSize(BLOCKSIZE_MESHGRID_X, BLOCKSIZE_MESHGRID_Y);
    dim3 GridSize(iDivUp(Nx, BLOCKSIZE_MESHGRID_X), iDivUp(Ny, BLOCKSIZE_MESHGRID_Y));
    meshgrid_kernel<<<GridSize, BlockSize>>>(x, Nx, y, Ny, X, Y);
#ifdef DEBUG
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif
    return thrust::make_pair(X, Y);
}

/********/
/* MAIN */
/********/
int main()
{
    const int Nx = 3;
    const int Ny = 4;

    float *h_x = (float *)malloc(Nx * sizeof(float));
    float *h_y = (float *)malloc(Ny * sizeof(float));
    float *h_X = (float *)malloc(Nx * Ny * sizeof(float));
    float *h_Y = (float *)malloc(Nx * Ny * sizeof(float));

    for (int i = 0; i < Nx; i++) h_x[i] = i;
    for (int i = 0; i < Ny; i++) h_y[i] = i + 4.f;

    float *d_x; gpuErrchk(cudaMalloc(&d_x, Nx * sizeof(float)));
    float *d_y; gpuErrchk(cudaMalloc(&d_y, Ny * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_x, h_x, Nx * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_y, h_y, Ny * sizeof(float), cudaMemcpyHostToDevice));

    thrust::pair<float *, float *> meshgrid_pointers = meshgrid(d_x, Nx, d_y, Ny);
    float *d_X = (float *)meshgrid_pointers.first;
    float *d_Y = (float *)meshgrid_pointers.second;

    gpuErrchk(cudaMemcpy(h_X, d_X, Nx * Ny * sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_Y, d_Y, Nx * Ny * sizeof(float), cudaMemcpyDeviceToHost));

    for (int j = 0; j < Ny; j++) {
        for (int i = 0; i < Nx; i++) {
            printf("i = %i; j = %i; x = %f; y = %f\n", i, j, h_X[j * Nx + i], h_Y[j * Nx + i]);
        }
    }

    return 0;
}