Warp shuffling for CUDA - cuda

I need to implement a warp shuffle that looks like this:
On this picture, the number of threads is limited to 8 to make it readable.
According to the NVIDIA SDK and the PTX manual, the shuffle instruction should do the job, especially the shfl.idx.b32 d[|p], a, b, c; PTX instruction.
From the manual I read:
Each thread in the currently executing warp will compute a source lane
index j based on input operands b and c and the mode. If the computed
source lane index j is in range, the thread will copy the input operand
a from lane j into its own destination register d;
So, by providing proper values for b and c, I should be able to do it by writing a function like this (inspired by the CUDA SDK __shfl primitive implementation).
// Question sketch: wraps the PTX shfl.idx.b32 instruction for a single float.
// The two "???" placeholders are the values the question asks for, so this
// snippet is intentionally incomplete and does not compile as written.
// NOTE(review): "__device __" (with an embedded space) looks like a typo for
// "__device__" - confirm against the original post.
__forceinline__ __device __ float shuffle(float var){
float ret;
// Operand b of shfl.idx (the source-lane operand); value unknown to the asker.
int srcLane = ???
// Operand c of shfl.idx; value unknown to the asker.
int c = ???
// %0 = destination register, %1 = value to exchange, %2 = b, %3 = c.
asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(srcLane), "r"(c));
return ret;
}
If it is possible, what is the constant for srcLane and c? I am not able to determine them (I am using CUDA 8.0) .
Best,
Timocafe

I would recommend doing this with the CUDA intrinsic rather than with PTX (or inline ASM). However the following code demonstrates both methods:
// cat t54.cu
#include <stdio.h>
// Demonstrates the requested lane permutation with the __shfl_sync intrinsic.
// Every lane offers (lane + 100) and prints the value it receives from the
// lane it selected. The mask 0x0FF names lanes 0-7 as participants.
__global__ void k(){
    const int lane = threadIdx.x;

    // Lanes 0-3 read from the even lanes (0,2,4,6); lanes 4-7 read from the
    // odd lanes (1,3,5,7); any other lane would read from itself.
    int source = lane;
    if (lane < 4) {
        source *= 2;
    } else if (lane < 8) {
        source -= (7 - lane);
    }

    const int received = __shfl_sync(0x0FFU, lane + 100, source);
    printf("lane: %d, result: %d\n", lane, received);
}
// Inline-PTX counterpart of an indexed shuffle: issues shfl.idx.b32 with `var`
// as the exchanged value, `lane` as operand b and the constant 0x1F as operand c,
// and returns the destination register.
// NOTE(review): this is the non-.sync form of shfl - confirm it is accepted on
// the architecture you compile for.
__forceinline__ __device__ float shuffle(float var, int lane){
    const int operandC = 0x1F;
    float received;
    asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=f"(received) : "f"(var), "r"(lane), "r"(operandC));
    return received;
}
// Same permutation demo as the intrinsic-based kernel, but routed through the
// inline-PTX shuffle() helper and operating on float values.
__global__ void k1(){
    const int lane = threadIdx.x;

    // Lanes 0-3 read from lanes 0,2,4,6; lanes 4-7 read from lanes 1,3,5,7.
    int source = lane;
    if (lane < 4) {
        source *= 2;
    } else if (lane < 8) {
        source -= (7 - lane);
    }

    const float received = shuffle((float)(lane + 100), source);
    printf("lane: %d, result: %f\n", lane, received);
}
// Launches both demo kernels on one block of 8 threads, waiting for each to
// finish. Returns 0 on success, 1 if a launch or the execution reported an error.
int main(){
    cudaError_t err;

    k<<<1,8>>>();
    err = cudaGetLastError();                                 // launch-configuration errors
    if (err == cudaSuccess) err = cudaDeviceSynchronize();    // execution errors
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel k failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    k1<<<1,8>>>();
    err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel k1 failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
$ nvcc -arch=sm_35 -o t54 t54.cu
$ cuda-memcheck ./t54
========= CUDA-MEMCHECK
lane: 0, result: 100
lane: 1, result: 102
lane: 2, result: 104
lane: 3, result: 106
lane: 4, result: 101
lane: 5, result: 103
lane: 6, result: 105
lane: 7, result: 107
lane: 0, result: 100.000000
lane: 1, result: 102.000000
lane: 2, result: 104.000000
lane: 3, result: 106.000000
lane: 4, result: 101.000000
lane: 5, result: 103.000000
lane: 6, result: 105.000000
lane: 7, result: 107.000000
========= ERROR SUMMARY: 0 errors
$
Using the CUDA intrinsic (the first method) the only real task is to compute the source lane index. Based on your pattern I wrote some code to do that and put it in the variable j.

Robert has already and satisfactorily answered this question. I had implemented the code below, showing permutation of a full warp.
#include <stdio.h>
/********************/
/* CUDA ERROR CHECK */
/********************/
// Wraps a CUDA runtime call and reports a failure together with the call site.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prints the CUDA error string with file and line when `code` is not
// cudaSuccess. When `abort` is true it then waits for a key press and exits
// with `code` as the process status.
// `file` is const because it receives __FILE__, which is a string literal.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { getchar(); exit(code); }
    }
}
// Permutes values across lanes: each thread reads its source lane from d_perm
// and its own value from d_data, then stores the value held by the source lane
// into d_result. All indexing uses threadIdx.x only.
// NOTE(review): __activemask() reports the threads that happen to be converged
// at this point; if all 32 lanes are required to take part, an explicit
// 0xFFFFFFFF mask states that intent - confirm which is wanted.
__global__ void shufflingKernel(double *d_data, double *d_result, int *d_perm){
    const unsigned participants = __activemask();
    const int lane = threadIdx.x;
    const int sourceLane = d_perm[lane];
    const double ownValue = d_data[lane];
    d_result[lane] = __shfl_sync(participants, ownValue, sourceLane);
}
// Host driver: uploads 32 values and a 32-entry permutation, runs
// shufflingKernel on one 32-thread block, and prints each original value, the
// shuffled value and the host-side expected value h_data[h_perm[k]].
// All host and device allocations are released before returning.
int main(){
    const int N = 32;
    double h_data[32] = { 3.4, 42.2, 2., -1., 10., 11., 2., -1., 10., 33., 2.3, 11., 44., 0., -33., -21.,
                          4.4, 43.2, 3., -2., 13., 15., 222., -90., 17., 30., 11.3, 7., 22., 100., -30., -91. };
    int h_perm[32] = { 6, 11, 9, 2, 5, 23, 31, 0, 3, 27, 29, 1, 28, 30, 17, 13, 10, 8, 4, 22, 7, 18, 24, 12, 20,
                       19, 16, 26, 21, 15, 25, 14 };

    double *h_result = (double *)malloc(N * sizeof(double));
    if (h_result == NULL) {
        fprintf(stderr, "host allocation of %d doubles failed\n", N);
        return 1;
    }

    int *d_perm; gpuErrchk(cudaMalloc(&d_perm, N * sizeof(int)));
    double *d_data; gpuErrchk(cudaMalloc(&d_data, N * sizeof(double)));
    double *d_result; gpuErrchk(cudaMalloc(&d_result, N * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_perm, &h_perm[0], N * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_data, &h_data[0], N * sizeof(double), cudaMemcpyHostToDevice));

    shufflingKernel << <1, 32>> >(d_data, d_result, d_perm);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(h_result, d_result, N * sizeof(double), cudaMemcpyDeviceToHost));
    for (int k = 0; k < N; k++) {
        printf("k = %d; Original = %f; New = %f; Check = %f\n", k, h_data[k], h_result[k], h_data[h_perm[k]]);
    }

    // Release everything that was allocated above.
    gpuErrchk(cudaFree(d_perm));
    gpuErrchk(cudaFree(d_data));
    gpuErrchk(cudaFree(d_result));
    free(h_result);
    return 0;
}
Notice that, instead of using 0xFFFFFFFF for the mask of active threads, it is safer to use the warp-level primitive __activemask(), as discussed in the question "Shuffle instruction in CUDA not working".

What you are trying to do in your shuffle operation is to be able to have dynamically index source lanes on which shuffle operates. One needs to understand that any variation of shuffle command (__shfl, __shfl_up, __shfl_down, __shfl_xor) needs a constant value for its second parameter and this parameter is the same for all lanes within a warp. You can play with grouping of threads within a warp by specifying width. Thus, for example, by specifying
float var = ...
__shfl_xor(var, 3, 4);
the lane permutation will look like:
0 1 2 3
|
3 2 1 0
So, to answer your question, it's not possible to do it with a single __shuffle operation of any kind. But you can implement it by combining several __shuffle commands with different second parameters.

Related

CUDA device runtime api cudaMemsetAsync doesn't work

I am trying to call cudaMemsetAsync from kernel (so called "dynamic parallelism"). But no matter what value I use, it always set memory to 0.
Here is my test code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_device_runtime_api.h"
#include <stdio.h>
const int size = 5;
// Question code: calls the device-runtime cudaMemsetAsync on the NULL stream to
// fill size * 4 bytes at c with the value argument 0x7FFFFFFF. The returned
// status is not inspected.
__global__ void kernel(int *c)
{
    const size_t byteCount = size * 4;
    cudaMemsetAsync(c, 0x7FFFFFFF, byteCount, NULL);
}
// Host driver for the cudaMemsetAsync experiment: uploads five ints, runs the
// kernel on a single thread, downloads the buffer and prints the status of
// cudaDeviceReset followed by the five values.
// Any failing CUDA call (including the kernel launch) is reported on stderr and
// makes the program return 1 instead of printing stale data.
int main()
{
    cudaError_t cudaStatus;
    int c[size] = { 12, 12, 12, 12, 12 };
    int *dev_c = 0;

    // Each step runs only if every previous step succeeded.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus == cudaSuccess) {
        kernel <<< 1, 1 >>>(dev_c);
        cudaStatus = cudaGetLastError();   // launch-configuration errors
    }
    if (cudaStatus == cudaSuccess)
        cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);

    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "CUDA failure: %s\n", cudaGetErrorString(cudaStatus));
        cudaFree(dev_c);                   // freeing a null pointer is a no-op
        cudaDeviceReset();
        return 1;
    }

    cudaFree(dev_c);
    cudaStatus = cudaDeviceReset();
    printf("%d\n", cudaStatus);
    printf("{%d,%d,%d,%d,%d}\n", c[0], c[1], c[2], c[3], c[4]);
    return 0;
}
And if I run it, I got output like this:
>nvcc -run kernel.cu -gencode=arch=compute_35,code=\"sm_35,compute_35\" -rdc=true -lcudadevrt
kernel.cu
Creating library a.lib and object a.exp
0
{0,0,0,0,0}
When I call memory set, I use value 0x7FFFFFFF. I'm expecting non-zero numbers, but it always shows zero.
Is this a bug? or I did something wrong? I'm using CUDA 8.0
I can confirm this appears not to work in CUDA 8 on the systems I tested it with.
If you want a single thread to perform the operation, you can use memset directly in device code (it, like memcpy, has been supported forever). The kernel will emit a byte sized loop inline within your kernel and the operation will be handled by each running thread.
If you want a dynamic parallelism style memset operation, then the easiest thing is to make your own. A trivial (and very, very lightly tested) implementation in the code you posted might look like this:
#include <cstring>
#include <cstdio>
const int size = 5;
// Byte-wise fill: writes `val` to each of the `sz` bytes starting at `p`.
// Uses a grid-stride loop, so any grid size covers the whole range.
__global__ void myMemset_kernel(void* p, unsigned char val, size_t sz)
{
    unsigned char* bytes = (unsigned char*)p;
    size_t i = threadIdx.x + blockDim.x * blockIdx.x;
    while (i < sz) {
        bytes[i] = val;
        i += blockDim.x * gridDim.x;
    }
}
// Device-side memset replacement: launches myMemset_kernel on stream `s` to set
// `sz` bytes at `p` to the least significant byte of `val` (the same byte
// semantics as memset).
//   p   - start of the region to fill
//   val - fill value; only the low 8 bits are used
//   sz  - number of bytes to fill; 0 is a no-op
//   s   - stream for the child launch (defaults to the NULL stream)
__device__ void myMemset(void* p, unsigned int val, size_t sz, cudaStream_t s=NULL)
{
    // A grid of zero blocks is an invalid launch configuration.
    if (sz == 0) return;

    const dim3 blocksz(256,1,1);
    // The kernel uses a grid-stride loop, so capping the block count keeps the
    // launch valid for very large sizes without losing coverage.
    const size_t maxBlocks = 2147483647u;
    size_t nblocks = (sz + blocksz.x -1) / blocksz.x;
    if (nblocks > maxBlocks) nblocks = maxBlocks;

    const unsigned char byteval = (unsigned char)(val & 0xff);
    myMemset_kernel<<< dim3((unsigned int)nblocks,1,1), blocksz, 0, s >>>(p, byteval, sz);
}
// Parent kernel for the dynamic-parallelism memset demo: creates a non-blocking
// device stream, fills size * 4 bytes at c through myMemset, waits for the
// child work, and releases the stream again.
// If the stream cannot be created the fill falls back to the NULL stream.
// NOTE(review): device-side cudaDeviceSynchronize() is kept from the original
// answer; confirm it is still available in the CUDA version you target.
__global__ void kernel(int *c)
{
    cudaStream_t s = NULL;
    const bool ownsStream =
        (cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking) == cudaSuccess);
    if (!ownsStream) s = NULL;

    myMemset(c, 0x7FFFFFFF, size * 4, s);
    cudaDeviceSynchronize();

    // The original never destroyed the stream it created.
    if (ownsStream) cudaStreamDestroy(s);
}
// Host driver for the custom memset demo: fills a 5-int host array with the
// byte 0x0c, prints it, round-trips it through the device while the kernel
// overwrites it, and prints the result.
// Returns 1 and reports on stderr if any CUDA call or the launch fails.
int main()
{
    int c[size];
    int *dev_c = 0;

    // memset only uses the low byte of its value argument, hence 0c0c0c0c.
    memset(&c[0], 0xffffff0c, size * sizeof(int));
    printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]);

    // Each step runs only if every previous step succeeded.
    cudaError_t status = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (status == cudaSuccess)
        status = cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice);
    if (status == cudaSuccess) {
        kernel <<< 1, 1 >>>(dev_c);
        status = cudaGetLastError();       // launch-configuration errors
    }
    if (status == cudaSuccess)
        status = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(dev_c);                       // freeing a null pointer is a no-op
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA failure: %s\n", cudaGetErrorString(status));
        return 1;
    }

    printf("{%08x,%08x,%08x,%08x,%08x}\n", c[0], c[1], c[2], c[3], c[4]);
    return 0;
}
which compiles and does this:
$ nvcc -rdc=true -arch=sm_52 -o memset memset.cu -lcudadevrt
$ ./memset
{0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c,0c0c0c0c}
{ffffffff,ffffffff,ffffffff,ffffffff,ffffffff}
A final point -- note the values above and read this question and answer. In your code, it is not possible to use cudaMemset to apply a value of 0x7FFFFFFF. Although the value argument is an unsigned integer, cudaMemset and its relatives work like regular memset and set byte values. Only the least significant byte of the 32 bit argument is used to set values. If your objective is to set 32 bit values, then you will need to make your own version of memset for that purpose anyway.

is there a way to do "saypx" in cuBLAS?

cublasSaxpy computes y' = a * x + y, where x and y are vectors and a is scalar.
It turns out I need to compute y' = a * y + x instead. I'm not seeing how to twist the cuBLAS library into doing that.
(Of course, I could compute y' = a * y, then y' = y' + x, but y' is read too often in that case. And I could write my own CUDA code to do it, but then it's likely not anywhere near as fast as the cuBLAS code. I'm just surprised there's no apparent way to do "saypx" directly.)
[Added] There are functions similar to "saxpby" in Intel's version of cblas, which would do what I need. But oddly enough, that's not in cuBLAS.
[Added #2] It looks like I can use the cudnnAddTensor function, with some aliasing of descriptors (I have a FilterDescriptor that points to the tensor, which AddTensor won't accept, but I should be able to alias a TensorDescriptor to the same memory and shape.)
There isn't a way I am aware of to do what you are asking in CUBLAS, nor in standard BLAS. What you have found in MKL is an extension added by Intel, but I don't recall seeing something similar in other host and accelerator BLAS implementations.
The good news is that your assertion that "I could write my own CUDA code to do it, but then it's likely not anywhere near as fast as the cuBLAS code" is untrue, at least for an operation as trivial as saxpy. Even a naïve implementation of saxpy will get very close to CUBLAS because there really aren't that many ways to read two arrays, perform an FMAD and write back the result. As long as you get memory coalescing correct, it is pretty simple to write performant code. For example:
#include <vector>
#include <algorithm>
#include <cassert>
#include <iostream>
#include <cmath>
#include "cublas_v2.h"
// Compile-time selector for the element operation applied by saxpy_kernel.
typedef enum
{
// y[i] = y[i] + a * x[i]   (implemented by axpy_op)
AXPY = 0,
// y[i] = a * y[i] + x[i]   (implemented by axpby_op)
AXPBY = 1
} saxpy_op_t;
// Element operation for the "scale y, add x" variant: returns a * y + x.
__device__ __host__ __inline__
float axpby_op(float y, float x, float a)
{
    return x + a * y;
}
// Element operation for the standard axpy: returns y + a * x.
__device__ __host__ __inline__
float axpy_op(float y, float x, float a)
{
    return a * x + y;
}
// Strided view over an array: element idx of the view is base[pitch * idx].
// Usable from both host and device code.
template<typename T>
class pitched_accessor
{
public:
    // p_ is the first element; pitch_ is the element stride between entries.
    __host__ __device__
    pitched_accessor(T *p_, size_t pitch_) : p(p_), pitch(pitch_) {}

    // Mutable access to the idx-th strided element.
    __host__ __device__
    T& operator[](size_t idx) { return *(p + pitch * idx); }

    // Read-only access to the idx-th strided element.
    __host__ __device__
    const T& operator[](size_t idx) const { return *(p + pitch * idx); }

private:
    T * p;
    size_t pitch;
};
// Applies the operation selected by `op` to the first N1 elements of the
// strided views y and x, writing the result back into y.
// Uses a grid-stride loop, so any 1D launch configuration covers all N1 elements.
template<saxpy_op_t op>
__global__
void saxpy_kernel(pitched_accessor<float> y, pitched_accessor<float> x,
                  const float a, const unsigned int N1)
{
    const unsigned int first = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int step = gridDim.x * blockDim.x;
#pragma unroll 8
    for (unsigned int i = first; i < N1; i += step) {
        // `op` is a template argument, so the branch is resolved per instantiation.
        if (op == AXPY) {
            y[i] = axpy_op(y[i], x[i], a);
        } else if (op == AXPBY) {
            y[i] = axpby_op(y[i], x[i], a);
        }
    }
}
// Launches the AXPBY instantiation of saxpy_kernel (y = a * y + x) over N
// elements, with the grid and block sizes suggested by
// cudaOccupancyMaxPotentialBlockSize. xinc and yinc are the element strides of
// x and y. Neither the occupancy query nor the launch is checked here.
__host__ void saxby(const unsigned int N, const float a,
                    float *x, int xinc, float *y, int yinc)
{
    int gridsize;
    int blocksize;
    cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPBY>);

    pitched_accessor<float> yView(y, yinc);
    pitched_accessor<float> xView(x, xinc);
    saxpy_kernel<AXPBY><<<gridsize, blocksize>>>(yView, xView, a, N);
}
// Launches the AXPY instantiation of saxpy_kernel (y = y + a * x) over N
// elements, with the grid and block sizes suggested by
// cudaOccupancyMaxPotentialBlockSize. xinc and yinc are the element strides of
// x and y. Neither the occupancy query nor the launch is checked here.
__host__ void saxpy(const unsigned int N, const float a,
                    float *x, int xinc, float *y, int yinc)
{
    int gridsize;
    int blocksize;
    cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPY>);

    pitched_accessor<float> yView(y, yinc);
    pitched_accessor<float> xView(x, xinc);
    saxpy_kernel<AXPY><<<gridsize, blocksize>>>(yView, xView, a, N);
}
// Asserts that every element of yhat is within `tolerance` (absolute error) of
// `result`. The check is an assert, so it is compiled out when NDEBUG is defined.
void check_result(std::vector<float> &yhat, float result, float tolerance=1e-5f)
{
    for (const float value : yhat) {
        const float err = std::fabs(value - result);
        assert( err < tolerance );
    }
}
// Benchmark/verification driver: runs the custom a*y+x kernel, the custom axpy
// kernel and cublasSaxpy on 2^22 constant-filled elements and asserts each
// result against the host reference operation.
// Every CUDA/cuBLAS status and each kernel launch is checked; on failure a
// message goes to std::cerr and the program returns 1.
// On success the return value is the status of cudaDeviceReset().
int main()
{
    // Reports a failed CUDA runtime call; true when the call succeeded.
    auto cudaOk = [](cudaError_t status, const char *what) -> bool {
        if (status != cudaSuccess)
            std::cerr << what << ": " << cudaGetErrorString(status) << std::endl;
        return status == cudaSuccess;
    };
    // Reports a failed cuBLAS call; true when the call succeeded.
    auto cublasOk = [](cublasStatus_t status, const char *what) -> bool {
        if (status != CUBLAS_STATUS_SUCCESS)
            std::cerr << what << ": cuBLAS status " << int(status) << std::endl;
        return status == CUBLAS_STATUS_SUCCESS;
    };

    const int N = 1<<22;
    std::vector<float> x_h(N);
    std::vector<float> y_h(N);
    const float a = 2.f, y0 = 1234.f, x0 = 532.f;
    std::fill(y_h.begin(), y_h.end(), y0);
    std::fill(x_h.begin(), x_h.end(), x0);

    float *x_d = nullptr, *y_d = nullptr;
    size_t sz = sizeof(float) * size_t(N);
    if (!cudaOk(cudaMalloc((void **)&x_d, sz), "cudaMalloc(x_d)")) return 1;
    if (!cudaOk(cudaMalloc((void **)&y_d, sz), "cudaMalloc(y_d)")) return 1;
    if (!cudaOk(cudaMemcpy(x_d, &x_h[0], sz, cudaMemcpyHostToDevice), "upload x")) return 1;

    // Restores y on the device to its initial contents before each experiment.
    auto resetY = [&]() -> bool {
        return cudaOk(cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice), "upload y");
    };
    // Downloads y and asserts every element equals `expected`.
    auto fetchAndCheck = [&](float expected) -> bool {
        std::vector<float> yhat(N);
        if (!cudaOk(cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost), "download y"))
            return false;
        check_result(yhat, expected);
        return true;
    };

    {
        if (!resetY()) return 1;
        saxby(N, a, x_d, 1, y_d, 1);
        if (!cudaOk(cudaGetLastError(), "saxby launch")) return 1;
        if (!fetchAndCheck(axpby_op(y0, x0, a))) return 1;
    }

    {
        if (!resetY()) return 1;
        saxpy(N, a, x_d, 1, y_d, 1);
        if (!cudaOk(cudaGetLastError(), "saxpy launch")) return 1;
        if (!fetchAndCheck(axpy_op(y0, x0, a))) return 1;
    }

    {
        cublasHandle_t handle;
        if (!cublasOk(cublasCreate(&handle), "cublasCreate")) return 1;
        if (!resetY()) return 1;
        if (!cublasOk(cublasSaxpy(handle, N, &a, x_d, 1, y_d, 1), "cublasSaxpy")) return 1;
        if (!fetchAndCheck(axpy_op(y0, x0, a))) return 1;
        if (!cublasOk(cublasDestroy(handle), "cublasDestroy")) return 1;
    }

    if (!cudaOk(cudaFree(x_d), "cudaFree(x_d)")) return 1;
    if (!cudaOk(cudaFree(y_d), "cudaFree(y_d)")) return 1;
    return int(cudaDeviceReset());
}
This demonstrates that a very simple axpy kernel can be easily adapted to perform both the standard operation and the version you want, and run within 10% of the runtime of CUBLAS on the compute 5.2 device I tested it on:
$ nvcc -std=c++11 -arch=sm_52 -Xptxas="-v" -o saxby saxby.cu -lcublas
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info : Function properties for _Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 17 registers, 360 bytes cmem[0]
ptxas info : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info : Function properties for _Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 17 registers, 360 bytes cmem[0]
$ nvprof ./saxby
==26806== NVPROF is profiling process 26806, command: ./saxby
==26806== Profiling application: ./saxby
==26806== Profiling result:
Time(%) Time Calls Avg Min Max Name
54.06% 11.190ms 5 2.2381ms 960ns 2.9094ms [CUDA memcpy HtoD]
40.89% 8.4641ms 3 2.8214ms 2.8039ms 2.8310ms [CUDA memcpy DtoH]
1.73% 357.59us 1 357.59us 357.59us 357.59us void saxpy_kernel<saxpy_op_t=1>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
1.72% 355.15us 1 355.15us 355.15us 355.15us void saxpy_kernel<saxpy_op_t=0>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
1.60% 332.21us 1 332.21us 332.21us 332.21us void axpy_kernel_val<float, int=0>(cublasAxpyParamsVal<float>)

CUDA cublas getrf and getri, for a matrix inversion, cause nvprof errors with one dimensional memory [duplicate]

Since CUDA 5.5, the CUBLAS library contains routines for batched matrix factorization and inversion (cublas<t>getrfBatched and cublas<t>getriBatched respectively).
Following the documentation, I wrote test code that inverts an N x N matrix using these routines. The code gives correct output only if the matrix has all non-zero pivots. Setting any pivot to zero gives incorrect results. I have verified the results using MATLAB.
I realize that I am providing row major matrices as input while CUBLAS expects column major matrices, but it shouldn't matter as it would only transpose the result. To be sure, I also tested on column major input, but getting same behavior.
I am confused as, cublas<t>getriBatched expects pivot exchange information array P as input, which is the output from cublas<t>getrfBatched. So, if any zero pivots are eliminated by row exchange, then the inversion routine should handle it automatically.
How to perform inversion of matrices which contain a zero pivot using CUBLAS?
Following is a self contained compile-able example with different test cases:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
// cudacall(call): evaluates a CUDA runtime call once; if the result is not
// cudaSuccess it prints file, line and the CUDA error string to stderr, resets
// the device and exits with EXIT_FAILURE. Wrapped in do { } while (0) so it
// behaves as a single statement.
#define cudacall(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
// cublascall(call): same pattern for cuBLAS calls; on any status other than
// CUBLAS_STATUS_SUCCESS it prints file, line and the numeric status code to
// stderr, resets the device and exits with EXIT_FAILURE.
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) \
{ \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
\
} \
while(0)
// Inverts one n x n matrix that already resides on the device.
//   src_d - device matrix; overwritten with its LU factors by getrfBatched
//   dst_d - device buffer (n * n floats) that receives the inverse
//   n     - matrix dimension, also used as the leading dimension
// Uses the batched cuBLAS routines with a batch size of 1. Exits the process
// (after resetting the device) if the factorization or the inversion reports a
// singular matrix or if any CUDA/cuBLAS call fails.
void invert_device(float* src_d, float* dst_d, int n)
{
    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    int batchSize = 1;
    int *P, *INFO;
    cudacall(cudaMalloc<int>(&P,n * batchSize * sizeof(int)));
    cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int)));

    int lda = n;

    // The batched API takes a device array of matrix pointers.
    float *A[] = { src_d };
    float** A_d;
    cudacall(cudaMalloc<float*>(&A_d,sizeof(A)));
    cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));

    cublascall(cublasSgetrfBatched(handle,n,A_d,lda,P,INFO,batchSize));

    int INFOh = 0;
    cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
    // Any non-zero info marks a failed factorization (info = k means U(k,k) is
    // zero); the original only tested for info == n, i.e. the last pivot.
    if(INFOh != 0)
    {
        fprintf(stderr, "Factorization Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    float* C[] = { dst_d };
    float** C_d;
    cudacall(cudaMalloc<float*>(&C_d,sizeof(C)));
    cudacall(cudaMemcpy(C_d,C,sizeof(C),cudaMemcpyHostToDevice));

    cublascall(cublasSgetriBatched(handle,n,A_d,lda,P,C_d,lda,INFO,batchSize));

    cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
    if(INFOh != 0)
    {
        fprintf(stderr, "Inversion Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    // Release every temporary, including the pointer arrays the original leaked.
    cudaFree(A_d), cudaFree(C_d);
    cudaFree(P), cudaFree(INFO), cublasDestroy_v2(handle);
}
// Host wrapper: copies the n x n host matrix `src` to the device, inverts it
// with invert_device, and copies the n x n result into the host buffer `dst`.
void invert(float* src, float* dst, int n)
{
    const size_t matrixBytes = n * n * sizeof(float);

    float* src_d;
    float* dst_d;
    cudacall(cudaMalloc<float>(&src_d, matrixBytes));
    cudacall(cudaMemcpy(src_d, src, matrixBytes, cudaMemcpyHostToDevice));
    cudacall(cudaMalloc<float>(&dst_d, matrixBytes));

    invert_device(src_d, dst_d, n);

    cudacall(cudaMemcpy(dst, dst_d, matrixBytes, cudaMemcpyDeviceToHost));

    cudaFree(src_d);
    cudaFree(dst_d);
}
// Prints one hard-coded 3x3 test matrix, inverts it in place through invert(),
// and prints the result. The matrix under test is whichever array `a` points to.
void test_invert()
{
    const int n = 3;

    // Candidate inputs; only the array assigned to `a` below is used.
    // Random matrix with full pivots
    float full_pivots[n*n] = { 0.5, 3, 4,
                               1, 3, 10,
                               4 , 9, 16 };

    // Almost same as above matrix with first pivot zero
    float zero_pivot[n*n] = { 0, 3, 4,
                              1, 3, 10,
                              4 , 9, 16 };

    float zero_pivot_col_major[n*n] = { 0, 1, 4,
                                        3, 3, 9,
                                        4 , 10, 16 };

    float another_zero_pivot[n*n] = { 0, 3, 4,
                                      1, 5, 6,
                                      9, 8, 2 };

    float another_full_pivot[n * n] = { 22, 3, 4,
                                        1, 5, 6,
                                        9, 8, 2 };

    float singular[n*n] = {1,2,3,
                           4,5,6,
                           7,8,9};

    // Select matrix by setting "a"
    float* a = zero_pivot;

    printf("Input:\n\n");
    for (int row = 0; row < n; ++row)
    {
        for (int col = 0; col < n; ++col)
            printf("%f\t", a[row*n + col]);
        printf("\n");
    }
    printf("\n\n");

    // Source and destination are the same host buffer: the inverse replaces the input.
    invert(a, a, n);

    printf("Inverse:\n\n");
    for (int row = 0; row < n; ++row)
    {
        for (int col = 0; col < n; ++col)
            printf("%f\t", a[row*n + col]);
        printf("\n");
    }
}
// Runs the inversion test, then blocks on reading one integer from stdin so the
// console output stays visible until the user responds.
int main()
{
    test_invert();

    int pause;
    scanf("%d", &pause);

    return 0;
}
There seems to be a bug in the current CUBLAS library implementation of cublas<t>getrfBatched for matrices of dimension (n) such that 3<=n<=16, when there is a "zero pivot" as you say.
A possible workaround is to "identity-extend" your A matrix to be inverted, when n<17, to a size of 17x17 (using matlab nomenclature):
LU = getrf( [A 0 ; 0 I]);
continuing, you can then use cublas<t>getriBatched in an "ordinary" fashion:
invA = getri( LU(1:3,1:3) )
(You can also leave everything at n=17, call getri that way, and then extract the result as the first 3x3 rows and columns of invA.)
Here is a fully worked example, borrowing from the code you supplied, showing the inversion of your supplied 3x3 zero_pivot matrix, using the zero_pivot_war matrix as an "identity-extended" workaround:
$ cat t340.cu
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
// cudacall(call): evaluates a CUDA runtime call once; if the result is not
// cudaSuccess it prints file, line and the CUDA error string to stderr, resets
// the device and exits with EXIT_FAILURE. Wrapped in do { } while (0) so it
// behaves as a single statement.
#define cudacall(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
// cublascall(call): same pattern for cuBLAS calls; on any status other than
// CUBLAS_STATUS_SUCCESS it prints file, line and the numeric status code to
// stderr, resets the device and exits with EXIT_FAILURE.
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) \
{ \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
\
} \
while(0)
// Workaround variant: LU-factorizes a 17 x 17 identity-extended matrix, then
// inverts only its leading n x n block.
//   src_d - device matrix with leading dimension 17 (17 * 17 floats);
//           overwritten with its LU factors by getrfBatched
//   dst_d - device buffer (n * n floats, leading dimension n) for the inverse
//   n     - dimension of the block to invert
// Uses the batched cuBLAS routines with a batch size of 1. Exits the process
// (after resetting the device) if the factorization or the inversion reports a
// singular matrix or if any CUDA/cuBLAS call fails.
void invert_device(float* src_d, float* dst_d, int n)
{
    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    int batchSize = 1;
    int *P, *INFO;
    cudacall(cudaMalloc<int>(&P,17 * batchSize * sizeof(int)));
    cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int)));

    int lda = 17;

    // The batched API takes a device array of matrix pointers.
    float *A[] = { src_d };
    float** A_d;
    cudacall(cudaMalloc<float*>(&A_d,sizeof(A)));
    cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));

    cublascall(cublasSgetrfBatched(handle,17,A_d,lda,P,INFO,batchSize));

    int INFOh = 0;
    cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
    // Any non-zero info marks a failed factorization (info = k means U(k,k) is
    // zero); the original only tested for info == 17, i.e. the last pivot.
    if(INFOh != 0)
    {
        fprintf(stderr, "Factorization Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    float* C[] = { dst_d };
    float** C_d;
    cudacall(cudaMalloc<float*>(&C_d,sizeof(C)));
    cudacall(cudaMemcpy(C_d,C,sizeof(C),cudaMemcpyHostToDevice));

    // Invert only the leading n x n block of the 17 x 17 factorization.
    cublascall(cublasSgetriBatched(handle,n,A_d,lda,P,C_d,n,INFO,batchSize));

    cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
    if(INFOh != 0)
    {
        fprintf(stderr, "Inversion Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    // Release every temporary, including the pointer arrays the original leaked.
    cudaFree(A_d), cudaFree(C_d);
    cudaFree(P), cudaFree(INFO), cublasDestroy_v2(handle);
}
// Host wrapper for the workaround: uploads the 17 x 17 identity-extended host
// matrix `src`, inverts its leading n x n block with invert_device, and copies
// the n x n result into the host buffer `dst`.
void invert(float* src, float* dst, int n)
{
    const size_t extendedBytes = 17 * 17 * sizeof(float);
    const size_t resultBytes = n * n * sizeof(float);

    float* src_d;
    float* dst_d;
    cudacall(cudaMalloc<float>(&src_d, extendedBytes));
    cudacall(cudaMemcpy(src_d, src, extendedBytes, cudaMemcpyHostToDevice));
    cudacall(cudaMalloc<float>(&dst_d, resultBytes));

    invert_device(src_d, dst_d, n);

    cudacall(cudaMemcpy(dst, dst_d, resultBytes, cudaMemcpyDeviceToHost));

    cudaFree(src_d);
    cudaFree(dst_d);
}
// Prints the leading 3x3 block of the identity-extended 17x17 test matrix,
// inverts it through invert(), and prints the 3x3 result.
void test_invert()
{
    const int n = 3;

    // The question's zero-pivot 3x3 matrix in the top-left corner, extended
    // with an identity block to 17x17. (The plain 3x3 test matrices from the
    // question are not used by this workaround and are omitted here.)
    float zero_pivot_war[17*17] = {
        0,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        1,3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        4,9,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 };

    float result[n*n];

    // Select matrix by setting "a"
    float* a = zero_pivot_war;

    printf("Input:\n\n");
    for (int row = 0; row < n; ++row)
    {
        // The input has a row length of 17, so index with that stride.
        for (int col = 0; col < n; ++col)
            printf("%f\t", a[row*17 + col]);
        printf("\n");
    }
    printf("\n\n");

    invert(a, result, n);

    printf("Inverse:\n\n");
    for (int row = 0; row < n; ++row)
    {
        for (int col = 0; col < n; ++col)
            printf("%f\t", result[row*n + col]);
        printf("\n");
    }
}
// Runs the inversion test and exits; the question's wait-for-input pause is
// disabled in this version.
int main()
{
    test_invert();
    return EXIT_SUCCESS;
}
$ nvcc -arch=sm_20 -o t340 t340.cu -lcublas
$ cuda-memcheck ./t340
========= CUDA-MEMCHECK
Input:
0.000000 3.000000 4.000000
1.000000 3.000000 10.000000
4.000000 9.000000 16.000000
Inverse:
-0.700000 -0.200000 0.300000
0.400000 -0.266667 0.066667
-0.050000 0.200000 -0.050000
========= ERROR SUMMARY: 0 errors
$
The above result appears to me to be correct based on a simple test elsewhere.
I don't have any further technical details about the nature of the possible bug in CUBLAS. From what I can tell, it is present in both CUDA 5.5 and CUDA 6.0 RC. Detailed bug discussions for NVIDIA-supplied assets (e.g. CUBLAS library) should be taken up on the NVIDIA developer forums or directly at the bug filing portal on developer.nvidia.com (you must be a registered developer to file a bug).

CUBLAS: Incorrect inversion for matrix with zero pivot

Since CUDA 5.5, the CUBLAS library contains routines for batched matrix factorization and inversion (cublas<t>getrfBatched and cublas<t>getriBatched respectively).
Following the documentation, I wrote test code that inverts an N x N matrix using these routines. The code gives correct output only if the matrix has all non-zero pivots. Setting any pivot to zero gives incorrect results. I have verified the results using MATLAB.
I realize that I am providing row major matrices as input while CUBLAS expects column major matrices, but it shouldn't matter as it would only transpose the result. To be sure, I also tested on column major input, but getting same behavior.
I am confused as, cublas<t>getriBatched expects pivot exchange information array P as input, which is the output from cublas<t>getrfBatched. So, if any zero pivots are eliminated by row exchange, then the inversion routine should handle it automatically.
How to perform inversion of matrices which contain a zero pivot using CUBLAS?
Following is a self contained compile-able example with different test cases:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
// cudacall(call): evaluates a CUDA runtime call once; if the result is not
// cudaSuccess it prints file, line and the CUDA error string to stderr, resets
// the device and exits with EXIT_FAILURE. Wrapped in do { } while (0) so it
// behaves as a single statement.
#define cudacall(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
// cublascall(call): same pattern for cuBLAS calls; on any status other than
// CUBLAS_STATUS_SUCCESS it prints file, line and the numeric status code to
// stderr, resets the device and exits with EXIT_FAILURE.
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) \
{ \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
\
} \
while(0)
// Inverts one n x n matrix that already resides on the device.
//   src_d - device matrix; overwritten with its LU factors by getrfBatched
//   dst_d - device buffer (n * n floats) that receives the inverse
//   n     - matrix dimension, also used as the leading dimension
// Uses the batched cuBLAS routines with a batch size of 1. Exits the process
// (after resetting the device) if the factorization or the inversion reports a
// singular matrix or if any CUDA/cuBLAS call fails.
void invert_device(float* src_d, float* dst_d, int n)
{
    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    int batchSize = 1;
    int *P, *INFO;
    cudacall(cudaMalloc<int>(&P,n * batchSize * sizeof(int)));
    cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int)));

    int lda = n;

    // The batched API takes a device array of matrix pointers.
    float *A[] = { src_d };
    float** A_d;
    cudacall(cudaMalloc<float*>(&A_d,sizeof(A)));
    cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));

    cublascall(cublasSgetrfBatched(handle,n,A_d,lda,P,INFO,batchSize));

    int INFOh = 0;
    cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
    // Any non-zero info marks a failed factorization (info = k means U(k,k) is
    // zero); the original only tested for info == n, i.e. the last pivot.
    if(INFOh != 0)
    {
        fprintf(stderr, "Factorization Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    float* C[] = { dst_d };
    float** C_d;
    cudacall(cudaMalloc<float*>(&C_d,sizeof(C)));
    cudacall(cudaMemcpy(C_d,C,sizeof(C),cudaMemcpyHostToDevice));

    cublascall(cublasSgetriBatched(handle,n,A_d,lda,P,C_d,lda,INFO,batchSize));

    cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
    if(INFOh != 0)
    {
        fprintf(stderr, "Inversion Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    // Release every temporary, including the pointer arrays the original leaked.
    cudaFree(A_d), cudaFree(C_d);
    cudaFree(P), cudaFree(INFO), cublasDestroy_v2(handle);
}
// Host wrapper: copies the n x n host matrix `src` to the device, inverts it
// with invert_device, and copies the n x n result into the host buffer `dst`.
void invert(float* src, float* dst, int n)
{
    const size_t matrixBytes = n * n * sizeof(float);

    float* src_d;
    float* dst_d;
    cudacall(cudaMalloc<float>(&src_d, matrixBytes));
    cudacall(cudaMemcpy(src_d, src, matrixBytes, cudaMemcpyHostToDevice));
    cudacall(cudaMalloc<float>(&dst_d, matrixBytes));

    invert_device(src_d, dst_d, n);

    cudacall(cudaMemcpy(dst, dst_d, matrixBytes, cudaMemcpyDeviceToHost));

    cudaFree(src_d);
    cudaFree(dst_d);
}
// Prints one hard-coded 3x3 test matrix, inverts it in place through invert(),
// and prints the result. The matrix under test is whichever array `a` points to.
void test_invert()
{
    const int n = 3;

    // Candidate inputs; only the array assigned to `a` below is used.
    // Random matrix with full pivots
    float full_pivots[n*n] = { 0.5, 3, 4,
                               1, 3, 10,
                               4 , 9, 16 };

    // Almost same as above matrix with first pivot zero
    float zero_pivot[n*n] = { 0, 3, 4,
                              1, 3, 10,
                              4 , 9, 16 };

    float zero_pivot_col_major[n*n] = { 0, 1, 4,
                                        3, 3, 9,
                                        4 , 10, 16 };

    float another_zero_pivot[n*n] = { 0, 3, 4,
                                      1, 5, 6,
                                      9, 8, 2 };

    float another_full_pivot[n * n] = { 22, 3, 4,
                                        1, 5, 6,
                                        9, 8, 2 };

    float singular[n*n] = {1,2,3,
                           4,5,6,
                           7,8,9};

    // Select matrix by setting "a"
    float* a = zero_pivot;

    printf("Input:\n\n");
    for (int row = 0; row < n; ++row)
    {
        for (int col = 0; col < n; ++col)
            printf("%f\t", a[row*n + col]);
        printf("\n");
    }
    printf("\n\n");

    // Source and destination are the same host buffer: the inverse replaces the input.
    invert(a, a, n);

    printf("Inverse:\n\n");
    for (int row = 0; row < n; ++row)
    {
        for (int col = 0; col < n; ++col)
            printf("%f\t", a[row*n + col]);
        printf("\n");
    }
}
// Runs the inversion test, then blocks on reading one integer from stdin so the
// console output stays visible until the user responds.
int main()
{
    test_invert();

    int pause;
    scanf("%d", &pause);

    return 0;
}
There seems to be a bug in the current CUBLAS library implementation of cublas<t>getrfBatched for matrices of dimension (n) such that 3<=n<=16, when there is a "zero pivot" as you say.
A possible workaround is to "identity-extend" your A matrix to be inverted, when n<17, to a size of 17x17 (using matlab nomenclature):
LU = getrf( [A 0 ; 0 I]);
continuing, you can then use cublas<t>getriBatched in an "ordinary" fashion:
invA = getri( LU(1:3,1:3) )
(You can also leave everything at n=17, call getri that way, and then extract the result as the first 3x3 rows and columns of invA.)
Here is a fully worked example, borrowing from the code you supplied, showing the inversion of your supplied 3x3 zero_pivot matrix, using the zero_pivot_war matrix as an "identity-extended" workaround:
$ cat t340.cu
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
// cudacall(call): evaluates a CUDA runtime call once; if the result is not
// cudaSuccess it prints file, line and the CUDA error string to stderr, resets
// the device and exits with EXIT_FAILURE. Wrapped in do { } while (0) so it
// behaves as a single statement.
#define cudacall(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
// cublascall(call): same pattern for cuBLAS calls; on any status other than
// CUBLAS_STATUS_SUCCESS it prints file, line and the numeric status code to
// stderr, resets the device and exits with EXIT_FAILURE.
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) \
{ \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
\
} \
while(0)
// Workaround variant: LU-factorizes a 17 x 17 identity-extended matrix, then
// inverts only its leading n x n block.
//   src_d - device matrix with leading dimension 17 (17 * 17 floats);
//           overwritten with its LU factors by getrfBatched
//   dst_d - device buffer (n * n floats, leading dimension n) for the inverse
//   n     - dimension of the block to invert
// Uses the batched cuBLAS routines with a batch size of 1. Exits the process
// (after resetting the device) if the factorization or the inversion reports a
// singular matrix or if any CUDA/cuBLAS call fails.
void invert_device(float* src_d, float* dst_d, int n)
{
    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    int batchSize = 1;
    int *P, *INFO;
    cudacall(cudaMalloc<int>(&P,17 * batchSize * sizeof(int)));
    cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int)));

    int lda = 17;

    // The batched API takes a device array of matrix pointers.
    float *A[] = { src_d };
    float** A_d;
    cudacall(cudaMalloc<float*>(&A_d,sizeof(A)));
    cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));

    cublascall(cublasSgetrfBatched(handle,17,A_d,lda,P,INFO,batchSize));

    int INFOh = 0;
    cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
    // Any non-zero info marks a failed factorization (info = k means U(k,k) is
    // zero); the original only tested for info == 17, i.e. the last pivot.
    if(INFOh != 0)
    {
        fprintf(stderr, "Factorization Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    float* C[] = { dst_d };
    float** C_d;
    cudacall(cudaMalloc<float*>(&C_d,sizeof(C)));
    cudacall(cudaMemcpy(C_d,C,sizeof(C),cudaMemcpyHostToDevice));

    // Invert only the leading n x n block of the 17 x 17 factorization.
    cublascall(cublasSgetriBatched(handle,n,A_d,lda,P,C_d,n,INFO,batchSize));

    cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
    if(INFOh != 0)
    {
        fprintf(stderr, "Inversion Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    // Release every temporary, including the pointer arrays the original leaked.
    cudaFree(A_d), cudaFree(C_d);
    cudaFree(P), cudaFree(INFO), cublasDestroy_v2(handle);
}
// Host-side wrapper around invert_device().
// Copies 17*17 floats from src to the device, runs the device inversion with
// order n, and copies n*n floats of the result back into dst.
void invert(float* src, float* dst, int n)
{
    const size_t srcBytes = 17 * 17 * sizeof(float);
    const size_t dstBytes = n * n * sizeof(float);

    float* src_d;
    float* dst_d;

    // Stage the input on the device.
    cudacall(cudaMalloc<float>(&src_d, srcBytes));
    cudacall(cudaMemcpy(src_d, src, srcBytes, cudaMemcpyHostToDevice));

    // Room for the n x n result.
    cudacall(cudaMalloc<float>(&dst_d, dstBytes));

    invert_device(src_d, dst_d, n);

    cudacall(cudaMemcpy(dst, dst_d, dstBytes, cudaMemcpyDeviceToHost));

    cudaFree(src_d);
    cudaFree(dst_d);
}
// Prints the top-left rows x rows corner of a row-indexed matrix whose
// consecutive rows are `stride` floats apart, one tab-separated row per line.
static void test_invert_print_corner(const float* m, int rows, int stride)
{
    for(int r = 0; r < rows; ++r)
    {
        for(int c = 0; c < rows; ++c)
            printf("%f\t", m[r * stride + c]);
        printf("\n");
    }
}

// Demo driver: embeds the 3x3 zero-pivot matrix in the top-left corner of a
// 17x17 identity matrix, prints the 3x3 input, inverts it via invert() and
// prints the 3x3 result.
void test_invert()
{
    const int n = 3;
    const int padded = 17;

    /* Other 3x3 matrices from the original question, kept for reference:
       full_pivots          = { 0.5, 3, 4,   1, 3, 10,   4, 9, 16 }  (random, full pivots)
       zero_pivot           = { 0, 3, 4,     1, 3, 10,   4, 9, 16 }  (first pivot zero)
       zero_pivot_col_major = { 0, 1, 4,     3, 3, 9,    4, 10, 16 }
       another_zero_pivot   = { 0, 3, 4,     1, 5, 6,    9, 8, 2 }
       another_full_pivot   = { 22, 3, 4,    1, 5, 6,    9, 8, 2 }
       singular             = { 1, 2, 3,     4, 5, 6,    7, 8, 9 }
    */
    const float zero_pivot[n*n] = { 0, 3, 4,
                                    1, 3, 10,
                                    4, 9, 16 };

    // "Identity-extended" workaround matrix: identity everywhere ...
    float zero_pivot_war[padded*padded];
    for(int r = 0; r < padded; ++r)
        for(int c = 0; c < padded; ++c)
            zero_pivot_war[r * padded + c] = (r == c) ? 1.0f : 0.0f;

    // ... except the top-left 3x3 corner, which holds the matrix to invert.
    for(int r = 0; r < n; ++r)
        for(int c = 0; c < n; ++c)
            zero_pivot_war[r * padded + c] = zero_pivot[r * n + c];

    float result[n*n];

    //Select matrix by setting "a"
    float* a = zero_pivot_war;

    printf("Input:\n\n");
    test_invert_print_corner(a, n, padded);
    printf("\n\n");

    invert(a, result, n);

    printf("Inverse:\n\n");
    test_invert_print_corner(result, n, n);
}
// Entry point: runs the single inversion demo and reports success to the shell.
int main()
{
    test_invert();

    // Optional "press a key" pause, left disabled:
    // int n; scanf("%d",&n);

    return EXIT_SUCCESS;
}
$ nvcc -arch=sm_20 -o t340 t340.cu -lcublas
$ cuda-memcheck ./t340
========= CUDA-MEMCHECK
Input:
0.000000 3.000000 4.000000
1.000000 3.000000 10.000000
4.000000 9.000000 16.000000
Inverse:
-0.700000 -0.200000 0.300000
0.400000 -0.266667 0.066667
-0.050000 0.200000 -0.050000
========= ERROR SUMMARY: 0 errors
$
The above result appears to me to be correct based on a simple test elsewhere.
I don't have any further technical details about the nature of the possible bug in CUBLAS. From what I can tell, it is present in both CUDA 5.5 and CUDA 6.0 RC. Detailed bug discussions for NVIDIA-supplied assets (e.g. CUBLAS library) should be taken up on the NVIDIA developer forums or directly at the bug filing portal on developer.nvidia.com (you must be a registered developer to file a bug).

How to quickly compact a sparse array with CUDA C?

Summary
Array [A - B - - - C] in device memory but want [A B C] - what's the quickest way with CUDA C?
Context
I have an array A of integers on device (GPU) memory. At each iteration, I randomly choose a few elements that are larger than 0 and subtract 1 from them. I maintain a sorted lookup array L of those elements that are equal to 0:
Array A:
# iteration i: [0 1 0 3 3 2 0 1 2 3]
# iteration i + 1: [0 0 0 3 2 2 0 1 2 3]
Lookup for 0-elements L:
# iteration i: [0 - 2 - - - 6 - - -] -> want compacted form: [0 2 6]
# iteration i + 1: [0 1 2 - - - 6 - - -] -> want compacted form: [0 1 2 6]
(Here, I randomly chose elements 1 and 4 to subtract 1 from. In my implementation in CUDA C, each thread maps onto an element in A, and so the lookup array is sparse to prevent data races and to maintain a sorted ordering (e.g. [0 1 2 6] rather than [0 2 6 1]).)
Later, I will do some operation only for those elements that are equal to 0. Hence I need to compact my sparse lookup array L, so that I can map threads to 0-elements.
As such, what is the most efficient way to compact a sparse array on device memory with CUDA C?
Many thanks.
Suppose I have:
int V[] = {1, 2, 0, 0, 5};
And my desired result is:
int R[] = {1, 2, 5}
In effect we are removing elements that are zero, or copying elements only if non-zero.
#include <thrust/device_ptr.h>
#include <thrust/copy.h>
#include <stdio.h>
#define SIZE 5
// Queries cudaGetLastError() and, if it reports anything other than
// cudaSuccess, prints msg together with the runtime's error string and the
// source location to stderr, then exits with status 1.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t last_error_ = cudaGetLastError(); \
        if (last_error_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(last_error_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
// Unary predicate usable from host and device code: true for any non-zero int.
// The call operator is const-qualified because the predicate is stateless,
// which also lets it be invoked through a const object or const reference.
struct is_not_zero
{
    __host__ __device__
    bool operator()(const int x) const
    {
        return (x != 0);
    }
};
// Demo: copies the non-zero elements of V into a compacted device array with
// thrust::copy_if, then copies the kept elements back to R and prints them.
int main(){
    int V[] = {1, 2, 0, 0, 5};
    int R[] = {0, 0, 0, 0, 0};
    int *d_V, *d_R;

    cudaMalloc((void **)&d_V, SIZE*sizeof(int));
    cudaCheckErrors("cudaMalloc1 fail");
    cudaMalloc((void **)&d_R, SIZE*sizeof(int));
    cudaCheckErrors("cudaMalloc2 fail");

    cudaMemcpy(d_V, V, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy1 fail");

    // Wrap the raw device pointers so Thrust dispatches to the device backend.
    thrust::device_ptr<int> dp_V(d_V);
    thrust::device_ptr<int> dp_R(d_R);

    // copy_if returns the end of the compacted output range; the distance from
    // the start is the number of elements that passed the predicate.
    thrust::device_ptr<int> dp_R_end =
        thrust::copy_if(dp_V, dp_V + SIZE, dp_R, is_not_zero());
    const int kept = static_cast<int>(dp_R_end - dp_R);

    // Copy back only what copy_if wrote; the rest of d_R was never
    // initialized, so R keeps its zeros there.
    cudaMemcpy(R, d_R, kept*sizeof(int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy2 fail");

    for (int i = 0; i<kept; i++)
        printf("R[%d]: %d\n", i, R[i]);

    cudaFree(d_V);
    cudaFree(d_R);
    cudaCheckErrors("cudaFree fail");
    return 0;
}
The struct definition provides us with a functor that tests for non-zero elements. Note that in thrust, there are no kernels and we are not writing device code directly. All that happens behind the scenes. And I'd definitely suggest familiarizing yourself with the quick start guide, so as not to turn this question into a tutorial on thrust.
After reviewing the comments, I think this modified version of the code will work around the cuda 4.0 issues:
#include <thrust/device_ptr.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <stdio.h>
#define SIZE 5
// Unary predicate usable from host and device code: true for any non-zero int.
// The call operator is const-qualified because the predicate is stateless,
// which also lets it be invoked through a const object or const reference.
struct is_not_zero
{
    __host__ __device__
    bool operator()(const int x) const
    {
        return (x != 0);
    }
};
// Demo using Thrust containers only: compacts the non-zero elements of V on
// the device with thrust::copy_if, brings the result back through a
// host_vector, and prints exactly the elements that were kept.
int main(){
    int V[] = {1, 2, 0, 0, 5};
    int R[] = {0, 0, 0, 0, 0};

    thrust::host_vector<int> h_V(V, V+SIZE);
    thrust::device_vector<int> d_V = h_V;
    thrust::device_vector<int> d_R(SIZE, 0);

    // copy_if returns the end of the compacted output range; its distance from
    // d_R.begin() is the number of elements that passed the predicate.
    thrust::device_vector<int>::iterator d_R_end =
        thrust::copy_if(d_V.begin(), d_V.end(), d_R.begin(), is_not_zero());
    const int kept = static_cast<int>(d_R_end - d_R.begin());

    // Transfer device -> host via a host_vector, then into the plain array.
    thrust::host_vector<int> h_R = d_R;
    thrust::copy(h_R.begin(), h_R.begin() + kept, R);

    for (int i = 0; i<kept; i++)
        printf("R[%d]: %d\n", i, R[i]);
    return 0;
}