glmer warning messages in lme4 in R

I'm running a negative binomial glmer using lme4. I am getting some warnings that I can see others have had in the past, but I can't work out how to apply those solutions to my own model.
The warnings are as follows:
Warning messages:
1: In checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, ...:
Model failed to converge with max|grad| = 0.00827217 (tol = 0.001, component 1)
2: In checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, ... :
Model is nearly unidentifiable: very large eigenvalue
- Rescale variables?
I have included my data and reproducible code below:
library(lme4)
y <- structure(list(Site = structure(c(3L, 3L, 3L, 2L, 2L, 2L, 1L,
1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L,
2L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L, 1L, 1L, 1L), .Label = c("Gar",
"Ing", "Mal"), class = "factor"), Treatment = structure(c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("Cag", "Hshg", "Lshg", "Nog"), class = "factor"),
Struc = c(72.28753968, 70.31960317, 57.22154762, 43.36305556,
55.09400794, 48.89420635, 68.52, 75.9256746, 65.8225, 43.45376984,
52.08833333, 30.52218254, 23.37588999, 21.27666667, 17.72349206,
26.48095238, 32.26884921, 40.57555556, 19.39702381, 15.24972222,
13.3684127, 32.06484127, 31.15543651, 31.92099206, 37.10579365,
39.91400794, 56.26515873, 11.0725, 15.39166667, 14.11166667,
20.03650794, 18.80099206, 21.7102381, 17.47722222, 15.18083333,
12.2865873), Ab = c(393L, 554L, 317L, 249L, 455L, 371L, 488L,
353L, 544L, 393L, 443L, 475L, 608L, 715L, 583L, 650L, 433L,
428L, 447L, 402L, 411L, 541L, 692L, 632L, 589L, 564L, 569L,
457L, 386L, 527L, 251L, 454L, 471L, 394L, 231L, 378L)), .Names = c("Site",
"Treatment", "Struc", "Ab"), class = "data.frame", row.names = c("BU1",
"BU2", "BU3", "JU1", "JU2", "JU3", "GU1", "GU2", "GU3", "KC1",
"KC2", "KC3", "HC1", "HC2", "HC3", "GC1", "GC2", "GC3", "OL1",
"OL2", "OL3", "ML1", "ML2", "ML3", "GL1", "GL2", "GL3", "TH1",
"TH2", "TH3", "KH1", "KH2", "KH3", "GH1", "GH2", "GH3"))
abglmm.nb <- glmer.nb(Ab ~ Treatment * Struc + (1 | Site), data = y)
Any help would be greatly appreciated.

Related

Is there any performance downside when passing a struct to a kernel?

I have a kernel that takes several arrays as input. To improve readability it would be nice to group them into a struct and (after proper memory allocation and copy for each input) pass the struct to the kernel instead of the long list of pointers.
Memory-wise, will accessing the arrays inside the kernel behave the same in the two cases?
Can anyone recommend some documentation on this topic? (I couldn't find it in the programming guide.)
No, there should be no difference. You can read the PTX output to make sure. Here is a simple example:
struct Foo
{
    int *a, *b, *c;
};

__global__ void bar(Foo f)
{ f.a[0] = f.b[0] + f.c[0]; }

__global__ void baz(int *a, int *b, int *c)
{ a[0] = b[0] + c[0]; }

struct Quz
{
    int *a, *b, *c;
    ~Quz() {}
};

__global__ void quuz(Quz f)
{ f.a[0] = f.b[0] + f.c[0]; }
And here is the PTX assembly. Note how there is basically no difference between the functions.
.visible .entry _Z3bar3Foo(
.param .align 8 .b8 _Z3bar3Foo_param_0[24]
)
{
.reg .b32 %r<4>;
.reg .b64 %rd<7>;
ld.param.u64 %rd1, [_Z3bar3Foo_param_0+16];
ld.param.u64 %rd2, [_Z3bar3Foo_param_0+8];
ld.param.u64 %rd3, [_Z3bar3Foo_param_0];
cvta.to.global.u64 %rd4, %rd3;
cvta.to.global.u64 %rd5, %rd2;
cvta.to.global.u64 %rd6, %rd1;
ld.global.u32 %r1, [%rd5];
ld.global.u32 %r2, [%rd6];
add.s32 %r3, %r2, %r1;
st.global.u32 [%rd4], %r3;
ret;
}
.visible .entry _Z3bazPiS_S_(
.param .u64 _Z3bazPiS_S__param_0,
.param .u64 _Z3bazPiS_S__param_1,
.param .u64 _Z3bazPiS_S__param_2
)
{
.reg .b32 %r<4>;
.reg .b64 %rd<7>;
ld.param.u64 %rd1, [_Z3bazPiS_S__param_0];
ld.param.u64 %rd2, [_Z3bazPiS_S__param_1];
ld.param.u64 %rd3, [_Z3bazPiS_S__param_2];
cvta.to.global.u64 %rd4, %rd1;
cvta.to.global.u64 %rd5, %rd3;
cvta.to.global.u64 %rd6, %rd2;
ld.global.u32 %r1, [%rd6];
ld.global.u32 %r2, [%rd5];
add.s32 %r3, %r2, %r1;
st.global.u32 [%rd4], %r3;
ret;
}
.visible .entry _Z4quuz3Quz(
.param .align 8 .b8 _Z4quuz3Quz_param_0[24]
)
{
.reg .b32 %r<4>;
.reg .b64 %rd<7>;
ld.param.u64 %rd1, [_Z4quuz3Quz_param_0+16];
ld.param.u64 %rd2, [_Z4quuz3Quz_param_0+8];
ld.param.u64 %rd3, [_Z4quuz3Quz_param_0];
cvta.to.global.u64 %rd4, %rd3;
cvta.to.global.u64 %rd5, %rd2;
cvta.to.global.u64 %rd6, %rd1;
ld.global.u32 %r1, [%rd5];
ld.global.u32 %r2, [%rd6];
add.s32 %r3, %r2, %r1;
st.global.u32 [%rd4], %r3;
ret;
}
It all works the same because CUDA places all kernel arguments in "constant memory" and accesses them through specialized load instructions that go through the "constant cache."
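As a usage illustration, a minimal host-side sketch of filling such a struct of device pointers and passing it by value to the kernel might look like the following (the Foo/bar definitions repeat the example above so the sketch compiles on its own; the array length n and the cudaMemset initialization are assumptions made purely for illustration):

#include <cuda_runtime.h>

struct Foo { int *a, *b, *c; };

// Same kernel shape as bar() above: reads b[0] and c[0], writes a[0].
__global__ void bar(Foo f) { f.a[0] = f.b[0] + f.c[0]; }

int main() {
    const int n = 16;                      // assumed array length
    Foo f;                                 // host copy holding device pointers
    cudaMalloc(&f.a, n * sizeof(int));
    cudaMalloc(&f.b, n * sizeof(int));
    cudaMalloc(&f.c, n * sizeof(int));
    cudaMemset(f.b, 0, n * sizeof(int));   // assumed initialization
    cudaMemset(f.c, 0, n * sizeof(int));
    bar<<<1, 1>>>(f);                      // the struct is copied into the kernel's .param space
    cudaDeviceSynchronize();
    cudaFree(f.a); cudaFree(f.b); cudaFree(f.c);
    return 0;
}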

Warp shuffling for CUDA

I need to make a warp shuffle that looks like this:
In this picture, the number of threads is limited to 8 to make it readable.
If I read the NVIDIA SDK and PTX manual, the shuffle instruction should do the job, especially the shfl.idx.b32 d[|p], a, b, c; PTX instruction.
From the manual I read:
Each thread in the currently executing warp will compute a source lane
index j based on input operands b and c and the mode. If the computed
source lane index j is in range, the thread will copy the input operand
a from lane j into its own destination register d;
So, providing proper values of b and c, I should be able to do it by writing a function like this (inspired by the CUDA SDK __shfl primitive implementation).
__forceinline__ __device__ float shuffle(float var){
    float ret;
    int srcLane = ???
    int c = ???
    asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(srcLane), "r"(c));
    return ret;
}
If it is possible, what are the constants for srcLane and c? I am not able to determine them (I am using CUDA 8.0).
Best,
Timocafe
I would recommend doing this with the CUDA intrinsic rather than with PTX (or inline asm). However, the following code demonstrates both methods:
// cat t54.cu
#include <stdio.h>
__global__ void k(){
    int i = threadIdx.x;
    int j = i;
    if (i<4) j*=2;
    if ((i>3) && (i<8)) j-=(7-i);
    int k = __shfl_sync(0x0FFU, i+100, j);
    printf("lane: %d, result: %d\n", i, k);
}

__forceinline__ __device__ float shuffle(float var, int lane){
    float ret;
    int srcLane = lane;
    int c = 0x1F;
    asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(srcLane), "r"(c));
    return ret;
}

__global__ void k1(){
    int i = threadIdx.x;
    int j = i;
    if (i<4) j*=2;
    if ((i>3) && (i<8)) j-=(7-i);
    float k = shuffle((float)(i+100), j);
    printf("lane: %d, result: %f\n", i, k);
}

int main(){
    k<<<1,8>>>();
    cudaDeviceSynchronize();
    k1<<<1,8>>>();
    cudaDeviceSynchronize();
}
$ nvcc -arch=sm_35 -o t54 t54.cu
$ cuda-memcheck ./t54
========= CUDA-MEMCHECK
lane: 0, result: 100
lane: 1, result: 102
lane: 2, result: 104
lane: 3, result: 106
lane: 4, result: 101
lane: 5, result: 103
lane: 6, result: 105
lane: 7, result: 107
lane: 0, result: 100.000000
lane: 1, result: 102.000000
lane: 2, result: 104.000000
lane: 3, result: 106.000000
lane: 4, result: 101.000000
lane: 5, result: 103.000000
lane: 6, result: 105.000000
lane: 7, result: 107.000000
========= ERROR SUMMARY: 0 errors
$
Using the CUDA intrinsic (the first method) the only real task is to compute the source lane index. Based on your pattern I wrote some code to do that and put it in the variable j.
Robert has already satisfactorily answered this question. I had implemented the code below, which shows a permutation of a full warp.
#include <stdio.h>
#include <stdlib.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { getchar(); exit(code); }
    }
}

__global__ void shufflingKernel(double *d_data, double *d_result, int *d_perm){
    unsigned mask = __activemask();
    int tid = threadIdx.x;
    int srcLane = d_perm[tid];
    double var = d_data[tid];
    //d_result[tid] = __shfl_sync(0xFFFFFFFF, var, srcLane);
    d_result[tid] = __shfl_sync(mask, var, srcLane);
}

int main(){
    const int N = 32;
    double h_data[32] = { 3.4, 42.2, 2., -1., 10., 11., 2., -1., 10., 33., 2.3, 11., 44., 0., -33., -21.,
                          4.4, 43.2, 3., -2., 13., 15., 222., -90., 17., 30., 11.3, 7., 22., 100., -30., -91. };
    double *h_result = (double *)malloc(N * sizeof(double));
    int h_perm[32] = { 6, 11, 9, 2, 5, 23, 31, 0, 3, 27, 29, 1, 28, 30, 17, 13, 10, 8, 4, 22, 7, 18, 24, 12, 20,
                       19, 16, 26, 21, 15, 25, 14 };
    int *d_perm; gpuErrchk(cudaMalloc(&d_perm, N * sizeof(int)));
    double *d_data; gpuErrchk(cudaMalloc(&d_data, N * sizeof(double)));
    double *d_result; gpuErrchk(cudaMalloc(&d_result, N * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_perm, &h_perm[0], N * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_data, &h_data[0], N * sizeof(double), cudaMemcpyHostToDevice));
    shufflingKernel<<<1, 32>>>(d_data, d_result, d_perm);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(h_result, d_result, N * sizeof(double), cudaMemcpyDeviceToHost));
    for (int k = 0; k < N; k++) {
        printf("k = %d; Original = %f; New = %f; Check = %f\n", k, h_data[k], h_result[k], h_data[h_perm[k]]);
    }
}
Notice that, instead of using 0xFFFFFFFF for the mask of active threads, it is safer to use the warp-level primitive __activemask(), as discussed in "Shuffle instruction in CUDA not working".
What you are trying to do in your shuffle operation is to dynamically index the source lanes on which the shuffle operates. One needs to understand that any variation of the shuffle command (__shfl, __shfl_up, __shfl_down, __shfl_xor) needs a constant value for its second parameter, and this parameter is the same for all lanes within a warp. You can play with the grouping of threads within a warp by specifying width. Thus, for example, by specifying
float var = ...
__shfl_xor(var, 3, 4);
the lane permutation will look like:
lane:        0 1 2 3
source lane: 3 2 1 0
So, to answer your question, it's not possible to do it with a single __shfl operation of any kind, but you can implement it by combining several __shfl calls with different second parameters.
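For reference, here is a minimal sketch (not from the answers above) of the width-4 XOR pattern just described, written with the newer _sync variant of the intrinsic; the mask 0xFF matches the 8 launched threads:

#include <cstdio>

__global__ void xorDemo() {
    int lane = threadIdx.x;
    // Within each group of 4 lanes, lane i reads from lane i ^ 3,
    // i.e. 0 1 2 3 -> 3 2 1 0 as in the diagram above.
    int src = __shfl_xor_sync(0xFFu, lane, 3, 4);
    printf("lane %d read from lane %d\n", lane, src);
}

int main() {
    xorDemo<<<1, 8>>>();
    cudaDeviceSynchronize();
    return 0;
}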

Why doesn't the nvrtc compiler emit these NVVM code fragments to PTX?

I have some NVVM code that I am trying to compile to PTX using nvrtc (i.e. using nvvmCompileProgram, nvvmGetCompiledResult).
Here is the nvvm code:
; ModuleID = 'test_warp_reduce'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-cuda"
define ptx_kernel void @lambda_crit_4197([0 x float]* %_4200_4590, [0 x i64]* %_4201_4591, [0 x float]* %_4202_4592) {
acc_bidx:
%0 = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%1 = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
%2 = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
%3 = mul nsw i32 %2, %1
%4 = add nsw i32 %3, %0
%5 = icmp slt i32 %4, 32
br i1 %5, label %if_then12, label %next
if_then12: ; preds = %acc_bidx
%6 = getelementptr inbounds [0 x float]* %_4202_4592, i64 0, i32 %4
%7 = load float* %6
%8 = tail call i64 @clock()
%9 = tail call float @reduce_step(float %7, i32 1, i32 31)
%10 = tail call float @reduce_step(float %9, i32 2, i32 31)
%11 = tail call float @reduce_step(float %10, i32 4, i32 31)
%12 = tail call float @reduce_step(float %11, i32 8, i32 31)
%13 = tail call float @reduce_step(float %12, i32 16, i32 31)
%14 = tail call i64 @clock()
%15 = getelementptr inbounds [0 x float]* %_4200_4590, i64 0, i32 %4
%16 = getelementptr inbounds [0 x i64]* %_4201_4591, i64 0, i32 %0
%17 = sub nsw i64 %14, %8
store i64 %17, i64* %16
store float %13, float* %15
br label %next
next: ; preds = %acc_bidx, %if_then12
ret void
}
declare i64 @llvm.nvvm.texsurf.handle.p1i64(metadata, i64 addrspace(1)*)
; Function Attrs: nounwind readnone
declare ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; Function Attrs: nounwind readnone
declare ptx_device i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
; Function Attrs: nounwind readnone
declare ptx_device i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
define i64 @clock() {
%1 = call i64 asm "mov.u32 $0, %clock;", "=r" ()
ret i64 %1
}
define float @reduce_step(float %a, i32 %b, i32 %c) {
%1 = call float asm
"{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, $1, $2, $3;
@p add.f32 r0, r0, $1;
mov.f32 $0, r0;
}", "=f, f, r, r" (float %a, i32 %b, i32 %c)
ret float %1
}
!nvvmir.version = !{!0}
!nvvm.annotations = !{!1}
!0 = metadata !{i32 1, i32 2}
!1 = metadata !{void ([0 x float]*, [0 x i64]*, [0 x float]*)* @lambda_crit_4197, metadata !"kernel", i64 1}
And here is the generated ptx code:
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19324574
// Cuda compilation tools, release 7.0, V7.0.27
// Based on LLVM 3.4svn
//
.version 4.2
.target sm_52
.address_size 64
// .globl lambda_crit_4197
.visible .entry lambda_crit_4197(
.param .u64 lambda_crit_4197_param_0,
.param .u64 lambda_crit_4197_param_1,
.param .u64 lambda_crit_4197_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<11>;
.reg .s32 %r<15>;
.reg .s64 %rd<13>;
ld.param.u64 %rd1, [lambda_crit_4197_param_0];
ld.param.u64 %rd2, [lambda_crit_4197_param_1];
ld.param.u64 %rd3, [lambda_crit_4197_param_2];
mov.u32 %r1, %tid.x;
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mad.lo.s32 %r2, %r3, %r4, %r1;
setp.gt.s32 %p1, %r2, 31;
@%p1 bra BB0_2;
cvta.to.global.u64 %rd4, %rd3;
mul.wide.s32 %rd5, %r2, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.f32 %f2, [%rd6];
mov.u32 %r5, 1;
mov.u32 %r14, 31;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f2, %r5, %r14;
@p add.f32 r0, r0, %f2;
mov.f32 %f1, r0;
}
// inline asm
mov.u32 %r7, 2;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f1, %r7, %r14;
@p add.f32 r0, r0, %f1;
mov.f32 %f3, r0;
}
// inline asm
mov.u32 %r9, 4;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f3, %r9, %r14;
@p add.f32 r0, r0, %f3;
mov.f32 %f5, r0;
}
// inline asm
mov.u32 %r11, 8;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f5, %r11, %r14;
@p add.f32 r0, r0, %f5;
mov.f32 %f7, r0;
}
// inline asm
mov.u32 %r13, 16;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f7, %r13, %r14;
@p add.f32 r0, r0, %f7;
mov.f32 %f9, r0;
}
// inline asm
cvta.to.global.u64 %rd7, %rd1;
add.s64 %rd8, %rd7, %rd5;
cvta.to.global.u64 %rd9, %rd2;
mul.wide.s32 %rd10, %r1, 8;
add.s64 %rd11, %rd9, %rd10;
mov.u64 %rd12, 0;
st.global.u64 [%rd11], %rd12;
st.global.f32 [%rd8], %f9;
BB0_2:
ret;
}
// .globl clock
.visible .func (.param .b64 func_retval0) clock(
)
{
.reg .s32 %r<2>;
.reg .s64 %rd<2>;
// inline asm
mov.u32 %r1, %clock;
// inline asm
cvt.u64.u32 %rd1, %r1;
st.param.b64 [func_retval0+0], %rd1;
ret;
}
// .globl reduce_step
.visible .func (.param .b32 func_retval0) reduce_step(
.param .b32 reduce_step_param_0,
.param .b32 reduce_step_param_1,
.param .b32 reduce_step_param_2
)
{
.reg .f32 %f<3>;
.reg .s32 %r<3>;
ld.param.f32 %f2, [reduce_step_param_0];
ld.param.u32 %r1, [reduce_step_param_1];
ld.param.u32 %r2, [reduce_step_param_2];
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f2, %r1, %r2;
@p add.f32 r0, r0, %f2;
mov.f32 %f1, r0;
}
// inline asm
st.param.f32 [func_retval0+0], %f1;
ret;
}
It seems that the NVVM compiler just eliminates code for mysterious reasons. For example, the calls to the clock function weren't emitted at all.
Whether or not I use compiler optimization makes no difference in the provided code.
Somebody told me that CUDA 7.5 had some similar issues (assembly not being emitted) on Windows, so I downgraded to 7.0. However, the problem is still there.
Any clue why this might be the case?
I can tell from experience that the emitted PTX only calls builtin functions; user-defined functions get inlined into the calling functions.
I can't seem to find the proper documentation for it right now, but I'll link it in when I find it.
In your generated code there are many places where code segments like this repeat:
// inline asm
mov.u32 %r7, 2;
// inline asm
{ .reg .pred p;
.reg .f32 r0;
shfl.down.b32 r0|p, %f1, %r7, %r14;
@p add.f32 r0, r0, %f1;
mov.f32 %f3, r0;
}
Does this look familiar? The first one is coming from clock, the second block is coming from reduce_step.
TL;DR: You don't see the calls because they got inlined.
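As an aside (not part of the original question or answer), in CUDA C++ inlining can be discouraged with the __noinline__ qualifier, in which case the function typically survives as a separate .func in the PTX (inspect it with nvcc -ptx); whether the hint is honored is ultimately up to the compiler. A minimal sketch:

// addOne is kept out of line as a hint; the call is then typically
// visible in the emitted PTX instead of being folded into demo().
__noinline__ __device__ int addOne(int x) { return x + 1; }

__global__ void demo(int *out) {
    out[threadIdx.x] = addOne(threadIdx.x);
}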

NVCC ignoring CUDA code?

I have just installed CUDA 5.5 on my notebook and am trying out NVCC to compile a basic hello world program from this link: http://computer-graphics.se/hello-world-for-cuda.html
The code I'm trying out is this:
// This is the REAL "hello world" for CUDA!
// It takes the string "Hello ", prints it, then passes it to CUDA with an array
// of offsets. Then the offsets are added in parallel to produce the string "World!"
// By Ingemar Ragnemalm 2010
#include <stdio.h>
#include <stdlib.h>

const int N = 16;
const int blocksize = 16;

__global__
void hello(char *a, int *b)
{
    a[threadIdx.x] += b[threadIdx.x];
}

int main()
{
    char a[N] = "Hello \0\0\0\0\0\0";
    int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    char *ad;
    int *bd;
    const int csize = N*sizeof(char);
    const int isize = N*sizeof(int);

    printf("%s", a);

    cudaMalloc( (void**)&ad, csize );
    cudaMalloc( (void**)&bd, isize );
    cudaMemcpy( ad, a, csize, cudaMemcpyHostToDevice );
    cudaMemcpy( bd, b, isize, cudaMemcpyHostToDevice );

    dim3 dimBlock( blocksize, 1 );
    dim3 dimGrid( 1, 1 );
    hello<<<dimGrid, dimBlock>>>(ad, bd);
    cudaMemcpy( a, ad, csize, cudaMemcpyDeviceToHost );
    cudaFree( ad );
    cudaFree( bd );

    printf("%s\n", a);
    return EXIT_SUCCESS;
}
It is supposed to print out "Hello World!", but after I compiled using "nvcc hello.cu -o a.out", my output is "Hello Hello". Can someone tell me what is going on?
This was caused by a broken CUDA driver installation. A corrected installation allowed what was otherwise correct code to run without error.
[This community wiki entry was assembled from comments to get this question off the unanswered queue]
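For what it's worth, a minimal error-checking sketch (an illustration added here, not part of the original question) shows how such an installation problem would surface immediately, since a broken driver typically makes the very first runtime call fail:

#include <stdio.h>
#include <stdlib.h>

// Abort with a message on any CUDA runtime error.
#define CHECK(call)                                                        \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",                   \
                    cudaGetErrorString(err_), __FILE__, __LINE__);         \
            exit(1);                                                       \
        }                                                                  \
    } while (0)

__global__ void hello(char *a, int *b) { a[threadIdx.x] += b[threadIdx.x]; }

int main() {
    char *ad; int *bd;
    CHECK(cudaMalloc((void**)&ad, 16 * sizeof(char)));
    CHECK(cudaMalloc((void**)&bd, 16 * sizeof(int)));
    hello<<<1, 16>>>(ad, bd);
    CHECK(cudaGetLastError());        // reports launch errors
    CHECK(cudaDeviceSynchronize());   // reports execution errors
    CHECK(cudaFree(ad));
    CHECK(cudaFree(bd));
    printf("all CUDA calls succeeded\n");
    return 0;
}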

Solving general sparse linear systems in CUDA

I am currently working on CUDA and trying to solve Ax = b using the cuBLAS and cuSPARSE libraries. I looked through the sample codes, including conjugateGradient and conjugateGradientPrecond provided by NVIDIA. However, the conjugate gradient method only works for positive definite matrices, and it is an iterative method. Now, I have some general sparse matrices and I think I should take advantage of the cuSPARSE library. Does anyone know how I can solve Ax = b using the cuSPARSE and cuBLAS libraries? I could not find useful APIs for me. Generally, the matrices are expected to be at least 1000x1000, and in some cases they would go up to 100000x100000. Should I do this using a direct method?
One possibility to solve general sparse linear systems in CUDA is using cuSOLVER.
cuSOLVER has three useful routines:
cusolverSpDcsrlsvlu, which works for square linear systems (number of unknowns equal to the number of equations) and internally uses sparse LU factorization with partial pivoting;
cusolverSpDcsrlsvqr, which works for square linear systems (number of unknowns equal to the number of equations) and internally uses sparse QR factorization;
cusolverSpDcsrlsqvqr, which works for rectangular linear systems (number of unknowns different from the number of equations) and internally solves a least squares problem.
For ALL the above routines, the supported matrix type is CUSPARSE_MATRIX_TYPE_GENERAL. If A is symmetric/Hermitian and only lower/upper part is used or meaningful, then its missing upper/lower part must be extended.
NOTES ON cusolverSpDcsrlsvlu
Attention should be paid to two input parameters: tol and reorder. Concerning the former, if the system matrix A is singular, then some diagonal elements of the matrix U of the LU decomposition are zero. The algorithm decides for zero if |U(j,j)|<tol. Concerning the latter, cuSOLVER provides a reordering to reduce zero fill-in, which dramatically affects the performance of the LU factorization. reorder toggles between reordering (reorder=1) and not reordering (reorder=0).
Attention should be paid also to an output parameter: singularity. It is -1 if A is invertible, otherwise it provides the first index j such that U(j,j)=0.
NOTES ON cusolverSpDcsrlsvqr
Attention should be paid to the same input/output parameters as before. In particular, tol is used to decide for singularity, reorder has no effect, and singularity is -1 if A is invertible; otherwise it returns the first index j such that R(j,j)=0.
NOTES ON cusolverSpDcsrlsqvqr
Attention should be paid to the input parameter tol, which is used to decide the rank of A.
Attention should also be paid to the output parameters: rankA, which represents the numerical rank of A; p, a permutation vector of length equal to the number of columns of A (please see the documentation for further details); and min_norm, which is the norm of the residual ||Ax - b||.
Currently, as of CUDA 10.0, the above three functions are for the host channel only, which means that they do not yet run on GPU. They must be called as:
cusolverSpDcsrlsvluHost;
cusolverSpDcsrlsvqrHost;
cusolverSpDcsrlsqvqrHost,
and the input arguments should all reside on the host.
Below, please find a fully worked example using all the above three possibilities:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cusparse.h>
#include <cusolverSp.h>
/*******************/
/* iDivUp FUNCTION */
/*******************/
//extern "C" int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
__host__ __device__ int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
static const char *_cusolverGetErrorEnum(cusolverStatus_t error)
{
switch (error)
{
case CUSOLVER_STATUS_SUCCESS:
return "CUSOLVER_SUCCESS";
case CUSOLVER_STATUS_NOT_INITIALIZED:
return "CUSOLVER_STATUS_NOT_INITIALIZED";
case CUSOLVER_STATUS_ALLOC_FAILED:
return "CUSOLVER_STATUS_ALLOC_FAILED";
case CUSOLVER_STATUS_INVALID_VALUE:
return "CUSOLVER_STATUS_INVALID_VALUE";
case CUSOLVER_STATUS_ARCH_MISMATCH:
return "CUSOLVER_STATUS_ARCH_MISMATCH";
case CUSOLVER_STATUS_EXECUTION_FAILED:
return "CUSOLVER_STATUS_EXECUTION_FAILED";
case CUSOLVER_STATUS_INTERNAL_ERROR:
return "CUSOLVER_STATUS_INTERNAL_ERROR";
case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
}
return "<unknown>";
}
inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
    if (CUSOLVER_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUSOLVE error in file '%s', line %d, error: %s\nterminating!\n", file, line,
                _cusolverGetErrorEnum(err));
        assert(0);
    }
}
extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
    if (CUSPARSE_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUSPARSE error in file '%s', line %d, error %d: %s\nterminating!\n", file, line, err,
                _cusparseGetErrorEnum(err));
        cudaDeviceReset(); assert(0);
    }
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
const int Nrows = 4; // --- Number of rows
const int Ncols = 4; // --- Number of columns
const int N = Nrows;
// --- Host side dense matrix
double *h_A_dense = (double*)malloc(Nrows*Ncols*sizeof(*h_A_dense));
// --- Column-major ordering
h_A_dense[0] = 1.0f; h_A_dense[4] = 4.0f; h_A_dense[8] = 0.0f; h_A_dense[12] = 0.0f;
h_A_dense[1] = 0.0f; h_A_dense[5] = 2.0f; h_A_dense[9] = 3.0f; h_A_dense[13] = 0.0f;
h_A_dense[2] = 5.0f; h_A_dense[6] = 0.0f; h_A_dense[10] = 0.0f; h_A_dense[14] = 7.0f;
h_A_dense[3] = 0.0f; h_A_dense[7] = 0.0f; h_A_dense[11] = 9.0f; h_A_dense[15] = 0.0f;
//create device array and copy host to it
double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(*d_A_dense)));
gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));
// --- Descriptor for sparse matrix A
cusparseMatDescr_t descrA; cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
int nnz = 0; // --- Number of nonzero elements in dense matrix
const int lda = Nrows; // --- Leading dimension of dense matrix
// --- Device side number of nonzero elements per row
int *d_nnzPerVector; gpuErrchk(cudaMalloc(&d_nnzPerVector, Nrows * sizeof(*d_nnzPerVector)));
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, &nnz));
// --- Host side number of nonzero elements per row
int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(*h_nnzPerVector));
gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(*h_nnzPerVector), cudaMemcpyDeviceToHost));
printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
printf("\n");
// --- Device side dense matrix
double *d_A; gpuErrchk(cudaMalloc(&d_A, nnz * sizeof(*d_A)));
int *d_A_RowIndices; gpuErrchk(cudaMalloc(&d_A_RowIndices, (Nrows + 1) * sizeof(*d_A_RowIndices)));
int *d_A_ColIndices; gpuErrchk(cudaMalloc(&d_A_ColIndices, nnz * sizeof(*d_A_ColIndices)));
cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, d_A, d_A_RowIndices, d_A_ColIndices));
// --- Host side dense matrix
double *h_A = (double *)malloc(nnz * sizeof(*h_A));
int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(*h_A_RowIndices));
int *h_A_ColIndices = (int *)malloc(nnz * sizeof(*h_A_ColIndices));
gpuErrchk(cudaMemcpy(h_A, d_A, nnz*sizeof(*h_A), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));
for (int i = 0; i < nnz; ++i) printf("A[%i] = %.0f ", i, h_A[i]); printf("\n");
for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");
for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);
// --- Allocating and defining dense host and device data vectors
double *h_y = (double *)malloc(Nrows * sizeof(double));
h_y[0] = 100.0; h_y[1] = 200.0; h_y[2] = 400.0; h_y[3] = 500.0;
double *d_y; gpuErrchk(cudaMalloc(&d_y, Nrows * sizeof(double)));
gpuErrchk(cudaMemcpy(d_y, h_y, Nrows * sizeof(double), cudaMemcpyHostToDevice));
// --- Allocating the host and device side result vector
double *h_x = (double *)malloc(Ncols * sizeof(double));
double *d_x; gpuErrchk(cudaMalloc(&d_x, Ncols * sizeof(double)));
// --- CUDA solver initialization
cusolverSpHandle_t solver_handle;
cusolverSpCreate(&solver_handle);
// --- Using LU factorization
int singularity;
cusolveSafeCall(cusolverSpDcsrlsvluHost(solver_handle, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, 0.000001, 0, h_x, &singularity));
// --- Using QR factorization
//cusolveSafeCall(cusolverSpDcsrlsvqrHost(solver_handle, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, 0.000001, 0, h_x, &singularity));
//int rankA;
//int *p = (int *)malloc(N * sizeof(int));
//double min_norm;
//cusolveSafeCall(cusolverSpDcsrlsqvqrHost(solver_handle, N, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, 0.000001, &rankA, h_x, p, &min_norm));
printf("Showing the results...\n");
for (int i = 0; i < N; i++) printf("%f\n", h_x[i]);
}