I am wondering how I should implement a soft thresholding kernel in CUDA. The soft thresholding function is the following:

y = sign(x) * max(|x| - lambda, 0)

where lambda is the threshold and x is the input vector or matrix. Suppose both are real.
I would prefer an existing code sample that runs reliably, and am reluctant to start from scratch. Has anyone done this before, or does anyone know where I can find good sample code?
The two solutions proposed in the comments above, set up for elementwise processing, are the following:
__global__ void myKernel1(float* __restrict__ x, float lambda, const int N)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        float xa = fabs(x[tid]);
        // soft threshold: shrink |x| by lambda, zeroing anything below the threshold
        x[tid] = (xa > lambda) ? x[tid] * ((xa - lambda) / xa) : 0;
    }
}
and
__global__ void myKernel2(float* __restrict__ x, float lambda, const int N)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        float xa = fabs(x[tid]);
        // branchless variant: signbit(lambda - xa) is 1 exactly when |x| > lambda,
        // and copysign(xa - lambda, x[tid]) restores the sign of the input
        x[tid] = signbit(lambda - xa) * copysign(xa - lambda, x[tid]);
    }
}
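For completeness, a minimal host-side driver for either kernel could look like the sketch below (the problem size, threshold value, and launch configuration are arbitrary placeholders; error checking omitted):

#include <cuda_runtime.h>

int main()
{
    const int N = 1 << 20;        // hypothetical problem size
    const float lambda = 0.5f;    // hypothetical threshold
    float *d_x;
    cudaMalloc(&d_x, N * sizeof(float));
    // ... copy the input data to d_x with cudaMemcpy ...
    const int blockSize = 256;
    const int gridSize = (N + blockSize - 1) / blockSize;
    myKernel1<<<gridSize, blockSize>>>(d_x, lambda, N);   // or myKernel2
    cudaDeviceSynchronize();
    cudaFree(d_x);
    return 0;
}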
The disassembled codes for the two solutions are reported below. As noticed by @njuffa, the second one seems in principle less burdened than the first, since it lacks the x/|x| division. However, as also noticed by @njuffa, this scenario will likely be memory bound rather than compute bound. Still, this analysis perhaps indicates that the second solution is preferable when implemented as a __device__ function for non-elementwise computations.
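For non-elementwise use, the second solution recast as a __device__ helper might look like the following sketch (the name soft_threshold is hypothetical):

// Branchless soft thresholding of a single value, suitable for reuse
// inside larger kernels. Sketch only.
__device__ __forceinline__ float soft_threshold(float x, float lambda)
{
    float xa = fabsf(x);
    return signbit(lambda - xa) * copysignf(xa - lambda, x);
}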
DISASSEMBLED CODE FOR THE FIRST SOLUTION
code for sm_21
Function : _Z9myKernel1Pffi
.headerflags #"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0010*/ S2R R3, SR_TID.X; /* 0x2c0000008400dc04 */
/*0018*/ IMAD R0, R0, c[0x0][0x8], R3; /* 0x2006400020001ca3 */
/*0020*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x2c], PT; /* 0x1b0e4000b001dc23 */
/*0028*/ #P0 EXIT ; /* 0x80000000000001e7 */
/*0030*/ MOV32I R3, 0x4; /* 0x180000001000dde2 */
/*0038*/ SSY 0x90; /* 0x6000000140000007 */
/*0040*/ IMAD R16.CC, R0, R3, c[0x0][0x20]; /* 0x2007800080041ca3 */
/*0048*/ IMAD.HI.X R17, R0, R3, c[0x0][0x24]; /* 0x2086800090045ce3 */
/*0050*/ LD.E R2, [R16]; /* 0x8400000001009c85 */
/*0058*/ FSETP.GT.AND P0, PT, |R2|, c[0x0][0x28], PT; /* 0x220e4000a021dc80 */
/*0060*/ F2F.F32.F32 R5, |R2|; /* 0x1000000009215c44 */
/*0068*/ #P0 BRA 0x78; /* 0x40000000200001e7 */
/*0070*/ MOV.S R0, RZ; /* 0x28000000fc001df4 */
/*0078*/ FADD R4, |R2|, -c[0x0][0x28]; /* 0x50004000a0211d80 */
/*0080*/ JCAL 0x0; /* 0x1000000000010007 */
/*0088*/ FMUL.S R0, R2, R4; /* 0x5800000010201c10 */
/*0090*/ ST.E [R16], R0; /* 0x9400000001001c85 */
/*0098*/ EXIT ; /* 0x8000000000001de7 */
.................................
Function : __cuda_sm20_div_rn_noftz_f32_slowpath
.headerflags #"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ SHL R0, R4, 0x1; /* 0x6000c00004401c03 */
/*0008*/ MOV32I R6, 0x1; /* 0x1800000004019de2 */
/*0010*/ SHL R3, R5, 0x1; /* 0x6000c0000450dc03 */
/*0018*/ IMAD.U32.U32.HI R0, R0, 0x100, -R6; /* 0x200cc00400001d43 */
/*0020*/ ISETP.GT.U32.AND P0, PT, R0, 0xfd, PT; /* 0x1a0ec003f401dc03 */
/*0028*/ IMAD.U32.U32.HI R3, R3, 0x100, -R6; /* 0x200cc0040030dd43 */
/*0030*/ ISETP.GT.U32.OR P0, PT, R3, 0xfd, P0; /* 0x1a20c003f431dc03 */
/*0038*/ #!P0 BRA 0x178; /* 0x40000004e00021e7 */
/*0040*/ FSETP.LE.AND P0, PT, |R4|, +INF , PT; /* 0x218edfe00041dc80 */
/*0048*/ #!P0 BRA 0x60; /* 0x40000000400021e7 */
/*0050*/ FSETP.LE.AND P0, PT, |R5|, +INF , PT; /* 0x218edfe00051dc80 */
/*0058*/ #P0 BRA 0x70; /* 0x40000000400001e7 */
/*0060*/ FADD R4, R4, R5; /* 0x5000000014411c00 */
/*0068*/ BRA 0x370; /* 0x4000000c00001de7 */
/*0070*/ SHL R7, R5, 0x1; /* 0x6000c0000451dc03 */
/*0078*/ SHL R6, R4, 0x1; /* 0x6000c00004419c03 */
/*0080*/ ISETP.EQ.U32.AND P2, PT, R7, RZ, PT; /* 0x190e0000fc75dc03 */
/*0088*/ ISETP.EQ.U32.AND P1, PT, R6, RZ, PT; /* 0x190e0000fc63dc03 */
/*0090*/ PSETP.AND.AND P0, PT, P1, P2, PT; /* 0x0c0e00000811dc04 */
/*0098*/ #P0 BRA 0xc0; /* 0x40000000800001e7 */
/*00a0*/ FSETP.EQ.AND P3, PT, |R4|, +INF , PT; /* 0x210edfe00047dc80 */
/*00a8*/ FSETP.EQ.AND P0, PT, |R5|, +INF , PT; /* 0x210edfe00051dc80 */
/*00b0*/ #!P3 BRA 0xd8; /* 0x4000000080002de7 */
/*00b8*/ #!P0 BRA 0xd8; /* 0x40000000600021e7 */
/*00c0*/ MOV32I R0, 0xffc00000; /* 0x1bff000000001de2 */
/*00c8*/ MUFU.RSQ R4, R0; /* 0xc800000014011c00 */
/*00d0*/ BRA 0x370; /* 0x4000000a60001de7 */
/*00d8*/ PSETP.OR.AND P0, PT, P0, P1, PT; /* 0x0c0e00004401dc04 */
/*00e0*/ #!P0 BRA 0x100; /* 0x40000000600021e7 */
/*00e8*/ LOP.XOR R0, R5, R4; /* 0x6800000010501c83 */
/*00f0*/ LOP32I.AND R4, R0, 0x80000000; /* 0x3a00000000011c02 */
/*00f8*/ BRA 0x370; /* 0x40000009c0001de7 */
/*0100*/ PSETP.OR.AND P0, PT, P3, P2, PT; /* 0x0c0e00004831dc04 */
/*0108*/ #!P0 BRA 0x130; /* 0x40000000800021e7 */
/*0110*/ LOP.XOR R0, R5, R4; /* 0x6800000010501c83 */
/*0118*/ LOP32I.AND R0, R0, 0x80000000; /* 0x3a00000000001c02 */
/*0120*/ LOP32I.OR R4, R0, 0x7f800000; /* 0x39fe000000011c42 */
/*0128*/ BRA 0x370; /* 0x4000000900001de7 */
/*0130*/ ISETP.GE.AND P1, PT, R0, RZ, PT; /* 0x1b0e0000fc03dc23 */
/*0138*/ ISETP.GE.AND P0, PT, R3, RZ, PT; /* 0x1b0e0000fc31dc23 */
/*0140*/ #!P1 MOV32I R6, 0xffffffc0; /* 0x1bffffff0001a5e2 */
/*0148*/ #!P1 FFMA R4, R4, 1.84467440737095520000e+019, RZ; /* 0x307ed7e000412400 */
/*0150*/ #P1 MOV R6, RZ; /* 0x28000000fc0185e4 */
/*0158*/ #P0 BRA 0x180; /* 0x40000000800001e7 */
/*0160*/ FFMA R5, R5, 1.84467440737095520000e+019, RZ; /* 0x307ed7e000515c00 */
/*0168*/ IADD R6, R6, 0x40; /* 0x4800c00100619c03 */
/*0170*/ BRA 0x180; /* 0x4000000020001de7 */
/*0178*/ MOV R6, RZ; /* 0x28000000fc019de4 */
/*0180*/ IADD R7, R3, -0x7e; /* 0x4800fffe0831dc03 */
/*0188*/ MOV32I R9, 0x3f800000; /* 0x18fe000000025de2 */
/*0190*/ ISCADD R7, -R7, R5, 0x17; /* 0x410000001471dee3 */
/*0198*/ ISUB R3, R0, R3; /* 0x480000000c00dd03 */
/*01a0*/ MUFU.RCP R8, R7; /* 0xc800000010721c00 */
/*01a8*/ IADD R5, R0, -0x7e; /* 0x4800fffe08015c03 */
/*01b0*/ FFMA R9, -R7, R8, R9; /* 0x3012000020725e00 */
/*01b8*/ ISCADD R4, -R5, R4, 0x17; /* 0x4100000010511ee3 */
/*01c0*/ FFMA R5, R8, R9, R8; /* 0x3010000024815c00 */
/*01c8*/ FFMA R8, R4, R5, RZ; /* 0x307e000014421c00 */
/*01d0*/ FFMA R9, -R7, R8, R4; /* 0x3008000020725e00 */
/*01d8*/ FFMA R8, R9, R5, R8; /* 0x3010000014921c00 */
/*01e0*/ FFMA R7, -R7, R8, R4; /* 0x300800002071de00 */
/*01e8*/ FFMA R4, R7, R5, R8; /* 0x3010000014711c00 */
/*01f0*/ SHL R9, R4, 0x1; /* 0x6000c00004425c03 */
/*01f8*/ SHR.U32 R9, R9, 0x18; /* 0x5800c00060925c03 */
/*0200*/ IADD R0, R3, R9; /* 0x4800000024301c03 */
/*0208*/ IADD R6, R6, R0; /* 0x4800000000619c03 */
/*0210*/ IADD R0, R6, -0x1; /* 0x4800fffffc601c03 */
/*0218*/ ISETP.GT.U32.AND P0, PT, R0, 0xfd, PT; /* 0x1a0ec003f401dc03 */
/*0220*/ #P0 BRA 0x240; /* 0x40000000600001e7 */
/*0228*/ ISUB R0, R6, R9; /* 0x4800000024601d03 */
/*0230*/ ISCADD R4, R0, R4, 0x17; /* 0x4000000010011ee3 */
/*0238*/ BRA 0x370; /* 0x40000004c0001de7 */
/*0240*/ ISETP.LE.AND P0, PT, R6, 0xfe, PT; /* 0x198ec003f861dc23 */
/*0248*/ #P0 BRA 0x268; /* 0x40000000600001e7 */
/*0250*/ LOP32I.AND R0, R4, 0x80000000; /* 0x3a00000000401c02 */
/*0258*/ LOP32I.OR R4, R0, 0x7f800000; /* 0x39fe000000011c42 */
/*0260*/ BRA 0x370; /* 0x4000000420001de7 */
/*0268*/ ISETP.GT.AND P0, PT, R6, RZ, PT; /* 0x1a0e0000fc61dc23 */
/*0270*/ #P0 BRA 0x370; /* 0x40000003e00001e7 */
/*0278*/ ISETP.GE.AND P0, PT, R6, -0x18, PT; /* 0x1b0effffa061dc23 */
/*0280*/ #P0 BRA 0x298; /* 0x40000000400001e7 */
/*0288*/ LOP32I.AND R4, R4, 0x80000000; /* 0x3a00000000411c02 */
/*0290*/ BRA 0x370; /* 0x4000000360001de7 */
/*0298*/ FFMA.RP R3, R7, R5, R8; /* 0x311000001470dc00 */
/*02a0*/ FFMA.RM R0, R7, R5, R8; /* 0x3090000014701c00 */
/*02a8*/ FFMA.RZ R5, R7, R5, R8; /* 0x3190000014715c00 */
/*02b0*/ FSET.NEU.AND R3, R0, R3, PT; /* 0x168e00000c00dc00 */
/*02b8*/ I2I.S32.S32 R7, -R6; /* 0x1c0000001921df84 */
/*02c0*/ LOP32I.AND R5, R5, 0x7fffff; /* 0x3801fffffc515c02 */
/*02c8*/ ISETP.EQ.AND P0, PT, R7, RZ, PT; /* 0x190e0000fc71dc23 */
/*02d0*/ LOP32I.AND R0, R4, 0x80000000; /* 0x3a00000000401c02 */
/*02d8*/ I2I.S32.S32 R3, -R3; /* 0x1c0000000d20df84 */
/*02e0*/ I2I.S32.S32 R4, -R6; /* 0x1c00000019211f84 */
/*02e8*/ LOP32I.OR R7, R5, 0x800000; /* 0x380200000051dc42 */
/*02f0*/ #P0 BRA.U 0x328; /* 0x40000000c00081e7 */
/*02f8*/ #!P0 IADD R5, R6, 0x20; /* 0x4800c00080616003 */
/*0300*/ #!P0 SHL R5, R7, R5; /* 0x6000000014716003 */
/*0308*/ #!P0 ICMP.EQ.U32 R5, RZ, 0x1, R5; /* 0x310ac00007f16003 */
/*0310*/ #!P0 SHR.U32 R7, R7, R4; /* 0x580000001071e003 */
/*0318*/ #!P0 LOP.OR R3, R3, R5; /* 0x680000001430e043 */
/*0320*/ NOP; /* 0x4000000000001de4 */
/*0328*/ SHL R4, R7, 0x1e; /* 0x6000c00078711c03 */
/*0330*/ SHR.U32 R5, R4, 0x1f; /* 0x5800c0007c415c03 */
/*0338*/ LOP.AND R4, R7, 0x1; /* 0x6800c00004711c03 */
/*0340*/ LOP.OR R3, R3, R5; /* 0x680000001430dc43 */
/*0348*/ LOP.AND R3, R4, R3; /* 0x680000000c40dc03 */
/*0350*/ SHR.U32 R4, R7, 0x1; /* 0x5800c00004711c03 */
/*0358*/ ISETP.NE.U32.AND P0, PT, R3, RZ, PT; /* 0x1a8e0000fc31dc03 */
/*0360*/ #P0 IADD R4, R4, 0x1; /* 0x4800c00004410003 */
/*0368*/ LOP.OR R4, R0, R4; /* 0x6800000010011c43 */
/*0370*/ RET ; /* 0x9000000000001de7 */
......................................................
Function : __cuda_sm20_div_rn_f32
.headerflags #"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MUFU.RCP R3, R5; /* 0xc80000001050dc00 */
/*0008*/ MOV32I R6, 0x3f800000; /* 0x18fe000000019de2 */
/*0010*/ LOP32I.AND R0, R4, 0x7fffff; /* 0x3801fffffc401c02 */
/*0018*/ FFMA.FTZ R6, -R5, R3, R6; /* 0x300c00000c519e40 */
/*0020*/ LOP32I.OR R0, R0, 0x3f800000; /* 0x38fe000000001c42 */
/*0028*/ FFMA.FTZ R3, R3, R6, R3; /* 0x300600001830dc40 */
/*0030*/ FFMA.FTZ R6, R0, R3, RZ; /* 0x307e00000c019c40 */
/*0038*/ FFMA.FTZ R7, -R5, R6, R0; /* 0x300000001851de40 */
/*0040*/ FFMA.FTZ R6, R7, R3, R6; /* 0x300c00000c719c40 */
/*0048*/ FFMA.FTZ R0, -R5, R6, R0; /* 0x3000000018501e40 */
/*0050*/ LOP32I.AND R7, R4, 0xff800000; /* 0x3bfe00000041dc02 */
/*0058*/ FFMA.FTZ R6, R0, R3, R6; /* 0x300c00000c019c40 */
/*0060*/ FFMA.FTZ R0, R6, R7, RZ; /* 0x307e00001c601c40 */
/*0068*/ LOP32I.AND R3, R0, 0x7fffffff; /* 0x39fffffffc00dc02 */
/*0070*/ MOV32I R6, 0x7effffef; /* 0x19fbffffbc019de2 */
/*0078*/ IADD32I R3, R3, -0x800010; /* 0x0bfdffffc030dc02 */
/*0080*/ ISETP.GT.U32.AND P0, PT, R3, R6, PT; /* 0x1a0e00001831dc03 */
/*0088*/ #!P0 BRA 0xa8; /* 0x40000000600021e7 */
/*0090*/ JCAL 0x0; /* 0x1000000000010007 */
/*0098*/ MOV R0, R4; /* 0x2800000010001de4 */
/*00a0*/ NOP; /* 0x4000000000001de4 */
/*00a8*/ MOV R4, R0; /* 0x2800000000011de4 */
/*00b0*/ RET ; /* 0x9000000000001de7 */
.......................................
DISASSEMBLED CODE FOR THE SECOND SOLUTION
code for sm_21
Function : _Z9myKernel2Pffi
.headerflags #"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0010*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0018*/ IMAD R0, R0, c[0x0][0x8], R2; /* 0x2004400020001ca3 */
/*0020*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x2c], PT; /* 0x1b0e4000b001dc23 */
/*0028*/ #P0 BRA.U 0x98; /* 0x40000001a00081e7 */
/*0030*/ #!P0 MOV32I R3, 0x4; /* 0x180000001000e1e2 */
/*0038*/ #!P0 IMAD R2.CC, R0, R3, c[0x0][0x20]; /* 0x200780008000a0a3 */
/*0040*/ #!P0 IMAD.HI.X R3, R0, R3, c[0x0][0x24]; /* 0x208680009000e0e3 */
/*0048*/ #!P0 LD.E R0, [R2]; /* 0x8400000000202085 */
/*0050*/ #!P0 FADD R5, |R0|, -c[0x0][0x28]; /* 0x50004000a0016180 */
/*0058*/ #!P0 FADD R4, -|R0|, c[0x0][0x28]; /* 0x50004000a0012280 */
/*0060*/ #!P0 LOP32I.AND R0, R0, 0x80000000; /* 0x3a00000000002002 */
/*0068*/ #!P0 LOP32I.AND R5, R5, 0x7fffffff; /* 0x39fffffffc516002 */
/*0070*/ #!P0 SHR.U32 R4, R4, 0x1f; /* 0x5800c0007c412003 */
/*0078*/ #!P0 LOP.OR R5, R0, R5; /* 0x6800000014016043 */
/*0080*/ #!P0 I2F.F32.S32 R0, R4; /* 0x1800000011202204 */
/*0088*/ #!P0 FMUL R0, R0, R5; /* 0x5800000014002000 */
/*0090*/ #!P0 ST.E [R2], R0; /* 0x9400000000202085 */
/*0098*/ EXIT ; /* 0x8000000000001de7 */
.................................
EDIT
A follow-up of this post has appeared in Soft thresholding in CUDA.
Related
If I use fma(a, b, c) in CUDA, it means that the formula a*b+c is calculated in a single ternary operation. But if I want to calculate -a*b+c, does invoking fma(-a, b, c) take one more multiply operation?
Unfortunately, shader assembly language is undocumented at that level.
However, we can try it out:
#!/bin/bash
cat <<EOF > fmatest.cu
__global__ void fma_plus(float *res, float a, float b, float c)
{
*res = fma(a, b, c);
}
__global__ void fma_minus(float *res, float a, float b, float c)
{
*res = fma(-a, b, c);
}
EOF
nvcc -arch sm_60 -c fmatest.cu
cuobjdump -sass fmatest.o
gives
code for sm_60
Function : _Z9fma_minusPffff
.headerflags #"EF_CUDA_SM60 EF_CUDA_PTX_SM(EF_CUDA_SM60)"
/* 0x001fc400fe2007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ MOV R0, c[0x0][0x148]; /* 0x4c98078005270000 */
/*0018*/ MOV R5, c[0x0][0x14c]; /* 0x4c98078005370005 */
/* 0x001fc800fe8007f1 */
/*0028*/ MOV R2, c[0x0][0x140]; /* 0x4c98078005070002 */
/*0030*/ MOV R3, c[0x0][0x144]; /* 0x4c98078005170003 */
/*0038*/ FFMA R0, R0, -R5, c[0x0][0x150]; /* 0x5181028005470000 */
/* 0x001ffc00ffe000f1 */
/*0048*/ STG.E [R2], R0; /* 0xeedc200000070200 */
/*0050*/ EXIT; /* 0xe30000000007000f */
/*0058*/ BRA 0x58; /* 0xe2400fffff87000f */
/* 0x001f8000fc0007e0 */
/*0068*/ NOP; /* 0x50b0000000070f00 */
/*0070*/ NOP; /* 0x50b0000000070f00 */
/*0078*/ NOP; /* 0x50b0000000070f00 */
..................................
Function : _Z8fma_plusPffff
.headerflags #"EF_CUDA_SM60 EF_CUDA_PTX_SM(EF_CUDA_SM60)"
/* 0x001fc400fe2007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ MOV R0, c[0x0][0x148]; /* 0x4c98078005270000 */
/*0018*/ MOV R5, c[0x0][0x14c]; /* 0x4c98078005370005 */
/* 0x001fc800fe8007f1 */
/*0028*/ MOV R2, c[0x0][0x140]; /* 0x4c98078005070002 */
/*0030*/ MOV R3, c[0x0][0x144]; /* 0x4c98078005170003 */
/*0038*/ FFMA R0, R0, R5, c[0x0][0x150]; /* 0x5180028005470000 */
/* 0x001ffc00ffe000f1 */
/*0048*/ STG.E [R2], R0; /* 0xeedc200000070200 */
/*0050*/ EXIT; /* 0xe30000000007000f */
/*0058*/ BRA 0x58; /* 0xe2400fffff87000f */
/* 0x001f8000fc0007e0 */
/*0068*/ NOP; /* 0x50b0000000070f00 */
/*0070*/ NOP; /* 0x50b0000000070f00 */
/*0078*/ NOP; /* 0x50b0000000070f00 */
.................................
So the FFMA instruction can indeed take an additional sign to apply to the product (note that in the shader assembly instruction it is applied to b, but this gives the same result).
You can try the same with double precision operands and other compute capabilities instead of sm_60 as well, which will give you similar results.
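Note also that (-a)*b + c equals a*(-b) + c exactly in floating point, so the compiler is free to fold the negation onto either factor. A quick device-side sanity check (the kernel name is hypothetical):

__global__ void fma_equiv(float *out, float a, float b, float c)
{
    // both forms should compile to a single FFMA with one negated factor
    // and produce bit-identical results
    out[0] = fmaf(-a, b, c);
    out[1] = fmaf(a, -b, c);
}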
The problem
During a project in CUDA C, I came across unexpected behaviour regarding single precision and double precision floating point operations. In the project, I first fill an array with numbers in one kernel, and then do some computation on these numbers in another kernel. All variables and arrays are double precision, so I would not expect any single precision floating point operations to happen. However, if I analyze the executable with nvprof, it shows that single precision operations are executed. How is this possible?
Minimal, Complete, and Verifiable example
Here is the smallest program that shows this behaviour on my architecture (asserts and error checking have been left out). I use an NVIDIA Tesla K40 graphics card.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define Nx 10
#define Ny 10
#define RANDOM double(0.236954587566)
__global__ void test(double *array, size_t pitch){
double rho, u;
int x = threadIdx.x + blockDim.x*blockIdx.x;
int y = threadIdx.y + blockDim.y*blockIdx.y;
int idx = y*(pitch/sizeof(double)) + 2*x;
if(x < Nx && y < Ny){
rho = array[idx];
u = array[idx+1]/rho;
array[idx] = rho*u;
}
}
__global__ void fill(double *array, size_t pitch){
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int idx = y*(pitch/sizeof(double)) + 2*x;
if(x < Nx && y < Ny){
array[idx] = RANDOM*idx;
array[idx + 1] = idx*idx*RANDOM;
}
}
int main(int argc, char* argv[]) {
double *d_array;
size_t pitch;
cudaMallocPitch((void **) &d_array, &pitch, 2*Nx*sizeof(double), Ny);
dim3 threadDistribution = dim3(8,8);
dim3 blockDistribution = dim3( (Nx + threadDistribution.x - 1) / (threadDistribution.x), (Ny + threadDistribution.y - 1) / (threadDistribution.y));
fill <<< blockDistribution, threadDistribution >>> (d_array, pitch);
cudaDeviceSynchronize();
test <<< blockDistribution, threadDistribution >>> (d_array, pitch);
return 0;
}
The output of nvprof (edited to make it more readable; if you need the full output, just ask in the comments):
....
Device "Tesla K40c (0)"
Kernel: test(double*, unsigned long)
Metric Name Min Max Avg
flop_count_sp 198 198 198
flop_count_sp_add 0 0 0
flop_count_sp_mul 0 0 0
flop_count_sp_fma 99 99 99
flop_count_sp_special 102 102 102
flop_count_dp 1214 1214 1214
flop_count_dp_add 0 0 0
flop_count_dp_mul 204 204 204
flop_count_dp_fma 505 505 505
What I've found so far
I found that if I delete the division in line 16:
u = array[idx+1]/rho;
==>
u = array[idx+1];
the output is as expected: zero single precision operations and exactly 100 double precision operations are executed. Does anyone know why the division causes the program to use single precision flops and ten times more double precision floating point operations?
I've also tried using intrinsics (__ddiv_rn), but this didn't solve the problem.
Many thanks in advance!
Edit - Working solution
Although I still haven't figured out why it uses single precision, I have found a 'solution' to this problem, thanks to @EOF.
Replacing the division by multiplication with the reciprocal of rho did the job:
u = array[idx+1]/rho;
==>
u = array[idx+1]*__drcp_rn(rho);
As others have pointed out, CUDA devices do not have hardware instructions for floating point division. Instead, they start from an initial approximation to the reciprocal of the denominator, provided by a single precision special function unit. Its product with the numerator is then iteratively refined until it matches the fraction to within machine precision.
Even the __ddiv_rn() intrinsic is compiled to this instruction sequence by ptxas, so its use makes no difference.
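To illustrate the kind of refinement involved, here is a simplified sketch in plain CUDA C (it ignores the argument scaling and special-case handling visible in the SASS below, and fakes the hardware's MUFU.RCP64H seed with a single precision reciprocal):

// Division via reciprocal approximation plus Newton-Raphson refinement.
// Illustration only; not the exact sequence ptxas emits.
__device__ double div_sketch(double x, double y)
{
    double r = (double)(1.0f / (float)y);  // crude initial reciprocal
    double e = fma(-y, r, 1.0);            // error term e = 1 - y*r
    r = fma(r, e, r);                      // Newton step: r <- r*(1 + e)
    e = fma(-y, r, 1.0);                   // repeat to sharpen the estimate
    r = fma(r, e, r);
    double q = x * r;                      // candidate quotient
    double rem = fma(-y, q, x);            // residual x - y*q
    return fma(rem, r, q);                 // corrected quotient
}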
You can gain closer insight by inspecting the code yourself using cuobjdump -sass, although this is made difficult by the lack of official documentation for shader assembly beyond the bare list of instructions.
I'll use the following bare-bones division kernel as an example:
__global__ void div(double x, double y, double *z) {
*z = x / y;
}
This is compiled to the following shader assembly for a compute capability 3.5 device:
Function : _Z3divddPd
.headerflags #"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x08a0109c10801000 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ MOV R0, c[0x0][0x14c]; /* 0x64c03c00299c0002 */
/*0018*/ MOV32I R2, 0x1; /* 0x74000000009fc00a */
/*0020*/ MOV R8, c[0x0][0x148]; /* 0x64c03c00291c0022 */
/*0028*/ MOV R9, c[0x0][0x14c]; /* 0x64c03c00299c0026 */
/*0030*/ MUFU.RCP64H R3, R0; /* 0x84000000031c000e */
/*0038*/ MOV32I R0, 0x35b7333; /* 0x7401adb9999fc002 */
/* 0x08a080a080a4a4a4 */
/*0048*/ DFMA R4, -R8, R2, c[0x2][0x0]; /* 0x9b880840001c2012 */
/*0050*/ DFMA R4, R4, R4, R4; /* 0xdb801000021c1012 */
/*0058*/ DFMA R4, R4, R2, R2; /* 0xdb800800011c1012 */
/*0060*/ DMUL R6, R4, c[0x0][0x140]; /* 0x64000000281c101a */
/*0068*/ FSETP.GE.AND P0, PT, R0, |c[0x0][0x144]|, PT; /* 0x5db09c00289c001e */
/*0070*/ DFMA R8, -R8, R6, c[0x0][0x140]; /* 0x9b881800281c2022 */
/*0078*/ MOV R2, c[0x0][0x150]; /* 0x64c03c002a1c000a */
/* 0x0880acb0a0ac8010 */
/*0088*/ MOV R3, c[0x0][0x154]; /* 0x64c03c002a9c000e */
/*0090*/ DFMA R4, R8, R4, R6; /* 0xdb801800021c2012 */
/*0098*/ #P0 BRA 0xb8; /* 0x120000000c00003c */
/*00a0*/ FFMA R0, RZ, c[0x0][0x14c], R5; /* 0x4c001400299ffc02 */
/*00a8*/ FSETP.GT.AND P0, PT, |R0|, c[0x2][0x8], PT; /* 0x5da01c40011c021e */
/*00b0*/ #P0 BRA 0xe8; /* 0x120000001800003c */
/*00b8*/ MOV R4, c[0x0][0x140]; /* 0x64c03c00281c0012 */
/* 0x08a1b810b8008010 */
/*00c8*/ MOV R5, c[0x0][0x144]; /* 0x64c03c00289c0016 */
/*00d0*/ MOV R7, c[0x0][0x14c]; /* 0x64c03c00299c001e */
/*00d8*/ MOV R6, c[0x0][0x148]; /* 0x64c03c00291c001a */
/*00e0*/ CAL 0xf8; /* 0x1300000008000100 */
/*00e8*/ ST.E.64 [R2], R4; /* 0xe5800000001c0810 */
/*00f0*/ EXIT; /* 0x18000000001c003c */
/*00f8*/ LOP32I.AND R0, R7, 0x40000000; /* 0x20200000001c1c00 */
/* 0x08a08010a010b010 */
/*0108*/ MOV32I R15, 0x1ff00000; /* 0x740ff800001fc03e */
/*0110*/ ISETP.LT.U32.AND P0, PT, R0, c[0x2][0xc], PT; /* 0x5b101c40019c001e */
/*0118*/ MOV R8, RZ; /* 0xe4c03c007f9c0022 */
/*0120*/ SEL R9, R15, c[0x2][0x10], !P0; /* 0x65002040021c3c26 */
/*0128*/ MOV32I R12, 0x1; /* 0x74000000009fc032 */
/*0130*/ DMUL R10, R8, R6; /* 0xe4000000031c202a */
/*0138*/ LOP32I.AND R0, R5, 0x7f800000; /* 0x203fc000001c1400 */
/* 0x08a0108ca01080a0 */
/*0148*/ MUFU.RCP64H R13, R11; /* 0x84000000031c2c36 */
/*0150*/ DFMA R16, -R10, R12, c[0x2][0x0]; /* 0x9b883040001c2842 */
/*0158*/ ISETP.LT.U32.AND P0, PT, R0, c[0x2][0x14], PT; /* 0x5b101c40029c001e */
/*0160*/ MOV R14, RZ; /* 0xe4c03c007f9c003a */
/*0168*/ DFMA R16, R16, R16, R16; /* 0xdb804000081c4042 */
/*0170*/ SEL R15, R15, c[0x2][0x10], !P0; /* 0x65002040021c3c3e */
/*0178*/ SSY 0x3a0; /* 0x1480000110000000 */
/* 0x08acb4a4a4a4a480 */
/*0188*/ DMUL R14, R14, R4; /* 0xe4000000021c383a */
/*0190*/ DFMA R12, R16, R12, R12; /* 0xdb803000061c4032 */
/*0198*/ DMUL R16, R14, R12; /* 0xe4000000061c3842 */
/*01a0*/ DFMA R10, -R10, R16, R14; /* 0xdb883800081c282a */
/*01a8*/ DFMA R10, R10, R12, R16; /* 0xdb804000061c282a */
/*01b0*/ DSETP.LEU.AND P0, PT, |R10|, RZ, PT; /* 0xdc581c007f9c2a1e */
/*01b8*/ #!P0 BRA 0x1e0; /* 0x120000001020003c */
/* 0x088010b010b8acb4 */
/*01c8*/ DSETP.EQ.AND P0, PT, R10, RZ, PT; /* 0xdc101c007f9c281e */
/*01d0*/ #!P0 BRA 0x358; /* 0x12000000c020003c */
/*01d8*/ DMUL.S R8, R4, R6; /* 0xe4000000035c1022 */
/*01e0*/ ISETP.GT.U32.AND P0, PT, R0, c[0x2][0x18], PT; /* 0x5b401c40031c001e */
/*01e8*/ MOV32I R0, 0x1ff00000; /* 0x740ff800001fc002 */
/*01f0*/ MOV R14, RZ; /* 0xe4c03c007f9c003a */
/*01f8*/ SEL R15, R0, c[0x2][0x10], !P0; /* 0x65002040021c003e */
/* 0x08b4a49c849c849c */
/*0208*/ DMUL R12, R10, R8; /* 0xe4000000041c2832 */
/*0210*/ DMUL R18, R10, R14; /* 0xe4000000071c284a */
/*0218*/ DMUL R10, R12, R14; /* 0xe4000000071c302a */
/*0220*/ DMUL R16, R8, R18; /* 0xe4000000091c2042 */
/*0228*/ DFMA R8, R10, R6, -R4; /* 0xdb901000031c2822 */
/*0230*/ DFMA R12, R16, R6, -R4; /* 0xdb901000031c4032 */
/*0238*/ DSETP.GT.AND P0, PT, |R8|, |R12|, PT; /* 0xdc209c00061c221e */
/* 0x08b010ac10b010a0 */
/*0248*/ SEL R9, R17, R11, P0; /* 0xe5000000059c4426 */
/*0250*/ FSETP.GTU.AND P1, PT, |R9|, 1.469367938527859385e-39, PT; /* 0xb5e01c00801c263d */
/*0258*/ MOV R11, R9; /* 0xe4c03c00049c002e */
/*0260*/ SEL R8, R16, R10, P0; /* 0xe5000000051c4022 */
/*0268*/ #P1 NOP.S; /* 0x8580000000443c02 */
/*0270*/ FSETP.LT.AND P0, PT, |R5|, 1.5046327690525280102e-36, PT; /* 0xb5881c20001c161d */
/*0278*/ MOV32I R0, 0x3ff00000; /* 0x741ff800001fc002 */
/* 0x0880a48090108c10 */
/*0288*/ MOV R16, RZ; /* 0xe4c03c007f9c0042 */
/*0290*/ SEL R17, R0, c[0x2][0x1c], !P0; /* 0x65002040039c0046 */
/*0298*/ LOP.OR R10, R8, 0x1; /* 0xc2001000009c2029 */
/*02a0*/ LOP.AND R8, R8, -0x2; /* 0xca0003ffff1c2021 */
/*02a8*/ DMUL R4, R16, R4; /* 0xe4000000021c4012 */
/*02b0*/ DMUL R6, R16, R6; /* 0xe4000000031c401a */
/*02b8*/ DFMA R14, R10, R6, -R4; /* 0xdb901000031c283a */
/* 0x08b010b010a0b4a4 */
/*02c8*/ DFMA R12, R8, R6, -R4; /* 0xdb901000031c2032 */
/*02d0*/ DSETP.GT.AND P0, PT, |R12|, |R14|, PT; /* 0xdc209c00071c321e */
/*02d8*/ SEL R8, R10, R8, P0; /* 0xe5000000041c2822 */
/*02e0*/ LOP.AND R0, R8, 0x1; /* 0xc2000000009c2001 */
/*02e8*/ IADD R11.CC, R8, -0x1; /* 0xc88403ffff9c202d */
/*02f0*/ ISETP.EQ.U32.AND P0, PT, R0, 0x1, PT; /* 0xb3201c00009c001d */
/*02f8*/ IADD.X R0, R9, -0x1; /* 0xc88043ffff9c2401 */
/* 0x08b4a480a010b010 */
/*0308*/ SEL R10, R11, R8, !P0; /* 0xe5002000041c2c2a */
/*0310*/ #P0 IADD R8.CC, R8, 0x1; /* 0xc084000000802021 */
/*0318*/ SEL R11, R0, R9, !P0; /* 0xe5002000049c002e */
/*0320*/ #P0 IADD.X R9, R9, RZ; /* 0xe08040007f802426 */
/*0328*/ DFMA R14, R10, R6, -R4; /* 0xdb901000031c283a */
/*0330*/ DFMA R4, R8, R6, -R4; /* 0xdb901000031c2012 */
/*0338*/ DSETP.GT.AND P0, PT, |R4|, |R14|, PT; /* 0xdc209c00071c121e */
/* 0x08b4acb4a010b810 */
/*0348*/ SEL R8, R10, R8, P0; /* 0xe5000000041c2822 */
/*0350*/ SEL.S R9, R11, R9, P0; /* 0xe500000004dc2c26 */
/*0358*/ MOV R8, RZ; /* 0xe4c03c007f9c0022 */
/*0360*/ MUFU.RCP64H R9, R7; /* 0x84000000031c1c26 */
/*0368*/ DSETP.GT.AND P0, PT, |R8|, RZ, PT; /* 0xdc201c007f9c221e */
/*0370*/ #P0 BRA.U 0x398; /* 0x120000001000023c */
/*0378*/ #!P0 DSETP.NEU.AND P1, PT, |R6|, +INF , PT; /* 0xb4681fff80201a3d */
/* 0x0800b8a010ac0010 */
/*0388*/ #!P0 SEL R9, R7, R9, P1; /* 0xe500040004a01c26 */
/*0390*/ #!P0 SEL R8, R6, RZ, P1; /* 0xe50004007fa01822 */
/*0398*/ DMUL.S R8, R8, R4; /* 0xe4000000025c2022 */
/*03a0*/ MOV R4, R8; /* 0xe4c03c00041c0012 */
/*03a8*/ MOV R5, R9; /* 0xe4c03c00049c0016 */
/*03b0*/ RET; /* 0x19000000001c003c */
/*03b8*/ BRA 0x3b8; /* 0x12007ffffc1c003c */
The MUFU.RCP64H instruction provides the initial approximation of the reciprocal. It operates on the high 32 bits of the denominator (y) and provides the high 32 bits of the double precision approximation, and is therefore counted as a floating point operation (single precision special) by the profiler.
There is another single precision FFMA instruction further down, apparently used as a high-throughput way of testing a conditional where full precision isn't required.
I have the following kernel performing a simple assignment of a global memory matrix indata to a global memory matrix outdata:
__global__ void simple_copy(float *outdata, const float *indata){
int x = blockIdx.x * TILE_DIM + threadIdx.x;
int y = blockIdx.y * TILE_DIM + threadIdx.y;
int width = gridDim.x * TILE_DIM;
outdata[y*width + x] = indata[y*width + x];
}
I'm inspecting the disassembled microcode dumped by cuobjdump:
Function : _Z11simple_copyPfPKf
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0x80001de218000000*/ MOV32I R0, 0x20; R0 = TILE_DIM
/*0010*/ /*0x00001c8614000000*/ LDC R0, c [0x0] [R0]; R0 = c
/*0018*/ /*0x90009de218000000*/ MOV32I R2, 0x24; R2 = 36
/*0020*/ /*0x00209c8614000000*/ LDC R2, c [0x0] [R2]; R2 = c
int x = blockIdx.x * TILE_DIM + threadIdx.x;
/*0028*/ /*0x9400dc042c000000*/ S2R R3, SR_CTAid_X; R3 = BlockIdx.x
/*0030*/ /*0x0c00dde428000000*/ MOV R3, R3; R3 = R3 ???
/*0038*/ /*0x84011c042c000000*/ S2R R4, SR_Tid_X; R4 = ThreadIdx.x
/*0040*/ /*0x10011de428000000*/ MOV R4, R4; R4 = R4 ???
/*0048*/ /*0x8030dca32008c000*/ IMAD R3, R3, 0x20, R4; R3 = R3 * TILE_DIM + R4 (contains x)
int y = blockIdx.y * TILE_DIM + threadIdx.y;
/*0050*/ /*0x98011c042c000000*/ S2R R4, SR_CTAid_Y;
/*0058*/ /*0x10011de428000000*/ MOV R4, R4;
/*0060*/ /*0x88015c042c000000*/ S2R R5, SR_Tid_Y;
/*0068*/ /*0x14015de428000000*/ MOV R5, R5;
/*0070*/ /*0x80411ca3200ac000*/ IMAD R4, R4, 0x20, R5; R4 ... (contains y)
int width = gridDim.x * TILE_DIM;
/*0078*/ /*0x50015de428004000*/ MOV R5, c [0x0] [0x14]; R5 = c
/*0080*/ /*0x80515ca35000c000*/ IMUL R5, R5, 0x20; R5 = R5 * TILE_DIM (contains width)
y*width + x
/*0088*/ /*0x14419ca320060000*/ IMAD R6, R4, R5, R3; R6 = R4 * R5 + R3 (contains y*width+x)
Loads indata[y*width + x]
/*0090*/ /*0x08619c036000c000*/ SHL R6, R6, 0x2;
/*0098*/ /*0x18209c0348000000*/ IADD R2, R2, R6;
/*00a0*/ /*0x08009de428000000*/ MOV R2, R2; R2 = R2 ???
/*00a8*/ /*0x00209c8580000000*/ LD R2, [R2]; Load from memory: R2 = indata[y*width + x]
Stores outdata[y*width + x]
/*00b0*/ /*0x1440dca320060000*/ IMAD R3, R4, R5, R3;
/*00b8*/ /*0x0830dc036000c000*/ SHL R3, R3, 0x2;
/*00c0*/ /*0x0c001c0348000000*/ IADD R0, R0, R3; R0 = R0 + R3
/*00c8*/ /*0x00001de428000000*/ MOV R0, R0; R0 = R0 ???
/*00d0*/ /*0x00009c8590000000*/ ST [R0], R2; Store to memory
/*00d8*/ /*0x40001de740000000*/ BRA 0xf0;
/*00e0*/ /*0x00001de780000000*/ EXIT;
/*00e8*/ /*0x00001de780000000*/ EXIT;
/*00f0*/ /*0x00001de780000000*/ EXIT;
/*00f8*/ /*0x00001de780000000*/ EXIT;
The comments on top or aside of the disassembled code are my own.
As you can see, there are some apparently useless operations, marked by ??? in the comments. Essentially, they are moves of registers into themselves.
I have then the two following questions:
If they are useless, they are needlessly consuming computation time. Can I optimize the disassembled microcode by removing them?
PTX files can be inlined in CUDA codes. However, PTX is just an intermediate language needed for portability across GPUs. Can I somehow "inline" an optimized disassembled microcode?
Thank you very much in advance.
EDIT: THE SAME CODE COMPILED IN RELEASE MODE FOR SM = 2.0
Function : _Z11simple_copyPfPKf
.headerflags #"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_CTAID.Y; /* 0x2c00000098001c04 */
/*0010*/ S2R R2, SR_TID.Y; /* 0x2c00000088009c04 */
/*0018*/ S2R R3, SR_CTAID.X; /* 0x2c0000009400dc04 */
/*0020*/ S2R R4, SR_TID.X; /* 0x2c00000084011c04 */
/*0028*/ MOV R5, c[0x0][0x14]; /* 0x2800400050015de4 */
/*0030*/ ISCADD R2, R0, R2, 0x5; /* 0x4000000008009ca3 */
/*0038*/ ISCADD R3, R3, R4, 0x5; /* 0x400000001030dca3 */
/*0040*/ SHL R0, R5, 0x5; /* 0x6000c00014501c03 */
/*0048*/ IMAD R2, R0, R2, R3; /* 0x2006000008009ca3 */
/*0050*/ ISCADD R0, R2, c[0x0][0x24], 0x2; /* 0x4000400090201c43 */
/*0058*/ ISCADD R2, R2, c[0x0][0x20], 0x2; /* 0x4000400080209c43 */
/*0060*/ LD R0, [R0]; /* 0x8000000000001c85 */
/*0068*/ ST [R2], R0; /* 0x9000000000201c85 */
/*0070*/ EXIT ; /* 0x8000000000001de7 */
EDIT: THE SAME CODE COMPILED IN RELEASE MODE FOR SM = 2.1
Function : _Z11simple_copyPfPKf
.headerflags #"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ NOP; /* 0x4000000000001de4 */
/*0010*/ MOV R0, c[0x0][0x14]; /* 0x2800400050001de4 */
/*0018*/ S2R R2, SR_CTAID.Y; /* 0x2c00000098009c04 */
/*0020*/ SHL R0, R0, 0x5; /* 0x6000c00014001c03 */
/*0028*/ S2R R3, SR_TID.Y; /* 0x2c0000008800dc04 */
/*0030*/ ISCADD R3, R2, R3, 0x5; /* 0x400000000c20dca3 */
/*0038*/ S2R R4, SR_CTAID.X; /* 0x2c00000094011c04 */
/*0040*/ S2R R5, SR_TID.X; /* 0x2c00000084015c04 */
/*0048*/ ISCADD R2, R4, R5, 0x5; /* 0x4000000014409ca3 */
/*0050*/ IMAD R2, R0, R3, R2; /* 0x200400000c009ca3 */
/*0058*/ ISCADD R0, R2, c[0x0][0x24], 0x2; /* 0x4000400090201c43 */
/*0060*/ ISCADD R2, R2, c[0x0][0x20], 0x2; /* 0x4000400080209c43 */
/*0068*/ LD R0, [R0]; /* 0x8000000000001c85 */
/*0070*/ ST [R2], R0; /* 0x9000000000201c85 */
/*0078*/ EXIT ; /* 0x8000000000001de7 */
The answer to both questions is no.
If you try to delete instructions from the final binary payload, you will change the length of the code sections and break the ELF and fatbinary files. Fixing that would require hand-crafting headers whose formats are not readily documented, which sounds like a lot of work just to optimize out a couple of instructions.
And inline native assembler is not supported, but I am sure you knew that already.
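What is supported is inline PTX through the asm() construct, which ptxas then optimizes and schedules like any other PTX; a minimal sketch:

// Inline PTX (not native SASS) embedded in device code.
__device__ int add_ptx(int a, int b)
{
    int res;
    asm("add.s32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b));
    return res;
}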
And finally, I can't reproduce this using CUDA 5.0:
Fatbin elf code:
================
arch = sm_20
code version = [1,6]
producer = cuda
host = mac
compile_size = 32bit
identifier = pumpkinhead.cu
code for sm_20
Function : _Z11simple_copyPfPKf
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0x98001c042c000000*/ S2R R0, SR_CTAid_Y;
/*0010*/ /*0x88009c042c000000*/ S2R R2, SR_Tid_Y;
/*0018*/ /*0x9400dc042c000000*/ S2R R3, SR_CTAid_X;
/*0020*/ /*0x84011c042c000000*/ S2R R4, SR_Tid_X;
/*0028*/ /*0x08001ca340000000*/ ISCADD R0, R0, R2, 0x5;
/*0030*/ /*0x10309ca340000000*/ ISCADD R2, R3, R4, 0x5;
/*0038*/ /*0x50001ca350004000*/ IMUL R0, R0, c [0x0] [0x14];
/*0040*/ /*0x08009ca340000000*/ ISCADD R2, R0, R2, 0x5;
/*0048*/ /*0x90201c4340004000*/ ISCADD R0, R2, c [0x0] [0x24], 0x2;
/*0050*/ /*0x80209c4340004000*/ ISCADD R2, R2, c [0x0] [0x20], 0x2;
/*0058*/ /*0x00001c8580000000*/ LD R0, [R0];
/*0060*/ /*0x00201c8590000000*/ ST [R2], R0;
/*0068*/ /*0x00001de780000000*/ EXIT;
.....................................
Are you sure the code you have shown was compiled with release settings?
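For reference, debug compilation with -G disables device code optimization and typically produces exactly such redundant MOV R,R instructions. A release-style build-and-dump sketch (file name hypothetical):

nvcc -O3 -arch=sm_21 simple_copy.cu -o simple_copy
cuobjdump -sass simple_copy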
I'm trying to understand how to use __threadfence(), as it seems like a powerful synchronization primitive that lets different blocks work together without going through the huge hassle of ending a kernel and starting a new one. The CUDA C Programming Guide has an example of it (Appendix B.5), which is fleshed out in the "threadFenceReduction" sample in the SDK, so it seems like something we "should" be using.
However, when I have tried using __threadfence(), it is shockingly slow. See the code below for an example. From what I understand, __threadfence() should just make sure that all pending memory transfers from the current thread block are finished before proceeding. Memory latency is somewhat better than a microsecond, I believe, so the total time to deal with the 64KB of memory transfers in the included code, on a GTX 680, should be somewhere around a microsecond. Instead, the __threadfence() instruction seems to take around 20 microseconds! Instead of using __threadfence() to synchronize, I can end the kernel and launch an entirely new kernel (in the same, default stream so that it is synchronized) in less than a third of the time!
What is going on here? Does my code have a bug in it that I'm not noticing? Or is __threadfence() really 20x slower than it should be, and 6x slower than an entire kernel launch+cleanup?
Time for 1000 runs of the threadfence kernel: 27.716831 ms
Answer: 120
Time for 1000 runs of just the first 3 lines, including threadfence: 25.962912 ms
Synchronizing without threadfence, by splitting to two kernels: 7.653344 ms
Answer: 120
#include "cuda.h"
#include <cstdio>
__device__ unsigned int count = 0;
__shared__ bool isLastBlockDone;
__device__ int scratch[16];
__device__ int junk[16000];
__device__ int answer;
__global__ void usethreadfence() //just like the code example in B.5 of the CUDA C Programming Guide
{
if (threadIdx.x==0) scratch[blockIdx.x]=blockIdx.x;
junk[threadIdx.x+blockIdx.x*1000]=17+threadIdx.x; //do some more memory writes to make the kernel nontrivial
__threadfence();
if (threadIdx.x==0) {
unsigned int value = atomicInc(&count, gridDim.x);
isLastBlockDone = (value == (gridDim.x - 1));
}
__syncthreads();
if (isLastBlockDone && threadIdx.x==0) {
// The last block sums the results stored in scratch[0 .. gridDim.x-1]
int sum=0;
for (int i=0;i<gridDim.x;i++) sum+=scratch[i];
answer=sum;
}
}
__global__ void justthreadfence() //first three lines of the previous kernel, so we can compare speeds
{
if (threadIdx.x==0) scratch[blockIdx.x]=blockIdx.x;
junk[threadIdx.x+blockIdx.x*1000]=17+threadIdx.x;
__threadfence();
}
__global__ void usetwokernels_1() //this and the next kernel reproduce the functionality of the first kernel, but faster!
{
if (threadIdx.x==0) scratch[blockIdx.x]=blockIdx.x;
junk[threadIdx.x+blockIdx.x*1000]=17+threadIdx.x;
}
__global__ void usetwokernels_2()
{
if (threadIdx.x==0) {
int sum=0;
for (int i=0;i<gridDim.x;i++) sum+=scratch[i];
answer=sum;
}
}
int main() {
int sum;
cudaEvent_t start, stop; float time; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0);
for (int i=0;i<1000;i++) usethreadfence<<<16,1000>>>();
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); printf ("Time for 1000 runs of the threadfence kernel: %f ms\n", time); cudaEventDestroy(start); cudaEventDestroy(stop);
cudaMemcpyFromSymbol(&sum,answer,sizeof(int)); printf("Answer: %d\n",sum);
cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0);
for (int i=0;i<1000;i++) justthreadfence<<<16,1000>>>();
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); printf ("Time for 1000 runs of just the first 3 lines, including threadfence: %f ms\n", time); cudaEventDestroy(start); cudaEventDestroy(stop);
cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0);
for (int i=0;i<1000;i++) {usetwokernels_1<<<16,1000>>>(); usetwokernels_2<<<16,1000>>>();}
cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); printf ("Synchronizing without threadfence, by splitting to two kernels: %f ms\n", time); cudaEventDestroy(start); cudaEventDestroy(stop);
cudaMemcpyFromSymbol(&sum,answer,sizeof(int)); printf("Answer: %d\n",sum);
}
I have tested your code, compiled with CUDA 6.0, on two different cards, a GT540M (Fermi) and a K20c (Kepler), and these are the results:
GT540M
Time for 1000 runs of the threadfence kernel: 303.373688 ms
Answer: 120
Time for 1000 runs of just the first 3 lines, including threadfence: 300.395416 ms
Synchronizing without threadfence, by splitting to two kernels: 597.729919 ms
Answer: 120
Kepler K20c
Time for 1000 runs of the threadfence kernel: 10.164096 ms
Answer: 120
Time for 1000 runs of just the first 3 lines, including threadfence: 8.808896 ms
Synchronizing without threadfence, by splitting to two kernels: 17.330784 ms
Answer: 120
I do not observe any particularly slow behaviour of __threadfence() compared to the other two cases considered.
This can be explained by examining the disassembled codes.
usethreadfence()
c[0xe][0x0] = scratch
c[0xe][0x4] = junk
c[0xe][0xc] = count
c[0x0][0x14] = gridDim.x
/*0000*/ MOV R1, c[0x1][0x100];
/*0008*/ S2R R0, SR_TID.X; R0 = threadIdx.x
/*0010*/ ISETP.NE.AND P0, PT, R0, RZ, PT; P0 = (R0 != 0)
/*0018*/ S2R R5, SR_CTAID.X; R5 = blockIdx.x
/*0020*/ IMAD R3, R5, 0x3e8, R0; R3 = R5 * 1000 + R0 = threadIdx.x + blockIdx.x * 1000
if (threadIdx.x == 0)
/*0028*/ #!P0 ISCADD R2, R5, c[0xe][0x0], 0x2; R2 = &scratch[blockIdx.x]
/*0030*/ IADD R4, R0, 0x11; R4 = R0 + 17 = threadIdx.x + 17
/*0038*/ ISCADD R3, R3, c[0xe][0x4], 0x2; R3 = junk + threadIdx.x + blockIdx.x * 1000
/*0040*/ #!P0 ST [R2], R5; scratch[blockIdx.x] = blockIdx.x
/*0048*/ ST [R3], R4; junk[threadIdx.x + blockIdx.x * 1000] = threadIdx.x + 17
/*0050*/ MEMBAR.GL; __threadfence
/*0058*/ #P0 BRA.U 0x98; if (threadIdx.x != 0) branch to 0x98
if (threadIdx.x == 0)
/*0060*/ #!P0 MOV R2, c[0xe][0xc]; R2 = &count
/*0068*/ #!P0 MOV R3, c[0x0][0x14]; R3 = gridDim.x
/*0070*/ #!P0 ATOM.INC R2, [R2], R3; R2 = value = count + 1; *(&count) ++
/*0078*/ #!P0 IADD R3, R3, -0x1; R3 = R3 - 1 = gridDim.x - 1
/*0080*/ #!P0 ISETP.EQ.AND P1, PT, R2, R3, PT; P1 = (R2 == R3) = (value == (gridDim.x - 1))
/*0088*/ #!P0 SEL R2, RZ, 0x1, !P1; if (!P1) R2 = RZ otherwise R2 = 1 (R2 = isLastBlockDone)
/*0090*/ #!P0 STS.U8 [RZ], R2; Stores R2 (i.e., isLastBlockDone) to shared memory to [0]
/*0098*/ ISETP.EQ.AND P0, PT, R0, RZ, PT; P0 = (R0 == 0) = (threadIdx.x == 0)
/*00a0*/ BAR.RED.POPC RZ, RZ, RZ, PT; __syncthreads()
/*00a8*/ LDS.U8 R0, [RZ]; R0 = R2 = isLastBlockDone
/*00b0*/ ISETP.NE.AND P0, PT, R0, RZ, P0; P0 = (R0 != 0) && P0
/*00b8*/ #!P0 EXIT; exit unless (isLastBlockDone && threadIdx.x == 0)
/*00c0*/ ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT; IMPLEMENTING THE FOR LOOP WITH A LOOP UNROLL OF 4
/*00c8*/ MOV R0, RZ;
/*00d0*/ #!P0 BRA 0x1b8;
/*00d8*/ MOV R2, c[0x0][0x14];
/*00e0*/ ISETP.GT.AND P0, PT, R2, 0x3, PT;
/*00e8*/ MOV R2, RZ;
/*00f0*/ #!P0 BRA 0x170;
/*00f8*/ MOV R3, c[0x0][0x14];
/*0100*/ IADD R7, R3, -0x3;
/*0108*/ NOP;
/*0110*/ ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0118*/ IADD R2, R2, 0x4;
/*0120*/ LD R4, [R3];
/*0128*/ ISETP.LT.U32.AND P0, PT, R2, R7, PT;
/*0130*/ LD R5, [R3+0x4];
/*0138*/ LD R6, [R3+0x8];
/*0140*/ LD R3, [R3+0xc];
/*0148*/ IADD R0, R4, R0;
/*0150*/ IADD R0, R5, R0;
/*0158*/ IADD R0, R6, R0;
/*0160*/ IADD R0, R3, R0;
/*0168*/ #P0 BRA 0x110;
/*0170*/ ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*0178*/ #!P0 BRA 0x1b8;
/*0180*/ ISCADD R3, R2, c[0xe][0x0], 0x2;
/*0188*/ IADD R2, R2, 0x1;
/*0190*/ LD R3, [R3];
/*0198*/ ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT;
/*01a0*/ NOP;
/*01a8*/ IADD R0, R3, R0;
/*01b0*/ #P0 BRA 0x180;
/*01b8*/ MOV R2, c[0xe][0x8];
/*01c0*/ ST [R2], R0;
/*01c8*/ EXIT;
justthreadfence()
Function : _Z15justthreadfencev
.headerflags #"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R3, SR_TID.X; /* 0x2c0000008400dc04 */
/*0010*/ ISETP.NE.AND P0, PT, R3, RZ, PT; /* 0x1a8e0000fc31dc23 */
/*0018*/ S2R R4, SR_CTAID.X; /* 0x2c00000094011c04 */
/*0020*/ IMAD R2, R4, 0x3e8, R3; /* 0x2006c00fa0409ca3 */
/*0028*/ #!P0 ISCADD R0, R4, c[0xe][0x0], 0x2; /* 0x4000780000402043 */
/*0030*/ IADD R3, R3, 0x11; /* 0x4800c0004430dc03 */
/*0038*/ ISCADD R2, R2, c[0xe][0x4], 0x2; /* 0x4000780010209c43 */
/*0040*/ #!P0 ST [R0], R4; /* 0x9000000000012085 */
/*0048*/ ST [R2], R3; /* 0x900000000020dc85 */
/*0050*/ MEMBAR.GL; /* 0xe000000000001c25 */
/*0058*/ EXIT; /* 0x8000000000001de7 */
usetwokernels_1()
Function : _Z15usetwokernels_1v
.headerflags #"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_TID.X; /* 0x2c00000084001c04 */
/*0010*/ ISETP.NE.AND P0, PT, R0, RZ, PT; /* 0x1a8e0000fc01dc23 */
/*0018*/ S2R R2, SR_CTAID.X; /* 0x2c00000094009c04 */
/*0020*/ IMAD R4, R2, 0x3e8, R0; /* 0x2000c00fa0211ca3 */
/*0028*/ #!P0 ISCADD R3, R2, c[0xe][0x0], 0x2; /* 0x400078000020e043 */
/*0030*/ IADD R0, R0, 0x11; /* 0x4800c00044001c03 */
/*0038*/ ISCADD R4, R4, c[0xe][0x4], 0x2; /* 0x4000780010411c43 */
/*0040*/ #!P0 ST [R3], R2; /* 0x900000000030a085 */
/*0048*/ ST [R4], R0; /* 0x9000000000401c85 */
/*0050*/ EXIT; /* 0x8000000000001de7 */
.....................................
usetwokernels_2()
Function : _Z15usetwokernels_2v
.headerflags #"EF_CUDA_SM20 EF_CUDA_PTX_SM(EF_CUDA_SM20)"
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_TID.X; /* 0x2c00000084001c04 */
/*0010*/ ISETP.NE.AND P0, PT, R0, RZ, PT; /* 0x1a8e0000fc01dc23 */
/*0018*/ #P0 EXIT; /* 0x80000000000001e7 */
/*0020*/ ISETP.NE.AND P0, PT, RZ, c[0x0][0x14], PT; /* 0x1a8e400053f1dc23 */
/*0028*/ MOV R0, RZ; /* 0x28000000fc001de4 */
/*0030*/ #!P0 BRA 0x130; /* 0x40000003e00021e7 */
/*0038*/ MOV R2, c[0x0][0x14]; /* 0x2800400050009de4 */
/*0040*/ ISETP.GT.AND P0, PT, R2, 0x3, PT; /* 0x1a0ec0000c21dc23 */
/*0048*/ MOV R2, RZ; /* 0x28000000fc009de4 */
/*0050*/ #!P0 BRA 0xe0; /* 0x40000002200021e7 */
/*0058*/ MOV R3, c[0x0][0x14]; /* 0x280040005000dde4 */
/*0060*/ IADD R7, R3, -0x3; /* 0x4800fffff431dc03 */
/*0068*/ NOP; /* 0x4000000000001de4 */
/*0070*/ NOP; /* 0x4000000000001de4 */
/*0078*/ NOP; /* 0x4000000000001de4 */
/*0080*/ ISCADD R3, R2, c[0xe][0x0], 0x2; /* 0x400078000020dc43 */
/*0088*/ LD R4, [R3]; /* 0x8000000000311c85 */
/*0090*/ IADD R2, R2, 0x4; /* 0x4800c00010209c03 */
/*0098*/ LD R5, [R3+0x4]; /* 0x8000000010315c85 */
/*00a0*/ ISETP.LT.U32.AND P0, PT, R2, R7, PT; /* 0x188e00001c21dc03 */
/*00a8*/ LD R6, [R3+0x8]; /* 0x8000000020319c85 */
/*00b0*/ LD R3, [R3+0xc]; /* 0x800000003030dc85 */
/*00b8*/ IADD R0, R4, R0; /* 0x4800000000401c03 */
/*00c0*/ IADD R0, R5, R0; /* 0x4800000000501c03 */
/*00c8*/ IADD R0, R6, R0; /* 0x4800000000601c03 */
/*00d0*/ IADD R0, R3, R0; /* 0x4800000000301c03 */
/*00d8*/ #P0 BRA 0x80; /* 0x4003fffe800001e7 */
/*00e0*/ ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT; /* 0x188e40005021dc03 */
/*00e8*/ #!P0 BRA 0x130; /* 0x40000001000021e7 */
/*00f0*/ NOP; /* 0x4000000000001de4 */
/*00f8*/ NOP; /* 0x4000000000001de4 */
/*0100*/ ISCADD R3, R2, c[0xe][0x0], 0x2; /* 0x400078000020dc43 */
/*0108*/ IADD R2, R2, 0x1; /* 0x4800c00004209c03 */
/*0110*/ LD R3, [R3]; /* 0x800000000030dc85 */
/*0118*/ ISETP.LT.U32.AND P0, PT, R2, c[0x0][0x14], PT; /* 0x188e40005021dc03 */
/*0120*/ IADD R0, R3, R0; /* 0x4800000000301c03 */
/*0128*/ #P0 BRA 0x100; /* 0x4003ffff400001e7 */
/*0130*/ MOV R2, c[0xe][0x8]; /* 0x2800780020009de4 */
/*0138*/ ST [R2], R0; /* 0x9000000000201c85 */
/*0140*/ EXIT; /* 0x8000000000001de7 */
.....................................
As can be seen, the instructions of justthreadfence() are strictly contained in those of usethreadfence(), while those of usetwokernels_1() and usetwokernels_2() are practically a partitioning of those of usethreadfence(). So, the difference in timings can be ascribed to the kernel launch overhead of the second kernel.
The following code sums every 32 elements in an array into the very first element of each 32-element group:
int i = threadIdx.x;
int warpid = i&31;
if(warpid < 16){
s_buf[i] += s_buf[i+16];__syncthreads();
s_buf[i] += s_buf[i+8];__syncthreads();
s_buf[i] += s_buf[i+4];__syncthreads();
s_buf[i] += s_buf[i+2];__syncthreads();
s_buf[i] += s_buf[i+1];__syncthreads();
}
I thought I could eliminate all the __syncthreads() in the code, since all the operations are done in the same warp. But if I eliminate them, I get garbage results back. It should not affect performance too much, but I want to know why I need __syncthreads() here.
I'm providing an answer here because I think that the above two are not fully satisfactory. The "intellectual property" of this answer belongs to Mark Harris, who pointed out this issue in this presentation (slide 22), and to @talonmies, who pointed this problem out to the OP in the comments above.
Let me first try to summarize what the OP was asking, filtering out his mistakes.
The OP seems to be dealing with the last step of a shared memory reduction, namely the warp reduction by loop unrolling. He is doing something like
template <class T>
__device__ void warpReduce(T *sdata, int tid) {
sdata[tid] += sdata[tid + 32];
sdata[tid] += sdata[tid + 16];
sdata[tid] += sdata[tid + 8];
sdata[tid] += sdata[tid + 4];
sdata[tid] += sdata[tid + 2];
sdata[tid] += sdata[tid + 1];
}
template <class T>
__global__ void reduce4_no_synchthreads(T *g_idata, T *g_odata, unsigned int N)
{
extern __shared__ T sdata[];
unsigned int tid = threadIdx.x; // Local thread index
unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x; // Global thread index - Fictitiously double the block dimension
// --- Performs the first level of reduction in registers when reading from global memory.
T mySum = (i < N) ? g_idata[i] : 0;
if (i + blockDim.x < N) mySum += g_idata[i+blockDim.x];
sdata[tid] = mySum;
// --- Before going further, we have to make sure that all the shared memory loads have been completed
__syncthreads();
// --- Reduction in shared memory. Only half of the threads contribute to reduction.
for (unsigned int s=blockDim.x/2; s>32; s>>=1)
{
if (tid < s) { sdata[tid] = mySum = mySum + sdata[tid + s]; }
// --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
__syncthreads();
}
// --- Single warp reduction by loop unrolling. Assuming blockDim.x >64
if (tid < 32) warpReduce(sdata, tid);
// --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
// individual blocks
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
As pointed out by Mark Harris and @talonmies, the shared memory variable sdata must be declared as volatile to prevent compiler optimizations. So, the right way to define the __device__ function above is:
template <class T>
__device__ void warpReduce(volatile T *sdata, int tid) {
sdata[tid] += sdata[tid + 32];
sdata[tid] += sdata[tid + 16];
sdata[tid] += sdata[tid + 8];
sdata[tid] += sdata[tid + 4];
sdata[tid] += sdata[tid + 2];
sdata[tid] += sdata[tid + 1];
}
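As an aside, on Kepler and newer GPUs this warp-level step can also be written with warp shuffles, which keep the partial sums in registers and need neither shared memory nor volatile. A sketch, assuming CUDA 9 or later for __shfl_down_sync:

// Warp reduction via shuffles: each step halves the number of lanes
// holding partial sums; lane 0 ends up with the warp's total.
template <class T>
__device__ T warpReduceShfl(T val)
{
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_down_sync(0xffffffff, val, offset);
    return val;
}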
Let us now see the disassembled codes corresponding to the two cases above examined, i.e., sdata declared as not volatile or volatile (code compiled for Fermi architecture).
Not volatile
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0010*/ SHL R3, R0, 0x1; /* 0x6000c0000400dc03 */
/*0018*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0020*/ IMAD R3, R3, c[0x0][0x8], R2; /* 0x200440002030dca3 */
/*0028*/ IADD R4, R3, c[0x0][0x8]; /* 0x4800400020311c03 */
/*0030*/ ISETP.LT.U32.AND P0, PT, R3, c[0x0][0x28], PT; /* 0x188e4000a031dc03 */
/*0038*/ ISETP.GE.U32.AND P1, PT, R4, c[0x0][0x28], PT; /* 0x1b0e4000a043dc03 */
/*0040*/ #P0 ISCADD R3, R3, c[0x0][0x20], 0x2; /* 0x400040008030c043 */
/*0048*/ #!P1 ISCADD R4, R4, c[0x0][0x20], 0x2; /* 0x4000400080412443 */
/*0050*/ #!P0 MOV R5, RZ; /* 0x28000000fc0161e4 */
/*0058*/ #!P1 LD R4, [R4]; /* 0x8000000000412485 */
/*0060*/ #P0 LD R5, [R3]; /* 0x8000000000314085 */
/*0068*/ SHL R3, R2, 0x2; /* 0x6000c0000820dc03 */
/*0070*/ NOP; /* 0x4000000000001de4 */
/*0078*/ #!P1 IADD R5, R4, R5; /* 0x4800000014416403 */
/*0080*/ MOV R4, c[0x0][0x8]; /* 0x2800400020011de4 */
/*0088*/ STS [R3], R5; /* 0xc900000000315c85 */
/*0090*/ BAR.RED.POPC RZ, RZ, RZ, PT; /* 0x50ee0000ffffdc04 */
/*0098*/ MOV R6, c[0x0][0x8]; /* 0x2800400020019de4 */
/*00a0*/ ISETP.LT.U32.AND P0, PT, R6, 0x42, PT; /* 0x188ec0010861dc03 */
/*00a8*/ #P0 BRA 0x118; /* 0x40000001a00001e7 */
/*00b0*/ NOP; /* 0x4000000000001de4 */
/*00b8*/ NOP; /* 0x4000000000001de4 */
/*00c0*/ MOV R6, R4; /* 0x2800000010019de4 */
/*00c8*/ SHR.U32 R4, R4, 0x1; /* 0x5800c00004411c03 */
/*00d0*/ ISETP.GE.U32.AND P0, PT, R2, R4, PT; /* 0x1b0e00001021dc03 */
/*00d8*/ #!P0 IADD R7, R4, R2; /* 0x480000000841e003 */
/*00e0*/ #!P0 SHL R7, R7, 0x2; /* 0x6000c0000871e003 */
/*00e8*/ #!P0 LDS R7, [R7]; /* 0xc10000000071e085 */
/*00f0*/ #!P0 IADD R5, R7, R5; /* 0x4800000014716003 */
/*00f8*/ #!P0 STS [R3], R5; /* 0xc900000000316085 */
/*0100*/ BAR.RED.POPC RZ, RZ, RZ, PT; /* 0x50ee0000ffffdc04 */
/*0108*/ ISETP.GT.U32.AND P0, PT, R6, 0x83, PT; /* 0x1a0ec0020c61dc03 */
/*0110*/ #P0 BRA 0xc0; /* 0x4003fffea00001e7 */
/*0118*/ ISETP.GT.U32.AND P0, PT, R2, 0x1f, PT; /* 0x1a0ec0007c21dc03 */
/*0120*/ #P0 BRA.U 0x198; /* 0x40000001c00081e7 */
/*0128*/ #!P0 LDS R8, [R3]; /* 0xc100000000322085 */
/*0130*/ #!P0 LDS R5, [R3+0x80]; /* 0xc100000200316085 */
/*0138*/ #!P0 LDS R4, [R3+0x40]; /* 0xc100000100312085 */
/*0140*/ #!P0 LDS R7, [R3+0x20]; /* 0xc10000008031e085 */
/*0148*/ #!P0 LDS R6, [R3+0x10]; /* 0xc10000004031a085 */
/*0150*/ #!P0 IADD R8, R8, R5; /* 0x4800000014822003 */
/*0158*/ #!P0 IADD R8, R8, R4; /* 0x4800000010822003 */
/*0160*/ #!P0 LDS R5, [R3+0x8]; /* 0xc100000020316085 */
/*0168*/ #!P0 IADD R7, R8, R7; /* 0x480000001c81e003 */
/*0170*/ #!P0 LDS R4, [R3+0x4]; /* 0xc100000010312085 */
/*0178*/ #!P0 IADD R6, R7, R6; /* 0x480000001871a003 */
/*0180*/ #!P0 IADD R5, R6, R5; /* 0x4800000014616003 */
/*0188*/ #!P0 IADD R4, R5, R4; /* 0x4800000010512003 */
/*0190*/ #!P0 STS [R3], R4; /* 0xc900000000312085 */
/*0198*/ ISETP.NE.AND P0, PT, R2, RZ, PT; /* 0x1a8e0000fc21dc23 */
/*01a0*/ #P0 BRA.U 0x1c0; /* 0x40000000600081e7 */
/*01a8*/ #!P0 ISCADD R0, R0, c[0x0][0x24], 0x2; /* 0x4000400090002043 */
/*01b0*/ #!P0 LDS R2, [RZ]; /* 0xc100000003f0a085 */
/*01b8*/ #!P0 ST [R0], R2; /* 0x900000000000a085 */
/*01c0*/ EXIT; /* 0x8000000000001de7 */
Lines /*0128*/-/*0148*/, /*0160*/ and /*0170*/ correspond to the shared memory loads into registers, and line /*0190*/ to the shared memory store from a register. The intermediate lines correspond to the summations, performed in registers. So, the intermediate results are kept in registers (which are private to each thread) and not flushed to shared memory each time, preventing the threads from having full visibility of the intermediate results.
volatile
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0010*/ SHL R3, R0, 0x1; /* 0x6000c0000400dc03 */
/*0018*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0020*/ IMAD R3, R3, c[0x0][0x8], R2; /* 0x200440002030dca3 */
/*0028*/ IADD R4, R3, c[0x0][0x8]; /* 0x4800400020311c03 */
/*0030*/ ISETP.LT.U32.AND P0, PT, R3, c[0x0][0x28], PT; /* 0x188e4000a031dc03 */
/*0038*/ ISETP.GE.U32.AND P1, PT, R4, c[0x0][0x28], PT; /* 0x1b0e4000a043dc03 */
/*0040*/ #P0 ISCADD R3, R3, c[0x0][0x20], 0x2; /* 0x400040008030c043 */
/*0048*/ #!P1 ISCADD R4, R4, c[0x0][0x20], 0x2; /* 0x4000400080412443 */
/*0050*/ #!P0 MOV R5, RZ; /* 0x28000000fc0161e4 */
/*0058*/ #!P1 LD R4, [R4]; /* 0x8000000000412485 */
/*0060*/ #P0 LD R5, [R3]; /* 0x8000000000314085 */
/*0068*/ SHL R3, R2, 0x2; /* 0x6000c0000820dc03 */
/*0070*/ NOP; /* 0x4000000000001de4 */
/*0078*/ #!P1 IADD R5, R4, R5; /* 0x4800000014416403 */
/*0080*/ MOV R4, c[0x0][0x8]; /* 0x2800400020011de4 */
/*0088*/ STS [R3], R5; /* 0xc900000000315c85 */
/*0090*/ BAR.RED.POPC RZ, RZ, RZ, PT; /* 0x50ee0000ffffdc04 */
/*0098*/ MOV R6, c[0x0][0x8]; /* 0x2800400020019de4 */
/*00a0*/ ISETP.LT.U32.AND P0, PT, R6, 0x42, PT; /* 0x188ec0010861dc03 */
/*00a8*/ #P0 BRA 0x118; /* 0x40000001a00001e7 */
/*00b0*/ NOP; /* 0x4000000000001de4 */
/*00b8*/ NOP; /* 0x4000000000001de4 */
/*00c0*/ MOV R6, R4; /* 0x2800000010019de4 */
/*00c8*/ SHR.U32 R4, R4, 0x1; /* 0x5800c00004411c03 */
/*00d0*/ ISETP.GE.U32.AND P0, PT, R2, R4, PT; /* 0x1b0e00001021dc03 */
/*00d8*/ #!P0 IADD R7, R4, R2; /* 0x480000000841e003 */
/*00e0*/ #!P0 SHL R7, R7, 0x2; /* 0x6000c0000871e003 */
/*00e8*/ #!P0 LDS R7, [R7]; /* 0xc10000000071e085 */
/*00f0*/ #!P0 IADD R5, R7, R5; /* 0x4800000014716003 */
/*00f8*/ #!P0 STS [R3], R5; /* 0xc900000000316085 */
/*0100*/ BAR.RED.POPC RZ, RZ, RZ, PT; /* 0x50ee0000ffffdc04 */
/*0108*/ ISETP.GT.U32.AND P0, PT, R6, 0x83, PT; /* 0x1a0ec0020c61dc03 */
/*0110*/ #P0 BRA 0xc0; /* 0x4003fffea00001e7 */
/*0118*/ ISETP.GT.U32.AND P0, PT, R2, 0x1f, PT; /* 0x1a0ec0007c21dc03 */
/*0120*/ SSY 0x1f0; /* 0x6000000320000007 */
/*0128*/ #P0 NOP.S; /* 0x40000000000001f4 */
/*0130*/ LDS R5, [R3]; /* 0xc100000000315c85 */
/*0138*/ LDS R4, [R3+0x80]; /* 0xc100000200311c85 */
/*0140*/ IADD R6, R5, R4; /* 0x4800000010519c03 */
/*0148*/ STS [R3], R6; /* 0xc900000000319c85 */
/*0150*/ LDS R5, [R3]; /* 0xc100000000315c85 */
/*0158*/ LDS R4, [R3+0x40]; /* 0xc100000100311c85 */
/*0160*/ IADD R6, R5, R4; /* 0x4800000010519c03 */
/*0168*/ STS [R3], R6; /* 0xc900000000319c85 */
/*0170*/ LDS R5, [R3]; /* 0xc100000000315c85 */
/*0178*/ LDS R4, [R3+0x20]; /* 0xc100000080311c85 */
/*0180*/ IADD R6, R5, R4; /* 0x4800000010519c03 */
/*0188*/ STS [R3], R6; /* 0xc900000000319c85 */
/*0190*/ LDS R5, [R3]; /* 0xc100000000315c85 */
/*0198*/ LDS R4, [R3+0x10]; /* 0xc100000040311c85 */
/*01a0*/ IADD R6, R5, R4; /* 0x4800000010519c03 */
/*01a8*/ STS [R3], R6; /* 0xc900000000319c85 */
/*01b0*/ LDS R5, [R3]; /* 0xc100000000315c85 */
/*01b8*/ LDS R4, [R3+0x8]; /* 0xc100000020311c85 */
/*01c0*/ IADD R6, R5, R4; /* 0x4800000010519c03 */
/*01c8*/ STS [R3], R6; /* 0xc900000000319c85 */
/*01d0*/ LDS R5, [R3]; /* 0xc100000000315c85 */
/*01d8*/ LDS R4, [R3+0x4]; /* 0xc100000010311c85 */
/*01e0*/ IADD R4, R5, R4; /* 0x4800000010511c03 */
/*01e8*/ STS.S [R3], R4; /* 0xc900000000311c95 */
/*01f0*/ ISETP.NE.AND P0, PT, R2, RZ, PT; /* 0x1a8e0000fc21dc23 */
/*01f8*/ #P0 BRA.U 0x218; /* 0x40000000600081e7 */
/*0200*/ #!P0 ISCADD R0, R0, c[0x0][0x24], 0x2; /* 0x4000400090002043 */
/*0208*/ #!P0 LDS R2, [RZ]; /* 0xc100000003f0a085 */
/*0210*/ #!P0 ST [R0], R2; /* 0x900000000000a085 */
/*0218*/ EXIT; /* 0x8000000000001de7 */
As can be seen from lines /*0130*/-/*01e8*/, now each time a summation is performed the intermediate result is immediately flushed to shared memory for full thread visibility.
Maybe have a look at these slides from Mark Harris; why reinvent the wheel?
www.uni-graz.at/~haasegu/Lectures/GPU_CUDA/Lit/reduction.pdf?page=35
Each reduction step depends on the previous one.
So you can only leave out the synchronization in the last executed steps, where the reduction runs within a single warp of 32 active threads.
One step before that, you need 64 threads, and hence a synchronization, since parallel execution across the two warps is not guaranteed.