CUDA bug with threadIdx?

I think I hit a CUDA bug. Can someone confirm or comment on the code below?
The code will produce different results depending on the "BUG" define: with BUG=0 the result is 8 (correct), while with BUG=1 it is 4 (wrong). The only difference in the code is here:
#if BUG
    unsigned int na=threadIdx.x, nb=threadIdx.y, nc=threadIdx.z;
#else
    unsigned int na=0, nb=0, nc=0;
#endif
I launch only ONE thread, so na==nb==nc==0 in both cases, which I also check with these statements:
assert( na==0 && nb==0 && nc==0 );
printf("INITIAL VALUES: %u %u %u\n",na,nb,nc);
Here is my compilation & run:
nvcc -arch=sm_21 -DBUG=0 -o bug0 bug.cu
nvcc -arch=sm_21 -DBUG=1 -o bug1 bug.cu
./bug0
./bug1
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2012 NVIDIA Corporation
Built on Fri_Sep_21_17:28:58_PDT_2012
Cuda compilation tools, release 5.0, V0.2.1221
nvcc runs with g++-4.6
Finally, here is the test code:
/* Compilation & run
nvcc -arch=sm_21 -DBUG=0 -o bug0 bug.cu
nvcc -arch=sm_21 -DBUG=1 -o bug1 bug.cu
./bug0
./bug1
*/
#include <stdio.h>
#include <assert.h>

__global__
void b(unsigned int *res)
{
#if BUG
    unsigned int na=threadIdx.x, nb=threadIdx.y, nc=threadIdx.z;
#else
    unsigned int na=0, nb=0, nc=0;
#endif
    assert( na==0 && nb==0 && nc==0 );
    printf("INITIAL VALUES: %u %u %u\n",na,nb,nc);
    unsigned int &iter=*res, na_max=2, nb_max=2, nc_max=2;
    iter=0;
    while(true)
    {
        printf("a-iter=%u %u %u %u\n",iter,na,nb,nc);
        if( na>=na_max )
        {
            na = 0;
            nb += blockDim.y;
            printf("b-iter=%u %u %u %u\n",iter,na,nb,nc);
            if( nb>=nb_max )
            {
                printf("c-iter=%u %u %u %u\n",iter,na,nb,nc);
                nb = 0;
                nc += blockDim.z;
                if( nc>=nc_max )
                    break; // end of loop
            }
            else
                printf("c-else\n");
        }
        else
            printf("b-else\n");
        printf("result %u %u %u\n",na,nb,nc);
        iter++;
        na += blockDim.x;
    }
}

int main(void)
{
    unsigned int res, *d_res;
    cudaMalloc(&d_res,sizeof(unsigned int));
    b<<<1,1>>>(d_res);
    cudaMemcpy(&res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost);
    cudaFree(d_res);
    printf("There are %u combinations (correct is 8)\n",res);
    return 0;
}
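Note: to rule out a failed launch or a tripped assert corrupting the result, one could add a minimal error check around the launch (a sketch, not part of the repro above):

// Sketch: basic error checking after the launch (not in the repro above).
b<<<1,1>>>(d_res);
cudaError_t err = cudaGetLastError();   // launch-configuration errors
if (err == cudaSuccess)
    err = cudaDeviceSynchronize();      // errors raised during execution
if (err != cudaSuccess)
    printf("CUDA error: %s\n", cudaGetErrorString(err));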

This appears to be an assembler bug. If I take a simplified version of your example:
template<int bug>
__global__
void b(unsigned int *res)
{
    unsigned int na, nb, nc;
    switch(bug) {
    case 1:
        na=threadIdx.x;
        nb=threadIdx.y;
        nc=threadIdx.z;
        break;
    default:
        na = nb = nc = 0;
        break;
    }
    unsigned int &iter=*res, na_max=2, nb_max=2, nc_max=2;
    iter=0;
    while(true)
    {
        if( na>=na_max )
        {
            na = 0;
            nb += blockDim.y;
            if( nb>=nb_max )
            {
                nb = 0;
                nc += blockDim.z;
                if( nc>=nc_max ) break;
            }
        }
        iter++;
        na += blockDim.x;
    }
}
and instantiate both versions, the PTX emitted appears to be the same with the exception of the use of tid.{xyz} in the version with bug=1 (on the right):
.visible .entry _Z1bILi0EEvPj( .visible .entry _Z1bILi1EEvPj(
.param .u64 _Z1bILi0EEvPj_param_0 .param .u64 _Z1bILi1EEvPj_param_0
) )
{ {
.reg .pred %p<4>; .reg .pred %p<4>;
.reg .s32 %r<28>; .reg .s32 %r<28>;
.reg .s64 %rd<3>; .reg .s64 %rd<3>;
ld.param.u64 %rd2, [_Z1bILi0EEvPj_param_0]; ld.param.u64 %rd2, [_Z1bILi1EEvPj_param_0];
cvta.to.global.u64 %rd1, %rd2; cvta.to.global.u64 %rd1, %rd2;
mov.u32 %r26, 0; .loc 2 11 1
.loc 2 22 1 mov.u32 %r27, %tid.x;
st.global.u32 [%rd1], %r26; .loc 2 12 1
.loc 2 33 1 mov.u32 %r25, %tid.y;
mov.u32 %r1, %ntid.z; .loc 2 13 1
.loc 2 28 1 mov.u32 %r26, %tid.z;
mov.u32 %r2, %ntid.y; mov.u32 %r24, 0;
.loc 2 39 1 .loc 2 22 1
mov.u32 %r3, %ntid.x; st.global.u32 [%rd1], %r24;
mov.u32 %r27, %r26; .loc 2 33 1
mov.u32 %r25, %r26; mov.u32 %r4, %ntid.z;
mov.u32 %r24, %r26; .loc 2 28 1
mov.u32 %r5, %ntid.y;
BB0_1: .loc 2 39 1
.loc 2 25 1 mov.u32 %r6, %ntid.x;
setp.lt.u32 %p1, %r27, 2;
#%p1 bra BB0_4; BB1_1:
.loc 2 25 1
.loc 2 28 1 setp.lt.u32 %p1, %r27, 2;
add.s32 %r25, %r2, %r25; #%p1 bra BB1_4;
.loc 2 30 1
setp.lt.u32 %p2, %r25, 2; .loc 2 28 1
mov.u32 %r27, 0; add.s32 %r25, %r5, %r25;
.loc 2 30 1 .loc 2 30 1
#%p2 bra BB0_4; setp.lt.u32 %p2, %r25, 2;
mov.u32 %r27, 0;
.loc 2 33 1 .loc 2 30 1
add.s32 %r26, %r1, %r26; #%p2 bra BB1_4;
.loc 2 34 1
setp.gt.u32 %p3, %r26, 1; .loc 2 33 1
mov.u32 %r27, 0; add.s32 %r26, %r4, %r26;
mov.u32 %r25, %r27; .loc 2 34 1
.loc 2 34 1 setp.gt.u32 %p3, %r26, 1;
#%p3 bra BB0_5; mov.u32 %r27, 0;
mov.u32 %r25, %r27;
BB0_4: .loc 2 34 1
.loc 2 38 1 #%p3 bra BB1_5;
add.s32 %r24, %r24, 1;
st.global.u32 [%rd1], %r24; BB1_4:
.loc 2 39 1 .loc 2 38 1
add.s32 %r27, %r3, %r27; add.s32 %r24, %r24, 1;
bra.uni BB0_1; st.global.u32 [%rd1], %r24;
.loc 2 39 1
BB0_5: add.s32 %r27, %r6, %r27;
.loc 2 41 2 bra.uni BB1_1;
ret;
} BB1_5:
.loc 2 41 2
ret;
}
The assembler output is another story, however (again bug=0 on the left and bug=1 on the right):
/*0008*/ MOV R1, c [0x0] [0x44]; MOV R1, c [0x0] [0x44];
/*0010*/ MOV R6, c [0x0] [0x140]; MOV R6, c [0x0] [0x140];
/*0018*/ MOV R7, c [0x0] [0x144]; MOV R7, c [0x0] [0x144];
/*0020*/ S2R R0, SR_Tid_X; MOV R0, RZ;
/*0028*/ MOV R4, RZ; MOV R2, RZ;
/*0030*/ S2R R3, SR_Tid_Z; MOV R3, RZ;
/*0038*/ ST.E [R6], RZ; MOV R4, RZ;
/*0048*/ S2R R2, SR_Tid_Y; ST.E [R6], RZ;
/*0050*/ ISETP.LT.U32.AND P0, pt, R0, 0x2, pt; ISETP.LT.U32.AND P0, pt, R2, 0x2, pt;
/*0058*/ SSY 0xd0; #P0 BRA 0xb0;
/*0060*/ #P0 BRA 0xc0; IADD R3, R3, c [0x0] [0x2c];
/*0068*/ IADD R2, R2, c [0x0] [0x2c]; MOV R2, RZ;
/*0070*/ MOV R0, RZ; ISETP.LT.U32.AND P0, pt, R3, 0x2, pt;
/*0078*/ ISETP.LT.U32.AND P0, pt, R2, 0x2, pt; #P0 BRA 0xb0;
/*0088*/ SSY 0xa0; IADD R0, R0, c [0x0] [0x30];
/*0090*/ #P0 BRA 0xc0; MOV R2, RZ;
/*0098*/ IADD.S R3, R3, c [0x0] [0x30]; ISETP.GT.U32.AND P0, pt, R0, 0x1, pt;
/*00a0*/ ISETP.GT.U32.AND P0, pt, R3, 0x1, pt; MOV R3, RZ;
/*00a8*/ MOV R0, RZ; #P0 EXIT;
/*00b0*/ MOV R2, RZ; IADD R4, R4, 0x1;
/*00b8*/ #P0 EXIT; IADD R2, R2, c [0x0] [0x28];
/*00c8*/ IADD.S R4, R4, 0x1; ST.E [R6], R4;
/*00d0*/ ST.E [R6], R4; BRA 0x50;
/*00d8*/ IADD R0, R0, c [0x0] [0x28]; BRA 0xd8;
/*00e0*/ BRA 0x50; NOP CC.T;
/*00e8*/ BRA 0xe8; NOP CC.T;
/*00f0*/ NOP CC.T; NOP CC.T;
/*00f8*/ NOP CC.T; NOP CC.T;
The code on the right lacks two SSY instructions (SSY marks the reconvergence point for a potentially divergent branch), and running it causes the kernel to sit in an infinite loop, which would be consistent with some kind of SIMT correctness problem, like undetected branch divergence or divergence around a synchronisation barrier. What is really interesting is that it hangs when running only a single thread in a single block.
I would suggest filing a bug report on the NVIDIA registered developer site if I were you.

Related

Printing a value entered by user using syscall 3 and syscall 5 EduMIPS64

I am trying to read input from the user and print it.
First I print a prompt, the user enters a value, and I want to print that value back.
.data
params_sys5: .space 8
params_sys3: .space 8
prompt_msg_LBound: .asciiz "Enter lower bound for x,y\n"
prompt_msg_LBound_val: .asciiz "Lower bound for x,y = %d\n"
xyL: .word64 0
prompt_msg_UBound: .asciiz "Enter upper bound for x,y\n"
prompt_msg_UBound_val: .asciiz "Upper bound for x,y = %d\n"
xyU: .word64 0
prompt_msg_UBoundZ: .asciiz "Enter upper bound for z\n"
prompt_msg_UBoundZ_val: .asciiz "Lower bound for z = %d\n"
zU: .word64 0
prompt_msgAns: .asciiz "x = %d, y = %d, z = %d\n"
.word64 0
.word64 0
.word64 0
xyL_Len: .word64 0
xyU_Len: .word64 0
zU_Len: .word64 0
xyL_text: .space 32
xyU_text: .space 32
zU_text: .space 32
ZeroCode: .word64 0x30 ;Ascii '0'
.text
main: daddi r4, r0, prompt_msg_LBound
jal print_string
daddi r8, r0, xyL_text ;r8 = xyL_text
daddi r14, r0, params_sys3
daddi r9, r0, 32
jal read_keyboard_input
sd r1, xyL_Len(r0) ;save first number length
ld r10, xyL_Len(r0) ;n = r10 = length of xyL_text
daddi r17, r0, xyL_text
jal convert_string_to_integer ;r17 = &source string,r10 = string length,returns computed number in r11
sd r11, xyL(r0)
daddi r4, r0, prompt_msg_LBound_val
jal print_string
end: syscall 0
print_string: sw $a0, params_sys5(r0)
daddi r14, r0, params_sys5
syscall 5
jr r31
read_keyboard_input: sd r0, 0(r14) ;read from keyboard
sd r8, 8(r14) ;destination address
sd r9, 16(r14) ;destination size
syscall 3
jr r31
convert_string_to_integer: daddi r13, r0, 1 ;r13 = constant 1
daddi r20, r0, 10 ;r20 = constant 10
movz r11, r0, r0 ;x1 = r11 = 0
ld r19, ZeroCode(r0)
For1: beq r10, r0, EndFor1
dmultu r11, r20 ;lo = x * 10
mflo r11 ;x = r11 = lo = r11 * 10
movz r16, r0, r0 ;r16 = 0
lbu r16, 0(r17) ;r16 = text[i]
dsub r16, r16, r19 ;r16 = text[i] - '0'
dadd r11, r11, r16 ;x = x + text[i] - '0'
dsub r10, r10, r13 ;n--
dadd r17, r17, r13 ;i++
b For1
EndFor1: jr r31
I'm trying to get the first number, the lower bound of x,y.
For example, I type the number 5, and xyL does end up holding 5, but the printed string is:
Enter lower bound for x,y
Lower bound for x,y = 0
How do I print the entered value, and after that do the same with the next string?
Thanks.
Edit:=======================================================================
I changed the .data section by adding another .space 8 to hold the address, and now, instead of jumping to print_string to print the value, I call syscall 5 directly. For example:
prompt_msg_LBound: .asciiz "Enter lower bound for x,y\n"
prompt_msg_LBound_val: .asciiz "Lower bound for x,y = %d\n"
LBound_val_addr: .space 8
xyL: .space 8
and in the .code section:
sd r11, xyL(r0)
daddi r5, r0, prompt_msg_LBound_val
sd r5, LBound_val_addr(r0)
daddi r14 ,r0, LBound_val_addr
syscall 5
But I still want to use print_string to print the string prompt_msg_LBound_val with the user-entered value.
How can I do that?
The print_string sample function in the manual is not meant to be used with placeholders, just with plain strings.
If you add placeholders to the format string, then SYSCALL 5 will keep reading the values for those placeholders from memory. In this case, it just reads and displays the value 0, which by accident is what's in memory.
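In other words, SYSCALL 5 behaves like C's printf: the placeholders consume the memory words laid out after the format-string address, exactly the way printf consumes its extra arguments. As a rough C analogy (illustration only, this is not EduMIPS64 code):

#include <stdio.h>

int main(void)
{
    long value = 5; /* plays the role of the word stored after the
                       format-string address in the .data section */
    printf("Lower bound for x,y = %ld\n", value);
    return 0;
}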
See the printf() example from the manual (slightly updated and annotated) to check how to use placeholders:
.data
format_str: .asciiz "%dth of %s:\n%s version %i.%i.%i is being tested!"
s1: .asciiz "February"
s2: .asciiz "EduMIPS64"
fs_addr: .space 4 ; Will store the address of the format string
.word 10 ; The literal value 10.
s1_addr: .space 4 ; Will store the address of the string "February"
s2_addr: .space 4 ; Will store the address of the string "EduMIPS64"
.word 1 ; The literal value 1.
.word 2 ; The literal value 2.
.word 6 ; The literal value 6.
test:
.code
daddi r5, r0, format_str
sw r5, fs_addr(r0)
daddi r2, r0, s1
daddi r3, r0, s2
sd r2, s1_addr(r0)
sd r3, s2_addr(r0)
daddi r14, r0, fs_addr
syscall 5
syscall 0

CUDA shared memory and warp synchronization

The following host code (test.c) and device code (test0.cu) are intended to give the same result: an inclusive prefix sum of the values 0..31.
test.c
$ cat test.c
#include <stdio.h>
#include <string.h>

int main()
{
    int data[32];
    int dummy[32];
    for (int i = 0; i < 32; i++)
        data[i] = i;
    memcpy(dummy, data, sizeof(data));
    for (int i = 1; i < 32; i++)
        data[i] += dummy[i - 1];
    memcpy(dummy, data, sizeof(data));
    for (int i = 2; i < 32; i++)
        data[i] += dummy[i - 2];
    memcpy(dummy, data, sizeof(data));
    for (int i = 4; i < 32; i++)
        data[i] += dummy[i - 4];
    memcpy(dummy, data, sizeof(data));
    for (int i = 8; i < 32; i++)
        data[i] += dummy[i - 8];
    memcpy(dummy, data, sizeof(data));
    for (int i = 16; i < 32; i++)
        data[i] += dummy[i - 16];
    printf("kernel : ");
    for (int i = 0; i < 32; i++)
        printf("%4i ", data[i]);
    printf("\n");
}
$
test0.cu
$ cat test0.cu
#include <stdio.h>

__global__ void kernel0(int *data)
{
    size_t t_id = threadIdx.x;
    if (1 <= t_id)
        data[t_id] += data[t_id - 1];
    if (2 <= t_id)
        data[t_id] += data[t_id - 2];
    if (4 <= t_id)
        data[t_id] += data[t_id - 4];
    if (8 <= t_id)
        data[t_id] += data[t_id - 8];
    if (16 <= t_id)
        data[t_id] += data[t_id - 16];
}

int main()
{
    int data[32];
    int result[32];
    int *data_d;
    cudaMalloc(&data_d, sizeof(data));
    for (int i = 0; i < 32; i++)
        data[i] = i;
    dim3 gridDim(1);
    dim3 blockDim(32);
    cudaMemcpy(data_d, data, sizeof(data), cudaMemcpyHostToDevice);
    kernel0<<<gridDim, blockDim>>>(data_d);
    cudaMemcpy(result, data_d, sizeof(data), cudaMemcpyDeviceToHost);
    printf("kernel0 : ");
    for (int i = 0; i < 32; i++)
        printf("%4i ", result[i]);
    printf("\n");
}
$
If I compile and run them, they do give the same result as I expected.
$ gcc -o test test.c
$ ./test
kernel : 0 1 3 6 10 15 21 28 36 45 55 66 78 91 105 120 136 153 171 190 210 231 253 276 300 325 351 378 406 435 465 496
$ nvcc -o test_dev0 test0.cu
$ ./test_dev0
kernel0 : 0 1 3 6 10 15 21 28 36 45 55 66 78 91 105 120 136 153 171 190 210 231 253 276 300 325 351 378 406 435 465 496
$
However, if I use shared memory instead of global memory in the device code, as in test1.cu, it gives a different result.
test1.cu
$ cat test1.cu
#include <stdio.h>

__global__ void kernel1(int *data)
{
    __shared__ int data_s[32];
    size_t t_id = threadIdx.x;
    data_s[t_id] = data[t_id];
    if (1 <= t_id)
        data_s[t_id] += data_s[t_id - 1];
    if (2 <= t_id)
        data_s[t_id] += data_s[t_id - 2];
    if (4 <= t_id)
        data_s[t_id] += data_s[t_id - 4];
    if (8 <= t_id)
        data_s[t_id] += data_s[t_id - 8];
    if (16 <= t_id)
        data_s[t_id] += data_s[t_id - 16];
    data[t_id] = data_s[t_id];
}

int main()
{
    int data[32];
    int result[32];
    int *data_d;
    cudaMalloc(&data_d, sizeof(data));
    for (int i = 0; i < 32; i++)
        data[i] = i;
    dim3 gridDim(1);
    dim3 blockDim(32);
    cudaMemcpy(data_d, data, sizeof(data), cudaMemcpyHostToDevice);
    kernel1<<<gridDim, blockDim>>>(data_d);
    cudaMemcpy(result, data_d, sizeof(data), cudaMemcpyDeviceToHost);
    printf("kernel1 : ");
    for (int i = 0; i < 32; i++)
        printf("%4i ", result[i]);
    printf("\n");
}
$
If I compile test1.cu and run it, it gives a different result from test0.cu or test.c.
$ nvcc -o test_dev1 test1.cu
$ ./test_dev1
kernel1 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
$
Is warp synchronization not supposed to work with shared memory?
Some investigation into this issue:
When using CUDA 8.0, if I compile test1.cu with the -arch=sm_61 option (I'm testing with a GTX 1080), it gives the same result as test0.cu and test.c.
$ nvcc -o test_dev1_arch -arch=sm_61 test1.cu
$ ./test_dev1_arch
kernel1 : 0 1 3 6 10 15 21 28 36 45 55 66 78 91 105 120 136 153 171 190 210 231 253 276 300 325 351 378 406 435 465 496
$
But this does not apply to newer versions of CUDA. If I use any version newer than 8.0, the result is different even with the -arch=sm_61 option.
Your device code has undefined behavior due to race conditions in both cases, whether using shared memory or global memory: multiple threads concurrently read and modify the same int object.
Is warp synchronization not supposed to work with shared memory?
I don't see any warp synchronization in your code.
The fact that the hardware executes warps in lock step (which is not necessarily true to begin with) is completely irrelevant, because it is not the hardware that reads your C++ code. It is whatever toolchain you use to translate your C++ code into the machine code that will actually run on your hardware. And C++ compilers are allowed to optimize based on the abstract rules of the C++ language.
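A classic host-side analogy (my own sketch, nothing CUDA-specific): absent any synchronization, the compiler may cache a shared value in a register, so the loop below can legally spin forever even if another thread changes the flag.

// Sketch: a data race the compiler may "break" by caching done in a
// register. Without std::atomic or some other synchronization, this
// loop is undefined behavior and may never terminate.
bool done = false;

void wait_for_done()
{
    while (!done) { }   // may be compiled as: if (!done) for (;;);
}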
Let's look at the machine code that's actually generated for your example (using CUDA 10 here on my machine):
_Z7kernel1Pi:
/*0008*/ MOV R1, c[0x0][0x20] ;
/*0010*/ S2R R9, SR_TID.X ;
/*0018*/ SHL R8, R9.reuse, 0x2 ;
/*0028*/ SHR.U32 R0, R9, 0x1e ;
/*0030*/ IADD R2.CC, R8, c[0x0][0x140] ;
/*0038*/ IADD.X R3, R0, c[0x0][0x144] ;
/*0048*/ LDG.E R0, [R2] ;
/*0050*/ ISETP.NE.AND P0, PT, R9.reuse, RZ, PT ;
/*0058*/ ISETP.GE.U32.AND P1, PT, R9, 0x2, PT ;
/*0068*/ #P0 LDS.U.32 R5, [R8+-0x4] ;
/*0070*/ { ISETP.GE.U32.AND P2, PT, R9.reuse, 0x4, PT ;
/*0078*/ #P1 LDS.U.32 R6, [R8+-0x8] }
/*0088*/ ISETP.GE.U32.AND P3, PT, R9, 0x8, PT ;
/*0090*/ #P2 LDS.U.32 R7, [R8+-0x10] ;
/*0098*/ { ISETP.GE.U32.AND P4, PT, R9, 0x10, PT SLOT 0;
/*00a8*/ #P3 LDS.U.32 R9, [R8+-0x20] SLOT 1 }
/*00b0*/ #P4 LDS.U.32 R10, [R8+-0x40] ;
/*00b8*/ { MOV R4, R0 ;
/*00c8*/ STS [R8], R0 }
/*00d0*/ #P0 IADD R5, R4, R5 ;
/*00d8*/ { #P0 MOV R4, R5 ;
/*00e8*/ #P0 STS [R8], R5 }
/*00f0*/ #P1 IADD R6, R4, R6 ;
/*00f8*/ { #P1 MOV R4, R6 ;
/*0108*/ #P1 STS [R8], R6 }
/*0110*/ #P2 IADD R7, R4, R7 ;
/*0118*/ { #P2 MOV R4, R7 ;
/*0128*/ #P2 STS [R8], R7 }
/*0130*/ #P3 IADD R9, R4, R9 ;
/*0138*/ { #P3 MOV R4, R9 ;
/*0148*/ #P3 STS [R8], R9 }
/*0150*/ #P4 IADD R10, R4, R10 ;
/*0158*/ #P4 STS [R8], R10 ;
/*0168*/ #P4 MOV R4, R10 ;
/*0170*/ STG.E [R2], R4 ;
/*0178*/ EXIT ;
.L_1:
/*0188*/ BRA `(.L_1) ;
.L_14:
As you can see, the compiler (in this particular case, the "culprit" was actually the PTX assembler) has translated your sequence of ifs into a bunch of instructions that set up predicates based on the if conditions. It first fetches all the values it's ever going to need from shared memory into registers using conditional loads. Only after that, it performs all the additions and conditional stores using the already loaded values. This is a perfectly legal interpretation of your C++ code. Since you did not specify any synchronization or memory ordering constraints, the compiler can operate under the assumption that there are no potentially concurrent conflicts, and all these loads and stores can be reordered in whatever way it sees fit.
To fix your code, use explicit warp synchronization:
__global__ void kernel1(int *data)
{
    __shared__ int data_s[32];
    size_t t_id = threadIdx.x;
    data_s[t_id] = data[t_id];
    __syncwarp();
    if (1 <= t_id)
        data_s[t_id] += data_s[t_id - 1];
    __syncwarp();
    if (2 <= t_id)
        data_s[t_id] += data_s[t_id - 2];
    __syncwarp();
    if (4 <= t_id)
        data_s[t_id] += data_s[t_id - 4];
    __syncwarp();
    if (8 <= t_id)
        data_s[t_id] += data_s[t_id - 8];
    __syncwarp();
    if (16 <= t_id)
        data_s[t_id] += data_s[t_id - 16];
    data[t_id] = data_s[t_id];
}
The reason why this problem only manifests starting with CUDA 9.0 is that warp-level synchronization was only really introduced in CUDA 9.0, when Volta and its "independent thread scheduling" made it a necessity. Before CUDA 9.0, warp-synchronous programming was not officially supported, but compilers used to be rather conservative when it came to actually breaking code like the example above. The reason is probably that such "warp-synchronous" programming (note the quotes) was often the only way to even get close to peak performance; there was no real alternative, and thus people were doing it all the time. It still was undefined behavior, though, and NVIDIA kept warning us. It just happened to work in many cases…
It seems that the point I missed was to declare the shared memory with the volatile qualifier. This fixed the issue.
However, as stated in the answer by Michael Kenzel, this kind of implicit warp-synchronous programming should generally be avoided, even though it is used in NVIDIA's own classic parallel reduction slides (on page 22).
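For reference, the volatile variant looks like this (a sketch of that fix; volatile forces every access to data_s to be performed exactly as written, which defeats the load reordering shown above, but it still silently relies on lockstep execution):

__global__ void kernel1_volatile(int *data)
{
    // volatile: each read/write of data_s happens where the source says
    volatile __shared__ int data_s[32];
    size_t t_id = threadIdx.x;
    data_s[t_id] = data[t_id];
    if (1 <= t_id)
        data_s[t_id] += data_s[t_id - 1];
    if (2 <= t_id)
        data_s[t_id] += data_s[t_id - 2];
    if (4 <= t_id)
        data_s[t_id] += data_s[t_id - 4];
    if (8 <= t_id)
        data_s[t_id] += data_s[t_id - 8];
    if (16 <= t_id)
        data_s[t_id] += data_s[t_id - 16];
    data[t_id] = data_s[t_id];
}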
Since future compilers and memory hardware might work differently, it would be dangerous to rely on it. Using __syncwarp(), as in the solution provided by Michael Kenzel, is the better approach. With help from this NVIDIA dev blog article, a safe solution would be:
__global__ void kernel(int *data)
{
    __shared__ int data_s[32];
    size_t t_id = threadIdx.x;
    data_s[t_id] = data[t_id];
    int v = data_s[t_id];
    unsigned mask = 0xffffffff; __syncwarp(mask);
    mask = __ballot_sync(0xffffffff, 1 <= t_id);
    if (1 <= t_id) {
        v += data_s[t_id - 1]; __syncwarp(mask);
        data_s[t_id] = v; __syncwarp(mask);
    }
    mask = __ballot_sync(0xffffffff, 2 <= t_id);
    if (2 <= t_id) {
        v += data_s[t_id - 2]; __syncwarp(mask);
        data_s[t_id] = v; __syncwarp(mask);
    }
    mask = __ballot_sync(0xffffffff, 4 <= t_id);
    if (4 <= t_id) {
        v += data_s[t_id - 4]; __syncwarp(mask);
        data_s[t_id] = v; __syncwarp(mask);
    }
    mask = __ballot_sync(0xffffffff, 8 <= t_id);
    if (8 <= t_id) {
        v += data_s[t_id - 8]; __syncwarp(mask);
        data_s[t_id] = v; __syncwarp(mask);
    }
    mask = __ballot_sync(0xffffffff, 16 <= t_id);
    if (16 <= t_id) {
        v += data_s[t_id - 16]; __syncwarp(mask);
        data_s[t_id] = v;
    }
    data[t_id] = data_s[t_id];
}

LC-3 output streaming random characters

I have been writing this program in assembly language that encrypts or decrypts a string of text. At the end it should simply output the encoded message, but instead I am getting a massive number of random characters. Anyone have any idea what's going on here?
.ORIG x3000
;CLEAR REGISTERS
AGAIN AND R0, R0, 0 ;CLEAR R0
AND R1, R1, 0 ;CLEAR R1
AND R2, R2, 0 ;CLEAR R2
AND R3, R3, 0 ;CLEAR R3
AND R4, R4, 0 ;CLEAR R4
AND R5, R5, 0 ;CLEAR R5
AND R6, R6, 0 ;CLEAR R6
;ENCRYPT/DECRYPT PROMPT
LEA R0, PROMPT_E ;LOADS PROMPT_E INTO R0
PUTS ;PRINTS R0
GETC ;GETS INPUT
OUT ;ECHO TO SCREEN
STI R0, MEMX3100 ;X3100 <- R0
;KEY PROMPT
LEA R0, PROMPT_K ;LOADS PROMPT_E INTO R0
PUTS ;PRINTS R0
GETC ;GETS INPUT
OUT ;ECHO TO SCREEN
STI R0, CYPHERKEY ;X3101 <- R0
;MESSAGE PROMPT
LD R6, MEMX3102 ;R6 <- MEMX3102
LEA R0, PROMPT_M ;LOADS PROMPT_E INTO R0
PUTS ;PRINTS R0
LOOP1 GETC ;GETS INPUT
OUT ;ECHO TO SCREEN
ADD R1, R0, #-10 ;R1 <- R0-10
BRZ NEXT ;BRANCH NEXT IF ENTER
STR R0, R6, #0 ;X3102 <- R0
ADD R6, R6, #1 ;INCREMENT COUNT
LD R2, NUM21 ;R2 <- -12546
ADD R5, R6, R2 ;R5 - R2
STI R5, MEMX4000 ;MEMX4000 <- R5
LD R1, NUM20 ;R1 <- NUM20
ADD R1, R6, R1 ;CHECK FOR 20
BRN LOOP1 ;CREATES WHILE LOOP
;Function choose
NEXT LDI R6, MEMX3100 ;R6 <- X3100
LD R1, NUM68 ;R1 <- -68
ADD R1, R6, R1 ;CHECKS FOR D INPUT
BRZ DECRYPT
;ENCRYPT FUNCTION(DEFAULT)
LD R4, MEMX3102 ;R6 <- X3102
LOOP2 LDR R1, R4, #0 ;R1 <- MEM[R4+0]
LDI R5, ASCII ;R5 <- ASCII
ADD R1, R1, R5 ;STRIPS ASCII
AND R6, R1, #1 ;R6 <- R1 AND #1
BRZ LSBOI ;BRANCH IF LSB = 0
ADD R1, R1, #-1 ;R1 <- R1-1
BRNZP KEYLOAD ;BRANCH TO KEYLOAD
LSBOI ADD R1, R1, #1 ;R1 <- R1+1
KEYLOAD LDI R2, CYPHERKEY ;R2 <- CYPHERKEY
ADD R1, R1, R2 ;R1 <- R1+R2
STR R1, R4, #21 ;MEM[R4+21] <- R1
ADD R4, R4, #1 ;R4 <- R4 + 1
LD R5, MEMX4000 ;R5 <- COUNT
NOT R5, R5 ;NOT R5
ADD R5, R5, R4 ;CHECK FOR NEGATIVE
BRN LOOP2 ;LOOP
BRNZP NEXT2 ;BRANCH WHEN DONE
;DECRYPT FUNCTION
DECRYPT LD R4, MEMX3102 ;R4 <- X3102
LOOP3 LDR R1, R4, #0 ;R1 <- MEM[R4+0]
LDI R5, ASCII ;R5 <- ASCII
ADD R1, R1, R5 ;STRIPS ASCII
LDI R2, CYPHERKEY ;R2 <- CYPHERKEY
NOT R2, R2 ;R2 <- NOT R2
ADD R1, R1, R2 ;R1 <- R1 - CYPHERKEY
AND R6, R1, #1 ;R6 <- R1 AND #1
BRZ LSBOI2 ;BRANCH IF LSB = 0
ADD R1, R1, #-1 ;R1 <- R1-1
BRNZP NEXTTASK1 ;BRANCH TO NEXTTASK1
LSBOI2 ADD R1, R1, #1 ;R1 <- R1+1
NEXTTASK1 STR R1, R4, #21 ;MEM[R4+21] <- R1
ADD R4, R4, #1 ;R4 <- R4 + 1
LD R5, MEMX4000 ;R5 <- COUNT
NOT R5, R5 ;NOT R5
ADD R5, R5, R4 ;CHECK FOR NEGATIVE
BRN LOOP3 ;LOOP
;OUTPUT
NEXT2 LD R4, MEMX3102 ;R4 <- X3102
LOOP4 LDR R0, R4, #21 ;R0 <- [R4+21]
OUT ;PRINT R0
ADD R4, R4, #1 ;R4 <- R4+1
LD R5, MEMX4000 ;R5 <- COUNT
NOT R5, R5 ;NOT R5
ADD R5, R5, R4 ;CHECK FOR NEGATIVE
BRN LOOP4
HALT
MEMX4000 .FILL X4000
ASCII .FILL #-30
NUM21 .FILL #-12546
NUM20 .FILL #-12566
MEMX3102 .FILL X3102
CYPHERKEY .FILL X3101
MEMX3100 .FILL X3100
NUM68 .FILL #-68
NUM32 .FILL #-32
PROMPT_E .STRINGZ "\nTYPE E TO ENCRYPT OR TYPE D TO DECRYPT (UPPER CASE): "
PROMPT_K .STRINGZ "\nENTER THE ENCRYPTION KEY (A SINGLE DIGIT FROM 1 TO 9) "
PROMPT_M .STRINGZ "\nINPUT A MESSAGE OF NO MORE THAN 20 CHARACTERS THEN PRESS <ENTER> "
.END
There are a number of different things going on in your program; here are some of the ones I've found:
The encoding loop loops more times than the number of characters entered
The encryption key is stored and used in its ASCII form
Characters from the user are stored in the middle of the PROMPT_M text
The encoding loop cycles thousands of times
The encoding loop didn't change any of the stored characters at location x3102
The output routine doesn't loop, so it only outputs one char
From what I've seen, your program takes a non-ASCII char from the user, adds it to the ASCII form of the encryption key, and then stores that hundreds of times at every memory offset 21 locations from x3102. When your output routine runs, it pulls the value stored at x3117 and outputs that one char, then halts the program.

How do I do a bitshift right in binary?

Hopefully this is a simple question, but I cannot for the life of me figure out how to do a bitshift in binary. This is being done in the LC-3 environment. I just need to know how to arithmetically divide by two, i.e. shift to the right. I know going left is simple, by just adding the binary value to itself, but I have tried the opposite for a bitshift right (subtracting from itself, NOTing and then subtracting, etc.).
Or if you have a better way to move x00A0 to x000A that would also be fantastic. Thanks!
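For reference, in C notation the goal is just a right shift, which LC-3 has no native instruction for (C sketch, not LC-3):

#include <stdio.h>

int main(void)
{
    unsigned short v = 0x00A0;
    printf("0x%04X\n", (unsigned)(v >> 4));  /* 0x000A: shift right by 4 */
    printf("0x%04X\n", (unsigned)(v >> 1));  /* 0x0050: halving = shift right by 1 */
    return 0;
}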
This is an older post, but I ran into the same issue, so I figured I would post what I've found.
When you have to do a bit-shift to the right you're normally halving the binary number (divide by 2), but that can be a challenge in the LC-3. This is the code I wrote to perform a bit-shift to the right.
; Bit shift to the right
.ORIG x3000
MAIN
LD R3, VALUE
AND R5, R5, #0 ; Reseting our bit counter
B_RIGHT_LOOP
ADD R3, R3, #-2 ; Subtract 2 from the value stored in R3
BRn BR_END ; Exit the loop as soon as the number in R3 has gone negative
ADD R5, R5, #1 ; Add 1 to the bit counter
BR B_RIGHT_LOOP ; Start the loop over again
BR_END
ST R5, ANSWER ; Store the shifted value into the ANSWER variable
HALT ; Stop the program
; Variables
VALUE .FILL x3BBC ; Value is the number we want to do a bit-shift to the right
ANSWER .FILL x0000
.END
Keep in mind that with this code the lowest bit, b[0], is lost. Also, this code doesn't work if the number we are trying to shift is negative, i.e. if bit [15] is set.
Example:
VALUE .FILL x8000 ; binary value = 1000 0000 0000 0000
; and values higher than x8000
; won't work because their 15th
; bit is set
This should at least get you going on the right track.
.ORIG x3000
BR main
;»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»
; UL7AAjr
; shift right register R0
; used registers R1, R2, R3, R4, R5
;»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»
shift_right
AND R4, R4, #0 ; R4 - counter = 15 times
ADD R4, R4, #15
AND R1, R1, #0 ; R1 - temp result
LEA R2, _sr_masks ; R2 - masks pointer
_sr_loop
LDR R3, R2, #0 ; load mask into R3
AND R5, R0, R3 ; check bit in R0
BRZ _sr_zero ; go sr_zero if bit is zero
LDR R3, R2, #1 ; R3 next mask index
ADD R1, R1, R3 ; add mask to temp result
_sr_zero
ADD R2, R2, #1 ; next mask address
ADD R4, R4, #-1 ; all bits done?
BRNP _sr_loop
AND R0, R0, #0 ; R0 = R1
ADD R0, R0, R1
RET
_sr_masks
.FILL x8000
.FILL x4000
.FILL x2000
.FILL x1000
.FILL x0800
.FILL x0400
.FILL x0200
.FILL x0100
.FILL x0080
.FILL x0040
.FILL x0020
.FILL x0010
.FILL x0008
.FILL x0004
.FILL x0002
.FILL x0001
;»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»»
main
LD R0, data
JSR shift_right
HALT
data .FILL xFFFF
.END
; right shift R0 1-bit with sign-extension
; Algorithm: build bits from the MSB one by one
.ORIG x3000
AND R1, R1, #0 ; r1 = 0
ADD R2, R1, #14 ; r2 = 14
ADD R0, R0, #0 ; r0 = r0
BRzp LOOP
ADD R1, R1, #-1 ; r1 = xffff
LOOP ADD R1, R1, R1 ; r1 << 1
ADD R0, R0, R0 ; r0 << 1
BRzp MSB0
ADD R1, R1, #1 ; r1++
MSB0 ADD R2, R2, #-1 ; cnt--
BRp LOOP
ADD R0, R1, #0 ; r0 = r1
HALT
.END
; right shift R0 1-bit with sign-extension
; Algorithm: left-rotate 14 times with proper sign
.ORIG x3000
LD R1, CNT
ADD R2, R0, #0
LOOP ADD R0, R0, R0 ; r0 << 1
BRzp NEXTBIT
ADD R0, R0, #1
NEXTBIT ADD R1, R1, #-1
BRp LOOP
LD R3, MASK
AND R0, R0, R3
ADD R2, R2, #0
BRzp DONE
NOT R3, R3
ADD R0, R0, R3
DONE HALT
MASK .FILL x3FFF
CNT .FILL 14
.END
; right shift R0 1-bit with sign-extension
; Algorithm: look-up table and auto-stop
.ORIG x3000
AND R1, R1, #0 ; r1 = 0
LEA R2, TABLE ; r2 = table[]
AND R0, R0, #-2
LOOP BRzp MSB0
LDR R3, R2, #0 ; r3 = table[r2]
ADD R1, R1, R3 ; r1 += r3
MSB0 ADD R2, R2, #1 ; r2++
ADD R0, R0, R0 ; r0 << 1
BRnp LOOP
ADD R0, R1, #0 ; r0 = r1
HALT
TABLE
.FILL xC000
.FILL x2000
.FILL x1000
.FILL x0800
.FILL x0400
.FILL x0200
.FILL x0100
.FILL x0080
.FILL x0040
.FILL x0020
.FILL x0010
.FILL x0008
.FILL x0004
.FILL x0002
.FILL x0001
.END

Too large a time gap between atomicAdd on different positions of global memory [closed]

The main idea of the kernel is to measure the latency of atomicAdd on global memory, so the first step is to get the baseline latency of atomicAdd for one thread and one block. Why is there such a large time gap between different positions in global memory? The kernel is below:
kernel.h
#ifndef _KERNEL_H_
#define _KERNEL_H_

template <class T,class ITYPE>
__global__ void collision(T * y,T * oldVal,ITYPE * interval,ITYPE * time)
{
    ITYPE warp,vector_lane,thread_lane,thread_id,partial;
    warp = 32;
    vector_lane = (blockDim.x+warp-1)/warp;
    thread_lane = threadIdx.x & (warp-1);
    thread_id = threadIdx.x / warp;
    ITYPE threads = threadIdx.x;
    ITYPE start_time,end_time;
    ITYPE position = 0;
    T value = 1.0;
    T old = 0.0f;
    partial = threadIdx.x & (warp-1);
    start_time = clock();
    //set different value for variable position
    old = atomicAdd(&y[position],value);
    end_time = clock();
    if (thread_lane==0)
        time[blockIdx.x*vector_lane+thread_id]=end_time-start_time;
    oldVal[2]=old;
}

template <class T,class ITYPE>
void __collision__(T * y,T * oldVal,ITYPE * interval,ITYPE * time,ITYPE & number_SM)
{
    const unsigned int THREADS_PER_BLOCK = 1;
    const unsigned int NUM_BLOCKS = 1;
    //get the number of multiprocessors
    ITYPE dev = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    number_SM = deviceProp.multiProcessorCount;
    printf("multiProcessors=%d\n",number_SM);
    if (NUM_BLOCKS<13)
        number_SM = NUM_BLOCKS;
    printf("THREADS_PER_BLOCK=%d\n",THREADS_PER_BLOCK);
    printf("NUM_BLOCKS=%d\n",NUM_BLOCKS);
    collision<T,ITYPE><<<NUM_BLOCKS,THREADS_PER_BLOCK>>>(y,oldVal,interval,time);
}

#endif
The code of collision.cu is below:
#include "run.h"
using namespace std;
typedef float VALUETYPE;
typedef int INDEXTYPE;
int main(int argc, char *args[])
{
    launtch<VALUETYPE,INDEXTYPE>();
}
The code of run.h is below:
#ifndef _RUN_H_
#define _RUN_H_
#include <stdio.h>
#include <iostream>
#include <string>
#include "kernel.h"
#include <shrQATest.h>
#include <shrUtils.h>
#include <helper_cuda.h>
using namespace std;
template <class T,class ITYPE>
void launtch()
{
    const ITYPE LENGTH = 64*208;
    ITYPE number_SM = 1;
    T * y = new T[LENGTH];
    T * oldVal = new T[LENGTH];
    ITYPE * interval = new ITYPE[LENGTH];
    ITYPE * time = new ITYPE[LENGTH];
    memset(y,0.0f,sizeof(T)*LENGTH);
    memset(oldVal,0.0f,sizeof(T)*LENGTH);
    memset(time,0,sizeof(ITYPE)*LENGTH);
    T * dy;
    T * dOldVal;
    ITYPE * dinterval;
    ITYPE * dtime;
    checkCudaErrors(cudaMalloc(&dy,LENGTH*sizeof(T)));
    checkCudaErrors(cudaMalloc(&dOldVal,LENGTH*sizeof(T)));
    checkCudaErrors(cudaMalloc(&dinterval,LENGTH*sizeof(ITYPE)));
    checkCudaErrors(cudaMalloc(&dtime,LENGTH*sizeof(ITYPE)));
    checkCudaErrors(cudaMemcpy(dy,y,sizeof(T)*LENGTH,cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(dOldVal,oldVal,sizeof(T)*LENGTH,cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(dinterval,interval,sizeof(ITYPE)*LENGTH,cudaMemcpyHostToDevice));
    __collision__<T,ITYPE>(dy,dOldVal,dinterval,dtime,number_SM);
    checkCudaErrors(cudaMemcpy(time,dtime,LENGTH*sizeof(ITYPE),cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(y,dy,LENGTH*sizeof(T),cudaMemcpyDeviceToHost));
    ITYPE sum=0,count=0;
    for (ITYPE i=0;i<LENGTH;i++)
    {
        if (time[i]>0)
        {
            sum+=time[i];
            count++;
            cout<<" ["<<i<<"]="<<time[i];
            if (count%10==0)
                cout<<endl;
        }
    }
    cout<<endl<<"number_SM="<<number_SM<<endl;
    cout<<"average="<<sum/number_SM<<endl;
    cout<<"y[2]="<<y[2]<<endl;
}
#endif
The detail of makefile is below:
NVIDIA = /root/NVIDIA_CUDA-5.0_Samples
CUDA = /usr/local/cuda-5.0
#NVIDINCADD = -I$(NVIDIA)/shared/inc -I$(NVIDIA)/C/common/inc
NVIDINCADD = -I$(NVIDIA)/common/inc
CUDAINCADD = -I$(CUDA)/include -I$(CUDA)/shared/inc
CC = -L/usr/lib64 -lstdc++
GCCOPT = -O2 -fno-rtti -fno-exceptions
INTELOPT = -O3 -fno-rtti -xW -restrict -fno-alias
#DEB = -g
#NVCC = -G
#ARCH = -arch=sm_13
ARCH = -arch=sm_35
collision: collision.cu
	nvcc $(DEB) $(NVCC) $(ARCH) -lm $(NVIDINCADD) $(CUDAINCADD) -o $(@) $(<)

clean:
	rm -f collision
	rm -f a.out
If the value of position is 0, time[0] is 46; if position is 2, time[0] is 369. The platform is a K20m with CUDA 5.0.
Wow, that's a huge amount of code, mostly unrelated to what you are trying to show. Next time, try to eliminate the unnecessary parts.
Also, you are passing a float value as the second parameter to memset. memset sets byte quantities and expects an unsigned char value in the second parameter.
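A corrected initialization could look like this (sketch; zero happens to be safe for the float buffers because all-zero bytes represent 0.0f in IEEE-754, but std::fill is the general tool):

#include <cstring>   // memset
#include <algorithm> // std::fill

void init(float *y, float *oldVal, int *time, int length)
{
    // memset writes bytes: int 0 zeroes the arrays, and all-zero bytes
    // also happen to represent 0.0f, so this is fine for floats too.
    memset(y,      0, sizeof(float) * length);
    memset(oldVal, 0, sizeof(float) * length);
    memset(time,   0, sizeof(int)   * length);
    // For any non-zero float fill value, memset is wrong; use std::fill:
    // std::fill(y, y + length, 1.0f);
}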
With your code, I was able to reproduce some variation between a position value of 0 and 2. For the 0 case I got a time of 76 and for the 2 case I got a time of 118, so not as large as your variation.
However, because you are making the change and then re-compiling the code, the compiler can emit different instruction streams for each case, making the results appear to be different.
I would suggest trying this code instead:
#include <iostream>

#define DWIDTH 32

typedef float mytype;

template <typename T>
__global__ void collision(int *time, T *data, T *old ){
    for (int i = 0; i < DWIDTH; i++){
        unsigned long start_time = clock64();
        T my_old = atomicAdd(data+i, (T) 1);
        unsigned long end_time = clock64();
        time[i] = end_time - start_time;
        old[i] = my_old;
    }
}

int main(){
    mytype *h_data, *d_data;
    int *h_time, *d_time;
    mytype *h_old, *d_old;
    cudaMalloc((void **)&d_time, DWIDTH*sizeof(int));
    h_time = (int *)malloc(DWIDTH*sizeof(int));
    cudaMalloc((void **)&d_data, DWIDTH*sizeof(mytype));
    h_data = (mytype *)malloc(DWIDTH*sizeof(mytype));
    cudaMalloc((void **)&d_old, DWIDTH*sizeof(mytype));
    h_old = (mytype *)malloc(DWIDTH*sizeof(mytype));
    for (int i=0; i<DWIDTH; i++){
        h_time[i] = 0;
        h_data[i] = (mytype) 0;
    }
    cudaMemcpy(d_data, h_data, DWIDTH*sizeof(mytype), cudaMemcpyHostToDevice);
    cudaMemcpy(d_time, h_time, DWIDTH*sizeof(int), cudaMemcpyHostToDevice);
    collision<<<1,1>>>(d_time, d_data, d_old);
    cudaMemcpy(h_time, d_time, DWIDTH*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_data, d_data, DWIDTH*sizeof(mytype), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_old, d_old, DWIDTH*sizeof(mytype), cudaMemcpyDeviceToHost);
    std::cout << "times:" << std::endl;
    for (int i = 0; i < DWIDTH; i++)
        std::cout << h_time[i] << " ";
    std::cout << std::endl << "data:" << std::endl;
    for (int i = 0; i < DWIDTH; i++)
        std::cout << h_data[i] << " ";
    std::cout << std::endl << "old:" << std::endl;
    for (int i = 0; i < DWIDTH; i++)
        std::cout << h_old[i] << " ";
    std::cout << std::endl;
    return 0;
}
When I compile for sm_35 and run on a K20m, I get:
$ nvcc -arch=sm_35 -o t284 t284.cu
$ ./t284
times:
98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98 98
data:
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
old:
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
$
The benefit of this code is that the compiler doesn't have the opportunity to emit a different instruction stream based on whether I set position to 0 or 2. Therefore I get consistent results.
With your code, when I compile with a non-zero value for position (and sm_20) I get sass like this:
/*0038*/ /*0x40011c042c000001*/ S2R R4, SR_ClockLo;
/*0040*/ /*0x04411e036000c000*/ SHL.W R4, R4, 0x1;
/*0048*/ /*0x80015de428004000*/ MOV R5, c [0x0] [0x20];
/*0050*/ /*0x10519c034801c000*/ IADD R6.CC, R5, 0x4;
/*0058*/ /*0x00015de218fe0000*/ MOV32I R5, 0x3f800000;
/*0060*/ /*0x93f1dc4348004000*/ IADD.X R7, RZ, c [0x0] [0x24];
/*0068*/ /*0x00615e056c7e2800*/ ATOM.E.ADD.F32.FTZ.RN R5, [R6], R5;
/*0070*/ /*0x40019c042c000001*/ S2R R6, SR_ClockLo;
When I compile with a zero value for position (and sm_20) I get sass like this:
/*0048*/ /*0x40019c042c000001*/ S2R R6, SR_ClockLo;
/*0050*/ /*0x04619e036000c000*/ SHL.W R6, R6, 0x1;
/*0058*/ /*0x0001dde218fe0000*/ MOV32I R7, 0x3f800000;
/*0060*/ /*0x0021de056c7e1000*/ ATOM.E.ADD.F32.FTZ.RN R2, [R2], R7;
/*0068*/ /*0x4000dc042c000001*/ S2R R3, SR_ClockLo;
And so we can see that with your code, the value of position may have an impact on the code generated, and therefore the timing.
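One further refinement (a sketch going beyond the code above): the two clock64() reads themselves cost cycles, so the measurement overhead can be calibrated with an empty timed region and subtracted from the atomic timings.

#include <cstdio>

// Sketch: time an empty region to measure the cost of clock64() itself.
__global__ void clock_overhead(long long *out)
{
    long long t0 = clock64();
    long long t1 = clock64();   // nothing in between
    *out = t1 - t0;             // pure measurement overhead, in cycles
}

int main()
{
    long long h_ovh, *d_ovh;
    cudaMalloc(&d_ovh, sizeof(long long));
    clock_overhead<<<1,1>>>(d_ovh);
    cudaMemcpy(&h_ovh, d_ovh, sizeof(long long), cudaMemcpyDeviceToHost);
    std::printf("clock64 overhead: %lld cycles\n", h_ovh);
    cudaFree(d_ovh);
    return 0;
}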