R0 is violated after function returns - function

I am implementing a UART queue on an S3C44B0X (ARM7TDMI): the UART0 ISR enqueues each received character while the main loop dequeues it. However, the value returned (in R0) by the dequeue is sometimes not the one that was taken from the queue, and I found that R0 is corrupted after returning from the dequeue function (the input is a continuous stream of 'v', and test() is called from the main loop):
I would appreciate your help.
/*
 * Remove and return the character at gwCliQTail from the CLI queue.
 *
 * Returns 0 when gwCliQSize is 0, so a queued 0 byte cannot be told apart
 * from "queue empty" by the caller.
 *
 * NOTE(review): gwCliQTail and gwCliQSize are updated in separate steps and
 * cliEnqueue() also modifies gwCliQSize; if cliEnqueue() runs in an ISR (as
 * the post states), confirm whether interrupts must be masked here.
 */
CHAR cliDequeue(void)
{
CHAR bTmpCh;
/* Nothing queued: report "empty" as 0. */
if (gwCliQSize == 0)
{
return 0;
}
bTmpCh = gabCliQ[gwCliQTail]; /* char is enqueued in the Q in ISR */
/* Advance the read index and wrap it at MAX_CLI_QUEUE_LEN. */
gwCliQTail++;
gwCliQTail %= MAX_CLI_QUEUE_LEN;
ASSERT(gwCliQSize > 0);
/* Read-modify-write of the element count. */
gwCliQSize--;
/* Debug check for the test scenario, in which only 'v' is ever sent. */
ASSERT(bTmpCh == 'v'); /* will not assert */
//uartPutChar(bTmpCh);
return bTmpCh;
}
/*
 * Test hook: take one character from the CLI queue and, if cliDequeue()
 * returned something other than 0, echo it with uartPutChar().
 */
void test(void)
{
CHAR bTestCh;
bTestCh = cliDequeue();
/* 0 is cliDequeue()'s "queue empty" value. */
if (bTestCh != 0)
{
/* Same 'v'-only debug check as inside cliDequeue(), applied to the returned value. */
ASSERT(bTestCh == 'v'); /* assert here ! */
uartPutChar(bTestCh);
}
}

We don't have enough information / context to answer definitively. It would also be helpful if you posted the corresponding assembly code so that we could see how/when things are moved in & out of R0. Regardless, a few things spring to mind immediately from your posted C code.
(0) Are the variables shared between interrupts & the main loop declared as volatile?
(1)
In CliDequeue, you're accessing an array which is shared with an ISR. It appears to be a single reader / single writer construct, so that isn't automatically bad, but your housekeeping isn't airtight.
For example, one invariant you must be sure to satisfy is that the queue size & tail pointer are in sync. Yet, unless this routine is called with interrupts disabled, your tail pointer & queue size aren't adjusted as a single transaction.
(2)
Furthermore, I'd guess that gwCliQSize is also adjusted in the interrupt (incremented in ISR, decremented in the application). Another race condition. To perform gwCliQSize--, behind the scenes you are probably reading from memory to a register, decrementing the register, then writing it back. What happens if you read 5 from memory into R1, then an interrupt fires and increments the value in memory to 6, then you exit the ISR, decrement the register, and write it back (with a value of 4)? The ISR's increment is lost.
(3)
Lastly, it's possible (although not too likely) that bTmpCh or bTestCh are stored on the stack, and that your stack is getting corrupted / slammed by another task / interrupt / etc. So when your assert fails, you're thinking it's R0 that is corrupted, but really it could be that the value moved into R0 before return, or the value moved out of R0 into a stack variable, is getting clobbered.
I've nattered on enough. There are other possibilities but from what you've posted (and not posted) it's impossible to say for sure.
P.S. If you've used a debugger and it's really & literally R0's value that is getting corrupted, not just the value of the character in the queue, that points to a problem in your scheduler / context switcher / ISR pre- or post-amble etc...

here is the assembly code:
for the test():
0x00001308 E92D4010 STMDB R13!,{R4,R14}
37: bTestCh = cliDequeue();
38:
0x0000130C EB000207 BL cliDequeue(0x00001B30)
0x00001310 E1A04000 MOV R4,R0
39: if (bTestCh != 0)
40: {
0x00001314 E3540000 CMP R4,#pTest(0x00000000)
0x00001318 0A000007 BEQ 0x0000133C
41: ASSERT(bTestCh == 'v');
0x0000131C E1A00000 NOP
0x00001320 E3540076 CMP R4,#0x00000076
0x00001324 0A000001 BEQ 0x00001330
0x00001328 E1A00000 NOP
0x0000132C EAFFFFFE B 0x0000132C
0x00001330 E1A00000 NOP
42: uartPutChar(bTestCh);
43: }
0x00001334 E1A00004 MOV R0,R4
0x00001338 EB00014A BL uartPutChar(0x00001868)
44: }
45:
46: int main(void)
0x0000133C E8BD4010 LDMIA R13!,{R4,R14}
0x00001340 E12FFF1E BX R14
for the cliDequeue(), BTW, gwCliQSize is defined as
UINT32 volatile gwCliQSize;
0x00001B30 E59F00D4 LDR R0,[PC,#0x00D4]
0x00001B34 E5900000 LDR R0,[R0]
0x00001B38 E3500000 CMP R0,#pTest(0x00000000)
0x00001B3C 1A000001 BNE 0x00001B48
78: return 0;
79: }
80:
81: bTmpCh = gabCliQ[gwCliQTail];
82: gwCliQTail++;
83: gwCliQTail %= MAX_CLI_QUEUE_LEN;
84: ASSERT(gwCliQSize > 0);
85: gwCliQSize--;
86:
87: //chCheck(bTmpCh);
88: ASSERT(bTmpCh == 'v'); /* will not assert */
89: //uartPutChar(bTmpCh);
90:
91: return bTmpCh;
0x00001B40 E3A00000 MOV R0,#pTest(0x00000000)
92: }
93:
94:
95: void cliQInit(void)
0x00001B44 E12FFF1E BX R14
81: bTmpCh = gabCliQ[gwCliQTail];
0x00001B48 E59F00C0 LDR R0,[PC,#0x00C0]
0x00001B4C E59F20C4 LDR R2,[PC,#0x00C4]
0x00001B50 E5922000 LDR R2,[R2]
0x00001B54 E7D01002 LDRB R1,[R0,R2]
82: gwCliQTail++;
0x00001B58 E59F00B8 LDR R0,[PC,#0x00B8]
0x00001B5C E5900000 LDR R0,[R0]
0x00001B60 E2800001 ADD R0,R0,#0x00000001
0x00001B64 E59F20AC LDR R2,[PC,#0x00AC]
0x00001B68 E5820000 STR R0,[R2]
83: gwCliQTail %= MAX_CLI_QUEUE_LEN;
0x00001B6C E2820000 ADD R0,R2,#pTest(0x00000000)
0x00001B70 E5900000 LDR R0,[R0]
0x00001B74 E20000FF AND R0,R0,#0x000000FF
0x00001B78 E5820000 STR R0,[R2]
84: ASSERT(gwCliQSize > 0);
0x00001B7C E1A00000 NOP
0x00001B80 E59F0084 LDR R0,[PC,#0x0084]
0x00001B84 E5900000 LDR R0,[R0]
0x00001B88 E3500000 CMP R0,#pTest(0x00000000)
0x00001B8C 1A000001 BNE 0x00001B98
0x00001B90 E1A00000 NOP
0x00001B94 EAFFFFFE B 0x00001B94
0x00001B98 E1A00000 NOP
85: gwCliQSize--;
86:
87: //chCheck(bTmpCh);
0x00001B9C E59F0068 LDR R0,[PC,#0x0068]
0x00001BA0 E5900000 LDR R0,[R0]
0x00001BA4 E2400001 SUB R0,R0,#0x00000001
0x00001BA8 E59F205C LDR R2,[PC,#0x005C]
0x00001BAC E5820000 STR R0,[R2]
88: ASSERT(bTmpCh == 'v'); /* will not assert */
89: //uartPutChar(bTmpCh);
90:
0x00001BB0 E1A00000 NOP
0x00001BB4 E3510076 CMP R1,#0x00000076
0x00001BB8 0A000001 BEQ 0x00001BC4
0x00001BBC E1A00000 NOP
0x00001BC0 EAFFFFFE B 0x00001BC0
0x00001BC4 E1A00000 NOP
91: return bTmpCh;
92: }
93:
94:
95: void cliQInit(void)
0x00001BC8 E1A00001 MOV R0,R1
0x00001BCC EAFFFFDC B 0x00001B44
for cliEnqueue:
/*
 * Store bC at gabCliQ[gwCliQHeader], advance gwCliQHeader (wrapping at
 * MAX_CLI_QUEUE_LEN) and increment gwCliQSize.
 *
 * bC: the character to append.
 *
 * NOTE(review): a full queue only triggers ASSERT(0); if ASSERT returns or is
 * compiled out, the code falls through and stores the character anyway -
 * confirm that is intended.
 */
void cliEnqueue(CHAR bC)
{
/* Queue already holds MAX_CLI_QUEUE_LEN characters. */
if (gwCliQSize == MAX_CLI_QUEUE_LEN)
{
ASSERT(0);
}
gabCliQ[gwCliQHeader] = bC;
/* Advance the write index and wrap it. */
gwCliQHeader++;
gwCliQHeader %= MAX_CLI_QUEUE_LEN;
/* Read-modify-write of the element count that cliDequeue() decrements. */
gwCliQSize++;
}
assembly:
0x00001A5C E59F11AC LDR R1,[PC,#0x01AC]
0x00001A60 E59F21AC LDR R2,[PC,#0x01AC]
0x00001A64 E5922000 LDR R2,[R2]
0x00001A68 E7C10002 STRB R0,[R1,R2]
25: gwCliQHeader++;
0x00001A6C E59F11A0 LDR R1,[PC,#0x01A0]
0x00001A70 E5911000 LDR R1,[R1]
0x00001A74 E2811001 ADD R1,R1,#0x00000001
0x00001A78 E59F2194 LDR R2,[PC,#0x0194]
0x00001A7C E5821000 STR R1,[R2]
26: gwCliQHeader %= MAX_CLI_QUEUE_LEN;
0x00001A80 E2821000 ADD R1,R2,#pTest(0x00000000)
0x00001A84 E5911000 LDR R1,[R1]
0x00001A88 E20110FF AND R1,R1,#0x000000FF
0x00001A8C E5821000 STR R1,[R2]
27: gwCliQSize++;
0x00001A90 E59F1174 LDR R1,[PC,#0x0174]
0x00001A94 E5911000 LDR R1,[R1]
0x00001A98 E2811001 ADD R1,R1,#0x00000001
0x00001A9C E59F2168 LDR R2,[PC,#0x0168]
0x00001AA0 E5821000 STR R1,[R2]
28: }
29:
30:
31: static void chCheck(CHAR cTmpChar)
32: {
0x00001AA4 E12FFF1E BX R14
0), 1) gwCliQSize and the array are shared between the ISR and the main loop.
2) gwCliQSize is defined as volatile.
3) From the assembly, bTestCh is R4 (moved from R0) and bTmpCh is R1 (moved to R0 before the B).
4) I am using the J-LINK, but without J-LINK (running from flash) the problem still occurs.

Related

Is there a way to access value of constant memory bank in CUDA

I have been trying to debug cuda programs that use inline PTX assembly. Specifically, I am debugging at the instruction level, and am trying to determine the values of arguments to the instructions. Occasionally, the disassembly includes a reference to constant memory. I am trying to have gdb print the value of this constant memory, but have not found any documentation that shows how to do this.
For instance, a disassembly includes
IADD R0, R0, c[0x0] [0x148]
I want to determine how to have gdb print the value of c[0x0] [0x148]. I have tried using print * (@constant) ... but this does not seem to work (I pass 0x148 here and it prints out nothing). Is this possible to do in cuda-gdb?
I have tried to avoid this by passing the compiler option --disable-optimizer-constants during compilation, but this does not work.
The way to do this is to
print *(void * @parameter *) addr
where addr is the address inside the constant bank 0 that should be printed.
Example
Suppose we have a simple kernel in a file called foo.cu:
#include <cuda.h>
#include <stdio.h>
#include <cuda_runtime.h>
// Device kernel: writes the sum a + b to the int that d points to.
// Every thread that runs this kernel writes the same location *d.
__global__ void myKernel(int a, int b, int *d)
{
*d = a + b;
}
// Host driver: launches myKernel<<<1, 1>>> with a = argv[1] and b = argv[2]
// (parsed with atoi), copies the single int result back and prints it.
//
// Returns 0 after printing the result (or the usage message), and 1 if any
// CUDA call fails, so that a failed launch never prints an indeterminate d.
int main(int argc, char *argv[]) {
    if (argc < 3) {
        printf("Requires inputs a and b to be specified\n");
        return 0;
    }

    int *dev_d = NULL;
    int d = 0;

    cudaError_t err = cudaMalloc(&dev_d, sizeof(*dev_d));
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    myKernel<<<1, 1>>>(atoi(argv[1]), atoi(argv[2]), dev_d);

    // Launch-configuration errors are reported by cudaGetLastError();
    // errors raised while the kernel runs surface from the blocking cudaMemcpy.
    err = cudaGetLastError();
    if (err == cudaSuccess) {
        err = cudaMemcpy(&d, dev_d, sizeof(d), cudaMemcpyDeviceToHost);
    }

    // Release the device buffer on both the success and the failure path.
    cudaFree(dev_d);

    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    printf("D is: %d\n", d);
    return 0;
}
which is compiled via
$ nvcc foo.cu -o foo.out
Next, suppose we are interested in disassembling this program, so we execute cuda-gdb with a command-line for our program:
$ cuda-gdb --args ./foo.out 10 15
Inside cuda-gdb, we get to the kernel by typing
(cuda-gdb) set cuda break_on_launch application
(cuda-gdb) start
Temporary breakpoint 1, 0x000055555555b12a in main ()
(cuda-gdb) cont
Inside the kernel, we view the disassembly we are interested in debugging:
(cuda-gdb) x/15i $pc
=> 0x555555b790a8 <_Z8myKerneliiPi+8>: MOV R1, c[0x0][0x20]
0x555555b790b0 <_Z8myKerneliiPi+16>: MOV R0, c[0x0][0x144]
0x555555b790b8 <_Z8myKerneliiPi+24>: MOV R2, c[0x0][0x148]
0x555555b790c0 <_Z8myKerneliiPi+32>:
0x555555b790c8 <_Z8myKerneliiPi+40>: MOV R3, c[0x0][0x14c]
0x555555b790d0 <_Z8myKerneliiPi+48>: IADD R0, R0, c[0x0][0x140]
0x555555b790d8 <_Z8myKerneliiPi+56>: STG.E [R2], R0
0x555555b790e0 <_Z8myKerneliiPi+64>:
0x555555b790e8 <_Z8myKerneliiPi+72>: NOP
0x555555b790f0 <_Z8myKerneliiPi+80>: NOP
0x555555b790f8 <_Z8myKerneliiPi+88>: NOP
0x555555b79100 <_Z8myKerneliiPi+96>:
0x555555b79108 <_Z8myKerneliiPi+104>: EXIT
0x555555b79110 <_Z8myKerneliiPi+112>: BRA 0x70
0x555555b79118 <_Z8myKerneliiPi+120>: NOP
The second argument being passed to the IADD instruction is in one of the constant memory banks. Let's find out what its value actually is. We advance to the IADD instruction:
(cuda-gdb) stepi 4
0x0000555555b790d0 in myKernel(int, int, int*)<<<(1,1,1),(1,1,1)>>> ()
(cuda-gdb) x/i $pc
=> 0x555555b790d0 <_Z8myKerneliiPi+48>: IADD R0, R0, c[0x0][0x140]
We can now obtain the contents of c[0x0][0x140] as follows:
(cuda-gdb) print (int) *(void * @parameter *) 0x140
$1 = 10
Here, we knew the argument should have 32 bits, so we cast it as a (32-bit) int. If we hadn't done this, we would get too many bits, e.g.:
(cuda-gdb) print *(void * @parameter *) 0x140
$2 = 0xf0000000a
Note the hexadecimal format can be retained by adding /x after the print command:
(cuda-gdb) print/x (int) *(void * @parameter *)0x140
$3 = 0xa

Two functions/subroutines in ARM assembly language

I am stuck with an exercise of ARM.
The following program should calculate the result of 2((x-1)^2 + 1) but there is a mistake in the program that leads it into an infinite loop.
I think that I still don't understand completely subroutines and for this reason I am not seeing where the mistake is.
@ Entry point: calls g with r0 = 4, then issues system call number 1 via swi.
_start:
mov r0, #4              @ argument x = 4
bl g                    @ call g; bl places the return address in lr
mov r7, #1              @ system call number 1
swi #0                  @ trap into the kernel
@ f: r0 = r0 * r0 + 1 (r1 is used as scratch); returns by copying lr to pc.
f:
mul r1, r0, r0
add r0, r1, #1
mov pc, lr
@ g: subtracts 1 from r0, calls f, doubles the result; returns by copying lr to pc.
g:
sub r0, r0, #1
bl f                    @ nested call; bl writes a new return address into lr
add r0, r0, r0
mov pc, lr
The infinite loop starts in subroutine g: in the line of mov pc, lr and instead of returning to _start it goes to the previous line add r0, r0, r0 and then again to the last line of subroutine g:.
So I guess that the problem is the last line of subroutine g: but I can't find the way to return to _start without using mov pc, lr. I mean, this should be the command used when we have a branch with link.
Also, in this case r0 = 4, so the result of the program should be 20.
This is because you don't save lr on the stack prior to calling f, and the initial return address was therefore lost: if you only have one level of subroutine calls, using lr without saving it is fine, but if you have more than one, you need to preserve the previous value of lr.
For example, when compiling this C example using Compiler Explorer with ARM gcc 4.56.4 (Linux), and options -mthumb -O0,
/* Leaf function of the example: empty body, calls nothing. */
void f()
{
}
/* Middle level of the example call chain: its only statement is a call to f(). */
void g()
{
f();
}
/* Outermost level of the example call chain: its only statement is a call to g(). */
void start()
{
g();
}
The generated code will be:
f():
push {r7, lr}
add r7, sp, #0
mov sp, r7
pop {r7, pc}
g():
push {r7, lr}
add r7, sp, #0
bl f()
mov sp, r7
pop {r7, pc}
start():
push {r7, lr}
add r7, sp, #0
bl g()
mov sp, r7
pop {r7, pc}
If you were running this on bare metal, not under Linux, you'd need your stack pointer to be initialized to a correct value.
Assuming you are running from RAM on a bare-metal system/simulator, you could setup a minimal stack of 128 bytes:
.text
.balign 8
_start:
adr r0, . + 128 // set top of stack at _start + 128
mov sp, r0
...
But it looks like you're writing a Linux executable that exits with a swi/r7=1 exit system call. So don't do that, it would make your program crash when it tries to write to the stack.

Assembly how to return from a function call when a condition is met

So I'm learning assembly and I've just learned how to create a function: I create a label, use BL to branch to the label, then use BX LR to return from the function call.
I know LR stores the address of the instruction after the BL (the BL's address + 4 bytes); the program counter then moves to the address of the label and increments through the code until it reaches BX LR, at which point the PC is set to the address held in LR.
My question is if I have the following pseudo code:
func initArr()
for(i = 0; i < max; i++)
arr[i] = i
return;
How can I translate this to assembly. As I know BX LR returns from the function, however I want to return once I find out I >= max. Would this work?:
.global _start
_start:
LDR R1, =A
MOV R2, #0x00
BL _initArr
_end:
MOV R1, #0
MOV R7, #1
SWI 0
_initArr:
CMP R2, #MAX
BEQ LR #this
STR R2, [R1]
ADD R1, #0x04
ADD R2, #0x01
B _initArr
.data
.equ MAX, 10
A: rept MAX
byte 0x00
endr
To be more specific:
Do conditional branches paired with LR work in the same manner as BX LR?
(optional) is the layout of my code correct

Function call with more than 4 registers ARM assembly

I am trying to pass r0-r5 into the function check. However only the registers r0-r3 are copied by reference. In my main function i have this code.
push {lr}
mov r0, #1
mov r1, #2
mov r2, #3
mov r3, #4
mov r4, #5
mov r5, #6
bl check
pop {lr}
bx lr
Inside my check function i have this code. This is in a separate file also not sure if that matters
m: .asciz "%d, %d ~ (%d, %d, %d)
...
push {lr}
ldr r0, =m
bl printf
pop {lr}
bx lr
The output for this is 2, 3 ~ (4, 33772, 1994545180). I am trying to learn assembly, so can you please explain the answer? From some googling I know I need to use the stack, but I am not sure how to use it and would like to learn how. Thanks in advance.
you could just try it and see
void check ( unsigned int, unsigned int, unsigned int, unsigned int, unsigned int );
/* Calls check() with five integer arguments, so the compiled output shows how a fifth argument is passed. */
void call_check ( void )
{
check(1,2,3,4,5);
}
arm-linux-gnueabi-gcc -c -O2 check.c -o check.o
arm-linux-gnueabi-objdump -D check.o
00000000 <call_check>:
0: e52de004 push {lr} ; (str lr, [sp, #-4]!)
4: e3a03005 mov r3, #5
8: e24dd00c sub sp, sp, #12
c: e58d3000 str r3, [sp]
10: e3a00001 mov r0, #1
14: e3a01002 mov r1, #2
18: e3a02003 mov r2, #3
1c: e3a03004 mov r3, #4
20: ebfffffe bl 0 <check>
24: e28dd00c add sp, sp, #12
28: e8bd8000 ldmfd sp!, {pc}
now of course this could be hand optimized and still work just fine. Maybe they are keeping the stack aligned on a 16 byte/4 word/64 bit boundary is the reason for the additional 12 byte modification to the stack pointer? dont know. but other than that you can see that you naturally need to save the link register since you are calling another function. r0 - r3 are obvious and then per the eabi the first thing on the stack is the 5th word worth of parameters.
Likewise, for your check function you can simply let the compiler get you started. If you look at your code, r0 comes in as your first parameter, and then you trash it by changing it to the first parameter for printf (the format string). printf needs six parameters here, so you need to shift everything over by one: the first parameter to check becomes the second parameter to printf, the second parameter to check becomes the third parameter to printf, and so on. The code has to perform that shift, and two of the values now have to go on the stack.

LLVM use of carry and zero flags

I'm starting to read LLVM docs and IR documentation.
In common architectures, an asm cmp instruction "result" value is -at least- 3 bits long, let's say the first bit is the SIGN flag, the second bit is the CARRY flag and the third bit is the ZERO flag.
Question 1)
Why is the IR icmp instruction's result value only an i1? (You can choose only one flag.)
Why doesn't IR define, let's call it, an icmp2 instruction returning an i3 holding the SIGN, CARRY and ZERO flags?
This i3 value can be acted upon with a switch instruction, or maybe a specific br2 instruction, like:
%result = cmp2 i32 %a, i32 %b
br2 i3 %result onzero label %EQUAL, onsign label %A_LT_B
#here %a GT %b
Question 2)
Does this make sense? Could this br2 instruction help create new optimizations, i.e. remove all jmps? Is it necessary, or are the performance gains negligible?
The reason I'm asking this -besides not being an expert in LLVM- is because in my first tests I was expecting some kind of optimization to be made by LLVM in order to avoid making the comparison twice and also avoid all branches by using asm conditional-move instructions.
My Tests:
I've compiled with clang-LLVM this:
#include <stdlib.h>
#include <inttypes.h>
typedef int32_t i32;
/* Three-way comparison: returns 1 if a > b, -1 if a < b, and 0 if they are equal. */
i32 compare (i32 a, i32 b){
// return (a - b) & 1;
if (a>b) return 1;
if (a<b) return -1;
return 0;
}
/*
 * Benchmark driver: seeds rand() with 0, calls compare() on 500 * 1,000,000
 * pairs of rand() values and returns the sum of the results.
 *
 * argc and args are unused.
 */
int main(int argc, char** args){
    i32 n,i;
    i32 a,b;
    /* Must start at 0: the loop only ever does avg+=..., and reading an
       uninitialized automatic variable is undefined behavior. */
    i32 avg=0;
    srand(0); //fixed seed
    for (i=0;i<500;i++){
        for (n=0;n<1e6;n++){
            a=rand();
            b=rand();
            avg+=compare(a,b);
        }
    }
    return avg;
}
Output asm is:
...
mov r15d, -1
...
.LBB1_2: # Parent Loop BB1_1 Depth=1
# => This Inner Loop Header: Depth=2
call rand
mov r12d, eax
call rand
mov ecx, 1
cmp r12d, eax
jg .LBB1_4
# BB#3: # in Loop: Header=BB1_2 Depth=2
mov ecx, 0
cmovl ecx, r15d
.LBB1_4: # %compare.exit
# in Loop: Header=BB1_2 Depth=2
add ebx, ecx
...
I expected (all jmps removed in the inner loop):
mov r15d, -1
mov r13d, 1 # HAND CODED
call rand
mov r12d, eax
call rand
xor ecx,ecx # HAND CODED
cmp r12d, eax
cmovl ecx, r15d # HAND CODED
cmovg ecx, r13d # HAND CODED
add ebx, ecx
Performance difference (1s) seems to be negligible (on a VM under VirtualBox):
LLVM generated asm: 12.53s
handcoded asm: 11.53s
diff: 1s, in 500 millions iterations
Question 3)
Are my performance measures correct? Here's the makefile and the full handcoded.compare.s
makefile:
CC=clang -mllvm --x86-asm-syntax=intel
all:
$(CC) -S -O3 compare.c
$(CC) compare.s -o compare.test
$(CC) handcoded.compare.s -o handcoded.compare.test
echo `time ./compare.test`
echo `time ./handcoded.compare.test`
echo `time ./compare.test`
echo `time ./handcoded.compare.test`
hand coded (fixed) asm:
.text
.file "handcoded.compare.c"
.globl compare
.align 16, 0x90
.type compare,@function
compare: # @compare
.cfi_startproc
# BB#0:
mov eax, 1
cmp edi, esi
jg .LBB0_2
# BB#1:
xor ecx, ecx
cmp edi, esi
mov eax, -1
cmovge eax, ecx
.LBB0_2:
ret
.Ltmp0:
.size compare, .Ltmp0-compare
.cfi_endproc
.globl main
.align 16, 0x90
.type main,@function
main: # @main
.cfi_startproc
# BB#0:
push rbp
.Ltmp1:
.cfi_def_cfa_offset 16
push r15
.Ltmp2:
.cfi_def_cfa_offset 24
push r14
.Ltmp3:
.cfi_def_cfa_offset 32
push r12
.Ltmp4:
.cfi_def_cfa_offset 40
push rbx
.Ltmp5:
.cfi_def_cfa_offset 48
.Ltmp6:
.cfi_offset rbx, -48
.Ltmp7:
.cfi_offset r12, -40
.Ltmp8:
.cfi_offset r14, -32
.Ltmp9:
.cfi_offset r15, -24
.Ltmp10:
.cfi_offset rbp, -16
xor r14d, r14d
xor edi, edi
call srand
mov r15d, -1
mov r13d, 1 # HAND CODED
# implicit-def: EBX
.align 16, 0x90
.LBB1_1: # %.preheader
# =>This Loop Header: Depth=1
# Child Loop BB1_2 Depth 2
mov ebp, 1000000
.align 16, 0x90
.LBB1_2: # Parent Loop BB1_1 Depth=1
# => This Inner Loop Header: Depth=2
call rand
mov r12d, eax
call rand
xor ecx,ecx #hand coded
cmp r12d, eax
cmovl ecx, r15d #hand coded
cmovg ecx, r13d #hand coded
add ebx, ecx
.LBB1_3:
dec ebp
jne .LBB1_2
# BB#5: # in Loop: Header=BB1_1 Depth=1
inc r14d
cmp r14d, 500
jne .LBB1_1
# BB#6:
mov eax, ebx
pop rbx
pop r12
pop r14
pop r15
pop rbp
ret
.Ltmp11:
.size main, .Ltmp11-main
.cfi_endproc
.ident "Debian clang version 3.5.0-1~exp1 (trunk) (based on LLVM 3.5.0)"
.section ".note.GNU-stack","",@progbits
Question 1: LLVM IR is machine independent. Some machines might not even have a carry flag, or even a zero flag or sign flag. The return value is i1 which suffices to indicate TRUE or FALSE. You can set the comparison condition like 'eq' and then check the result to see if the two operands are equal or not, etc.
Question 2: LLVM IR does not care about optimization initially. The main goal is to generate a Static Single Assignment (SSA) based representation of instructions. Optimization happens in later passes, of which some are machine independent and some are machine dependent. Your br2 idea will assume that the machine will support those 3 flags, which might be a wrong assumption.
Question 3: I am not sure what you are trying to do here. Can you explain more?