Assembly how to return from a function call when a condition is met - function

I'm learning assembly and have just learned how to create a function: I create a label, use BL to branch to the label, then use BX LR to return from the function call.
As I understand it, BL stores the return address in LR (the address of the instruction following the BL, i.e. 4 bytes past the BL itself), and the PC then moves to the address of the label. The PC advances through the code until it reaches BX LR, at which point the PC is set to the address held in LR.
My question is if I have the following pseudo code:
func initArr()
for(i = 0; i < max; i++)
arr[i] = i
return;
How can I translate this to assembly? I know BX LR returns from the function, but I want to return as soon as I find that i >= max. Would this work?:
@ Fill the word array A with A[i] = i for i = 0 .. MAX-1, then exit (Linux, ARM/A32 mode).
        .equ    MAX, 10                 @ element count; defined before its first use

        .text
        .global _start
_start:
        LDR     R1, =A                  @ R1 = address of A[0]
        MOV     R2, #0                  @ R2 = i = 0
        BL      _initArr                @ LR = address of the instruction after this BL
_end:
        MOV     R0, #0                  @ exit status is passed in R0
        MOV     R7, #1                  @ syscall number 1 = exit
        SWI     0

@ _initArr: store i into successive words starting at R1 until i reaches MAX.
@ In:     R1 = address of A[i], R2 = i
@ Out:    R1 = address just past the last word written, R2 >= MAX
@ Clobb:  flags
_initArr:
        CMP     R2, #MAX
        BXGE    LR                      @ conditional return: BX takes a condition suffix like any
                                        @ other instruction; B itself only accepts a label
        STR     R2, [R1]                @ A[i] = i
        ADD     R1, R1, #4              @ advance to the next 32-bit element
        ADD     R2, R2, #1              @ i++
        B       _initArr

        .data
        .balign 4
A:      .rept   MAX
        .word   0                       @ one 32-bit word per element, matching STR and the +4 stride
        .endr
To be more specific:
Do conditional branches paired with LR work in the same manner as BX LR?
(Optional) Is the layout of my code correct?

Related

arm cortex-m33 (trustzone, silabs efm32pg22) - assembler hardfaults accessing GPIO or almost any peripherals areas, any hint?

I am lost with this code, which tries to configure the Silicon Labs EFM32PG22 on bare metal. I am using their dev kit, accessed through the on-board J-Link from SEGGER Embedded Studio (a great, fast IDE). I have a blink "hello world" example in C working from their Simplicity Studio, but I was trying to achieve the same thing I did easily in pure assembler on the Microchip PIC32CM MC00 or SAMD21G17D, where only the clocks and startup were configured through the GUI in MPLAB X. Here I tried the SEGGER IDE, where there is NO easy way to configure startup and clocks, or I haven't found it yet. At the hardware level, the registers of these Cortex beasts differ by manufacturer; in C/C++ there is some (not cheap) unification through CMSIS, but I only want to know the minimum needed to get raw GPIO working after clock/startup. The SEGGER project is a generic Cortex-M project for the specific EFM32PG22, i.e. a Cortex-M33 with TrustZone security. I probably don't know what is locked or switched off, or which state the MCU is in (privileged or non-privileged); there are two sets of register mappings, but nothing works. As soon as I try to store to, or even load from, the GPIO configuration registers (or the SMU registers, to query something), a HardFault exception is thrown. All of this is done with the SEGGER IDE debugger over the on-board J-Link. What am I doing wrong, and what is missing here?
in C, I have only this code:
/* Assembly routine implemented in blink.s. */
extern void blink(void);
/* Program entry: hands control straight to the assembly routine.
   blink() ends in an unconditional branch back to its loop, so it does not return. */
int main ( void )
{
blink();
}
In blink.s I have this:
;#https://github.com/hubmartin/ARM-cortex-M-bare-metal-assembler-examples/blob/master/02%20-%20Bare%20metal%20blinking%20LED/main.S
;#https://sites.google.com/site/hubmartin/arm/arm-cortex-bare-metal-assembly/02---arm-cortex-bare-metal-assembly-blinking-led
;#https://mecrisp-stellaris-folkdoc.sourceforge.io/projects/blink-f0disco-gdbtui/doc/readme.html
;#https://microcontrollerslab.com/use-gpio-pins-tm4c123g-tiva-launchpad/
;#!!! ENABLE GPIO CLOCK SOURCE ON EFM32 !!!
;#https://community.silabs.com/s/share/a5U1M000000knsWUAQ/hello-world-part-2-create-firmware-to-blink-the-led?language=en_US
;#EFM32 GPIO
;#https://www.silabs.com/documents/public/application-notes/an0012-efm32-gpio.pdf
;# ARM thumb2 ISA
;#https://www.engr.scu.edu/~dlewis/book3/docs/ARM_and_Thumb-2_Instruction_Set.pdf
;#https://sciencezero.4hv.org/index.php?title=ARM:_Cortex-M3_Thumb-2_instruction_set
;#!!! https://stackoverflow.com/questions/48561243/gnu-arm-assembler-changes-orr-into-movw
;#segger assembler
;#https://studio.segger.com/segger/UM20006_Assembler.pdf
;#https://www.segger.com/doc/UM20006_Assembler.html
;#!!! unfortunatelly, we dont know here yet how to include ASM SFR defines, nor for MPLAB ARM (Harmony) !!!
;##include <xc.h>
;##include "definitions.h"
// blink.s (first attempt): bare-metal LED blink that tries to set up clocks and GPIO on its own.
// The surrounding description reports that this version hard-faults on the peripheral accesses.
.cpu cortex-m33
.thumb
.text
.section .text.startup.main,"ax",%progbits
.balign 2
.p2align 2,,3
.global blink
//.arch armv8-m.base
.arch armv6-m
.syntax unified
.code 16
.thumb_func
.fpu softvfp
.type blink, %function
// Peripheral base addresses, entered manually.
.equ SYSCFG_BASE_ADDRESS, 0x50078000
.equ SMU_BASE_ADDRESS, 0x54008000
//.equ SMU_BASE_ADDRESS, 0x5400C000
.equ CMU_BASE_ADDRESS, 0x50008000
.equ GPIO_BASE_ADDRESS, 0x5003C000 // differs from both the Infineon and the Microchip Cortex devices
.equ DELAY, 40000
// Vector table
.word 0x20001000 // Vector #0 - initial stack pointer (0x20000000 is the RAM address and 0x1000 is a 4 kB size; the stack grows downwards)
.word blink // Vector #1 - reset vector, where the code begins
// Vectors #2..#n - SysTick and the other interrupts are not used yet,
// so they are not defined and the code starts right here
blink:
LDR r0, =(SYSCFG_BASE_ADDRESS + 0x200) // r0 = address of SYSCFG_CTRL
LDR r1, =0 // 0 = disable address-fault exceptions (value meant for SYSCFG_CTRL)
ldr r1, [r0] // NOTE(review): this LOADS [r0] into r1 and discards the 0 above; a store (str) was presumably intended
LDR r0, =(CMU_BASE_ADDRESS) // CMU: CMU_SYSCLKCTRL (PCLKPRESC + CLKSEL)
LDR r1, =0b10000000001 // FSRCO 20MHz + PCLK = HCLK/2 = 10MHz
STR r1, [r0, 0x70] // write r1 to [CMU base + 0x70]
LDR r0, =(CMU_BASE_ADDRESS) // CMU: CMU_CLKEN0
LDR r1, [r0, 0x64] // read the current value at [CMU base + 0x64]
LDR r2, =(1 << 25) // GPIO CLK EN
orrs r1, r2 // set the bit; the author notes that plain `orr` was rejected here and only the flag-setting `orrs` assembled
STR r1, [r0, 0x64] // write the updated value back
LDR r1, [r0, 0x68] // same read-modify-write on [CMU base + 0x68]
LDR r2, =(1 << 14) // SMU CLK EN
orrs r1, r2 // set the bit (orrs for the same reason as above)
STR r1, [r0, 0x68] // write the updated value back
//LDR r0, =(SMU_BASE_ADDRESS) // SMU SMU_LOCK
//LDR r1, =11325013 // SMU UNLOCK CODE
//STR r1, [r0, 0x08] // would write r1 to [SMU base + 0x08]
ldr r0, =(SMU_BASE_ADDRESS) // SMU: probe a few registers by reading them - per the author these loads hard-fault as well
ldr r1, [r0, 0x04]
ldr r1, [r0, 0x20]
ldr r1, [r0, 0x40]
//LDR r0, =(GPIO_BASE_ADDRESS + 0x300) // GPIO UNLOCK
//LDR r1, =0xA534
//STR r1, [r0] // would write r1 to [r0]
// The GPIO setup below was carried over from the SAMD version; per the author, GPIO still cannot be enabled this way.
// Enable PORTA pin 4 as output
LDR r0, =(GPIO_BASE_ADDRESS) // r0 = GPIO base
LDR r1, =0b00000000000001000000000000000000
STR r1, [r0, 0x04] // write r1 to [GPIO base + 0x04]
LDR R2, =1 // r2 = 1: step added to the counter in both delay loops
loop:
// Write high to pin PA04
LDR r0, =GPIO_BASE_ADDRESS // r0 = GPIO base (r0 doubles as the delay counter, so it is reloaded each time)
LDR r1, =0b10000 // PORT_PA04
STR r1, [r0, 0x10] // write r1 to [GPIO base + 0x10]
// Busy-wait: count r0 from 0 up to DELAY in steps of r2
LDR R0, =0
LDR R1, =DELAY
loop0:
ADD R0, R2
cmp R0, R1
bne loop0
// Write low to PA04
LDR r0, =GPIO_BASE_ADDRESS // r0 = GPIO base again
LDR r1, =0b00000
STR r1, [r0, 0x10] // write r1 to [GPIO base + 0x10]
// Busy-wait again, same as above
LDR R0, =0
LDR R1, =DELAY
loop1:
ADD R0, R2
cmp R0, R1
bne loop1
b loop // repeat forever; blink never returns
UPDATE: I tried it again in Simplicity Studio, placing the blink() call after the pregenerated system init:
// Assembly routine implemented in blink.s.
extern void blink(void);
int main(void)
{
// Initialize Silicon Labs device, system, service(s) and protocol stack(s).
// Note that if the kernel is present, processing task(s) will be created by
// this call.
sl_system_init();
// Enter the assembly blink loop only after the generated system init has run.
blink();
}
with this code in blink.s — and this way it works and blinks:
// blink(): toggles pin PA04 forever with a busy-wait delay between edges; never returns.
// Called from main() after sl_system_init(), so clocks and power are expected to be set up already.
// Uses r0-r2 as scratch registers and does not touch the stack.
.cpu cortex-m33
.thumb
.text
.section .text.startup.main,"ax",%progbits
.balign 2
.p2align 2,,3
.global blink
//.arch armv8-m.base
.arch armv6-m
.syntax unified
.code 16
.thumb_func
.fpu softvfp
.type blink, %function
.equ GPIO_BASE_ADDRESS, 0x5003C000 // GPIO register block; differs from the Infineon and Microchip Cortex devices
.equ GPIO_MODE_OFFSET, 0x04 // register written once to make pin 4 an output
.equ GPIO_OUT_OFFSET, 0x10 // register written to drive the pin high or low
.equ PA04_MASK, 0b10000 // PORT_PA04
.equ DELAY, 400000 // iterations of each busy-wait loop
// Vector table words carried over from the bare-metal version
.word 0x20001000 // Vector #0 - initial stack pointer (0x20000000 is the RAM address and 0x1000 is a 4 kB size; the stack grows downwards)
.word blink // Vector #1 - reset vector
blink:
        // Enable PORTA pin 4 as output
        LDR     r0, =GPIO_BASE_ADDRESS
        LDR     r1, =0b00000000000001000000000000000000
        STR     r1, [r0, #GPIO_MODE_OFFSET]
        // FIX: r2 is the step added in both delay loops. It was never initialised in this
        // version, so the delay depended on whatever the caller left in r2 (and never ended
        // when r2 was 0).
        LDR     r2, =1
loop:
        // Write high to pin PA04
        LDR     r0, =GPIO_BASE_ADDRESS // reloaded each time because r0 doubles as the delay counter
        LDR     r1, =PA04_MASK
        STR     r1, [r0, #GPIO_OUT_OFFSET]
        // Busy-wait: count r0 from 0 up to DELAY in steps of r2
        LDR     r0, =0
        LDR     r1, =DELAY
loop0:
        ADD     r0, r2
        CMP     r0, r1
        BNE     loop0
        // Write low to PA04
        LDR     r0, =GPIO_BASE_ADDRESS
        LDR     r1, =0
        STR     r1, [r0, #GPIO_OUT_OFFSET]
        // Busy-wait again, same length as above
        LDR     r0, =0
        LDR     r1, =DELAY
loop1:
        ADD     r0, r2
        CMP     r0, r1
        BNE     loop1
        B       loop // repeat forever
So now I am curious: what is missing in the pure assembly code to bring the Cortex-M33 into some "easy" state, simply ignoring TrustZone, so that it can be used much like, say, a plain Cortex-M3?
Can anybody help? I have been digging deep into this datasheet/reference manual, but no luck so far:
https://www.silabs.com/documents/public/reference-manuals/efm32pg22-rm.pdf
UPDATE AGAIN: I will try to figure it out. Traversing the system-init C code makes it clear what is going on; there are also some chip errata workarounds, but I never touched the DCDC while initializing, and this may be the culprit:
/* Platform initialisation from the pregenerated Simplicity Studio startup code:
   calls each of the init steps below, in this order. */
void sl_platform_init(void)
{
CHIP_Init();
sl_device_init_nvic();
sl_board_preinit();
sl_device_init_dcdc(); /* DCDC init - the step the hand-written assembly never performed */
sl_device_init_hfxo();
sl_device_init_lfxo();
sl_device_init_clocks();
sl_device_init_emu();
sl_board_init();
}
OK, so manufacturer-specific code generation for MCU startup really IS an important and useful thing. MCUs from different manufacturers differ so much at the register level (even though they are all Cortex-M based) that it is not worth configuring them manually in assembly if enough flash is available, and it mostly is. So far I have had no luck doing this properly for specific parts with the "generic" ARM/Cortex IDEs (SEGGER/Keil/IAR), so using the manufacturer-specific IDE to configure startup clocks and peripherals (mostly graphically) IS CRUCIAL, or at least it is by far the easiest way (I know, quite an expensive observation after all the assembly attempts). After that, it is easy to write even a pure-assembly "blink" hello-world test called as an extern C function. You may ask why I am still considering assembly when there are at least the CMSIS "platform abstraction layer" C headers on ARM. (No, they do not help with abstraction, as the devices are still very different; you only get register symbol #defines, typedefs and enums that make things easier to do in C.) But I am trying to compare C-compiled code with handwritten assembly for a specific purpose that needs a deliberately optimized algorithm written from scratch, and it is often easier to think about and design it directly in assembly than to rely on C compiler optimisations, which are described in very complex terms. Each compiler has its own LONG document on how its optimisations work, and at this level C is simply too abstract and too much of a moving target, all the more so if you try to write something for several different MCU architectures (think ARM Cortex-M, PIC32/MIPS, or even PIC16/18 + PIC24, AVR, MSP430...). The general algorithm, by contrast, can be described in a shared pseudo-assembly that stays as close to the hardware as possible, without knowing all the optimization quirks of each architecture's C compiler(s) - and there are often several different C compilers per architecture, too.
So it is possible to compare C-compiler-generated code with handwritten assembly, and I have already tried such an assembly blink on MANY VERY different architectures. In each case I used the manufacturer-specific IDE to generate the startup code in C, using all the GUI configuration and code generation down to an empty C project that always compiles - with very different code sizes resulting from the generated startups, of course. Most advanced MCUs are really very complex, mostly in their clock configuration and pin-function configuration, and then in the various peripheral devices too. Similarities exist only within a single manufacturer, and only to some extent: MCUs from one manufacturer often share a similar approach. So the final solution is to have the startup generated and then switch to assembly immediately; this is feasible. Where flash is small, it is also possible to optimize the startup code itself, but that mostly matters on the smallest 8-bit parts, where startup is quite easy anyway or the generated code is small as well.

Two functions/subroutines in ARM assembly language

I am stuck with an exercise of ARM.
The following program should calculate the result of 2((x-1)^2 + 1) but there is a mistake in the program that leads it into an infinite loop.
I think that I still don't understand completely subroutines and for this reason I am not seeing where the mistake is.
@ Exercise program: meant to compute 2*((x-1)^2 + 1) for x = r0 and then exit.
_start:
mov r0, #4 @ x = 4
bl g @ lr = address of the mov below (the return address into _start)
mov r7, #1 @ r7 = 1 selects the exit system call
swi #0
@ f: r0 = r0*r0 + 1. Makes no further calls, so returning through lr is safe here.
f:
mul r1, r0, r0
add r0, r1, #1
mov pc, lr
@ g: r0 = 2 * f(r0 - 1).
g:
sub r0, r0, #1
bl f @ overwrites lr with the address of the add below; the return address into _start is lost
add r0, r0, r0
mov pc, lr @ lr still points at the add above, so this jumps back there instead of to _start (the infinite loop)
The infinite loop starts in subroutine g: in the line of mov pc, lr and instead of returning to _start it goes to the previous line add r0, r0, r0 and then again to the last line of subroutine g:.
So I guess that the problem is the last line of subroutine g: but I can't find the way to return to _start without using mov pc, lr. I mean, this should be the command used when we have a branch with link.
Also, in this case r0 = 4, so the result of the program should be 20.
This is because you don't save lr on the stack before calling f, so the original return address is lost. If you only have one level of subroutine calls, using lr without saving it is fine, but if you have more than one, you need to preserve the previous value of lr.
For example, when compiling this C example using Compiler Explorer with ARM gcc 4.56.4 (Linux), and options -mthumb -O0,
void f()
{
}
void g()
{
f();
}
void start()
{
g();
}
The generated code will be:
f():
push {r7, lr}
add r7, sp, #0
mov sp, r7
pop {r7, pc}
g():
push {r7, lr}
add r7, sp, #0
bl f()
mov sp, r7
pop {r7, pc}
start():
push {r7, lr}
add r7, sp, #0
bl g()
mov sp, r7
pop {r7, pc}
If you were running this on bare metal, not under Linux, you'd need your stack pointer to be initialized to a correct value.
Assuming you are running from RAM on a bare-metal system/simulator, you could setup a minimal stack of 128 bytes:
.text
.balign 8
_start:
adr r0, . + 128 // set top of stack at _start + 128
mov sp, r0
...
But it looks like you're writing a Linux executable that exits with a swi/r7=1 exit system call. So don't do that, it would make your program crash when it tries to write to the stack.

Function call with more than 4 registers ARM assembly

I am trying to pass r0-r5 into the function check. However only the registers r0-r3 are copied by reference. In my main function i have this code.
push {lr} @ save the return address; the bl below overwrites lr
mov r0, #1
mov r1, #2
mov r2, #3
mov r3, #4
mov r4, #5 @ NOTE(review): per the EABI only r0-r3 carry arguments; the fifth and later words are passed on the stack, so check never receives r4/r5 as parameters
mov r5, #6
bl check
pop {lr} @ restore the saved return address
bx lr
Inside my check function i have this code. This is in a separate file also not sure if that matters
m: .asciz "%d, %d ~ (%d, %d, %d)
...
push {lr} @ save the return address; the bl below overwrites lr
ldr r0, =m @ replaces the first incoming argument with the address of the format string
bl printf @ printf takes the caller's r1-r3 as its first three values and reads the remaining two from the stack
pop {lr}
bx lr
The output for this is 2, 3 ~ (4, 33772, 1994545180). I am trying to learn assembly, so can you please explain the answer? From some googling I know I need to use the stack, but I am not sure how to use it and would like to learn how. Thanks in advance.
you could just try it and see
void check ( unsigned int, unsigned int, unsigned int, unsigned int, unsigned int );
void call_check ( void )
{
check(1,2,3,4,5);
}
arm-linux-gnueabi-gcc -c -O2 check.c -o check.o
arm-linux-gnueabi-objdump -D check.o
00000000 <call_check>:
0: e52de004 push {lr} ; (str lr, [sp, #-4]!)
4: e3a03005 mov r3, #5
8: e24dd00c sub sp, sp, #12
c: e58d3000 str r3, [sp]
10: e3a00001 mov r0, #1
14: e3a01002 mov r1, #2
18: e3a02003 mov r2, #3
1c: e3a03004 mov r3, #4
20: ebfffffe bl 0 <check>
24: e28dd00c add sp, sp, #12
28: e8bd8000 ldmfd sp!, {pc}
Now, of course, this could be hand-optimized and still work just fine. Maybe the additional 12-byte adjustment of the stack pointer is there to keep the stack aligned (the pushed lr plus 12 bytes makes 16 bytes in total, which preserves the 8-byte/64-bit stack alignment the EABI requires)? I don't know. Other than that, you can see that you naturally need to save the link register, since you are calling another function. r0-r3 are obvious, and then, per the EABI, the first thing on the stack is the fifth word's worth of parameters.
Likewise, for your check function you can simply let the compiler get you started. If you look at your code, r0 comes in as your first parameter, and then you trash it by changing it to the first parameter for printf. You need to pass six parameters to printf, so you have to move them over by one: the first parameter to check is the second parameter to printf, the second to check is the third to printf, and so on. The code has to do that shift (and two of the parameters now go on the stack).

Creating a function in assembly language (TASM)

I wanted to print the first 20 numbers using loop.
Printing the first nine numbers is absolutely fine as the hexadecimal and decimal codes are the same, but from the 10th number I had to convert each number into its appropriate code and then convert it and store it to string and eventually display it
That is,
If (NUMBER > 9)
ADD 6D
;10d = 0ah --(+6)--> 16d = 10h
IF NUMBER IS > 19
ADD 12D
;20d = 14h --(+12)--> 32d = 20h
Then rotating and shifting each number to get the desired output number, that is,
DAA # let al = 74h = 0111.0100
XOR AH,AH # ah = 0 (Just in case it wasn't)
# ax = 0000.0000.0111.0100
ROR AX,4 # ax = 0100.0000.0000.0111 = 4007h
SHR AH,4 # ax = 0000.0100.0000.0111 = 0407h
ADD AX,3030h # ax = 0011.0100.0011.0111 = 3437h = ASCII "74" (Reversed due to little endian)
And then storing the result in to the string and displaying it, that is,
MOV BX,OFFSET Result ;Let Result is an empty string
MOV byte ptr[BX],5 ;Size of the string
MOV byte ptr[BX+4],'$' ;String terminator
MOV byte ptr[BX+3],AH ;storing number
MOV byte ptr[BX+2],AL
MOV DX,BX
ADD DX,02 ;Displaying the result
MOV AH,09H ;Interrupt 21 service to display string
INT 21H
And here is the complete code with proper commenting,
; Prints the numbers 0..20 through DOS INT 21h.
;   L1: single digits 0..9, one character at a time (AH=02h).
;   L2: 10..19, converted to two ASCII digits and printed as a '$'-terminated string (AH=09h).
;   L3: the final value 20, same conversion with a different adjustment (12 instead of 6).
; Registers: CX = LOOP counter, DX = current number.
; Result must be a buffer of at least 5 bytes (bytes 0, 2, 3 and 4 are written).
; NOTE: ROR/SHR with an immediate count of 4 need a 186 or later target; the single-bit
;       forms used in L3 also assemble for a plain 8086.
        MOV CX,20               ;Number of iterations
        MOV DX,0                ;First value of the sequence
L1:
        PUSH DX                 ;Keep the number while DL carries the character
        ADD DX,30H              ;30H is ASCII '0', 31H is '1' and so on
        MOV AH,02H              ;INT 21h service 02h prints the character in DL
        INT 21H
        POP DX
        ADD DX,1
        CMP DX,09               ;If the number is > 9 (i.e. 0AH) go to L2
        JA L2
        LOOP L1
L2:
        MOV AX,DX
        CMP AX,14H              ;If the number is 14H (20) jump to L3
        JE L3                   ;Taken before DX is pushed, so the stack stays balanced
        PUSH DX                 ;Keep the number across the string output below
        ADD AX,6D               ;10..19: +6 turns the binary value into packed BCD
        XOR AH,AH               ;Clear AH
        ROR AX,4                ;AL = tens digit, upper nibble of AH = ones digit
        SHR AH,4                ;AH = ones digit
        ADD AX,3030h            ;Both digits to ASCII (plain ADD: must not depend on CF from SHR)
        MOV BX,OFFSET Result
        MOV byte ptr[BX],5      ;Size of the string
        MOV byte ptr[BX+4],'$'  ;String terminator
        MOV byte ptr[BX+3],AH   ;Ones digit
        MOV byte ptr[BX+2],AL   ;Tens digit
        MOV DX,BX
        ADD DX,02               ;DX = address of the first digit
        MOV AH,09H              ;INT 21h service 09h prints the '$'-terminated string at DS:DX
        INT 21H
        POP DX
        ADD DX,1
        LOOP L2
;If the number is equal to 20 come here, ->
; Every step is repeated here just to change 6D to 12D
L3:
        ADD AX,12D              ;20: +12 turns 14H into packed BCD 20H
        XOR AH,AH
        ROR AX,1
        ROR AX,1
        ROR AX,1
        ROR AX,1
        SHR AH,1
        SHR AH,1
        SHR AH,1
        SHR AH,1
        ADD AX,3030h            ;Both digits to ASCII (plain ADD, as above)
        MOV BX,OFFSET Result
        MOV byte ptr[BX],5
        MOV byte ptr[BX+4],'$'
        MOV byte ptr[BX+3],AH
        MOV byte ptr[BX+2],AL
        MOV DX,BX
        ADD DX,02
        MOV AH,09H
        INT 21H
Is there any proper way to do it, creating a function and using if/else (jumps) to get the desired output rather than repeating the code again and again?
PSEUDO CODE:
VAR = 6
IF Number is > 9
ADD AX,VAR
Else IF Number is > 19
ADD AX,(VAR*2)
ELSE IF NUMBER is > 29
ADD AX,(VAR*3)
So you just want to print 0 ... 20 as ASCII characters? It looks like you understand that the numerals are identified as 0x30 ... 0x39 for '0' to '9', so you could use integer division to generate the character for the tens digit:
I usually work with C but conversion to assembler shouldn't be too complicated since these are all fundamental operations and there are no function calls.
int i_value = 29;
int i_tens = i_value/10; //Integer division! 29/10 = 2, save for later use
char c_tens = '0' + i_tens;
char c_ones = '0' + i_value-(10*i_tens); // Subtract N*10 from value
The output will be c_tens = 0x32, c_ones = 0x39. You should be able to wrap this inside of a loop pretty easily using a pair of registers.
Pseudocode
regA <- num_iterations //For example, 20
regB <- 0 //Initialize counter register
LOOP:
//Do conversion for the current iteration.
//Manipulate bytes for output as necessary.
regB <- regB +1
branch not equal regA, regB LOOP
The following code counts from 0 up to 99 (ax contains the ASCII number):
; count: steps BX through 0..99 and leaves the two ASCII digits of each value in AX
;        (AH = tens digit, AL = ones digit) at the point marked "Print here".
; Clobbers AX, BX, CX and the flags.
count proc
        xor     bx, bx          ; bx = current number, starting at 0
        mov     cx, 100         ; LOOP repeats the body this many times
next_number:
        mov     ax, bx
        aam                     ; binary -> unpacked BCD: AH = AL / 10, AL = AL mod 10
        or      ax, 3030h       ; unpacked BCD -> ASCII (each digit is 0..9, so OR sets the same bits as XOR)
        ; Print here (ax now contains the number in ASCII representation)
        inc     bx              ; next number
        loop    next_number
        ret
count endp

R0 is violated after function returns

I am implementing a UART queue on the S3C44B0X (ARM7TDMI): the UART0 ISR enqueues each character while the main loop dequeues it. However, when dequeuing, the value returned (in R0) may not be the one taken from the queue; I found that R0 is corrupted after returning from the dequeue function (the input is a continuous stream of 'v', and test() is called in the main loop):
Any help would be appreciated.
/*
 * Remove the character at index gwCliQTail from the CLI queue and return it.
 * Returns 0 when the queue is empty (gwCliQSize == 0), so a queued 0 byte cannot be
 * told apart from "nothing available".
 * NOTE(review): per the description the queue is filled from the UART0 ISR; the tail and
 * size updates below are separate steps with no visible interrupt masking - confirm
 * that an interrupt arriving between them is harmless.
 */
CHAR cliDequeue(void)
{
CHAR bTmpCh;
if (gwCliQSize == 0)
{
return 0;
}
bTmpCh = gabCliQ[gwCliQTail]; /* char is enqueued in the Q in ISR */
gwCliQTail++;
gwCliQTail %= MAX_CLI_QUEUE_LEN; /* wrap the tail index around the end of the buffer */
ASSERT(gwCliQSize > 0);
gwCliQSize--;
ASSERT(bTmpCh == 'v'); /* will not assert */
//uartPutChar(bTmpCh);
return bTmpCh;
}
/* Takes one character from the CLI queue and, if one was available, passes it to uartPutChar(). */
void test(void)
{
CHAR bTestCh;
bTestCh = cliDequeue();
if (bTestCh != 0) /* 0 is what cliDequeue() returns for an empty queue */
{
ASSERT(bTestCh == 'v'); /* assert here ! */
uartPutChar(bTestCh);
}
}
We don't have enough information or context to answer definitively. It would also be helpful if you posted the corresponding assembly code so that we could see how and when things are moved in and out of R0. Regardless, a few things spring to mind immediately from your posted C code.
(0) Are the variables shared between interrupts & the main loop declared as volatile?
(1)
In CliDequeue, you're accessing an array which is shared with an ISR. It appears to be a single reader / single writer construct, so that isn't automatically bad, but your housekeeping isn't airtight.
For example, one invariant you must be sure to satisfy is that the queue size & tail pointer are in sync. Yet, unless this routine is called with interrupts disabled, your tail pointer & queue size aren't adjusted as a single transaction.
(2)
Furthermore, I'd guess that gwCliQSize is also adjusted in the interrupt (incremented in the ISR, decremented in the application). That is another race condition. To perform gwCliQSize--, behind the scenes you are probably reading from memory into a register, decrementing the register, then writing it back. What happens if you read 5 from memory into R1, then an interrupt fires and increments the value in memory to 6, then you exit the ISR and the register is decremented and written back (with a value of 4)? The ISR's increment is lost.
(3)
Lastly, it's possible (although not too likely) that bTmpCh or bTestCh are stored on the stack, and that your stack is getting corrupted / slammed by another task / interrupt / etc. So when your assert fails, you're thinking it's R0 that is corrupted, but really it could be that the value moved into R0 before return, or the value moved out of R0 into a stack variable, is getting clobbered.
I've nattered on enough. There are other possibilities but from what you've posted (and not posted) it's impossible to say for sure.
P.S. If you've used a debugger and it's really & literally R0's value that is getting corrupted, not just the value of the character in the queue, that points to a problem in your scheduler / context switcher / ISR pre- or post-amble etc...
here is the assembly code:
for the test():
0x00001308 E92D4010 STMDB R13!,{R4,R14}
37: bTestCh = cliDequeue();
38:
0x0000130C EB000207 BL cliDequeue(0x00001B30)
0x00001310 E1A04000 MOV R4,R0
39: if (bTestCh != 0)
40: {
0x00001314 E3540000 CMP R4,#pTest(0x00000000)
0x00001318 0A000007 BEQ 0x0000133C
41: ASSERT(bTestCh == 'v');
0x0000131C E1A00000 NOP
0x00001320 E3540076 CMP R4,#0x00000076
0x00001324 0A000001 BEQ 0x00001330
0x00001328 E1A00000 NOP
0x0000132C EAFFFFFE B 0x0000132C
0x00001330 E1A00000 NOP
42: uartPutChar(bTestCh);
43: }
0x00001334 E1A00004 MOV R0,R4
0x00001338 EB00014A BL uartPutChar(0x00001868)
44: }
45:
46: int main(void)
0x0000133C E8BD4010 LDMIA R13!,{R4,R14}
0x00001340 E12FFF1E BX R14
for the cliDequeu(), BTW, gwCliQSize is defined as
UINT32 volatile gwCliQSize;
0x00001B30 E59F00D4 LDR R0,[PC,#0x00D4]
0x00001B34 E5900000 LDR R0,[R0]
0x00001B38 E3500000 CMP R0,#pTest(0x00000000)
0x00001B3C 1A000001 BNE 0x00001B48
78: return 0;
79: }
80:
81: bTmpCh = gabCliQ[gwCliQTail];
82: gwCliQTail++;
83: gwCliQTail %= MAX_CLI_QUEUE_LEN;
84: ASSERT(gwCliQSize > 0);
85: gwCliQSize--;
86:
87: //chCheck(bTmpCh);
88: ASSERT(bTmpCh == 'v'); /* will not assert */
89: //uartPutChar(bTmpCh);
90:
91: return bTmpCh;
0x00001B40 E3A00000 MOV R0,#pTest(0x00000000)
92: }
93:
94:
95: void cliQInit(void)
0x00001B44 E12FFF1E BX R14
81: bTmpCh = gabCliQ[gwCliQTail];
0x00001B48 E59F00C0 LDR R0,[PC,#0x00C0]
0x00001B4C E59F20C4 LDR R2,[PC,#0x00C4]
0x00001B50 E5922000 LDR R2,[R2]
0x00001B54 E7D01002 LDRB R1,[R0,R2]
82: gwCliQTail++;
0x00001B58 E59F00B8 LDR R0,[PC,#0x00B8]
0x00001B5C E5900000 LDR R0,[R0]
0x00001B60 E2800001 ADD R0,R0,#0x00000001
0x00001B64 E59F20AC LDR R2,[PC,#0x00AC]
0x00001B68 E5820000 STR R0,[R2]
83: gwCliQTail %= MAX_CLI_QUEUE_LEN;
0x00001B6C E2820000 ADD R0,R2,#pTest(0x00000000)
0x00001B70 E5900000 LDR R0,[R0]
0x00001B74 E20000FF AND R0,R0,#0x000000FF
0x00001B78 E5820000 STR R0,[R2]
84: ASSERT(gwCliQSize > 0);
0x00001B7C E1A00000 NOP
0x00001B80 E59F0084 LDR R0,[PC,#0x0084]
0x00001B84 E5900000 LDR R0,[R0]
0x00001B88 E3500000 CMP R0,#pTest(0x00000000)
0x00001B8C 1A000001 BNE 0x00001B98
0x00001B90 E1A00000 NOP
0x00001B94 EAFFFFFE B 0x00001B94
0x00001B98 E1A00000 NOP
85: gwCliQSize--;
86:
87: //chCheck(bTmpCh);
0x00001B9C E59F0068 LDR R0,[PC,#0x0068]
0x00001BA0 E5900000 LDR R0,[R0]
0x00001BA4 E2400001 SUB R0,R0,#0x00000001
0x00001BA8 E59F205C LDR R2,[PC,#0x005C]
0x00001BAC E5820000 STR R0,[R2]
88: ASSERT(bTmpCh == 'v'); /* will not assert */
89: //uartPutChar(bTmpCh);
90:
0x00001BB0 E1A00000 NOP
0x00001BB4 E3510076 CMP R1,#0x00000076
0x00001BB8 0A000001 BEQ 0x00001BC4
0x00001BBC E1A00000 NOP
0x00001BC0 EAFFFFFE B 0x00001BC0
0x00001BC4 E1A00000 NOP
91: return bTmpCh;
92: }
93:
94:
95: void cliQInit(void)
0x00001BC8 E1A00001 MOV R0,R1
0x00001BCC EAFFFFDC B 0x00001B44
for cliEnqueue:
/*
 * Store bC in the CLI queue at index gwCliQHeader, advance the index and bump the size.
 * Asserts if the queue already holds MAX_CLI_QUEUE_LEN entries.
 * Per the description, this is called from the UART0 ISR.
 */
void cliEnqueue(CHAR bC)
{
if (gwCliQSize == MAX_CLI_QUEUE_LEN)
{
ASSERT(0);
}
gabCliQ[gwCliQHeader] = bC;
gwCliQHeader++;
gwCliQHeader %= MAX_CLI_QUEUE_LEN; /* wrap the head index around the end of the buffer */
gwCliQSize++;
}
assembly:
0x00001A5C E59F11AC LDR R1,[PC,#0x01AC]
0x00001A60 E59F21AC LDR R2,[PC,#0x01AC]
0x00001A64 E5922000 LDR R2,[R2]
0x00001A68 E7C10002 STRB R0,[R1,R2]
25: gwCliQHeader++;
0x00001A6C E59F11A0 LDR R1,[PC,#0x01A0]
0x00001A70 E5911000 LDR R1,[R1]
0x00001A74 E2811001 ADD R1,R1,#0x00000001
0x00001A78 E59F2194 LDR R2,[PC,#0x0194]
0x00001A7C E5821000 STR R1,[R2]
26: gwCliQHeader %= MAX_CLI_QUEUE_LEN;
0x00001A80 E2821000 ADD R1,R2,#pTest(0x00000000)
0x00001A84 E5911000 LDR R1,[R1]
0x00001A88 E20110FF AND R1,R1,#0x000000FF
0x00001A8C E5821000 STR R1,[R2]
27: gwCliQSize++;
0x00001A90 E59F1174 LDR R1,[PC,#0x0174]
0x00001A94 E5911000 LDR R1,[R1]
0x00001A98 E2811001 ADD R1,R1,#0x00000001
0x00001A9C E59F2168 LDR R2,[PC,#0x0168]
0x00001AA0 E5821000 STR R1,[R2]
28: }
29:
30:
31: static void chCheck(CHAR cTmpChar)
32: {
0x00001AA4 E12FFF1E BX R14
Regarding (0) and (1): gwCliQSize and the array are shared between the ISR and the main loop.
(2) gwCliQSize is defined as volatile.
(3) From the assembly, bTestCh is R4 (moved from R0) and bTmpCh is R1 (moved to R0 before the B).
(4) I am using the J-Link, but the problem still exists without the J-Link (running from flash).