SUB X5, X9, X5
SUB X5, X12, X12
f_align:
- ADD $-1, X5
+ SUB $1, X5
MOVB 0(X11), X14
MOVB X14, 0(X10)
ADD $1, X10
MOV X21, 56(X10)
ADD $64, X10
ADD $64, X11
- ADD $-64, X12
+ SUB $64, X12
BGE X12, X9, f_loop64
BEQZ X12, done
MOV X17, 24(X10)
ADD $32, X10
ADD $32, X11
- ADD $-32, X12
+ SUB $32, X12
BGE X12, X9, f_loop32
BEQZ X12, done
MOV X15, 8(X10)
ADD $16, X10
ADD $16, X11
- ADD $-16, X12
+ SUB $16, X12
BGE X12, X9, f_loop16
BEQZ X12, done
MOV X14, 0(X10)
ADD $8, X10
ADD $8, X11
- ADD $-8, X12
+ SUB $8, X12
BGE X12, X9, f_loop8
BEQZ X12, done
JMP f_loop4_check
MOVB X21, 7(X10)
ADD $8, X10
ADD $8, X11
- ADD $-8, X12
+ SUB $8, X12
BGE X12, X9, f_loop8_unaligned
f_loop4_check:
MOVB X17, 3(X10)
ADD $4, X10
ADD $4, X11
- ADD $-4, X12
+ SUB $4, X12
BGE X12, X9, f_loop4
f_loop1:
MOVB X14, 0(X10)
ADD $1, X10
ADD $1, X11
- ADD $-1, X12
+ SUB $1, X12
JMP f_loop1
backward:
// Move one byte at a time until we reach 8 byte alignment.
SUB X5, X12, X12
b_align:
- ADD $-1, X5
- ADD $-1, X10
- ADD $-1, X11
+ SUB $1, X5
+ SUB $1, X10
+ SUB $1, X11
MOVB 0(X11), X14
MOVB X14, 0(X10)
BNEZ X5, b_align
MOV $64, X9
BLT X12, X9, b_loop32_check
b_loop64:
- ADD $-64, X10
- ADD $-64, X11
+ SUB $64, X10
+ SUB $64, X11
MOV 0(X11), X14
MOV 8(X11), X15
MOV 16(X11), X16
MOV X19, 40(X10)
MOV X20, 48(X10)
MOV X21, 56(X10)
- ADD $-64, X12
+ SUB $64, X12
BGE X12, X9, b_loop64
BEQZ X12, done
MOV $32, X9
BLT X12, X9, b_loop16_check
b_loop32:
- ADD $-32, X10
- ADD $-32, X11
+ SUB $32, X10
+ SUB $32, X11
MOV 0(X11), X14
MOV 8(X11), X15
MOV 16(X11), X16
MOV X15, 8(X10)
MOV X16, 16(X10)
MOV X17, 24(X10)
- ADD $-32, X12
+ SUB $32, X12
BGE X12, X9, b_loop32
BEQZ X12, done
MOV $16, X9
BLT X12, X9, b_loop8_check
b_loop16:
- ADD $-16, X10
- ADD $-16, X11
+ SUB $16, X10
+ SUB $16, X11
MOV 0(X11), X14
MOV 8(X11), X15
MOV X14, 0(X10)
MOV X15, 8(X10)
- ADD $-16, X12
+ SUB $16, X12
BGE X12, X9, b_loop16
BEQZ X12, done
MOV $8, X9
BLT X12, X9, b_loop4_check
b_loop8:
- ADD $-8, X10
- ADD $-8, X11
+ SUB $8, X10
+ SUB $8, X11
MOV 0(X11), X14
MOV X14, 0(X10)
- ADD $-8, X12
+ SUB $8, X12
BGE X12, X9, b_loop8
BEQZ X12, done
JMP b_loop4_check
MOV $8, X9
BLT X12, X9, b_loop4_check
b_loop8_unaligned:
- ADD $-8, X10
- ADD $-8, X11
+ SUB $8, X10
+ SUB $8, X11
MOVB 0(X11), X14
MOVB 1(X11), X15
MOVB 2(X11), X16
MOVB X19, 5(X10)
MOVB X20, 6(X10)
MOVB X21, 7(X10)
- ADD $-8, X12
+ SUB $8, X12
BGE X12, X9, b_loop8_unaligned
b_loop4_check:
MOV $4, X9
BLT X12, X9, b_loop1
b_loop4:
- ADD $-4, X10
- ADD $-4, X11
+ SUB $4, X10
+ SUB $4, X11
MOVB 0(X11), X14
MOVB 1(X11), X15
MOVB 2(X11), X16
MOVB X15, 1(X10)
MOVB X16, 2(X10)
MOVB X17, 3(X10)
- ADD $-4, X12
+ SUB $4, X12
BGE X12, X9, b_loop4
b_loop1:
BEQZ X12, done
- ADD $-1, X10
- ADD $-1, X11
+ SUB $1, X10
+ SUB $1, X11
MOVB 0(X11), X14
MOVB X14, 0(X10)
- ADD $-1, X12
+ SUB $1, X12
JMP b_loop1
done:
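
The rewrite above is behavior-preserving: "ADD $-n, reg" and "SUB $n, reg" both subtract n from the register, so every changed line still steps the destination pointer (X10), the source pointer (X11) or the remaining byte count (X12) by the same amount; only the spelling changes. For orientation, the sketch below traces the copy strategy the labels follow: a forward pass (f_loop64 .. f_loop1) that consumes progressively smaller blocks while walking up through the buffers, and a mirrored backward pass (b_loop64 .. b_loop1) that walks down so overlapping moves with the destination above the source stay correct. This is a plain Go illustration under those assumptions, with invented names (forwardCopy, backwardCopy) and the copy builtin standing in for the unrolled loads and stores; it is not the runtime routine itself, which additionally performs the byte-at-a-time alignment prologues shown at f_align and b_align.

package main

import "fmt"

// forwardCopy mirrors f_loop64 .. f_loop1: copy in shrinking block sizes,
// decrementing the remaining count (the assembly's X12) by the block size.
// "ADD $-64, X12" and "SUB $64, X12" both express the "n -= 64" step here.
func forwardCopy(dst, src []byte) {
	n := len(src)
	i := 0
	for _, block := range []int{64, 32, 16, 8, 4, 1} {
		for n >= block {
			copy(dst[i:i+block], src[i:i+block])
			i += block
			n -= block
		}
	}
}

// backwardCopy mirrors b_loop64 .. b_loop1: the same shrinking blocks, but
// starting from the end of the buffers and moving the indices down first,
// which keeps overlapping moves with dst above src correct.
func backwardCopy(dst, src []byte) {
	n := len(src)
	for _, block := range []int{64, 32, 16, 8, 4, 1} {
		for n >= block {
			copy(dst[n-block:n], src[n-block:n])
			n -= block
		}
	}
}

func main() {
	src := []byte("0123456789abcdefghij")
	dst := make([]byte, len(src))
	forwardCopy(dst, src)
	fmt.Println(string(dst)) // 0123456789abcdefghij

	// Overlapping move within one buffer: shift the contents right by 3.
	buf := []byte("0123456789abcdefghij")
	backwardCopy(buf[3:], buf[:len(buf)-3])
	fmt.Println(string(buf)) // 0120123456789abcdefg
}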