// void runtime·memmove(void*, void*, uintptr)
TEXT runtime·memmove<ABIInternal>(SB),NOSPLIT,$-0-24
- // A0 = to
- // A1 = from
- // A2 = n
- ADD A1, A2, T5
+ // X10 = to
+ // X11 = from
+ // X12 = n
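+ // Nothing to do if to and from are the same or n is zero.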
+ BEQ X10, X11, done
+ BEQZ X12, done
// If the destination is ahead of the source, start at the end of the
// buffer and go backward.
- BLTU A1, A0, b
+ BGTU X10, X11, backward
- // If less than eight bytes, do one byte at a time.
- SLTU $8, A2, T3
- BNE T3, ZERO, f_outcheck
+ // If less than 8 bytes, do single byte copies.
+ MOV $8, X9
+ BLT X12, X9, f_loop4_check
- // Do one byte at a time until from is eight-aligned.
- JMP f_aligncheck
+ // Check alignment - if alignment differs we have to do one byte at a time.
+ AND $7, X10, X5
+ AND $7, X11, X6
+ BNE X5, X6, f_loop8_unaligned_check
+ BEQZ X5, f_loop_check
+
+ // Move one byte at a time until we reach 8 byte alignment.
+ SUB X5, X9, X5 // X5 = 8 - (to & 7): bytes needed to reach 8 byte alignment
+ SUB X5, X12, X12 // take the alignment bytes off the remaining count
f_align:
- MOVB (A1), T3
- MOVB T3, (A0)
- ADD $1, A0
- ADD $1, A1
-f_aligncheck:
- AND $7, A1, T3
- BNE T3, ZERO, f_align
-
- // Do eight bytes at a time as long as there is room.
- ADD $-7, T5, T6
- JMP f_wordscheck
-f_words:
- MOV (A1), T3
- MOV T3, (A0)
- ADD $8, A0
- ADD $8, A1
-f_wordscheck:
- SLTU T6, A1, T3
- BNE T3, ZERO, f_words
-
- // Finish off the remaining partial word.
- JMP f_outcheck
-f_out:
- MOVB (A1), T3
- MOVB T3, (A0)
- ADD $1, A0
- ADD $1, A1
-f_outcheck:
- BNE A1, T5, f_out
+ SUB $1, X5
+ MOVB 0(X11), X14
+ MOVB X14, 0(X10)
+ ADD $1, X10
+ ADD $1, X11
+ BNEZ X5, f_align
- RET
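+ // Forward copy with both pointers 8 byte aligned: dispatch on the
+ // remaining count to the largest copy loop (64, 32, 16 or 8 bytes
+ // per iteration) that still fits.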
+f_loop_check:
+ MOV $16, X9
+ BLT X12, X9, f_loop8_check
+ MOV $32, X9
+ BLT X12, X9, f_loop16_check
+ MOV $64, X9
+ BLT X12, X9, f_loop32_check
+f_loop64:
+ MOV 0(X11), X14
+ MOV 8(X11), X15
+ MOV 16(X11), X16
+ MOV 24(X11), X17
+ MOV 32(X11), X18
+ MOV 40(X11), X19
+ MOV 48(X11), X20
+ MOV 56(X11), X21
+ MOV X14, 0(X10)
+ MOV X15, 8(X10)
+ MOV X16, 16(X10)
+ MOV X17, 24(X10)
+ MOV X18, 32(X10)
+ MOV X19, 40(X10)
+ MOV X20, 48(X10)
+ MOV X21, 56(X10)
+ ADD $64, X10
+ ADD $64, X11
+ SUB $64, X12
+ BGE X12, X9, f_loop64
+ BEQZ X12, done
+
+f_loop32_check:
+ MOV $32, X9
+ BLT X12, X9, f_loop16_check
+f_loop32:
+ MOV 0(X11), X14
+ MOV 8(X11), X15
+ MOV 16(X11), X16
+ MOV 24(X11), X17
+ MOV X14, 0(X10)
+ MOV X15, 8(X10)
+ MOV X16, 16(X10)
+ MOV X17, 24(X10)
+ ADD $32, X10
+ ADD $32, X11
+ SUB $32, X12
+ BGE X12, X9, f_loop32
+ BEQZ X12, done
+
+f_loop16_check:
+ MOV $16, X9
+ BLT X12, X9, f_loop8_check
+f_loop16:
+ MOV 0(X11), X14
+ MOV 8(X11), X15
+ MOV X14, 0(X10)
+ MOV X15, 8(X10)
+ ADD $16, X10
+ ADD $16, X11
+ SUB $16, X12
+ BGE X12, X9, f_loop16
+ BEQZ X12, done
+
+f_loop8_check:
+ MOV $8, X9
+ BLT X12, X9, f_loop4_check
+f_loop8:
+ MOV 0(X11), X14
+ MOV X14, 0(X10)
+ ADD $8, X10
+ ADD $8, X11
+ SUB $8, X12
+ BGE X12, X9, f_loop8
+ BEQZ X12, done
+ JMP f_loop4_check
+
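+ // Forward copy where to and from have different alignment: move 8
+ // bytes per iteration using byte loads and stores, avoiding unaligned
+ // word accesses.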
+f_loop8_unaligned_check:
+ MOV $8, X9
+ BLT X12, X9, f_loop4_check
+f_loop8_unaligned:
+ MOVB 0(X11), X14
+ MOVB 1(X11), X15
+ MOVB 2(X11), X16
+ MOVB 3(X11), X17
+ MOVB 4(X11), X18
+ MOVB 5(X11), X19
+ MOVB 6(X11), X20
+ MOVB 7(X11), X21
+ MOVB X14, 0(X10)
+ MOVB X15, 1(X10)
+ MOVB X16, 2(X10)
+ MOVB X17, 3(X10)
+ MOVB X18, 4(X10)
+ MOVB X19, 5(X10)
+ MOVB X20, 6(X10)
+ MOVB X21, 7(X10)
+ ADD $8, X10
+ ADD $8, X11
+ SUB $8, X12
+ BGE X12, X9, f_loop8_unaligned
+
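+ // Copy any remaining tail, 4 bytes at a time and then 1 byte at a time.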
+f_loop4_check:
+ MOV $4, X9
+ BLT X12, X9, f_loop1
+f_loop4:
+ MOVB 0(X11), X14
+ MOVB 1(X11), X15
+ MOVB 2(X11), X16
+ MOVB 3(X11), X17
+ MOVB X14, 0(X10)
+ MOVB X15, 1(X10)
+ MOVB X16, 2(X10)
+ MOVB X17, 3(X10)
+ ADD $4, X10
+ ADD $4, X11
+ SUB $4, X12
+ BGE X12, X9, f_loop4
+
+f_loop1:
+ BEQZ X12, done
+ MOVB 0(X11), X14
+ MOVB X14, 0(X10)
+ ADD $1, X10
+ ADD $1, X11
+ SUB $1, X12
+ JMP f_loop1
+
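+ // to is ahead of from, so the buffers may overlap: move both pointers
+ // to the end and copy backward, reading each source byte before it
+ // can be overwritten.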
+backward:
+ ADD X10, X12, X10
+ ADD X11, X12, X11
-b:
- ADD A0, A2, T4
- // If less than eight bytes, do one byte at a time.
- SLTU $8, A2, T3
- BNE T3, ZERO, b_outcheck
+ // If less than 8 bytes, do single byte copies.
+ MOV $8, X9
+ BLT X12, X9, b_loop4_check
- // Do one byte at a time until from+n is eight-aligned.
- JMP b_aligncheck
+ // Check alignment - if alignment differs we have to do one byte at a time.
+ AND $7, X10, X5
+ AND $7, X11, X6
+ BNE X5, X6, b_loop8_unaligned_check
+ BEQZ X5, b_loop_check
+
+ // Move one byte at a time until we reach 8 byte alignment.
+ SUB X5, X12, X12 // X5 = (to + n) & 7: bytes to copy back down to alignment
b_align:
- ADD $-1, T4
- ADD $-1, T5
- MOVB (T5), T3
- MOVB T3, (T4)
-b_aligncheck:
- AND $7, T5, T3
- BNE T3, ZERO, b_align
-
- // Do eight bytes at a time as long as there is room.
- ADD $7, A1, T6
- JMP b_wordscheck
-b_words:
- ADD $-8, T4
- ADD $-8, T5
- MOV (T5), T3
- MOV T3, (T4)
-b_wordscheck:
- SLTU T5, T6, T3
- BNE T3, ZERO, b_words
-
- // Finish off the remaining partial word.
- JMP b_outcheck
-b_out:
- ADD $-1, T4
- ADD $-1, T5
- MOVB (T5), T3
- MOVB T3, (T4)
-b_outcheck:
- BNE T5, A1, b_out
+ SUB $1, X5
+ SUB $1, X10
+ SUB $1, X11
+ MOVB 0(X11), X14
+ MOVB X14, 0(X10)
+ BNEZ X5, b_align
+
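+ // Backward copy with both pointers 8 byte aligned: dispatch on the
+ // remaining count to the largest copy loop, mirroring the forward case.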
+b_loop_check:
+ MOV $16, X9
+ BLT X12, X9, b_loop8_check
+ MOV $32, X9
+ BLT X12, X9, b_loop16_check
+ MOV $64, X9
+ BLT X12, X9, b_loop32_check
+b_loop64:
+ SUB $64, X10
+ SUB $64, X11
+ MOV 0(X11), X14
+ MOV 8(X11), X15
+ MOV 16(X11), X16
+ MOV 24(X11), X17
+ MOV 32(X11), X18
+ MOV 40(X11), X19
+ MOV 48(X11), X20
+ MOV 56(X11), X21
+ MOV X14, 0(X10)
+ MOV X15, 8(X10)
+ MOV X16, 16(X10)
+ MOV X17, 24(X10)
+ MOV X18, 32(X10)
+ MOV X19, 40(X10)
+ MOV X20, 48(X10)
+ MOV X21, 56(X10)
+ SUB $64, X12
+ BGE X12, X9, b_loop64
+ BEQZ X12, done
+
+b_loop32_check:
+ MOV $32, X9
+ BLT X12, X9, b_loop16_check
+b_loop32:
+ SUB $32, X10
+ SUB $32, X11
+ MOV 0(X11), X14
+ MOV 8(X11), X15
+ MOV 16(X11), X16
+ MOV 24(X11), X17
+ MOV X14, 0(X10)
+ MOV X15, 8(X10)
+ MOV X16, 16(X10)
+ MOV X17, 24(X10)
+ SUB $32, X12
+ BGE X12, X9, b_loop32
+ BEQZ X12, done
+
+b_loop16_check:
+ MOV $16, X9
+ BLT X12, X9, b_loop8_check
+b_loop16:
+ SUB $16, X10
+ SUB $16, X11
+ MOV 0(X11), X14
+ MOV 8(X11), X15
+ MOV X14, 0(X10)
+ MOV X15, 8(X10)
+ SUB $16, X12
+ BGE X12, X9, b_loop16
+ BEQZ X12, done
+
+b_loop8_check:
+ MOV $8, X9
+ BLT X12, X9, b_loop4_check
+b_loop8:
+ SUB $8, X10
+ SUB $8, X11
+ MOV 0(X11), X14
+ MOV X14, 0(X10)
+ SUB $8, X12
+ BGE X12, X9, b_loop8
+ BEQZ X12, done
+ JMP b_loop4_check
+
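+ // Backward copy where to and from have different alignment: move 8
+ // bytes per iteration using byte loads and stores.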
+b_loop8_unaligned_check:
+ MOV $8, X9
+ BLT X12, X9, b_loop4_check
+b_loop8_unaligned:
+ SUB $8, X10
+ SUB $8, X11
+ MOVB 0(X11), X14
+ MOVB 1(X11), X15
+ MOVB 2(X11), X16
+ MOVB 3(X11), X17
+ MOVB 4(X11), X18
+ MOVB 5(X11), X19
+ MOVB 6(X11), X20
+ MOVB 7(X11), X21
+ MOVB X14, 0(X10)
+ MOVB X15, 1(X10)
+ MOVB X16, 2(X10)
+ MOVB X17, 3(X10)
+ MOVB X18, 4(X10)
+ MOVB X19, 5(X10)
+ MOVB X20, 6(X10)
+ MOVB X21, 7(X10)
+ SUB $8, X12
+ BGE X12, X9, b_loop8_unaligned
+
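+ // Copy any remaining tail, 4 bytes at a time and then 1 byte at a time.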
+b_loop4_check:
+ MOV $4, X9
+ BLT X12, X9, b_loop1
+b_loop4:
+ SUB $4, X10
+ SUB $4, X11
+ MOVB 0(X11), X14
+ MOVB 1(X11), X15
+ MOVB 2(X11), X16
+ MOVB 3(X11), X17
+ MOVB X14, 0(X10)
+ MOVB X15, 1(X10)
+ MOVB X16, 2(X10)
+ MOVB X17, 3(X10)
+ SUB $4, X12
+ BGE X12, X9, b_loop4
+
+b_loop1:
+ BEQZ X12, done
+ SUB $1, X10
+ SUB $1, X11
+ MOVB 0(X11), X14
+ MOVB X14, 0(X10)
+ SUB $1, X12
+ JMP b_loop1
+done:
RET