// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
//
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
// Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
// See memmove Go doc for important implementation constraints.

// func memmove(to, from unsafe.Pointer, n uintptr)
// ABIInternal for performance.
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT, $0-24
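	// Register plan (per the amd64 ABIInternal convention): on entry
	// AX = to, BX = from, CX = n; the code below then works with
	// DI = to, SI = from, BX = n, which matches the addressing modes
	// used throughout this file (e.g. -1(SI)(BX*1)).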
	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
	//
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back. move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
	//
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
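	// For example, with to = from+1 and n = 32, the 17-through-32 case
	// below performs both of its loads before either store, so the 31
	// overlapping bytes are read before they can be clobbered.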
	JBE	move_129through256
	TESTB	$1, runtime·useAVXmemmove(SB)
/*
 * check and set for backwards
 */
	JLS	move_256through2048

	// If REP MOVSB isn't fast, don't use it
	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	// Do 1 byte at a time
	// Do 8 bytes at a time
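	// The quadword path boils down to the following (a sketch; CX holds
	// the REP count, BX keeps the leftover bytes for the tail copy):
	//	MOVQ	BX, CX
	//	SHRQ	$3, CX	// number of 8-byte words to move
	//	ANDQ	$7, BX	// 0..7 trailing bytes, handled afterwards
	//	REP;	MOVSQ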
/*
 * whole thing backwards has
 * adjusted addresses
 */
	MOVB	-1(SI)(BX*1), CX
	MOVB	CX, -1(DI)(BX*1)
	MOVL	-4(SI)(BX*1), CX
	MOVL	CX, -4(DI)(BX*1)
	// We need a separate case for 8 to make sure we write pointers atomically.
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	CX, -8(DI)(BX*1)
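	// A single MOVQ keeps the 8-byte store atomic. Splitting it into two
	// 4-byte stores could let the garbage collector observe a half-written
	// pointer, which is why 8-byte moves never go through a narrower path.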
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X1, -16(DI)(BX*1)
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	// X15 must be zero on return
	JGE	move_256through2048
	// X15 must be zero on return
	// There are two implementations of the move algorithm:
	// the first is for non-overlapping memory regions and uses forward copying;
	// the second is for overlapping regions and uses backward copying.
	// Now CX contains the distance between SRC and DEST.
	// If the distance is less than the region length, the regions overlap.
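	// Worked example: DI = 0x1010, SI = 0x1000, BX = 0x40 gives CX = 0x10,
	// which is below BX, so the regions overlap and the backward copy is
	// taken. When DI < SI the subtraction wraps to a huge unsigned value,
	// so the unsigned compare sends that case to the forward path, where
	// forward copying is always safe.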
	// Non-temporal copy would be better for big sizes.
	JAE	gobble_big_data_fwd
	// Memory layout on the source side
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// Save head into Y4. Save tail into X5..X12.
	// The body copy starts at SI+R11, where R11 = ((DI & -32) + 32) - DI
	// is the distance to the next 32-byte boundary of the destination
	// (see the worked example after this comment block).
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copying of body (128 bytes per iteration)
	// 4. Put the head in its new place
	// 5. Put the tail in its new place
	// For small sizes it is important to keep the processor's pipeline
	// busy, since the cost of copying the unaligned head and tail is
	// comparable with the cost of the main loop, so the code here is
	// somewhat interleaved. A cleaner implementation of the same
	// algorithm, for bigger sizes where the cost of the unaligned parts
	// is negligible, can be found after the gobble_big_data_fwd label.
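	// Worked example for the R11 formula above: with DI = 0x1005,
	// DI & -32 = 0x1000 and (0x1000 + 32) - 0x1005 = 0x1b, so the head is
	// 0x1b bytes and every body store after it lands on a 32-byte boundary.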
	// CX points to the end of the buffer, so we need to reach back a bit;
	// negative offsets from CX are used below.
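	// A sketch of the tail save this comment describes; X5..X12 take the
	// tail's 128 bytes in 16-byte steps, interleaved with the setup below:
	//	MOVOU	-0x80(CX), X5
	//	MOVOU	-0x70(CX), X6
	//	...	// and so on, through -0x10(CX) into X12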
	// Align destination address
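	// The alignment step implied by the R11 formula is (a sketch):
	//	ANDQ	$-32, DI	// round DI down to a 32-byte boundary
	//	ADDQ	$32, DI		// then step past the unaligned head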
	// Continue tail saving.
	// Make R11 the delta between the aligned and unaligned destination addresses.
	// Continue tail saving.
	// Adjust the bytes-to-copy count, now that the unaligned part has been
	// prepared for copying.
	// Continue tail saving.
	// The tail will be stored in place after the main body is copied.
	// Now for the unaligned head.
	// Adjust source address to point past head.
	// Aligned copying of the body happens here.
	// Now we can store the unaligned parts.
	// Forward copying for big regions. It uses non-temporal mov
	// instructions. The details of the algorithm are in the comments of
	// the small-size path above.
	MOVOU	-0x80(SI)(BX*1), X5
	PREFETCHNTA	0x1C0(SI)
	PREFETCHNTA	0x280(SI)
	// Prefetch values were chosen empirically.
	// The approach to prefetch usage follows section 9.5.6 of [1].
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
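	// Each loop iteration copies 0x80 bytes, so 0x1C0 (448) bytes is 3.5
	// iterations ahead and 0x280 (640) bytes is 5 iterations ahead.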
	VMOVNTDQ	Y1, 0x20(DI)
	VMOVNTDQ	Y2, 0x40(DI)
	VMOVNTDQ	Y3, 0x60(DI)
	JA	gobble_mem_fwd_loop
	// Non-temporal stores are weakly ordered and don't follow the normal
	// cache-coherency rules, so an SFENCE is needed here to make the
	// copied data reliably visible in a timely manner.
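	// Concretely, the SFENCE belongs after the loop's last VMOVNTDQ and
	// before the ordinary stores that put the saved head and tail in
	// place, so those stores cannot become visible ahead of the
	// non-temporal body.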
	// Backward copying is about the same as the forward one.
	// First we load the unaligned tail from the beginning of the region.
	// Let's point SI to the end of the region
	// and load the unaligned head into Y4.
	VMOVDQU	-0x20(SI), Y4
	// If there is enough data for non-temporal moves, go to the special loop.
	JA	gobble_big_data_bwd
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	JA	gobble_mem_bwd_loop
	// Let's store the unaligned data.
gobble_big_mem_bwd_loop:
	PREFETCHNTA	-0x1C0(SI)
	PREFETCHNTA	-0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	VMOVNTDQ	Y0, -0x20(DI)
	VMOVNTDQ	Y1, -0x40(DI)
	VMOVNTDQ	Y2, -0x60(DI)
	VMOVNTDQ	Y3, -0x80(DI)
	JA	gobble_big_mem_bwd_loop