// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
//
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
// Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
// See memmove Go doc for important implementation constraints.

// func memmove(to, from unsafe.Pointer, n uintptr)
// ABIInternal for performance.
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT, $0-24
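	// Register plan (per the amd64 ABIInternal convention): on entry
	// AX = to, BX = from, CX = n; the code below then works with
	// DI = to, SI = from, BX = n, which matches the addressing modes
	// used throughout this file (e.g. -1(SI)(BX*1)).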
	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
	//
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back. move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
	//
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
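	// For example, with to = from+1 and n = 32, the 17-through-32 case
	// below performs both of its loads before either store, so the 31
	// overlapping bytes are read before they can be clobbered.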
	JBE	move_129through256
	TESTB	$1, runtime·useAVXmemmove(SB)
/*
 * check and set for backwards
 */
	JLS	move_256through2048

	// If REP MOVSB isn't fast, don't use it
	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	// Do 1 byte at a time
	// Do 8 bytes at a time
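	// The quadword path boils down to the following (a sketch; CX holds
	// the REP count, BX keeps the leftover bytes for the tail copy):
	//	MOVQ	BX, CX
	//	SHRQ	$3, CX	// number of 8-byte words to move
	//	ANDQ	$7, BX	// 0..7 trailing bytes, handled afterwards
	//	REP;	MOVSQ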
/*
 * whole thing backwards has
 * adjusted addresses
 */
	MOVB	-1(SI)(BX*1), CX
	MOVB	CX, -1(DI)(BX*1)
	MOVL	-4(SI)(BX*1), CX
	MOVL	CX, -4(DI)(BX*1)
	// We need a separate case for 8 to make sure we write pointers atomically.
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	CX, -8(DI)(BX*1)
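	// A single MOVQ keeps the 8-byte store atomic. Splitting it into two
	// 4-byte stores could let the garbage collector observe a half-written
	// pointer, which is why 8-byte moves never go through a narrower path.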
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X1, -16(DI)(BX*1)
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	// X15 must be zero on return
	JGE	move_256through2048
	// X15 must be zero on return
	// There are two implementations of the move algorithm:
	// the first is for non-overlapping memory regions and uses forward copying;
	// the second is for overlapping regions and uses backward copying.
	// Now CX contains the distance between SRC and DEST.
	// If the distance is less than the region length, the regions overlap.
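	// Worked example: DI = 0x1010, SI = 0x1000, BX = 0x40 gives CX = 0x10,
	// which is below BX, so the regions overlap and the backward copy is
	// taken. When DI < SI the subtraction wraps to a huge unsigned value,
	// so the unsigned compare sends that case to the forward path, where
	// forward copying is always safe.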
	// Non-temporal copy would be better for big sizes.
	JAE	gobble_big_data_fwd
	// Memory layout on the source side
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// Save head into Y4. Save tail into X5..X12.
	// The body copy starts at SI+R11, where R11 = ((DI & -32) + 32) - DI
	// is the distance to the next 32-byte boundary of the destination
	// (see the worked example after this comment block).
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copying of body (128 bytes per iteration)
	// 4. Put the head in its new place
	// 5. Put the tail in its new place
	// For small sizes it is important to keep the processor's pipeline
	// busy, since the cost of copying the unaligned head and tail is
	// comparable with the cost of the main loop, so the code here is
	// somewhat interleaved. A cleaner implementation of the same
	// algorithm, for bigger sizes where the cost of the unaligned parts
	// is negligible, can be found after the gobble_big_data_fwd label.
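	// Worked example for the R11 formula above: with DI = 0x1005,
	// DI & -32 = 0x1000 and (0x1000 + 32) - 0x1005 = 0x1b, so the head is
	// 0x1b bytes and every body store after it lands on a 32-byte boundary.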
	// CX points to the end of the buffer, so we need to reach back a bit;
	// negative offsets from CX are used below.
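	// A sketch of the tail save this comment describes; X5..X12 take the
	// tail's 128 bytes in 16-byte steps, interleaved with the setup below:
	//	MOVOU	-0x80(CX), X5
	//	MOVOU	-0x70(CX), X6
	//	...	// and so on, through -0x10(CX) into X12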
	// Align destination address
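	// The alignment step implied by the R11 formula is (a sketch):
	//	ANDQ	$-32, DI	// round DI down to a 32-byte boundary
	//	ADDQ	$32, DI		// then step past the unaligned head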
	// Continue tail saving.
	// Make R11 the delta between the aligned and unaligned destination addresses.
	// Continue tail saving.
	// Adjust the bytes-to-copy count, now that the unaligned part has been
	// prepared for copying.
	// Continue tail saving.
	// The tail will be stored in place after the main body is copied.
	// Now for the unaligned head.
	// Adjust source address to point past head.
	// Aligned copying of the body happens here.
	// Now we can store the unaligned parts.
	// Forward copying for big regions. It uses non-temporal mov
	// instructions. The details of the algorithm are in the comments of
	// the small-size path above.
	MOVOU	-0x80(SI)(BX*1), X5
	PREFETCHNTA	0x1C0(SI)
	PREFETCHNTA	0x280(SI)
	// Prefetch values were chosen empirically.
	// The approach to prefetch usage follows section 9.5.6 of [1].
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
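	// Each loop iteration copies 0x80 bytes, so 0x1C0 (448) bytes is 3.5
	// iterations ahead and 0x280 (640) bytes is 5 iterations ahead.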
	VMOVNTDQ	Y1, 0x20(DI)
	VMOVNTDQ	Y2, 0x40(DI)
	VMOVNTDQ	Y3, 0x60(DI)
	JA	gobble_mem_fwd_loop
	// Non-temporal stores are weakly ordered and don't follow the normal
	// cache-coherency rules, so an SFENCE is needed here to make the
	// copied data reliably visible in a timely manner.
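	// Concretely, the SFENCE belongs after the loop's last VMOVNTDQ and
	// before the ordinary stores that put the saved head and tail in
	// place, so those stores cannot become visible ahead of the
	// non-temporal body.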
	// Backward copying is about the same as the forward one.
	// First we load the unaligned tail from the beginning of the region.
	// Let's point SI to the end of the region
	// and load the unaligned head into Y4.
	VMOVDQU	-0x20(SI), Y4
	// If there is enough data for non-temporal moves, go to the special loop.
	JA	gobble_big_data_bwd
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	JA	gobble_mem_bwd_loop
	// Let's store the unaligned data.
gobble_big_mem_bwd_loop:
	PREFETCHNTA	-0x1C0(SI)
	PREFETCHNTA	-0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	VMOVNTDQ	Y0, -0x20(DI)
	VMOVNTDQ	Y1, -0x40(DI)
	VMOVNTDQ	Y2, -0x60(DI)
	VMOVNTDQ	Y3, -0x80(DI)
	JA	gobble_big_mem_bwd_loop