src/runtime/memclr_amd64.s

   1 // Copyright 2014 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 //go:build !plan9
   6
   7 #include "go_asm.h"
   8 #include "textflag.h"
   9 #include "asm_amd64.h"
  10
  11 // See memclrNoHeapPointers Go doc for important implementation constraints.
  12
  13 // func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
  14 // ABIInternal for performance.
     //
     // Stores zero to the n bytes starting at ptr.
     //
     // n <= 256 is handled by straight-line code at the _0 ... _129through256
     // labels. Each of those cases writes one group of stores anchored at ptr and
     // one anchored at ptr+n; the two groups may overlap, so no loop is needed.
     //
     // n > 256 takes one of four bulk paths:
     //   - loop_avx2_huge: non-temporal AVX2 stores, when AVX2 is available and
     //     n >= 32 MiB;
     //   - loop_erms: REP STOSQ, when ERMS is available and n >= 2048 (and the
     //     case above does not apply);
     //   - loop_avx2: 128 bytes per iteration, when AVX2 is available;
     //   - loop: 256 bytes per iteration with SSE stores, otherwise.
     //
     // AX is zeroed and used as the scalar zero source. X15 is stored from but
     // never written here: the code relies on it already being zero on entry
     // (Go's ABIInternal keeps X15 as a fixed zero register on amd64).
  15 TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
  16         // AX = ptr
  17         // BX = n
  18         MOVQ    AX, DI  // DI = ptr
  19         XORQ    AX, AX  // AX = 0: source for the MOVB/MOVW/MOVL/MOVQ stores and for STOSQ
  20
  21         // MOVOU seems always faster than REP STOSQ when Enhanced REP STOSQ is not available.
  22 tail:
             // Dispatch on the byte count in BX. Every call passes through here first.
             // The SSE loop (BX < 256 left) and loop_erms (BX < 8 left) jump back here
             // to finish their remainder.
  23         // BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
  24         TESTQ   BX, BX
  25         JEQ     _0
  26         CMPQ    BX, $2
  27         JBE     _1or2
  28         CMPQ    BX, $4
  29         JBE     _3or4
  30         CMPQ    BX, $8
  31         JB      _5through7
  32         JE      _8      // reuses the flags of the CMPQ above; exactly 8 has its own case
  33         CMPQ    BX, $16
  34         JBE     _9through16
  35         CMPQ    BX, $32
  36         JBE     _17through32
  37         CMPQ    BX, $64
  38         JBE     _33through64
  39         CMPQ    BX, $128
  40         JBE     _65through128
  41         CMPQ    BX, $256
  42         JBE     _129through256
  43
             // BX > 256 from here on: choose a bulk strategy.
  44         CMPB    internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
  45         JNE     skip_erms
  46
  47         // If the size is less than 2kb, do not use ERMS as it has a big start-up cost.
  48         // Table 3-4. Relative Performance of Memcpy() Using ERMSB Vs. 128-bit AVX
  49         // in the Intel Optimization Guide shows better performance for ERMSB starting
  50         // from 2KB. Benchmarks show the similar threshold for REP STOS vs AVX.
  51         CMPQ    BX, $2048
  52         JAE     loop_preheader_erms
  53
  54 skip_erms:
     // When hasAVX2 is defined, the run-time AVX2 check and the SSE loop below are
     // compiled out and control falls straight through to loop_preheader_avx2.
     // NOTE(review): hasAVX2 is presumably defined by asm_amd64.h for builds that
     // guarantee AVX2; confirm against that header.
  55 #ifndef hasAVX2
  56         CMPB    internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
  57         JE      loop_preheader_avx2
  58         // TODO: for really big clears, use MOVNTDQ, even without AVX2.
  59
  60 loop:
             // SSE path: sixteen unaligned 16-byte stores clear 256 bytes per iteration.
  61         MOVOU   X15, 0(DI)
  62         MOVOU   X15, 16(DI)
  63         MOVOU   X15, 32(DI)
  64         MOVOU   X15, 48(DI)
  65         MOVOU   X15, 64(DI)
  66         MOVOU   X15, 80(DI)
  67         MOVOU   X15, 96(DI)
  68         MOVOU   X15, 112(DI)
  69         MOVOU   X15, 128(DI)
  70         MOVOU   X15, 144(DI)
  71         MOVOU   X15, 160(DI)
  72         MOVOU   X15, 176(DI)
  73         MOVOU   X15, 192(DI)
  74         MOVOU   X15, 208(DI)
  75         MOVOU   X15, 224(DI)
  76         MOVOU   X15, 240(DI)
  77         SUBQ    $256, BX
  78         ADDQ    $256, DI
  79         CMPQ    BX, $256
  80         JAE     loop
  81         JMP     tail    // fewer than 256 bytes left (possibly 0); tail finishes them
  82 #endif
  83
  84 loop_preheader_avx2:
  85         VPXOR X0, X0, X0        // zero X0; the VEX encoding also clears the upper half of Y0
  86         // For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
  87         // For larger sizes it is always faster, even on dual Xeons with 30M cache.
  88         // TODO take into account actual LLC size. E.g. glibc uses LLC size/2.
  89         CMPQ    BX, $0x2000000  // 0x2000000 bytes = 32 MiB
  90         JAE     loop_preheader_avx2_huge
  91
  92 loop_avx2:
             // AVX2 path: four unaligned 32-byte stores clear 128 bytes per iteration.
  93         VMOVDQU Y0, 0(DI)
  94         VMOVDQU Y0, 32(DI)
  95         VMOVDQU Y0, 64(DI)
  96         VMOVDQU Y0, 96(DI)
  97         SUBQ    $128, BX
  98         ADDQ    $128, DI
  99         CMPQ    BX, $128
 100         JAE     loop_avx2
             // Fewer than 128 bytes remain. Clear the last 128 bytes of the buffer,
             // addressed backwards from its end at DI+BX; these stores overlap bytes the
             // loop already cleared. They stay inside the buffer because this path is
             // only reached with more than 256 bytes to clear.
 101         VMOVDQU  Y0, -32(DI)(BX*1)
 102         VMOVDQU  Y0, -64(DI)(BX*1)
 103         VMOVDQU  Y0, -96(DI)(BX*1)
 104         VMOVDQU  Y0, -128(DI)(BX*1)
 105         VZEROUPPER      // zero the upper YMM halves before returning to non-AVX code
 106         RET
 107
 108 loop_preheader_erms:
     // Reached with ERMS available and BX >= 2048. Without AVX2 go straight to
     // REP STOSQ; with AVX2, very large clears use non-temporal stores instead.
 109 #ifndef hasAVX2
 110         CMPB    internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
 111         JNE     loop_erms
 112 #endif
 113
 114         VPXOR X0, X0, X0        // Y0 = 0, needed only if we branch to the huge path
 115         // At this point both ERMS and AVX2 are supported. While REP STOS can use a no-RFO
 116         // write protocol, ERMS could show the same or slower performance compared to
 117         // Non-Temporal Stores when the size is bigger than LLC depending on hardware.
 118         CMPQ    BX, $0x2000000  // 0x2000000 bytes = 32 MiB
 119         JAE     loop_preheader_avx2_huge
 120
 121 loop_erms:
 122         // STOSQ is used to guarantee that the whole zeroed pointer-sized word is visible
 123         // for a memory subsystem as the GC requires this.
 124         MOVQ    BX, CX
 125         SHRQ    $3, CX  // CX = n / 8: number of 8-byte words to store
 126         ANDQ    $7, BX  // BX = n % 8: bytes left over for tail
 127         REP;    STOSQ   // store AX (zero) at (DI) CX times, DI += 8 each time; assumes the direction flag is clear
 128         JMP     tail    // clear the remaining 0..7 bytes
 129
 130 loop_preheader_avx2_huge:
 131         // Align to 32 byte boundary
 132         VMOVDQU  Y0, 0(DI)      // clear the first 32 bytes unaligned; covers the bytes skipped below
 133         MOVQ    DI, SI          // SI = original DI
 134         ADDQ    $32, DI
 135         ANDQ    $~31, DI        // DI = next 32-byte boundary strictly above the original DI
 136         SUBQ    DI, SI          // SI = original DI - aligned DI = -(bytes skipped), in -32..-1
 137         ADDQ    SI, BX          // BX -= bytes skipped
 138 loop_avx2_huge:
             // Four non-temporal 32-byte stores clear 128 bytes per iteration. DI is
             // 32-byte aligned on every iteration: aligned above, then advanced by 128.
 139         VMOVNTDQ        Y0, 0(DI)
 140         VMOVNTDQ        Y0, 32(DI)
 141         VMOVNTDQ        Y0, 64(DI)
 142         VMOVNTDQ        Y0, 96(DI)
 143         SUBQ    $128, BX
 144         ADDQ    $128, DI
 145         CMPQ    BX, $128
 146         JAE     loop_avx2_huge
 147         // In the description of MOVNTDQ in [1]
 148         // "... fencing operation implemented with the SFENCE or MFENCE instruction
 149         // should be used in conjunction with MOVNTDQ instructions..."
 150         // [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
 151         SFENCE
             // Fewer than 128 bytes remain: clear the last 128 bytes of the buffer with
             // ordinary unaligned stores addressed backwards from its end at DI+BX.
 152         VMOVDQU  Y0, -32(DI)(BX*1)
 153         VMOVDQU  Y0, -64(DI)(BX*1)
 154         VMOVDQU  Y0, -96(DI)(BX*1)
 155         VMOVDQU  Y0, -128(DI)(BX*1)
 156         VZEROUPPER      // zero the upper YMM halves before returning to non-AVX code
 157         RET
 158
     // Small sizes. Apart from _0 and _8, each case stores one group at the start
     // of the buffer and one group ending exactly at ptr+n (addressed as
     // -k(DI)(BX*1)); the groups overlap whenever n is below the case's maximum.
 159 _1or2:
 160         MOVB    AX, (DI)                // first byte
 161         MOVB    AX, -1(DI)(BX*1)        // last byte (the same byte when n == 1)
 162         RET
 163 _0:
             // Nothing to clear.
 164         RET
 165 _3or4:
 166         MOVW    AX, (DI)                // first 2 bytes
 167         MOVW    AX, -2(DI)(BX*1)        // last 2 bytes
 168         RET
 169 _5through7:
 170         MOVL    AX, (DI)                // first 4 bytes
 171         MOVL    AX, -4(DI)(BX*1)        // last 4 bytes
 172         RET
 173 _8:
 174         // We need a separate case for 8 to make sure we clear pointers atomically.
 175         MOVQ    AX, (DI)
 176         RET
 177 _9through16:
 178         MOVQ    AX, (DI)                // first 8 bytes
 179         MOVQ    AX, -8(DI)(BX*1)        // last 8 bytes
 180         RET
 181 _17through32:
 182         MOVOU   X15, (DI)               // first 16 bytes
 183         MOVOU   X15, -16(DI)(BX*1)      // last 16 bytes
 184         RET
 185 _33through64:
             // First 32 bytes, then last 32 bytes.
 186         MOVOU   X15, (DI)
 187         MOVOU   X15, 16(DI)
 188         MOVOU   X15, -32(DI)(BX*1)
 189         MOVOU   X15, -16(DI)(BX*1)
 190         RET
 191 _65through128:
             // First 64 bytes, then last 64 bytes.
 192         MOVOU   X15, (DI)
 193         MOVOU   X15, 16(DI)
 194         MOVOU   X15, 32(DI)
 195         MOVOU   X15, 48(DI)
 196         MOVOU   X15, -64(DI)(BX*1)
 197         MOVOU   X15, -48(DI)(BX*1)
 198         MOVOU   X15, -32(DI)(BX*1)
 199         MOVOU   X15, -16(DI)(BX*1)
 200         RET
 201 _129through256:
             // First 128 bytes, then last 128 bytes.
 202         MOVOU   X15, (DI)
 203         MOVOU   X15, 16(DI)
 204         MOVOU   X15, 32(DI)
 205         MOVOU   X15, 48(DI)
 206         MOVOU   X15, 64(DI)
 207         MOVOU   X15, 80(DI)
 208         MOVOU   X15, 96(DI)
 209         MOVOU   X15, 112(DI)
 210         MOVOU   X15, -128(DI)(BX*1)
 211         MOVOU   X15, -112(DI)(BX*1)
 212         MOVOU   X15, -96(DI)(BX*1)
 213         MOVOU   X15, -80(DI)(BX*1)
 214         MOVOU   X15, -64(DI)(BX*1)
 215         MOVOU   X15, -48(DI)(BX*1)
 216         MOVOU   X15, -32(DI)(BX*1)
 217         MOVOU   X15, -16(DI)(BX*1)
 218         RET