1 // Copyright 2014 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
11 // See memclrNoHeapPointers Go doc for important implementation constraints.
13 // func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
14 // ABIInternal for performance.
15 TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
18 MOVQ AX, DI // DI = ptr
21 // MOVOU seems always faster than REP STOSQ when Enhanced REP STOSQ is not available.
23 // BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
44 CMPB internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	// If the size is less than 2KB, do not use ERMS as it has a big start-up cost.
48 // Table 3-4. Relative Performance of Memcpy() Using ERMSB Vs. 128-bit AVX
49 // in the Intel Optimization Guide shows better performance for ERMSB starting
50 // from 2KB. Benchmarks show the similar threshold for REP STOS vs AVX.
52 JAE loop_preheader_erms
56 CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
57 JE loop_preheader_avx2
58 // TODO: for really big clears, use MOVNTDQ, even without AVX2.
86 // For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
87 // For larger sizes it is always faster, even on dual Xeons with 30M cache.
	// TODO take into account actual LLC size. E.g., glibc uses LLC size/2.
90 JAE loop_preheader_avx2_huge
101 VMOVDQU Y0, -32(DI)(BX*1)
102 VMOVDQU Y0, -64(DI)(BX*1)
103 VMOVDQU Y0, -96(DI)(BX*1)
104 VMOVDQU Y0, -128(DI)(BX*1)
110 CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	// At this point both ERMS and AVX2 are supported. While REP STOS can use a no-RFO
	// write protocol, ERMS could show the same or slower performance compared to
	// Non-Temporal Stores when the size is bigger than LLC depending on hardware.
119 JAE loop_preheader_avx2_huge
122 // STOSQ is used to guarantee that the whole zeroed pointer-sized word is visible
123 // for a memory subsystem as the GC requires this.
130 loop_preheader_avx2_huge:
131 // Align to 32 byte boundary
147 // In the description of MOVNTDQ in [1]
148 // "... fencing operation implemented with the SFENCE or MFENCE instruction
149 // should be used in conjunction with MOVNTDQ instructions..."
150 // [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
152 VMOVDQU Y0, -32(DI)(BX*1)
153 VMOVDQU Y0, -64(DI)(BX*1)
154 VMOVDQU Y0, -96(DI)(BX*1)
155 VMOVDQU Y0, -128(DI)(BX*1)
161 MOVB AX, -1(DI)(BX*1)
167 MOVW AX, -2(DI)(BX*1)
171 MOVL AX, -4(DI)(BX*1)
174 // We need a separate case for 8 to make sure we clear pointers atomically.
179 MOVQ AX, -8(DI)(BX*1)
183 MOVOU X15, -16(DI)(BX*1)
188 MOVOU X15, -32(DI)(BX*1)
189 MOVOU X15, -16(DI)(BX*1)
196 MOVOU X15, -64(DI)(BX*1)
197 MOVOU X15, -48(DI)(BX*1)
198 MOVOU X15, -32(DI)(BX*1)
199 MOVOU X15, -16(DI)(BX*1)
210 MOVOU X15, -128(DI)(BX*1)
211 MOVOU X15, -112(DI)(BX*1)
212 MOVOU X15, -96(DI)(BX*1)
213 MOVOU X15, -80(DI)(BX*1)
214 MOVOU X15, -64(DI)(BX*1)
215 MOVOU X15, -48(DI)(BX*1)
216 MOVOU X15, -32(DI)(BX*1)
217 MOVOU X15, -16(DI)(BX*1)