From: Michael Anthony Knyszek Date: Tue, 31 Oct 2023 04:50:26 +0000 (+0000) Subject: runtime: make alloc headers footers instead X-Git-Tag: go1.22rc1~362 X-Git-Url: http://www.git.cypherpunks.ru/?p=gostls13.git;a=commitdiff_plain;h=1e250a219900651dad27f29eab0877eee4afd5b9 runtime: make alloc headers footers instead The previous CL in this series (CL 437955) adds the allocation headers experiment. However, this experiment puts the headers at the beginning of each allocation, which decreases the default allocator alignment that users can rely upon. Historically, Go's memory allocator has implicitly provided 16-byte alignment (except for sizes where it doesn't make sense, like 8 or 24 bytes), so it's not unlikely that users are depending on it. It also complicates other changes that want higher alignment. For example, the sync/atomic.Uint64Pair proposal would (hypothetically; it's not yet accepted) introduce a type with 16-byte alignment. The malloc fast path will require extra code to consider alignment and will waste memory for any value containing such a type. This change moves the allocation header to the end of the span's allocation slot instead of the beginning. This means worse locality for the GC when scanning, but it's still an overall win. It also means that objects will still have the 16-byte alignment we've provided thus far. This is broken out in a separate change just becauase it ended up that way during development. But I've chosen to leave it this way in case we want to try and move allocation headers to the front of objects again. Below are the benchmark results of this CL series, comparing the performance of this CL with GOEXPERIMENT=allocheaders vs. without this CL series. name old time/op new time/op delta BiogoIgor 12.5s ± 0% 12.4s ± 2% ~ (p=0.079 n=9+10) BiogoKrishna 12.8s ±10% 12.4s ±10% ~ (p=0.182 n=9+10) BleveIndexBatch100 4.54s ± 3% 4.60s ± 3% ~ (p=0.050 n=9+9) EtcdPut 21.1ms ± 2% 21.3ms ± 4% ~ (p=0.669 n=7+10) EtcdSTM 107ms ± 3% 108ms ± 2% ~ (p=0.497 n=9+10) GoBuildKubelet 34.1s ± 3% 33.1s ± 2% -3.08% (p=0.000 n=10+10) GoBuildKubeletLink 7.94s ± 2% 7.95s ± 2% ~ (p=0.631 n=10+10) GoBuildIstioctl 33.2s ± 1% 31.7s ± 0% -4.37% (p=0.000 n=9+9) GoBuildIstioctlLink 8.07s ± 1% 8.05s ± 1% ~ (p=0.356 n=9+10) GoBuildFrontend 12.1s ± 0% 11.5s ± 1% -4.43% (p=0.000 n=8+10) GoBuildFrontendLink 1.20s ± 2% 1.20s ± 2% ~ (p=0.905 n=9+10) GopherLuaKNucleotide 19.9s ± 0% 19.5s ± 1% -1.95% (p=0.000 n=9+10) MarkdownRenderXHTML 194ms ± 5% 194ms ± 2% ~ (p=0.931 n=9+9) Tile38QueryLoad 518µs ± 1% 508µs ± 1% -1.93% (p=0.000 n=9+8) name old average-RSS-bytes new average-RSS-bytes delta BiogoIgor 66.2MB ± 3% 65.6MB ± 1% ~ (p=0.156 n=10+9) BiogoKrishna 4.34GB ± 2% 4.34GB ± 1% ~ (p=0.315 n=10+9) BleveIndexBatch100 189MB ± 3% 186MB ± 3% ~ (p=0.052 n=10+10) EtcdPut 105MB ± 5% 107MB ± 6% ~ (p=0.579 n=10+10) EtcdSTM 92.1MB ± 5% 93.2MB ± 4% ~ (p=0.353 n=10+10) GoBuildKubelet 2.07GB ± 1% 2.07GB ± 1% ~ (p=0.436 n=10+10) GoBuildIstioctl 1.44GB ± 1% 1.46GB ± 1% +0.96% (p=0.001 n=10+10) GoBuildFrontend 522MB ± 1% 512MB ± 2% -1.98% (p=0.000 n=10+10) GopherLuaKNucleotide 37.4MB ± 5% 36.4MB ± 4% -2.53% (p=0.035 n=10+10) MarkdownRenderXHTML 21.2MB ± 1% 20.9MB ± 3% -1.53% (p=0.003 n=8+10) Tile38QueryLoad 6.39GB ± 2% 6.24GB ± 2% -2.40% (p=0.000 n=10+10) name old peak-RSS-bytes new peak-RSS-bytes delta BiogoIgor 88.5MB ± 4% 88.4MB ± 3% ~ (p=0.971 n=10+10) BiogoKrishna 4.48GB ± 0% 4.42GB ± 0% -1.49% (p=0.000 n=10+10) BleveIndexBatch100 268MB ± 3% 265MB ± 4% ~ (p=0.315 n=9+10) EtcdPut 147MB ± 9% 146MB ± 5% ~ (p=0.853 n=10+10) EtcdSTM 119MB ± 6% 120MB ± 5% ~ (p=0.796 n=10+10) GopherLuaKNucleotide 43.1MB ±17% 40.7MB ±12% ~ (p=0.075 n=10+10) MarkdownRenderXHTML 21.2MB ± 1% 21.1MB ± 3% ~ (p=0.511 n=9+10) Tile38QueryLoad 6.65GB ± 4% 6.52GB ± 2% -1.93% (p=0.009 n=10+10) name old peak-VM-bytes new peak-VM-bytes delta BiogoIgor 1.33GB ± 0% 1.33GB ± 0% -0.16% (p=0.000 n=10+10) BiogoKrishna 5.77GB ± 0% 5.69GB ± 0% -1.23% (p=0.000 n=10+10) BleveIndexBatch100 2.62GB ± 0% 2.61GB ± 0% -0.13% (p=0.000 n=7+10) EtcdPut 12.1GB ± 0% 12.1GB ± 0% ~ (p=0.160 n=8+10) EtcdSTM 12.1GB ± 0% 12.1GB ± 0% -0.02% (p=0.000 n=10+10) GopherLuaKNucleotide 1.26GB ± 0% 1.26GB ± 0% -0.09% (p=0.000 n=10+10) MarkdownRenderXHTML 1.26GB ± 0% 1.26GB ± 0% -0.08% (p=0.000 n=10+10) Tile38QueryLoad 7.89GB ± 4% 7.76GB ± 1% -1.70% (p=0.008 n=10+8) name old p50-latency-ns new p50-latency-ns delta EtcdPut 20.1M ± 5% 20.2M ± 4% ~ (p=0.529 n=10+10) EtcdSTM 79.8M ± 4% 79.9M ± 4% ~ (p=0.971 n=10+10) Tile38QueryLoad 215k ± 1% 210k ± 3% -2.04% (p=0.021 n=8+10) name old p90-latency-ns new p90-latency-ns delta EtcdPut 31.9M ± 6% 32.0M ± 7% ~ (p=0.780 n=9+10) EtcdSTM 220M ± 6% 220M ± 2% ~ (p=1.000 n=10+10) Tile38QueryLoad 622k ± 2% 646k ± 2% +3.83% (p=0.000 n=10+10) name old p99-latency-ns new p99-latency-ns delta EtcdPut 47.6M ±32% 51.4M ±28% ~ (p=0.529 n=10+10) EtcdSTM 452M ± 2% 457M ± 2% ~ (p=0.182 n=9+10) Tile38QueryLoad 5.04M ± 2% 4.91M ± 3% -2.56% (p=0.001 n=9+9) name old ops/s new ops/s delta EtcdPut 46.1k ± 2% 45.7k ± 4% ~ (p=0.475 n=7+10) EtcdSTM 9.18k ± 5% 9.20k ± 3% ~ (p=0.971 n=10+10) Tile38QueryLoad 17.4k ± 1% 17.7k ± 1% +1.97% (p=0.000 n=9+8) Change-Id: I637f48fb9e8c181912db785ae9186d7f16769870 Reviewed-on: https://go-review.googlesource.com/c/go/+/537886 LUCI-TryBot-Result: Go LUCI Reviewed-by: Keith Randall Reviewed-by: Cherry Mui --- diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index beff9043ec..d9b4112ded 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -1149,8 +1149,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { memclrNoHeapPointers(x, size) } if goexperiment.AllocHeaders && hasHeader { - header = (**_type)(x) - x = add(x, mallocHeaderSize) + header = (**_type)(unsafe.Pointer(uintptr(v) + size - mallocHeaderSize)) size -= mallocHeaderSize } } diff --git a/src/runtime/mbitmap_allocheaders.go b/src/runtime/mbitmap_allocheaders.go index 8a6b6a6564..9370d50b72 100644 --- a/src/runtime/mbitmap_allocheaders.go +++ b/src/runtime/mbitmap_allocheaders.go @@ -48,9 +48,9 @@ // is zeroed, so the GC just observes nil pointers. // Note that this "tiled" bitmap isn't stored anywhere; it is generated on-the-fly. // -// For objects without their own span, the type metadata is stored in the first -// word before the object at the beginning of the allocation slot. For objects -// with their own span, the type metadata is stored in the mspan. +// For objects without their own span, the type metadata is stored in the last +// word of the allocation slot. For objects with their own span, the type metadata +// is stored in the mspan. // // The bitmap for small unallocated objects in scannable spans is not maintained // (can be junk). @@ -191,9 +191,8 @@ func (span *mspan) typePointersOfUnchecked(addr uintptr) typePointers { // All of these objects have a header. var typ *_type if spc.sizeclass() != 0 { - // Pull the allocation header from the first word of the object. - typ = *(**_type)(unsafe.Pointer(addr)) - addr += mallocHeaderSize + // Pull the allocation header from the last word of the object. + typ = *(**_type)(unsafe.Pointer(addr + span.elemsize - mallocHeaderSize)) } else { typ = span.largeType } diff --git a/src/runtime/mfinal.go b/src/runtime/mfinal.go index be501e6fca..18cd93e77e 100644 --- a/src/runtime/mfinal.go +++ b/src/runtime/mfinal.go @@ -9,7 +9,6 @@ package runtime import ( "internal/abi" "internal/goarch" - "internal/goexperiment" "runtime/internal/atomic" "runtime/internal/sys" "unsafe" @@ -411,7 +410,7 @@ func SetFinalizer(obj any, finalizer any) { } // find the containing object - base, span, _ := findObject(uintptr(e.data), 0, 0) + base, _, _ := findObject(uintptr(e.data), 0, 0) if base == 0 { if isGoPointerWithoutSpan(e.data) { @@ -420,11 +419,6 @@ func SetFinalizer(obj any, finalizer any) { throw("runtime.SetFinalizer: pointer not in allocated block") } - // Move base forward if we've got an allocation header. - if goexperiment.AllocHeaders && !span.spanclass.noscan() && !heapBitsInSpan(span.elemsize) && span.spanclass.sizeclass() != 0 { - base += mallocHeaderSize - } - if uintptr(e.data) != base { // As an implementation detail we allow to set finalizers for an inner byte // of an object if it could come from tiny alloc (see mallocgc for details).