The previous CL in this series (CL 437955) adds the allocation headers
experiment. However, this experiment puts the headers at the beginning
of each allocation, which decreases the default allocator alignment that
users can rely upon. Historically, Go's memory allocator has implicitly
provided 16-byte alignment (except for sizes where it doesn't make
sense, like 8 or 24 bytes), so it's not unlikely that users are
depending on it. It also complicates other changes that want higher
alignment. For example, the sync/atomic.Uint64Pair proposal would
(hypothetically; it's not yet accepted) introduce a type with 16-byte
alignment. The malloc fast path will require extra code to consider
alignment and will waste memory for any value containing such a type.
This change moves the allocation header to the end of the span's
allocation slot instead of the beginning. This means worse locality for
the GC when scanning, but it's still an overall win. It also means that
objects will still have the 16-byte alignment we've provided thus far.
This is broken out in a separate change just becauase it ended up that
way during development. But I've chosen to leave it this way in case we
want to try and move allocation headers to the front of objects again.
Below are the benchmark results of this CL series, comparing the
performance of this CL with GOEXPERIMENT=allocheaders vs. without this
CL series.
name old time/op new time/op delta
BiogoIgor 12.5s ± 0% 12.4s ± 2% ~ (p=0.079 n=9+10)
BiogoKrishna 12.8s ±10% 12.4s ±10% ~ (p=0.182 n=9+10)
BleveIndexBatch100 4.54s ± 3% 4.60s ± 3% ~ (p=0.050 n=9+9)
EtcdPut 21.1ms ± 2% 21.3ms ± 4% ~ (p=0.669 n=7+10)
EtcdSTM 107ms ± 3% 108ms ± 2% ~ (p=0.497 n=9+10)
GoBuildKubelet 34.1s ± 3% 33.1s ± 2% -3.08% (p=0.000 n=10+10)
GoBuildKubeletLink 7.94s ± 2% 7.95s ± 2% ~ (p=0.631 n=10+10)
GoBuildIstioctl 33.2s ± 1% 31.7s ± 0% -4.37% (p=0.000 n=9+9)
GoBuildIstioctlLink 8.07s ± 1% 8.05s ± 1% ~ (p=0.356 n=9+10)
GoBuildFrontend 12.1s ± 0% 11.5s ± 1% -4.43% (p=0.000 n=8+10)
GoBuildFrontendLink 1.20s ± 2% 1.20s ± 2% ~ (p=0.905 n=9+10)
GopherLuaKNucleotide 19.9s ± 0% 19.5s ± 1% -1.95% (p=0.000 n=9+10)
MarkdownRenderXHTML 194ms ± 5% 194ms ± 2% ~ (p=0.931 n=9+9)
Tile38QueryLoad 518µs ± 1% 508µs ± 1% -1.93% (p=0.000 n=9+8)
name old average-RSS-bytes new average-RSS-bytes delta
BiogoIgor 66.2MB ± 3% 65.6MB ± 1% ~ (p=0.156 n=10+9)
BiogoKrishna 4.34GB ± 2% 4.34GB ± 1% ~ (p=0.315 n=10+9)
BleveIndexBatch100 189MB ± 3% 186MB ± 3% ~ (p=0.052 n=10+10)
EtcdPut 105MB ± 5% 107MB ± 6% ~ (p=0.579 n=10+10)
EtcdSTM 92.1MB ± 5% 93.2MB ± 4% ~ (p=0.353 n=10+10)
GoBuildKubelet 2.07GB ± 1% 2.07GB ± 1% ~ (p=0.436 n=10+10)
GoBuildIstioctl 1.44GB ± 1% 1.46GB ± 1% +0.96% (p=0.001 n=10+10)
GoBuildFrontend 522MB ± 1% 512MB ± 2% -1.98% (p=0.000 n=10+10)
GopherLuaKNucleotide 37.4MB ± 5% 36.4MB ± 4% -2.53% (p=0.035 n=10+10)
MarkdownRenderXHTML 21.2MB ± 1% 20.9MB ± 3% -1.53% (p=0.003 n=8+10)
Tile38QueryLoad 6.39GB ± 2% 6.24GB ± 2% -2.40% (p=0.000 n=10+10)
name old peak-RSS-bytes new peak-RSS-bytes delta
BiogoIgor 88.5MB ± 4% 88.4MB ± 3% ~ (p=0.971 n=10+10)
BiogoKrishna 4.48GB ± 0% 4.42GB ± 0% -1.49% (p=0.000 n=10+10)
BleveIndexBatch100 268MB ± 3% 265MB ± 4% ~ (p=0.315 n=9+10)
EtcdPut 147MB ± 9% 146MB ± 5% ~ (p=0.853 n=10+10)
EtcdSTM 119MB ± 6% 120MB ± 5% ~ (p=0.796 n=10+10)
GopherLuaKNucleotide 43.1MB ±17% 40.7MB ±12% ~ (p=0.075 n=10+10)
MarkdownRenderXHTML 21.2MB ± 1% 21.1MB ± 3% ~ (p=0.511 n=9+10)
Tile38QueryLoad 6.65GB ± 4% 6.52GB ± 2% -1.93% (p=0.009 n=10+10)
name old peak-VM-bytes new peak-VM-bytes delta
BiogoIgor 1.33GB ± 0% 1.33GB ± 0% -0.16% (p=0.000 n=10+10)
BiogoKrishna 5.77GB ± 0% 5.69GB ± 0% -1.23% (p=0.000 n=10+10)
BleveIndexBatch100 2.62GB ± 0% 2.61GB ± 0% -0.13% (p=0.000 n=7+10)
EtcdPut 12.1GB ± 0% 12.1GB ± 0% ~ (p=0.160 n=8+10)
EtcdSTM 12.1GB ± 0% 12.1GB ± 0% -0.02% (p=0.000 n=10+10)
GopherLuaKNucleotide 1.26GB ± 0% 1.26GB ± 0% -0.09% (p=0.000 n=10+10)
MarkdownRenderXHTML 1.26GB ± 0% 1.26GB ± 0% -0.08% (p=0.000 n=10+10)
Tile38QueryLoad 7.89GB ± 4% 7.76GB ± 1% -1.70% (p=0.008 n=10+8)
name old p50-latency-ns new p50-latency-ns delta
EtcdPut 20.1M ± 5% 20.2M ± 4% ~ (p=0.529 n=10+10)
EtcdSTM 79.8M ± 4% 79.9M ± 4% ~ (p=0.971 n=10+10)
Tile38QueryLoad 215k ± 1% 210k ± 3% -2.04% (p=0.021 n=8+10)
name old p90-latency-ns new p90-latency-ns delta
EtcdPut 31.9M ± 6% 32.0M ± 7% ~ (p=0.780 n=9+10)
EtcdSTM 220M ± 6% 220M ± 2% ~ (p=1.000 n=10+10)
Tile38QueryLoad 622k ± 2% 646k ± 2% +3.83% (p=0.000 n=10+10)
name old p99-latency-ns new p99-latency-ns delta
EtcdPut 47.6M ±32% 51.4M ±28% ~ (p=0.529 n=10+10)
EtcdSTM 452M ± 2% 457M ± 2% ~ (p=0.182 n=9+10)
Tile38QueryLoad 5.04M ± 2% 4.91M ± 3% -2.56% (p=0.001 n=9+9)
name old ops/s new ops/s delta
EtcdPut 46.1k ± 2% 45.7k ± 4% ~ (p=0.475 n=7+10)
EtcdSTM 9.18k ± 5% 9.20k ± 3% ~ (p=0.971 n=10+10)
Tile38QueryLoad 17.4k ± 1% 17.7k ± 1% +1.97% (p=0.000 n=9+8)
Change-Id: I637f48fb9e8c181912db785ae9186d7f16769870
Reviewed-on: https://go-review.googlesource.com/c/go/+/537886
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
memclrNoHeapPointers(x, size)
}
if goexperiment.AllocHeaders && hasHeader {
memclrNoHeapPointers(x, size)
}
if goexperiment.AllocHeaders && hasHeader {
- header = (**_type)(x)
- x = add(x, mallocHeaderSize)
+ header = (**_type)(unsafe.Pointer(uintptr(v) + size - mallocHeaderSize))
size -= mallocHeaderSize
}
}
size -= mallocHeaderSize
}
}
// is zeroed, so the GC just observes nil pointers.
// Note that this "tiled" bitmap isn't stored anywhere; it is generated on-the-fly.
//
// is zeroed, so the GC just observes nil pointers.
// Note that this "tiled" bitmap isn't stored anywhere; it is generated on-the-fly.
//
-// For objects without their own span, the type metadata is stored in the first
-// word before the object at the beginning of the allocation slot. For objects
-// with their own span, the type metadata is stored in the mspan.
+// For objects without their own span, the type metadata is stored in the last
+// word of the allocation slot. For objects with their own span, the type metadata
+// is stored in the mspan.
//
// The bitmap for small unallocated objects in scannable spans is not maintained
// (can be junk).
//
// The bitmap for small unallocated objects in scannable spans is not maintained
// (can be junk).
// All of these objects have a header.
var typ *_type
if spc.sizeclass() != 0 {
// All of these objects have a header.
var typ *_type
if spc.sizeclass() != 0 {
- // Pull the allocation header from the first word of the object.
- typ = *(**_type)(unsafe.Pointer(addr))
- addr += mallocHeaderSize
+ // Pull the allocation header from the last word of the object.
+ typ = *(**_type)(unsafe.Pointer(addr + span.elemsize - mallocHeaderSize))
} else {
typ = span.largeType
}
} else {
typ = span.largeType
}
import (
"internal/abi"
"internal/goarch"
import (
"internal/abi"
"internal/goarch"
- "internal/goexperiment"
"runtime/internal/atomic"
"runtime/internal/sys"
"unsafe"
"runtime/internal/atomic"
"runtime/internal/sys"
"unsafe"
}
// find the containing object
}
// find the containing object
- base, span, _ := findObject(uintptr(e.data), 0, 0)
+ base, _, _ := findObject(uintptr(e.data), 0, 0)
if base == 0 {
if isGoPointerWithoutSpan(e.data) {
if base == 0 {
if isGoPointerWithoutSpan(e.data) {
throw("runtime.SetFinalizer: pointer not in allocated block")
}
throw("runtime.SetFinalizer: pointer not in allocated block")
}
- // Move base forward if we've got an allocation header.
- if goexperiment.AllocHeaders && !span.spanclass.noscan() && !heapBitsInSpan(span.elemsize) && span.spanclass.sizeclass() != 0 {
- base += mallocHeaderSize
- }
-
if uintptr(e.data) != base {
// As an implementation detail we allow to set finalizers for an inner byte
// of an object if it could come from tiny alloc (see mallocgc for details).
if uintptr(e.data) != base {
// As an implementation detail we allow to set finalizers for an inner byte
// of an object if it could come from tiny alloc (see mallocgc for details).