]> Cypherpunks.ru repositories - gostls13.git/commitdiff
[release-branch.go1.21] runtime: put ReadMemStats debug assertions behind a double...
authorMichael Anthony Knyszek <mknyszek@google.com>
Mon, 27 Nov 2023 22:27:32 +0000 (22:27 +0000)
committerGopher Robot <gobot@golang.org>
Thu, 4 Jan 2024 21:34:32 +0000 (21:34 +0000)
ReadMemStats has a few assertions it makes about the consistency of the
stats it's about to produce. Specifically, how those stats line up with
runtime-internal stats. These checks are generally useful, but crashing
just because some stats are wrong is a heavy price to pay.

For a long time this wasn't a problem, but very recently it became a
real problem. It turns out that there's real benign skew that can happen
wherein sysmon (which doesn't synchronize with a STW) generates a trace
event when tracing is enabled, and may mutate some stats while
ReadMemStats is running its checks.

Fix this by synchronizing with both sysmon and the tracer. This is a bit
heavy-handed, but better that than false positives.

Also, put the checks behind a debug mode. We want to reduce the risk of
backporting this change, and again, it's not great to crash just because
user-facing stats are off. Still, enable this debug mode during the
runtime tests so we don't lose quite as much coverage from disabling
these checks by default.

For #64401.
Fixes #64410.

Change-Id: I9adb3e5c7161d207648d07373a11da8a5f0fda9a
Reviewed-on: https://go-review.googlesource.com/c/go/+/545277
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Felix Geisendörfer <felix.geisendoerfer@datadoghq.com>
(cherry picked from commit b2efd1de97402ec4b8fb4e9e0ec29c8e49e8e200)
Reviewed-on: https://go-review.googlesource.com/c/go/+/545557
Auto-Submit: Matthew Dempsky <mdempsky@google.com>
TryBot-Bypass: Matthew Dempsky <mdempsky@google.com>

src/runtime/export_test.go
src/runtime/gc_test.go
src/runtime/mstats.go

index f7ce5033f5879661fba7b3e1a806db1ee34fca73..a740993c58785f5a9a9fc4304ae1ea379a9c1b0f 100644 (file)
@@ -436,6 +436,8 @@ func ReadMetricsSlow(memStats *MemStats, samplesp unsafe.Pointer, len, cap int)
        startTheWorld()
 }
 
+var DoubleCheckReadMemStats = &doubleCheckReadMemStats
+
 // ReadMemStatsSlow returns both the runtime-computed MemStats and
 // MemStats accumulated by scanning the heap.
 func ReadMemStatsSlow() (base, slow MemStats) {
index bd01e3610300dbc1d076691aaa453f9bf4e08709..9302ea32c3f136eb4b704587e8f3bff9527c5a9e 100644 (file)
@@ -570,6 +570,11 @@ func TestPageAccounting(t *testing.T) {
        }
 }
 
+func init() {
+       // Enable ReadMemStats' double-check mode.
+       *runtime.DoubleCheckReadMemStats = true
+}
+
 func TestReadMemStats(t *testing.T) {
        base, slow := runtime.ReadMemStatsSlow()
        if base != slow {
index 9cdc56513719700345a5028ffb7d6f48e3a31170..308bed67d7b36449de91cf5aa2f5644f660d2b2a 100644 (file)
@@ -367,6 +367,11 @@ func ReadMemStats(m *MemStats) {
        startTheWorld()
 }
 
+// doubleCheckReadMemStats controls a double-check mode for ReadMemStats that
+// ensures consistency between the values that ReadMemStats is using and the
+// runtime-internal stats.
+var doubleCheckReadMemStats = false
+
 // readmemstats_m populates stats for internal runtime values.
 //
 // The world must be stopped.
@@ -441,56 +446,65 @@ func readmemstats_m(stats *MemStats) {
 
        heapGoal := gcController.heapGoal()
 
-       // The world is stopped, so the consistent stats (after aggregation)
-       // should be identical to some combination of memstats. In particular:
-       //
-       // * memstats.heapInUse == inHeap
-       // * memstats.heapReleased == released
-       // * memstats.heapInUse + memstats.heapFree == committed - inStacks - inWorkBufs - inPtrScalarBits
-       // * memstats.totalAlloc == totalAlloc
-       // * memstats.totalFree == totalFree
-       //
-       // Check if that's actually true.
-       //
-       // TODO(mknyszek): Maybe don't throw here. It would be bad if a
-       // bug in otherwise benign accounting caused the whole application
-       // to crash.
-       if gcController.heapInUse.load() != uint64(consStats.inHeap) {
-               print("runtime: heapInUse=", gcController.heapInUse.load(), "\n")
-               print("runtime: consistent value=", consStats.inHeap, "\n")
-               throw("heapInUse and consistent stats are not equal")
-       }
-       if gcController.heapReleased.load() != uint64(consStats.released) {
-               print("runtime: heapReleased=", gcController.heapReleased.load(), "\n")
-               print("runtime: consistent value=", consStats.released, "\n")
-               throw("heapReleased and consistent stats are not equal")
-       }
-       heapRetained := gcController.heapInUse.load() + gcController.heapFree.load()
-       consRetained := uint64(consStats.committed - consStats.inStacks - consStats.inWorkBufs - consStats.inPtrScalarBits)
-       if heapRetained != consRetained {
-               print("runtime: global value=", heapRetained, "\n")
-               print("runtime: consistent value=", consRetained, "\n")
-               throw("measures of the retained heap are not equal")
-       }
-       if gcController.totalAlloc.Load() != totalAlloc {
-               print("runtime: totalAlloc=", gcController.totalAlloc.Load(), "\n")
-               print("runtime: consistent value=", totalAlloc, "\n")
-               throw("totalAlloc and consistent stats are not equal")
-       }
-       if gcController.totalFree.Load() != totalFree {
-               print("runtime: totalFree=", gcController.totalFree.Load(), "\n")
-               print("runtime: consistent value=", totalFree, "\n")
-               throw("totalFree and consistent stats are not equal")
-       }
-       // Also check that mappedReady lines up with totalMapped - released.
-       // This isn't really the same type of "make sure consistent stats line up" situation,
-       // but this is an opportune time to check.
-       if gcController.mappedReady.Load() != totalMapped-uint64(consStats.released) {
-               print("runtime: mappedReady=", gcController.mappedReady.Load(), "\n")
-               print("runtime: totalMapped=", totalMapped, "\n")
-               print("runtime: released=", uint64(consStats.released), "\n")
-               print("runtime: totalMapped-released=", totalMapped-uint64(consStats.released), "\n")
-               throw("mappedReady and other memstats are not equal")
+       if doubleCheckReadMemStats {
+               // Only check this if we're debugging. It would be bad to crash an application
+               // just because the debugging stats are wrong. We mostly rely on tests to catch
+               // these issues, and we enable the double check mode for tests.
+               //
+               // The world is stopped, so the consistent stats (after aggregation)
+               // should be identical to some combination of memstats. In particular:
+               //
+               // * memstats.heapInUse == inHeap
+               // * memstats.heapReleased == released
+               // * memstats.heapInUse + memstats.heapFree == committed - inStacks - inWorkBufs - inPtrScalarBits
+               // * memstats.totalAlloc == totalAlloc
+               // * memstats.totalFree == totalFree
+               //
+               // Check if that's actually true.
+               //
+               // Prevent sysmon and the tracer from skewing the stats since they can
+               // act without synchronizing with a STW. See #64401.
+               lock(&sched.sysmonlock)
+               lock(&trace.lock)
+               if gcController.heapInUse.load() != uint64(consStats.inHeap) {
+                       print("runtime: heapInUse=", gcController.heapInUse.load(), "\n")
+                       print("runtime: consistent value=", consStats.inHeap, "\n")
+                       throw("heapInUse and consistent stats are not equal")
+               }
+               if gcController.heapReleased.load() != uint64(consStats.released) {
+                       print("runtime: heapReleased=", gcController.heapReleased.load(), "\n")
+                       print("runtime: consistent value=", consStats.released, "\n")
+                       throw("heapReleased and consistent stats are not equal")
+               }
+               heapRetained := gcController.heapInUse.load() + gcController.heapFree.load()
+               consRetained := uint64(consStats.committed - consStats.inStacks - consStats.inWorkBufs - consStats.inPtrScalarBits)
+               if heapRetained != consRetained {
+                       print("runtime: global value=", heapRetained, "\n")
+                       print("runtime: consistent value=", consRetained, "\n")
+                       throw("measures of the retained heap are not equal")
+               }
+               if gcController.totalAlloc.Load() != totalAlloc {
+                       print("runtime: totalAlloc=", gcController.totalAlloc.Load(), "\n")
+                       print("runtime: consistent value=", totalAlloc, "\n")
+                       throw("totalAlloc and consistent stats are not equal")
+               }
+               if gcController.totalFree.Load() != totalFree {
+                       print("runtime: totalFree=", gcController.totalFree.Load(), "\n")
+                       print("runtime: consistent value=", totalFree, "\n")
+                       throw("totalFree and consistent stats are not equal")
+               }
+               // Also check that mappedReady lines up with totalMapped - released.
+               // This isn't really the same type of "make sure consistent stats line up" situation,
+               // but this is an opportune time to check.
+               if gcController.mappedReady.Load() != totalMapped-uint64(consStats.released) {
+                       print("runtime: mappedReady=", gcController.mappedReady.Load(), "\n")
+                       print("runtime: totalMapped=", totalMapped, "\n")
+                       print("runtime: released=", uint64(consStats.released), "\n")
+                       print("runtime: totalMapped-released=", totalMapped-uint64(consStats.released), "\n")
+                       throw("mappedReady and other memstats are not equal")
+               }
+               unlock(&trace.lock)
+               unlock(&sched.sysmonlock)
        }
 
        // We've calculated all the values we need. Now, populate stats.