]> Cypherpunks.ru repositories - gostls13.git/commitdiff
runtime: implement GC pacer redesign
authorMichael Anthony Knyszek <mknyszek@google.com>
Tue, 13 Apr 2021 03:07:27 +0000 (03:07 +0000)
committerMichael Knyszek <mknyszek@google.com>
Thu, 4 Nov 2021 20:00:31 +0000 (20:00 +0000)
This change implements the GC pacer redesign outlined in #44167 and the
accompanying design document, behind a GOEXPERIMENT flag that is on by
default.

In addition to adding the new pacer, this CL also includes code to track
and account for stack and globals scan work in the pacer and in the
assist credit system.

The new pacer also deviates slightly from the document in that it
increases the bound on the minimum trigger ratio from 0.6 (scaled by
GOGC) to 0.7. The logic behind this change is that the new pacer much
more consistently hits the goal (good!) leading to slightly less
frequent GC cycles, but _longer_ ones (in this case, bad!). It turns out
that the cost of having the GC on hurts throughput significantly (per
byte of memory used), though tail latencies can improve by up to 10%! To
be conservative, this change moves the value to 0.7 where there is a
small improvement to both throughput and latency, given the memory use.

Because the new pacer accounts for the two most significant sources of
scan work after heap objects, it is now also safer to reduce the minimum
heap size without leading to very poor amortization. This change thus
decreases the minimum heap size to 512 KiB, which corresponds to the
fact that the runtime has around 200 KiB of scannable globals always
there, up-front, providing a baseline.

Benchmark results: https://perf.golang.org/search?q=upload:20211001.6

tile38's KNearest benchmark shows a memory increase, but throughput (and
latency) per byte of memory used is better.

gopher-lua showed an increase in both CPU time and memory usage, but
subsequent attempts to reproduce this behavior are inconsistent.
Sometimes the overall performance is better, sometimes it's worse. This
suggests that the benchmark is fairly noisy in a way not captured by the
benchmarking framework itself.

biogo-igor is the only benchmark to show a significant performance loss.
This benchmark exhibits a very high GC rate, with relatively little work
to do in each cycle. The idle mark workers are quite active. In the new
pacer, mark phases are longer, mark assists are fewer, and some of that
time in mark assists has shifted to idle workers. Linux perf indicates
that the difference in CPU time can be mostly attributed to write-barrier
slow path related calls, which in turn indicates that the write barrier
being on for longer is the primary culprit. This also explains the memory
increase, as a longer mark phase leads to more memory allocated black,
surviving an extra cycle and contributing to the heap goal.

For #44167.

Change-Id: I8ac7cfef7d593e4a642c9b2be43fb3591a8ec9c4
Reviewed-on: https://go-review.googlesource.com/c/go/+/309869
Trust: Michael Knyszek <mknyszek@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
src/internal/goexperiment/exp_pacerredesign_off.go [new file with mode: 0644]
src/internal/goexperiment/exp_pacerredesign_on.go [new file with mode: 0644]
src/internal/goexperiment/flags.go
src/runtime/export_test.go
src/runtime/mgc.go
src/runtime/mgcmark.go
src/runtime/mgcpacer.go
src/runtime/mgcpacer_test.go
src/runtime/mgcwork.go

diff --git a/src/internal/goexperiment/exp_pacerredesign_off.go b/src/internal/goexperiment/exp_pacerredesign_off.go
new file mode 100644 (file)
index 0000000..62e1831
--- /dev/null
@@ -0,0 +1,9 @@
+// Code generated by mkconsts.go. DO NOT EDIT.
+
+//go:build !goexperiment.pacerredesign
+// +build !goexperiment.pacerredesign
+
+package goexperiment
+
+const PacerRedesign = false
+const PacerRedesignInt = 0
diff --git a/src/internal/goexperiment/exp_pacerredesign_on.go b/src/internal/goexperiment/exp_pacerredesign_on.go
new file mode 100644 (file)
index 0000000..b22b031
--- /dev/null
@@ -0,0 +1,9 @@
+// Code generated by mkconsts.go. DO NOT EDIT.
+
+//go:build goexperiment.pacerredesign
+// +build goexperiment.pacerredesign
+
+package goexperiment
+
+const PacerRedesign = true
+const PacerRedesignInt = 1
index 0a61a0e5fc12adc1556cc38138bf14e2583b3cf9..3bf19222e1cb02f04f6b2d239735f169205f5dc5 100644 (file)
@@ -83,4 +83,10 @@ type Flags struct {
        // Requires wrappers (to do ABI translation), and reflect (so
        // reflection calls use registers).
        RegabiArgs bool
+
+       // PacerRedesign enables the new GC pacer in the runtime.
+       //
+       // Details regarding the new pacer may be found at
+       // https://golang.org/design/44167-gc-pacer-redesign
+       PacerRedesign bool
 }
index 533627fa585f374a32ee24b8487202a046b4f8bf..5149252c83746395c2fde9287dbf93acd68d53f2 100644 (file)
@@ -1290,7 +1290,9 @@ type GCControllerReviseDelta struct {
 func (c *GCController) Revise(d GCControllerReviseDelta) {
        c.heapLive += uint64(d.HeapLive)
        c.heapScan += uint64(d.HeapScan)
-       c.scanWork += d.HeapScanWork + d.StackScanWork + d.GlobalsScanWork
+       c.heapScanWork.Add(d.HeapScanWork)
+       c.stackScanWork.Add(d.StackScanWork)
+       c.globalsScanWork.Add(d.GlobalsScanWork)
        c.revise()
 }
 
index 03711a9617da03649172888db8209b813dafc5bd..96f4157b59864751da8b3ac5b798541dcbb43222 100644 (file)
@@ -1084,6 +1084,8 @@ func gcMarkTermination(nextTriggerRatio float64) {
                print(" ms cpu, ",
                        work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
                        work.heapGoal>>20, " MB goal, ",
+                       gcController.stackScan>>20, " MB stacks, ",
+                       gcController.globalsScan>>20, " MB globals, ",
                        work.maxprocs, " P")
                if work.userForced {
                        print(" (forced)")
index efda65fe1e6b4e60511d42bf2afd10fcfed5141e..a5129bd1ee489f55f81eebf633b5bf6cc801a7aa 100644 (file)
@@ -8,6 +8,7 @@ package runtime
 
 import (
        "internal/goarch"
+       "internal/goexperiment"
        "runtime/internal/atomic"
        "runtime/internal/sys"
        "unsafe"
@@ -151,20 +152,28 @@ var oneptrmask = [...]uint8{1}
 //
 // Preemption must be disabled (because this uses a gcWork).
 //
+// Returns the amount of GC work credit produced by the operation.
+// If flushBgCredit is true, then that credit is also flushed
+// to the background credit pool.
+//
 // nowritebarrier is only advisory here.
 //
 //go:nowritebarrier
-func markroot(gcw *gcWork, i uint32) {
+func markroot(gcw *gcWork, i uint32, flushBgCredit bool) int64 {
        // Note: if you add a case here, please also update heapdump.go:dumproots.
+       var workDone int64
+       var workCounter *atomic.Int64
        switch {
        case work.baseData <= i && i < work.baseBSS:
+               workCounter = &gcController.globalsScanWork
                for _, datap := range activeModules() {
-                       markrootBlock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, gcw, int(i-work.baseData))
+                       workDone += markrootBlock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, gcw, int(i-work.baseData))
                }
 
        case work.baseBSS <= i && i < work.baseSpans:
+               workCounter = &gcController.globalsScanWork
                for _, datap := range activeModules() {
-                       markrootBlock(datap.bss, datap.ebss-datap.bss, datap.gcbssmask.bytedata, gcw, int(i-work.baseBSS))
+                       workDone += markrootBlock(datap.bss, datap.ebss-datap.bss, datap.gcbssmask.bytedata, gcw, int(i-work.baseBSS))
                }
 
        case i == fixedRootFinalizers:
@@ -184,6 +193,7 @@ func markroot(gcw *gcWork, i uint32) {
 
        default:
                // the rest is scanning goroutine stacks
+               workCounter = &gcController.stackScanWork
                var gp *g
                if work.baseStacks <= i && i < work.baseEnd {
                        // N.B. Atomic read of allglen in gcMarkRootPrepare
@@ -230,7 +240,7 @@ func markroot(gcw *gcWork, i uint32) {
                        if gp.gcscandone {
                                throw("g already scanned")
                        }
-                       scanstack(gp, gcw)
+                       workDone += scanstack(gp, gcw)
                        gp.gcscandone = true
                        resumeG(stopped)
 
@@ -239,13 +249,24 @@ func markroot(gcw *gcWork, i uint32) {
                        }
                })
        }
+       if goexperiment.PacerRedesign {
+               if workCounter != nil && workDone != 0 {
+                       workCounter.Add(workDone)
+                       if flushBgCredit {
+                               gcFlushBgCredit(workDone)
+                       }
+               }
+       }
+       return workDone
 }
 
 // markrootBlock scans the shard'th shard of the block of memory [b0,
 // b0+n0), with the given pointer mask.
 //
+// Returns the amount of work done.
+//
 //go:nowritebarrier
-func markrootBlock(b0, n0 uintptr, ptrmask0 *uint8, gcw *gcWork, shard int) {
+func markrootBlock(b0, n0 uintptr, ptrmask0 *uint8, gcw *gcWork, shard int) int64 {
        if rootBlockBytes%(8*goarch.PtrSize) != 0 {
                // This is necessary to pick byte offsets in ptrmask0.
                throw("rootBlockBytes must be a multiple of 8*ptrSize")
@@ -256,7 +277,7 @@ func markrootBlock(b0, n0 uintptr, ptrmask0 *uint8, gcw *gcWork, shard int) {
        // These tests are written to avoid any possible overflow.
        off := uintptr(shard) * rootBlockBytes
        if off >= n0 {
-               return
+               return 0
        }
        b := b0 + off
        ptrmask := (*uint8)(add(unsafe.Pointer(ptrmask0), uintptr(shard)*(rootBlockBytes/(8*goarch.PtrSize))))
@@ -267,6 +288,7 @@ func markrootBlock(b0, n0 uintptr, ptrmask0 *uint8, gcw *gcWork, shard int) {
 
        // Scan this shard.
        scanblock(b, n, ptrmask, gcw, nil)
+       return int64(n)
 }
 
 // markrootFreeGStacks frees stacks of dead Gs.
@@ -681,6 +703,13 @@ func gcFlushBgCredit(scanWork int64) {
 
 // scanstack scans gp's stack, greying all pointers found on the stack.
 //
+// For goexperiment.PacerRedesign:
+// Returns the amount of scan work performed, but doesn't update
+// gcController.stackScanWork or flush any credit. Any background credit produced
+// by this function should be flushed by its caller. scanstack itself can't
+// safely flush because it may result in trying to wake up a goroutine that
+// was just scanned, resulting in a self-deadlock.
+//
 // scanstack will also shrink the stack if it is safe to do so. If it
 // is not, it schedules a stack shrink for the next synchronous safe
 // point.
@@ -690,7 +719,7 @@ func gcFlushBgCredit(scanWork int64) {
 //
 //go:nowritebarrier
 //go:systemstack
-func scanstack(gp *g, gcw *gcWork) {
+func scanstack(gp *g, gcw *gcWork) int64 {
        if readgstatus(gp)&_Gscan == 0 {
                print("runtime:scanstack: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", hex(readgstatus(gp)), "\n")
                throw("scanstack - bad status")
@@ -701,7 +730,7 @@ func scanstack(gp *g, gcw *gcWork) {
                print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
                throw("mark - bad status")
        case _Gdead:
-               return
+               return 0
        case _Grunning:
                print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
                throw("scanstack: goroutine not stopped")
@@ -713,6 +742,15 @@ func scanstack(gp *g, gcw *gcWork) {
                throw("can't scan our own stack")
        }
 
+       // stackSize is the amount of work we'll be reporting.
+       //
+       // We report the total stack size, more than we scan,
+       // because this number needs to line up with gcControllerState's
+       // stackScan and scannableStackSize fields.
+       //
+       // See the documentation on those fields for more information.
+       stackSize := gp.stack.hi - gp.stack.lo
+
        if isShrinkStackSafe(gp) {
                // Shrink the stack if not much of it is being used.
                shrinkstack(gp)
@@ -852,6 +890,7 @@ func scanstack(gp *g, gcw *gcWork) {
        if state.buf != nil || state.cbuf != nil || state.freeBuf != nil {
                throw("remaining pointer buffers")
        }
+       return int64(stackSize)
 }
 
 // Scan a stack frame: local variables and function arguments/results.
@@ -984,7 +1023,7 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) {
        flushBgCredit := flags&gcDrainFlushBgCredit != 0
        idle := flags&gcDrainIdle != 0
 
-       initScanWork := gcw.scanWork
+       initScanWork := gcw.heapScanWork
 
        // checkWork is the scan work before performing the next
        // self-preempt check.
@@ -1007,7 +1046,7 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) {
                        if job >= work.markrootJobs {
                                break
                        }
-                       markroot(gcw, job)
+                       markroot(gcw, job, flushBgCredit)
                        if check != nil && check() {
                                goto done
                        }
@@ -1046,14 +1085,14 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) {
                // Flush background scan work credit to the global
                // account if we've accumulated enough locally so
                // mutator assists can draw on it.
-               if gcw.scanWork >= gcCreditSlack {
-                       atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
+               if gcw.heapScanWork >= gcCreditSlack {
+                       gcController.heapScanWork.Add(gcw.heapScanWork)
                        if flushBgCredit {
-                               gcFlushBgCredit(gcw.scanWork - initScanWork)
+                               gcFlushBgCredit(gcw.heapScanWork - initScanWork)
                                initScanWork = 0
                        }
-                       checkWork -= gcw.scanWork
-                       gcw.scanWork = 0
+                       checkWork -= gcw.heapScanWork
+                       gcw.heapScanWork = 0
 
                        if checkWork <= 0 {
                                checkWork += drainCheckThreshold
@@ -1066,12 +1105,12 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) {
 
 done:
        // Flush remaining scan work credit.
-       if gcw.scanWork > 0 {
-               atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
+       if gcw.heapScanWork > 0 {
+               gcController.heapScanWork.Add(gcw.heapScanWork)
                if flushBgCredit {
-                       gcFlushBgCredit(gcw.scanWork - initScanWork)
+                       gcFlushBgCredit(gcw.heapScanWork - initScanWork)
                }
-               gcw.scanWork = 0
+               gcw.heapScanWork = 0
        }
 }
 
@@ -1095,10 +1134,10 @@ func gcDrainN(gcw *gcWork, scanWork int64) int64 {
 
        // There may already be scan work on the gcw, which we don't
        // want to claim was done by this call.
-       workFlushed := -gcw.scanWork
+       workFlushed := -gcw.heapScanWork
 
        gp := getg().m.curg
-       for !gp.preempt && workFlushed+gcw.scanWork < scanWork {
+       for !gp.preempt && workFlushed+gcw.heapScanWork < scanWork {
                // See gcDrain comment.
                if work.full == 0 {
                        gcw.balance()
@@ -1117,13 +1156,13 @@ func gcDrainN(gcw *gcWork, scanWork int64) int64 {
 
                if b == 0 {
                        // Try to do a root job.
-                       //
-                       // TODO: Assists should get credit for this
-                       // work.
                        if work.markrootNext < work.markrootJobs {
                                job := atomic.Xadd(&work.markrootNext, +1) - 1
                                if job < work.markrootJobs {
-                                       markroot(gcw, job)
+                                       work := markroot(gcw, job, false)
+                                       if goexperiment.PacerRedesign {
+                                               workFlushed += work
+                                       }
                                        continue
                                }
                        }
@@ -1134,10 +1173,10 @@ func gcDrainN(gcw *gcWork, scanWork int64) int64 {
                scanobject(b, gcw)
 
                // Flush background scan work credit.
-               if gcw.scanWork >= gcCreditSlack {
-                       atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
-                       workFlushed += gcw.scanWork
-                       gcw.scanWork = 0
+               if gcw.heapScanWork >= gcCreditSlack {
+                       gcController.heapScanWork.Add(gcw.heapScanWork)
+                       workFlushed += gcw.heapScanWork
+                       gcw.heapScanWork = 0
                }
        }
 
@@ -1145,14 +1184,14 @@ func gcDrainN(gcw *gcWork, scanWork int64) int64 {
        // here because this never flushes to bgScanCredit and
        // gcw.dispose will flush any remaining work to scanWork.
 
-       return workFlushed + gcw.scanWork
+       return workFlushed + gcw.heapScanWork
 }
 
 // scanblock scans b as scanobject would, but using an explicit
 // pointer bitmap instead of the heap bitmap.
 //
 // This is used to scan non-heap roots, so it does not update
-// gcw.bytesMarked or gcw.scanWork.
+// gcw.bytesMarked or gcw.heapScanWork.
 //
 // If stk != nil, possible stack pointers are also reported to stk.putPtr.
 //go:nowritebarrier
@@ -1282,7 +1321,7 @@ func scanobject(b uintptr, gcw *gcWork) {
                }
        }
        gcw.bytesMarked += uint64(n)
-       gcw.scanWork += int64(i)
+       gcw.heapScanWork += int64(i)
 }
 
 // scanConservative scans block [b, b+n) conservatively, treating any
@@ -1521,7 +1560,19 @@ func gcmarknewobject(span *mspan, obj, size, scanSize uintptr) {
 
        gcw := &getg().m.p.ptr().gcw
        gcw.bytesMarked += uint64(size)
-       gcw.scanWork += int64(scanSize)
+       if !goexperiment.PacerRedesign {
+               // The old pacer counts newly allocated memory toward
+               // heapScanWork because heapScan is continuously updated
+               // throughout the GC cyle with newly allocated memory. However,
+               // these objects are never actually scanned, so we need
+               // to account for them in heapScanWork here, "faking" their work.
+               // Otherwise the pacer will think it's always behind, potentially
+               // by a large margin.
+               //
+               // The new pacer doesn't care about this because it ceases to updated
+               // heapScan once a GC cycle starts, effectively snapshotting it.
+               gcw.heapScanWork += int64(scanSize)
+       }
 }
 
 // gcMarkTinyAllocs greys all active tiny alloc blocks.
index af43e6258f2700716d70848a5337e303b7830180..f886a07da19374c9907db0102704ac96a43fb22d 100644 (file)
@@ -6,6 +6,7 @@ package runtime
 
 import (
        "internal/cpu"
+       "internal/goexperiment"
        "runtime/internal/atomic"
        "unsafe"
 )
@@ -13,7 +14,8 @@ import (
 const (
        // gcGoalUtilization is the goal CPU utilization for
        // marking as a fraction of GOMAXPROCS.
-       gcGoalUtilization = 0.30
+       gcGoalUtilization = goexperiment.PacerRedesignInt*gcBackgroundUtilization +
+               (1-goexperiment.PacerRedesignInt)*(gcBackgroundUtilization+0.05)
 
        // gcBackgroundUtilization is the fixed CPU utilization for background
        // marking. It must be <= gcGoalUtilization. The difference between
@@ -26,10 +28,15 @@ const (
        // better control CPU and heap growth. However, the larger the gap,
        // the more mutator assists are expected to happen, which impact
        // mutator latency.
+       //
+       // If goexperiment.PacerRedesign, the trigger feedback controller
+       // is replaced with an estimate of the mark/cons ratio that doesn't
+       // have the same saturation issues, so this is set equal to
+       // gcGoalUtilization.
        gcBackgroundUtilization = 0.25
 
        // gcCreditSlack is the amount of scan work credit that can
-       // accumulate locally before updating gcController.scanWork and,
+       // accumulate locally before updating gcController.heapScanWork and,
        // optionally, gcController.bgScanCredit. Lower values give a more
        // accurate assist ratio and make it more likely that assists will
        // successfully steal background credit. Higher values reduce memory
@@ -46,7 +53,8 @@ const (
        gcOverAssistWork = 64 << 10
 
        // defaultHeapMinimum is the value of heapMinimum for GOGC==100.
-       defaultHeapMinimum = 4 << 20
+       defaultHeapMinimum = goexperiment.PacerRedesignInt*(512<<10) +
+               (1-goexperiment.PacerRedesignInt)*(4<<20)
 
        // scannableStackSizeSlack is the bytes of stack space allocated or freed
        // that can accumulate on a P before updating gcController.stackSize.
@@ -108,6 +116,8 @@ type gcControllerState struct {
        // during mark termination for the next cycle's trigger.
        //
        // Protected by mheap_.lock or a STW.
+       //
+       // Used if !goexperiment.PacerRedesign.
        triggerRatio float64
 
        // trigger is the heap size that triggers marking.
@@ -122,6 +132,31 @@ type gcControllerState struct {
        // Protected by mheap_.lock or a STW.
        trigger uint64
 
+       // consMark is the estimated per-CPU consMark ratio for the application.
+       //
+       // It represents the ratio between the application's allocation
+       // rate, as bytes allocated per CPU-time, and the GC's scan rate,
+       // as bytes scanned per CPU-time.
+       // The units of this ratio are (B / cpu-ns) / (B / cpu-ns).
+       //
+       // At a high level, this value is computed as the bytes of memory
+       // allocated (cons) per unit of scan work completed (mark) in a GC
+       // cycle, divided by the CPU time spent on each activity.
+       //
+       // Updated at the end of each GC cycle, in endCycle.
+       //
+       // For goexperiment.PacerRedesign.
+       consMark float64
+
+       // consMarkController holds the state for the mark-cons ratio
+       // estimation over time.
+       //
+       // Its purpose is to smooth out noisiness in the computation of
+       // consMark; see consMark for details.
+       //
+       // For goexperiment.PacerRedesign.
+       consMarkController piController
+
        // heapGoal is the goal heapLive for when next GC ends.
        // Set to ^uint64(0) if disabled.
        //
@@ -164,12 +199,23 @@ type gcControllerState struct {
        // is the live heap (as counted by heapLive), but omitting
        // no-scan objects and no-scan tails of objects.
        //
-       // Whenever this is updated, call this gcControllerState's
-       // revise() method.
+       // For !goexperiment.PacerRedesign: Whenever this is updated,
+       // call this gcControllerState's revise() method. It is read
+       // and written atomically or with the world stopped.
        //
-       // Read and written atomically or with the world stopped.
+       // For goexperiment.PacerRedesign: This value is fixed at the
+       // start of a GC cycle, so during a GC cycle it is safe to
+       // read without atomics, and it represents the maximum scannable
+       // heap.
        heapScan uint64
 
+       // lastHeapScan is the number of bytes of heap that were scanned
+       // last GC cycle. It is the same as heapMarked, but only
+       // includes the "scannable" parts of objects.
+       //
+       // Updated when the world is stopped.
+       lastHeapScan uint64
+
        // stackScan is a snapshot of scannableStackSize taken at each GC
        // STW pause and is used in pacing decisions.
        //
@@ -179,6 +225,12 @@ type gcControllerState struct {
        // scannableStackSize is the amount of allocated goroutine stack space in
        // use by goroutines.
        //
+       // This number tracks allocated goroutine stack space rather than used
+       // goroutine stack space (i.e. what is actually scanned) because used
+       // goroutine stack space is much harder to measure cheaply. By using
+       // allocated space, we make an overestimate; this is OK, it's better
+       // to conservatively overcount than undercount.
+       //
        // Read and updated atomically.
        scannableStackSize uint64
 
@@ -194,16 +246,26 @@ type gcControllerState struct {
        // next mark termination.
        heapMarked uint64
 
-       // scanWork is the total scan work performed this cycle. This
-       // is updated atomically during the cycle. Updates occur in
-       // bounded batches, since it is both written and read
-       // throughout the cycle. At the end of the cycle, this is how
+       // heapScanWork is the total heap scan work performed this cycle.
+       // stackScanWork is the total stack scan work performed this cycle.
+       // globalsScanWork is the total globals scan work performed this cycle.
+       //
+       // These are updated atomically during the cycle. Updates occur in
+       // bounded batches, since they are both written and read
+       // throughout the cycle. At the end of the cycle, heapScanWork is how
        // much of the retained heap is scannable.
        //
-       // Currently this is the bytes of heap scanned. For most uses,
-       // this is an opaque unit of work, but for estimation the
-       // definition is important.
-       scanWork int64
+       // Currently these are measured in bytes. For most uses, this is an
+       // opaque unit of work, but for estimation the definition is important.
+       //
+       // Note that stackScanWork includes all allocated space, not just the
+       // size of the stack itself, mirroring stackSize.
+       //
+       // For !goexperiment.PacerRedesign, stackScanWork and globalsScanWork
+       // are always zero.
+       heapScanWork    atomic.Int64
+       stackScanWork   atomic.Int64
+       globalsScanWork atomic.Int64
 
        // bgScanCredit is the scan work credit accumulated by the
        // concurrent background scan. This credit is accumulated by
@@ -278,13 +340,39 @@ type gcControllerState struct {
 func (c *gcControllerState) init(gcPercent int32) {
        c.heapMinimum = defaultHeapMinimum
 
-       // Set a reasonable initial GC trigger.
-       c.triggerRatio = 7 / 8.0
+       if goexperiment.PacerRedesign {
+               c.consMarkController = piController{
+                       // Tuned first via the Ziegler-Nichols process in simulation,
+                       // then the integral time was manually tuned against real-world
+                       // applications to deal with noisiness in the measured cons/mark
+                       // ratio.
+                       kp: 0.9,
+                       ti: 4.0,
+
+                       // An update is done once per GC cycle.
+                       period: 1,
+
+                       // Set a high reset time in GC cycles.
+                       // This is inversely proportional to the rate at which we
+                       // accumulate error from clipping. By making this very high
+                       // we make the accumulation slow. In general, clipping is
+                       // OK in our situation, hence the choice.
+                       //
+                       // Tune this if we get unintended effects from clipping for
+                       // a long time.
+                       tt:  1000,
+                       min: -1000,
+                       max: 1000,
+               }
+       } else {
+               // Set a reasonable initial GC trigger.
+               c.triggerRatio = 7 / 8.0
 
-       // Fake a heapMarked value so it looks like a trigger at
-       // heapMinimum is the appropriate growth from heapMarked.
-       // This will go into computing the initial GC goal.
-       c.heapMarked = uint64(float64(c.heapMinimum) / (1 + c.triggerRatio))
+               // Fake a heapMarked value so it looks like a trigger at
+               // heapMinimum is the appropriate growth from heapMarked.
+               // This will go into computing the initial GC goal.
+               c.heapMarked = uint64(float64(c.heapMinimum) / (1 + c.triggerRatio))
+       }
 
        // This will also compute and set the GC trigger and goal.
        c.setGCPercent(gcPercent)
@@ -294,7 +382,9 @@ func (c *gcControllerState) init(gcPercent int32) {
 // for a new GC cycle. The caller must hold worldsema and the world
 // must be stopped.
 func (c *gcControllerState) startCycle(markStartTime int64, procs int) {
-       c.scanWork = 0
+       c.heapScanWork.Store(0)
+       c.stackScanWork.Store(0)
+       c.globalsScanWork.Store(0)
        c.bgScanCredit = 0
        c.assistTime = 0
        c.dedicatedMarkTime = 0
@@ -310,8 +400,14 @@ func (c *gcControllerState) startCycle(markStartTime int64, procs int) {
        // GOGC. Assist is proportional to this distance, so enforce a
        // minimum distance, even if it means going over the GOGC goal
        // by a tiny bit.
-       if c.heapGoal < c.heapLive+1024*1024 {
-               c.heapGoal = c.heapLive + 1024*1024
+       if goexperiment.PacerRedesign {
+               if c.heapGoal < c.heapLive+64<<10 {
+                       c.heapGoal = c.heapLive + 64<<10
+               }
+       } else {
+               if c.heapGoal < c.heapLive+1<<20 {
+                       c.heapGoal = c.heapLive + 1<<20
+               }
        }
 
        // Compute the background mark utilization goal. In general,
@@ -394,32 +490,79 @@ func (c *gcControllerState) revise() {
        }
        live := atomic.Load64(&c.heapLive)
        scan := atomic.Load64(&c.heapScan)
-       work := atomic.Loadint64(&c.scanWork)
+       work := c.heapScanWork.Load() + c.stackScanWork.Load() + c.globalsScanWork.Load()
 
        // Assume we're under the soft goal. Pace GC to complete at
        // heapGoal assuming the heap is in steady-state.
        heapGoal := int64(atomic.Load64(&c.heapGoal))
 
-       // Compute the expected scan work remaining.
-       //
-       // This is estimated based on the expected
-       // steady-state scannable heap. For example, with
-       // GOGC=100, only half of the scannable heap is
-       // expected to be live, so that's what we target.
-       //
-       // (This is a float calculation to avoid overflowing on
-       // 100*heapScan.)
-       scanWorkExpected := int64(float64(scan) * 100 / float64(100+gcPercent))
-
-       if int64(live) > heapGoal || work > scanWorkExpected {
-               // We're past the soft goal, or we've already done more scan
-               // work than we expected. Pace GC so that in the worst case it
-               // will complete by the hard goal.
-               const maxOvershoot = 1.1
-               heapGoal = int64(float64(heapGoal) * maxOvershoot)
-
-               // Compute the upper bound on the scan work remaining.
-               scanWorkExpected = int64(scan)
+       var scanWorkExpected int64
+       if goexperiment.PacerRedesign {
+               // The expected scan work is computed as the amount of bytes scanned last
+               // GC cycle, plus our estimate of stacks and globals work for this cycle.
+               scanWorkExpected = int64(c.lastHeapScan + c.stackScan + c.globalsScan)
+
+               // maxScanWork is a worst-case estimate of the amount of scan work that
+               // needs to be performed in this GC cycle. Specifically, it represents
+               // the case where *all* scannable memory turns out to be live.
+               maxScanWork := int64(scan + c.stackScan + c.globalsScan)
+               if work > scanWorkExpected {
+                       // We've already done more scan work than expected. Because our expectation
+                       // is based on a steady-state scannable heap size, we assume this means our
+                       // heap is growing. Compute a new heap goal that takes our existing runway
+                       // computed for scanWorkExpected and extrapolates it to maxScanWork, the worst-case
+                       // scan work. This keeps our assist ratio stable if the heap continues to grow.
+                       //
+                       // The effect of this mechanism is that assists stay flat in the face of heap
+                       // growths. It's OK to use more memory this cycle to scan all the live heap,
+                       // because the next GC cycle is inevitably going to use *at least* that much
+                       // memory anyway.
+                       heapGoal = int64(float64(heapGoal-int64(c.trigger))/float64(scanWorkExpected)*float64(maxScanWork)) + int64(c.trigger)
+                       scanWorkExpected = maxScanWork
+
+                       // hardGoal is a hard limit on the amount that we're willing to push back the
+                       // heap goal, and that's twice the heap goal (i.e. if GOGC=100 and the heap and/or
+                       // stacks and/or globals grow to twice their size, this limits the current GC cycle's
+                       // growth to 4x the original live heap's size).
+                       //
+                       // This maintains the invariant that we use no more memory than the next GC cycle
+                       // will anyway.
+                       hardGoal := int64((1.0 + float64(gcPercent)/100.0) * float64(heapGoal))
+                       if heapGoal > hardGoal {
+                               heapGoal = hardGoal
+                       }
+               }
+               if int64(live) > heapGoal {
+                       // We're already past our heap goal, even the extrapolated one.
+                       // Leave ourselves some extra runway, so in the worst case we
+                       // finish by that point.
+                       const maxOvershoot = 1.1
+                       heapGoal = int64(float64(heapGoal) * maxOvershoot)
+
+                       // Compute the upper bound on the scan work remaining.
+                       scanWorkExpected = maxScanWork
+               }
+       } else {
+               // Compute the expected scan work remaining.
+               //
+               // This is estimated based on the expected
+               // steady-state scannable heap. For example, with
+               // GOGC=100, only half of the scannable heap is
+               // expected to be live, so that's what we target.
+               //
+               // (This is a float calculation to avoid overflowing on
+               // 100*heapScan.)
+               scanWorkExpected = int64(float64(scan) * 100 / float64(100+gcPercent))
+               if int64(live) > heapGoal || work > scanWorkExpected {
+                       // We're past the soft goal, or we've already done more scan
+                       // work than we expected. Pace GC so that in the worst case it
+                       // will complete by the hard goal.
+                       const maxOvershoot = 1.1
+                       heapGoal = int64(float64(heapGoal) * maxOvershoot)
+
+                       // Compute the upper bound on the scan work remaining.
+                       scanWorkExpected = int64(scan)
+               }
        }
 
        // Compute the remaining scan work estimate.
@@ -464,7 +607,9 @@ func (c *gcControllerState) revise() {
        c.assistBytesPerWork.Store(assistBytesPerWork)
 }
 
-// endCycle computes the trigger ratio for the next cycle.
+// endCycle computes the trigger ratio (!goexperiment.PacerRedesign)
+// or the consMark estimate (goexperiment.PacerRedesign) for the next cycle.
+// Returns the trigger ratio if application, or 0 (goexperiment.PacerRedesign).
 // userForced indicates whether the current GC cycle was forced
 // by the application.
 func (c *gcControllerState) endCycle(now int64, procs int, userForced bool) float64 {
@@ -472,6 +617,81 @@ func (c *gcControllerState) endCycle(now int64, procs int, userForced bool) floa
        // We'll be updating the heap goal soon.
        gcController.lastHeapGoal = gcController.heapGoal
 
+       // Compute the duration of time for which assists were turned on.
+       assistDuration := now - c.markStartTime
+
+       // Assume background mark hit its utilization goal.
+       utilization := gcBackgroundUtilization
+       // Add assist utilization; avoid divide by zero.
+       if assistDuration > 0 {
+               utilization += float64(c.assistTime) / float64(assistDuration*int64(procs))
+       }
+
+       if goexperiment.PacerRedesign {
+               if c.heapLive <= c.trigger {
+                       // Shouldn't happen, but let's be very safe about this in case the
+                       // GC is somehow extremely short.
+                       //
+                       // In this case though, the only reasonable value for c.heapLive-c.trigger
+                       // would be 0, which isn't really all that useful, i.e. the GC was so short
+                       // that it didn't matter.
+                       //
+                       // Ignore this case and don't update anything.
+                       return 0
+               }
+               idleUtilization := 0.0
+               if assistDuration > 0 {
+                       idleUtilization = float64(c.idleMarkTime) / float64(assistDuration*int64(procs))
+               }
+               // Determine the cons/mark ratio.
+               //
+               // The units we want for the numerator and denominator are both B / cpu-ns.
+               // We get this by taking the bytes allocated or scanned, and divide by the amount of
+               // CPU time it took for those operations. For allocations, that CPU time is
+               //
+               //    assistDuration * procs * (1 - utilization)
+               //
+               // Where utilization includes just background GC workers and assists. It does *not*
+               // include idle GC work time, because in theory the mutator is free to take that at
+               // any point.
+               //
+               // For scanning, that CPU time is
+               //
+               //    assistDuration * procs * (utilization + idleUtilization)
+               //
+               // In this case, we *include* idle utilization, because that is additional CPU time that the
+               // the GC had available to it.
+               //
+               // In effect, idle GC time is sort of double-counted here, but it's very weird compared
+               // to other kinds of GC work, because of how fluid it is. Namely, because the mutator is
+               // *always* free to take it.
+               //
+               // So this calculation is really:
+               //     (heapLive-trigger) / (assistDuration * procs * (1-utilization)) /
+               //         (scanWork) / (assistDuration * procs * (utilization+idleUtilization)
+               //
+               // Note that because we only care about the ratio, assistDuration and procs cancel out.
+               scanWork := c.heapScanWork.Load() + c.stackScanWork.Load() + c.globalsScanWork.Load()
+               currentConsMark := (float64(c.heapLive-c.trigger) * (utilization + idleUtilization)) /
+                       (float64(scanWork) * (1 - utilization))
+
+               // Update cons/mark controller.
+               oldConsMark := c.consMark
+               c.consMark = c.consMarkController.next(c.consMark, currentConsMark)
+
+               if debug.gcpacertrace > 0 {
+                       printlock()
+                       print("pacer: ", int(utilization*100), "% CPU (", int(gcGoalUtilization*100), " exp.) for ")
+                       print(c.heapScanWork.Load(), "+", c.stackScanWork.Load(), "+", c.globalsScanWork.Load(), " B work (", c.lastHeapScan+c.stackScan+c.globalsScan, " B exp.) ")
+                       print("in ", c.trigger, " B -> ", c.heapLive, " B (∆goal ", int64(c.heapLive)-int64(c.heapGoal), ", cons/mark ", oldConsMark, ")")
+                       println()
+                       printunlock()
+               }
+               return 0
+       }
+
+       // !goexperiment.PacerRedesign below.
+
        if userForced {
                // Forced GC means this cycle didn't start at the
                // trigger, so where it finished isn't good
@@ -498,15 +718,6 @@ func (c *gcControllerState) endCycle(now int64, procs int, userForced bool) floa
        // heap growth is the error.
        goalGrowthRatio := c.effectiveGrowthRatio()
        actualGrowthRatio := float64(c.heapLive)/float64(c.heapMarked) - 1
-       assistDuration := now - c.markStartTime
-
-       // Assume background mark hit its utilization goal.
-       utilization := gcBackgroundUtilization
-       // Add assist utilization; avoid divide by zero.
-       if assistDuration > 0 {
-               utilization += float64(c.assistTime) / float64(assistDuration*int64(procs))
-       }
-
        triggerError := goalGrowthRatio - c.triggerRatio - utilization/gcGoalUtilization*(actualGrowthRatio-c.triggerRatio)
 
        // Finally, we adjust the trigger for next time by this error,
@@ -525,7 +736,7 @@ func (c *gcControllerState) endCycle(now int64, procs int, userForced bool) floa
                H_g := int64(float64(H_m_prev) * (1 + h_g))
                u_a := utilization
                u_g := gcGoalUtilization
-               W_a := c.scanWork
+               W_a := c.heapScanWork.Load()
                print("pacer: H_m_prev=", H_m_prev,
                        " h_t=", h_t, " H_T=", H_T,
                        " h_a=", h_a, " H_a=", H_a,
@@ -669,7 +880,8 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
 func (c *gcControllerState) resetLive(bytesMarked uint64) {
        c.heapMarked = bytesMarked
        c.heapLive = bytesMarked
-       c.heapScan = uint64(c.scanWork)
+       c.heapScan = uint64(c.heapScanWork.Load())
+       c.lastHeapScan = uint64(c.heapScanWork.Load())
 
        // heapLive was updated, so emit a trace event.
        if trace.enabled {
@@ -703,8 +915,12 @@ func (c *gcControllerState) update(dHeapLive, dHeapScan int64) {
                        traceHeapAlloc()
                }
        }
-       if dHeapScan != 0 {
-               atomic.Xadd64(&gcController.heapScan, dHeapScan)
+       // Only update heapScan in the new pacer redesign if we're not
+       // currently in a GC.
+       if !goexperiment.PacerRedesign || gcBlackenEnabled == 0 {
+               if dHeapScan != 0 {
+                       atomic.Xadd64(&gcController.heapScan, dHeapScan)
+               }
        }
        if gcBlackenEnabled != 0 {
                // gcController.heapLive and heapScan changed.
@@ -728,9 +944,10 @@ func (c *gcControllerState) addGlobals(amount int64) {
        atomic.Xadd64(&c.globalsScan, amount)
 }
 
-// commit sets the trigger ratio and updates everything
-// derived from it: the absolute trigger, the heap goal, mark pacing,
-// and sweep pacing.
+// commit recomputes all pacing parameters from scratch, namely
+// absolute trigger, the heap goal, mark pacing, and sweep pacing.
+//
+// If goexperiment.PacerRedesign is true, triggerRatio is ignored.
 //
 // This can be called any time. If GC is the in the middle of a
 // concurrent phase, it will adjust the pacing of that phase.
@@ -744,6 +961,130 @@ func (c *gcControllerState) commit(triggerRatio float64) {
                assertWorldStoppedOrLockHeld(&mheap_.lock)
        }
 
+       if !goexperiment.PacerRedesign {
+               c.oldCommit(triggerRatio)
+               return
+       }
+
+       // Compute the next GC goal, which is when the allocated heap
+       // has grown by GOGC/100 over where it started the last cycle,
+       // plus additional runway for non-heap sources of GC work.
+       goal := ^uint64(0)
+       if c.gcPercent >= 0 {
+               goal = c.heapMarked + (c.heapMarked+atomic.Load64(&c.stackScan)+atomic.Load64(&c.globalsScan))*uint64(c.gcPercent)/100
+       }
+
+       // Don't trigger below the minimum heap size.
+       minTrigger := c.heapMinimum
+       if !isSweepDone() {
+               // Concurrent sweep happens in the heap growth
+               // from gcController.heapLive to trigger, so ensure
+               // that concurrent sweep has some heap growth
+               // in which to perform sweeping before we
+               // start the next GC cycle.
+               sweepMin := atomic.Load64(&c.heapLive) + sweepMinHeapDistance
+               if sweepMin > minTrigger {
+                       minTrigger = sweepMin
+               }
+       }
+
+       // If we let the trigger go too low, then if the application
+       // is allocating very rapidly we might end up in a situation
+       // where we're allocating black during a nearly always-on GC.
+       // The result of this is a growing heap and ultimately an
+       // increase in RSS. By capping us at a point >0, we're essentially
+       // saying that we're OK using more CPU during the GC to prevent
+       // this growth in RSS.
+       //
+       // The current constant was chosen empirically: given a sufficiently
+       // fast/scalable allocator with 48 Ps that could drive the trigger ratio
+       // to <0.05, this constant causes applications to retain the same peak
+       // RSS compared to not having this allocator.
+       if triggerBound := uint64(0.7*float64(goal-c.heapMarked)) + c.heapMarked; minTrigger < triggerBound {
+               minTrigger = triggerBound
+       }
+
+       // For small heaps, set the max trigger point at 95% of the heap goal.
+       // This ensures we always have *some* headroom when the GC actually starts.
+       // For larger heaps, set the max trigger point at the goal, minus the
+       // minimum heap size.
+       // This choice follows from the fact that the minimum heap size is chosen
+       // to reflect the costs of a GC with no work to do. With a large heap but
+       // very little scan work to perform, this gives us exactly as much runway
+       // as we would need, in the worst case.
+       maxRunway := uint64(0.95 * float64(goal-c.heapMarked))
+       if largeHeapMaxRunway := goal - c.heapMinimum; goal > c.heapMinimum && maxRunway < largeHeapMaxRunway {
+               maxRunway = largeHeapMaxRunway
+       }
+       maxTrigger := maxRunway + c.heapMarked
+       if maxTrigger < minTrigger {
+               maxTrigger = minTrigger
+       }
+
+       // Compute the trigger by using our estimate of the cons/mark ratio.
+       //
+       // The idea is to take our expected scan work, and multiply it by
+       // the cons/mark ratio to determine how long it'll take to complete
+       // that scan work in terms of bytes allocated. This gives us our GC's
+       // runway.
+       //
+       // However, the cons/mark ratio is a ratio of rates per CPU-second, but
+       // here we care about the relative rates for some division of CPU
+       // resources among the mutator and the GC.
+       //
+       // To summarize, we have B / cpu-ns, and we want B / ns. We get that
+       // by multiplying by our desired division of CPU resources. We choose
+       // to express CPU resources as GOMAPROCS*fraction. Note that because
+       // we're working with a ratio here, we can omit the number of CPU cores,
+       // because they'll appear in the numerator and denominator and cancel out.
+       // As a result, this is basically just "weighing" the cons/mark ratio by
+       // our desired division of resources.
+       //
+       // Furthermore, by setting the trigger so that CPU resources are divided
+       // this way, assuming that the cons/mark ratio is correct, we make that
+       // division a reality.
+       var trigger uint64
+       runway := uint64((c.consMark * (1 - gcGoalUtilization) / (gcGoalUtilization)) * float64(c.lastHeapScan+c.stackScan+c.globalsScan))
+       if runway > goal {
+               trigger = minTrigger
+       } else {
+               trigger = goal - runway
+       }
+       if trigger < minTrigger {
+               trigger = minTrigger
+       }
+       if trigger > maxTrigger {
+               trigger = maxTrigger
+       }
+       if trigger > goal {
+               goal = trigger
+       }
+
+       // Commit to the trigger and goal.
+       c.trigger = trigger
+       atomic.Store64(&c.heapGoal, goal)
+       if trace.enabled {
+               traceHeapGoal()
+       }
+
+       // Update mark pacing.
+       if gcphase != _GCoff {
+               c.revise()
+       }
+}
+
+// oldCommit sets the trigger ratio and updates everything
+// derived from it: the absolute trigger, the heap goal, mark pacing,
+// and sweep pacing.
+//
+// This can be called any time. If GC is the in the middle of a
+// concurrent phase, it will adjust the pacing of that phase.
+//
+// This depends on gcPercent, gcController.heapMarked, and
+// gcController.heapLive. These must be up to date.
+//
+// For !goexperiment.PacerRedesign.
+func (c *gcControllerState) oldCommit(triggerRatio float64) {
        // Compute the next GC goal, which is when the allocated heap
        // has grown by GOGC/100 over the heap marked by the last
        // cycle.
@@ -913,3 +1254,38 @@ func readGOGC() int32 {
        }
        return 100
 }
+
+type piController struct {
+       kp float64 // Proportional constant.
+       ti float64 // Integral time constant.
+       tt float64 // Reset time in GC cyles.
+
+       // Period in GC cycles between updates.
+       period float64
+
+       min, max float64 // Output boundaries.
+
+       // PI controller state.
+
+       errIntegral float64 // Integral of the error from t=0 to now.
+}
+
+func (c *piController) next(input, setpoint float64) float64 {
+       // Compute the raw output value.
+       prop := c.kp * (setpoint - input)
+       rawOutput := prop + c.errIntegral
+
+       // Clamp rawOutput into output.
+       output := rawOutput
+       if output < c.min {
+               output = c.min
+       } else if output > c.max {
+               output = c.max
+       }
+
+       // Update the controller's state.
+       if c.ti != 0 && c.tt != 0 {
+               c.errIntegral += (c.kp*c.period/c.ti)*(setpoint-input) + (c.period/c.tt)*(output-rawOutput)
+       }
+       return output
+}
index 5a9f46c6d17ae9e1787c2ac56a72f28706d60d07..d2707ca5a1301caf4e9aa42f68590e3e1c7bdaa0 100644 (file)
@@ -6,6 +6,7 @@ package runtime_test
 
 import (
        "fmt"
+       "internal/goexperiment"
        "math"
        "math/rand"
        . "runtime"
@@ -34,12 +35,76 @@ func TestGcPacer(t *testing.T) {
                        checker: func(t *testing.T, c []gcCycleResult) {
                                n := len(c)
                                if n >= 25 {
+                                       if goexperiment.PacerRedesign {
+                                               // For the pacer redesign, assert something even stronger: at this alloc/scan rate,
+                                               // it should be extremely close to the goal utilization.
+                                               assertInEpsilon(t, "GC utilization", c[n-1].gcUtilization, GCGoalUtilization, 0.005)
+                                       }
+
                                        // Make sure the pacer settles into a non-degenerate state in at least 25 GC cycles.
                                        assertInEpsilon(t, "GC utilization", c[n-1].gcUtilization, c[n-2].gcUtilization, 0.005)
                                        assertInRange(t, "goal ratio", c[n-1].goalRatio(), 0.95, 1.05)
                                }
                        },
                },
+               {
+                       // Same as the steady-state case, but lots of stacks to scan relative to the heap size.
+                       name:          "SteadyBigStacks",
+                       gcPercent:     100,
+                       globalsBytes:  32 << 10,
+                       nCores:        8,
+                       allocRate:     constant(132.0),
+                       scanRate:      constant(1024.0),
+                       growthRate:    constant(2.0).sum(ramp(-1.0, 12)),
+                       scannableFrac: constant(1.0),
+                       stackBytes:    constant(2048).sum(ramp(128<<20, 8)),
+                       length:        50,
+                       checker: func(t *testing.T, c []gcCycleResult) {
+                               // Check the same conditions as the steady-state case, except the old pacer can't
+                               // really handle this well, so don't check the goal ratio for it.
+                               n := len(c)
+                               if n >= 25 {
+                                       if goexperiment.PacerRedesign {
+                                               // For the pacer redesign, assert something even stronger: at this alloc/scan rate,
+                                               // it should be extremely close to the goal utilization.
+                                               assertInEpsilon(t, "GC utilization", c[n-1].gcUtilization, GCGoalUtilization, 0.005)
+                                               assertInRange(t, "goal ratio", c[n-1].goalRatio(), 0.95, 1.05)
+                                       }
+
+                                       // Make sure the pacer settles into a non-degenerate state in at least 25 GC cycles.
+                                       assertInEpsilon(t, "GC utilization", c[n-1].gcUtilization, c[n-2].gcUtilization, 0.005)
+                               }
+                       },
+               },
+               {
+                       // Same as the steady-state case, but lots of globals to scan relative to the heap size.
+                       name:          "SteadyBigGlobals",
+                       gcPercent:     100,
+                       globalsBytes:  128 << 20,
+                       nCores:        8,
+                       allocRate:     constant(132.0),
+                       scanRate:      constant(1024.0),
+                       growthRate:    constant(2.0).sum(ramp(-1.0, 12)),
+                       scannableFrac: constant(1.0),
+                       stackBytes:    constant(8192),
+                       length:        50,
+                       checker: func(t *testing.T, c []gcCycleResult) {
+                               // Check the same conditions as the steady-state case, except the old pacer can't
+                               // really handle this well, so don't check the goal ratio for it.
+                               n := len(c)
+                               if n >= 25 {
+                                       if goexperiment.PacerRedesign {
+                                               // For the pacer redesign, assert something even stronger: at this alloc/scan rate,
+                                               // it should be extremely close to the goal utilization.
+                                               assertInEpsilon(t, "GC utilization", c[n-1].gcUtilization, GCGoalUtilization, 0.005)
+                                               assertInRange(t, "goal ratio", c[n-1].goalRatio(), 0.95, 1.05)
+                                       }
+
+                                       // Make sure the pacer settles into a non-degenerate state in at least 25 GC cycles.
+                                       assertInEpsilon(t, "GC utilization", c[n-1].gcUtilization, c[n-2].gcUtilization, 0.005)
+                               }
+                       },
+               },
                {
                        // This tests the GC pacer's response to a small change in allocation rate.
                        name:          "StepAlloc",
@@ -106,6 +171,47 @@ func TestGcPacer(t *testing.T) {
                                }
                        },
                },
+               {
+                       // Tests the pacer for a high GOGC value with a large heap growth happening
+                       // in the middle. The purpose of the large heap growth is to check if GC
+                       // utilization ends up sensitive
+                       name:          "HighGOGC",
+                       gcPercent:     1500,
+                       globalsBytes:  32 << 10,
+                       nCores:        8,
+                       allocRate:     random(7, 0x53).offset(165),
+                       scanRate:      constant(1024.0),
+                       growthRate:    constant(2.0).sum(ramp(-1.0, 12), random(0.01, 0x1), unit(14).delay(25)),
+                       scannableFrac: constant(1.0),
+                       stackBytes:    constant(8192),
+                       length:        50,
+                       checker: func(t *testing.T, c []gcCycleResult) {
+                               n := len(c)
+                               if goexperiment.PacerRedesign && n > 12 {
+                                       if n == 26 {
+                                               // In the 26th cycle there's a heap growth. Overshoot is expected to maintain
+                                               // a stable utilization, but we should *never* overshoot more than GOGC of
+                                               // the next cycle.
+                                               assertInRange(t, "goal ratio", c[n-1].goalRatio(), 0.90, 15)
+                                       } else {
+                                               // Give a wider goal range here. With such a high GOGC value we're going to be
+                                               // forced to undershoot.
+                                               //
+                                               // TODO(mknyszek): Instead of placing a 0.95 limit on the trigger, make the limit
+                                               // based on absolute bytes, that's based somewhat in how the minimum heap size
+                                               // is determined.
+                                               assertInRange(t, "goal ratio", c[n-1].goalRatio(), 0.90, 1.05)
+                                       }
+
+                                       // Ensure utilization remains stable despite a growth in live heap size
+                                       // at GC #25. This test fails prior to the GC pacer redesign.
+                                       //
+                                       // Because GOGC is so large, we should also be really close to the goal utilization.
+                                       assertInEpsilon(t, "GC utilization", c[n-1].gcUtilization, GCGoalUtilization, GCGoalUtilization+0.03)
+                                       assertInEpsilon(t, "GC utilization", c[n-1].gcUtilization, c[n-2].gcUtilization, 0.03)
+                               }
+                       },
+               },
                {
                        // This test makes sure that in the face of a varying (in this case, oscillating) allocation
                        // rate, the pacer does a reasonably good job of staying abreast of the changes.
@@ -126,7 +232,12 @@ func TestGcPacer(t *testing.T) {
                                        // 1. Utilization isn't varying _too_ much, and
                                        // 2. The pacer is mostly keeping up with the goal.
                                        assertInRange(t, "goal ratio", c[n-1].goalRatio(), 0.95, 1.05)
-                                       assertInRange(t, "GC utilization", c[n-1].gcUtilization, 0.25, 0.4)
+                                       if goexperiment.PacerRedesign {
+                                               assertInRange(t, "GC utilization", c[n-1].gcUtilization, 0.25, 0.3)
+                                       } else {
+                                               // The old pacer is messier here, and needs a lot more tolerance.
+                                               assertInRange(t, "GC utilization", c[n-1].gcUtilization, 0.25, 0.4)
+                                       }
                                }
                        },
                },
@@ -149,7 +260,12 @@ func TestGcPacer(t *testing.T) {
                                        // 1. Utilization isn't varying _too_ much, and
                                        // 2. The pacer is mostly keeping up with the goal.
                                        assertInRange(t, "goal ratio", c[n-1].goalRatio(), 0.95, 1.05)
-                                       assertInRange(t, "GC utilization", c[n-1].gcUtilization, 0.25, 0.4)
+                                       if goexperiment.PacerRedesign {
+                                               assertInRange(t, "GC utilization", c[n-1].gcUtilization, 0.25, 0.3)
+                                       } else {
+                                               // The old pacer is messier here, and needs a lot more tolerance.
+                                               assertInRange(t, "GC utilization", c[n-1].gcUtilization, 0.25, 0.4)
+                                       }
                                }
                        },
                },
@@ -177,7 +293,12 @@ func TestGcPacer(t *testing.T) {
                                        // Unlike the other tests, GC utilization here will vary more and tend higher.
                                        // Just make sure it's not going too crazy.
                                        assertInEpsilon(t, "GC utilization", c[n-1].gcUtilization, c[n-2].gcUtilization, 0.05)
-                                       assertInEpsilon(t, "GC utilization", c[n-1].gcUtilization, c[11].gcUtilization, 0.07)
+                                       if goexperiment.PacerRedesign {
+                                               assertInEpsilon(t, "GC utilization", c[n-1].gcUtilization, c[11].gcUtilization, 0.05)
+                                       } else {
+                                               // The old pacer is messier here, and needs a little more tolerance.
+                                               assertInEpsilon(t, "GC utilization", c[n-1].gcUtilization, c[11].gcUtilization, 0.07)
+                                       }
                                }
                        },
                },
index 8787d93d873508b36ebd597b417b1e08dec91b67..9c3f7fd223a943716dbab8549f1d638696fe6d74 100644 (file)
@@ -77,9 +77,10 @@ type gcWork struct {
        // into work.bytesMarked by dispose.
        bytesMarked uint64
 
-       // Scan work performed on this gcWork. This is aggregated into
+       // Heap scan work performed on this gcWork. This is aggregated into
        // gcController by dispose and may also be flushed by callers.
-       scanWork int64
+       // Other types of scan work are flushed immediately.
+       heapScanWork int64
 
        // flushedWork indicates that a non-empty work buffer was
        // flushed to the global work list since the last gcMarkDone
@@ -274,9 +275,9 @@ func (w *gcWork) dispose() {
                atomic.Xadd64(&work.bytesMarked, int64(w.bytesMarked))
                w.bytesMarked = 0
        }
-       if w.scanWork != 0 {
-               atomic.Xaddint64(&gcController.scanWork, w.scanWork)
-               w.scanWork = 0
+       if w.heapScanWork != 0 {
+               gcController.heapScanWork.Add(w.heapScanWork)
+               w.heapScanWork = 0
        }
 }