runtime: make it harder to introduce deadlocks with forEachP

[gostls13.git] / src / runtime / mgc.go
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go

index c239fa0f636509bc32bb86e1c66665d78048be05..d015d6dbabaa8035f28b5debad991c9da3840195 100644 (file)
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -113,7 +113,7 @@
  // Next GC is after we've allocated an extra amount of memory proportional to
  // the amount already in use. The proportion is controlled by GOGC environment variable
  // (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
-// (this mark is tracked in gcController.heapGoal variable). This keeps the GC cost in
+// (this mark is computed by the gcController.heapGoal method). This keeps the GC cost in
  // linear proportion to the allocation cost. Adjusting GOGC just changes the linear constant
  // (and also the amount of extra memory used).
  
@@ -135,9 +135,12 @@ import (
  )
  
  const (
-       _DebugGC         = 0
-       _ConcurrentSweep = true
-       _FinBlockSize    = 4 * 1024
+       _DebugGC      = 0
+       _FinBlockSize = 4 * 1024
+
+       // concurrentSweep is a debug flag. Disabling this flag
+       // ensures all spans are swept while the world is stopped.
+       concurrentSweep = true
  
         // debugScanConservative enables debug logging for stack
         // frames that are scanned conservatively.
@@ -149,16 +152,39 @@ const (
         sweepMinHeapDistance = 1024 * 1024
  )
  
+// heapObjectsCanMove always returns false in the current garbage collector.
+// It exists for go4.org/unsafe/assume-no-moving-gc, which is an
+// unfortunate idea that had an even more unfortunate implementation.
+// Every time a new Go release happened, the package stopped building,
+// and the authors had to add a new file with a new //go:build line, and
+// then the entire ecosystem of packages with that as a dependency had to
+// explicitly update to the new version. Many packages depend on
+// assume-no-moving-gc transitively, through paths like
+// inet.af/netaddr -> go4.org/intern -> assume-no-moving-gc.
+// This was causing a significant amount of friction around each new
+// release, so we added this bool for the package to //go:linkname
+// instead. The bool is still unfortunate, but it's not as bad as
+// breaking the ecosystem on every new release.
+//
+// If the Go garbage collector ever does move heap objects, we can set
+// this to true to break all the programs using assume-no-moving-gc.
+//
+//go:linkname heapObjectsCanMove
+func heapObjectsCanMove() bool {
+       return false
+}
+
  func gcinit() {
         if unsafe.Sizeof(workbuf{}) != _WorkbufSize {
                 throw("size of Workbuf is suboptimal")
         }
         // No sweep on the first cycle.
-       mheap_.sweepDrained = 1
+       sweep.active.state.Store(sweepDrainedMask)
  
         // Initialize GC pacer state.
         // Use the environment variable GOGC for the initial gcPercent value.
-       gcController.init(readGOGC())
+       // Use the environment variable GOMEMLIMIT for the initial memoryLimit value.
+       gcController.init(readGOGC(), readGOMEMLIMIT())
  
         work.startSema = 1
         work.markDoneSema = 1
@@ -192,8 +218,6 @@ var gcphase uint32
  var writeBarrier struct {
         enabled bool    // compiler emits a check of this before calling write barrier
         pad     [3]byte // compiler uses 32-bit load for "enabled" field
-       needed  bool    // whether we need a write barrier for current GC phase
-       cgo     bool    // whether we need a write barrier for a cgo check
         alignme uint64  // guarantee alignment so that compiler can use a 32 or 64-bit load
  }
  
@@ -211,8 +235,7 @@ const (
  //go:nosplit
  func setGCPhase(x uint32) {
         atomic.Store(&gcphase, x)
-       writeBarrier.needed = gcphase == _GCmark || gcphase == _GCmarktermination
-       writeBarrier.enabled = writeBarrier.needed || writeBarrier.cgo
+       writeBarrier.enabled = gcphase == _GCmark || gcphase == _GCmarktermination
  }
  
  // gcMarkWorkerMode represents the mode that a concurrent mark worker
@@ -278,10 +301,13 @@ func pollFractionalWorkerExit() bool {
         return float64(selfTime)/float64(delta) > 1.2*gcController.fractionalUtilizationGoal
  }
  
-var work struct {
+var work workType
+
+type workType struct {
         full  lfstack          // lock-free list of full blocks workbuf
+       _     cpu.CacheLinePad // prevents false-sharing between full and empty
         empty lfstack          // lock-free list of empty blocks workbuf
-       pad0  cpu.CacheLinePad // prevents false-sharing between full/empty and nproc/nwait
+       _     cpu.CacheLinePad // prevents false-sharing between empty and nproc/nwait
  
         wbufSpans struct {
                 lock mutex
@@ -320,11 +346,20 @@ var work struct {
         nwait  uint32
  
         // Number of roots of various root types. Set by gcMarkRootPrepare.
+       //
+       // nStackRoots == len(stackRoots), but we have nStackRoots for
+       // consistency.
         nDataRoots, nBSSRoots, nSpanRoots, nStackRoots int
  
         // Base indexes of each root type. Set by gcMarkRootPrepare.
         baseData, baseBSS, baseSpans, baseStacks, baseEnd uint32
  
+       // stackRoots is a snapshot of all of the Gs that existed
+       // before the beginning of concurrent marking. The backing
+       // store of this must not be modified because it might be
+       // shared with allgs.
+       stackRoots []*g
+
         // Each type of GC state transition is protected by a lock.
         // Since multiple threads can simultaneously detect the state
         // transition condition, any thread that detects a transition
@@ -354,10 +389,6 @@ var work struct {
         // explicit user call.
         userForced bool
  
-       // totaltime is the CPU nanoseconds spent in GC since the
-       // program started if debug.gctrace > 0.
-       totaltime int64
-
         // initialHeapLive is the value of gcController.heapLive at the
         // beginning of this GC cycle.
         initialHeapLive uint64
@@ -381,7 +412,7 @@ var work struct {
         // cycle is sweep termination, mark, mark termination, and
         // sweep. This differs from memstats.numgc, which is
         // incremented at mark termination.
-       cycles uint32
+       cycles atomic.Uint32
  
         // Timing/utilization stats for this cycle.
         stwprocs, maxprocs                 int32
@@ -391,7 +422,10 @@ var work struct {
         pauseStart int64 // nanotime() of last STW
  
         // debug.gctrace heap sizes for this cycle.
-       heap0, heap1, heap2, heapGoal uint64
+       heap0, heap1, heap2 uint64
+
+       // Cumulative estimated CPU usage.
+       cpuStats
  }
  
  // GC runs a garbage collection and blocks the caller until the
@@ -424,7 +458,7 @@ func GC() {
  
         // Wait until the current sweep termination, mark, and mark
         // termination complete.
-       n := atomic.Load(&work.cycles)
+       n := work.cycles.Load()
         gcWaitOnMark(n)
  
         // We're now in sweep N or later. Trigger GC cycle N+1, which
@@ -439,8 +473,7 @@ func GC() {
         // complete the cycle and because runtime.GC() is often used
         // as part of tests and benchmarks to get the system into a
         // relatively stable and isolated state.
-       for atomic.Load(&work.cycles) == n+1 && sweepone() != ^uintptr(0) {
-               sweep.nbgsweep++
+       for work.cycles.Load() == n+1 && sweepone() != ^uintptr(0) {
                 Gosched()
         }
  
@@ -455,7 +488,7 @@ func GC() {
         // First, wait for sweeping to finish. (We know there are no
         // more spans on the sweep queue, but we may be concurrently
         // sweeping spans, so we have to wait.)
-       for atomic.Load(&work.cycles) == n+1 && !isSweepDone() {
+       for work.cycles.Load() == n+1 && !isSweepDone() {
                 Gosched()
         }
  
@@ -463,7 +496,7 @@ func GC() {
         // stable heap profile. Only do this if we haven't already hit
         // another mark termination.
         mp := acquirem()
-       cycle := atomic.Load(&work.cycles)
+       cycle := work.cycles.Load()
         if cycle == n+1 || (gcphase == _GCmark && cycle == n+2) {
                 mProf_PostSweep()
         }
@@ -476,7 +509,7 @@ func gcWaitOnMark(n uint32) {
         for {
                 // Disable phase transitions.
                 lock(&work.sweepWaiters.lock)
-               nMarks := atomic.Load(&work.cycles)
+               nMarks := work.cycles.Load()
                 if gcphase != _GCmark {
                         // We've already completed this cycle's mark.
                         nMarks++
@@ -490,7 +523,7 @@ func gcWaitOnMark(n uint32) {
                 // Wait until sweep termination, mark, and mark
                 // termination of cycle N complete.
                 work.sweepWaiters.list.push(getg())
-               goparkunlock(&work.sweepWaiters.lock, waitReasonWaitForGCCycle, traceEvGoBlock, 1)
+               goparkunlock(&work.sweepWaiters.lock, waitReasonWaitForGCCycle, traceBlockUntilGCEnds, 1)
         }
  }
  
@@ -534,25 +567,22 @@ const (
  // that the exit condition for the _GCoff phase has been met. The exit
  // condition should be tested when allocating.
  func (t gcTrigger) test() bool {
-       if !memstats.enablegc || panicking != 0 || gcphase != _GCoff {
+       if !memstats.enablegc || panicking.Load() != 0 || gcphase != _GCoff {
                 return false
         }
         switch t.kind {
         case gcTriggerHeap:
-               // Non-atomic access to gcController.heapLive for performance. If
-               // we are going to trigger on this, this thread just
-               // atomically wrote gcController.heapLive anyway and we'll see our
-               // own write.
-               return gcController.heapLive >= gcController.trigger
+               trigger, _ := gcController.trigger()
+               return gcController.heapLive.Load() >= trigger
         case gcTriggerTime:
-               if gcController.gcPercent < 0 {
+               if gcController.gcPercent.Load() < 0 {
                         return false
                 }
                 lastgc := int64(atomic.Load64(&memstats.last_gc_nanotime))
                 return lastgc != 0 && t.now-lastgc > forcegcperiod
         case gcTriggerCycle:
                 // t.n > work.cycles, but accounting for wraparound.
-               return int32(t.n-work.cycles) > 0
+               return int32(t.n-work.cycles.Load()) > 0
         }
         return true
  }
@@ -587,7 +617,6 @@ func gcStart(trigger gcTrigger) {
         // We check the transition condition continuously here in case
         // this G gets delayed in to the next GC cycle.
         for trigger.test() && sweepone() != ^uintptr(0) {
-               sweep.nbgsweep++
         }
  
         // Perform GC initialization and the sweep termination
@@ -599,9 +628,6 @@ func gcStart(trigger gcTrigger) {
                 return
         }
  
-       // For stats, check if this GC was forced by the user.
-       work.userForced = trigger.kind == gcTriggerCycle
-
         // In gcstoptheworld debug mode, upgrade the mode accordingly.
         // We do this after re-checking the transition condition so
         // that multiple goroutines that detect the heap trigger don't
@@ -617,13 +643,19 @@ func gcStart(trigger gcTrigger) {
         semacquire(&gcsema)
         semacquire(&worldsema)
  
-       if trace.enabled {
-               traceGCStart()
+       // For stats, check if this GC was forced by the user.
+       // Update it under gcsema to avoid gctrace getting wrong values.
+       work.userForced = trigger.kind == gcTriggerCycle
+
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.GCStart()
+               traceRelease(trace)
         }
  
         // Check that all Ps have finished deferred mcache flushes.
         for _, p := range allp {
-               if fg := atomic.Load(&p.mcache.flushGen); fg != mheap_.sweepgen {
+               if fg := p.mcache.flushGen.Load(); fg != mheap_.sweepgen {
                         println("runtime: p", p.id, "flushGen", fg, "!= sweepgen", mheap_.sweepgen)
                         throw("p mcache not flushed")
                 }
@@ -639,30 +671,31 @@ func gcStart(trigger gcTrigger) {
                 // so it can't be more than ncpu, even if GOMAXPROCS is.
                 work.stwprocs = ncpu
         }
-       work.heap0 = atomic.Load64(&gcController.heapLive)
+       work.heap0 = gcController.heapLive.Load()
         work.pauseNS = 0
         work.mode = mode
  
         now := nanotime()
         work.tSweepTerm = now
         work.pauseStart = now
-       if trace.enabled {
-               traceGCSTWStart(1)
-       }
-       systemstack(stopTheWorldWithSema)
+       systemstack(func() { stopTheWorldWithSema(stwGCSweepTerm) })
         // Finish sweep before we start concurrent scan.
         systemstack(func() {
                 finishsweep_m()
         })
  
-       // clearpools before we start the GC. If we wait they memory will not be
+       // clearpools before we start the GC. If we wait the memory will not be
         // reclaimed until the next GC cycle.
         clearpools()
  
-       work.cycles++
+       work.cycles.Add(1)
  
-       gcController.startCycle()
-       work.heapGoal = gcController.heapGoal
+       // Assists and workers can start the moment we start
+       // the world.
+       gcController.startCycle(now, int(gomaxprocs), trigger)
+
+       // Notify the CPU limiter that assists may begin.
+       gcCPULimiter.startGCTransition(true, now)
  
         // In STW mode, disable scheduling of user Gs. This may also
         // disable scheduling of this goroutine, so it may block as
@@ -683,11 +716,11 @@ func gcStart(trigger gcTrigger) {
         // enabled because they must be enabled before
         // any non-leaf heap objects are marked. Since
         // allocations are blocked until assists can
-       // happen, we want enable assists as early as
+       // happen, we want to enable assists as early as
         // possible.
         setGCPhase(_GCmark)
  
-       gcBgMarkPrepare() // Must happen before assist enable.
+       gcBgMarkPrepare() // Must happen before assists are enabled.
         gcMarkRootPrepare()
  
         // Mark all active tinyalloc blocks. Since we're
@@ -704,20 +737,23 @@ func gcStart(trigger gcTrigger) {
         // mutators.
         atomic.Store(&gcBlackenEnabled, 1)
  
-       // Assists and workers can start the moment we start
-       // the world.
-       gcController.markStartTime = now
-
         // In STW mode, we could block the instant systemstack
         // returns, so make sure we're not preemptible.
         mp = acquirem()
  
         // Concurrent mark.
         systemstack(func() {
-               now = startTheWorldWithSema(trace.enabled)
+               now = startTheWorldWithSema()
                 work.pauseNS += now - work.pauseStart
                 work.tMark = now
                 memstats.gcPauseDist.record(now - work.pauseStart)
+
+               sweepTermCpu := int64(work.stwprocs) * (work.tMark - work.tSweepTerm)
+               work.cpuStats.gcPauseTime += sweepTermCpu
+               work.cpuStats.gcTotalTime += sweepTermCpu
+
+               // Release the CPU limiter.
+               gcCPULimiter.finishGCTransition(now)
         })
  
         // Release the world sema before Gosched() in STW mode
@@ -753,7 +789,7 @@ var gcMarkDoneFlushed uint32
  // This should be called when all local mark work has been drained and
  // there are no remaining workers. Specifically, when
  //
-//   work.nwait == work.nproc && !gcMarkWorkAvailable(p)
+//     work.nwait == work.nproc && !gcMarkWorkAvailable(p)
  //
  // The calling context must be preemptible.
  //
@@ -788,31 +824,22 @@ top:
  
         // Flush all local buffers and collect flushedWork flags.
         gcMarkDoneFlushed = 0
-       systemstack(func() {
-               gp := getg().m.curg
-               // Mark the user stack as preemptible so that it may be scanned.
-               // Otherwise, our attempt to force all P's to a safepoint could
-               // result in a deadlock as we attempt to preempt a worker that's
-               // trying to preempt us (e.g. for a stack scan).
-               casgstatus(gp, _Grunning, _Gwaiting)
-               forEachP(func(_p_ *p) {
-                       // Flush the write barrier buffer, since this may add
-                       // work to the gcWork.
-                       wbBufFlush1(_p_)
-
-                       // Flush the gcWork, since this may create global work
-                       // and set the flushedWork flag.
-                       //
-                       // TODO(austin): Break up these workbufs to
-                       // better distribute work.
-                       _p_.gcw.dispose()
-                       // Collect the flushedWork flag.
-                       if _p_.gcw.flushedWork {
-                               atomic.Xadd(&gcMarkDoneFlushed, 1)
-                               _p_.gcw.flushedWork = false
-                       }
-               })
-               casgstatus(gp, _Gwaiting, _Grunning)
+       forEachP(waitReasonGCMarkTermination, func(pp *p) {
+               // Flush the write barrier buffer, since this may add
+               // work to the gcWork.
+               wbBufFlush1(pp)
+
+               // Flush the gcWork, since this may create global work
+               // and set the flushedWork flag.
+               //
+               // TODO(austin): Break up these workbufs to
+               // better distribute work.
+               pp.gcw.dispose()
+               // Collect the flushedWork flag.
+               if pp.gcw.flushedWork {
+                       atomic.Xadd(&gcMarkDoneFlushed, 1)
+                       pp.gcw.flushedWork = false
+               }
         })
  
         if gcMarkDoneFlushed != 0 {
@@ -833,10 +860,7 @@ top:
         work.tMarkTerm = now
         work.pauseStart = now
         getg().m.preemptoff = "gcing"
-       if trace.enabled {
-               traceGCSTWStart(0)
-       }
-       systemstack(stopTheWorldWithSema)
+       systemstack(func() { stopTheWorldWithSema(stwGCMarkTerm) })
         // The gcphase is _GCmark, it will transition to _GCmarktermination
         // below. The important thing is that the wb remains active until
         // all marking is complete. This includes writes made by the GC.
@@ -863,7 +887,7 @@ top:
         if restart {
                 getg().m.preemptoff = ""
                 systemstack(func() {
-                       now := startTheWorldWithSema(true)
+                       now := startTheWorldWithSema()
                         work.pauseNS += now - work.pauseStart
                         memstats.gcPauseDist.record(now - work.pauseStart)
                 })
@@ -871,10 +895,15 @@ top:
                 goto top
         }
  
+       gcComputeStartingStackSize()
+
         // Disable assists and background workers. We must do
         // this before waking blocked assists.
         atomic.Store(&gcBlackenEnabled, 0)
  
+       // Notify the CPU limiter that GC assists will now cease.
+       gcCPULimiter.startGCTransition(false, now)
+
         // Wake all blocked assists. These will run when we
         // start the world again.
         gcWakeAllAssists()
@@ -891,28 +920,26 @@ top:
         // endCycle depends on all gcWork cache stats being flushed.
         // The termination algorithm above ensured that they were,
         // except for any allocations since the ragged barrier.
-       nextTriggerRatio := gcController.endCycle(work.userForced)
+       gcController.endCycle(now, int(gomaxprocs), work.userForced)
  
         // Perform mark termination. This will restart the world.
-       gcMarkTermination(nextTriggerRatio)
+       gcMarkTermination()
  }
  
  // World must be stopped and mark assists and background workers must be
  // disabled.
-func gcMarkTermination(nextTriggerRatio float64) {
+func gcMarkTermination() {
         // Start marktermination (write barrier remains enabled for now).
         setGCPhase(_GCmarktermination)
  
-       work.heap1 = gcController.heapLive
+       work.heap1 = gcController.heapLive.Load()
         startTime := nanotime()
  
         mp := acquirem()
         mp.preemptoff = "gcing"
-       _g_ := getg()
-       _g_.m.traceback = 2
-       gp := _g_.m.curg
-       casgstatus(gp, _Grunning, _Gwaiting)
-       gp.waitreason = waitReasonGarbageCollection
+       mp.traceback = 2
+       curgp := mp.curg
+       casGToWaiting(curgp, _Grunning, waitReasonGarbageCollection)
  
         // Run gc on the g0 stack. We do this so that the g stack
         // we're currently running on will no longer change. Cuts
@@ -930,6 +957,7 @@ func gcMarkTermination(nextTriggerRatio float64) {
                 // before continuing.
         })
  
+       var stwSwept bool
         systemstack(func() {
                 work.heap2 = work.bytesMarked
                 if debug.gccheckmark > 0 {
@@ -948,14 +976,16 @@ func gcMarkTermination(nextTriggerRatio float64) {
  
                 // marking is complete so we can turn the write barrier off
                 setGCPhase(_GCoff)
-               gcSweep(work.mode)
+               stwSwept = gcSweep(work.mode)
         })
  
-       _g_.m.traceback = 0
-       casgstatus(gp, _Gwaiting, _Grunning)
+       mp.traceback = 0
+       casgstatus(curgp, _Gwaiting, _Grunning)
  
-       if trace.enabled {
-               traceGCDone()
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.GCDone()
+               traceRelease(trace)
         }
  
         // all done
@@ -965,12 +995,12 @@ func gcMarkTermination(nextTriggerRatio float64) {
                 throw("gc done but gcphase != _GCoff")
         }
  
-       // Record heapGoal and heap_inuse for scavenger.
-       gcController.lastHeapGoal = gcController.heapGoal
-       memstats.last_heap_inuse = memstats.heap_inuse
+       // Record heapInUse for scavenger.
+       memstats.lastHeapInUse = gcController.heapInUse.load()
  
-       // Update GC trigger and pacing for the next cycle.
-       gcController.commit(nextTriggerRatio)
+       // Update GC trigger and pacing, as well as downstream consumers
+       // of this pacing information, for the next cycle.
+       systemstack(gcControllerCommit)
  
         // Update timing memstats
         now := nanotime()
@@ -985,22 +1015,28 @@ func gcMarkTermination(nextTriggerRatio float64) {
         memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
         memstats.pause_total_ns += uint64(work.pauseNS)
  
-       // Update work.totaltime.
-       sweepTermCpu := int64(work.stwprocs) * (work.tMark - work.tSweepTerm)
-       // We report idle marking time below, but omit it from the
-       // overall utilization here since it's "free".
-       markCpu := gcController.assistTime + gcController.dedicatedMarkTime + gcController.fractionalMarkTime
         markTermCpu := int64(work.stwprocs) * (work.tEnd - work.tMarkTerm)
-       cycleCpu := sweepTermCpu + markCpu + markTermCpu
-       work.totaltime += cycleCpu
+       work.cpuStats.gcPauseTime += markTermCpu
+       work.cpuStats.gcTotalTime += markTermCpu
+
+       // Accumulate CPU stats.
+       //
+       // Pass gcMarkPhase=true so we can get all the latest GC CPU stats in there too.
+       work.cpuStats.accumulate(now, true)
  
         // Compute overall GC CPU utilization.
-       totalCpu := sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
-       memstats.gc_cpu_fraction = float64(work.totaltime) / float64(totalCpu)
+       // Omit idle marking time from the overall utilization here since it's "free".
+       memstats.gc_cpu_fraction = float64(work.cpuStats.gcTotalTime-work.cpuStats.gcIdleTime) / float64(work.cpuStats.totalTime)
+
+       // Reset assist time and background time stats.
+       //
+       // Do this now, instead of at the start of the next GC cycle, because
+       // these two may keep accumulating even if the GC is not active.
+       scavenge.assistTime.Store(0)
+       scavenge.backgroundTime.Store(0)
  
-       // Reset sweep state.
-       sweep.nbgsweep = 0
-       sweep.npausesweep = 0
+       // Reset idle time stat.
+       sched.idleTime.Store(0)
  
         if work.userForced {
                 memstats.numforcedgc++
@@ -1012,6 +1048,15 @@ func gcMarkTermination(nextTriggerRatio float64) {
         injectglist(&work.sweepWaiters.list)
         unlock(&work.sweepWaiters.lock)
  
+       // Increment the scavenge generation now.
+       //
+       // This moment represents peak heap in use because we're
+       // about to start sweeping.
+       mheap_.pages.scav.index.nextGen()
+
+       // Release the CPU limiter.
+       gcCPULimiter.finishGCTransition(now)
+
         // Finish the current heap profiling cycle and start a new
         // heap profiling cycle. We do this before starting the world
         // so events don't leak into the wrong cycle.
@@ -1021,10 +1066,22 @@ func gcMarkTermination(nextTriggerRatio float64) {
         // Those aren't tracked in any sweep lists, so we need to
         // count them against sweep completion until we ensure all
         // those spans have been forced out.
-       sl := newSweepLocker()
-       sl.blockCompletion()
+       //
+       // If gcSweep fully swept the heap (for example if the sweep
+       // is not concurrent due to a GODEBUG setting), then we expect
+       // the sweepLocker to be invalid, since sweeping is done.
+       //
+       // N.B. Below we might duplicate some work from gcSweep; this is
+       // fine as all that work is idempotent within a GC cycle, and
+       // we're still holding worldsema so a new cycle can't start.
+       sl := sweep.active.begin()
+       if !stwSwept && !sl.valid {
+               throw("failed to set sweep barrier")
+       } else if stwSwept && sl.valid {
+               throw("non-concurrent sweep failed to drain all sweep queues")
+       }
  
-       systemstack(func() { startTheWorldWithSema(true) })
+       systemstack(func() { startTheWorldWithSema() })
  
         // Flush the heap profile so we can start a new cycle next GC.
         // This is relatively expensive, so we don't do it with the
@@ -1042,14 +1099,34 @@ func gcMarkTermination(nextTriggerRatio float64) {
         // mcache before allocating, but idle Ps may not. Since this
         // is necessary to sweep all spans, we need to ensure all
         // mcaches are flushed before we start the next GC cycle.
-       systemstack(func() {
-               forEachP(func(_p_ *p) {
-                       _p_.mcache.prepareForSweep()
-               })
+       //
+       // While we're here, flush the page cache for idle Ps to avoid
+       // having pages get stuck on them. These pages are hidden from
+       // the scavenger, so in small idle heaps a significant amount
+       // of additional memory might be held onto.
+       //
+       // Also, flush the pinner cache, to avoid leaking that memory
+       // indefinitely.
+       forEachP(waitReasonFlushProcCaches, func(pp *p) {
+               pp.mcache.prepareForSweep()
+               if pp.status == _Pidle {
+                       systemstack(func() {
+                               lock(&mheap_.lock)
+                               pp.pcache.flush(&mheap_.pages)
+                               unlock(&mheap_.lock)
+                       })
+               }
+               pp.pinnerCache = nil
         })
-       // Now that we've swept stale spans in mcaches, they don't
-       // count against unswept spans.
-       sl.dispose()
+       if sl.valid {
+               // Now that we've swept stale spans in mcaches, they don't
+               // count against unswept spans.
+               //
+               // Note: this sweepLocker may not be valid if sweeping had
+               // already completed during the STW. See the corresponding
+               // begin() call that produced sl.
+               sweep.active.end(sl)
+       }
  
         // Print gctrace before dropping worldsema. As soon as we drop
         // worldsema another cycle could start and smash the stats
@@ -1071,7 +1148,13 @@ func gcMarkTermination(nextTriggerRatio float64) {
                         prev = ns
                 }
                 print(" ms clock, ")
-               for i, ns := range []int64{sweepTermCpu, gcController.assistTime, gcController.dedicatedMarkTime + gcController.fractionalMarkTime, gcController.idleMarkTime, markTermCpu} {
+               for i, ns := range []int64{
+                       int64(work.stwprocs) * (work.tMark - work.tSweepTerm),
+                       gcController.assistTime.Load(),
+                       gcController.dedicatedMarkTime.Load() + gcController.fractionalMarkTime.Load(),
+                       gcController.idleMarkTime.Load(),
+                       markTermCpu,
+               } {
                         if i == 2 || i == 3 {
                                 // Separate mark time components with /.
                                 print("/")
@@ -1082,7 +1165,9 @@ func gcMarkTermination(nextTriggerRatio float64) {
                 }
                 print(" ms cpu, ",
                         work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
-                       work.heapGoal>>20, " MB goal, ",
+                       gcController.lastHeapGoal>>20, " MB goal, ",
+                       gcController.lastStackScan.Load()>>20, " MB stacks, ",
+                       gcController.globalsScan.Load()>>20, " MB globals, ",
                         work.maxprocs, " P")
                 if work.userForced {
                         print(" (forced)")
@@ -1091,6 +1176,20 @@ func gcMarkTermination(nextTriggerRatio float64) {
                 printunlock()
         }
  
+       // Set any arena chunks that were deferred to fault.
+       lock(&userArenaState.lock)
+       faultList := userArenaState.fault
+       userArenaState.fault = nil
+       unlock(&userArenaState.lock)
+       for _, lc := range faultList {
+               lc.mspan.setUserArenaChunkToFault()
+       }
+
+       // Enable huge pages on some metadata if we cross a heap threshold.
+       if gcController.heapGoal() > minHeapForMetadataHugePages {
+               mheap_.enableMetadataHugePages()
+       }
+
         semrelease(&worldsema)
         semrelease(&gcsema)
         // Careful: another GC cycle may start now.
@@ -1143,7 +1242,7 @@ func gcBgMarkPrepare() {
         work.nwait = ^uint32(0)
  }
  
-// gcBgMarkWorker is an entry in the gcBgMarkWorkerPool. It points to a single
+// gcBgMarkWorkerNode is an entry in the gcBgMarkWorkerPool. It points to a single
  // gcBgMarkWorker goroutine.
  type gcBgMarkWorkerNode struct {
         // Unused workers are managed in a lock-free stack. This field must be first.
@@ -1219,7 +1318,7 @@ func gcBgMarkWorker() {
                         // Note that at this point, the G may immediately be
                         // rescheduled and may be running.
                         return true
-               }, unsafe.Pointer(node), waitReasonGCWorkerIdle, traceEvGoBlock, 0)
+               }, unsafe.Pointer(node), waitReasonGCWorkerIdle, traceBlockSystemGoroutine, 0)
  
                 // Preemption must not occur here, or another G might see
                 // p.gcMarkWorkerMode.
@@ -1241,6 +1340,10 @@ func gcBgMarkWorker() {
  
                 startTime := nanotime()
                 pp.gcMarkWorkerStartTime = startTime
+               var trackLimiterEvent bool
+               if pp.gcMarkWorkerMode == gcMarkWorkerIdleMode {
+                       trackLimiterEvent = pp.limiterEvent.start(limiterEventIdleMarkWork, startTime)
+               }
  
                 decnwait := atomic.Xadd(&work.nwait, -1)
                 if decnwait == work.nproc {
@@ -1256,12 +1359,12 @@ func gcBgMarkWorker() {
                         // the G stack. However, stack shrinking is
                         // disabled for mark workers, so it is safe to
                         // read from the G stack.
-                       casgstatus(gp, _Grunning, _Gwaiting)
+                       casGToWaiting(gp, _Grunning, waitReasonGCWorkerActive)
                         switch pp.gcMarkWorkerMode {
                         default:
                                 throw("gcBgMarkWorker: unexpected gcMarkWorkerMode")
                         case gcMarkWorkerDedicatedMode:
-                               gcDrain(&pp.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
+                               gcDrainMarkWorkerDedicated(&pp.gcw, true)
                                 if gp.preempt {
                                         // We were preempted. This is
                                         // a useful signal to kick
@@ -1276,26 +1379,24 @@ func gcBgMarkWorker() {
                                 }
                                 // Go back to draining, this time
                                 // without preemption.
-                               gcDrain(&pp.gcw, gcDrainFlushBgCredit)
+                               gcDrainMarkWorkerDedicated(&pp.gcw, false)
                         case gcMarkWorkerFractionalMode:
-                               gcDrain(&pp.gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit)
+                               gcDrainMarkWorkerFractional(&pp.gcw)
                         case gcMarkWorkerIdleMode:
-                               gcDrain(&pp.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit)
+                               gcDrainMarkWorkerIdle(&pp.gcw)
                         }
                         casgstatus(gp, _Gwaiting, _Grunning)
                 })
  
-               // Account for time.
-               duration := nanotime() - startTime
-               switch pp.gcMarkWorkerMode {
-               case gcMarkWorkerDedicatedMode:
-                       atomic.Xaddint64(&gcController.dedicatedMarkTime, duration)
-                       atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1)
-               case gcMarkWorkerFractionalMode:
-                       atomic.Xaddint64(&gcController.fractionalMarkTime, duration)
+               // Account for time and mark us as stopped.
+               now := nanotime()
+               duration := now - startTime
+               gcController.markWorkerStop(pp.gcMarkWorkerMode, duration)
+               if trackLimiterEvent {
+                       pp.limiterEvent.stop(limiterEventIdleMarkWork, now)
+               }
+               if pp.gcMarkWorkerMode == gcMarkWorkerFractionalMode {
                         atomic.Xaddint64(&pp.gcFractionalMarkTime, duration)
-               case gcMarkWorkerIdleMode:
-                       atomic.Xaddint64(&gcController.idleMarkTime, duration)
                 }
  
                 // Was this the last worker and did we run out
@@ -1317,7 +1418,7 @@ func gcBgMarkWorker() {
                 // point, signal the main GC goroutine.
                 if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
                         // We don't need the P-local buffers here, allow
-                       // preemption becuse we may schedule like a regular
+                       // preemption because we may schedule like a regular
                         // goroutine in gcMarkDone (block on locks, etc).
                         releasem(node.m.ptr())
                         node.m.set(nil)
@@ -1367,9 +1468,11 @@ func gcMark(startTime int64) {
                 // Gs, so only do it if checkmark is also enabled.
                 gcMarkRootCheck()
         }
-       if work.full != 0 {
-               throw("work.full != 0")
-       }
+
+       // Drop allg snapshot. allgs may have grown, in which case
+       // this is the only reference to the old backing store and
+       // there's no need to keep it around.
+       work.stackRoots = nil
  
         // Clear out buffers and double-check that all gcWork caches
         // are empty. This should be ensured by gcMarkDone before we
@@ -1415,39 +1518,33 @@ func gcMark(startTime int64) {
                 gcw.dispose()
         }
  
-       // Update the marked heap stat.
-       gcController.heapMarked = work.bytesMarked
-
         // Flush scanAlloc from each mcache since we're about to modify
         // heapScan directly. If we were to flush this later, then scanAlloc
         // might have incorrect information.
+       //
+       // Note that it's not important to retain this information; we know
+       // exactly what heapScan is at this point via scanWork.
         for _, p := range allp {
                 c := p.mcache
                 if c == nil {
                         continue
                 }
-               gcController.heapScan += uint64(c.scanAlloc)
                 c.scanAlloc = 0
         }
  
-       // Update other GC heap size stats. This must happen after
-       // cachestats (which flushes local statistics to these) and
-       // flushallmcaches (which modifies gcController.heapLive).
-       gcController.heapLive = work.bytesMarked
-       gcController.heapScan = uint64(gcController.scanWork)
-
-       if trace.enabled {
-               traceHeapAlloc()
-       }
+       // Reset controller state.
+       gcController.resetLive(work.bytesMarked)
  }
  
  // gcSweep must be called on the system stack because it acquires the heap
  // lock. See mheap for details.
  //
+// Returns true if the heap was fully swept by this function.
+//
  // The world must be stopped.
  //
  //go:systemstack
-func gcSweep(mode gcMode) {
+func gcSweep(mode gcMode) bool {
         assertWorldStopped()
  
         if gcphase != _GCoff {
@@ -1456,24 +1553,27 @@ func gcSweep(mode gcMode) {
  
         lock(&mheap_.lock)
         mheap_.sweepgen += 2
-       mheap_.sweepDrained = 0
-       mheap_.pagesSwept = 0
+       sweep.active.reset()
+       mheap_.pagesSwept.Store(0)
         mheap_.sweepArenas = mheap_.allArenas
-       mheap_.reclaimIndex = 0
-       mheap_.reclaimCredit = 0
+       mheap_.reclaimIndex.Store(0)
+       mheap_.reclaimCredit.Store(0)
         unlock(&mheap_.lock)
  
         sweep.centralIndex.clear()
  
-       if !_ConcurrentSweep || mode == gcForceBlockMode {
+       if !concurrentSweep || mode == gcForceBlockMode {
                 // Special case synchronous sweep.
                 // Record that no proportional sweeping has to happen.
                 lock(&mheap_.lock)
                 mheap_.sweepPagesPerByte = 0
                 unlock(&mheap_.lock)
+               // Flush all mcaches.
+               for _, pp := range allp {
+                       pp.mcache.prepareForSweep()
+               }
                 // Sweep all spans eagerly.
                 for sweepone() != ^uintptr(0) {
-                       sweep.npausesweep++
                 }
                 // Free workbufs eagerly.
                 prepareFreeWorkbufs()
@@ -1484,7 +1584,7 @@ func gcSweep(mode gcMode) {
                 // available immediately.
                 mProf_NextCycle()
                 mProf_Flush()
-               return
+               return true
         }
  
         // Background sweep.
@@ -1494,6 +1594,7 @@ func gcSweep(mode gcMode) {
                 ready(sweep.g, 0, true)
         }
         unlock(&sweep.lock)
+       return false
  }
  
  // gcResetMarkState resets global state prior to marking (concurrent
@@ -1527,24 +1628,35 @@ func gcResetMarkState() {
         }
  
         work.bytesMarked = 0
-       work.initialHeapLive = atomic.Load64(&gcController.heapLive)
+       work.initialHeapLive = gcController.heapLive.Load()
  }
  
  // Hooks for other packages
  
  var poolcleanup func()
+var boringCaches []unsafe.Pointer // for crypto/internal/boring
  
  //go:linkname sync_runtime_registerPoolCleanup sync.runtime_registerPoolCleanup
  func sync_runtime_registerPoolCleanup(f func()) {
         poolcleanup = f
  }
  
+//go:linkname boring_registerCache crypto/internal/boring/bcache.registerCache
+func boring_registerCache(p unsafe.Pointer) {
+       boringCaches = append(boringCaches, p)
+}
+
  func clearpools() {
         // clear sync.Pools
         if poolcleanup != nil {
                 poolcleanup()
         }
  
+       // clear boringcrypto caches
+       for _, p := range boringCaches {
+               atomicstorep(p, nil)
+       }
+
         // Clear central sudog cache.
         // Leave per-P caches alone, they have strictly bounded size.
         // Disconnect cached list before dropping it on the floor,
@@ -1558,19 +1670,17 @@ func clearpools() {
         sched.sudogcache = nil
         unlock(&sched.sudoglock)
  
-       // Clear central defer pools.
+       // Clear central defer pool.
         // Leave per-P pools alone, they have strictly bounded size.
         lock(&sched.deferlock)
-       for i := range sched.deferpool {
-               // disconnect cached list before dropping it on the floor,
-               // so that a dangling ref to one entry does not pin all of them.
-               var d, dlink *_defer
-               for d = sched.deferpool[i]; d != nil; d = dlink {
-                       dlink = d.link
-                       d.link = nil
-               }
-               sched.deferpool[i] = nil
+       // disconnect cached list before dropping it on the floor,
+       // so that a dangling ref to one entry does not pin all of them.
+       var d, dlink *_defer
+       for d = sched.deferpool; d != nil; d = dlink {
+               dlink = d.link
+               d.link = nil
         }
+       sched.deferpool = nil
         unlock(&sched.deferlock)
  }