runtime: refactor runtime->tracer API to appear more like a lock

[gostls13.git] / src / runtime / mgc.go
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go

index 1b057070aaf20c6b2267478b5571443e0ad7e593..30d2f1d3852b4430c6ac2039e5a0f3d1dcf43ba3 100644 (file)
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -135,9 +135,12 @@ import (
  )
  
  const (
-       _DebugGC         = 0
-       _ConcurrentSweep = true
-       _FinBlockSize    = 4 * 1024
+       _DebugGC      = 0
+       _FinBlockSize = 4 * 1024
+
+       // concurrentSweep is a debug flag. Disabling this flag
+       // ensures all spans are swept while the world is stopped.
+       concurrentSweep = true
  
         // debugScanConservative enables debug logging for stack
         // frames that are scanned conservatively.
@@ -149,6 +152,28 @@ const (
         sweepMinHeapDistance = 1024 * 1024
  )
  
+// heapObjectsCanMove always returns false in the current garbage collector.
+// It exists for go4.org/unsafe/assume-no-moving-gc, which is an
+// unfortunate idea that had an even more unfortunate implementation.
+// Every time a new Go release happened, the package stopped building,
+// and the authors had to add a new file with a new //go:build line, and
+// then the entire ecosystem of packages with that as a dependency had to
+// explicitly update to the new version. Many packages depend on
+// assume-no-moving-gc transitively, through paths like
+// inet.af/netaddr -> go4.org/intern -> assume-no-moving-gc.
+// This was causing a significant amount of friction around each new
+// release, so we added this bool for the package to //go:linkname
+// instead. The bool is still unfortunate, but it's not as bad as
+// breaking the ecosystem on every new release.
+//
+// If the Go garbage collector ever does move heap objects, we can set
+// this to true to break all the programs using assume-no-moving-gc.
+//
+//go:linkname heapObjectsCanMove
+func heapObjectsCanMove() bool {
+       return false
+}
+
  func gcinit() {
         if unsafe.Sizeof(workbuf{}) != _WorkbufSize {
                 throw("size of Workbuf is suboptimal")
@@ -193,8 +218,6 @@ var gcphase uint32
  var writeBarrier struct {
         enabled bool    // compiler emits a check of this before calling write barrier
         pad     [3]byte // compiler uses 32-bit load for "enabled" field
-       needed  bool    // whether we need a write barrier for current GC phase
-       cgo     bool    // whether we need a write barrier for a cgo check
         alignme uint64  // guarantee alignment so that compiler can use a 32 or 64-bit load
  }
  
@@ -212,8 +235,7 @@ const (
  //go:nosplit
  func setGCPhase(x uint32) {
         atomic.Store(&gcphase, x)
-       writeBarrier.needed = gcphase == _GCmark || gcphase == _GCmarktermination
-       writeBarrier.enabled = writeBarrier.needed || writeBarrier.cgo
+       writeBarrier.enabled = gcphase == _GCmark || gcphase == _GCmarktermination
  }
  
  // gcMarkWorkerMode represents the mode that a concurrent mark worker
@@ -283,8 +305,9 @@ var work workType
  
  type workType struct {
         full  lfstack          // lock-free list of full blocks workbuf
+       _     cpu.CacheLinePad // prevents false-sharing between full and empty
         empty lfstack          // lock-free list of empty blocks workbuf
-       pad0  cpu.CacheLinePad // prevents false-sharing between full/empty and nproc/nwait
+       _     cpu.CacheLinePad // prevents false-sharing between empty and nproc/nwait
  
         wbufSpans struct {
                 lock mutex
@@ -451,7 +474,6 @@ func GC() {
         // as part of tests and benchmarks to get the system into a
         // relatively stable and isolated state.
         for work.cycles.Load() == n+1 && sweepone() != ^uintptr(0) {
-               sweep.nbgsweep++
                 Gosched()
         }
  
@@ -501,7 +523,7 @@ func gcWaitOnMark(n uint32) {
                 // Wait until sweep termination, mark, and mark
                 // termination of cycle N complete.
                 work.sweepWaiters.list.push(getg())
-               goparkunlock(&work.sweepWaiters.lock, waitReasonWaitForGCCycle, traceEvGoBlock, 1)
+               goparkunlock(&work.sweepWaiters.lock, waitReasonWaitForGCCycle, traceBlockUntilGCEnds, 1)
         }
  }
  
@@ -550,10 +572,6 @@ func (t gcTrigger) test() bool {
         }
         switch t.kind {
         case gcTriggerHeap:
-               // Non-atomic access to gcController.heapLive for performance. If
-               // we are going to trigger on this, this thread just
-               // atomically wrote gcController.heapLive anyway and we'll see our
-               // own write.
                 trigger, _ := gcController.trigger()
                 return gcController.heapLive.Load() >= trigger
         case gcTriggerTime:
@@ -599,7 +617,6 @@ func gcStart(trigger gcTrigger) {
         // We check the transition condition continuously here in case
         // this G gets delayed in to the next GC cycle.
         for trigger.test() && sweepone() != ^uintptr(0) {
-               sweep.nbgsweep++
         }
  
         // Perform GC initialization and the sweep termination
@@ -630,8 +647,10 @@ func gcStart(trigger gcTrigger) {
         // Update it under gcsema to avoid gctrace getting wrong values.
         work.userForced = trigger.kind == gcTriggerCycle
  
-       if trace.enabled {
-               traceGCStart()
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.GCStart()
+               traceRelease(trace)
         }
  
         // Check that all Ps have finished deferred mcache flushes.
@@ -659,16 +678,13 @@ func gcStart(trigger gcTrigger) {
         now := nanotime()
         work.tSweepTerm = now
         work.pauseStart = now
-       if trace.enabled {
-               traceGCSTWStart(1)
-       }
-       systemstack(stopTheWorldWithSema)
+       systemstack(func() { stopTheWorldWithSema(stwGCSweepTerm) })
         // Finish sweep before we start concurrent scan.
         systemstack(func() {
                 finishsweep_m()
         })
  
-       // clearpools before we start the GC. If we wait they memory will not be
+       // clearpools before we start the GC. If we wait the memory will not be
         // reclaimed until the next GC cycle.
         clearpools()
  
@@ -700,11 +716,11 @@ func gcStart(trigger gcTrigger) {
         // enabled because they must be enabled before
         // any non-leaf heap objects are marked. Since
         // allocations are blocked until assists can
-       // happen, we want enable assists as early as
+       // happen, we want to enable assists as early as
         // possible.
         setGCPhase(_GCmark)
  
-       gcBgMarkPrepare() // Must happen before assist enable.
+       gcBgMarkPrepare() // Must happen before assists are enabled.
         gcMarkRootPrepare()
  
         // Mark all active tinyalloc blocks. Since we're
@@ -727,11 +743,15 @@ func gcStart(trigger gcTrigger) {
  
         // Concurrent mark.
         systemstack(func() {
-               now = startTheWorldWithSema(trace.enabled)
+               now = startTheWorldWithSema()
                 work.pauseNS += now - work.pauseStart
                 work.tMark = now
                 memstats.gcPauseDist.record(now - work.pauseStart)
  
+               sweepTermCpu := int64(work.stwprocs) * (work.tMark - work.tSweepTerm)
+               work.cpuStats.gcPauseTime += sweepTermCpu
+               work.cpuStats.gcTotalTime += sweepTermCpu
+
                 // Release the CPU limiter.
                 gcCPULimiter.finishGCTransition(now)
         })
@@ -849,10 +869,7 @@ top:
         work.tMarkTerm = now
         work.pauseStart = now
         getg().m.preemptoff = "gcing"
-       if trace.enabled {
-               traceGCSTWStart(0)
-       }
-       systemstack(stopTheWorldWithSema)
+       systemstack(func() { stopTheWorldWithSema(stwGCMarkTerm) })
         // The gcphase is _GCmark, it will transition to _GCmarktermination
         // below. The important thing is that the wb remains active until
         // all marking is complete. This includes writes made by the GC.
@@ -879,7 +896,7 @@ top:
         if restart {
                 getg().m.preemptoff = ""
                 systemstack(func() {
-                       now := startTheWorldWithSema(trace.enabled)
+                       now := startTheWorldWithSema()
                         work.pauseNS += now - work.pauseStart
                         memstats.gcPauseDist.record(now - work.pauseStart)
                 })
@@ -949,6 +966,7 @@ func gcMarkTermination() {
                 // before continuing.
         })
  
+       var stwSwept bool
         systemstack(func() {
                 work.heap2 = work.bytesMarked
                 if debug.gccheckmark > 0 {
@@ -967,14 +985,16 @@ func gcMarkTermination() {
  
                 // marking is complete so we can turn the write barrier off
                 setGCPhase(_GCoff)
-               gcSweep(work.mode)
+               stwSwept = gcSweep(work.mode)
         })
  
         mp.traceback = 0
         casgstatus(curgp, _Gwaiting, _Grunning)
  
-       if trace.enabled {
-               traceGCDone()
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.GCDone()
+               traceRelease(trace)
         }
  
         // all done
@@ -1004,43 +1024,14 @@ func gcMarkTermination() {
         memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
         memstats.pause_total_ns += uint64(work.pauseNS)
  
-       sweepTermCpu := int64(work.stwprocs) * (work.tMark - work.tSweepTerm)
-       // We report idle marking time below, but omit it from the
-       // overall utilization here since it's "free".
-       markAssistCpu := gcController.assistTime.Load()
-       markDedicatedCpu := gcController.dedicatedMarkTime.Load()
-       markFractionalCpu := gcController.fractionalMarkTime.Load()
-       markIdleCpu := gcController.idleMarkTime.Load()
         markTermCpu := int64(work.stwprocs) * (work.tEnd - work.tMarkTerm)
-       scavAssistCpu := scavenge.assistTime.Load()
-       scavBgCpu := scavenge.backgroundTime.Load()
-
-       // Update cumulative GC CPU stats.
-       work.cpuStats.gcAssistTime += markAssistCpu
-       work.cpuStats.gcDedicatedTime += markDedicatedCpu + markFractionalCpu
-       work.cpuStats.gcIdleTime += markIdleCpu
-       work.cpuStats.gcPauseTime += sweepTermCpu + markTermCpu
-       work.cpuStats.gcTotalTime += sweepTermCpu + markAssistCpu + markDedicatedCpu + markFractionalCpu + markIdleCpu + markTermCpu
-
-       // Update cumulative scavenge CPU stats.
-       work.cpuStats.scavengeAssistTime += scavAssistCpu
-       work.cpuStats.scavengeBgTime += scavBgCpu
-       work.cpuStats.scavengeTotalTime += scavAssistCpu + scavBgCpu
-
-       // Update total CPU.
-       work.cpuStats.totalTime = sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
-       work.cpuStats.idleTime += sched.idleTime.Load()
-
-       // Compute userTime. We compute this indirectly as everything that's not the above.
+       work.cpuStats.gcPauseTime += markTermCpu
+       work.cpuStats.gcTotalTime += markTermCpu
+
+       // Accumulate CPU stats.
         //
-       // Since time spent in _Pgcstop is covered by gcPauseTime, and time spent in _Pidle
-       // is covered by idleTime, what we're left with is time spent in _Prunning and _Psyscall,
-       // the latter of which is fine because the P will either go idle or get used for something
-       // else via sysmon. Meanwhile if we subtract GC time from whatever's left, we get non-GC
-       // _Prunning time. Note that this still leaves time spent in sweeping and in the scheduler,
-       // but that's fine. The overwhelming majority of this time will be actual user time.
-       work.cpuStats.userTime = work.cpuStats.totalTime - (work.cpuStats.gcTotalTime +
-               work.cpuStats.scavengeTotalTime + work.cpuStats.idleTime)
+       // Pass gcMarkPhase=true so we can get all the latest GC CPU stats in there too.
+       work.cpuStats.accumulate(now, true)
  
         // Compute overall GC CPU utilization.
         // Omit idle marking time from the overall utilization here since it's "free".
@@ -1056,10 +1047,6 @@ func gcMarkTermination() {
         // Reset idle time stat.
         sched.idleTime.Store(0)
  
-       // Reset sweep state.
-       sweep.nbgsweep = 0
-       sweep.npausesweep = 0
-
         if work.userForced {
                 memstats.numforcedgc++
         }
@@ -1070,6 +1057,12 @@ func gcMarkTermination() {
         injectglist(&work.sweepWaiters.list)
         unlock(&work.sweepWaiters.lock)
  
+       // Increment the scavenge generation now.
+       //
+       // This moment represents peak heap in use because we're
+       // about to start sweeping.
+       mheap_.pages.scav.index.nextGen()
+
         // Release the CPU limiter.
         gcCPULimiter.finishGCTransition(now)
  
@@ -1082,12 +1075,22 @@ func gcMarkTermination() {
         // Those aren't tracked in any sweep lists, so we need to
         // count them against sweep completion until we ensure all
         // those spans have been forced out.
+       //
+       // If gcSweep fully swept the heap (for example if the sweep
+       // is not concurrent due to a GODEBUG setting), then we expect
+       // the sweepLocker to be invalid, since sweeping is done.
+       //
+       // N.B. Below we might duplicate some work from gcSweep; this is
+       // fine as all that work is idempotent within a GC cycle, and
+       // we're still holding worldsema so a new cycle can't start.
         sl := sweep.active.begin()
-       if !sl.valid {
+       if !stwSwept && !sl.valid {
                 throw("failed to set sweep barrier")
+       } else if stwSwept && sl.valid {
+               throw("non-concurrent sweep failed to drain all sweep queues")
         }
  
-       systemstack(func() { startTheWorldWithSema(trace.enabled) })
+       systemstack(func() { startTheWorldWithSema() })
  
         // Flush the heap profile so we can start a new cycle next GC.
         // This is relatively expensive, so we don't do it with the
@@ -1105,14 +1108,36 @@ func gcMarkTermination() {
         // mcache before allocating, but idle Ps may not. Since this
         // is necessary to sweep all spans, we need to ensure all
         // mcaches are flushed before we start the next GC cycle.
+       //
+       // While we're here, flush the page cache for idle Ps to avoid
+       // having pages get stuck on them. These pages are hidden from
+       // the scavenger, so in small idle heaps a significant amount
+       // of additional memory might be held onto.
+       //
+       // Also, flush the pinner cache, to avoid leaking that memory
+       // indefinitely.
         systemstack(func() {
                 forEachP(func(pp *p) {
                         pp.mcache.prepareForSweep()
+                       if pp.status == _Pidle {
+                               systemstack(func() {
+                                       lock(&mheap_.lock)
+                                       pp.pcache.flush(&mheap_.pages)
+                                       unlock(&mheap_.lock)
+                               })
+                       }
+                       pp.pinnerCache = nil
                 })
         })
-       // Now that we've swept stale spans in mcaches, they don't
-       // count against unswept spans.
-       sweep.active.end(sl)
+       if sl.valid {
+               // Now that we've swept stale spans in mcaches, they don't
+               // count against unswept spans.
+               //
+               // Note: this sweepLocker may not be valid if sweeping had
+               // already completed during the STW. See the corresponding
+               // begin() call that produced sl.
+               sweep.active.end(sl)
+       }
  
         // Print gctrace before dropping worldsema. As soon as we drop
         // worldsema another cycle could start and smash the stats
@@ -1135,7 +1160,7 @@ func gcMarkTermination() {
                 }
                 print(" ms clock, ")
                 for i, ns := range []int64{
-                       sweepTermCpu,
+                       int64(work.stwprocs) * (work.tMark - work.tSweepTerm),
                         gcController.assistTime.Load(),
                         gcController.dedicatedMarkTime.Load() + gcController.fractionalMarkTime.Load(),
                         gcController.idleMarkTime.Load(),
@@ -1171,6 +1196,11 @@ func gcMarkTermination() {
                 lc.mspan.setUserArenaChunkToFault()
         }
  
+       // Enable huge pages on some metadata if we cross a heap threshold.
+       if gcController.heapGoal() > minHeapForMetadataHugePages {
+               mheap_.enableMetadataHugePages()
+       }
+
         semrelease(&worldsema)
         semrelease(&gcsema)
         // Careful: another GC cycle may start now.
@@ -1299,7 +1329,7 @@ func gcBgMarkWorker() {
                         // Note that at this point, the G may immediately be
                         // rescheduled and may be running.
                         return true
-               }, unsafe.Pointer(node), waitReasonGCWorkerIdle, traceEvGoBlock, 0)
+               }, unsafe.Pointer(node), waitReasonGCWorkerIdle, traceBlockSystemGoroutine, 0)
  
                 // Preemption must not occur here, or another G might see
                 // p.gcMarkWorkerMode.
@@ -1345,7 +1375,7 @@ func gcBgMarkWorker() {
                         default:
                                 throw("gcBgMarkWorker: unexpected gcMarkWorkerMode")
                         case gcMarkWorkerDedicatedMode:
-                               gcDrain(&pp.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
+                               gcDrainMarkWorkerDedicated(&pp.gcw, true)
                                 if gp.preempt {
                                         // We were preempted. This is
                                         // a useful signal to kick
@@ -1360,11 +1390,11 @@ func gcBgMarkWorker() {
                                 }
                                 // Go back to draining, this time
                                 // without preemption.
-                               gcDrain(&pp.gcw, gcDrainFlushBgCredit)
+                               gcDrainMarkWorkerDedicated(&pp.gcw, false)
                         case gcMarkWorkerFractionalMode:
-                               gcDrain(&pp.gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit)
+                               gcDrainMarkWorkerFractional(&pp.gcw)
                         case gcMarkWorkerIdleMode:
-                               gcDrain(&pp.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit)
+                               gcDrainMarkWorkerIdle(&pp.gcw)
                         }
                         casgstatus(gp, _Gwaiting, _Grunning)
                 })
@@ -1449,9 +1479,6 @@ func gcMark(startTime int64) {
                 // Gs, so only do it if checkmark is also enabled.
                 gcMarkRootCheck()
         }
-       if work.full != 0 {
-               throw("work.full != 0")
-       }
  
         // Drop allg snapshot. allgs may have grown, in which case
         // this is the only reference to the old backing store and
@@ -1523,10 +1550,12 @@ func gcMark(startTime int64) {
  // gcSweep must be called on the system stack because it acquires the heap
  // lock. See mheap for details.
  //
+// Returns true if the heap was fully swept by this function.
+//
  // The world must be stopped.
  //
  //go:systemstack
-func gcSweep(mode gcMode) {
+func gcSweep(mode gcMode) bool {
         assertWorldStopped()
  
         if gcphase != _GCoff {
@@ -1544,15 +1573,18 @@ func gcSweep(mode gcMode) {
  
         sweep.centralIndex.clear()
  
-       if !_ConcurrentSweep || mode == gcForceBlockMode {
+       if !concurrentSweep || mode == gcForceBlockMode {
                 // Special case synchronous sweep.
                 // Record that no proportional sweeping has to happen.
                 lock(&mheap_.lock)
                 mheap_.sweepPagesPerByte = 0
                 unlock(&mheap_.lock)
+               // Flush all mcaches.
+               for _, pp := range allp {
+                       pp.mcache.prepareForSweep()
+               }
                 // Sweep all spans eagerly.
                 for sweepone() != ^uintptr(0) {
-                       sweep.npausesweep++
                 }
                 // Free workbufs eagerly.
                 prepareFreeWorkbufs()
@@ -1563,7 +1595,7 @@ func gcSweep(mode gcMode) {
                 // available immediately.
                 mProf_NextCycle()
                 mProf_Flush()
-               return
+               return true
         }
  
         // Background sweep.
@@ -1573,6 +1605,7 @@ func gcSweep(mode gcMode) {
                 ready(sweep.g, 0, true)
         }
         unlock(&sweep.lock)
+       return false
  }
  
  // gcResetMarkState resets global state prior to marking (concurrent