src/runtime/mgc.go

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Garbage collector (GC).
   6 //
   7 // The GC runs concurrently with mutator threads, is type accurate (aka precise), and allows multiple
   8 // GC threads to run in parallel. It is a concurrent mark and sweep that uses a write barrier. It is
   9 // non-generational and non-compacting. Allocation is done using size segregated per P allocation
  10 // areas to minimize fragmentation while eliminating locks in the common case.
  11 //
  12 // The algorithm decomposes into several steps.
  13 // This is a high level description of the algorithm being used. For an overview of GC a good
  14 // place to start is Richard Jones' gchandbook.org.
  15 //
  16 // The algorithm's intellectual heritage includes Dijkstra's on-the-fly algorithm, see
  17 // Edsger W. Dijkstra, Leslie Lamport, A. J. Martin, C. S. Scholten, and E. F. M. Steffens. 1978.
  18 // On-the-fly garbage collection: an exercise in cooperation. Commun. ACM 21, 11 (November 1978),
  19 // 966-975.
  20 // For journal quality proofs that these steps are complete, correct, and terminate see
  21 // Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world.
  22 // Concurrency and Computation: Practice and Experience 15(3-5), 2003.
  23 //
  24 // 1. GC performs sweep termination.
  25 //
  26 //    a. Stop the world. This causes all Ps to reach a GC safe-point.
  27 //
  28 //    b. Sweep any unswept spans. There will only be unswept spans if
  29 //    this GC cycle was forced before the expected time.
  30 //
  31 // 2. GC performs the mark phase.
  32 //
  33 //    a. Prepare for the mark phase by setting gcphase to _GCmark
  34 //    (from _GCoff), enabling the write barrier, enabling mutator
  35 //    assists, and enqueueing root mark jobs. No objects may be
  36 //    scanned until all Ps have enabled the write barrier, which is
  37 //    accomplished using STW.
  38 //
  39 //    b. Start the world. From this point, GC work is done by mark
  40 //    workers started by the scheduler and by assists performed as
  41 //    part of allocation. The write barrier shades both the
  42 //    overwritten pointer and the new pointer value for any pointer
  43 //    writes (see mbarrier.go for details). Newly allocated objects
  44 //    are immediately marked black.
  45 //
  46 //    c. GC performs root marking jobs. This includes scanning all
  47 //    stacks, shading all globals, and shading any heap pointers in
  48 //    off-heap runtime data structures. Scanning a stack stops a
  49 //    goroutine, shades any pointers found on its stack, and then
  50 //    resumes the goroutine.
  51 //
  52 //    d. GC drains the work queue of grey objects, scanning each grey
  53 //    object to black and shading all pointers found in the object
  54 //    (which in turn may add those pointers to the work queue).
  55 //
  56 //    e. Because GC work is spread across local caches, GC uses a
  57 //    distributed termination algorithm to detect when there are no
  58 //    more root marking jobs or grey objects (see gcMarkDone). At this
  59 //    point, GC transitions to mark termination.
  60 //
  61 // 3. GC performs mark termination.
  62 //
  63 //    a. Stop the world.
  64 //
  65 //    b. Set gcphase to _GCmarktermination, and disable workers and
  66 //    assists.
  67 //
  68 //    c. Perform housekeeping like flushing mcaches.
  69 //
  70 // 4. GC performs the sweep phase.
  71 //
  72 //    a. Prepare for the sweep phase by setting gcphase to _GCoff,
  73 //    setting up sweep state and disabling the write barrier.
  74 //
  75 //    b. Start the world. From this point on, newly allocated objects
  76 //    are white, and allocating sweeps spans before use if necessary.
  77 //
  78 //    c. GC does concurrent sweeping in the background and in response
  79 //    to allocation. See description below.
  80 //
  81 // 5. When sufficient allocation has taken place, replay the sequence
  82 // starting with 1 above. See discussion of GC rate below.
  83
  84 // Concurrent sweep.
  85 //
  86 // The sweep phase proceeds concurrently with normal program execution.
  87 // The heap is swept span-by-span both lazily (when a goroutine needs another span)
  88 // and concurrently in a background goroutine (this helps programs that are not CPU bound).
  89 // At the end of STW mark termination all spans are marked as "needs sweeping".
  90 //
  91 // The background sweeper goroutine simply sweeps spans one-by-one.
  92 //
  93 // To avoid requesting more OS memory while there are unswept spans, when a
  94 // goroutine needs another span, it first attempts to reclaim that much memory
  95 // by sweeping. When a goroutine needs to allocate a new small-object span, it
  96 // sweeps small-object spans for the same object size until it frees at least
  97 // one object. When a goroutine needs to allocate a large-object span from the heap,
  98 // it sweeps spans until it frees at least that many pages into the heap. There is
  99 // one case where this may not suffice: if a goroutine sweeps and frees two
 100 // nonadjacent one-page spans to the heap, it will allocate a new two-page
 101 // span, but there can still be other one-page unswept spans which could be
 102 // combined into a two-page span.
 103 //
 104 // It's critical to ensure that no operations proceed on unswept spans (that would corrupt
 105 // mark bits in GC bitmap). During GC all mcaches are flushed into the central cache,
 106 // so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
 107 // When a goroutine explicitly frees an object or sets a finalizer, it ensures that
 108 // the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
 109 // The finalizer goroutine is kicked off only when all spans are swept.
 110 // When the next GC starts, it sweeps all not-yet-swept spans (if any).
 111
 112 // GC rate.
 113 // Next GC is after we've allocated an extra amount of memory proportional to
 114 // the amount already in use. The proportion is controlled by the GOGC environment variable
 115 // (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
 116 // (this mark is computed by the gcController.heapGoal method). This keeps the GC cost in
 117 // linear proportion to the allocation cost. Adjusting GOGC just changes the linear constant
 118 // (and also the amount of extra memory used).
 119
 120 // Oblets
 121 //
 122 // In order to prevent long pauses while scanning large objects and to
 123 // improve parallelism, the garbage collector breaks up scan jobs for
 124 // objects larger than maxObletBytes into "oblets" of at most
 125 // maxObletBytes. When scanning encounters the beginning of a large
 126 // object, it scans only the first oblet and enqueues the remaining
 127 // oblets as new scan jobs.
 128
 129 package runtime
 130
 131 import (
 132         "internal/cpu"
 133         "runtime/internal/atomic"
 134         "unsafe"
 135 )
 136
 137 const (
 138         _DebugGC      = 0
 139         _FinBlockSize = 4 * 1024
 140
 141         // concurrentSweep is a debug flag. Disabling this flag
 142         // ensures all spans are swept while the world is stopped.
 143         concurrentSweep = true
 144
 145         // debugScanConservative enables debug logging for stack
 146         // frames that are scanned conservatively.
 147         debugScanConservative = false
 148
 149         // sweepMinHeapDistance is a lower bound on the heap distance
 150         // (in bytes) reserved for concurrent sweeping between GC
 151         // cycles.
 152         sweepMinHeapDistance = 1024 * 1024
 153 )
 154
 155 // heapObjectsCanMove always returns false in the current garbage collector.
 156 // It exists for go4.org/unsafe/assume-no-moving-gc, which is an
 157 // unfortunate idea that had an even more unfortunate implementation.
 158 // Every time a new Go release happened, the package stopped building,
 159 // and the authors had to add a new file with a new //go:build line, and
 160 // then the entire ecosystem of packages with that as a dependency had to
 161 // explicitly update to the new version. Many packages depend on
 162 // assume-no-moving-gc transitively, through paths like
 163 // inet.af/netaddr -> go4.org/intern -> assume-no-moving-gc.
 164 // This was causing a significant amount of friction around each new
 165 // release, so we added this bool for the package to //go:linkname
 166 // instead. The bool is still unfortunate, but it's not as bad as
 167 // breaking the ecosystem on every new release.
 168 //
 169 // If the Go garbage collector ever does move heap objects, we can set
 170 // this to true to break all the programs using assume-no-moving-gc.
 171 //
 172 //go:linkname heapObjectsCanMove
 173 func heapObjectsCanMove() bool {
 174         return false
 175 }
 176
 177 func gcinit() {
 178         if unsafe.Sizeof(workbuf{}) != _WorkbufSize {
 179                 throw("size of Workbuf is suboptimal")
 180         }
 181         // No sweep on the first cycle.
 182         sweep.active.state.Store(sweepDrainedMask)
 183
 184         // Initialize GC pacer state.
 185         // Use the environment variable GOGC for the initial gcPercent value.
 186         // Use the environment variable GOMEMLIMIT for the initial memoryLimit value.
 187         gcController.init(readGOGC(), readGOMEMLIMIT())
 188
 189         work.startSema = 1
 190         work.markDoneSema = 1
 191         lockInit(&work.sweepWaiters.lock, lockRankSweepWaiters)
 192         lockInit(&work.assistQueue.lock, lockRankAssistQueue)
 193         lockInit(&work.wbufSpans.lock, lockRankWbufSpans)
 194 }
 195
 196 // gcenable is called after the bulk of the runtime initialization,
 197 // just before we're about to start letting user code run.
 198 // It kicks off the background sweeper goroutine, the background
 199 // scavenger goroutine, and enables GC.
 200 func gcenable() {
 201         // Kick off sweeping and scavenging.
 202         c := make(chan int, 2)
 203         go bgsweep(c)
 204         go bgscavenge(c)
 205         <-c
 206         <-c
 207         memstats.enablegc = true // now that runtime is initialized, GC is okay
 208 }
 209
// gcphase is the current garbage collector phase: one of _GCoff,
// _GCmark, or _GCmarktermination. It indicates to the write barrier
// and to synchronization code which task to perform.
//
// It is written by setGCPhase, which also updates
// writeBarrier.enabled to match.
var gcphase uint32
 213
// writeBarrier holds the flag that says whether the write barrier is
// currently on. setGCPhase sets enabled to true exactly when gcphase
// is _GCmark or _GCmarktermination.
//
// The compiler knows about this variable.
// If you change it, you must change builtin/runtime.go, too.
// If you change the first four bytes, you must also change the write
// barrier insertion code.
var writeBarrier struct {
	enabled bool    // compiler emits a check of this before calling write barrier
	pad     [3]byte // compiler uses 32-bit load for "enabled" field
	alignme uint64  // guarantee alignment so that compiler can use a 32 or 64-bit load
}
 223
// gcBlackenEnabled is 1 if mutator assists and background mark
// workers are allowed to blacken objects. This must only be set when
// gcphase == _GCmark.
//
// gcStart sets it to 1 with an atomic store, after it has switched
// the phase to _GCmark.
var gcBlackenEnabled uint32
 228
// Values of gcphase. setGCPhase enables the write barrier for _GCmark
// and _GCmarktermination and disables it otherwise.
const (
	_GCoff             = iota // GC not running; sweeping in background, write barrier disabled
	_GCmark                   // GC marking roots and workbufs: allocate black, write barrier ENABLED
	_GCmarktermination        // GC mark termination: allocate black, P's help GC, write barrier ENABLED
)
 234
 235 //go:nosplit
 236 func setGCPhase(x uint32) {
 237         atomic.Store(&gcphase, x)
 238         writeBarrier.enabled = gcphase == _GCmark || gcphase == _GCmarktermination
 239 }
 240
 241 // gcMarkWorkerMode represents the mode that a concurrent mark worker
 242 // should operate in.
 243 //
 244 // Concurrent marking happens through four different mechanisms. One
 245 // is mutator assists, which happen in response to allocations and are
 246 // not scheduled. The other three are variations in the per-P mark
 247 // workers and are distinguished by gcMarkWorkerMode.
 248 type gcMarkWorkerMode int
 249
 250 const (
 251         // gcMarkWorkerNotWorker indicates that the next scheduled G is not
 252         // starting work and the mode should be ignored.
 253         gcMarkWorkerNotWorker gcMarkWorkerMode = iota
 254
 255         // gcMarkWorkerDedicatedMode indicates that the P of a mark
 256         // worker is dedicated to running that mark worker. The mark
 257         // worker should run without preemption.
 258         gcMarkWorkerDedicatedMode
 259
 260         // gcMarkWorkerFractionalMode indicates that a P is currently
 261         // running the "fractional" mark worker. The fractional worker
 262         // is necessary when GOMAXPROCS*gcBackgroundUtilization is not
 263         // an integer and using only dedicated workers would result in
 264         // utilization too far from the target of gcBackgroundUtilization.
 265         // The fractional worker should run until it is preempted and
 266         // will be scheduled to pick up the fractional part of
 267         // GOMAXPROCS*gcBackgroundUtilization.
 268         gcMarkWorkerFractionalMode
 269
 270         // gcMarkWorkerIdleMode indicates that a P is running the mark
 271         // worker because it has nothing else to do. The idle worker
 272         // should run until it is preempted and account its time
 273         // against gcController.idleMarkTime.
 274         gcMarkWorkerIdleMode
 275 )
 276
// gcMarkWorkerModeStrings are the string labels of gcMarkWorkerModes
// to use in execution traces.
//
// The entries appear in the same order as the gcMarkWorkerMode
// constants; keep the two in sync when adding a mode.
var gcMarkWorkerModeStrings = [...]string{
	"Not worker",
	"GC (dedicated)",
	"GC (fractional)",
	"GC (idle)",
}
 285
 286 // pollFractionalWorkerExit reports whether a fractional mark worker
 287 // should self-preempt. It assumes it is called from the fractional
 288 // worker.
 289 func pollFractionalWorkerExit() bool {
 290         // This should be kept in sync with the fractional worker
 291         // scheduler logic in findRunnableGCWorker.
 292         now := nanotime()
 293         delta := now - gcController.markStartTime
 294         if delta <= 0 {
 295                 return true
 296         }
 297         p := getg().m.p.ptr()
 298         selfTime := p.gcFractionalMarkTime + (now - p.gcMarkWorkerStartTime)
 299         // Add some slack to the utilization goal so that the
 300         // fractional worker isn't behind again the instant it exits.
 301         return float64(selfTime)/float64(delta) > 1.2*gcController.fractionalUtilizationGoal
 302 }
 303
// work is the global instance of workType. Code in this file reads
// and writes GC state through it (for example work.cycles, work.mode,
// and the transition semaphores).
var work workType

// workType is the type of the global work variable. Its fields cover
// the workbuf lists, marking progress counters, root-marking
// bookkeeping, phase-transition semaphores, waiter queues, and
// per-cycle timing and statistics.
//
// Field order matters: padding and alignment fields are placed
// deliberately (see the comments on the individual fields).
type workType struct {
	full  lfstack          // lock-free list of full workbuf blocks
	_     cpu.CacheLinePad // prevents false-sharing between full and empty
	empty lfstack          // lock-free list of empty workbuf blocks
	_     cpu.CacheLinePad // prevents false-sharing between empty and nproc/nwait

	wbufSpans struct {
		lock mutex
		// free is a list of spans dedicated to workbufs, but
		// that don't currently contain any workbufs.
		free mSpanList
		// busy is a list of all spans containing workbufs on
		// one of the workbuf lists.
		busy mSpanList
	}

	// Restore 64-bit alignment on 32-bit.
	_ uint32

	// bytesMarked is the number of bytes marked this cycle. This
	// includes bytes blackened in scanned objects, noscan objects
	// that go straight to black, and permagrey objects scanned by
	// markroot during the concurrent scan phase. This is updated
	// atomically during the cycle. Updates may be batched
	// arbitrarily, since the value is only read at the end of the
	// cycle.
	//
	// Because of benign races during marking, this number may not
	// be the exact number of marked bytes, but it should be very
	// close.
	//
	// Put this field here because it needs 64-bit atomic access
	// (and thus 8-byte alignment even on 32-bit architectures).
	bytesMarked uint64

	markrootNext uint32 // next markroot job
	markrootJobs uint32 // number of markroot jobs

	nproc  uint32
	tstart int64
	nwait  uint32

	// Number of roots of various root types. Set by gcMarkRootPrepare.
	//
	// nStackRoots == len(stackRoots), but we have nStackRoots for
	// consistency.
	nDataRoots, nBSSRoots, nSpanRoots, nStackRoots int

	// Base indexes of each root type. Set by gcMarkRootPrepare.
	baseData, baseBSS, baseSpans, baseStacks, baseEnd uint32

	// stackRoots is a snapshot of all of the Gs that existed
	// before the beginning of concurrent marking. The backing
	// store of this must not be modified because it might be
	// shared with allgs.
	stackRoots []*g

	// Each type of GC state transition is protected by a lock.
	// Since multiple threads can simultaneously detect the state
	// transition condition, any thread that detects a transition
	// condition must acquire the appropriate transition lock,
	// re-check the transition condition and return if it no
	// longer holds or perform the transition if it does.
	// Likewise, any transition must invalidate the transition
	// condition before releasing the lock. This ensures that each
	// transition is performed by exactly one thread and threads
	// that need the transition to happen block until it has
	// happened.
	//
	// startSema protects the transition from "off" to mark or
	// mark termination.
	startSema uint32
	// markDoneSema protects transitions from mark to mark termination.
	markDoneSema uint32

	// Background mark worker signaling.
	bgMarkReady note   // signal background mark worker has started
	bgMarkDone  uint32 // cas to 1 when at a background mark completion point

	// mode is the concurrency mode of the current GC cycle.
	mode gcMode

	// userForced indicates the current GC cycle was forced by an
	// explicit user call.
	userForced bool

	// initialHeapLive is the value of gcController.heapLive at the
	// beginning of this GC cycle.
	initialHeapLive uint64

	// assistQueue is a queue of assists that are blocked because
	// there was neither enough credit to steal nor enough work to
	// do.
	assistQueue struct {
		lock mutex
		q    gQueue
	}

	// sweepWaiters is a list of blocked goroutines to wake when
	// we transition from mark termination to sweep.
	sweepWaiters struct {
		lock mutex
		list gList
	}

	// cycles is the number of completed GC cycles, where a GC
	// cycle is sweep termination, mark, mark termination, and
	// sweep. This differs from memstats.numgc, which is
	// incremented at mark termination.
	cycles atomic.Uint32

	// Timing/utilization stats for this cycle.
	stwprocs, maxprocs                 int32
	tSweepTerm, tMark, tMarkTerm, tEnd int64 // nanotime() of phase start

	pauseNS int64 // total STW time this cycle

	// debug.gctrace heap sizes for this cycle.
	heap0, heap1, heap2 uint64

	// Cumulative estimated CPU usage.
	cpuStats
}
 429
 430 // GC runs a garbage collection and blocks the caller until the
 431 // garbage collection is complete. It may also block the entire
 432 // program.
 433 func GC() {
 434         // We consider a cycle to be: sweep termination, mark, mark
 435         // termination, and sweep. This function shouldn't return
 436         // until a full cycle has been completed, from beginning to
 437         // end. Hence, we always want to finish up the current cycle
 438         // and start a new one. That means:
 439         //
 440         // 1. In sweep termination, mark, or mark termination of cycle
 441         // N, wait until mark termination N completes and transitions
 442         // to sweep N.
 443         //
 444         // 2. In sweep N, help with sweep N.
 445         //
 446         // At this point we can begin a full cycle N+1.
 447         //
 448         // 3. Trigger cycle N+1 by starting sweep termination N+1.
 449         //
 450         // 4. Wait for mark termination N+1 to complete.
 451         //
 452         // 5. Help with sweep N+1 until it's done.
 453         //
 454         // This all has to be written to deal with the fact that the
 455         // GC may move ahead on its own. For example, when we block
 456         // until mark termination N, we may wake up in cycle N+2.
 457
 458         // Wait until the current sweep termination, mark, and mark
 459         // termination complete.
 460         n := work.cycles.Load()
 461         gcWaitOnMark(n)
 462
 463         // We're now in sweep N or later. Trigger GC cycle N+1, which
 464         // will first finish sweep N if necessary and then enter sweep
 465         // termination N+1.
 466         gcStart(gcTrigger{kind: gcTriggerCycle, n: n + 1})
 467
 468         // Wait for mark termination N+1 to complete.
 469         gcWaitOnMark(n + 1)
 470
 471         // Finish sweep N+1 before returning. We do this both to
 472         // complete the cycle and because runtime.GC() is often used
 473         // as part of tests and benchmarks to get the system into a
 474         // relatively stable and isolated state.
 475         for work.cycles.Load() == n+1 && sweepone() != ^uintptr(0) {
 476                 Gosched()
 477         }
 478
 479         // Callers may assume that the heap profile reflects the
 480         // just-completed cycle when this returns (historically this
 481         // happened because this was a STW GC), but right now the
 482         // profile still reflects mark termination N, not N+1.
 483         //
 484         // As soon as all of the sweep frees from cycle N+1 are done,
 485         // we can go ahead and publish the heap profile.
 486         //
 487         // First, wait for sweeping to finish. (We know there are no
 488         // more spans on the sweep queue, but we may be concurrently
 489         // sweeping spans, so we have to wait.)
 490         for work.cycles.Load() == n+1 && !isSweepDone() {
 491                 Gosched()
 492         }
 493
 494         // Now we're really done with sweeping, so we can publish the
 495         // stable heap profile. Only do this if we haven't already hit
 496         // another mark termination.
 497         mp := acquirem()
 498         cycle := work.cycles.Load()
 499         if cycle == n+1 || (gcphase == _GCmark && cycle == n+2) {
 500                 mProf_PostSweep()
 501         }
 502         releasem(mp)
 503 }
 504
 505 // gcWaitOnMark blocks until GC finishes the Nth mark phase. If GC has
 506 // already completed this mark phase, it returns immediately.
 507 func gcWaitOnMark(n uint32) {
 508         for {
 509                 // Disable phase transitions.
 510                 lock(&work.sweepWaiters.lock)
 511                 nMarks := work.cycles.Load()
 512                 if gcphase != _GCmark {
 513                         // We've already completed this cycle's mark.
 514                         nMarks++
 515                 }
 516                 if nMarks > n {
 517                         // We're done.
 518                         unlock(&work.sweepWaiters.lock)
 519                         return
 520                 }
 521
 522                 // Wait until sweep termination, mark, and mark
 523                 // termination of cycle N complete.
 524                 work.sweepWaiters.list.push(getg())
 525                 goparkunlock(&work.sweepWaiters.lock, waitReasonWaitForGCCycle, traceBlockUntilGCEnds, 1)
 526         }
 527 }
 528
 529 // gcMode indicates how concurrent a GC cycle should be.
 530 type gcMode int
 531
 532 const (
 533         gcBackgroundMode gcMode = iota // concurrent GC and sweep
 534         gcForceMode                    // stop-the-world GC now, concurrent sweep
 535         gcForceBlockMode               // stop-the-world GC now and STW sweep (forced by user)
 536 )
 537
 538 // A gcTrigger is a predicate for starting a GC cycle. Specifically,
 539 // it is an exit condition for the _GCoff phase.
 540 type gcTrigger struct {
 541         kind gcTriggerKind
 542         now  int64  // gcTriggerTime: current time
 543         n    uint32 // gcTriggerCycle: cycle number to start
 544 }
 545
 546 type gcTriggerKind int
 547
 548 const (
 549         // gcTriggerHeap indicates that a cycle should be started when
 550         // the heap size reaches the trigger heap size computed by the
 551         // controller.
 552         gcTriggerHeap gcTriggerKind = iota
 553
 554         // gcTriggerTime indicates that a cycle should be started when
 555         // it's been more than forcegcperiod nanoseconds since the
 556         // previous GC cycle.
 557         gcTriggerTime
 558
 559         // gcTriggerCycle indicates that a cycle should be started if
 560         // we have not yet started cycle number gcTrigger.n (relative
 561         // to work.cycles).
 562         gcTriggerCycle
 563 )
 564
 565 // test reports whether the trigger condition is satisfied, meaning
 566 // that the exit condition for the _GCoff phase has been met. The exit
 567 // condition should be tested when allocating.
 568 func (t gcTrigger) test() bool {
 569         if !memstats.enablegc || panicking.Load() != 0 || gcphase != _GCoff {
 570                 return false
 571         }
 572         switch t.kind {
 573         case gcTriggerHeap:
 574                 trigger, _ := gcController.trigger()
 575                 return gcController.heapLive.Load() >= trigger
 576         case gcTriggerTime:
 577                 if gcController.gcPercent.Load() < 0 {
 578                         return false
 579                 }
 580                 lastgc := int64(atomic.Load64(&memstats.last_gc_nanotime))
 581                 return lastgc != 0 && t.now-lastgc > forcegcperiod
 582         case gcTriggerCycle:
 583                 // t.n > work.cycles, but accounting for wraparound.
 584                 return int32(t.n-work.cycles.Load()) > 0
 585         }
 586         return true
 587 }
 588
 589 // gcStart starts the GC. It transitions from _GCoff to _GCmark (if
 590 // debug.gcstoptheworld == 0) or performs all of GC (if
 591 // debug.gcstoptheworld != 0).
 592 //
 593 // This may return without performing this transition in some cases,
 594 // such as when called on a system stack or with locks held.
 595 func gcStart(trigger gcTrigger) {
 596         // Since this is called from malloc and malloc is called in
 597         // the guts of a number of libraries that might be holding
 598         // locks, don't attempt to start GC in non-preemptible or
 599         // potentially unstable situations.
 600         mp := acquirem()
 601         if gp := getg(); gp == mp.g0 || mp.locks > 1 || mp.preemptoff != "" {
 602                 releasem(mp)
 603                 return
 604         }
 605         releasem(mp)
 606         mp = nil
 607
 608         // Pick up the remaining unswept/not being swept spans concurrently
 609         //
 610         // This shouldn't happen if we're being invoked in background
 611         // mode since proportional sweep should have just finished
 612         // sweeping everything, but rounding errors, etc, may leave a
 613         // few spans unswept. In forced mode, this is necessary since
 614         // GC can be forced at any point in the sweeping cycle.
 615         //
 616         // We check the transition condition continuously here in case
 617         // this G gets delayed in to the next GC cycle.
 618         for trigger.test() && sweepone() != ^uintptr(0) {
 619         }
 620
 621         // Perform GC initialization and the sweep termination
 622         // transition.
 623         semacquire(&work.startSema)
 624         // Re-check transition condition under transition lock.
 625         if !trigger.test() {
 626                 semrelease(&work.startSema)
 627                 return
 628         }
 629
 630         // In gcstoptheworld debug mode, upgrade the mode accordingly.
 631         // We do this after re-checking the transition condition so
 632         // that multiple goroutines that detect the heap trigger don't
 633         // start multiple STW GCs.
 634         mode := gcBackgroundMode
 635         if debug.gcstoptheworld == 1 {
 636                 mode = gcForceMode
 637         } else if debug.gcstoptheworld == 2 {
 638                 mode = gcForceBlockMode
 639         }
 640
 641         // Ok, we're doing it! Stop everybody else
 642         semacquire(&gcsema)
 643         semacquire(&worldsema)
 644
 645         // For stats, check if this GC was forced by the user.
 646         // Update it under gcsema to avoid gctrace getting wrong values.
 647         work.userForced = trigger.kind == gcTriggerCycle
 648
 649         trace := traceAcquire()
 650         if trace.ok() {
 651                 trace.GCStart()
 652                 traceRelease(trace)
 653         }
 654
 655         // Check that all Ps have finished deferred mcache flushes.
 656         for _, p := range allp {
 657                 if fg := p.mcache.flushGen.Load(); fg != mheap_.sweepgen {
 658                         println("runtime: p", p.id, "flushGen", fg, "!= sweepgen", mheap_.sweepgen)
 659                         throw("p mcache not flushed")
 660                 }
 661         }
 662
 663         gcBgMarkStartWorkers()
 664
 665         systemstack(gcResetMarkState)
 666
 667         work.stwprocs, work.maxprocs = gomaxprocs, gomaxprocs
 668         if work.stwprocs > ncpu {
 669                 // This is used to compute CPU time of the STW phases,
 670                 // so it can't be more than ncpu, even if GOMAXPROCS is.
 671                 work.stwprocs = ncpu
 672         }
 673         work.heap0 = gcController.heapLive.Load()
 674         work.pauseNS = 0
 675         work.mode = mode
 676
 677         now := nanotime()
 678         work.tSweepTerm = now
 679         pauseStart := now
 680         systemstack(func() { stopTheWorldWithSema(stwGCSweepTerm) })
 681         // Finish sweep before we start concurrent scan.
 682         systemstack(func() {
 683                 finishsweep_m()
 684         })
 685
 686         // clearpools before we start the GC. If we wait the memory will not be
 687         // reclaimed until the next GC cycle.
 688         clearpools()
 689
 690         work.cycles.Add(1)
 691
 692         // Assists and workers can start the moment we start
 693         // the world.
 694         gcController.startCycle(now, int(gomaxprocs), trigger)
 695
 696         // Notify the CPU limiter that assists may begin.
 697         gcCPULimiter.startGCTransition(true, now)
 698
 699         // In STW mode, disable scheduling of user Gs. This may also
 700         // disable scheduling of this goroutine, so it may block as
 701         // soon as we start the world again.
 702         if mode != gcBackgroundMode {
 703                 schedEnableUser(false)
 704         }
 705
 706         // Enter concurrent mark phase and enable
 707         // write barriers.
 708         //
 709         // Because the world is stopped, all Ps will
 710         // observe that write barriers are enabled by
 711         // the time we start the world and begin
 712         // scanning.
 713         //
 714         // Write barriers must be enabled before assists are
 715         // enabled because they must be enabled before
 716         // any non-leaf heap objects are marked. Since
 717         // allocations are blocked until assists can
 718         // happen, we want to enable assists as early as
 719         // possible.
 720         setGCPhase(_GCmark)
 721
 722         gcBgMarkPrepare() // Must happen before assists are enabled.
 723         gcMarkRootPrepare()
 724
 725         // Mark all active tinyalloc blocks. Since we're
 726         // allocating from these, they need to be black like
 727         // other allocations. The alternative is to blacken
 728         // the tiny block on every allocation from it, which
 729         // would slow down the tiny allocator.
 730         gcMarkTinyAllocs()
 731
 732         // At this point all Ps have enabled the write
 733         // barrier, thus maintaining the no white to
 734         // black invariant. Enable mutator assists to
 735         // put back-pressure on fast allocating
 736         // mutators.
 737         atomic.Store(&gcBlackenEnabled, 1)
 738
 739         // In STW mode, we could block the instant systemstack
 740         // returns, so make sure we're not preemptible.
 741         mp = acquirem()
 742
 743         // Concurrent mark.
 744         systemstack(func() {
 745                 now = startTheWorldWithSema()
 746                 work.pauseNS += now - pauseStart
 747                 work.tMark = now
 748                 memstats.gcPauseDist.record(now - pauseStart)
 749
 750                 sweepTermCpu := int64(work.stwprocs) * (work.tMark - work.tSweepTerm)
 751                 work.cpuStats.gcPauseTime += sweepTermCpu
 752                 work.cpuStats.gcTotalTime += sweepTermCpu
 753
 754                 // Release the CPU limiter.
 755                 gcCPULimiter.finishGCTransition(now)
 756         })
 757
 758         // Release the world sema before Gosched() in STW mode
 759         // because we will need to reacquire it later but before
 760         // this goroutine becomes runnable again, and we could
 761         // self-deadlock otherwise.
 762         semrelease(&worldsema)
 763         releasem(mp)
 764
 765         // Make sure we block instead of returning to user code
 766         // in STW mode.
 767         if mode != gcBackgroundMode {
 768                 Gosched()
 769         }
 770
 771         semrelease(&work.startSema)
 772 }
 773
 774 // gcMarkDoneFlushed counts the Ps that reported flushed work (gcw.flushedWork) during gcMarkDone's ragged barrier.
 775 //
 776 // Ideally this would be a captured local in gcMarkDone, but forEachP
 777 // escapes its callback closure, so it can't capture anything.
 778 //
 779 // This is protected by markDoneSema. gcMarkDone zeroes it before each barrier and its forEachP callback bumps it atomically.
 780 var gcMarkDoneFlushed uint32
 781
 782 // gcMarkDone transitions the GC from mark to mark termination if all
 783 // reachable objects have been marked (that is, there are no grey
 784 // objects and can be no more in the future); in that case it stops the
 785 // world and hands off to gcMarkTermination. Otherwise, it flushes all
 786 // local work to the global queues where it can be discovered by other workers.
 787 //
 788 // This should be called when all local mark work has been drained and
 789 // there are no remaining workers. Specifically, when
 790 //
 791 //      work.nwait == work.nproc && !gcMarkWorkAvailable(p)
 792 //
 793 // The calling context must be preemptible.
 794 //
 795 // Flushing local work is important because idle Ps may have local
 796 // work queued. This is the only way to make that work visible and
 797 // drive GC to completion.
 798 //
 799 // It is explicitly okay to have write barriers in this function. If
 800 // it does transition to mark termination, then all reachable objects
 801 // have been marked, so the write barrier cannot shade any more
 802 // objects.
 803 func gcMarkDone() {
 804         // Ensure only one thread is running the ragged barrier at a
 805         // time.
 806         semacquire(&work.markDoneSema)
 807
 808 top:
 809         // Re-check the transition condition under the transition lock (markDoneSema).
 810         //
 811         // It's critical that this checks the global work queues are
 812         // empty before performing the ragged barrier. Otherwise,
 813         // there could be global work that a P could take after the P
 814         // has passed the ragged barrier.
 815         if !(gcphase == _GCmark && work.nwait == work.nproc && !gcMarkWorkAvailable(nil)) {
 816                 semrelease(&work.markDoneSema)
 817                 return
 818         }
 819
 820         // forEachP needs worldsema to execute, and we'll need it to
 821         // stop the world later, so acquire worldsema now.
 822         semacquire(&worldsema)
 823
 824         // Flush all local buffers and collect flushedWork flags.
 825         gcMarkDoneFlushed = 0
 826         forEachP(waitReasonGCMarkTermination, func(pp *p) {
 827                 // Flush the write barrier buffer, since this may add
 828                 // work to the gcWork.
 829                 wbBufFlush1(pp)
 830
 831                 // Flush the gcWork, since this may create global work
 832                 // and set the flushedWork flag.
 833                 //
 834                 // TODO(austin): Break up these workbufs to
 835                 // better distribute work.
 836                 pp.gcw.dispose()
 837                 // Collect the flushedWork flag and reset it for the next round.
 838                 if pp.gcw.flushedWork {
 839                         atomic.Xadd(&gcMarkDoneFlushed, 1)
 840                         pp.gcw.flushedWork = false
 841                 }
 842         })
 843
 844         if gcMarkDoneFlushed != 0 {
 845                 // More grey objects were discovered since the
 846                 // previous termination check, so there may be more
 847                 // work to do. Keep going. It's possible the
 848                 // transition condition became true again during the
 849                 // ragged barrier, so re-check it.
 850                 semrelease(&worldsema)
 851                 goto top
 852         }
 853
 854         // There was no global work, no local work, and no Ps
 855         // communicated work since we took markDoneSema. Therefore
 856         // there are no grey objects and no more objects can be
 857         // shaded. Transition to mark termination.
 858         now := nanotime()
 859         work.tMarkTerm = now
 860         pauseStart := now
 861         getg().m.preemptoff = "gcing"
 862         systemstack(func() { stopTheWorldWithSema(stwGCMarkTerm) })
 863         // The gcphase is still _GCmark; gcMarkTermination will switch it to
 864         // _GCmarktermination. The important thing is that the wb remains active until
 865         // all marking is complete. This includes writes made by the GC.
 866
 867         // There is sometimes work left over when we enter mark termination due
 868         // to write barriers performed after the completion barrier above.
 869         // Detect this and resume concurrent mark. This is obviously
 870         // unfortunate.
 871         //
 872         // See issue #27993 for details.
 873         //
 874         // Switch to the system stack to call wbBufFlush1, though in this case
 875         // it doesn't matter because we're non-preemptible anyway.
 876         restart := false
 877         systemstack(func() {
 878                 for _, p := range allp {
 879                         wbBufFlush1(p)
 880                         if !p.gcw.empty() {
 881                                 restart = true
 882                                 break
 883                         }
 884                 }
 885         })
 886         if restart {
 887                 getg().m.preemptoff = ""
 888                 systemstack(func() {
 889                         now := startTheWorldWithSema() // shadows the outer now; only feeds the pause accounting below
 890                         work.pauseNS += now - pauseStart
 891                         memstats.gcPauseDist.record(now - pauseStart)
 892                 })
 893                 semrelease(&worldsema)
 894                 goto top
 895         }
 896
 897         gcComputeStartingStackSize()
 898
 899         // Disable assists and background workers. We must do
 900         // this before waking blocked assists.
 901         atomic.Store(&gcBlackenEnabled, 0)
 902
 903         // Notify the CPU limiter that GC assists will now cease.
 904         gcCPULimiter.startGCTransition(false, now)
 905
 906         // Wake all blocked assists. These will run when we
 907         // start the world again.
 908         gcWakeAllAssists()
 909
 910         // Likewise, release the transition lock. Blocked
 911         // workers and assists will run when we start the
 912         // world again.
 913         semrelease(&work.markDoneSema)
 914
 915         // In STW mode, re-enable user goroutines. These will be
 916         // queued to run after we start the world.
 917         schedEnableUser(true)
 918
 919         // endCycle depends on all gcWork cache stats being flushed.
 920         // The termination algorithm above ensured that they are,
 921         // up to allocations since the ragged barrier.
 922         gcController.endCycle(now, int(gomaxprocs), work.userForced)
 923
 924         // Perform mark termination. This will restart the world.
 925         gcMarkTermination(pauseStart)
 926 }
 927
 928 // gcMarkTermination runs mark termination for the current cycle: it calls gcMark, turns the write barrier off, calls gcSweep, records statistics, restarts the world, and finally releases worldsema and gcsema.
 929 // The world must be stopped and mark assists and background workers must be disabled. pauseStart is the nanotime at which the current STW pause began; it is used for pause accounting.
 930 func gcMarkTermination(pauseStart int64) {
 931         // Start marktermination (write barrier remains enabled for now).
 932         setGCPhase(_GCmarktermination)
 933
 934         work.heap1 = gcController.heapLive.Load()
 935         startTime := nanotime()
 936
 937         mp := acquirem()
 938         mp.preemptoff = "gcing"
 939         mp.traceback = 2
 940         curgp := mp.curg
 941         // N.B. The execution tracer is not aware of this status
 942         // transition and handles it specially based on the
 943         // wait reason.
 944         casGToWaiting(curgp, _Grunning, waitReasonGarbageCollection)
 945
 946         // Run gc on the g0 stack. We do this so that the g stack
 947         // we're currently running on will no longer change. Cuts
 948         // the root set down a bit (g0 stacks are not scanned, and
 949         // we don't need to scan gc's internal state). We also
 950         // need to switch to g0 so we can shrink the stack.
 951         systemstack(func() {
 952                 gcMark(startTime)
 953                 // Must return immediately.
 954                 // The outer function's stack may have moved
 955                 // during gcMark (it shrinks stacks, including the
 956                 // outer function's stack), so we must not refer
 957                 // to any of its variables. Return back to the
 958                 // non-system stack to pick up the new addresses
 959                 // before continuing.
 960         })
 961
 962         var stwSwept bool
 963         systemstack(func() {
 964                 work.heap2 = work.bytesMarked
 965                 if debug.gccheckmark > 0 {
 966                         // Run a full non-parallel, stop-the-world
 967                         // mark using checkmark bits, to check that we
 968                         // didn't forget to mark anything during the
 969                         // concurrent mark process.
 970                         startCheckmarks()
 971                         gcResetMarkState()
 972                         gcw := &getg().m.p.ptr().gcw
 973                         gcDrain(gcw, 0)
 974                         wbBufFlush1(getg().m.p.ptr())
 975                         gcw.dispose()
 976                         endCheckmarks()
 977                 }
 978
 979                 // Marking is complete, so we can turn the write barrier off.
 980                 setGCPhase(_GCoff)
 981                 stwSwept = gcSweep(work.mode)
 982         })
 983
 984         mp.traceback = 0
 985         casgstatus(curgp, _Gwaiting, _Grunning)
 986
 987         trace := traceAcquire()
 988         if trace.ok() {
 989                 trace.GCDone()
 990                 traceRelease(trace)
 991         }
 992
 993         // Marking is done; clear the preemptoff reason set above.
 994         mp.preemptoff = ""
 995
 996         if gcphase != _GCoff {
 997                 throw("gc done but gcphase != _GCoff")
 998         }
 999
1000         // Record heapInUse for scavenger.
1001         memstats.lastHeapInUse = gcController.heapInUse.load()
1002
1003         // Update GC trigger and pacing, as well as downstream consumers
1004         // of this pacing information, for the next cycle.
1005         systemstack(gcControllerCommit)
1006
1007         // Update timing memstats. The pause measured here runs from pauseStart to now.
1008         now := nanotime()
1009         sec, nsec, _ := time_now()
1010         unixNow := sec*1e9 + int64(nsec)
1011         work.pauseNS += now - pauseStart
1012         work.tEnd = now
1013         memstats.gcPauseDist.record(now - pauseStart)
1014         atomic.Store64(&memstats.last_gc_unix, uint64(unixNow)) // must be Unix time to make sense to user
1015         atomic.Store64(&memstats.last_gc_nanotime, uint64(now)) // monotonic time for us
1016         memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
1017         memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
1018         memstats.pause_total_ns += uint64(work.pauseNS)
1019
1020         markTermCpu := int64(work.stwprocs) * (work.tEnd - work.tMarkTerm)
1021         work.cpuStats.gcPauseTime += markTermCpu
1022         work.cpuStats.gcTotalTime += markTermCpu
1023
1024         // Accumulate CPU stats.
1025         //
1026         // Pass gcMarkPhase=true so we can get all the latest GC CPU stats in there too.
1027         work.cpuStats.accumulate(now, true)
1028
1029         // Compute overall GC CPU utilization.
1030         // Omit idle marking time from the overall utilization here since it's "free".
1031         memstats.gc_cpu_fraction = float64(work.cpuStats.gcTotalTime-work.cpuStats.gcIdleTime) / float64(work.cpuStats.totalTime)
1032
1033         // Reset assist time and background time stats.
1034         //
1035         // Do this now, instead of at the start of the next GC cycle, because
1036         // these two may keep accumulating even if the GC is not active.
1037         scavenge.assistTime.Store(0)
1038         scavenge.backgroundTime.Store(0)
1039
1040         // Reset idle time stat.
1041         sched.idleTime.Store(0)
1042
1043         if work.userForced {
1044                 memstats.numforcedgc++
1045         }
1046
1047         // Bump GC cycle count and wake goroutines waiting on sweep.
1048         lock(&work.sweepWaiters.lock)
1049         memstats.numgc++
1050         injectglist(&work.sweepWaiters.list)
1051         unlock(&work.sweepWaiters.lock)
1052
1053         // Increment the scavenge generation now.
1054         //
1055         // This moment represents peak heap in use because we're
1056         // about to start sweeping.
1057         mheap_.pages.scav.index.nextGen()
1058
1059         // Release the CPU limiter.
1060         gcCPULimiter.finishGCTransition(now)
1061
1062         // Finish the current heap profiling cycle and start a new
1063         // heap profiling cycle. We do this before starting the world
1064         // so events don't leak into the wrong cycle.
1065         mProf_NextCycle()
1066
1067         // There may be stale spans in mcaches that need to be swept.
1068         // Those aren't tracked in any sweep lists, so we need to
1069         // count them against sweep completion until we ensure all
1070         // those spans have been forced out.
1071         //
1072         // If gcSweep fully swept the heap (for example if the sweep
1073         // is not concurrent due to a GODEBUG setting), then we expect
1074         // the sweepLocker to be invalid, since sweeping is done.
1075         //
1076         // N.B. Below we might duplicate some work from gcSweep; this is
1077         // fine as all that work is idempotent within a GC cycle, and
1078         // we're still holding worldsema so a new cycle can't start.
1079         sl := sweep.active.begin()
1080         if !stwSwept && !sl.valid {
1081                 throw("failed to set sweep barrier")
1082         } else if stwSwept && sl.valid {
1083                 throw("non-concurrent sweep failed to drain all sweep queues")
1084         }
1085
1086         systemstack(func() { startTheWorldWithSema() })
1087
1088         // Flush the heap profile so we can start a new cycle next GC.
1089         // This is relatively expensive, so we don't do it with the
1090         // world stopped.
1091         mProf_Flush()
1092
1093         // Prepare workbufs for freeing by the sweeper. We do this
1094         // asynchronously because it can take non-trivial time.
1095         prepareFreeWorkbufs()
1096
1097         // Free stack spans. This must be done between GC cycles.
1098         systemstack(freeStackSpans)
1099
1100         // Ensure all mcaches are flushed. Each P will flush its own
1101         // mcache before allocating, but idle Ps may not. Since this
1102         // is necessary to sweep all spans, we need to ensure all
1103         // mcaches are flushed before we start the next GC cycle.
1104         //
1105         // While we're here, flush the page cache for idle Ps to avoid
1106         // having pages get stuck on them. These pages are hidden from
1107         // the scavenger, so in small idle heaps a significant amount
1108         // of additional memory might be held onto.
1109         //
1110         // Also, flush the pinner cache, to avoid leaking that memory
1111         // indefinitely.
1112         forEachP(waitReasonFlushProcCaches, func(pp *p) {
1113                 pp.mcache.prepareForSweep()
1114                 if pp.status == _Pidle {
1115                         systemstack(func() {
1116                                 lock(&mheap_.lock)
1117                                 pp.pcache.flush(&mheap_.pages)
1118                                 unlock(&mheap_.lock)
1119                         })
1120                 }
1121                 pp.pinnerCache = nil
1122         })
1123         if sl.valid {
1124                 // Now that we've swept stale spans in mcaches, they don't
1125                 // count against unswept spans.
1126                 //
1127                 // Note: this sweepLocker may not be valid if sweeping had
1128                 // already completed during the STW. See the corresponding
1129                 // begin() call that produced sl.
1130                 sweep.active.end(sl)
1131         }
1132
1133         // Print gctrace before dropping worldsema. As soon as we drop
1134         // worldsema another cycle could start and smash the stats
1135         // we're trying to print.
1136         if debug.gctrace > 0 {
1137                 util := int(memstats.gc_cpu_fraction * 100)
1138
1139                 var sbuf [24]byte
1140                 printlock()
1141                 print("gc ", memstats.numgc,
1142                         " @", string(itoaDiv(sbuf[:], uint64(work.tSweepTerm-runtimeInitTime)/1e6, 3)), "s ",
1143                         util, "%: ")
1144                 prev := work.tSweepTerm
1145                 for i, ns := range []int64{work.tMark, work.tMarkTerm, work.tEnd} {
1146                         if i != 0 {
1147                                 print("+")
1148                         }
1149                         print(string(fmtNSAsMS(sbuf[:], uint64(ns-prev))))
1150                         prev = ns
1151                 }
1152                 print(" ms clock, ")
1153                 for i, ns := range []int64{
1154                         int64(work.stwprocs) * (work.tMark - work.tSweepTerm),
1155                         gcController.assistTime.Load(),
1156                         gcController.dedicatedMarkTime.Load() + gcController.fractionalMarkTime.Load(),
1157                         gcController.idleMarkTime.Load(),
1158                         markTermCpu,
1159                 } {
1160                         if i == 2 || i == 3 {
1161                                 // Separate mark time components with /.
1162                                 print("/")
1163                         } else if i != 0 {
1164                                 print("+")
1165                         }
1166                         print(string(fmtNSAsMS(sbuf[:], uint64(ns))))
1167                 }
1168                 print(" ms cpu, ",
1169                         work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
1170                         gcController.lastHeapGoal>>20, " MB goal, ",
1171                         gcController.lastStackScan.Load()>>20, " MB stacks, ",
1172                         gcController.globalsScan.Load()>>20, " MB globals, ",
1173                         work.maxprocs, " P")
1174                 if work.userForced {
1175                         print(" (forced)")
1176                 }
1177                 print("\n")
1178                 printunlock()
1179         }
1180
1181         // Set any arena chunks that were deferred to fault.
1182         lock(&userArenaState.lock)
1183         faultList := userArenaState.fault
1184         userArenaState.fault = nil
1185         unlock(&userArenaState.lock)
1186         for _, lc := range faultList {
1187                 lc.mspan.setUserArenaChunkToFault()
1188         }
1189
1190         // Enable huge pages on some metadata if we cross a heap threshold.
1191         if gcController.heapGoal() > minHeapForMetadataHugePages {
1192                 systemstack(func() {
1193                         mheap_.enableMetadataHugePages()
1194                 })
1195         }
1196
1197         semrelease(&worldsema)
1198         semrelease(&gcsema)
1199         // Careful: another GC cycle may start now.
1200
1201         releasem(mp)
1202         mp = nil
1203
1204         // Now that GC is done, kick off the finalizer thread if needed.
1205         if !concurrentSweep {
1206                 // Give the queued finalizers, if any, a chance to run.
1207                 Gosched()
1208         }
1209 }
1210
1211 // gcBgMarkStartWorkers prepares background mark worker goroutines. These
1212 // goroutines will not run until the mark phase, but they must be started while
1213 // the work is not stopped and from a regular G stack. The caller must hold
1214 // worldsema.
1215 func gcBgMarkStartWorkers() {
1216         // Background marking is performed by per-P G's. Ensure that each P has
1217         // a background GC G.
1218         //
1219         // Worker Gs don't exit if gomaxprocs is reduced. If it is raised
1220         // again, we can reuse the old workers; no need to create new workers.
1221         for gcBgMarkWorkerCount < gomaxprocs {
1222                 go gcBgMarkWorker()
1223
1224                 notetsleepg(&work.bgMarkReady, -1)
1225                 noteclear(&work.bgMarkReady)
1226                 // The worker is now guaranteed to be added to the pool before
1227                 // its P's next findRunnableGCWorker.
1228
1229                 gcBgMarkWorkerCount++
1230         }
1231 }
1232
1233 // gcBgMarkPrepare initializes the worker accounting that background
1234 // marking relies on. Mutator assists must not yet be enabled.
1235 func gcBgMarkPrepare() {
1236         // Background marking stops once the work queues are empty and
1237         // no workers remain. Since everything here is concurrent that
1238         // may be a transient state, but mark termination will clean
1239         // it up.
1240         //
1241         // Between background workers and assists the real number of
1242         // workers is unknown, so pretend there is an arbitrarily large
1243         // number of them, almost all "waiting". A working worker
1244         // decrements nwait, so nproc == nwait means no workers.
1245         const manyWorkers = ^uint32(0)
1246         work.nproc, work.nwait = manyWorkers, manyWorkers
1247 }
1248
1249 // gcBgMarkWorkerNode is an entry in the gcBgMarkWorkerPool. It points to a single
1250 // gcBgMarkWorker goroutine, which allocates its node when it starts.
1251 type gcBgMarkWorkerNode struct {
1252         // node links this entry into the lock-free stack of unused workers (gcBgMarkWorkerPool). This field must be first.
1253         node lfnode
1254
1255         // gp is the g of this worker; gcBgMarkWorker sets it at startup.
1256         gp guintptr
1257
1258         // m is the M to release (via releasem) once the worker has parked, or nil if
1259         // there is none. It is how gcBgMarkWorker communicates with its gopark unlock
1260         // function, which cannot access the G's stack. It is unused outside of gcBgMarkWorker().
1261         m muintptr
1262 }
1263
1264 func gcBgMarkWorker() {
1265         gp := getg()
1266
1267         // We pass node to a gopark unlock function, so it can't be on
1268         // the stack (see gopark). Prevent deadlock from recursively
1269         // starting GC by disabling preemption.
1270         gp.m.preemptoff = "GC worker init"
1271         node := new(gcBgMarkWorkerNode)
1272         gp.m.preemptoff = ""
1273
1274         node.gp.set(gp)
1275
1276         node.m.set(acquirem())
1277         notewakeup(&work.bgMarkReady)
1278         // After this point, the background mark worker is generally scheduled
1279         // cooperatively by gcController.findRunnableGCWorker. While performing
1280         // work on the P, preemption is disabled because we are working on
1281         // P-local work buffers. When the preempt flag is set, this puts itself
1282         // into _Gwaiting to be woken up by gcController.findRunnableGCWorker
1283         // at the appropriate time.
1284         //
1285         // When preemption is enabled (e.g., while in gcMarkDone), this worker
1286         // may be preempted and schedule as a _Grunnable G from a runq. That is
1287         // fine; it will eventually gopark again for further scheduling via
1288         // findRunnableGCWorker.
1289         //
1290         // Since we disable preemption before notifying bgMarkReady, we
1291         // guarantee that this G will be in the worker pool for the next
1292         // findRunnableGCWorker. This isn't strictly necessary, but it reduces
1293         // latency between _GCmark starting and the workers starting.
1294
1295         for {
1296                 // Go to sleep until woken by
1297                 // gcController.findRunnableGCWorker.
1298                 gopark(func(g *g, nodep unsafe.Pointer) bool {
1299                         node := (*gcBgMarkWorkerNode)(nodep)
1300
1301                         if mp := node.m.ptr(); mp != nil {
1302                                 // The worker G is no longer running; release
1303                                 // the M.
1304                                 //
1305                                 // N.B. it is _safe_ to release the M as soon
1306                                 // as we are no longer performing P-local mark
1307                                 // work.
1308                                 //
1309                                 // However, since we cooperatively stop work
1310                                 // when gp.preempt is set, if we releasem in
1311                                 // the loop then the following call to gopark
1312                                 // would immediately preempt the G. This is
1313                                 // also safe, but inefficient: the G must
1314                                 // schedule again only to enter gopark and park
1315                                 // again. Thus, we defer the release until
1316                                 // after parking the G.
1317                                 releasem(mp)
1318                         }
1319
1320                         // Release this G to the pool.
1321                         gcBgMarkWorkerPool.push(&node.node)
1322                         // Note that at this point, the G may immediately be
1323                         // rescheduled and may be running.
1324                         return true
1325                 }, unsafe.Pointer(node), waitReasonGCWorkerIdle, traceBlockSystemGoroutine, 0)
1326
1327                 // Preemption must not occur here, or another G might see
1328                 // p.gcMarkWorkerMode.
1329
1330                 // Disable preemption so we can use the gcw. If the
1331                 // scheduler wants to preempt us, we'll stop draining,
1332                 // dispose the gcw, and then preempt.
1333                 node.m.set(acquirem())
1334                 pp := gp.m.p.ptr() // P can't change with preemption disabled.
1335
1336                 if gcBlackenEnabled == 0 {
1337                         println("worker mode", pp.gcMarkWorkerMode)
1338                         throw("gcBgMarkWorker: blackening not enabled")
1339                 }
1340
1341                 if pp.gcMarkWorkerMode == gcMarkWorkerNotWorker {
1342                         throw("gcBgMarkWorker: mode not set")
1343                 }
1344
1345                 startTime := nanotime()
1346                 pp.gcMarkWorkerStartTime = startTime
1347                 var trackLimiterEvent bool
1348                 if pp.gcMarkWorkerMode == gcMarkWorkerIdleMode {
1349                         trackLimiterEvent = pp.limiterEvent.start(limiterEventIdleMarkWork, startTime)
1350                 }
1351
1352                 decnwait := atomic.Xadd(&work.nwait, -1)
1353                 if decnwait == work.nproc {
1354                         println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
1355                         throw("work.nwait was > work.nproc")
1356                 }
1357
1358                 systemstack(func() {
1359                         // Mark our goroutine preemptible so its stack
1360                         // can be scanned. This lets two mark workers
1361                         // scan each other (otherwise, they would
1362                         // deadlock). We must not modify anything on
1363                         // the G stack. However, stack shrinking is
1364                         // disabled for mark workers, so it is safe to
1365                         // read from the G stack.
1366                         //
1367                         // N.B. The execution tracer is not aware of this status
1368                         // transition and handles it specially based on the
1369                         // wait reason.
1370                         casGToWaiting(gp, _Grunning, waitReasonGCWorkerActive)
1371                         switch pp.gcMarkWorkerMode {
1372                         default:
1373                                 throw("gcBgMarkWorker: unexpected gcMarkWorkerMode")
1374                         case gcMarkWorkerDedicatedMode:
1375                                 gcDrainMarkWorkerDedicated(&pp.gcw, true)
1376                                 if gp.preempt {
1377                                         // We were preempted. This is
1378                                         // a useful signal to kick
1379                                         // everything out of the run
1380                                         // queue so it can run
1381                                         // somewhere else.
1382                                         if drainQ, n := runqdrain(pp); n > 0 {
1383                                                 lock(&sched.lock)
1384                                                 globrunqputbatch(&drainQ, int32(n))
1385                                                 unlock(&sched.lock)
1386                                         }
1387                                 }
1388                                 // Go back to draining, this time
1389                                 // without preemption.
1390                                 gcDrainMarkWorkerDedicated(&pp.gcw, false)
1391                         case gcMarkWorkerFractionalMode:
1392                                 gcDrainMarkWorkerFractional(&pp.gcw)
1393                         case gcMarkWorkerIdleMode:
1394                                 gcDrainMarkWorkerIdle(&pp.gcw)
1395                         }
1396                         casgstatus(gp, _Gwaiting, _Grunning)
1397                 })
1398
1399                 // Account for time and mark us as stopped.
1400                 now := nanotime()
1401                 duration := now - startTime
1402                 gcController.markWorkerStop(pp.gcMarkWorkerMode, duration)
1403                 if trackLimiterEvent {
1404                         pp.limiterEvent.stop(limiterEventIdleMarkWork, now)
1405                 }
1406                 if pp.gcMarkWorkerMode == gcMarkWorkerFractionalMode {
1407                         atomic.Xaddint64(&pp.gcFractionalMarkTime, duration)
1408                 }
1409
1410                 // Was this the last worker and did we run out
1411                 // of work?
1412                 incnwait := atomic.Xadd(&work.nwait, +1)
1413                 if incnwait > work.nproc {
1414                         println("runtime: p.gcMarkWorkerMode=", pp.gcMarkWorkerMode,
1415                                 "work.nwait=", incnwait, "work.nproc=", work.nproc)
1416                         throw("work.nwait > work.nproc")
1417                 }
1418
1419                 // We'll releasem after this point and thus this P may run
1420                 // something else. We must clear the worker mode to avoid
1421                 // attributing the mode to a different (non-worker) G in
1422                 // traceGoStart.
1423                 pp.gcMarkWorkerMode = gcMarkWorkerNotWorker
1424
1425                 // If this worker reached a background mark completion
1426                 // point, signal the main GC goroutine.
1427                 if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
1428                         // We don't need the P-local buffers here, allow
1429                         // preemption because we may schedule like a regular
1430                         // goroutine in gcMarkDone (block on locks, etc).
1431                         releasem(node.m.ptr())
1432                         node.m.set(nil)
1433
1434                         gcMarkDone()
1435                 }
1436         }
1437 }
1438
1439 // gcMarkWorkAvailable reports whether executing a mark worker
1440 // on p is potentially useful. p may be nil, in which case it only
1441 // checks the global sources of work.
1442 func gcMarkWorkAvailable(p *p) bool {
1443         if p != nil && !p.gcw.empty() {
1444                 return true
1445         }
1446         if !work.full.empty() {
1447                 return true // global work available
1448         }
1449         if work.markrootNext < work.markrootJobs {
1450                 return true // root scan work available
1451         }
1452         return false
1453 }
1454
1455 // gcMark runs the mark (or, for concurrent GC, mark termination)
1456 // All gcWork caches must be empty.
1457 // STW is in effect at this point.
1458 func gcMark(startTime int64) {
1459         if debug.allocfreetrace > 0 {
1460                 tracegc()
1461         }
1462
1463         if gcphase != _GCmarktermination {
1464                 throw("in gcMark expecting to see gcphase as _GCmarktermination")
1465         }
1466         work.tstart = startTime
1467
1468         // Check that there's no marking work remaining.
1469         if work.full != 0 || work.markrootNext < work.markrootJobs {
1470                 print("runtime: full=", hex(work.full), " next=", work.markrootNext, " jobs=", work.markrootJobs, " nDataRoots=", work.nDataRoots, " nBSSRoots=", work.nBSSRoots, " nSpanRoots=", work.nSpanRoots, " nStackRoots=", work.nStackRoots, "\n")
1471                 panic("non-empty mark queue after concurrent mark")
1472         }
1473
1474         if debug.gccheckmark > 0 {
1475                 // This is expensive when there's a large number of
1476                 // Gs, so only do it if checkmark is also enabled.
1477                 gcMarkRootCheck()
1478         }
1479
1480         // Drop allg snapshot. allgs may have grown, in which case
1481         // this is the only reference to the old backing store and
1482         // there's no need to keep it around.
1483         work.stackRoots = nil
1484
1485         // Clear out buffers and double-check that all gcWork caches
1486         // are empty. This should be ensured by gcMarkDone before we
1487         // enter mark termination.
1488         //
1489         // TODO: We could clear out buffers just before mark if this
1490         // has a non-negligible impact on STW time.
1491         for _, p := range allp {
1492                 // The write barrier may have buffered pointers since
1493                 // the gcMarkDone barrier. However, since the barrier
1494                 // ensured all reachable objects were marked, all of
1495                 // these must be pointers to black objects. Hence we
1496                 // can just discard the write barrier buffer.
1497                 if debug.gccheckmark > 0 {
1498                         // For debugging, flush the buffer and make
1499                         // sure it really was all marked.
1500                         wbBufFlush1(p)
1501                 } else {
1502                         p.wbBuf.reset()
1503                 }
1504
1505                 gcw := &p.gcw
1506                 if !gcw.empty() {
1507                         printlock()
1508                         print("runtime: P ", p.id, " flushedWork ", gcw.flushedWork)
1509                         if gcw.wbuf1 == nil {
1510                                 print(" wbuf1=<nil>")
1511                         } else {
1512                                 print(" wbuf1.n=", gcw.wbuf1.nobj)
1513                         }
1514                         if gcw.wbuf2 == nil {
1515                                 print(" wbuf2=<nil>")
1516                         } else {
1517                                 print(" wbuf2.n=", gcw.wbuf2.nobj)
1518                         }
1519                         print("\n")
1520                         throw("P has cached GC work at end of mark termination")
1521                 }
1522                 // There may still be cached empty buffers, which we
1523                 // need to flush since we're going to free them. Also,
1524                 // there may be non-zero stats because we allocated
1525                 // black after the gcMarkDone barrier.
1526                 gcw.dispose()
1527         }
1528
1529         // Flush scanAlloc from each mcache since we're about to modify
1530         // heapScan directly. If we were to flush this later, then scanAlloc
1531         // might have incorrect information.
1532         //
1533         // Note that it's not important to retain this information; we know
1534         // exactly what heapScan is at this point via scanWork.
1535         for _, p := range allp {
1536                 c := p.mcache
1537                 if c == nil {
1538                         continue
1539                 }
1540                 c.scanAlloc = 0
1541         }
1542
1543         // Reset controller state.
1544         gcController.resetLive(work.bytesMarked)
1545 }
1546
1547 // gcSweep must be called on the system stack because it acquires the heap
1548 // lock. See mheap for details.
1549 //
1550 // Returns true if the heap was fully swept by this function.
1551 //
1552 // The world must be stopped.
1553 //
1554 //go:systemstack
1555 func gcSweep(mode gcMode) bool {
1556         assertWorldStopped()
1557
1558         if gcphase != _GCoff {
1559                 throw("gcSweep being done but phase is not GCoff")
1560         }
1561
1562         lock(&mheap_.lock)
1563         mheap_.sweepgen += 2
1564         sweep.active.reset()
1565         mheap_.pagesSwept.Store(0)
1566         mheap_.sweepArenas = mheap_.allArenas
1567         mheap_.reclaimIndex.Store(0)
1568         mheap_.reclaimCredit.Store(0)
1569         unlock(&mheap_.lock)
1570
1571         sweep.centralIndex.clear()
1572
1573         if !concurrentSweep || mode == gcForceBlockMode {
1574                 // Special case synchronous sweep.
1575                 // Record that no proportional sweeping has to happen.
1576                 lock(&mheap_.lock)
1577                 mheap_.sweepPagesPerByte = 0
1578                 unlock(&mheap_.lock)
1579                 // Flush all mcaches.
1580                 for _, pp := range allp {
1581                         pp.mcache.prepareForSweep()
1582                 }
1583                 // Sweep all spans eagerly.
1584                 for sweepone() != ^uintptr(0) {
1585                 }
1586                 // Free workbufs eagerly.
1587                 prepareFreeWorkbufs()
1588                 for freeSomeWbufs(false) {
1589                 }
1590                 // All "free" events for this mark/sweep cycle have
1591                 // now happened, so we can make this profile cycle
1592                 // available immediately.
1593                 mProf_NextCycle()
1594                 mProf_Flush()
1595                 return true
1596         }
1597
1598         // Background sweep.
1599         lock(&sweep.lock)
1600         if sweep.parked {
1601                 sweep.parked = false
1602                 ready(sweep.g, 0, true)
1603         }
1604         unlock(&sweep.lock)
1605         return false
1606 }
1607
1608 // gcResetMarkState resets global state prior to marking (concurrent
1609 // or STW) and resets the stack scan state of all Gs.
1610 //
1611 // This is safe to do without the world stopped because any Gs created
1612 // during or after this will start out in the reset state.
1613 //
1614 // gcResetMarkState must be called on the system stack because it acquires
1615 // the heap lock. See mheap for details.
1616 //
1617 //go:systemstack
1618 func gcResetMarkState() {
1619         // This may be called during a concurrent phase, so lock to make sure
1620         // allgs doesn't change.
1621         forEachG(func(gp *g) {
1622                 gp.gcscandone = false // set to true in gcphasework
1623                 gp.gcAssistBytes = 0
1624         })
1625
1626         // Clear page marks. This is just 1MB per 64GB of heap, so the
1627         // time here is pretty trivial.
1628         lock(&mheap_.lock)
1629         arenas := mheap_.allArenas
1630         unlock(&mheap_.lock)
1631         for _, ai := range arenas {
1632                 ha := mheap_.arenas[ai.l1()][ai.l2()]
1633                 for i := range ha.pageMarks {
1634                         ha.pageMarks[i] = 0
1635                 }
1636         }
1637
1638         work.bytesMarked = 0
1639         work.initialHeapLive = gcController.heapLive.Load()
1640 }
1641
1642 // Hooks for other packages
1643
var (
	// poolcleanup, when non-nil, is called by clearpools to clear
	// sync.Pools. It is installed by sync_runtime_registerPoolCleanup.
	poolcleanup func()

	// boringCaches holds the pointers registered through
	// boring_registerCache; clearpools stores nil through each of them.
	boringCaches []unsafe.Pointer // for crypto/internal/boring
)
1646
1647 //go:linkname sync_runtime_registerPoolCleanup sync.runtime_registerPoolCleanup
1648 func sync_runtime_registerPoolCleanup(f func()) {
1649         poolcleanup = f
1650 }
1651
1652 //go:linkname boring_registerCache crypto/internal/boring/bcache.registerCache
1653 func boring_registerCache(p unsafe.Pointer) {
1654         boringCaches = append(boringCaches, p)
1655 }
1656
1657 func clearpools() {
1658         // clear sync.Pools
1659         if poolcleanup != nil {
1660                 poolcleanup()
1661         }
1662
1663         // clear boringcrypto caches
1664         for _, p := range boringCaches {
1665                 atomicstorep(p, nil)
1666         }
1667
1668         // Clear central sudog cache.
1669         // Leave per-P caches alone, they have strictly bounded size.
1670         // Disconnect cached list before dropping it on the floor,
1671         // so that a dangling ref to one entry does not pin all of them.
1672         lock(&sched.sudoglock)
1673         var sg, sgnext *sudog
1674         for sg = sched.sudogcache; sg != nil; sg = sgnext {
1675                 sgnext = sg.next
1676                 sg.next = nil
1677         }
1678         sched.sudogcache = nil
1679         unlock(&sched.sudoglock)
1680
1681         // Clear central defer pool.
1682         // Leave per-P pools alone, they have strictly bounded size.
1683         lock(&sched.deferlock)
1684         // disconnect cached list before dropping it on the floor,
1685         // so that a dangling ref to one entry does not pin all of them.
1686         var d, dlink *_defer
1687         for d = sched.deferpool; d != nil; d = dlink {
1688                 dlink = d.link
1689                 d.link = nil
1690         }
1691         sched.deferpool = nil
1692         unlock(&sched.deferlock)
1693 }
1694
1695 // Timing
1696
// itoaDiv formats val/(10**dec) into buf.
//
// The digits are written right-aligned at the end of buf and the
// returned slice is the suffix of buf holding the formatted text. When
// dec > 0 the result always has exactly dec digits after the decimal
// point and at least one digit before it (e.g. val=12, dec=3 gives
// "0.012"). When dec <= 0 no decimal point is written and the result
// is simply val in decimal, with no padding (e.g. val=5, dec=0 gives
// "5").
//
// buf must be large enough for the formatted text; otherwise the
// function panics with an index out of range.
func itoaDiv(buf []byte, val uint64, dec int) []byte {
	i := len(buf) - 1
	idec := i - dec
	// Emit low-order digits while more than one digit of val remains.
	// When a fractional part was requested, also keep going (emitting
	// zeros if val has run out) until the decimal point has been
	// placed. The dec > 0 guard matters: without it a plain integer
	// below 10 would get a spurious leading zero, since i starts out
	// equal to idec when dec == 0.
	for val >= 10 || (dec > 0 && i >= idec) {
		buf[i] = byte(val%10 + '0')
		i--
		if i == idec {
			buf[i] = '.'
			i--
		}
		val /= 10
	}
	// The most significant digit (or the lone "0" in front of the
	// decimal point).
	buf[i] = byte(val + '0')
	return buf[i:]
}
1713
1714 // fmtNSAsMS nicely formats ns nanoseconds as milliseconds.
1715 func fmtNSAsMS(buf []byte, ns uint64) []byte {
1716         if ns >= 10e6 {
1717                 // Format as whole milliseconds.
1718                 return itoaDiv(buf, ns/1e6, 0)
1719         }
1720         // Format two digits of precision, with at most three decimal places.
1721         x := ns / 1e3
1722         if x == 0 {
1723                 buf[0] = '0'
1724                 return buf[:1]
1725         }
1726         dec := 3
1727         for x >= 100 {
1728                 x /= 10
1729                 dec--
1730         }
1731         return itoaDiv(buf, x, dec)
1732 }
1733
1734 // Helpers for testing GC.
1735
1736 // gcTestMoveStackOnNextCall causes the stack to be moved on a call
1737 // immediately following the call to this. It may not work correctly
1738 // if any other work appears after this call (such as returning).
1739 // Typically the following call should be marked go:noinline so it
1740 // performs a stack check.
1741 //
1742 // In rare cases this may not cause the stack to move, specifically if
1743 // there's a preemption between this call and the next.
1744 func gcTestMoveStackOnNextCall() {
1745         gp := getg()
1746         gp.stackguard0 = stackForceMove
1747 }
1748
1749 // gcTestIsReachable performs a GC and returns a bit set where bit i
1750 // is set if ptrs[i] is reachable.
1751 func gcTestIsReachable(ptrs ...unsafe.Pointer) (mask uint64) {
1752         // This takes the pointers as unsafe.Pointers in order to keep
1753         // them live long enough for us to attach specials. After
1754         // that, we drop our references to them.
1755
1756         if len(ptrs) > 64 {
1757                 panic("too many pointers for uint64 mask")
1758         }
1759
1760         // Block GC while we attach specials and drop our references
1761         // to ptrs. Otherwise, if a GC is in progress, it could mark
1762         // them reachable via this function before we have a chance to
1763         // drop them.
1764         semacquire(&gcsema)
1765
1766         // Create reachability specials for ptrs.
1767         specials := make([]*specialReachable, len(ptrs))
1768         for i, p := range ptrs {
1769                 lock(&mheap_.speciallock)
1770                 s := (*specialReachable)(mheap_.specialReachableAlloc.alloc())
1771                 unlock(&mheap_.speciallock)
1772                 s.special.kind = _KindSpecialReachable
1773                 if !addspecial(p, &s.special) {
1774                         throw("already have a reachable special (duplicate pointer?)")
1775                 }
1776                 specials[i] = s
1777                 // Make sure we don't retain ptrs.
1778                 ptrs[i] = nil
1779         }
1780
1781         semrelease(&gcsema)
1782
1783         // Force a full GC and sweep.
1784         GC()
1785
1786         // Process specials.
1787         for i, s := range specials {
1788                 if !s.done {
1789                         printlock()
1790                         println("runtime: object", i, "was not swept")
1791                         throw("IsReachable failed")
1792                 }
1793                 if s.reachable {
1794                         mask |= 1 << i
1795                 }
1796                 lock(&mheap_.speciallock)
1797                 mheap_.specialReachableAlloc.free(unsafe.Pointer(s))
1798                 unlock(&mheap_.speciallock)
1799         }
1800
1801         return mask
1802 }
1803
1804 // gcTestPointerClass returns the category of what p points to, one of:
1805 // "heap", "stack", "data", "bss", "other". This is useful for checking
1806 // that a test is doing what it's intended to do.
1807 //
1808 // This is nosplit simply to avoid extra pointer shuffling that may
1809 // complicate a test.
1810 //
1811 //go:nosplit
1812 func gcTestPointerClass(p unsafe.Pointer) string {
1813         p2 := uintptr(noescape(p))
1814         gp := getg()
1815         if gp.stack.lo <= p2 && p2 < gp.stack.hi {
1816                 return "stack"
1817         }
1818         if base, _, _ := findObject(p2, 0, 0); base != 0 {
1819                 return "heap"
1820         }
1821         for _, datap := range activeModules() {
1822                 if datap.data <= p2 && p2 < datap.edata || datap.noptrdata <= p2 && p2 < datap.enoptrdata {
1823                         return "data"
1824                 }
1825                 if datap.bss <= p2 && p2 < datap.ebss || datap.noptrbss <= p2 && p2 <= datap.enoptrbss {
1826                         return "bss"
1827                 }
1828         }
1829         KeepAlive(p)
1830         return "other"
1831 }