src/runtime/mgc.go

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Garbage collector (GC).
   6 //
   7 // The GC runs concurrently with mutator threads, is type accurate (aka precise), allows multiple
   8 // GC thread to run in parallel. It is a concurrent mark and sweep that uses a write barrier. It is
   9 // non-generational and non-compacting. Allocation is done using size segregated per P allocation
  10 // areas to minimize fragmentation while eliminating locks in the common case.
  11 //
  12 // The algorithm decomposes into several steps.
  13 // This is a high level description of the algorithm being used. For an overview of GC a good
  14 // place to start is Richard Jones' gchandbook.org.
  15 //
  16 // The algorithm's intellectual heritage includes Dijkstra's on-the-fly algorithm, see
  17 // Edsger W. Dijkstra, Leslie Lamport, A. J. Martin, C. S. Scholten, and E. F. M. Steffens. 1978.
  18 // On-the-fly garbage collection: an exercise in cooperation. Commun. ACM 21, 11 (November 1978),
  19 // 966-975.
  20 // For journal quality proofs that these steps are complete, correct, and terminate see
  21 // Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world.
  22 // Concurrency and Computation: Practice and Experience 15(3-5), 2003.
  23 //
  24 // 1. GC performs sweep termination.
  25 //
  26 //    a. Stop the world. This causes all Ps to reach a GC safe-point.
  27 //
  28 //    b. Sweep any unswept spans. There will only be unswept spans if
  29 //    this GC cycle was forced before the expected time.
  30 //
  31 // 2. GC performs the mark phase.
  32 //
  33 //    a. Prepare for the mark phase by setting gcphase to _GCmark
  34 //    (from _GCoff), enabling the write barrier, enabling mutator
  35 //    assists, and enqueueing root mark jobs. No objects may be
  36 //    scanned until all Ps have enabled the write barrier, which is
  37 //    accomplished using STW.
  38 //
  39 //    b. Start the world. From this point, GC work is done by mark
  40 //    workers started by the scheduler and by assists performed as
  41 //    part of allocation. The write barrier shades both the
  42 //    overwritten pointer and the new pointer value for any pointer
  43 //    writes (see mbarrier.go for details). Newly allocated objects
  44 //    are immediately marked black.
  45 //
  46 //    c. GC performs root marking jobs. This includes scanning all
  47 //    stacks, shading all globals, and shading any heap pointers in
  48 //    off-heap runtime data structures. Scanning a stack stops a
  49 //    goroutine, shades any pointers found on its stack, and then
  50 //    resumes the goroutine.
  51 //
  52 //    d. GC drains the work queue of grey objects, scanning each grey
  53 //    object to black and shading all pointers found in the object
  54 //    (which in turn may add those pointers to the work queue).
  55 //
  56 //    e. Because GC work is spread across local caches, GC uses a
  57 //    distributed termination algorithm to detect when there are no
  58 //    more root marking jobs or grey objects (see gcMarkDone). At this
  59 //    point, GC transitions to mark termination.
  60 //
  61 // 3. GC performs mark termination.
  62 //
  63 //    a. Stop the world.
  64 //
  65 //    b. Set gcphase to _GCmarktermination, and disable workers and
  66 //    assists.
  67 //
  68 //    c. Perform housekeeping like flushing mcaches.
  69 //
  70 // 4. GC performs the sweep phase.
  71 //
  72 //    a. Prepare for the sweep phase by setting gcphase to _GCoff,
  73 //    setting up sweep state and disabling the write barrier.
  74 //
  75 //    b. Start the world. From this point on, newly allocated objects
  76 //    are white, and allocating sweeps spans before use if necessary.
  77 //
  78 //    c. GC does concurrent sweeping in the background and in response
  79 //    to allocation. See description below.
  80 //
  81 // 5. When sufficient allocation has taken place, replay the sequence
  82 // starting with 1 above. See discussion of GC rate below.
  83
  84 // Concurrent sweep.
  85 //
  86 // The sweep phase proceeds concurrently with normal program execution.
  87 // The heap is swept span-by-span both lazily (when a goroutine needs another span)
  88 // and concurrently in a background goroutine (this helps programs that are not CPU bound).
  89 // At the end of STW mark termination all spans are marked as "needs sweeping".
  90 //
  91 // The background sweeper goroutine simply sweeps spans one-by-one.
  92 //
  93 // To avoid requesting more OS memory while there are unswept spans, when a
  94 // goroutine needs another span, it first attempts to reclaim that much memory
  95 // by sweeping. When a goroutine needs to allocate a new small-object span, it
  96 // sweeps small-object spans for the same object size until it frees at least
  97 // one object. When a goroutine needs to allocate large-object span from heap,
  98 // it sweeps spans until it frees at least that many pages into heap. There is
  99 // one case where this may not suffice: if a goroutine sweeps and frees two
 100 // nonadjacent one-page spans to the heap, it will allocate a new two-page
 101 // span, but there can still be other one-page unswept spans which could be
 102 // combined into a two-page span.
 103 //
 104 // It's critical to ensure that no operations proceed on unswept spans (that would corrupt
 105 // mark bits in GC bitmap). During GC all mcaches are flushed into the central cache,
 106 // so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
 107 // When a goroutine explicitly frees an object or sets a finalizer, it ensures that
 108 // the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
 109 // The finalizer goroutine is kicked off only when all spans are swept.
 110 // When the next GC starts, it sweeps all not-yet-swept spans (if any).
 111
 112 // GC rate.
 113 // Next GC is after we've allocated an extra amount of memory proportional to
 114 // the amount already in use. The proportion is controlled by GOGC environment variable
 115 // (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
 116 // (this mark is computed by the gcController.heapGoal method). This keeps the GC cost in
 117 // linear proportion to the allocation cost. Adjusting GOGC just changes the linear constant
 118 // (and also the amount of extra memory used).
 119
 120 // Oblets
 121 //
 122 // In order to prevent long pauses while scanning large objects and to
 123 // improve parallelism, the garbage collector breaks up scan jobs for
 124 // objects larger than maxObletBytes into "oblets" of at most
 125 // maxObletBytes. When scanning encounters the beginning of a large
 126 // object, it scans only the first oblet and enqueues the remaining
 127 // oblets as new scan jobs.
 128
 129 package runtime
 130
 131 import (
 132         "internal/cpu"
 133         "runtime/internal/atomic"
 134         "unsafe"
 135 )
 136
 137 const (
 138         _DebugGC         = 0
 139         _ConcurrentSweep = true
 140         _FinBlockSize    = 4 * 1024
 141
 142         // debugScanConservative enables debug logging for stack
 143         // frames that are scanned conservatively.
 144         debugScanConservative = false
 145
 146         // sweepMinHeapDistance is a lower bound on the heap distance
 147         // (in bytes) reserved for concurrent sweeping between GC
 148         // cycles.
 149         sweepMinHeapDistance = 1024 * 1024
 150 )
 151
 152 func gcinit() {
 153         if unsafe.Sizeof(workbuf{}) != _WorkbufSize {
 154                 throw("size of Workbuf is suboptimal")
 155         }
 156         // No sweep on the first cycle.
 157         sweep.active.state.Store(sweepDrainedMask)
 158
 159         // Initialize GC pacer state.
 160         // Use the environment variable GOGC for the initial gcPercent value.
 161         // Use the environment variable GOMEMLIMIT for the initial memoryLimit value.
 162         gcController.init(readGOGC(), readGOMEMLIMIT())
 163
 164         work.startSema = 1
 165         work.markDoneSema = 1
 166         lockInit(&work.sweepWaiters.lock, lockRankSweepWaiters)
 167         lockInit(&work.assistQueue.lock, lockRankAssistQueue)
 168         lockInit(&work.wbufSpans.lock, lockRankWbufSpans)
 169 }
 170
 171 // gcenable is called after the bulk of the runtime initialization,
 172 // just before we're about to start letting user code run.
 173 // It kicks off the background sweeper goroutine, the background
 174 // scavenger goroutine, and enables GC.
 175 func gcenable() {
 176         // Kick off sweeping and scavenging.
 177         c := make(chan int, 2)
 178         go bgsweep(c)
 179         go bgscavenge(c)
 180         <-c
 181         <-c
 182         memstats.enablegc = true // now that runtime is initialized, GC is okay
 183 }
 184
 185 // Garbage collector phase.
 186 // Indicates to write barrier and synchronization task to perform.
 187 var gcphase uint32
 188
 189 // The compiler knows about this variable.
 190 // If you change it, you must change builtin/runtime.go, too.
 191 // If you change the first four bytes, you must also change the write
 192 // barrier insertion code.
 193 var writeBarrier struct {
 194         enabled bool    // compiler emits a check of this before calling write barrier
 195         pad     [3]byte // compiler uses 32-bit load for "enabled" field
 196         needed  bool    // whether we need a write barrier for current GC phase
 197         cgo     bool    // whether we need a write barrier for a cgo check
 198         alignme uint64  // guarantee alignment so that compiler can use a 32 or 64-bit load
 199 }
 200
 201 // gcBlackenEnabled is 1 if mutator assists and background mark
 202 // workers are allowed to blacken objects. This must only be set when
 203 // gcphase == _GCmark.
 204 var gcBlackenEnabled uint32
 205
 206 const (
 207         _GCoff             = iota // GC not running; sweeping in background, write barrier disabled
 208         _GCmark                   // GC marking roots and workbufs: allocate black, write barrier ENABLED
 209         _GCmarktermination        // GC mark termination: allocate black, P's help GC, write barrier ENABLED
 210 )
 211
 212 //go:nosplit
 213 func setGCPhase(x uint32) {
 214         atomic.Store(&gcphase, x)
 215         writeBarrier.needed = gcphase == _GCmark || gcphase == _GCmarktermination
 216         writeBarrier.enabled = writeBarrier.needed || writeBarrier.cgo
 217 }
 218
 219 // gcMarkWorkerMode represents the mode that a concurrent mark worker
 220 // should operate in.
 221 //
 222 // Concurrent marking happens through four different mechanisms. One
 223 // is mutator assists, which happen in response to allocations and are
 224 // not scheduled. The other three are variations in the per-P mark
 225 // workers and are distinguished by gcMarkWorkerMode.
 226 type gcMarkWorkerMode int
 227
 228 const (
 229         // gcMarkWorkerNotWorker indicates that the next scheduled G is not
 230         // starting work and the mode should be ignored.
 231         gcMarkWorkerNotWorker gcMarkWorkerMode = iota
 232
 233         // gcMarkWorkerDedicatedMode indicates that the P of a mark
 234         // worker is dedicated to running that mark worker. The mark
 235         // worker should run without preemption.
 236         gcMarkWorkerDedicatedMode
 237
 238         // gcMarkWorkerFractionalMode indicates that a P is currently
 239         // running the "fractional" mark worker. The fractional worker
 240         // is necessary when GOMAXPROCS*gcBackgroundUtilization is not
 241         // an integer and using only dedicated workers would result in
 242         // utilization too far from the target of gcBackgroundUtilization.
 243         // The fractional worker should run until it is preempted and
 244         // will be scheduled to pick up the fractional part of
 245         // GOMAXPROCS*gcBackgroundUtilization.
 246         gcMarkWorkerFractionalMode
 247
 248         // gcMarkWorkerIdleMode indicates that a P is running the mark
 249         // worker because it has nothing else to do. The idle worker
 250         // should run until it is preempted and account its time
 251         // against gcController.idleMarkTime.
 252         gcMarkWorkerIdleMode
 253 )
 254
 255 // gcMarkWorkerModeStrings are the strings labels of gcMarkWorkerModes
 256 // to use in execution traces.
 257 var gcMarkWorkerModeStrings = [...]string{
 258         "Not worker",
 259         "GC (dedicated)",
 260         "GC (fractional)",
 261         "GC (idle)",
 262 }
 263
 264 // pollFractionalWorkerExit reports whether a fractional mark worker
 265 // should self-preempt. It assumes it is called from the fractional
 266 // worker.
 267 func pollFractionalWorkerExit() bool {
 268         // This should be kept in sync with the fractional worker
 269         // scheduler logic in findRunnableGCWorker.
 270         now := nanotime()
 271         delta := now - gcController.markStartTime
 272         if delta <= 0 {
 273                 return true
 274         }
 275         p := getg().m.p.ptr()
 276         selfTime := p.gcFractionalMarkTime + (now - p.gcMarkWorkerStartTime)
 277         // Add some slack to the utilization goal so that the
 278         // fractional worker isn't behind again the instant it exits.
 279         return float64(selfTime)/float64(delta) > 1.2*gcController.fractionalUtilizationGoal
 280 }
 281
 282 var work workType
 283
 284 type workType struct {
 285         full  lfstack          // lock-free list of full blocks workbuf
 286         empty lfstack          // lock-free list of empty blocks workbuf
 287         pad0  cpu.CacheLinePad // prevents false-sharing between full/empty and nproc/nwait
 288
 289         wbufSpans struct {
 290                 lock mutex
 291                 // free is a list of spans dedicated to workbufs, but
 292                 // that don't currently contain any workbufs.
 293                 free mSpanList
 294                 // busy is a list of all spans containing workbufs on
 295                 // one of the workbuf lists.
 296                 busy mSpanList
 297         }
 298
 299         // Restore 64-bit alignment on 32-bit.
 300         _ uint32
 301
 302         // bytesMarked is the number of bytes marked this cycle. This
 303         // includes bytes blackened in scanned objects, noscan objects
 304         // that go straight to black, and permagrey objects scanned by
 305         // markroot during the concurrent scan phase. This is updated
 306         // atomically during the cycle. Updates may be batched
 307         // arbitrarily, since the value is only read at the end of the
 308         // cycle.
 309         //
 310         // Because of benign races during marking, this number may not
 311         // be the exact number of marked bytes, but it should be very
 312         // close.
 313         //
 314         // Put this field here because it needs 64-bit atomic access
 315         // (and thus 8-byte alignment even on 32-bit architectures).
 316         bytesMarked uint64
 317
 318         markrootNext uint32 // next markroot job
 319         markrootJobs uint32 // number of markroot jobs
 320
 321         nproc  uint32
 322         tstart int64
 323         nwait  uint32
 324
 325         // Number of roots of various root types. Set by gcMarkRootPrepare.
 326         //
 327         // nStackRoots == len(stackRoots), but we have nStackRoots for
 328         // consistency.
 329         nDataRoots, nBSSRoots, nSpanRoots, nStackRoots int
 330
 331         // Base indexes of each root type. Set by gcMarkRootPrepare.
 332         baseData, baseBSS, baseSpans, baseStacks, baseEnd uint32
 333
 334         // stackRoots is a snapshot of all of the Gs that existed
 335         // before the beginning of concurrent marking. The backing
 336         // store of this must not be modified because it might be
 337         // shared with allgs.
 338         stackRoots []*g
 339
 340         // Each type of GC state transition is protected by a lock.
 341         // Since multiple threads can simultaneously detect the state
 342         // transition condition, any thread that detects a transition
 343         // condition must acquire the appropriate transition lock,
 344         // re-check the transition condition and return if it no
 345         // longer holds or perform the transition if it does.
 346         // Likewise, any transition must invalidate the transition
 347         // condition before releasing the lock. This ensures that each
 348         // transition is performed by exactly one thread and threads
 349         // that need the transition to happen block until it has
 350         // happened.
 351         //
 352         // startSema protects the transition from "off" to mark or
 353         // mark termination.
 354         startSema uint32
 355         // markDoneSema protects transitions from mark to mark termination.
 356         markDoneSema uint32
 357
 358         bgMarkReady note   // signal background mark worker has started
 359         bgMarkDone  uint32 // cas to 1 when at a background mark completion point
 360         // Background mark completion signaling
 361
 362         // mode is the concurrency mode of the current GC cycle.
 363         mode gcMode
 364
 365         // userForced indicates the current GC cycle was forced by an
 366         // explicit user call.
 367         userForced bool
 368
 369         // initialHeapLive is the value of gcController.heapLive at the
 370         // beginning of this GC cycle.
 371         initialHeapLive uint64
 372
 373         // assistQueue is a queue of assists that are blocked because
 374         // there was neither enough credit to steal or enough work to
 375         // do.
 376         assistQueue struct {
 377                 lock mutex
 378                 q    gQueue
 379         }
 380
 381         // sweepWaiters is a list of blocked goroutines to wake when
 382         // we transition from mark termination to sweep.
 383         sweepWaiters struct {
 384                 lock mutex
 385                 list gList
 386         }
 387
 388         // cycles is the number of completed GC cycles, where a GC
 389         // cycle is sweep termination, mark, mark termination, and
 390         // sweep. This differs from memstats.numgc, which is
 391         // incremented at mark termination.
 392         cycles atomic.Uint32
 393
 394         // Timing/utilization stats for this cycle.
 395         stwprocs, maxprocs                 int32
 396         tSweepTerm, tMark, tMarkTerm, tEnd int64 // nanotime() of phase start
 397
 398         pauseNS    int64 // total STW time this cycle
 399         pauseStart int64 // nanotime() of last STW
 400
 401         // debug.gctrace heap sizes for this cycle.
 402         heap0, heap1, heap2 uint64
 403
 404         // Cumulative estimated CPU usage.
 405         cpuStats
 406 }
 407
 408 // GC runs a garbage collection and blocks the caller until the
 409 // garbage collection is complete. It may also block the entire
 410 // program.
 411 func GC() {
 412         // We consider a cycle to be: sweep termination, mark, mark
 413         // termination, and sweep. This function shouldn't return
 414         // until a full cycle has been completed, from beginning to
 415         // end. Hence, we always want to finish up the current cycle
 416         // and start a new one. That means:
 417         //
 418         // 1. In sweep termination, mark, or mark termination of cycle
 419         // N, wait until mark termination N completes and transitions
 420         // to sweep N.
 421         //
 422         // 2. In sweep N, help with sweep N.
 423         //
 424         // At this point we can begin a full cycle N+1.
 425         //
 426         // 3. Trigger cycle N+1 by starting sweep termination N+1.
 427         //
 428         // 4. Wait for mark termination N+1 to complete.
 429         //
 430         // 5. Help with sweep N+1 until it's done.
 431         //
 432         // This all has to be written to deal with the fact that the
 433         // GC may move ahead on its own. For example, when we block
 434         // until mark termination N, we may wake up in cycle N+2.
 435
 436         // Wait until the current sweep termination, mark, and mark
 437         // termination complete.
 438         n := work.cycles.Load()
 439         gcWaitOnMark(n)
 440
 441         // We're now in sweep N or later. Trigger GC cycle N+1, which
 442         // will first finish sweep N if necessary and then enter sweep
 443         // termination N+1.
 444         gcStart(gcTrigger{kind: gcTriggerCycle, n: n + 1})
 445
 446         // Wait for mark termination N+1 to complete.
 447         gcWaitOnMark(n + 1)
 448
 449         // Finish sweep N+1 before returning. We do this both to
 450         // complete the cycle and because runtime.GC() is often used
 451         // as part of tests and benchmarks to get the system into a
 452         // relatively stable and isolated state.
 453         for work.cycles.Load() == n+1 && sweepone() != ^uintptr(0) {
 454                 sweep.nbgsweep++
 455                 Gosched()
 456         }
 457
 458         // Callers may assume that the heap profile reflects the
 459         // just-completed cycle when this returns (historically this
 460         // happened because this was a STW GC), but right now the
 461         // profile still reflects mark termination N, not N+1.
 462         //
 463         // As soon as all of the sweep frees from cycle N+1 are done,
 464         // we can go ahead and publish the heap profile.
 465         //
 466         // First, wait for sweeping to finish. (We know there are no
 467         // more spans on the sweep queue, but we may be concurrently
 468         // sweeping spans, so we have to wait.)
 469         for work.cycles.Load() == n+1 && !isSweepDone() {
 470                 Gosched()
 471         }
 472
 473         // Now we're really done with sweeping, so we can publish the
 474         // stable heap profile. Only do this if we haven't already hit
 475         // another mark termination.
 476         mp := acquirem()
 477         cycle := work.cycles.Load()
 478         if cycle == n+1 || (gcphase == _GCmark && cycle == n+2) {
 479                 mProf_PostSweep()
 480         }
 481         releasem(mp)
 482 }
 483
 484 // gcWaitOnMark blocks until GC finishes the Nth mark phase. If GC has
 485 // already completed this mark phase, it returns immediately.
 486 func gcWaitOnMark(n uint32) {
 487         for {
 488                 // Disable phase transitions.
 489                 lock(&work.sweepWaiters.lock)
 490                 nMarks := work.cycles.Load()
 491                 if gcphase != _GCmark {
 492                         // We've already completed this cycle's mark.
 493                         nMarks++
 494                 }
 495                 if nMarks > n {
 496                         // We're done.
 497                         unlock(&work.sweepWaiters.lock)
 498                         return
 499                 }
 500
 501                 // Wait until sweep termination, mark, and mark
 502                 // termination of cycle N complete.
 503                 work.sweepWaiters.list.push(getg())
 504                 goparkunlock(&work.sweepWaiters.lock, waitReasonWaitForGCCycle, traceEvGoBlock, 1)
 505         }
 506 }
 507
 508 // gcMode indicates how concurrent a GC cycle should be.
 509 type gcMode int
 510
 511 const (
 512         gcBackgroundMode gcMode = iota // concurrent GC and sweep
 513         gcForceMode                    // stop-the-world GC now, concurrent sweep
 514         gcForceBlockMode               // stop-the-world GC now and STW sweep (forced by user)
 515 )
 516
 517 // A gcTrigger is a predicate for starting a GC cycle. Specifically,
 518 // it is an exit condition for the _GCoff phase.
 519 type gcTrigger struct {
 520         kind gcTriggerKind
 521         now  int64  // gcTriggerTime: current time
 522         n    uint32 // gcTriggerCycle: cycle number to start
 523 }
 524
 525 type gcTriggerKind int
 526
 527 const (
 528         // gcTriggerHeap indicates that a cycle should be started when
 529         // the heap size reaches the trigger heap size computed by the
 530         // controller.
 531         gcTriggerHeap gcTriggerKind = iota
 532
 533         // gcTriggerTime indicates that a cycle should be started when
 534         // it's been more than forcegcperiod nanoseconds since the
 535         // previous GC cycle.
 536         gcTriggerTime
 537
 538         // gcTriggerCycle indicates that a cycle should be started if
 539         // we have not yet started cycle number gcTrigger.n (relative
 540         // to work.cycles).
 541         gcTriggerCycle
 542 )
 543
 544 // test reports whether the trigger condition is satisfied, meaning
 545 // that the exit condition for the _GCoff phase has been met. The exit
 546 // condition should be tested when allocating.
 547 func (t gcTrigger) test() bool {
 548         if !memstats.enablegc || panicking.Load() != 0 || gcphase != _GCoff {
 549                 return false
 550         }
 551         switch t.kind {
 552         case gcTriggerHeap:
 553                 // Non-atomic access to gcController.heapLive for performance. If
 554                 // we are going to trigger on this, this thread just
 555                 // atomically wrote gcController.heapLive anyway and we'll see our
 556                 // own write.
 557                 trigger, _ := gcController.trigger()
 558                 return gcController.heapLive.Load() >= trigger
 559         case gcTriggerTime:
 560                 if gcController.gcPercent.Load() < 0 {
 561                         return false
 562                 }
 563                 lastgc := int64(atomic.Load64(&memstats.last_gc_nanotime))
 564                 return lastgc != 0 && t.now-lastgc > forcegcperiod
 565         case gcTriggerCycle:
 566                 // t.n > work.cycles, but accounting for wraparound.
 567                 return int32(t.n-work.cycles.Load()) > 0
 568         }
 569         return true
 570 }
 571
 572 // gcStart starts the GC. It transitions from _GCoff to _GCmark (if
 573 // debug.gcstoptheworld == 0) or performs all of GC (if
 574 // debug.gcstoptheworld != 0).
 575 //
 576 // This may return without performing this transition in some cases,
 577 // such as when called on a system stack or with locks held.
 578 func gcStart(trigger gcTrigger) {
 579         // Since this is called from malloc and malloc is called in
 580         // the guts of a number of libraries that might be holding
 581         // locks, don't attempt to start GC in non-preemptible or
 582         // potentially unstable situations.
 583         mp := acquirem()
 584         if gp := getg(); gp == mp.g0 || mp.locks > 1 || mp.preemptoff != "" {
 585                 releasem(mp)
 586                 return
 587         }
 588         releasem(mp)
 589         mp = nil
 590
 591         // Pick up the remaining unswept/not being swept spans concurrently
 592         //
 593         // This shouldn't happen if we're being invoked in background
 594         // mode since proportional sweep should have just finished
 595         // sweeping everything, but rounding errors, etc, may leave a
 596         // few spans unswept. In forced mode, this is necessary since
 597         // GC can be forced at any point in the sweeping cycle.
 598         //
 599         // We check the transition condition continuously here in case
 600         // this G gets delayed in to the next GC cycle.
 601         for trigger.test() && sweepone() != ^uintptr(0) {
 602                 sweep.nbgsweep++
 603         }
 604
 605         // Perform GC initialization and the sweep termination
 606         // transition.
 607         semacquire(&work.startSema)
 608         // Re-check transition condition under transition lock.
 609         if !trigger.test() {
 610                 semrelease(&work.startSema)
 611                 return
 612         }
 613
 614         // In gcstoptheworld debug mode, upgrade the mode accordingly.
 615         // We do this after re-checking the transition condition so
 616         // that multiple goroutines that detect the heap trigger don't
 617         // start multiple STW GCs.
 618         mode := gcBackgroundMode
 619         if debug.gcstoptheworld == 1 {
 620                 mode = gcForceMode
 621         } else if debug.gcstoptheworld == 2 {
 622                 mode = gcForceBlockMode
 623         }
 624
 625         // Ok, we're doing it! Stop everybody else
 626         semacquire(&gcsema)
 627         semacquire(&worldsema)
 628
 629         // For stats, check if this GC was forced by the user.
 630         // Update it under gcsema to avoid gctrace getting wrong values.
 631         work.userForced = trigger.kind == gcTriggerCycle
 632
 633         if trace.enabled {
 634                 traceGCStart()
 635         }
 636
 637         // Check that all Ps have finished deferred mcache flushes.
 638         for _, p := range allp {
 639                 if fg := p.mcache.flushGen.Load(); fg != mheap_.sweepgen {
 640                         println("runtime: p", p.id, "flushGen", fg, "!= sweepgen", mheap_.sweepgen)
 641                         throw("p mcache not flushed")
 642                 }
 643         }
 644
 645         gcBgMarkStartWorkers()
 646
 647         systemstack(gcResetMarkState)
 648
 649         work.stwprocs, work.maxprocs = gomaxprocs, gomaxprocs
 650         if work.stwprocs > ncpu {
 651                 // This is used to compute CPU time of the STW phases,
 652                 // so it can't be more than ncpu, even if GOMAXPROCS is.
 653                 work.stwprocs = ncpu
 654         }
 655         work.heap0 = gcController.heapLive.Load()
 656         work.pauseNS = 0
 657         work.mode = mode
 658
 659         now := nanotime()
 660         work.tSweepTerm = now
 661         work.pauseStart = now
 662         if trace.enabled {
 663                 traceGCSTWStart(1)
 664         }
 665         systemstack(stopTheWorldWithSema)
 666         // Finish sweep before we start concurrent scan.
 667         systemstack(func() {
 668                 finishsweep_m()
 669         })
 670
 671         // clearpools before we start the GC. If we wait they memory will not be
 672         // reclaimed until the next GC cycle.
 673         clearpools()
 674
 675         work.cycles.Add(1)
 676
 677         // Assists and workers can start the moment we start
 678         // the world.
 679         gcController.startCycle(now, int(gomaxprocs), trigger)
 680
 681         // Notify the CPU limiter that assists may begin.
 682         gcCPULimiter.startGCTransition(true, now)
 683
 684         // In STW mode, disable scheduling of user Gs. This may also
 685         // disable scheduling of this goroutine, so it may block as
 686         // soon as we start the world again.
 687         if mode != gcBackgroundMode {
 688                 schedEnableUser(false)
 689         }
 690
 691         // Enter concurrent mark phase and enable
 692         // write barriers.
 693         //
 694         // Because the world is stopped, all Ps will
 695         // observe that write barriers are enabled by
 696         // the time we start the world and begin
 697         // scanning.
 698         //
 699         // Write barriers must be enabled before assists are
 700         // enabled because they must be enabled before
 701         // any non-leaf heap objects are marked. Since
 702         // allocations are blocked until assists can
 703         // happen, we want enable assists as early as
 704         // possible.
 705         setGCPhase(_GCmark)
 706
 707         gcBgMarkPrepare() // Must happen before assist enable.
 708         gcMarkRootPrepare()
 709
 710         // Mark all active tinyalloc blocks. Since we're
 711         // allocating from these, they need to be black like
 712         // other allocations. The alternative is to blacken
 713         // the tiny block on every allocation from it, which
 714         // would slow down the tiny allocator.
 715         gcMarkTinyAllocs()
 716
 717         // At this point all Ps have enabled the write
 718         // barrier, thus maintaining the no white to
 719         // black invariant. Enable mutator assists to
 720         // put back-pressure on fast allocating
 721         // mutators.
 722         atomic.Store(&gcBlackenEnabled, 1)
 723
 724         // In STW mode, we could block the instant systemstack
 725         // returns, so make sure we're not preemptible.
 726         mp = acquirem()
 727
 728         // Concurrent mark.
 729         systemstack(func() {
 730                 now = startTheWorldWithSema(trace.enabled)
 731                 work.pauseNS += now - work.pauseStart
 732                 work.tMark = now
 733                 memstats.gcPauseDist.record(now - work.pauseStart)
 734
 735                 // Release the CPU limiter.
 736                 gcCPULimiter.finishGCTransition(now)
 737         })
 738
 739         // Release the world sema before Gosched() in STW mode
 740         // because we will need to reacquire it later but before
 741         // this goroutine becomes runnable again, and we could
 742         // self-deadlock otherwise.
 743         semrelease(&worldsema)
 744         releasem(mp)
 745
 746         // Make sure we block instead of returning to user code
 747         // in STW mode.
 748         if mode != gcBackgroundMode {
 749                 Gosched()
 750         }
 751
 752         semrelease(&work.startSema)
 753 }
 754
 755 // gcMarkDoneFlushed counts the number of P's with flushed work.
 756 //
 757 // Ideally this would be a captured local in gcMarkDone, but forEachP
 758 // escapes its callback closure, so it can't capture anything.
 759 //
 760 // This is protected by markDoneSema.
 761 var gcMarkDoneFlushed uint32
 762
 763 // gcMarkDone transitions the GC from mark to mark termination if all
 764 // reachable objects have been marked (that is, there are no grey
 765 // objects and can be no more in the future). Otherwise, it flushes
 766 // all local work to the global queues where it can be discovered by
 767 // other workers.
 768 //
 769 // This should be called when all local mark work has been drained and
 770 // there are no remaining workers. Specifically, when
 771 //
 772 //      work.nwait == work.nproc && !gcMarkWorkAvailable(p)
 773 //
 774 // The calling context must be preemptible.
 775 //
 776 // Flushing local work is important because idle Ps may have local
 777 // work queued. This is the only way to make that work visible and
 778 // drive GC to completion.
 779 //
 780 // It is explicitly okay to have write barriers in this function. If
 781 // it does transition to mark termination, then all reachable objects
 782 // have been marked, so the write barrier cannot shade any more
 783 // objects.
 784 func gcMarkDone() {
 785         // Ensure only one thread is running the ragged barrier at a
 786         // time.
 787         semacquire(&work.markDoneSema)
 788
 789 top:
 790         // Re-check transition condition under transition lock.
 791         //
 792         // It's critical that this checks the global work queues are
 793         // empty before performing the ragged barrier. Otherwise,
 794         // there could be global work that a P could take after the P
 795         // has passed the ragged barrier.
 796         if !(gcphase == _GCmark && work.nwait == work.nproc && !gcMarkWorkAvailable(nil)) {
 797                 semrelease(&work.markDoneSema)
 798                 return
 799         }
 800
 801         // forEachP needs worldsema to execute, and we'll need it to
 802         // stop the world later, so acquire worldsema now.
 803         semacquire(&worldsema)
 804
 805         // Flush all local buffers and collect flushedWork flags.
 806         gcMarkDoneFlushed = 0
 807         systemstack(func() {
 808                 gp := getg().m.curg
 809                 // Mark the user stack as preemptible so that it may be scanned.
 810                 // Otherwise, our attempt to force all P's to a safepoint could
 811                 // result in a deadlock as we attempt to preempt a worker that's
 812                 // trying to preempt us (e.g. for a stack scan).
 813                 casGToWaiting(gp, _Grunning, waitReasonGCMarkTermination)
 814                 forEachP(func(pp *p) {
 815                         // Flush the write barrier buffer, since this may add
 816                         // work to the gcWork.
 817                         wbBufFlush1(pp)
 818
 819                         // Flush the gcWork, since this may create global work
 820                         // and set the flushedWork flag.
 821                         //
 822                         // TODO(austin): Break up these workbufs to
 823                         // better distribute work.
 824                         pp.gcw.dispose()
 825                         // Collect the flushedWork flag.
 826                         if pp.gcw.flushedWork {
 827                                 atomic.Xadd(&gcMarkDoneFlushed, 1)
 828                                 pp.gcw.flushedWork = false
 829                         }
 830                 })
 831                 casgstatus(gp, _Gwaiting, _Grunning)
 832         })
 833
 834         if gcMarkDoneFlushed != 0 {
 835                 // More grey objects were discovered since the
 836                 // previous termination check, so there may be more
 837                 // work to do. Keep going. It's possible the
 838                 // transition condition became true again during the
 839                 // ragged barrier, so re-check it.
 840                 semrelease(&worldsema)
 841                 goto top
 842         }
 843
 844         // There was no global work, no local work, and no Ps
 845         // communicated work since we took markDoneSema. Therefore
 846         // there are no grey objects and no more objects can be
 847         // shaded. Transition to mark termination.
 848         now := nanotime()
 849         work.tMarkTerm = now
 850         work.pauseStart = now
 851         getg().m.preemptoff = "gcing"
 852         if trace.enabled {
 853                 traceGCSTWStart(0)
 854         }
 855         systemstack(stopTheWorldWithSema)
 856         // The gcphase is _GCmark, it will transition to _GCmarktermination
 857         // below. The important thing is that the wb remains active until
 858         // all marking is complete. This includes writes made by the GC.
 859
 860         // There is sometimes work left over when we enter mark termination due
 861         // to write barriers performed after the completion barrier above.
 862         // Detect this and resume concurrent mark. This is obviously
 863         // unfortunate.
 864         //
 865         // See issue #27993 for details.
 866         //
 867         // Switch to the system stack to call wbBufFlush1, though in this case
 868         // it doesn't matter because we're non-preemptible anyway.
 869         restart := false
 870         systemstack(func() {
 871                 for _, p := range allp {
 872                         wbBufFlush1(p)
 873                         if !p.gcw.empty() {
 874                                 restart = true
 875                                 break
 876                         }
 877                 }
 878         })
 879         if restart {
 880                 getg().m.preemptoff = ""
 881                 systemstack(func() {
 882                         now := startTheWorldWithSema(trace.enabled)
 883                         work.pauseNS += now - work.pauseStart
 884                         memstats.gcPauseDist.record(now - work.pauseStart)
 885                 })
 886                 semrelease(&worldsema)
 887                 goto top
 888         }
 889
 890         gcComputeStartingStackSize()
 891
 892         // Disable assists and background workers. We must do
 893         // this before waking blocked assists.
 894         atomic.Store(&gcBlackenEnabled, 0)
 895
 896         // Notify the CPU limiter that GC assists will now cease.
 897         gcCPULimiter.startGCTransition(false, now)
 898
 899         // Wake all blocked assists. These will run when we
 900         // start the world again.
 901         gcWakeAllAssists()
 902
 903         // Likewise, release the transition lock. Blocked
 904         // workers and assists will run when we start the
 905         // world again.
 906         semrelease(&work.markDoneSema)
 907
 908         // In STW mode, re-enable user goroutines. These will be
 909         // queued to run after we start the world.
 910         schedEnableUser(true)
 911
 912         // endCycle depends on all gcWork cache stats being flushed.
 913         // The termination algorithm above ensured that up to
 914         // allocations since the ragged barrier.
 915         gcController.endCycle(now, int(gomaxprocs), work.userForced)
 916
 917         // Perform mark termination. This will restart the world.
 918         gcMarkTermination()
 919 }
 920
 921 // World must be stopped and mark assists and background workers must be
 922 // disabled.
 923 func gcMarkTermination() {
 924         // Start marktermination (write barrier remains enabled for now).
 925         setGCPhase(_GCmarktermination)
 926
 927         work.heap1 = gcController.heapLive.Load()
 928         startTime := nanotime()
 929
 930         mp := acquirem()
 931         mp.preemptoff = "gcing"
 932         mp.traceback = 2
 933         curgp := mp.curg
 934         casGToWaiting(curgp, _Grunning, waitReasonGarbageCollection)
 935
 936         // Run gc on the g0 stack. We do this so that the g stack
 937         // we're currently running on will no longer change. Cuts
 938         // the root set down a bit (g0 stacks are not scanned, and
 939         // we don't need to scan gc's internal state).  We also
 940         // need to switch to g0 so we can shrink the stack.
 941         systemstack(func() {
 942                 gcMark(startTime)
 943                 // Must return immediately.
 944                 // The outer function's stack may have moved
 945                 // during gcMark (it shrinks stacks, including the
 946                 // outer function's stack), so we must not refer
 947                 // to any of its variables. Return back to the
 948                 // non-system stack to pick up the new addresses
 949                 // before continuing.
 950         })
 951
 952         systemstack(func() {
 953                 work.heap2 = work.bytesMarked
 954                 if debug.gccheckmark > 0 {
 955                         // Run a full non-parallel, stop-the-world
 956                         // mark using checkmark bits, to check that we
 957                         // didn't forget to mark anything during the
 958                         // concurrent mark process.
 959                         startCheckmarks()
 960                         gcResetMarkState()
 961                         gcw := &getg().m.p.ptr().gcw
 962                         gcDrain(gcw, 0)
 963                         wbBufFlush1(getg().m.p.ptr())
 964                         gcw.dispose()
 965                         endCheckmarks()
 966                 }
 967
 968                 // marking is complete so we can turn the write barrier off
 969                 setGCPhase(_GCoff)
 970                 gcSweep(work.mode)
 971         })
 972
 973         mp.traceback = 0
 974         casgstatus(curgp, _Gwaiting, _Grunning)
 975
 976         if trace.enabled {
 977                 traceGCDone()
 978         }
 979
 980         // all done
 981         mp.preemptoff = ""
 982
 983         if gcphase != _GCoff {
 984                 throw("gc done but gcphase != _GCoff")
 985         }
 986
 987         // Record heapInUse for scavenger.
 988         memstats.lastHeapInUse = gcController.heapInUse.load()
 989
 990         // Update GC trigger and pacing, as well as downstream consumers
 991         // of this pacing information, for the next cycle.
 992         systemstack(gcControllerCommit)
 993
 994         // Update timing memstats
 995         now := nanotime()
 996         sec, nsec, _ := time_now()
 997         unixNow := sec*1e9 + int64(nsec)
 998         work.pauseNS += now - work.pauseStart
 999         work.tEnd = now
1000         memstats.gcPauseDist.record(now - work.pauseStart)
1001         atomic.Store64(&memstats.last_gc_unix, uint64(unixNow)) // must be Unix time to make sense to user
1002         atomic.Store64(&memstats.last_gc_nanotime, uint64(now)) // monotonic time for us
1003         memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
1004         memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
1005         memstats.pause_total_ns += uint64(work.pauseNS)
1006
1007         sweepTermCpu := int64(work.stwprocs) * (work.tMark - work.tSweepTerm)
1008         // We report idle marking time below, but omit it from the
1009         // overall utilization here since it's "free".
1010         markAssistCpu := gcController.assistTime.Load()
1011         markDedicatedCpu := gcController.dedicatedMarkTime.Load()
1012         markFractionalCpu := gcController.fractionalMarkTime.Load()
1013         markIdleCpu := gcController.idleMarkTime.Load()
1014         markTermCpu := int64(work.stwprocs) * (work.tEnd - work.tMarkTerm)
1015         scavAssistCpu := scavenge.assistTime.Load()
1016         scavBgCpu := scavenge.backgroundTime.Load()
1017
1018         // Update cumulative GC CPU stats.
1019         work.cpuStats.gcAssistTime += markAssistCpu
1020         work.cpuStats.gcDedicatedTime += markDedicatedCpu + markFractionalCpu
1021         work.cpuStats.gcIdleTime += markIdleCpu
1022         work.cpuStats.gcPauseTime += sweepTermCpu + markTermCpu
1023         work.cpuStats.gcTotalTime += sweepTermCpu + markAssistCpu + markDedicatedCpu + markFractionalCpu + markIdleCpu + markTermCpu
1024
1025         // Update cumulative scavenge CPU stats.
1026         work.cpuStats.scavengeAssistTime += scavAssistCpu
1027         work.cpuStats.scavengeBgTime += scavBgCpu
1028         work.cpuStats.scavengeTotalTime += scavAssistCpu + scavBgCpu
1029
1030         // Update total CPU.
1031         work.cpuStats.totalTime = sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
1032         work.cpuStats.idleTime += sched.idleTime.Load()
1033
1034         // Compute userTime. We compute this indirectly as everything that's not the above.
1035         //
1036         // Since time spent in _Pgcstop is covered by gcPauseTime, and time spent in _Pidle
1037         // is covered by idleTime, what we're left with is time spent in _Prunning and _Psyscall,
1038         // the latter of which is fine because the P will either go idle or get used for something
1039         // else via sysmon. Meanwhile if we subtract GC time from whatever's left, we get non-GC
1040         // _Prunning time. Note that this still leaves time spent in sweeping and in the scheduler,
1041         // but that's fine. The overwhelming majority of this time will be actual user time.
1042         work.cpuStats.userTime = work.cpuStats.totalTime - (work.cpuStats.gcTotalTime +
1043                 work.cpuStats.scavengeTotalTime + work.cpuStats.idleTime)
1044
1045         // Compute overall GC CPU utilization.
1046         // Omit idle marking time from the overall utilization here since it's "free".
1047         memstats.gc_cpu_fraction = float64(work.cpuStats.gcTotalTime-work.cpuStats.gcIdleTime) / float64(work.cpuStats.totalTime)
1048
1049         // Reset assist time and background time stats.
1050         //
1051         // Do this now, instead of at the start of the next GC cycle, because
1052         // these two may keep accumulating even if the GC is not active.
1053         scavenge.assistTime.Store(0)
1054         scavenge.backgroundTime.Store(0)
1055
1056         // Reset idle time stat.
1057         sched.idleTime.Store(0)
1058
1059         // Reset sweep state.
1060         sweep.nbgsweep = 0
1061         sweep.npausesweep = 0
1062
1063         if work.userForced {
1064                 memstats.numforcedgc++
1065         }
1066
1067         // Bump GC cycle count and wake goroutines waiting on sweep.
1068         lock(&work.sweepWaiters.lock)
1069         memstats.numgc++
1070         injectglist(&work.sweepWaiters.list)
1071         unlock(&work.sweepWaiters.lock)
1072
1073         // Release the CPU limiter.
1074         gcCPULimiter.finishGCTransition(now)
1075
1076         // Finish the current heap profiling cycle and start a new
1077         // heap profiling cycle. We do this before starting the world
1078         // so events don't leak into the wrong cycle.
1079         mProf_NextCycle()
1080
1081         // There may be stale spans in mcaches that need to be swept.
1082         // Those aren't tracked in any sweep lists, so we need to
1083         // count them against sweep completion until we ensure all
1084         // those spans have been forced out.
1085         sl := sweep.active.begin()
1086         if !sl.valid {
1087                 throw("failed to set sweep barrier")
1088         }
1089
1090         systemstack(func() { startTheWorldWithSema(trace.enabled) })
1091
1092         // Flush the heap profile so we can start a new cycle next GC.
1093         // This is relatively expensive, so we don't do it with the
1094         // world stopped.
1095         mProf_Flush()
1096
1097         // Prepare workbufs for freeing by the sweeper. We do this
1098         // asynchronously because it can take non-trivial time.
1099         prepareFreeWorkbufs()
1100
1101         // Free stack spans. This must be done between GC cycles.
1102         systemstack(freeStackSpans)
1103
1104         // Ensure all mcaches are flushed. Each P will flush its own
1105         // mcache before allocating, but idle Ps may not. Since this
1106         // is necessary to sweep all spans, we need to ensure all
1107         // mcaches are flushed before we start the next GC cycle.
1108         systemstack(func() {
1109                 forEachP(func(pp *p) {
1110                         pp.mcache.prepareForSweep()
1111                 })
1112         })
1113         // Now that we've swept stale spans in mcaches, they don't
1114         // count against unswept spans.
1115         sweep.active.end(sl)
1116
1117         // Print gctrace before dropping worldsema. As soon as we drop
1118         // worldsema another cycle could start and smash the stats
1119         // we're trying to print.
1120         if debug.gctrace > 0 {
1121                 util := int(memstats.gc_cpu_fraction * 100)
1122
1123                 var sbuf [24]byte
1124                 printlock()
1125                 print("gc ", memstats.numgc,
1126                         " @", string(itoaDiv(sbuf[:], uint64(work.tSweepTerm-runtimeInitTime)/1e6, 3)), "s ",
1127                         util, "%: ")
1128                 prev := work.tSweepTerm
1129                 for i, ns := range []int64{work.tMark, work.tMarkTerm, work.tEnd} {
1130                         if i != 0 {
1131                                 print("+")
1132                         }
1133                         print(string(fmtNSAsMS(sbuf[:], uint64(ns-prev))))
1134                         prev = ns
1135                 }
1136                 print(" ms clock, ")
1137                 for i, ns := range []int64{
1138                         sweepTermCpu,
1139                         gcController.assistTime.Load(),
1140                         gcController.dedicatedMarkTime.Load() + gcController.fractionalMarkTime.Load(),
1141                         gcController.idleMarkTime.Load(),
1142                         markTermCpu,
1143                 } {
1144                         if i == 2 || i == 3 {
1145                                 // Separate mark time components with /.
1146                                 print("/")
1147                         } else if i != 0 {
1148                                 print("+")
1149                         }
1150                         print(string(fmtNSAsMS(sbuf[:], uint64(ns))))
1151                 }
1152                 print(" ms cpu, ",
1153                         work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
1154                         gcController.lastHeapGoal>>20, " MB goal, ",
1155                         gcController.lastStackScan.Load()>>20, " MB stacks, ",
1156                         gcController.globalsScan.Load()>>20, " MB globals, ",
1157                         work.maxprocs, " P")
1158                 if work.userForced {
1159                         print(" (forced)")
1160                 }
1161                 print("\n")
1162                 printunlock()
1163         }
1164
1165         // Set any arena chunks that were deferred to fault.
1166         lock(&userArenaState.lock)
1167         faultList := userArenaState.fault
1168         userArenaState.fault = nil
1169         unlock(&userArenaState.lock)
1170         for _, lc := range faultList {
1171                 lc.mspan.setUserArenaChunkToFault()
1172         }
1173
1174         semrelease(&worldsema)
1175         semrelease(&gcsema)
1176         // Careful: another GC cycle may start now.
1177
1178         releasem(mp)
1179         mp = nil
1180
1181         // now that gc is done, kick off finalizer thread if needed
1182         if !concurrentSweep {
1183                 // give the queued finalizers, if any, a chance to run
1184                 Gosched()
1185         }
1186 }
1187
1188 // gcBgMarkStartWorkers prepares background mark worker goroutines. These
1189 // goroutines will not run until the mark phase, but they must be started while
1190 // the work is not stopped and from a regular G stack. The caller must hold
1191 // worldsema.
1192 func gcBgMarkStartWorkers() {
1193         // Background marking is performed by per-P G's. Ensure that each P has
1194         // a background GC G.
1195         //
1196         // Worker Gs don't exit if gomaxprocs is reduced. If it is raised
1197         // again, we can reuse the old workers; no need to create new workers.
1198         for gcBgMarkWorkerCount < gomaxprocs {
1199                 go gcBgMarkWorker()
1200
1201                 notetsleepg(&work.bgMarkReady, -1)
1202                 noteclear(&work.bgMarkReady)
1203                 // The worker is now guaranteed to be added to the pool before
1204                 // its P's next findRunnableGCWorker.
1205
1206                 gcBgMarkWorkerCount++
1207         }
1208 }
1209
1210 // gcBgMarkPrepare sets up state for background marking.
1211 // Mutator assists must not yet be enabled.
1212 func gcBgMarkPrepare() {
1213         // Background marking will stop when the work queues are empty
1214         // and there are no more workers (note that, since this is
1215         // concurrent, this may be a transient state, but mark
1216         // termination will clean it up). Between background workers
1217         // and assists, we don't really know how many workers there
1218         // will be, so we pretend to have an arbitrarily large number
1219         // of workers, almost all of which are "waiting". While a
1220         // worker is working it decrements nwait. If nproc == nwait,
1221         // there are no workers.
1222         work.nproc = ^uint32(0)
1223         work.nwait = ^uint32(0)
1224 }
1225
1226 // gcBgMarkWorker is an entry in the gcBgMarkWorkerPool. It points to a single
1227 // gcBgMarkWorker goroutine.
1228 type gcBgMarkWorkerNode struct {
1229         // Unused workers are managed in a lock-free stack. This field must be first.
1230         node lfnode
1231
1232         // The g of this worker.
1233         gp guintptr
1234
1235         // Release this m on park. This is used to communicate with the unlock
1236         // function, which cannot access the G's stack. It is unused outside of
1237         // gcBgMarkWorker().
1238         m muintptr
1239 }
1240
1241 func gcBgMarkWorker() {
1242         gp := getg()
1243
1244         // We pass node to a gopark unlock function, so it can't be on
1245         // the stack (see gopark). Prevent deadlock from recursively
1246         // starting GC by disabling preemption.
1247         gp.m.preemptoff = "GC worker init"
1248         node := new(gcBgMarkWorkerNode)
1249         gp.m.preemptoff = ""
1250
1251         node.gp.set(gp)
1252
1253         node.m.set(acquirem())
1254         notewakeup(&work.bgMarkReady)
1255         // After this point, the background mark worker is generally scheduled
1256         // cooperatively by gcController.findRunnableGCWorker. While performing
1257         // work on the P, preemption is disabled because we are working on
1258         // P-local work buffers. When the preempt flag is set, this puts itself
1259         // into _Gwaiting to be woken up by gcController.findRunnableGCWorker
1260         // at the appropriate time.
1261         //
1262         // When preemption is enabled (e.g., while in gcMarkDone), this worker
1263         // may be preempted and schedule as a _Grunnable G from a runq. That is
1264         // fine; it will eventually gopark again for further scheduling via
1265         // findRunnableGCWorker.
1266         //
1267         // Since we disable preemption before notifying bgMarkReady, we
1268         // guarantee that this G will be in the worker pool for the next
1269         // findRunnableGCWorker. This isn't strictly necessary, but it reduces
1270         // latency between _GCmark starting and the workers starting.
1271
1272         for {
1273                 // Go to sleep until woken by
1274                 // gcController.findRunnableGCWorker.
1275                 gopark(func(g *g, nodep unsafe.Pointer) bool {
1276                         node := (*gcBgMarkWorkerNode)(nodep)
1277
1278                         if mp := node.m.ptr(); mp != nil {
1279                                 // The worker G is no longer running; release
1280                                 // the M.
1281                                 //
1282                                 // N.B. it is _safe_ to release the M as soon
1283                                 // as we are no longer performing P-local mark
1284                                 // work.
1285                                 //
1286                                 // However, since we cooperatively stop work
1287                                 // when gp.preempt is set, if we releasem in
1288                                 // the loop then the following call to gopark
1289                                 // would immediately preempt the G. This is
1290                                 // also safe, but inefficient: the G must
1291                                 // schedule again only to enter gopark and park
1292                                 // again. Thus, we defer the release until
1293                                 // after parking the G.
1294                                 releasem(mp)
1295                         }
1296
1297                         // Release this G to the pool.
1298                         gcBgMarkWorkerPool.push(&node.node)
1299                         // Note that at this point, the G may immediately be
1300                         // rescheduled and may be running.
1301                         return true
1302                 }, unsafe.Pointer(node), waitReasonGCWorkerIdle, traceEvGoBlock, 0)
1303
1304                 // Preemption must not occur here, or another G might see
1305                 // p.gcMarkWorkerMode.
1306
1307                 // Disable preemption so we can use the gcw. If the
1308                 // scheduler wants to preempt us, we'll stop draining,
1309                 // dispose the gcw, and then preempt.
1310                 node.m.set(acquirem())
1311                 pp := gp.m.p.ptr() // P can't change with preemption disabled.
1312
1313                 if gcBlackenEnabled == 0 {
1314                         println("worker mode", pp.gcMarkWorkerMode)
1315                         throw("gcBgMarkWorker: blackening not enabled")
1316                 }
1317
1318                 if pp.gcMarkWorkerMode == gcMarkWorkerNotWorker {
1319                         throw("gcBgMarkWorker: mode not set")
1320                 }
1321
1322                 startTime := nanotime()
1323                 pp.gcMarkWorkerStartTime = startTime
1324                 var trackLimiterEvent bool
1325                 if pp.gcMarkWorkerMode == gcMarkWorkerIdleMode {
1326                         trackLimiterEvent = pp.limiterEvent.start(limiterEventIdleMarkWork, startTime)
1327                 }
1328
1329                 decnwait := atomic.Xadd(&work.nwait, -1)
1330                 if decnwait == work.nproc {
1331                         println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
1332                         throw("work.nwait was > work.nproc")
1333                 }
1334
1335                 systemstack(func() {
1336                         // Mark our goroutine preemptible so its stack
1337                         // can be scanned. This lets two mark workers
1338                         // scan each other (otherwise, they would
1339                         // deadlock). We must not modify anything on
1340                         // the G stack. However, stack shrinking is
1341                         // disabled for mark workers, so it is safe to
1342                         // read from the G stack.
1343                         casGToWaiting(gp, _Grunning, waitReasonGCWorkerActive)
1344                         switch pp.gcMarkWorkerMode {
1345                         default:
1346                                 throw("gcBgMarkWorker: unexpected gcMarkWorkerMode")
1347                         case gcMarkWorkerDedicatedMode:
1348                                 gcDrain(&pp.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
1349                                 if gp.preempt {
1350                                         // We were preempted. This is
1351                                         // a useful signal to kick
1352                                         // everything out of the run
1353                                         // queue so it can run
1354                                         // somewhere else.
1355                                         if drainQ, n := runqdrain(pp); n > 0 {
1356                                                 lock(&sched.lock)
1357                                                 globrunqputbatch(&drainQ, int32(n))
1358                                                 unlock(&sched.lock)
1359                                         }
1360                                 }
1361                                 // Go back to draining, this time
1362                                 // without preemption.
1363                                 gcDrain(&pp.gcw, gcDrainFlushBgCredit)
1364                         case gcMarkWorkerFractionalMode:
1365                                 gcDrain(&pp.gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit)
1366                         case gcMarkWorkerIdleMode:
1367                                 gcDrain(&pp.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit)
1368                         }
1369                         casgstatus(gp, _Gwaiting, _Grunning)
1370                 })
1371
1372                 // Account for time and mark us as stopped.
1373                 now := nanotime()
1374                 duration := now - startTime
1375                 gcController.markWorkerStop(pp.gcMarkWorkerMode, duration)
1376                 if trackLimiterEvent {
1377                         pp.limiterEvent.stop(limiterEventIdleMarkWork, now)
1378                 }
1379                 if pp.gcMarkWorkerMode == gcMarkWorkerFractionalMode {
1380                         atomic.Xaddint64(&pp.gcFractionalMarkTime, duration)
1381                 }
1382
1383                 // Was this the last worker and did we run out
1384                 // of work?
1385                 incnwait := atomic.Xadd(&work.nwait, +1)
1386                 if incnwait > work.nproc {
1387                         println("runtime: p.gcMarkWorkerMode=", pp.gcMarkWorkerMode,
1388                                 "work.nwait=", incnwait, "work.nproc=", work.nproc)
1389                         throw("work.nwait > work.nproc")
1390                 }
1391
1392                 // We'll releasem after this point and thus this P may run
1393                 // something else. We must clear the worker mode to avoid
1394                 // attributing the mode to a different (non-worker) G in
1395                 // traceGoStart.
1396                 pp.gcMarkWorkerMode = gcMarkWorkerNotWorker
1397
1398                 // If this worker reached a background mark completion
1399                 // point, signal the main GC goroutine.
1400                 if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
1401                         // We don't need the P-local buffers here, allow
1402                         // preemption because we may schedule like a regular
1403                         // goroutine in gcMarkDone (block on locks, etc).
1404                         releasem(node.m.ptr())
1405                         node.m.set(nil)
1406
1407                         gcMarkDone()
1408                 }
1409         }
1410 }
1411
1412 // gcMarkWorkAvailable reports whether executing a mark worker
1413 // on p is potentially useful. p may be nil, in which case it only
1414 // checks the global sources of work.
1415 func gcMarkWorkAvailable(p *p) bool {
1416         if p != nil && !p.gcw.empty() {
1417                 return true
1418         }
1419         if !work.full.empty() {
1420                 return true // global work available
1421         }
1422         if work.markrootNext < work.markrootJobs {
1423                 return true // root scan work available
1424         }
1425         return false
1426 }
1427
1428 // gcMark runs the mark (or, for concurrent GC, mark termination)
1429 // All gcWork caches must be empty.
1430 // STW is in effect at this point.
1431 func gcMark(startTime int64) {
1432         if debug.allocfreetrace > 0 {
1433                 tracegc()
1434         }
1435
1436         if gcphase != _GCmarktermination {
1437                 throw("in gcMark expecting to see gcphase as _GCmarktermination")
1438         }
1439         work.tstart = startTime
1440
1441         // Check that there's no marking work remaining.
1442         if work.full != 0 || work.markrootNext < work.markrootJobs {
1443                 print("runtime: full=", hex(work.full), " next=", work.markrootNext, " jobs=", work.markrootJobs, " nDataRoots=", work.nDataRoots, " nBSSRoots=", work.nBSSRoots, " nSpanRoots=", work.nSpanRoots, " nStackRoots=", work.nStackRoots, "\n")
1444                 panic("non-empty mark queue after concurrent mark")
1445         }
1446
1447         if debug.gccheckmark > 0 {
1448                 // This is expensive when there's a large number of
1449                 // Gs, so only do it if checkmark is also enabled.
1450                 gcMarkRootCheck()
1451         }
1452         if work.full != 0 {
1453                 throw("work.full != 0")
1454         }
1455
1456         // Drop allg snapshot. allgs may have grown, in which case
1457         // this is the only reference to the old backing store and
1458         // there's no need to keep it around.
1459         work.stackRoots = nil
1460
1461         // Clear out buffers and double-check that all gcWork caches
1462         // are empty. This should be ensured by gcMarkDone before we
1463         // enter mark termination.
1464         //
1465         // TODO: We could clear out buffers just before mark if this
1466         // has a non-negligible impact on STW time.
1467         for _, p := range allp {
1468                 // The write barrier may have buffered pointers since
1469                 // the gcMarkDone barrier. However, since the barrier
1470                 // ensured all reachable objects were marked, all of
1471                 // these must be pointers to black objects. Hence we
1472                 // can just discard the write barrier buffer.
1473                 if debug.gccheckmark > 0 {
1474                         // For debugging, flush the buffer and make
1475                         // sure it really was all marked.
1476                         wbBufFlush1(p)
1477                 } else {
1478                         p.wbBuf.reset()
1479                 }
1480
1481                 gcw := &p.gcw
1482                 if !gcw.empty() {
1483                         printlock()
1484                         print("runtime: P ", p.id, " flushedWork ", gcw.flushedWork)
1485                         if gcw.wbuf1 == nil {
1486                                 print(" wbuf1=<nil>")
1487                         } else {
1488                                 print(" wbuf1.n=", gcw.wbuf1.nobj)
1489                         }
1490                         if gcw.wbuf2 == nil {
1491                                 print(" wbuf2=<nil>")
1492                         } else {
1493                                 print(" wbuf2.n=", gcw.wbuf2.nobj)
1494                         }
1495                         print("\n")
1496                         throw("P has cached GC work at end of mark termination")
1497                 }
1498                 // There may still be cached empty buffers, which we
1499                 // need to flush since we're going to free them. Also,
1500                 // there may be non-zero stats because we allocated
1501                 // black after the gcMarkDone barrier.
1502                 gcw.dispose()
1503         }
1504
1505         // Flush scanAlloc from each mcache since we're about to modify
1506         // heapScan directly. If we were to flush this later, then scanAlloc
1507         // might have incorrect information.
1508         //
1509         // Note that it's not important to retain this information; we know
1510         // exactly what heapScan is at this point via scanWork.
1511         for _, p := range allp {
1512                 c := p.mcache
1513                 if c == nil {
1514                         continue
1515                 }
1516                 c.scanAlloc = 0
1517         }
1518
1519         // Reset controller state.
1520         gcController.resetLive(work.bytesMarked)
1521 }
1522
1523 // gcSweep must be called on the system stack because it acquires the heap
1524 // lock. See mheap for details.
1525 //
1526 // The world must be stopped.
1527 //
1528 //go:systemstack
1529 func gcSweep(mode gcMode) {
1530         assertWorldStopped()
1531
1532         if gcphase != _GCoff {
1533                 throw("gcSweep being done but phase is not GCoff")
1534         }
1535
1536         lock(&mheap_.lock)
1537         mheap_.sweepgen += 2
1538         sweep.active.reset()
1539         mheap_.pagesSwept.Store(0)
1540         mheap_.sweepArenas = mheap_.allArenas
1541         mheap_.reclaimIndex.Store(0)
1542         mheap_.reclaimCredit.Store(0)
1543         unlock(&mheap_.lock)
1544
1545         sweep.centralIndex.clear()
1546
1547         if !_ConcurrentSweep || mode == gcForceBlockMode {
1548                 // Special case synchronous sweep.
1549                 // Record that no proportional sweeping has to happen.
1550                 lock(&mheap_.lock)
1551                 mheap_.sweepPagesPerByte = 0
1552                 unlock(&mheap_.lock)
1553                 // Sweep all spans eagerly.
1554                 for sweepone() != ^uintptr(0) {
1555                         sweep.npausesweep++
1556                 }
1557                 // Free workbufs eagerly.
1558                 prepareFreeWorkbufs()
1559                 for freeSomeWbufs(false) {
1560                 }
1561                 // All "free" events for this mark/sweep cycle have
1562                 // now happened, so we can make this profile cycle
1563                 // available immediately.
1564                 mProf_NextCycle()
1565                 mProf_Flush()
1566                 return
1567         }
1568
1569         // Background sweep.
1570         lock(&sweep.lock)
1571         if sweep.parked {
1572                 sweep.parked = false
1573                 ready(sweep.g, 0, true)
1574         }
1575         unlock(&sweep.lock)
1576 }
1577
1578 // gcResetMarkState resets global state prior to marking (concurrent
1579 // or STW) and resets the stack scan state of all Gs.
1580 //
1581 // This is safe to do without the world stopped because any Gs created
1582 // during or after this will start out in the reset state.
1583 //
1584 // gcResetMarkState must be called on the system stack because it acquires
1585 // the heap lock. See mheap for details.
1586 //
1587 //go:systemstack
1588 func gcResetMarkState() {
1589         // This may be called during a concurrent phase, so lock to make sure
1590         // allgs doesn't change.
1591         forEachG(func(gp *g) {
1592                 gp.gcscandone = false // set to true in gcphasework
1593                 gp.gcAssistBytes = 0
1594         })
1595
1596         // Clear page marks. This is just 1MB per 64GB of heap, so the
1597         // time here is pretty trivial.
1598         lock(&mheap_.lock)
1599         arenas := mheap_.allArenas
1600         unlock(&mheap_.lock)
1601         for _, ai := range arenas {
1602                 ha := mheap_.arenas[ai.l1()][ai.l2()]
1603                 for i := range ha.pageMarks {
1604                         ha.pageMarks[i] = 0
1605                 }
1606         }
1607
1608         work.bytesMarked = 0
1609         work.initialHeapLive = gcController.heapLive.Load()
1610 }
1611
1612 // Hooks for other packages
1613
1614 var poolcleanup func()
1615 var boringCaches []unsafe.Pointer // for crypto/internal/boring
1616
1617 //go:linkname sync_runtime_registerPoolCleanup sync.runtime_registerPoolCleanup
1618 func sync_runtime_registerPoolCleanup(f func()) {
1619         poolcleanup = f
1620 }
1621
1622 //go:linkname boring_registerCache crypto/internal/boring/bcache.registerCache
1623 func boring_registerCache(p unsafe.Pointer) {
1624         boringCaches = append(boringCaches, p)
1625 }
1626
1627 func clearpools() {
1628         // clear sync.Pools
1629         if poolcleanup != nil {
1630                 poolcleanup()
1631         }
1632
1633         // clear boringcrypto caches
1634         for _, p := range boringCaches {
1635                 atomicstorep(p, nil)
1636         }
1637
1638         // Clear central sudog cache.
1639         // Leave per-P caches alone, they have strictly bounded size.
1640         // Disconnect cached list before dropping it on the floor,
1641         // so that a dangling ref to one entry does not pin all of them.
1642         lock(&sched.sudoglock)
1643         var sg, sgnext *sudog
1644         for sg = sched.sudogcache; sg != nil; sg = sgnext {
1645                 sgnext = sg.next
1646                 sg.next = nil
1647         }
1648         sched.sudogcache = nil
1649         unlock(&sched.sudoglock)
1650
1651         // Clear central defer pool.
1652         // Leave per-P pools alone, they have strictly bounded size.
1653         lock(&sched.deferlock)
1654         // disconnect cached list before dropping it on the floor,
1655         // so that a dangling ref to one entry does not pin all of them.
1656         var d, dlink *_defer
1657         for d = sched.deferpool; d != nil; d = dlink {
1658                 dlink = d.link
1659                 d.link = nil
1660         }
1661         sched.deferpool = nil
1662         unlock(&sched.deferlock)
1663 }
1664
1665 // Timing
1666
1667 // itoaDiv formats val/(10**dec) into buf.
1668 func itoaDiv(buf []byte, val uint64, dec int) []byte {
1669         i := len(buf) - 1
1670         idec := i - dec
1671         for val >= 10 || i >= idec {
1672                 buf[i] = byte(val%10 + '0')
1673                 i--
1674                 if i == idec {
1675                         buf[i] = '.'
1676                         i--
1677                 }
1678                 val /= 10
1679         }
1680         buf[i] = byte(val + '0')
1681         return buf[i:]
1682 }
1683
1684 // fmtNSAsMS nicely formats ns nanoseconds as milliseconds.
1685 func fmtNSAsMS(buf []byte, ns uint64) []byte {
1686         if ns >= 10e6 {
1687                 // Format as whole milliseconds.
1688                 return itoaDiv(buf, ns/1e6, 0)
1689         }
1690         // Format two digits of precision, with at most three decimal places.
1691         x := ns / 1e3
1692         if x == 0 {
1693                 buf[0] = '0'
1694                 return buf[:1]
1695         }
1696         dec := 3
1697         for x >= 100 {
1698                 x /= 10
1699                 dec--
1700         }
1701         return itoaDiv(buf, x, dec)
1702 }
1703
1704 // Helpers for testing GC.
1705
1706 // gcTestMoveStackOnNextCall causes the stack to be moved on a call
1707 // immediately following the call to this. It may not work correctly
1708 // if any other work appears after this call (such as returning).
1709 // Typically the following call should be marked go:noinline so it
1710 // performs a stack check.
1711 //
1712 // In rare cases this may not cause the stack to move, specifically if
1713 // there's a preemption between this call and the next.
1714 func gcTestMoveStackOnNextCall() {
1715         gp := getg()
1716         gp.stackguard0 = stackForceMove
1717 }
1718
1719 // gcTestIsReachable performs a GC and returns a bit set where bit i
1720 // is set if ptrs[i] is reachable.
1721 func gcTestIsReachable(ptrs ...unsafe.Pointer) (mask uint64) {
1722         // This takes the pointers as unsafe.Pointers in order to keep
1723         // them live long enough for us to attach specials. After
1724         // that, we drop our references to them.
1725
1726         if len(ptrs) > 64 {
1727                 panic("too many pointers for uint64 mask")
1728         }
1729
1730         // Block GC while we attach specials and drop our references
1731         // to ptrs. Otherwise, if a GC is in progress, it could mark
1732         // them reachable via this function before we have a chance to
1733         // drop them.
1734         semacquire(&gcsema)
1735
1736         // Create reachability specials for ptrs.
1737         specials := make([]*specialReachable, len(ptrs))
1738         for i, p := range ptrs {
1739                 lock(&mheap_.speciallock)
1740                 s := (*specialReachable)(mheap_.specialReachableAlloc.alloc())
1741                 unlock(&mheap_.speciallock)
1742                 s.special.kind = _KindSpecialReachable
1743                 if !addspecial(p, &s.special) {
1744                         throw("already have a reachable special (duplicate pointer?)")
1745                 }
1746                 specials[i] = s
1747                 // Make sure we don't retain ptrs.
1748                 ptrs[i] = nil
1749         }
1750
1751         semrelease(&gcsema)
1752
1753         // Force a full GC and sweep.
1754         GC()
1755
1756         // Process specials.
1757         for i, s := range specials {
1758                 if !s.done {
1759                         printlock()
1760                         println("runtime: object", i, "was not swept")
1761                         throw("IsReachable failed")
1762                 }
1763                 if s.reachable {
1764                         mask |= 1 << i
1765                 }
1766                 lock(&mheap_.speciallock)
1767                 mheap_.specialReachableAlloc.free(unsafe.Pointer(s))
1768                 unlock(&mheap_.speciallock)
1769         }
1770
1771         return mask
1772 }
1773
1774 // gcTestPointerClass returns the category of what p points to, one of:
1775 // "heap", "stack", "data", "bss", "other". This is useful for checking
1776 // that a test is doing what it's intended to do.
1777 //
1778 // This is nosplit simply to avoid extra pointer shuffling that may
1779 // complicate a test.
1780 //
1781 //go:nosplit
1782 func gcTestPointerClass(p unsafe.Pointer) string {
1783         p2 := uintptr(noescape(p))
1784         gp := getg()
1785         if gp.stack.lo <= p2 && p2 < gp.stack.hi {
1786                 return "stack"
1787         }
1788         if base, _, _ := findObject(p2, 0, 0); base != 0 {
1789                 return "heap"
1790         }
1791         for _, datap := range activeModules() {
1792                 if datap.data <= p2 && p2 < datap.edata || datap.noptrdata <= p2 && p2 < datap.enoptrdata {
1793                         return "data"
1794                 }
1795                 if datap.bss <= p2 && p2 < datap.ebss || datap.noptrbss <= p2 && p2 <= datap.enoptrbss {
1796                         return "bss"
1797                 }
1798         }
1799         KeepAlive(p)
1800         return "other"
1801 }