src/runtime/mgc.go

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Garbage collector (GC).
   6 //
   7 // The GC runs concurrently with mutator threads, is type accurate (aka precise), allows multiple
   8 // GC thread to run in parallel. It is a concurrent mark and sweep that uses a write barrier. It is
   9 // non-generational and non-compacting. Allocation is done using size segregated per P allocation
  10 // areas to minimize fragmentation while eliminating locks in the common case.
  11 //
  12 // The algorithm decomposes into several steps.
  13 // This is a high level description of the algorithm being used. For an overview of GC a good
  14 // place to start is Richard Jones' gchandbook.org.
  15 //
  16 // The algorithm's intellectual heritage includes Dijkstra's on-the-fly algorithm, see
  17 // Edsger W. Dijkstra, Leslie Lamport, A. J. Martin, C. S. Scholten, and E. F. M. Steffens. 1978.
  18 // On-the-fly garbage collection: an exercise in cooperation. Commun. ACM 21, 11 (November 1978),
  19 // 966-975.
  20 // For journal quality proofs that these steps are complete, correct, and terminate see
  21 // Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world.
  22 // Concurrency and Computation: Practice and Experience 15(3-5), 2003.
  23 //
  24 // 1. GC performs sweep termination.
  25 //
  26 //    a. Stop the world. This causes all Ps to reach a GC safe-point.
  27 //
  28 //    b. Sweep any unswept spans. There will only be unswept spans if
  29 //    this GC cycle was forced before the expected time.
  30 //
  31 // 2. GC performs the mark phase.
  32 //
  33 //    a. Prepare for the mark phase by setting gcphase to _GCmark
  34 //    (from _GCoff), enabling the write barrier, enabling mutator
  35 //    assists, and enqueueing root mark jobs. No objects may be
  36 //    scanned until all Ps have enabled the write barrier, which is
  37 //    accomplished using STW.
  38 //
  39 //    b. Start the world. From this point, GC work is done by mark
  40 //    workers started by the scheduler and by assists performed as
  41 //    part of allocation. The write barrier shades both the
  42 //    overwritten pointer and the new pointer value for any pointer
  43 //    writes (see mbarrier.go for details). Newly allocated objects
  44 //    are immediately marked black.
  45 //
  46 //    c. GC performs root marking jobs. This includes scanning all
  47 //    stacks, shading all globals, and shading any heap pointers in
  48 //    off-heap runtime data structures. Scanning a stack stops a
  49 //    goroutine, shades any pointers found on its stack, and then
  50 //    resumes the goroutine.
  51 //
  52 //    d. GC drains the work queue of grey objects, scanning each grey
  53 //    object to black and shading all pointers found in the object
  54 //    (which in turn may add those pointers to the work queue).
  55 //
  56 //    e. Because GC work is spread across local caches, GC uses a
  57 //    distributed termination algorithm to detect when there are no
  58 //    more root marking jobs or grey objects (see gcMarkDone). At this
  59 //    point, GC transitions to mark termination.
  60 //
  61 // 3. GC performs mark termination.
  62 //
  63 //    a. Stop the world.
  64 //
  65 //    b. Set gcphase to _GCmarktermination, and disable workers and
  66 //    assists.
  67 //
  68 //    c. Perform housekeeping like flushing mcaches.
  69 //
  70 // 4. GC performs the sweep phase.
  71 //
  72 //    a. Prepare for the sweep phase by setting gcphase to _GCoff,
  73 //    setting up sweep state and disabling the write barrier.
  74 //
  75 //    b. Start the world. From this point on, newly allocated objects
  76 //    are white, and allocating sweeps spans before use if necessary.
  77 //
  78 //    c. GC does concurrent sweeping in the background and in response
  79 //    to allocation. See description below.
  80 //
  81 // 5. When sufficient allocation has taken place, replay the sequence
  82 // starting with 1 above. See discussion of GC rate below.
  83
  84 // Concurrent sweep.
  85 //
  86 // The sweep phase proceeds concurrently with normal program execution.
  87 // The heap is swept span-by-span both lazily (when a goroutine needs another span)
  88 // and concurrently in a background goroutine (this helps programs that are not CPU bound).
  89 // At the end of STW mark termination all spans are marked as "needs sweeping".
  90 //
  91 // The background sweeper goroutine simply sweeps spans one-by-one.
  92 //
  93 // To avoid requesting more OS memory while there are unswept spans, when a
  94 // goroutine needs another span, it first attempts to reclaim that much memory
  95 // by sweeping. When a goroutine needs to allocate a new small-object span, it
  96 // sweeps small-object spans for the same object size until it frees at least
  97 // one object. When a goroutine needs to allocate large-object span from heap,
  98 // it sweeps spans until it frees at least that many pages into heap. There is
  99 // one case where this may not suffice: if a goroutine sweeps and frees two
 100 // nonadjacent one-page spans to the heap, it will allocate a new two-page
 101 // span, but there can still be other one-page unswept spans which could be
 102 // combined into a two-page span.
 103 //
 104 // It's critical to ensure that no operations proceed on unswept spans (that would corrupt
 105 // mark bits in GC bitmap). During GC all mcaches are flushed into the central cache,
 106 // so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
 107 // When a goroutine explicitly frees an object or sets a finalizer, it ensures that
 108 // the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
 109 // The finalizer goroutine is kicked off only when all spans are swept.
 110 // When the next GC starts, it sweeps all not-yet-swept spans (if any).
 111
 112 // GC rate.
 113 // Next GC is after we've allocated an extra amount of memory proportional to
 114 // the amount already in use. The proportion is controlled by GOGC environment variable
 115 // (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
 116 // (this mark is tracked in gcController.heapGoal variable). This keeps the GC cost in
 117 // linear proportion to the allocation cost. Adjusting GOGC just changes the linear constant
 118 // (and also the amount of extra memory used).
 119
 120 // Oblets
 121 //
 122 // In order to prevent long pauses while scanning large objects and to
 123 // improve parallelism, the garbage collector breaks up scan jobs for
 124 // objects larger than maxObletBytes into "oblets" of at most
 125 // maxObletBytes. When scanning encounters the beginning of a large
 126 // object, it scans only the first oblet and enqueues the remaining
 127 // oblets as new scan jobs.
 128
 129 package runtime
 130
 131 import (
 132         "internal/cpu"
 133         "runtime/internal/atomic"
 134         "unsafe"
 135 )
 136
 137 const (
 138         _DebugGC         = 0
 139         _ConcurrentSweep = true
 140         _FinBlockSize    = 4 * 1024
 141
 142         // debugScanConservative enables debug logging for stack
 143         // frames that are scanned conservatively.
 144         debugScanConservative = false
 145
 146         // sweepMinHeapDistance is a lower bound on the heap distance
 147         // (in bytes) reserved for concurrent sweeping between GC
 148         // cycles.
 149         sweepMinHeapDistance = 1024 * 1024
 150 )
 151
 152 func gcinit() {
 153         if unsafe.Sizeof(workbuf{}) != _WorkbufSize {
 154                 throw("size of Workbuf is suboptimal")
 155         }
 156         // No sweep on the first cycle.
 157         mheap_.sweepDrained = 1
 158
 159         // Initialize GC pacer state.
 160         // Use the environment variable GOGC for the initial gcPercent value.
 161         gcController.init(readGOGC())
 162
 163         work.startSema = 1
 164         work.markDoneSema = 1
 165         lockInit(&work.sweepWaiters.lock, lockRankSweepWaiters)
 166         lockInit(&work.assistQueue.lock, lockRankAssistQueue)
 167         lockInit(&work.wbufSpans.lock, lockRankWbufSpans)
 168 }
 169
 170 // Temporary in order to enable register ABI work.
 171 // TODO(register args): convert back to local chan in gcenabled, passed to "go" stmts.
 172 var gcenable_setup chan int
 173
 174 // gcenable is called after the bulk of the runtime initialization,
 175 // just before we're about to start letting user code run.
 176 // It kicks off the background sweeper goroutine, the background
 177 // scavenger goroutine, and enables GC.
 178 func gcenable() {
 179         // Kick off sweeping and scavenging.
 180         gcenable_setup = make(chan int, 2)
 181         go bgsweep()
 182         go bgscavenge()
 183         <-gcenable_setup
 184         <-gcenable_setup
 185         gcenable_setup = nil
 186         memstats.enablegc = true // now that runtime is initialized, GC is okay
 187 }
 188
 189 // Garbage collector phase.
 190 // Indicates to write barrier and synchronization task to perform.
 191 var gcphase uint32
 192
 193 // The compiler knows about this variable.
 194 // If you change it, you must change builtin/runtime.go, too.
 195 // If you change the first four bytes, you must also change the write
 196 // barrier insertion code.
 197 var writeBarrier struct {
 198         enabled bool    // compiler emits a check of this before calling write barrier
 199         pad     [3]byte // compiler uses 32-bit load for "enabled" field
 200         needed  bool    // whether we need a write barrier for current GC phase
 201         cgo     bool    // whether we need a write barrier for a cgo check
 202         alignme uint64  // guarantee alignment so that compiler can use a 32 or 64-bit load
 203 }
 204
 205 // gcBlackenEnabled is 1 if mutator assists and background mark
 206 // workers are allowed to blacken objects. This must only be set when
 207 // gcphase == _GCmark.
 208 var gcBlackenEnabled uint32
 209
 210 const (
 211         _GCoff             = iota // GC not running; sweeping in background, write barrier disabled
 212         _GCmark                   // GC marking roots and workbufs: allocate black, write barrier ENABLED
 213         _GCmarktermination        // GC mark termination: allocate black, P's help GC, write barrier ENABLED
 214 )
 215
 216 //go:nosplit
 217 func setGCPhase(x uint32) {
 218         atomic.Store(&gcphase, x)
 219         writeBarrier.needed = gcphase == _GCmark || gcphase == _GCmarktermination
 220         writeBarrier.enabled = writeBarrier.needed || writeBarrier.cgo
 221 }
 222
 223 // gcMarkWorkerMode represents the mode that a concurrent mark worker
 224 // should operate in.
 225 //
 226 // Concurrent marking happens through four different mechanisms. One
 227 // is mutator assists, which happen in response to allocations and are
 228 // not scheduled. The other three are variations in the per-P mark
 229 // workers and are distinguished by gcMarkWorkerMode.
 230 type gcMarkWorkerMode int
 231
 232 const (
 233         // gcMarkWorkerNotWorker indicates that the next scheduled G is not
 234         // starting work and the mode should be ignored.
 235         gcMarkWorkerNotWorker gcMarkWorkerMode = iota
 236
 237         // gcMarkWorkerDedicatedMode indicates that the P of a mark
 238         // worker is dedicated to running that mark worker. The mark
 239         // worker should run without preemption.
 240         gcMarkWorkerDedicatedMode
 241
 242         // gcMarkWorkerFractionalMode indicates that a P is currently
 243         // running the "fractional" mark worker. The fractional worker
 244         // is necessary when GOMAXPROCS*gcBackgroundUtilization is not
 245         // an integer and using only dedicated workers would result in
 246         // utilization too far from the target of gcBackgroundUtilization.
 247         // The fractional worker should run until it is preempted and
 248         // will be scheduled to pick up the fractional part of
 249         // GOMAXPROCS*gcBackgroundUtilization.
 250         gcMarkWorkerFractionalMode
 251
 252         // gcMarkWorkerIdleMode indicates that a P is running the mark
 253         // worker because it has nothing else to do. The idle worker
 254         // should run until it is preempted and account its time
 255         // against gcController.idleMarkTime.
 256         gcMarkWorkerIdleMode
 257 )
 258
 259 // gcMarkWorkerModeStrings are the strings labels of gcMarkWorkerModes
 260 // to use in execution traces.
 261 var gcMarkWorkerModeStrings = [...]string{
 262         "Not worker",
 263         "GC (dedicated)",
 264         "GC (fractional)",
 265         "GC (idle)",
 266 }
 267
 268 // pollFractionalWorkerExit reports whether a fractional mark worker
 269 // should self-preempt. It assumes it is called from the fractional
 270 // worker.
 271 func pollFractionalWorkerExit() bool {
 272         // This should be kept in sync with the fractional worker
 273         // scheduler logic in findRunnableGCWorker.
 274         now := nanotime()
 275         delta := now - gcController.markStartTime
 276         if delta <= 0 {
 277                 return true
 278         }
 279         p := getg().m.p.ptr()
 280         selfTime := p.gcFractionalMarkTime + (now - p.gcMarkWorkerStartTime)
 281         // Add some slack to the utilization goal so that the
 282         // fractional worker isn't behind again the instant it exits.
 283         return float64(selfTime)/float64(delta) > 1.2*gcController.fractionalUtilizationGoal
 284 }
 285
 286 var work struct {
 287         full  lfstack          // lock-free list of full blocks workbuf
 288         empty lfstack          // lock-free list of empty blocks workbuf
 289         pad0  cpu.CacheLinePad // prevents false-sharing between full/empty and nproc/nwait
 290
 291         wbufSpans struct {
 292                 lock mutex
 293                 // free is a list of spans dedicated to workbufs, but
 294                 // that don't currently contain any workbufs.
 295                 free mSpanList
 296                 // busy is a list of all spans containing workbufs on
 297                 // one of the workbuf lists.
 298                 busy mSpanList
 299         }
 300
 301         // Restore 64-bit alignment on 32-bit.
 302         _ uint32
 303
 304         // bytesMarked is the number of bytes marked this cycle. This
 305         // includes bytes blackened in scanned objects, noscan objects
 306         // that go straight to black, and permagrey objects scanned by
 307         // markroot during the concurrent scan phase. This is updated
 308         // atomically during the cycle. Updates may be batched
 309         // arbitrarily, since the value is only read at the end of the
 310         // cycle.
 311         //
 312         // Because of benign races during marking, this number may not
 313         // be the exact number of marked bytes, but it should be very
 314         // close.
 315         //
 316         // Put this field here because it needs 64-bit atomic access
 317         // (and thus 8-byte alignment even on 32-bit architectures).
 318         bytesMarked uint64
 319
 320         markrootNext uint32 // next markroot job
 321         markrootJobs uint32 // number of markroot jobs
 322
 323         nproc  uint32
 324         tstart int64
 325         nwait  uint32
 326
 327         // Number of roots of various root types. Set by gcMarkRootPrepare.
 328         nDataRoots, nBSSRoots, nSpanRoots, nStackRoots int
 329
 330         // Base indexes of each root type. Set by gcMarkRootPrepare.
 331         baseData, baseBSS, baseSpans, baseStacks, baseEnd uint32
 332
 333         // Each type of GC state transition is protected by a lock.
 334         // Since multiple threads can simultaneously detect the state
 335         // transition condition, any thread that detects a transition
 336         // condition must acquire the appropriate transition lock,
 337         // re-check the transition condition and return if it no
 338         // longer holds or perform the transition if it does.
 339         // Likewise, any transition must invalidate the transition
 340         // condition before releasing the lock. This ensures that each
 341         // transition is performed by exactly one thread and threads
 342         // that need the transition to happen block until it has
 343         // happened.
 344         //
 345         // startSema protects the transition from "off" to mark or
 346         // mark termination.
 347         startSema uint32
 348         // markDoneSema protects transitions from mark to mark termination.
 349         markDoneSema uint32
 350
 351         bgMarkReady note   // signal background mark worker has started
 352         bgMarkDone  uint32 // cas to 1 when at a background mark completion point
 353         // Background mark completion signaling
 354
 355         // mode is the concurrency mode of the current GC cycle.
 356         mode gcMode
 357
 358         // userForced indicates the current GC cycle was forced by an
 359         // explicit user call.
 360         userForced bool
 361
 362         // totaltime is the CPU nanoseconds spent in GC since the
 363         // program started if debug.gctrace > 0.
 364         totaltime int64
 365
 366         // initialHeapLive is the value of gcController.heapLive at the
 367         // beginning of this GC cycle.
 368         initialHeapLive uint64
 369
 370         // assistQueue is a queue of assists that are blocked because
 371         // there was neither enough credit to steal or enough work to
 372         // do.
 373         assistQueue struct {
 374                 lock mutex
 375                 q    gQueue
 376         }
 377
 378         // sweepWaiters is a list of blocked goroutines to wake when
 379         // we transition from mark termination to sweep.
 380         sweepWaiters struct {
 381                 lock mutex
 382                 list gList
 383         }
 384
 385         // cycles is the number of completed GC cycles, where a GC
 386         // cycle is sweep termination, mark, mark termination, and
 387         // sweep. This differs from memstats.numgc, which is
 388         // incremented at mark termination.
 389         cycles uint32
 390
 391         // Timing/utilization stats for this cycle.
 392         stwprocs, maxprocs                 int32
 393         tSweepTerm, tMark, tMarkTerm, tEnd int64 // nanotime() of phase start
 394
 395         pauseNS    int64 // total STW time this cycle
 396         pauseStart int64 // nanotime() of last STW
 397
 398         // debug.gctrace heap sizes for this cycle.
 399         heap0, heap1, heap2, heapGoal uint64
 400 }
 401
 402 // GC runs a garbage collection and blocks the caller until the
 403 // garbage collection is complete. It may also block the entire
 404 // program.
 405 func GC() {
 406         // We consider a cycle to be: sweep termination, mark, mark
 407         // termination, and sweep. This function shouldn't return
 408         // until a full cycle has been completed, from beginning to
 409         // end. Hence, we always want to finish up the current cycle
 410         // and start a new one. That means:
 411         //
 412         // 1. In sweep termination, mark, or mark termination of cycle
 413         // N, wait until mark termination N completes and transitions
 414         // to sweep N.
 415         //
 416         // 2. In sweep N, help with sweep N.
 417         //
 418         // At this point we can begin a full cycle N+1.
 419         //
 420         // 3. Trigger cycle N+1 by starting sweep termination N+1.
 421         //
 422         // 4. Wait for mark termination N+1 to complete.
 423         //
 424         // 5. Help with sweep N+1 until it's done.
 425         //
 426         // This all has to be written to deal with the fact that the
 427         // GC may move ahead on its own. For example, when we block
 428         // until mark termination N, we may wake up in cycle N+2.
 429
 430         // Wait until the current sweep termination, mark, and mark
 431         // termination complete.
 432         n := atomic.Load(&work.cycles)
 433         gcWaitOnMark(n)
 434
 435         // We're now in sweep N or later. Trigger GC cycle N+1, which
 436         // will first finish sweep N if necessary and then enter sweep
 437         // termination N+1.
 438         gcStart(gcTrigger{kind: gcTriggerCycle, n: n + 1})
 439
 440         // Wait for mark termination N+1 to complete.
 441         gcWaitOnMark(n + 1)
 442
 443         // Finish sweep N+1 before returning. We do this both to
 444         // complete the cycle and because runtime.GC() is often used
 445         // as part of tests and benchmarks to get the system into a
 446         // relatively stable and isolated state.
 447         for atomic.Load(&work.cycles) == n+1 && sweepone() != ^uintptr(0) {
 448                 sweep.nbgsweep++
 449                 Gosched()
 450         }
 451
 452         // Callers may assume that the heap profile reflects the
 453         // just-completed cycle when this returns (historically this
 454         // happened because this was a STW GC), but right now the
 455         // profile still reflects mark termination N, not N+1.
 456         //
 457         // As soon as all of the sweep frees from cycle N+1 are done,
 458         // we can go ahead and publish the heap profile.
 459         //
 460         // First, wait for sweeping to finish. (We know there are no
 461         // more spans on the sweep queue, but we may be concurrently
 462         // sweeping spans, so we have to wait.)
 463         for atomic.Load(&work.cycles) == n+1 && !isSweepDone() {
 464                 Gosched()
 465         }
 466
 467         // Now we're really done with sweeping, so we can publish the
 468         // stable heap profile. Only do this if we haven't already hit
 469         // another mark termination.
 470         mp := acquirem()
 471         cycle := atomic.Load(&work.cycles)
 472         if cycle == n+1 || (gcphase == _GCmark && cycle == n+2) {
 473                 mProf_PostSweep()
 474         }
 475         releasem(mp)
 476 }
 477
 478 // gcWaitOnMark blocks until GC finishes the Nth mark phase. If GC has
 479 // already completed this mark phase, it returns immediately.
 480 func gcWaitOnMark(n uint32) {
 481         for {
 482                 // Disable phase transitions.
 483                 lock(&work.sweepWaiters.lock)
 484                 nMarks := atomic.Load(&work.cycles)
 485                 if gcphase != _GCmark {
 486                         // We've already completed this cycle's mark.
 487                         nMarks++
 488                 }
 489                 if nMarks > n {
 490                         // We're done.
 491                         unlock(&work.sweepWaiters.lock)
 492                         return
 493                 }
 494
 495                 // Wait until sweep termination, mark, and mark
 496                 // termination of cycle N complete.
 497                 work.sweepWaiters.list.push(getg())
 498                 goparkunlock(&work.sweepWaiters.lock, waitReasonWaitForGCCycle, traceEvGoBlock, 1)
 499         }
 500 }
 501
 502 // gcMode indicates how concurrent a GC cycle should be.
 503 type gcMode int
 504
 505 const (
 506         gcBackgroundMode gcMode = iota // concurrent GC and sweep
 507         gcForceMode                    // stop-the-world GC now, concurrent sweep
 508         gcForceBlockMode               // stop-the-world GC now and STW sweep (forced by user)
 509 )
 510
 511 // A gcTrigger is a predicate for starting a GC cycle. Specifically,
 512 // it is an exit condition for the _GCoff phase.
 513 type gcTrigger struct {
 514         kind gcTriggerKind
 515         now  int64  // gcTriggerTime: current time
 516         n    uint32 // gcTriggerCycle: cycle number to start
 517 }
 518
 519 type gcTriggerKind int
 520
 521 const (
 522         // gcTriggerHeap indicates that a cycle should be started when
 523         // the heap size reaches the trigger heap size computed by the
 524         // controller.
 525         gcTriggerHeap gcTriggerKind = iota
 526
 527         // gcTriggerTime indicates that a cycle should be started when
 528         // it's been more than forcegcperiod nanoseconds since the
 529         // previous GC cycle.
 530         gcTriggerTime
 531
 532         // gcTriggerCycle indicates that a cycle should be started if
 533         // we have not yet started cycle number gcTrigger.n (relative
 534         // to work.cycles).
 535         gcTriggerCycle
 536 )
 537
 538 // test reports whether the trigger condition is satisfied, meaning
 539 // that the exit condition for the _GCoff phase has been met. The exit
 540 // condition should be tested when allocating.
 541 func (t gcTrigger) test() bool {
 542         if !memstats.enablegc || panicking != 0 || gcphase != _GCoff {
 543                 return false
 544         }
 545         switch t.kind {
 546         case gcTriggerHeap:
 547                 // Non-atomic access to gcController.heapLive for performance. If
 548                 // we are going to trigger on this, this thread just
 549                 // atomically wrote gcController.heapLive anyway and we'll see our
 550                 // own write.
 551                 return gcController.heapLive >= gcController.trigger
 552         case gcTriggerTime:
 553                 if gcController.gcPercent < 0 {
 554                         return false
 555                 }
 556                 lastgc := int64(atomic.Load64(&memstats.last_gc_nanotime))
 557                 return lastgc != 0 && t.now-lastgc > forcegcperiod
 558         case gcTriggerCycle:
 559                 // t.n > work.cycles, but accounting for wraparound.
 560                 return int32(t.n-work.cycles) > 0
 561         }
 562         return true
 563 }
 564
 565 // gcStart starts the GC. It transitions from _GCoff to _GCmark (if
 566 // debug.gcstoptheworld == 0) or performs all of GC (if
 567 // debug.gcstoptheworld != 0).
 568 //
 569 // This may return without performing this transition in some cases,
 570 // such as when called on a system stack or with locks held.
 571 func gcStart(trigger gcTrigger) {
 572         // Since this is called from malloc and malloc is called in
 573         // the guts of a number of libraries that might be holding
 574         // locks, don't attempt to start GC in non-preemptible or
 575         // potentially unstable situations.
 576         mp := acquirem()
 577         if gp := getg(); gp == mp.g0 || mp.locks > 1 || mp.preemptoff != "" {
 578                 releasem(mp)
 579                 return
 580         }
 581         releasem(mp)
 582         mp = nil
 583
 584         // Pick up the remaining unswept/not being swept spans concurrently
 585         //
 586         // This shouldn't happen if we're being invoked in background
 587         // mode since proportional sweep should have just finished
 588         // sweeping everything, but rounding errors, etc, may leave a
 589         // few spans unswept. In forced mode, this is necessary since
 590         // GC can be forced at any point in the sweeping cycle.
 591         //
 592         // We check the transition condition continuously here in case
 593         // this G gets delayed in to the next GC cycle.
 594         for trigger.test() && sweepone() != ^uintptr(0) {
 595                 sweep.nbgsweep++
 596         }
 597
 598         // Perform GC initialization and the sweep termination
 599         // transition.
 600         semacquire(&work.startSema)
 601         // Re-check transition condition under transition lock.
 602         if !trigger.test() {
 603                 semrelease(&work.startSema)
 604                 return
 605         }
 606
 607         // For stats, check if this GC was forced by the user.
 608         work.userForced = trigger.kind == gcTriggerCycle
 609
 610         // In gcstoptheworld debug mode, upgrade the mode accordingly.
 611         // We do this after re-checking the transition condition so
 612         // that multiple goroutines that detect the heap trigger don't
 613         // start multiple STW GCs.
 614         mode := gcBackgroundMode
 615         if debug.gcstoptheworld == 1 {
 616                 mode = gcForceMode
 617         } else if debug.gcstoptheworld == 2 {
 618                 mode = gcForceBlockMode
 619         }
 620
 621         // Ok, we're doing it! Stop everybody else
 622         semacquire(&gcsema)
 623         semacquire(&worldsema)
 624
 625         if trace.enabled {
 626                 traceGCStart()
 627         }
 628
 629         // Check that all Ps have finished deferred mcache flushes.
 630         for _, p := range allp {
 631                 if fg := atomic.Load(&p.mcache.flushGen); fg != mheap_.sweepgen {
 632                         println("runtime: p", p.id, "flushGen", fg, "!= sweepgen", mheap_.sweepgen)
 633                         throw("p mcache not flushed")
 634                 }
 635         }
 636
 637         gcBgMarkStartWorkers()
 638
 639         systemstack(gcResetMarkState)
 640
 641         work.stwprocs, work.maxprocs = gomaxprocs, gomaxprocs
 642         if work.stwprocs > ncpu {
 643                 // This is used to compute CPU time of the STW phases,
 644                 // so it can't be more than ncpu, even if GOMAXPROCS is.
 645                 work.stwprocs = ncpu
 646         }
 647         work.heap0 = atomic.Load64(&gcController.heapLive)
 648         work.pauseNS = 0
 649         work.mode = mode
 650
 651         now := nanotime()
 652         work.tSweepTerm = now
 653         work.pauseStart = now
 654         if trace.enabled {
 655                 traceGCSTWStart(1)
 656         }
 657         systemstack(stopTheWorldWithSema)
 658         // Finish sweep before we start concurrent scan.
 659         systemstack(func() {
 660                 finishsweep_m()
 661         })
 662
 663         // clearpools before we start the GC. If we wait they memory will not be
 664         // reclaimed until the next GC cycle.
 665         clearpools()
 666
 667         work.cycles++
 668
 669         gcController.startCycle()
 670         work.heapGoal = gcController.heapGoal
 671
 672         // In STW mode, disable scheduling of user Gs. This may also
 673         // disable scheduling of this goroutine, so it may block as
 674         // soon as we start the world again.
 675         if mode != gcBackgroundMode {
 676                 schedEnableUser(false)
 677         }
 678
 679         // Enter concurrent mark phase and enable
 680         // write barriers.
 681         //
 682         // Because the world is stopped, all Ps will
 683         // observe that write barriers are enabled by
 684         // the time we start the world and begin
 685         // scanning.
 686         //
 687         // Write barriers must be enabled before assists are
 688         // enabled because they must be enabled before
 689         // any non-leaf heap objects are marked. Since
 690         // allocations are blocked until assists can
 691         // happen, we want enable assists as early as
 692         // possible.
 693         setGCPhase(_GCmark)
 694
 695         gcBgMarkPrepare() // Must happen before assist enable.
 696         gcMarkRootPrepare()
 697
 698         // Mark all active tinyalloc blocks. Since we're
 699         // allocating from these, they need to be black like
 700         // other allocations. The alternative is to blacken
 701         // the tiny block on every allocation from it, which
 702         // would slow down the tiny allocator.
 703         gcMarkTinyAllocs()
 704
 705         // At this point all Ps have enabled the write
 706         // barrier, thus maintaining the no white to
 707         // black invariant. Enable mutator assists to
 708         // put back-pressure on fast allocating
 709         // mutators.
 710         atomic.Store(&gcBlackenEnabled, 1)
 711
 712         // Assists and workers can start the moment we start
 713         // the world.
 714         gcController.markStartTime = now
 715
 716         // In STW mode, we could block the instant systemstack
 717         // returns, so make sure we're not preemptible.
 718         mp = acquirem()
 719
 720         // Concurrent mark.
 721         systemstack(func() {
 722                 now = startTheWorldWithSema(trace.enabled)
 723                 work.pauseNS += now - work.pauseStart
 724                 work.tMark = now
 725                 memstats.gcPauseDist.record(now - work.pauseStart)
 726         })
 727
 728         // Release the world sema before Gosched() in STW mode
 729         // because we will need to reacquire it later but before
 730         // this goroutine becomes runnable again, and we could
 731         // self-deadlock otherwise.
 732         semrelease(&worldsema)
 733         releasem(mp)
 734
 735         // Make sure we block instead of returning to user code
 736         // in STW mode.
 737         if mode != gcBackgroundMode {
 738                 Gosched()
 739         }
 740
 741         semrelease(&work.startSema)
 742 }
 743
 744 // gcMarkDoneFlushed counts the number of P's with flushed work.
 745 //
 746 // Ideally this would be a captured local in gcMarkDone, but forEachP
 747 // escapes its callback closure, so it can't capture anything.
 748 //
 749 // This is protected by markDoneSema.
 750 var gcMarkDoneFlushed uint32
 751
 752 // gcMarkDone transitions the GC from mark to mark termination if all
 753 // reachable objects have been marked (that is, there are no grey
 754 // objects and can be no more in the future). Otherwise, it flushes
 755 // all local work to the global queues where it can be discovered by
 756 // other workers.
 757 //
 758 // This should be called when all local mark work has been drained and
 759 // there are no remaining workers. Specifically, when
 760 //
 761 //   work.nwait == work.nproc && !gcMarkWorkAvailable(p)
 762 //
 763 // The calling context must be preemptible.
 764 //
 765 // Flushing local work is important because idle Ps may have local
 766 // work queued. This is the only way to make that work visible and
 767 // drive GC to completion.
 768 //
 769 // It is explicitly okay to have write barriers in this function. If
 770 // it does transition to mark termination, then all reachable objects
 771 // have been marked, so the write barrier cannot shade any more
 772 // objects.
 773 func gcMarkDone() {
 774         // Ensure only one thread is running the ragged barrier at a
 775         // time.
 776         semacquire(&work.markDoneSema)
 777
 778 top:
 779         // Re-check transition condition under transition lock.
 780         //
 781         // It's critical that this checks the global work queues are
 782         // empty before performing the ragged barrier. Otherwise,
 783         // there could be global work that a P could take after the P
 784         // has passed the ragged barrier.
 785         if !(gcphase == _GCmark && work.nwait == work.nproc && !gcMarkWorkAvailable(nil)) {
 786                 semrelease(&work.markDoneSema)
 787                 return
 788         }
 789
 790         // forEachP needs worldsema to execute, and we'll need it to
 791         // stop the world later, so acquire worldsema now.
 792         semacquire(&worldsema)
 793
 794         // Flush all local buffers and collect flushedWork flags.
 795         gcMarkDoneFlushed = 0
 796         systemstack(func() {
 797                 gp := getg().m.curg
 798                 // Mark the user stack as preemptible so that it may be scanned.
 799                 // Otherwise, our attempt to force all P's to a safepoint could
 800                 // result in a deadlock as we attempt to preempt a worker that's
 801                 // trying to preempt us (e.g. for a stack scan).
 802                 casgstatus(gp, _Grunning, _Gwaiting)
 803                 forEachP(func(_p_ *p) {
 804                         // Flush the write barrier buffer, since this may add
 805                         // work to the gcWork.
 806                         wbBufFlush1(_p_)
 807
 808                         // Flush the gcWork, since this may create global work
 809                         // and set the flushedWork flag.
 810                         //
 811                         // TODO(austin): Break up these workbufs to
 812                         // better distribute work.
 813                         _p_.gcw.dispose()
 814                         // Collect the flushedWork flag.
 815                         if _p_.gcw.flushedWork {
 816                                 atomic.Xadd(&gcMarkDoneFlushed, 1)
 817                                 _p_.gcw.flushedWork = false
 818                         }
 819                 })
 820                 casgstatus(gp, _Gwaiting, _Grunning)
 821         })
 822
 823         if gcMarkDoneFlushed != 0 {
 824                 // More grey objects were discovered since the
 825                 // previous termination check, so there may be more
 826                 // work to do. Keep going. It's possible the
 827                 // transition condition became true again during the
 828                 // ragged barrier, so re-check it.
 829                 semrelease(&worldsema)
 830                 goto top
 831         }
 832
 833         // There was no global work, no local work, and no Ps
 834         // communicated work since we took markDoneSema. Therefore
 835         // there are no grey objects and no more objects can be
 836         // shaded. Transition to mark termination.
 837         now := nanotime()
 838         work.tMarkTerm = now
 839         work.pauseStart = now
 840         getg().m.preemptoff = "gcing"
 841         if trace.enabled {
 842                 traceGCSTWStart(0)
 843         }
 844         systemstack(stopTheWorldWithSema)
 845         // The gcphase is _GCmark, it will transition to _GCmarktermination
 846         // below. The important thing is that the wb remains active until
 847         // all marking is complete. This includes writes made by the GC.
 848
 849         // There is sometimes work left over when we enter mark termination due
 850         // to write barriers performed after the completion barrier above.
 851         // Detect this and resume concurrent mark. This is obviously
 852         // unfortunate.
 853         //
 854         // See issue #27993 for details.
 855         //
 856         // Switch to the system stack to call wbBufFlush1, though in this case
 857         // it doesn't matter because we're non-preemptible anyway.
 858         restart := false
 859         systemstack(func() {
 860                 for _, p := range allp {
 861                         wbBufFlush1(p)
 862                         if !p.gcw.empty() {
 863                                 restart = true
 864                                 break
 865                         }
 866                 }
 867         })
 868         if restart {
 869                 getg().m.preemptoff = ""
 870                 systemstack(func() {
 871                         now := startTheWorldWithSema(true)
 872                         work.pauseNS += now - work.pauseStart
 873                         memstats.gcPauseDist.record(now - work.pauseStart)
 874                 })
 875                 semrelease(&worldsema)
 876                 goto top
 877         }
 878
 879         // Disable assists and background workers. We must do
 880         // this before waking blocked assists.
 881         atomic.Store(&gcBlackenEnabled, 0)
 882
 883         // Wake all blocked assists. These will run when we
 884         // start the world again.
 885         gcWakeAllAssists()
 886
 887         // Likewise, release the transition lock. Blocked
 888         // workers and assists will run when we start the
 889         // world again.
 890         semrelease(&work.markDoneSema)
 891
 892         // In STW mode, re-enable user goroutines. These will be
 893         // queued to run after we start the world.
 894         schedEnableUser(true)
 895
 896         // endCycle depends on all gcWork cache stats being flushed.
 897         // The termination algorithm above ensured that up to
 898         // allocations since the ragged barrier.
 899         nextTriggerRatio := gcController.endCycle(work.userForced)
 900
 901         // Perform mark termination. This will restart the world.
 902         gcMarkTermination(nextTriggerRatio)
 903 }
 904
 905 // World must be stopped and mark assists and background workers must be
 906 // disabled.
 907 func gcMarkTermination(nextTriggerRatio float64) {
 908         // Start marktermination (write barrier remains enabled for now).
 909         setGCPhase(_GCmarktermination)
 910
 911         work.heap1 = gcController.heapLive
 912         startTime := nanotime()
 913
 914         mp := acquirem()
 915         mp.preemptoff = "gcing"
 916         _g_ := getg()
 917         _g_.m.traceback = 2
 918         gp := _g_.m.curg
 919         casgstatus(gp, _Grunning, _Gwaiting)
 920         gp.waitreason = waitReasonGarbageCollection
 921
 922         // Run gc on the g0 stack. We do this so that the g stack
 923         // we're currently running on will no longer change. Cuts
 924         // the root set down a bit (g0 stacks are not scanned, and
 925         // we don't need to scan gc's internal state).  We also
 926         // need to switch to g0 so we can shrink the stack.
 927         systemstack(func() {
 928                 gcMark(startTime)
 929                 // Must return immediately.
 930                 // The outer function's stack may have moved
 931                 // during gcMark (it shrinks stacks, including the
 932                 // outer function's stack), so we must not refer
 933                 // to any of its variables. Return back to the
 934                 // non-system stack to pick up the new addresses
 935                 // before continuing.
 936         })
 937
 938         systemstack(func() {
 939                 work.heap2 = work.bytesMarked
 940                 if debug.gccheckmark > 0 {
 941                         // Run a full non-parallel, stop-the-world
 942                         // mark using checkmark bits, to check that we
 943                         // didn't forget to mark anything during the
 944                         // concurrent mark process.
 945                         startCheckmarks()
 946                         gcResetMarkState()
 947                         gcw := &getg().m.p.ptr().gcw
 948                         gcDrain(gcw, 0)
 949                         wbBufFlush1(getg().m.p.ptr())
 950                         gcw.dispose()
 951                         endCheckmarks()
 952                 }
 953
 954                 // marking is complete so we can turn the write barrier off
 955                 setGCPhase(_GCoff)
 956                 gcSweep(work.mode)
 957         })
 958
 959         _g_.m.traceback = 0
 960         casgstatus(gp, _Gwaiting, _Grunning)
 961
 962         if trace.enabled {
 963                 traceGCDone()
 964         }
 965
 966         // all done
 967         mp.preemptoff = ""
 968
 969         if gcphase != _GCoff {
 970                 throw("gc done but gcphase != _GCoff")
 971         }
 972
 973         // Record heapGoal and heap_inuse for scavenger.
 974         gcController.lastHeapGoal = gcController.heapGoal
 975         memstats.last_heap_inuse = memstats.heap_inuse
 976
 977         // Update GC trigger and pacing for the next cycle.
 978         gcController.commit(nextTriggerRatio)
 979
 980         // Update timing memstats
 981         now := nanotime()
 982         sec, nsec, _ := time_now()
 983         unixNow := sec*1e9 + int64(nsec)
 984         work.pauseNS += now - work.pauseStart
 985         work.tEnd = now
 986         memstats.gcPauseDist.record(now - work.pauseStart)
 987         atomic.Store64(&memstats.last_gc_unix, uint64(unixNow)) // must be Unix time to make sense to user
 988         atomic.Store64(&memstats.last_gc_nanotime, uint64(now)) // monotonic time for us
 989         memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
 990         memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
 991         memstats.pause_total_ns += uint64(work.pauseNS)
 992
 993         // Update work.totaltime.
 994         sweepTermCpu := int64(work.stwprocs) * (work.tMark - work.tSweepTerm)
 995         // We report idle marking time below, but omit it from the
 996         // overall utilization here since it's "free".
 997         markCpu := gcController.assistTime + gcController.dedicatedMarkTime + gcController.fractionalMarkTime
 998         markTermCpu := int64(work.stwprocs) * (work.tEnd - work.tMarkTerm)
 999         cycleCpu := sweepTermCpu + markCpu + markTermCpu
1000         work.totaltime += cycleCpu
1001
1002         // Compute overall GC CPU utilization.
1003         totalCpu := sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
1004         memstats.gc_cpu_fraction = float64(work.totaltime) / float64(totalCpu)
1005
1006         // Reset sweep state.
1007         sweep.nbgsweep = 0
1008         sweep.npausesweep = 0
1009
1010         if work.userForced {
1011                 memstats.numforcedgc++
1012         }
1013
1014         // Bump GC cycle count and wake goroutines waiting on sweep.
1015         lock(&work.sweepWaiters.lock)
1016         memstats.numgc++
1017         injectglist(&work.sweepWaiters.list)
1018         unlock(&work.sweepWaiters.lock)
1019
1020         // Finish the current heap profiling cycle and start a new
1021         // heap profiling cycle. We do this before starting the world
1022         // so events don't leak into the wrong cycle.
1023         mProf_NextCycle()
1024
1025         // There may be stale spans in mcaches that need to be swept.
1026         // Those aren't tracked in any sweep lists, so we need to
1027         // count them against sweep completion until we ensure all
1028         // those spans have been forced out.
1029         sl := newSweepLocker()
1030         sl.blockCompletion()
1031
1032         systemstack(func() { startTheWorldWithSema(true) })
1033
1034         // Flush the heap profile so we can start a new cycle next GC.
1035         // This is relatively expensive, so we don't do it with the
1036         // world stopped.
1037         mProf_Flush()
1038
1039         // Prepare workbufs for freeing by the sweeper. We do this
1040         // asynchronously because it can take non-trivial time.
1041         prepareFreeWorkbufs()
1042
1043         // Free stack spans. This must be done between GC cycles.
1044         systemstack(freeStackSpans)
1045
1046         // Ensure all mcaches are flushed. Each P will flush its own
1047         // mcache before allocating, but idle Ps may not. Since this
1048         // is necessary to sweep all spans, we need to ensure all
1049         // mcaches are flushed before we start the next GC cycle.
1050         systemstack(func() {
1051                 forEachP(func(_p_ *p) {
1052                         _p_.mcache.prepareForSweep()
1053                 })
1054         })
1055         // Now that we've swept stale spans in mcaches, they don't
1056         // count against unswept spans.
1057         sl.dispose()
1058
1059         // Print gctrace before dropping worldsema. As soon as we drop
1060         // worldsema another cycle could start and smash the stats
1061         // we're trying to print.
1062         if debug.gctrace > 0 {
1063                 util := int(memstats.gc_cpu_fraction * 100)
1064
1065                 var sbuf [24]byte
1066                 printlock()
1067                 print("gc ", memstats.numgc,
1068                         " @", string(itoaDiv(sbuf[:], uint64(work.tSweepTerm-runtimeInitTime)/1e6, 3)), "s ",
1069                         util, "%: ")
1070                 prev := work.tSweepTerm
1071                 for i, ns := range []int64{work.tMark, work.tMarkTerm, work.tEnd} {
1072                         if i != 0 {
1073                                 print("+")
1074                         }
1075                         print(string(fmtNSAsMS(sbuf[:], uint64(ns-prev))))
1076                         prev = ns
1077                 }
1078                 print(" ms clock, ")
1079                 for i, ns := range []int64{sweepTermCpu, gcController.assistTime, gcController.dedicatedMarkTime + gcController.fractionalMarkTime, gcController.idleMarkTime, markTermCpu} {
1080                         if i == 2 || i == 3 {
1081                                 // Separate mark time components with /.
1082                                 print("/")
1083                         } else if i != 0 {
1084                                 print("+")
1085                         }
1086                         print(string(fmtNSAsMS(sbuf[:], uint64(ns))))
1087                 }
1088                 print(" ms cpu, ",
1089                         work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
1090                         work.heapGoal>>20, " MB goal, ",
1091                         work.maxprocs, " P")
1092                 if work.userForced {
1093                         print(" (forced)")
1094                 }
1095                 print("\n")
1096                 printunlock()
1097         }
1098
1099         semrelease(&worldsema)
1100         semrelease(&gcsema)
1101         // Careful: another GC cycle may start now.
1102
1103         releasem(mp)
1104         mp = nil
1105
1106         // now that gc is done, kick off finalizer thread if needed
1107         if !concurrentSweep {
1108                 // give the queued finalizers, if any, a chance to run
1109                 Gosched()
1110         }
1111 }
1112
1113 // gcBgMarkStartWorkers prepares background mark worker goroutines. These
1114 // goroutines will not run until the mark phase, but they must be started while
1115 // the work is not stopped and from a regular G stack. The caller must hold
1116 // worldsema.
1117 func gcBgMarkStartWorkers() {
1118         // Background marking is performed by per-P G's. Ensure that each P has
1119         // a background GC G.
1120         //
1121         // Worker Gs don't exit if gomaxprocs is reduced. If it is raised
1122         // again, we can reuse the old workers; no need to create new workers.
1123         for gcBgMarkWorkerCount < gomaxprocs {
1124                 go gcBgMarkWorker()
1125
1126                 notetsleepg(&work.bgMarkReady, -1)
1127                 noteclear(&work.bgMarkReady)
1128                 // The worker is now guaranteed to be added to the pool before
1129                 // its P's next findRunnableGCWorker.
1130
1131                 gcBgMarkWorkerCount++
1132         }
1133 }
1134
1135 // gcBgMarkPrepare sets up state for background marking.
1136 // Mutator assists must not yet be enabled.
1137 func gcBgMarkPrepare() {
1138         // Background marking will stop when the work queues are empty
1139         // and there are no more workers (note that, since this is
1140         // concurrent, this may be a transient state, but mark
1141         // termination will clean it up). Between background workers
1142         // and assists, we don't really know how many workers there
1143         // will be, so we pretend to have an arbitrarily large number
1144         // of workers, almost all of which are "waiting". While a
1145         // worker is working it decrements nwait. If nproc == nwait,
1146         // there are no workers.
1147         work.nproc = ^uint32(0)
1148         work.nwait = ^uint32(0)
1149 }
1150
1151 // gcBgMarkWorker is an entry in the gcBgMarkWorkerPool. It points to a single
1152 // gcBgMarkWorker goroutine.
1153 type gcBgMarkWorkerNode struct {
1154         // Unused workers are managed in a lock-free stack. This field must be first.
1155         node lfnode
1156
1157         // The g of this worker.
1158         gp guintptr
1159
1160         // Release this m on park. This is used to communicate with the unlock
1161         // function, which cannot access the G's stack. It is unused outside of
1162         // gcBgMarkWorker().
1163         m muintptr
1164 }
1165
1166 func gcBgMarkWorker() {
1167         gp := getg()
1168
1169         // We pass node to a gopark unlock function, so it can't be on
1170         // the stack (see gopark). Prevent deadlock from recursively
1171         // starting GC by disabling preemption.
1172         gp.m.preemptoff = "GC worker init"
1173         node := new(gcBgMarkWorkerNode)
1174         gp.m.preemptoff = ""
1175
1176         node.gp.set(gp)
1177
1178         node.m.set(acquirem())
1179         notewakeup(&work.bgMarkReady)
1180         // After this point, the background mark worker is generally scheduled
1181         // cooperatively by gcController.findRunnableGCWorker. While performing
1182         // work on the P, preemption is disabled because we are working on
1183         // P-local work buffers. When the preempt flag is set, this puts itself
1184         // into _Gwaiting to be woken up by gcController.findRunnableGCWorker
1185         // at the appropriate time.
1186         //
1187         // When preemption is enabled (e.g., while in gcMarkDone), this worker
1188         // may be preempted and schedule as a _Grunnable G from a runq. That is
1189         // fine; it will eventually gopark again for further scheduling via
1190         // findRunnableGCWorker.
1191         //
1192         // Since we disable preemption before notifying bgMarkReady, we
1193         // guarantee that this G will be in the worker pool for the next
1194         // findRunnableGCWorker. This isn't strictly necessary, but it reduces
1195         // latency between _GCmark starting and the workers starting.
1196
1197         for {
1198                 // Go to sleep until woken by
1199                 // gcController.findRunnableGCWorker.
1200                 gopark(func(g *g, nodep unsafe.Pointer) bool {
1201                         node := (*gcBgMarkWorkerNode)(nodep)
1202
1203                         if mp := node.m.ptr(); mp != nil {
1204                                 // The worker G is no longer running; release
1205                                 // the M.
1206                                 //
1207                                 // N.B. it is _safe_ to release the M as soon
1208                                 // as we are no longer performing P-local mark
1209                                 // work.
1210                                 //
1211                                 // However, since we cooperatively stop work
1212                                 // when gp.preempt is set, if we releasem in
1213                                 // the loop then the following call to gopark
1214                                 // would immediately preempt the G. This is
1215                                 // also safe, but inefficient: the G must
1216                                 // schedule again only to enter gopark and park
1217                                 // again. Thus, we defer the release until
1218                                 // after parking the G.
1219                                 releasem(mp)
1220                         }
1221
1222                         // Release this G to the pool.
1223                         gcBgMarkWorkerPool.push(&node.node)
1224                         // Note that at this point, the G may immediately be
1225                         // rescheduled and may be running.
1226                         return true
1227                 }, unsafe.Pointer(node), waitReasonGCWorkerIdle, traceEvGoBlock, 0)
1228
1229                 // Preemption must not occur here, or another G might see
1230                 // p.gcMarkWorkerMode.
1231
1232                 // Disable preemption so we can use the gcw. If the
1233                 // scheduler wants to preempt us, we'll stop draining,
1234                 // dispose the gcw, and then preempt.
1235                 node.m.set(acquirem())
1236                 pp := gp.m.p.ptr() // P can't change with preemption disabled.
1237
1238                 if gcBlackenEnabled == 0 {
1239                         println("worker mode", pp.gcMarkWorkerMode)
1240                         throw("gcBgMarkWorker: blackening not enabled")
1241                 }
1242
1243                 if pp.gcMarkWorkerMode == gcMarkWorkerNotWorker {
1244                         throw("gcBgMarkWorker: mode not set")
1245                 }
1246
1247                 startTime := nanotime()
1248                 pp.gcMarkWorkerStartTime = startTime
1249
1250                 decnwait := atomic.Xadd(&work.nwait, -1)
1251                 if decnwait == work.nproc {
1252                         println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
1253                         throw("work.nwait was > work.nproc")
1254                 }
1255
1256                 systemstack(func() {
1257                         // Mark our goroutine preemptible so its stack
1258                         // can be scanned. This lets two mark workers
1259                         // scan each other (otherwise, they would
1260                         // deadlock). We must not modify anything on
1261                         // the G stack. However, stack shrinking is
1262                         // disabled for mark workers, so it is safe to
1263                         // read from the G stack.
1264                         casgstatus(gp, _Grunning, _Gwaiting)
1265                         switch pp.gcMarkWorkerMode {
1266                         default:
1267                                 throw("gcBgMarkWorker: unexpected gcMarkWorkerMode")
1268                         case gcMarkWorkerDedicatedMode:
1269                                 gcDrain(&pp.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
1270                                 if gp.preempt {
1271                                         // We were preempted. This is
1272                                         // a useful signal to kick
1273                                         // everything out of the run
1274                                         // queue so it can run
1275                                         // somewhere else.
1276                                         if drainQ, n := runqdrain(pp); n > 0 {
1277                                                 lock(&sched.lock)
1278                                                 globrunqputbatch(&drainQ, int32(n))
1279                                                 unlock(&sched.lock)
1280                                         }
1281                                 }
1282                                 // Go back to draining, this time
1283                                 // without preemption.
1284                                 gcDrain(&pp.gcw, gcDrainFlushBgCredit)
1285                         case gcMarkWorkerFractionalMode:
1286                                 gcDrain(&pp.gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit)
1287                         case gcMarkWorkerIdleMode:
1288                                 gcDrain(&pp.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit)
1289                         }
1290                         casgstatus(gp, _Gwaiting, _Grunning)
1291                 })
1292
1293                 // Account for time.
1294                 duration := nanotime() - startTime
1295                 switch pp.gcMarkWorkerMode {
1296                 case gcMarkWorkerDedicatedMode:
1297                         atomic.Xaddint64(&gcController.dedicatedMarkTime, duration)
1298                         atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1)
1299                 case gcMarkWorkerFractionalMode:
1300                         atomic.Xaddint64(&gcController.fractionalMarkTime, duration)
1301                         atomic.Xaddint64(&pp.gcFractionalMarkTime, duration)
1302                 case gcMarkWorkerIdleMode:
1303                         atomic.Xaddint64(&gcController.idleMarkTime, duration)
1304                 }
1305
1306                 // Was this the last worker and did we run out
1307                 // of work?
1308                 incnwait := atomic.Xadd(&work.nwait, +1)
1309                 if incnwait > work.nproc {
1310                         println("runtime: p.gcMarkWorkerMode=", pp.gcMarkWorkerMode,
1311                                 "work.nwait=", incnwait, "work.nproc=", work.nproc)
1312                         throw("work.nwait > work.nproc")
1313                 }
1314
1315                 // We'll releasem after this point and thus this P may run
1316                 // something else. We must clear the worker mode to avoid
1317                 // attributing the mode to a different (non-worker) G in
1318                 // traceGoStart.
1319                 pp.gcMarkWorkerMode = gcMarkWorkerNotWorker
1320
1321                 // If this worker reached a background mark completion
1322                 // point, signal the main GC goroutine.
1323                 if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
1324                         // We don't need the P-local buffers here, allow
1325                         // preemption becuse we may schedule like a regular
1326                         // goroutine in gcMarkDone (block on locks, etc).
1327                         releasem(node.m.ptr())
1328                         node.m.set(nil)
1329
1330                         gcMarkDone()
1331                 }
1332         }
1333 }
1334
1335 // gcMarkWorkAvailable reports whether executing a mark worker
1336 // on p is potentially useful. p may be nil, in which case it only
1337 // checks the global sources of work.
1338 func gcMarkWorkAvailable(p *p) bool {
1339         if p != nil && !p.gcw.empty() {
1340                 return true
1341         }
1342         if !work.full.empty() {
1343                 return true // global work available
1344         }
1345         if work.markrootNext < work.markrootJobs {
1346                 return true // root scan work available
1347         }
1348         return false
1349 }
1350
1351 // gcMark runs the mark (or, for concurrent GC, mark termination)
1352 // All gcWork caches must be empty.
1353 // STW is in effect at this point.
1354 func gcMark(startTime int64) {
1355         if debug.allocfreetrace > 0 {
1356                 tracegc()
1357         }
1358
1359         if gcphase != _GCmarktermination {
1360                 throw("in gcMark expecting to see gcphase as _GCmarktermination")
1361         }
1362         work.tstart = startTime
1363
1364         // Check that there's no marking work remaining.
1365         if work.full != 0 || work.markrootNext < work.markrootJobs {
1366                 print("runtime: full=", hex(work.full), " next=", work.markrootNext, " jobs=", work.markrootJobs, " nDataRoots=", work.nDataRoots, " nBSSRoots=", work.nBSSRoots, " nSpanRoots=", work.nSpanRoots, " nStackRoots=", work.nStackRoots, "\n")
1367                 panic("non-empty mark queue after concurrent mark")
1368         }
1369
1370         if debug.gccheckmark > 0 {
1371                 // This is expensive when there's a large number of
1372                 // Gs, so only do it if checkmark is also enabled.
1373                 gcMarkRootCheck()
1374         }
1375         if work.full != 0 {
1376                 throw("work.full != 0")
1377         }
1378
1379         // Clear out buffers and double-check that all gcWork caches
1380         // are empty. This should be ensured by gcMarkDone before we
1381         // enter mark termination.
1382         //
1383         // TODO: We could clear out buffers just before mark if this
1384         // has a non-negligible impact on STW time.
1385         for _, p := range allp {
1386                 // The write barrier may have buffered pointers since
1387                 // the gcMarkDone barrier. However, since the barrier
1388                 // ensured all reachable objects were marked, all of
1389                 // these must be pointers to black objects. Hence we
1390                 // can just discard the write barrier buffer.
1391                 if debug.gccheckmark > 0 {
1392                         // For debugging, flush the buffer and make
1393                         // sure it really was all marked.
1394                         wbBufFlush1(p)
1395                 } else {
1396                         p.wbBuf.reset()
1397                 }
1398
1399                 gcw := &p.gcw
1400                 if !gcw.empty() {
1401                         printlock()
1402                         print("runtime: P ", p.id, " flushedWork ", gcw.flushedWork)
1403                         if gcw.wbuf1 == nil {
1404                                 print(" wbuf1=<nil>")
1405                         } else {
1406                                 print(" wbuf1.n=", gcw.wbuf1.nobj)
1407                         }
1408                         if gcw.wbuf2 == nil {
1409                                 print(" wbuf2=<nil>")
1410                         } else {
1411                                 print(" wbuf2.n=", gcw.wbuf2.nobj)
1412                         }
1413                         print("\n")
1414                         throw("P has cached GC work at end of mark termination")
1415                 }
1416                 // There may still be cached empty buffers, which we
1417                 // need to flush since we're going to free them. Also,
1418                 // there may be non-zero stats because we allocated
1419                 // black after the gcMarkDone barrier.
1420                 gcw.dispose()
1421         }
1422
1423         // Update the marked heap stat.
1424         gcController.heapMarked = work.bytesMarked
1425
1426         // Flush scanAlloc from each mcache since we're about to modify
1427         // heapScan directly. If we were to flush this later, then scanAlloc
1428         // might have incorrect information.
1429         for _, p := range allp {
1430                 c := p.mcache
1431                 if c == nil {
1432                         continue
1433                 }
1434                 gcController.heapScan += uint64(c.scanAlloc)
1435                 c.scanAlloc = 0
1436         }
1437
1438         // Update other GC heap size stats. This must happen after
1439         // cachestats (which flushes local statistics to these) and
1440         // flushallmcaches (which modifies gcController.heapLive).
1441         gcController.heapLive = work.bytesMarked
1442         gcController.heapScan = uint64(gcController.scanWork)
1443
1444         if trace.enabled {
1445                 traceHeapAlloc()
1446         }
1447 }
1448
1449 // gcSweep must be called on the system stack because it acquires the heap
1450 // lock. See mheap for details.
1451 //
1452 // The world must be stopped.
1453 //
1454 //go:systemstack
1455 func gcSweep(mode gcMode) {
1456         assertWorldStopped()
1457
1458         if gcphase != _GCoff {
1459                 throw("gcSweep being done but phase is not GCoff")
1460         }
1461
1462         lock(&mheap_.lock)
1463         mheap_.sweepgen += 2
1464         mheap_.sweepDrained = 0
1465         mheap_.pagesSwept = 0
1466         mheap_.sweepArenas = mheap_.allArenas
1467         mheap_.reclaimIndex = 0
1468         mheap_.reclaimCredit = 0
1469         unlock(&mheap_.lock)
1470
1471         sweep.centralIndex.clear()
1472
1473         if !_ConcurrentSweep || mode == gcForceBlockMode {
1474                 // Special case synchronous sweep.
1475                 // Record that no proportional sweeping has to happen.
1476                 lock(&mheap_.lock)
1477                 mheap_.sweepPagesPerByte = 0
1478                 unlock(&mheap_.lock)
1479                 // Sweep all spans eagerly.
1480                 for sweepone() != ^uintptr(0) {
1481                         sweep.npausesweep++
1482                 }
1483                 // Free workbufs eagerly.
1484                 prepareFreeWorkbufs()
1485                 for freeSomeWbufs(false) {
1486                 }
1487                 // All "free" events for this mark/sweep cycle have
1488                 // now happened, so we can make this profile cycle
1489                 // available immediately.
1490                 mProf_NextCycle()
1491                 mProf_Flush()
1492                 return
1493         }
1494
1495         // Background sweep.
1496         lock(&sweep.lock)
1497         if sweep.parked {
1498                 sweep.parked = false
1499                 ready(sweep.g, 0, true)
1500         }
1501         unlock(&sweep.lock)
1502 }
1503
1504 // gcResetMarkState resets global state prior to marking (concurrent
1505 // or STW) and resets the stack scan state of all Gs.
1506 //
1507 // This is safe to do without the world stopped because any Gs created
1508 // during or after this will start out in the reset state.
1509 //
1510 // gcResetMarkState must be called on the system stack because it acquires
1511 // the heap lock. See mheap for details.
1512 //
1513 //go:systemstack
1514 func gcResetMarkState() {
1515         // This may be called during a concurrent phase, so lock to make sure
1516         // allgs doesn't change.
1517         forEachG(func(gp *g) {
1518                 gp.gcscandone = false // set to true in gcphasework
1519                 gp.gcAssistBytes = 0
1520         })
1521
1522         // Clear page marks. This is just 1MB per 64GB of heap, so the
1523         // time here is pretty trivial.
1524         lock(&mheap_.lock)
1525         arenas := mheap_.allArenas
1526         unlock(&mheap_.lock)
1527         for _, ai := range arenas {
1528                 ha := mheap_.arenas[ai.l1()][ai.l2()]
1529                 for i := range ha.pageMarks {
1530                         ha.pageMarks[i] = 0
1531                 }
1532         }
1533
1534         work.bytesMarked = 0
1535         work.initialHeapLive = atomic.Load64(&gcController.heapLive)
1536 }
1537
1538 // Hooks for other packages
1539
1540 var poolcleanup func()
1541
1542 //go:linkname sync_runtime_registerPoolCleanup sync.runtime_registerPoolCleanup
1543 func sync_runtime_registerPoolCleanup(f func()) {
1544         poolcleanup = f
1545 }
1546
1547 func clearpools() {
1548         // clear sync.Pools
1549         if poolcleanup != nil {
1550                 poolcleanup()
1551         }
1552
1553         // Clear central sudog cache.
1554         // Leave per-P caches alone, they have strictly bounded size.
1555         // Disconnect cached list before dropping it on the floor,
1556         // so that a dangling ref to one entry does not pin all of them.
1557         lock(&sched.sudoglock)
1558         var sg, sgnext *sudog
1559         for sg = sched.sudogcache; sg != nil; sg = sgnext {
1560                 sgnext = sg.next
1561                 sg.next = nil
1562         }
1563         sched.sudogcache = nil
1564         unlock(&sched.sudoglock)
1565
1566         // Clear central defer pools.
1567         // Leave per-P pools alone, they have strictly bounded size.
1568         lock(&sched.deferlock)
1569         for i := range sched.deferpool {
1570                 // disconnect cached list before dropping it on the floor,
1571                 // so that a dangling ref to one entry does not pin all of them.
1572                 var d, dlink *_defer
1573                 for d = sched.deferpool[i]; d != nil; d = dlink {
1574                         dlink = d.link
1575                         d.link = nil
1576                 }
1577                 sched.deferpool[i] = nil
1578         }
1579         unlock(&sched.deferlock)
1580 }
1581
1582 // Timing
1583
1584 // itoaDiv formats val/(10**dec) into buf.
1585 func itoaDiv(buf []byte, val uint64, dec int) []byte {
1586         i := len(buf) - 1
1587         idec := i - dec
1588         for val >= 10 || i >= idec {
1589                 buf[i] = byte(val%10 + '0')
1590                 i--
1591                 if i == idec {
1592                         buf[i] = '.'
1593                         i--
1594                 }
1595                 val /= 10
1596         }
1597         buf[i] = byte(val + '0')
1598         return buf[i:]
1599 }
1600
1601 // fmtNSAsMS nicely formats ns nanoseconds as milliseconds.
1602 func fmtNSAsMS(buf []byte, ns uint64) []byte {
1603         if ns >= 10e6 {
1604                 // Format as whole milliseconds.
1605                 return itoaDiv(buf, ns/1e6, 0)
1606         }
1607         // Format two digits of precision, with at most three decimal places.
1608         x := ns / 1e3
1609         if x == 0 {
1610                 buf[0] = '0'
1611                 return buf[:1]
1612         }
1613         dec := 3
1614         for x >= 100 {
1615                 x /= 10
1616                 dec--
1617         }
1618         return itoaDiv(buf, x, dec)
1619 }
1620
1621 // Helpers for testing GC.
1622
1623 // gcTestMoveStackOnNextCall causes the stack to be moved on a call
1624 // immediately following the call to this. It may not work correctly
1625 // if any other work appears after this call (such as returning).
1626 // Typically the following call should be marked go:noinline so it
1627 // performs a stack check.
1628 //
1629 // In rare cases this may not cause the stack to move, specifically if
1630 // there's a preemption between this call and the next.
1631 func gcTestMoveStackOnNextCall() {
1632         gp := getg()
1633         gp.stackguard0 = stackForceMove
1634 }
1635
1636 // gcTestIsReachable performs a GC and returns a bit set where bit i
1637 // is set if ptrs[i] is reachable.
1638 func gcTestIsReachable(ptrs ...unsafe.Pointer) (mask uint64) {
1639         // This takes the pointers as unsafe.Pointers in order to keep
1640         // them live long enough for us to attach specials. After
1641         // that, we drop our references to them.
1642
1643         if len(ptrs) > 64 {
1644                 panic("too many pointers for uint64 mask")
1645         }
1646
1647         // Block GC while we attach specials and drop our references
1648         // to ptrs. Otherwise, if a GC is in progress, it could mark
1649         // them reachable via this function before we have a chance to
1650         // drop them.
1651         semacquire(&gcsema)
1652
1653         // Create reachability specials for ptrs.
1654         specials := make([]*specialReachable, len(ptrs))
1655         for i, p := range ptrs {
1656                 lock(&mheap_.speciallock)
1657                 s := (*specialReachable)(mheap_.specialReachableAlloc.alloc())
1658                 unlock(&mheap_.speciallock)
1659                 s.special.kind = _KindSpecialReachable
1660                 if !addspecial(p, &s.special) {
1661                         throw("already have a reachable special (duplicate pointer?)")
1662                 }
1663                 specials[i] = s
1664                 // Make sure we don't retain ptrs.
1665                 ptrs[i] = nil
1666         }
1667
1668         semrelease(&gcsema)
1669
1670         // Force a full GC and sweep.
1671         GC()
1672
1673         // Process specials.
1674         for i, s := range specials {
1675                 if !s.done {
1676                         printlock()
1677                         println("runtime: object", i, "was not swept")
1678                         throw("IsReachable failed")
1679                 }
1680                 if s.reachable {
1681                         mask |= 1 << i
1682                 }
1683                 lock(&mheap_.speciallock)
1684                 mheap_.specialReachableAlloc.free(unsafe.Pointer(s))
1685                 unlock(&mheap_.speciallock)
1686         }
1687
1688         return mask
1689 }
1690
1691 // gcTestPointerClass returns the category of what p points to, one of:
1692 // "heap", "stack", "data", "bss", "other". This is useful for checking
1693 // that a test is doing what it's intended to do.
1694 //
1695 // This is nosplit simply to avoid extra pointer shuffling that may
1696 // complicate a test.
1697 //
1698 //go:nosplit
1699 func gcTestPointerClass(p unsafe.Pointer) string {
1700         p2 := uintptr(noescape(p))
1701         gp := getg()
1702         if gp.stack.lo <= p2 && p2 < gp.stack.hi {
1703                 return "stack"
1704         }
1705         if base, _, _ := findObject(p2, 0, 0); base != 0 {
1706                 return "heap"
1707         }
1708         for _, datap := range activeModules() {
1709                 if datap.data <= p2 && p2 < datap.edata || datap.noptrdata <= p2 && p2 < datap.enoptrdata {
1710                         return "data"
1711                 }
1712                 if datap.bss <= p2 && p2 < datap.ebss || datap.noptrbss <= p2 && p2 <= datap.enoptrbss {
1713                         return "bss"
1714                 }
1715         }
1716         KeepAlive(p)
1717         return "other"
1718 }