package runtime
import (
+ "internal/abi"
"internal/goarch"
"runtime/internal/atomic"
+ "runtime/internal/sys"
"unsafe"
)
// Gs may be created after this point, but it's okay that we
// ignore them because they begin life without any roots, so
// there's nothing to scan, and any roots they create during
// the concurrent phase will be caught by the write barrier.
- work.nStackRoots = int(atomic.Loaduintptr(&allglen))
+ work.stackRoots = allGsSnapshot()
+ work.nStackRoots = len(work.stackRoots)
work.markrootNext = 0
work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots)
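// The job index space handed to markroot is flat; a sketch of how an index
// decomposes into a root class, assuming the bases are laid out in the order
// used above (fixed roots, then data, BSS, span, and stack roots):
//
//	switch {
//	case i < work.baseData:
//		// fixed roots (finalizers, free G stacks, ...)
//	case i < work.baseBSS:
//		// data segment shard i - work.baseData
//	case i < work.baseSpans:
//		// BSS segment shard i - work.baseBSS
//	case i < work.baseStacks:
//		// span specials shard i - work.baseSpans
//	default:
//		// stack of work.stackRoots[i-work.baseStacks]
//	}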
//
// Preemption must be disabled (because this uses a gcWork).
//
+// Returns the amount of GC work credit produced by the operation.
+// If flushBgCredit is true, then that credit is also flushed
+// to the background credit pool.
+//
// nowritebarrier is only advisory here.
//
//go:nowritebarrier
-func markroot(gcw *gcWork, i uint32) {
+func markroot(gcw *gcWork, i uint32, flushBgCredit bool) int64 {
// Note: if you add a case here, please also update heapdump.go:dumproots.
+ var workDone int64
+ var workCounter *atomic.Int64
switch {
case work.baseData <= i && i < work.baseBSS:
+ workCounter = &gcController.globalsScanWork
for _, datap := range activeModules() {
- markrootBlock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, gcw, int(i-work.baseData))
+ workDone += markrootBlock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, gcw, int(i-work.baseData))
}
case work.baseBSS <= i && i < work.baseSpans:
+ workCounter = &gcController.globalsScanWork
for _, datap := range activeModules() {
- markrootBlock(datap.bss, datap.ebss-datap.bss, datap.gcbssmask.bytedata, gcw, int(i-work.baseBSS))
+ workDone += markrootBlock(datap.bss, datap.ebss-datap.bss, datap.gcbssmask.bytedata, gcw, int(i-work.baseBSS))
}
case i == fixedRootFinalizers:
default:
// the rest is scanning goroutine stacks
- var gp *g
- if work.baseStacks <= i && i < work.baseEnd {
- // N.B. Atomic read of allglen in gcMarkRootPrepare
- // acts as a barrier to ensure that allgs must be large
- // enough to contain all relevant Gs.
- gp = allgs[i-work.baseStacks]
- } else {
+ workCounter = &gcController.stackScanWork
+ if i < work.baseStacks || work.baseEnd <= i {
+ printlock()
+ print("runtime: markroot index ", i, " not in stack roots range [", work.baseStacks, ", ", work.baseEnd, ")\n")
throw("markroot: bad index")
}
+ gp := work.stackRoots[i-work.baseStacks]
// remember when we've first observed the G blocked
// needed only to output in traceback
userG := getg().m.curg
selfScan := gp == userG && readgstatus(userG) == _Grunning
if selfScan {
- casgstatus(userG, _Grunning, _Gwaiting)
- userG.waitreason = waitReasonGarbageCollectionScan
+ casGToWaiting(userG, _Grunning, waitReasonGarbageCollectionScan)
}
// TODO: suspendG blocks (and spins) until gp
// stops, which may take a while for running goroutines.
stopped := suspendG(gp)
if gp.gcscandone {
throw("g already scanned")
}
- scanstack(gp, gcw)
+ workDone += scanstack(gp, gcw)
gp.gcscandone = true
resumeG(stopped)
}
})
}
+ if workCounter != nil && workDone != 0 {
+ workCounter.Add(workDone)
+ if flushBgCredit {
+ gcFlushBgCredit(workDone)
+ }
+ }
+ return workDone
}
// markrootBlock scans the shard'th shard of the block of memory [b0,
// b0+n0), with the given pointer mask.
//
+// Returns the amount of work done.
+//
//go:nowritebarrier
-func markrootBlock(b0, n0 uintptr, ptrmask0 *uint8, gcw *gcWork, shard int) {
+func markrootBlock(b0, n0 uintptr, ptrmask0 *uint8, gcw *gcWork, shard int) int64 {
if rootBlockBytes%(8*goarch.PtrSize) != 0 {
// This is necessary to pick byte offsets in ptrmask0.
throw("rootBlockBytes must be a multiple of 8*ptrSize")
// These tests are written to avoid any possible overflow.
off := uintptr(shard) * rootBlockBytes
if off >= n0 {
- return
+ return 0
}
b := b0 + off
ptrmask := (*uint8)(add(unsafe.Pointer(ptrmask0), uintptr(shard)*(rootBlockBytes/(8*goarch.PtrSize))))
n := uintptr(rootBlockBytes)
if off+n > n0 {
n = n0 - off
}
// Scan this shard.
scanblock(b, n, ptrmask, gcw, nil)
+ return int64(n)
}
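// For context, the shard count per block follows from the same constant; a
// sketch mirroring how gcMarkRootPrepare sizes the data root jobs, where
// divRoundUp is the runtime's ceiling-division helper:
//
//	nDataRoots := int(divRoundUp(datap.edata-datap.data, rootBlockBytes))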
// markrootFreeGStacks frees stacks of dead Gs.
// Mark everything that can be reached from
// the object (but *not* the object itself or
// we'll never collect it).
- scanobject(p, gcw)
+ if !s.spanclass.noscan() {
+ scanobject(p, gcw)
+ }
// The special itself is a root.
scanblock(uintptr(unsafe.Pointer(&spf.fn)), goarch.PtrSize, &oneptrmask[0], gcw, nil)
}
// gcAssistAlloc performs GC work to make gp's assist debt positive.
-// gp must be the calling user gorountine.
+// gp must be the calling user goroutine.
//
// This must be called with preemption enabled.
func gcAssistAlloc(gp *g) {
traced := false
retry:
+ if gcCPULimiter.limiting() {
+ // If the CPU limiter is enabled, intentionally don't
+ // assist to reduce the amount of CPU time spent in the GC.
+ if traced {
+ traceGCMarkAssistDone()
+ }
+ return
+ }
// Compute the amount of scan work we need to do to make the
// balance positive. When the required amount of work is low,
// we over-assist to build up credit for future allocations
// and amortize the cost of assisting.
- assistWorkPerByte := float64frombits(atomic.Load64(&gcController.assistWorkPerByte))
- assistBytesPerWork := float64frombits(atomic.Load64(&gcController.assistBytesPerWork))
+ assistWorkPerByte := gcController.assistWorkPerByte.Load()
+ assistBytesPerWork := gcController.assistBytesPerWork.Load()
debtBytes := -gp.gcAssistBytes
scanWork := int64(assistWorkPerByte * float64(debtBytes))
if scanWork < gcOverAssistWork {
// will just cause steals to fail until credit is accumulated
// again, so in the long run it doesn't really matter, but we
// do have to handle the negative credit case.
- bgScanCredit := atomic.Loadint64(&gcController.bgScanCredit)
+ bgScanCredit := gcController.bgScanCredit.Load()
stolen := int64(0)
if bgScanCredit > 0 {
if bgScanCredit < scanWork {
stolen = bgScanCredit
gp.gcAssistBytes += 1 + int64(assistBytesPerWork*float64(stolen))
} else {
stolen = scanWork
gp.gcAssistBytes += debtBytes
}
- atomic.Xaddint64(&gcController.bgScanCredit, -stolen)
+ gcController.bgScanCredit.Add(-stolen)
scanWork -= stolen
}
}
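// A worked sketch of the conversion above, with made-up numbers: suppose
// assistWorkPerByte = 0.5 (so assistBytesPerWork = 2) and the goroutine
// owes 4096 bytes (gp.gcAssistBytes = -4096). Then debtBytes = 4096 and
// scanWork = int64(0.5*4096) = 2048 units. Stealing 1024 units of
// background credit pays off int64(2*1024) = 2048 bytes of that debt
// (plus the 1-byte round-up in the partial-steal branch).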
- if trace.enabled && !traced {
+ if traceEnabled() && !traced {
traced = true
traceGCMarkAssistStart()
}
// The gcBlackenEnabled check in malloc races with the
// store that clears it but an atomic check in every malloc
// would be a performance hit.
- // Instead we recheck it here on the non-preemptable system
+ // Instead we recheck it here on the non-preemptible system
// stack to determine if we should perform an assist.
// GC is done, so ignore any remaining debt.
// Track time spent in this assist. Since we're on the
// system stack, this is non-preemptible, so we can
// just measure start and end time.
+ //
+ // Limiter event tracking might be disabled if we end up here
+ // while on a mark worker.
startTime := nanotime()
+ trackLimiterEvent := gp.m.p.ptr().limiterEvent.start(limiterEventMarkAssist, startTime)
decnwait := atomic.Xadd(&work.nwait, -1)
if decnwait == work.nproc {
println("runtime: work.nwait =", decnwait, "work.nproc=", work.nproc)
throw("nwait > work.nprocs")
}
// gcDrainN requires the caller to be preemptible.
- casgstatus(gp, _Grunning, _Gwaiting)
- gp.waitreason = waitReasonGCAssistMarking
+ casGToWaiting(gp, _Grunning, waitReasonGCAssistMarking)
// drain own cached work first in the hopes that it
// will be more cache friendly.
// this scan work counts for. The "1+" is a poor man's
// round-up, to ensure this adds credit even if
// assistBytesPerWork is very low.
- assistBytesPerWork := float64frombits(atomic.Load64(&gcController.assistBytesPerWork))
+ assistBytesPerWork := gcController.assistBytesPerWork.Load()
gp.gcAssistBytes += 1 + int64(assistBytesPerWork*float64(workDone))
// If this is the last worker and we ran out of work,
// a valid pointer).
gp.param = unsafe.Pointer(gp)
}
- duration := nanotime() - startTime
- _p_ := gp.m.p.ptr()
- _p_.gcAssistTime += duration
- if _p_.gcAssistTime > gcAssistTimeSlack {
- atomic.Xaddint64(&gcController.assistTime, _p_.gcAssistTime)
- _p_.gcAssistTime = 0
+ now := nanotime()
+ duration := now - startTime
+ pp := gp.m.p.ptr()
+ pp.gcAssistTime += duration
+ if trackLimiterEvent {
+ pp.limiterEvent.stop(limiterEventMarkAssist, now)
+ }
+ if pp.gcAssistTime > gcAssistTimeSlack {
+ gcController.assistTime.Add(pp.gcAssistTime)
+ gcCPULimiter.update(now)
+ pp.gcAssistTime = 0
}
}
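// The per-P gcAssistTime accumulator above batches flushes to the global
// assistTime so the shared atomic isn't hit on every assist. A minimal
// sketch of the pattern, with hypothetical names (batcher, slack):
//
//	type batcher struct {
//		local  int64         // owned by one P; no synchronization
//		global *atomic.Int64 // shared; updated at most once per slack
//	}
//
//	func (b *batcher) add(v, slack int64) {
//		b.local += v
//		if b.local > slack {
//			b.global.Add(b.local)
//			b.local = 0
//		}
//	}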
//
// gcParkAssist reports whether the assist is now satisfied. If it
// returns false, the caller must retry the assist.
-//
-//go:nowritebarrier
func gcParkAssist() bool {
lock(&work.assistQueue.lock)
// If the GC cycle finished while we were getting the lock,
// the queue, but can still back out. This avoids a
// race in case background marking has flushed more
// credit since we checked above.
- if atomic.Loadint64(&gcController.bgScanCredit) > 0 {
+ if gcController.bgScanCredit.Load() > 0 {
work.assistQueue.q = oldList
if oldList.tail != 0 {
oldList.tail.ptr().schedlink.set(nil)
}
unlock(&work.assistQueue.lock)
return false
}
// Park.
- goparkunlock(&work.assistQueue.lock, waitReasonGCAssistWait, traceEvGoBlockGC, 2)
+ goparkunlock(&work.assistQueue.lock, waitReasonGCAssistWait, traceBlockGCMarkAssist, 2)
return true
}
// small window here where an assist may add itself to
// the blocked queue and park. If that happens, we'll
// just get it on the next flush.
- atomic.Xaddint64(&gcController.bgScanCredit, scanWork)
+ gcController.bgScanCredit.Add(scanWork)
return
}
- assistBytesPerWork := float64frombits(atomic.Load64(&gcController.assistBytesPerWork))
+ assistBytesPerWork := gcController.assistBytesPerWork.Load()
scanBytes := int64(float64(scanWork) * assistBytesPerWork)
lock(&work.assistQueue.lock)
if scanBytes > 0 {
// Convert from scan bytes back to work.
- assistWorkPerByte := float64frombits(atomic.Load64(&gcController.assistWorkPerByte))
+ assistWorkPerByte := gcController.assistWorkPerByte.Load()
scanWork = int64(float64(scanBytes) * assistWorkPerByte)
- atomic.Xaddint64(&gcController.bgScanCredit, scanWork)
+ gcController.bgScanCredit.Add(scanWork)
}
unlock(&work.assistQueue.lock)
}
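// A worked sketch of gcFlushBgCredit's two paths, assuming
// assistBytesPerWork = 2: flushing scanWork = 1000 with no parked assists
// simply adds 1000 to bgScanCredit. If parked assists are owed 1200 bytes,
// the 2000-byte equivalent satisfies them first and the remaining 800 bytes
// convert back to int64(800*0.5) = 400 units of background credit.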
// scanstack scans gp's stack, greying all pointers found on the stack.
//
+// Returns the amount of scan work performed, but doesn't update
+// gcController.stackScanWork or flush any credit. Any background credit produced
+// by this function should be flushed by its caller. scanstack itself can't
+// safely flush because it may result in trying to wake up a goroutine that
+// was just scanned, resulting in a self-deadlock.
+//
// scanstack will also shrink the stack if it is safe to do so. If it
// is not, it schedules a stack shrink for the next synchronous safe
// point.
//
//go:nowritebarrier
//go:systemstack
-func scanstack(gp *g, gcw *gcWork) {
+func scanstack(gp *g, gcw *gcWork) int64 {
if readgstatus(gp)&_Gscan == 0 {
print("runtime:scanstack: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", hex(readgstatus(gp)), "\n")
throw("scanstack - bad status")
print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
throw("mark - bad status")
case _Gdead:
- return
+ return 0
case _Grunning:
print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
throw("scanstack: goroutine not stopped")
throw("can't scan our own stack")
}
+ // scannedSize is the amount of work we'll be reporting.
+ //
+ // It is less than the allocated size (which is hi-lo).
+ var sp uintptr
+ if gp.syscallsp != 0 {
+ sp = gp.syscallsp // If in a system call this is the stack pointer (gp.sched.sp can be 0 in this case on Windows).
+ } else {
+ sp = gp.sched.sp
+ }
+ scannedSize := gp.stack.hi - sp
+
+ // Keep statistics for initial stack size calculation.
+ // Note that this accumulates the scanned size, not the allocated size.
+ p := getg().m.p.ptr()
+ p.scannedStackSize += uint64(scannedSize)
+ p.scannedStacks++
+
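// A picture of the accounting above: stacks grow down, so the live
// region runs from sp up to stack.hi and scannedSize never exceeds the
// allocation.
//
//	stack.lo            sp                stack.hi
//	   |---- unused -----|------ live ------->|
//	   allocated = hi - lo; scanned = hi - sp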
if isShrinkStackSafe(gp) {
// Shrink the stack if not much of it is being used.
shrinkstack(gp)
}
// Scan the stack. Accumulate a list of stack objects.
- scanframe := func(frame *stkframe, unused unsafe.Pointer) bool {
- scanframeworker(frame, &state, gcw)
- return true
+ var u unwinder
+ for u.init(gp, 0); u.valid(); u.next() {
+ scanframeworker(&u.frame, &state, gcw)
}
- gentraceback(^uintptr(0), ^uintptr(0), 0, gp, 0, nil, 0x7fffffff, scanframe, nil, 0)
// Find additional pointers that point into the stack from the heap.
// Currently this includes defers and panics. See also function copystack.
println()
printunlock()
}
- gcdata := r.gcdata
+ gcdata := r.gcdata()
var s *mspan
if r.useGCProg() {
// This path is pretty unlikely, an object large enough
if state.buf != nil || state.cbuf != nil || state.freeBuf != nil {
throw("remaining pointer buffers")
}
+ return int64(scannedSize)
}
// Scan a stack frame: local variables and function arguments/results.
+//
//go:nowritebarrier
func scanframeworker(frame *stkframe, state *stackScanState, gcw *gcWork) {
if _DebugGC > 1 && frame.continpc != 0 {
print("scanframe ", funcname(frame.fn), "\n")
}
- isAsyncPreempt := frame.fn.valid() && frame.fn.funcID == funcID_asyncPreempt
- isDebugCall := frame.fn.valid() && frame.fn.funcID == funcID_debugCallV2
+ isAsyncPreempt := frame.fn.valid() && frame.fn.funcID == abi.FuncID_asyncPreempt
+ isDebugCall := frame.fn.valid() && frame.fn.funcID == abi.FuncID_debugCallV2
if state.conservative || isAsyncPreempt || isDebugCall {
if debugScanConservative {
println("conservatively scanning function", funcname(frame.fn), "at PC", hex(frame.continpc))
}
// Scan arguments to this frame.
- if frame.arglen != 0 {
+ if n := frame.argBytes(); n != 0 {
// TODO: We could pass the entry argument map
// to narrow this down further.
- scanConservative(frame.argp, frame.arglen, nil, gcw, state)
+ scanConservative(frame.argp, n, nil, gcw, state)
}
if isAsyncPreempt || isDebugCall {
return
}
- locals, args, objs := getStackMap(frame, &state.cache, false)
+ locals, args, objs := frame.getStackMap(false)
// Scan local variables if stack frame has been allocated.
if locals.n > 0 {
// varp is 0 for defers, where there are no locals.
// In that case, there can't be a pointer to its args, either.
// (And all args would be scanned above anyway.)
- for i, obj := range objs {
+ for i := range objs {
+ obj := &objs[i]
off := obj.off
base := frame.varp // locals base pointer
if off >= 0 {
if stackTraceDebug {
println("stkobj at", hex(ptr), "of size", obj.size)
}
- state.addObject(ptr, &objs[i])
+ state.addObject(ptr, obj)
}
}
}
gcDrainFractional
)
+// gcDrainMarkWorkerIdle is a wrapper for gcDrain that exists to better account
+// mark time in profiles.
+func gcDrainMarkWorkerIdle(gcw *gcWork) {
+ gcDrain(gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit)
+}
+
+// gcDrainMarkWorkerDedicated is a wrapper for gcDrain that exists to better account
+// mark time in profiles.
+func gcDrainMarkWorkerDedicated(gcw *gcWork, untilPreempt bool) {
+ flags := gcDrainFlushBgCredit
+ if untilPreempt {
+ flags |= gcDrainUntilPreempt
+ }
+ gcDrain(gcw, flags)
+}
+
+// gcDrainMarkWorkerFractional is a wrapper for gcDrain that exists to better account
+// mark time in profiles.
+func gcDrainMarkWorkerFractional(gcw *gcWork) {
+ gcDrain(gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit)
+}
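// A sketch of the dispatch these wrappers serve, as gcBgMarkWorker might
// issue it (field and mode names per the scheduler's worker modes; the
// dedicated worker first drains preemptibly, then commits):
//
//	switch pp.gcMarkWorkerMode {
//	case gcMarkWorkerDedicatedMode:
//		gcDrainMarkWorkerDedicated(&pp.gcw, true)
//		// ...requeue any preempting Gs, then run to completion:
//		gcDrainMarkWorkerDedicated(&pp.gcw, false)
//	case gcMarkWorkerFractionalMode:
//		gcDrainMarkWorkerFractional(&pp.gcw)
//	case gcMarkWorkerIdleMode:
//		gcDrainMarkWorkerIdle(&pp.gcw)
//	}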
+
// gcDrain scans roots and objects in work buffers, blackening grey
// objects until it is unable to get more work. It may return before
// GC is done; it's the caller's responsibility to balance work from
//
// gcDrain will always return if there is a pending STW.
//
+// Disabling write barriers is necessary to ensure that after we've
+// confirmed that we've drained gcw, that we don't accidentally end
+// up flipping that condition by immediately adding work in the form
+// of a write barrier buffer flush.
+//
+// Don't set nowritebarrierrec because it's safe for some callees to
+// have write barriers enabled.
+//
//go:nowritebarrier
func gcDrain(gcw *gcWork, flags gcDrainFlags) {
- if !writeBarrier.needed {
+ if !writeBarrier.enabled {
throw("gcDrain phase incorrect")
}
flushBgCredit := flags&gcDrainFlushBgCredit != 0
idle := flags&gcDrainIdle != 0
- initScanWork := gcw.scanWork
+ initScanWork := gcw.heapScanWork
// checkWork is the scan work before performing the next
// self-preempt check.
// Drain root marking jobs.
if work.markrootNext < work.markrootJobs {
// Stop if we're preemptible or if someone wants to STW.
- for !(gp.preempt && (preemptible || atomic.Load(&sched.gcwaiting) != 0)) {
+ for !(gp.preempt && (preemptible || sched.gcwaiting.Load())) {
job := atomic.Xadd(&work.markrootNext, +1) - 1
if job >= work.markrootJobs {
break
}
- markroot(gcw, job)
+ markroot(gcw, job, flushBgCredit)
if check != nil && check() {
goto done
}
// Drain heap marking jobs.
// Stop if we're preemptible or if someone wants to STW.
- for !(gp.preempt && (preemptible || atomic.Load(&sched.gcwaiting) != 0)) {
+ for !(gp.preempt && (preemptible || sched.gcwaiting.Load())) {
// Try to keep work available on the global queue. We used to
// check if there were waiting workers, but it's better to
// just keep work available than to make workers wait. In the
// Flush the write barrier
// buffer; this may create
// more work.
- wbBufFlush(nil, 0)
+ wbBufFlush()
b = gcw.tryGet()
}
}
// Flush background scan work credit to the global
// account if we've accumulated enough locally so
// mutator assists can draw on it.
- if gcw.scanWork >= gcCreditSlack {
- atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
+ if gcw.heapScanWork >= gcCreditSlack {
+ gcController.heapScanWork.Add(gcw.heapScanWork)
if flushBgCredit {
- gcFlushBgCredit(gcw.scanWork - initScanWork)
+ gcFlushBgCredit(gcw.heapScanWork - initScanWork)
initScanWork = 0
}
- checkWork -= gcw.scanWork
- gcw.scanWork = 0
+ checkWork -= gcw.heapScanWork
+ gcw.heapScanWork = 0
if checkWork <= 0 {
checkWork += drainCheckThreshold
done:
// Flush remaining scan work credit.
- if gcw.scanWork > 0 {
- atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
+ if gcw.heapScanWork > 0 {
+ gcController.heapScanWork.Add(gcw.heapScanWork)
if flushBgCredit {
- gcFlushBgCredit(gcw.scanWork - initScanWork)
+ gcFlushBgCredit(gcw.heapScanWork - initScanWork)
}
- gcw.scanWork = 0
+ gcw.heapScanWork = 0
}
}
//go:nowritebarrier
//go:systemstack
func gcDrainN(gcw *gcWork, scanWork int64) int64 {
- if !writeBarrier.needed {
+ if !writeBarrier.enabled {
throw("gcDrainN phase incorrect")
}
// There may already be scan work on the gcw, which we don't
// want to claim was done by this call.
- workFlushed := -gcw.scanWork
+ workFlushed := -gcw.heapScanWork
+ // In addition to backing out because of a preemption, back out
+ // if the GC CPU limiter is enabled.
gp := getg().m.curg
- for !gp.preempt && workFlushed+gcw.scanWork < scanWork {
+ for !gp.preempt && !gcCPULimiter.limiting() && workFlushed+gcw.heapScanWork < scanWork {
// See gcDrain comment.
if work.full == 0 {
gcw.balance()
}
- // This might be a good place to add prefetch code...
- // if(wbuf.nobj > 4) {
- // PREFETCH(wbuf->obj[wbuf.nobj - 3];
- // }
- //
b := gcw.tryGetFast()
if b == 0 {
b = gcw.tryGet()
if b == 0 {
// Flush the write barrier buffer;
// this may create more work.
- wbBufFlush(nil, 0)
+ wbBufFlush()
b = gcw.tryGet()
}
}
if b == 0 {
// Try to do a root job.
- //
- // TODO: Assists should get credit for this
- // work.
if work.markrootNext < work.markrootJobs {
job := atomic.Xadd(&work.markrootNext, +1) - 1
if job < work.markrootJobs {
- markroot(gcw, job)
+ workFlushed += markroot(gcw, job, false)
continue
}
}
// No heap or root jobs.
break
}
+
scanobject(b, gcw)
// Flush background scan work credit.
- if gcw.scanWork >= gcCreditSlack {
- atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
- workFlushed += gcw.scanWork
- gcw.scanWork = 0
+ if gcw.heapScanWork >= gcCreditSlack {
+ gcController.heapScanWork.Add(gcw.heapScanWork)
+ workFlushed += gcw.heapScanWork
+ gcw.heapScanWork = 0
}
}
// Unlike gcDrain, there's no need to flush remaining work
// here because this never flushes to bgScanCredit and
// gcw.dispose will flush any remaining work to scanWork.
- return workFlushed + gcw.scanWork
+ return workFlushed + gcw.heapScanWork
}
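// Note the workFlushed = -gcw.heapScanWork initialization above: at every
// point in the loop, work done by this call equals
//
//	workFlushed + gcw.heapScanWork
//
// since pre-existing local work was subtracted up front and each flush
// moves gcw.heapScanWork into workFlushed. That is also why the final
// return adds the two together.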
// scanblock scans b as scanobject would, but using an explicit
// pointer bitmap instead of the heap bitmap.
//
// This is used to scan non-heap roots, so it does not update
-// gcw.bytesMarked or gcw.scanWork.
+// gcw.bytesMarked or gcw.heapScanWork.
//
// If stk != nil, possible stack pointers are also reported to stk.putPtr.
+//
//go:nowritebarrier
func scanblock(b0, n0 uintptr, ptrmask *uint8, gcw *gcWork, stk *stackScanState) {
// Use local copies of original parameters, so that a stack trace
//
//go:nowritebarrier
func scanobject(b uintptr, gcw *gcWork) {
+ // Prefetch object before we scan it.
+ //
+ // This will overlap fetching the beginning of the object with initial
+ // setup before we start scanning the object.
+ sys.Prefetch(b)
+
// Find the bits for b and the size of the object at b.
//
// b is either the beginning of an object, in which case this
// is the size of the object to scan, or it points to an
// oblet, in which case we compute the size to scan below.
- hbits := heapBitsForAddr(b)
s := spanOfUnchecked(b)
n := s.elemsize
if n == 0 {
throw("scanobject n == 0")
}
+ if s.spanclass.noscan() {
+ // Correctness-wise this is ok, but it's inefficient
+ // if noscan objects reach here.
+ throw("scanobject of a noscan object")
+ }
if n > maxObletBytes {
// Large object. Break into oblets for better
// parallelism and lower latency.
if b == s.base() {
- // It's possible this is a noscan object (not
- // from greyobject, but from other code
- // paths), in which case we must *not* enqueue
- // oblets since their bitmaps will be
- // uninitialized.
- if s.spanclass.noscan() {
- // Bypass the whole scan.
- gcw.bytesMarked += uint64(n)
- return
- }
-
// Enqueue the other oblets to scan later.
// Some oblets may be in b's scalar tail, but
// these will be marked as "no more pointers",
// must be a large object, s.base() is the beginning
// of the object.
n = s.base() + s.elemsize - b
- if n > maxObletBytes {
- n = maxObletBytes
- }
+ n = min(n, maxObletBytes)
}
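// A sketch of the oblet enqueue described in the comment above: step
// maxObletBytes at a time from b and queue each start address, letting the
// per-oblet size clamp handle the object's tail.
//
//	for oblet := b + maxObletBytes; oblet < s.base()+s.elemsize; oblet += maxObletBytes {
//		if !gcw.putFast(oblet) {
//			gcw.put(oblet)
//		}
//	}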
- var i uintptr
- for i = 0; i < n; i, hbits = i+goarch.PtrSize, hbits.next() {
- // Load bits once. See CL 22712 and issue 16973 for discussion.
- bits := hbits.bits()
- if bits&bitScan == 0 {
- break // no more pointers in this object
- }
- if bits&bitPointer == 0 {
- continue // not a pointer
+ hbits := heapBitsForAddr(b, n)
+ var scanSize uintptr
+ for {
+ var addr uintptr
+ if hbits, addr = hbits.nextFast(); addr == 0 {
+ if hbits, addr = hbits.next(); addr == 0 {
+ break
+ }
}
+ // Keep track of farthest pointer we found, so we can
+ // update heapScanWork. TODO: is there a better metric,
+ // now that we can skip scalar portions pretty efficiently?
+ scanSize = addr - b + goarch.PtrSize
+
// Work here is duplicated in scanblock and above.
// If you make changes here, make changes there too.
- obj := *(*uintptr)(unsafe.Pointer(b + i))
+ obj := *(*uintptr)(unsafe.Pointer(addr))
// At this point we have extracted the next potential pointer.
// Quickly filter out nil and pointers back to the current object.
// heap. In this case, we know the object was
// just allocated and hence will be marked by
// allocation itself.
- if obj, span, objIndex := findObject(obj, b, i); obj != 0 {
- greyobject(obj, b, i, span, gcw, objIndex)
+ if obj, span, objIndex := findObject(obj, b, addr-b); obj != 0 {
+ greyobject(obj, b, addr-b, span, gcw, objIndex)
}
}
}
gcw.bytesMarked += uint64(n)
- gcw.scanWork += int64(i)
+ gcw.heapScanWork += int64(scanSize)
}
// scanConservative scans block [b, b+n) conservatively, treating any
// Shade the object if it isn't already.
// The object is not nil and known to be in the heap.
// Preemption must be disabled.
+//
//go:nowritebarrier
func shade(b uintptr) {
if obj, span, objIndex := findObject(b, 0, 0); obj != 0 {
}
}
- // Queue the obj for scanning. The PREFETCH(obj) logic has been removed but
- // seems like a nice optimization that can be added back in.
- // There needs to be time between the PREFETCH and the use.
- // Previously we put the obj in an 8 element buffer that is drained at a rate
- // to give the PREFETCH time to do its work.
- // Use of PREFETCHNTA might be more appropriate than PREFETCH
+ // We're adding obj to P's local workbuf, so it's likely
+ // this object will be processed soon by the same P.
+ // Even if the workbuf gets flushed, there will likely still be
+ // some benefit on platforms with inclusive shared caches.
+ sys.Prefetch(obj)
+ // Queue the obj for scanning.
if !gcw.putFast(obj) {
gcw.put(obj)
}
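// putFast touches only the P-local work buffer; put falls back to the
// slower path that may allocate a buffer or publish one to the global
// list. The consume side in gcDrain mirrors the same two-level pattern:
//
//	b := gcw.tryGetFast() // local buffers only
//	if b == 0 {
//		b = gcw.tryGet() // may steal from the global full list
//	}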
//
//go:nowritebarrier
//go:nosplit
-func gcmarknewobject(span *mspan, obj, size, scanSize uintptr) {
+func gcmarknewobject(span *mspan, obj, size uintptr) {
if useCheckmark { // The world should be stopped so this should not happen.
throw("gcmarknewobject called while doing checkmark")
}
gcw := &getg().m.p.ptr().gcw
gcw.bytesMarked += uint64(size)
- gcw.scanWork += int64(scanSize)
}
// gcMarkTinyAllocs greys all active tiny alloc blocks.