runtime: refactor runtime->tracer API to appear more like a lock
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 9cad2161b55cf15896d879e80b2e6421b8929b67..ae2562a5b76caa70e5a6ca3b71eb53d1909f3c4b 100644
@@ -73,7 +73,7 @@ var modinfo string
 // If there is at least one spinning thread (sched.nmspinning>1), we don't
 // unpark new threads when submitting work. To compensate for that, if the last
 // spinning thread finds work and stops spinning, it must unpark a new spinning
-// thread.  This approach smooths out unjustified spikes of thread unparking,
+// thread. This approach smooths out unjustified spikes of thread unparking,
 // but at the same time guarantees eventual maximal CPU parallelism
 // utilization.
 //
@@ -84,7 +84,7 @@ var modinfo string
 // semi-persistent CPU underutilization.
 //
 // The general pattern for submission is:
-// 1. Submit work to the local run queue, timer heap, or GC state.
+// 1. Submit work to the local or global run queue, timer heap, or GC state.
 // 2. #StoreLoad-style memory barrier.
 // 3. Check sched.nmspinning.
 //
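
A rough, hypothetical model of the pattern above, written as ordinary Go (submit, consumer, work, wakeup, and nmspinning here are stand-ins, not runtime APIs): the producer publishes work and wakes a consumer only if nobody is spinning, and a consumer that drops out of spinning re-checks the queue before parking so that a racing submission is never lost.

    package main

    import (
    	"fmt"
    	"sync/atomic"
    )

    var (
    	nmspinning atomic.Int32         // consumers currently spinning for work
    	work       = make(chan int, 64) // stands in for the run queues
    	wakeup     = make(chan struct{}, 1)
    )

    func submit(v int) {
    	work <- v // 1. submit work to the (model) run queue
    	// 2. the channel operations above provide the ordering the real
    	//    runtime gets from an explicit #StoreLoad-style barrier.
    	if nmspinning.Load() == 0 { // 3. check for spinning consumers
    		select {
    		case wakeup <- struct{}{}: // unpark one consumer
    		default: // a wakeup is already pending
    		}
    	}
    }

    func consumer(done chan<- struct{}) {
    	for {
    		nmspinning.Add(1) // spinning: poll for work before parking
    		var v int
    		ok := false
    		select {
    		case v = <-work:
    			ok = true
    		default:
    		}
    		nmspinning.Add(-1)
    		if !ok {
    			// Park, but keep watching the queue: a submission that
    			// raced with the decrement above must not be lost.
    			select {
    			case v = <-work:
    			case <-wakeup:
    				continue
    			}
    		}
    		if v < 0 { // sentinel: stop
    			done <- struct{}{}
    			return
    		}
    		fmt.Println("got work item", v)
    	}
    }

    func main() {
    	done := make(chan struct{})
    	go consumer(done)
    	for i := 0; i < 3; i++ {
    		submit(i)
    	}
    	submit(-1)
    	<-done
    }
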
@@ -115,13 +115,12 @@ var (
        g0           g
        mcache0      *mcache
        raceprocctx0 uintptr
+       raceFiniLock mutex
 )
 
-//go:linkname runtime_inittask runtime..inittask
-var runtime_inittask initTask
-
-//go:linkname main_inittask main..inittask
-var main_inittask initTask
+// This slice records the initializing tasks that need to be
+// done to start up the runtime. It is built by the linker.
+var runtime_inittasks []*initTask
 
 // main_init_done is a signal used by cgocallbackg that initialization
 // has been completed. It is made before _cgo_notify_runtime_init_done,
@@ -196,7 +195,7 @@ func main() {
                inittrace.active = true
        }
 
-       doInit(&runtime_inittask) // Must be before defer.
+       doInit(runtime_inittasks) // Must be before defer.
 
        // Defer unlock so that runtime.Goexit during init does the unlock too.
        needUnlock := true
@@ -210,6 +209,10 @@ func main() {
 
        main_init_done = make(chan bool)
        if iscgo {
+               if _cgo_pthread_key_created == nil {
+                       throw("_cgo_pthread_key_created missing")
+               }
+
                if _cgo_thread_start == nil {
                        throw("_cgo_thread_start missing")
                }
@@ -224,13 +227,29 @@ func main() {
                if _cgo_notify_runtime_init_done == nil {
                        throw("_cgo_notify_runtime_init_done missing")
                }
+
+               // Set the x_crosscall2_ptr C function pointer variable to point to crosscall2.
+               if set_crosscall2 == nil {
+                       throw("set_crosscall2 missing")
+               }
+               set_crosscall2()
+
                // Start the template thread in case we enter Go from
                // a C-created thread and need to create a new thread.
                startTemplateThread()
                cgocall(_cgo_notify_runtime_init_done, nil)
        }
 
-       doInit(&main_inittask)
+       // Run the initializing tasks. Depending on build mode this
+       // list can arrive a few different ways, but it will always
+       // contain the init tasks computed by the linker for all the
+       // packages in the program (excluding those added at runtime
+       // by package plugin). Run through the modules in dependency
+       // order (the order they are initialized by the dynamic
+       // loader, i.e. they are added to the moduledata linked list).
+       for m := &firstmoduledata; m != nil; m = m.next {
+               doInit(m.inittasks)
+       }
 
        // Disable init tracing after main init done to avoid overhead
        // of collecting statistics in malloc and newproc
@@ -249,6 +268,7 @@ func main() {
        fn := main_main // make an indirect call, as the linker doesn't know the address of the main package when laying down the runtime
        fn()
        if raceenabled {
+               runExitHooks(0) // run hooks now, since racefini does not return
                racefini()
        }
 
@@ -266,8 +286,9 @@ func main() {
                }
        }
        if panicking.Load() != 0 {
-               gopark(nil, nil, waitReasonPanicWait, traceEvGoStop, 1)
+               gopark(nil, nil, waitReasonPanicWait, traceBlockForever, 1)
        }
+       runExitHooks(0)
 
        exit(0)
        for {
@@ -279,8 +300,9 @@ func main() {
 // os_beforeExit is called from os.Exit(0).
 //
 //go:linkname os_beforeExit os.runtime_beforeExit
-func os_beforeExit() {
-       if raceenabled {
+func os_beforeExit(exitCode int) {
+       runExitHooks(exitCode)
+       if exitCode == 0 && raceenabled {
                racefini()
        }
 }
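
runExitHooks drains a hook list that is not shown in this hunk. The sketch below is only a guess at its shape (addExitHook, the runOnNonZeroExit flag, and the LIFO ordering are assumptions, not the runtime's actual implementation); it illustrates why both main and os_beforeExit now pass an exit code through.

    package main

    import "fmt"

    type exitHook struct {
    	f                func()
    	runOnNonZeroExit bool // also run when exiting with a failure code?
    }

    var exitHooks []exitHook

    func addExitHook(f func(), runOnNonZeroExit bool) {
    	exitHooks = append(exitHooks, exitHook{f, runOnNonZeroExit})
    }

    // runExitHooks runs the registered hooks (most recently added first in
    // this sketch), skipping hooks that opted out of non-zero exits.
    func runExitHooks(exitCode int) {
    	for i := len(exitHooks) - 1; i >= 0; i-- {
    		h := exitHooks[i]
    		if exitCode == 0 || h.runOnNonZeroExit {
    			h.f()
    		}
    	}
    	exitHooks = nil
    }

    func main() {
    	addExitHook(func() { fmt.Println("flush coverage counters") }, true)
    	addExitHook(func() { fmt.Println("report godebug counters") }, false)
    	runExitHooks(0) // as runtime.main does above before exit(0)
    }
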
@@ -295,11 +317,11 @@ func forcegchelper() {
        lockInit(&forcegc.lock, lockRankForcegc)
        for {
                lock(&forcegc.lock)
-               if forcegc.idle != 0 {
+               if forcegc.idle.Load() {
                        throw("forcegc: phase error")
                }
-               atomic.Store(&forcegc.idle, 1)
-               goparkunlock(&forcegc.lock, waitReasonForceGCIdle, traceEvGoBlock, 1)
+               forcegc.idle.Store(true)
+               goparkunlock(&forcegc.lock, waitReasonForceGCIdle, traceBlockSystemGoroutine, 1)
                // this goroutine is explicitly resumed by sysmon
                if debug.gctrace > 0 {
                        println("GC forced")
@@ -309,10 +331,10 @@ func forcegchelper() {
        }
 }
 
-//go:nosplit
-
 // Gosched yields the processor, allowing other goroutines to run. It does not
 // suspend the current goroutine, so execution resumes automatically.
+//
+//go:nosplit
 func Gosched() {
        checkTimeouts()
        mcall(gosched_m)
@@ -326,6 +348,21 @@ func goschedguarded() {
        mcall(goschedguarded_m)
 }
 
+// goschedIfBusy yields the processor like gosched, but only does so if
+// there are no idle Ps or if we're on the only P and there's nothing in
+// the run queue. In both cases, there is freely available idle time.
+//
+//go:nosplit
+func goschedIfBusy() {
+       gp := getg()
+       // Call gosched if gp.preempt is set; we may be in a tight loop that
+       // doesn't otherwise yield.
+       if !gp.preempt && sched.npidle.Load() > 0 {
+               return
+       }
+       mcall(gosched_m)
+}
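
goschedIfBusy is internal to the runtime, but the exported primitive it builds on is runtime.Gosched. A minimal example of yielding cooperatively from a tight loop:

    package main

    import (
    	"fmt"
    	"runtime"
    )

    func main() {
    	runtime.GOMAXPROCS(1) // force both goroutines onto a single P
    	done := make(chan struct{})
    	go func() {
    		fmt.Println("background goroutine ran")
    		close(done)
    	}()
    	for i := 0; ; i++ {
    		select {
    		case <-done:
    			fmt.Println("yielded", i, "times before the other goroutine finished")
    			return
    		default:
    			// Without a yield this loop would rely on asynchronous
    			// preemption; Gosched gives the scheduler an explicit
    			// opportunity, much like mcall(gosched_m) above.
    			runtime.Gosched()
    		}
    	}
    }
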
+
 // Puts the current goroutine into a waiting state and calls unlockf on the
 // system stack.
 //
@@ -343,7 +380,7 @@ func goschedguarded() {
 // Reason explains why the goroutine has been parked. It is displayed in stack
 // traces and heap dumps. Reasons should be unique and descriptive. Do not
 // re-use reasons, add new ones.
-func gopark(unlockf func(*g, unsafe.Pointer) bool, lock unsafe.Pointer, reason waitReason, traceEv byte, traceskip int) {
+func gopark(unlockf func(*g, unsafe.Pointer) bool, lock unsafe.Pointer, reason waitReason, traceReason traceBlockReason, traceskip int) {
        if reason != waitReasonSleep {
                checkTimeouts() // timeouts may expire while two goroutines keep the scheduler busy
        }
@@ -356,8 +393,8 @@ func gopark(unlockf func(*g, unsafe.Pointer) bool, lock unsafe.Pointer, reason w
        mp.waitlock = lock
        mp.waitunlockf = unlockf
        gp.waitreason = reason
-       mp.waittraceev = traceEv
-       mp.waittraceskip = traceskip
+       mp.waitTraceBlockReason = traceReason
+       mp.waitTraceSkip = traceskip
        releasem(mp)
        // can't do anything that might move the G between Ms here.
        mcall(park_m)
@@ -365,8 +402,8 @@ func gopark(unlockf func(*g, unsafe.Pointer) bool, lock unsafe.Pointer, reason w
 
 // Puts the current goroutine into a waiting state and unlocks the lock.
 // The goroutine can be made runnable again by calling goready(gp).
-func goparkunlock(lock *mutex, reason waitReason, traceEv byte, traceskip int) {
-       gopark(parkunlock_c, unsafe.Pointer(lock), reason, traceEv, traceskip)
+func goparkunlock(lock *mutex, reason waitReason, traceReason traceBlockReason, traceskip int) {
+       gopark(parkunlock_c, unsafe.Pointer(lock), reason, traceReason, traceskip)
 }
 
 func goready(gp *g, traceskip int) {
@@ -463,7 +500,7 @@ func releaseSudog(s *sudog) {
        releasem(mp)
 }
 
-// called from assembly
+// called from assembly.
 func badmcall(fn func(*g)) {
        throw("runtime: mcall called on m->g0 stack")
 }
@@ -476,22 +513,29 @@ func badreflectcall() {
        panic(plainError("arg size to reflect.call more than 1GB"))
 }
 
-var badmorestackg0Msg = "fatal: morestack on g0\n"
-
 //go:nosplit
 //go:nowritebarrierrec
 func badmorestackg0() {
-       sp := stringStructOf(&badmorestackg0Msg)
-       write(2, sp.str, int32(sp.len))
-}
+       if !crashStackImplemented {
+               writeErrStr("fatal: morestack on g0\n")
+               return
+       }
+
+       g := getg()
+       switchToCrashStack(func() {
+               print("runtime: morestack on g0, stack [", hex(g.stack.lo), " ", hex(g.stack.hi), "], sp=", hex(g.sched.sp), ", called from\n")
+               g.m.traceback = 2 // include pc and sp in stack trace
+               traceback1(g.sched.pc, g.sched.sp, g.sched.lr, g, 0)
+               print("\n")
 
-var badmorestackgsignalMsg = "fatal: morestack on gsignal\n"
+               throw("morestack on g0")
+       })
+}
 
 //go:nosplit
 //go:nowritebarrierrec
 func badmorestackgsignal() {
-       sp := stringStructOf(&badmorestackgsignalMsg)
-       write(2, sp.str, int32(sp.len))
+       writeErrStr("fatal: morestack on gsignal\n")
 }
 
 //go:nosplit
@@ -499,6 +543,42 @@ func badctxt() {
        throw("ctxt != 0")
 }
 
+// gcrash is a fake g that can be used when crashing due to bad
+// stack conditions.
+var gcrash g
+
+var crashingG atomic.Pointer[g]
+
+// Switch to crashstack and call fn, with special handling of
+// concurrent and recursive cases.
+//
+// Nosplit as it is called in a bad stack condition (we know
+// morestack would fail).
+//
+//go:nosplit
+//go:nowritebarrierrec
+func switchToCrashStack(fn func()) {
+       me := getg()
+       if crashingG.CompareAndSwapNoWB(nil, me) {
+               switchToCrashStack0(fn) // should never return
+               abort()
+       }
+       if crashingG.Load() == me {
+               // recursive crashing. too bad.
+               writeErrStr("fatal: recursive switchToCrashStack\n")
+               abort()
+       }
+       // Another g is crashing. Give it some time, hopefully it will finish traceback.
+       usleep_no_g(100)
+       writeErrStr("fatal: concurrent switchToCrashStack\n")
+       abort()
+}
+
+const crashStackImplemented = GOARCH == "amd64" || GOARCH == "arm64" || GOARCH == "mips64" || GOARCH == "mips64le" || GOARCH == "riscv64"
+
+//go:noescape
+func switchToCrashStack0(fn func()) // in assembly
+
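
The crashingG handshake above is a "first crasher wins" compare-and-swap. A standalone sketch of the same idea using sync/atomic (the crasher type and reportCrash are made-up names, and time.Sleep stands in for usleep_no_g):

    package main

    import (
    	"fmt"
    	"sync"
    	"sync/atomic"
    	"time"
    )

    type crasher struct{ id int }

    var crashing atomic.Pointer[crasher]

    // reportCrash lets the first caller do the expensive reporting; a caller
    // that finds its own record already installed knows it is crashing
    // recursively, and everyone else backs off after a short wait.
    func reportCrash(me *crasher) {
    	if crashing.CompareAndSwap(nil, me) {
    		fmt.Println("crasher", me.id, "wins and writes the traceback")
    		return
    	}
    	if crashing.Load() == me {
    		fmt.Println("crasher", me.id, "is crashing recursively, bail out")
    		return
    	}
    	time.Sleep(100 * time.Millisecond) // give the winner time to finish
    	fmt.Println("crasher", me.id, "saw a concurrent crash, aborting")
    }

    func main() {
    	var wg sync.WaitGroup
    	for i := 0; i < 3; i++ {
    		wg.Add(1)
    		go func(i int) {
    			defer wg.Done()
    			reportCrash(&crasher{id: i})
    		}(i)
    	}
    	wg.Wait()
    }
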
 func lockedOSThread() bool {
        gp := getg()
        return gp.lockedm != 0 && gp.m.lockedg != 0
@@ -600,16 +680,39 @@ const (
        _GoidCacheBatch = 16
 )
 
-// cpuinit extracts the environment variable GODEBUG from the environment on
-// Unix-like operating systems and calls internal/cpu.Initialize.
-func cpuinit() {
-       const prefix = "GODEBUG="
-       var env string
-
+// cpuinit sets up CPU feature flags and calls internal/cpu.Initialize. env should be the complete
+// value of the GODEBUG environment variable.
+func cpuinit(env string) {
        switch GOOS {
        case "aix", "darwin", "ios", "dragonfly", "freebsd", "netbsd", "openbsd", "illumos", "solaris", "linux":
                cpu.DebugOptions = true
+       }
+       cpu.Initialize(env)
 
+       // Support cpu feature variables are used in code generated by the compiler
+       // to guard execution of instructions that can not be assumed to be always supported.
+       switch GOARCH {
+       case "386", "amd64":
+               x86HasPOPCNT = cpu.X86.HasPOPCNT
+               x86HasSSE41 = cpu.X86.HasSSE41
+               x86HasFMA = cpu.X86.HasFMA
+
+       case "arm":
+               armHasVFPv4 = cpu.ARM.HasVFPv4
+
+       case "arm64":
+               arm64HasATOMICS = cpu.ARM64.HasATOMICS
+       }
+}
+
+// getGodebugEarly extracts the environment variable GODEBUG from the environment on
+// Unix-like operating systems and returns it. This function exists to extract GODEBUG
+// early before much of the runtime is initialized.
+func getGodebugEarly() string {
+       const prefix = "GODEBUG="
+       var env string
+       switch GOOS {
+       case "aix", "darwin", "ios", "dragonfly", "freebsd", "netbsd", "openbsd", "illumos", "solaris", "linux":
                // Similar to goenv_unix but extracts the environment value for
                // GODEBUG directly.
                // TODO(moehrmann): remove when general goenvs() can be called before cpuinit()
@@ -620,7 +723,7 @@ func cpuinit() {
 
                for i := int32(0); i < n; i++ {
                        p := argv_index(argv, argc+1+i)
-                       s := *(*string)(unsafe.Pointer(&stringStruct{unsafe.Pointer(p), findnull(p)}))
+                       s := unsafe.String(p, findnull(p))
 
                        if hasPrefix(s, prefix) {
                                env = gostring(p)[len(prefix):]
@@ -628,23 +731,7 @@ func cpuinit() {
                        }
                }
        }
-
-       cpu.Initialize(env)
-
-       // Support cpu feature variables are used in code generated by the compiler
-       // to guard execution of instructions that can not be assumed to be always supported.
-       switch GOARCH {
-       case "386", "amd64":
-               x86HasPOPCNT = cpu.X86.HasPOPCNT
-               x86HasSSE41 = cpu.X86.HasSSE41
-               x86HasFMA = cpu.X86.HasFMA
-
-       case "arm":
-               armHasVFPv4 = cpu.ARM.HasVFPv4
-
-       case "arm64":
-               arm64HasATOMICS = cpu.ARM64.HasATOMICS
-       }
+       return env
 }
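
getGodebugEarly has to run before goenvs, so it walks the raw argv/environ block directly. A user-level analogue of the same scan, with os.Environ standing in for that block:

    package main

    import (
    	"fmt"
    	"os"
    	"strings"
    )

    // godebugFromEnviron finds the GODEBUG= entry in an environment list,
    // mirroring the prefix scan in getGodebugEarly.
    func godebugFromEnviron(environ []string) string {
    	const prefix = "GODEBUG="
    	for _, kv := range environ {
    		if strings.HasPrefix(kv, prefix) {
    			return kv[len(prefix):]
    		}
    	}
    	return ""
    }

    func main() {
    	fmt.Println("GODEBUG =", godebugFromEnviron(os.Environ()))
    }
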
 
 // The bootstrap sequence is:
@@ -666,11 +753,8 @@ func schedinit() {
        lockInit(&allpLock, lockRankAllp)
        lockInit(&reflectOffs.lock, lockRankReflectOffs)
        lockInit(&finlock, lockRankFin)
-       lockInit(&trace.bufLock, lockRankTraceBuf)
-       lockInit(&trace.stringsLock, lockRankTraceStrings)
-       lockInit(&trace.lock, lockRankTrace)
        lockInit(&cpuprof.lock, lockRankCpuprof)
-       lockInit(&trace.stackTab.lock, lockRankTraceStackTab)
+       traceLockInit()
        // Enforce that this lock is always a leaf lock.
        // All of this lock's critical sections should be
        // extremely short.
@@ -691,9 +775,11 @@ func schedinit() {
        moduledataverify()
        stackinit()
        mallocinit()
-       cpuinit()      // must run before alginit
-       alginit()      // maps, hash, fastrand must not be used before this call
-       fastrandinit() // must run before mcommoninit
+       godebug := getGodebugEarly()
+       initPageTrace(godebug) // must run after mallocinit but before anything allocates
+       cpuinit(godebug)       // must run before alginit
+       alginit()              // maps, hash, fastrand must not be used before this call
+       fastrandinit()         // must run before mcommoninit
        mcommoninit(gp.m, -1)
        modulesinit()   // provides activeModules
        typelinksinit() // uses maps, activeModules
@@ -703,16 +789,27 @@ func schedinit() {
        sigsave(&gp.m.sigmask)
        initSigmask = gp.m.sigmask
 
-       if offset := unsafe.Offsetof(sched.timeToRun); offset%8 != 0 {
-               println(offset)
-               throw("sched.timeToRun not aligned to 8 bytes")
-       }
-
        goargs()
        goenvs()
+       secure()
+       checkfds()
        parsedebugvars()
        gcinit()
 
+       // Allocate stack space that can be used when crashing due to bad stack
+       // conditions, e.g. morestack on g0.
+       gcrash.stack = stackalloc(16384)
+       gcrash.stackguard0 = gcrash.stack.lo + 1000
+       gcrash.stackguard1 = gcrash.stack.lo + 1000
+
+       // if disableMemoryProfiling is set, update MemProfileRate to 0 to turn off memprofile.
+       // Note: parsedebugvars may update MemProfileRate, but when disableMemoryProfiling is
+       // set to true by the linker, it means that nothing is consuming the profile, it is
+       // safe to set MemProfileRate to 0.
+       if disableMemoryProfiling {
+               MemProfileRate = 0
+       }
+
        lock(&sched.lock)
        sched.lastpoll.Store(nanotime())
        procs := ncpu
@@ -727,17 +824,6 @@ func schedinit() {
        // World is effectively started now, as P's can run.
        worldStarted()
 
-       // For cgocheck > 1, we turn on the write barrier at all times
-       // and check all pointer writes. We can't do this until after
-       // procresize because the write barrier needs a P.
-       if debug.cgocheck > 1 {
-               writeBarrier.cgo = true
-               writeBarrier.enabled = true
-               for _, pp := range allp {
-                       pp.wbBuf.reset()
-               }
-       }
-
        if buildVersion == "" {
                // Condition should never trigger. This code just serves
                // to ensure runtime·buildVersion is kept in the resulting binary.
@@ -760,7 +846,16 @@ func dumpgstatus(gp *g) {
 func checkmcount() {
        assertLockHeld(&sched.lock)
 
-       if mcount() > sched.maxmcount {
+       // Exclude extra M's, which are used for cgocallback from threads
+       // created in C.
+       //
+       // The purpose of the SetMaxThreads limit is to avoid accidental fork
+       // bomb from something like millions of goroutines blocking on system
+       // calls, causing the runtime to create millions of threads. By
+       // definition, this isn't a problem for threads created in C, so we
+       // exclude them from the limit. See https://go.dev/issue/60004.
+       count := mcount() - int32(extraMInUse.Load()) - int32(extraMLength.Load())
+       if count > sched.maxmcount {
                print("runtime: program exceeds ", sched.maxmcount, "-thread limit\n")
                throw("thread exhaustion")
        }
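
The limit being enforced here is the one configured through runtime/debug.SetMaxThreads; per the new comment, extra Ms servicing callbacks from C-created threads no longer count toward it. A small usage example:

    package main

    import (
    	"fmt"
    	"runtime/debug"
    )

    func main() {
    	// SetMaxThreads returns the previous limit (10000 by default). If the
    	// runtime later needs more threads than this for goroutines blocked in
    	// system calls, it crashes with the "thread exhaustion" throw above.
    	prev := debug.SetMaxThreads(2000)
    	fmt.Println("previous thread limit:", prev)
    }
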
@@ -805,7 +900,7 @@ func mcommoninit(mp *m, id int64) {
                hi = 1
        }
        // Same behavior as for 1.17.
-       // TODO: Simplify ths.
+       // TODO: Simplify this.
        if goarch.BigEndian {
                mp.fastrand = uint64(lo)<<32 | uint64(hi)
        } else {
@@ -814,7 +909,7 @@ func mcommoninit(mp *m, id int64) {
 
        mpreinit(mp)
        if mp.gsignal != nil {
-               mp.gsignal.stackguard1 = mp.gsignal.stack.lo + _StackGuard
+               mp.gsignal.stackguard1 = mp.gsignal.stack.lo + stackGuard
        }
 
        // Add to allm so garbage collector doesn't free g->m
@@ -832,6 +927,16 @@ func mcommoninit(mp *m, id int64) {
        }
 }
 
+func (mp *m) becomeSpinning() {
+       mp.spinning = true
+       sched.nmspinning.Add(1)
+       sched.needspinning.Store(0)
+}
+
+func (mp *m) hasCgoOnStack() bool {
+       return mp.ncgo > 0 || mp.isextra
+}
+
 var fastrandseed uintptr
 
 func fastrandinit() {
@@ -841,10 +946,6 @@ func fastrandinit() {
 
 // Mark gp ready to run.
 func ready(gp *g, traceskip int, next bool) {
-       if trace.enabled {
-               traceGoUnpark(gp, traceskip)
-       }
-
        status := readgstatus(gp)
 
        // Mark runnable.
@@ -855,7 +956,12 @@ func ready(gp *g, traceskip int, next bool) {
        }
 
        // status is Gwaiting or Gscanwaiting, make Grunnable and put on runq
+       trace := traceAcquire()
        casgstatus(gp, _Gwaiting, _Grunnable)
+       if trace.ok() {
+               trace.GoUnpark(gp, traceskip)
+               traceRelease(trace)
+       }
        runqput(mp.p.ptr(), gp, next)
        wakep()
        releasem(mp)
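
This acquire/ok/release shape is the heart of the commit: the tracer is taken like a lock around the status transition and the event emission so the trace observes them in a consistent order. The sketch below only mimics the shape with a stand-in type; traceAcquire, the real traceLocker, and its event methods are internal and look different in the runtime.

    package main

    import "fmt"

    var tracingOn = true

    // fakeTracer models the lock-like handle: acquiring it tells you whether
    // tracing is enabled for the duration of the critical section.
    type fakeTracer struct{ enabled bool }

    func traceAcquire() fakeTracer  { return fakeTracer{enabled: tracingOn} }
    func traceRelease(t fakeTracer) {} // would drop the lock / re-enable preemption
    func (t fakeTracer) ok() bool   { return t.enabled }

    func (t fakeTracer) GoUnpark(gid int) { fmt.Println("trace: goroutine", gid, "unparked") }

    func readyModel(gid int) {
    	trace := traceAcquire()
    	// ... the casgstatus to _Grunnable happens here, under the same
    	// acquisition, so the tracer cannot see it out of order ...
    	if trace.ok() {
    		trace.GoUnpark(gid)
    		traceRelease(trace)
    	}
    	// ... runqput and wakep would follow ...
    }

    func main() { readyModel(42) }
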
@@ -867,20 +973,49 @@ const freezeStopWait = 0x7fffffff
 
 // freezing is set to non-zero if the runtime is trying to freeze the
 // world.
-var freezing uint32
+var freezing atomic.Bool
 
 // Similar to stopTheWorld but best-effort and can be called several times.
 // There is no reverse operation, used during crashing.
 // This function must not lock any mutexes.
 func freezetheworld() {
-       atomic.Store(&freezing, 1)
+       freezing.Store(true)
+       if debug.dontfreezetheworld > 0 {
+               // Don't preempt Ps to stop goroutines. That will perturb
+               // scheduler state, making debugging more difficult. Instead,
+               // allow goroutines to continue execution.
+               //
+               // fatalpanic will tracebackothers to trace all goroutines. It
+               // is unsafe to trace a running goroutine, so tracebackothers
+               // will skip running goroutines. That is OK and expected, we
+               // expect users of dontfreezetheworld to use core files anyway.
+               //
+               // However, allowing the scheduler to continue running free
+               // introduces a race: a goroutine may be stopped when
+               // tracebackothers checks its status, and then start running
+               // later when we are in the middle of traceback, potentially
+               // causing a crash.
+               //
+               // To mitigate this, when an M naturally enters the scheduler,
+               // schedule checks if freezing is set and if so stops
+               // execution. This guarantees that while Gs can transition from
+               // running to stopped, they can never transition from stopped
+               // to running.
+               //
+               // The sleep here allows racing Ms that missed freezing and are
+               // about to run a G to complete the transition to running
+               // before we start traceback.
+               usleep(1000)
+               return
+       }
+
        // stopwait and preemption requests can be lost
        // due to races with concurrently executing threads,
        // so try several times
        for i := 0; i < 5; i++ {
                // this should tell the scheduler to not start any new goroutines
                sched.stopwait = freezeStopWait
-               atomic.Store(&sched.gcwaiting, 1)
+               sched.gcwaiting.Store(true)
                // this should stop running goroutines
                if !preemptall() {
                        break // no running goroutines
@@ -898,7 +1033,7 @@ func freezetheworld() {
 //
 //go:nosplit
 func readgstatus(gp *g) uint32 {
-       return atomic.Load(&gp.atomicstatus)
+       return gp.atomicstatus.Load()
 }
 
 // The Gscanstatuses are acting like locks and this releases them.
@@ -920,7 +1055,7 @@ func casfrom_Gscanstatus(gp *g, oldval, newval uint32) {
                _Gscansyscall,
                _Gscanpreempted:
                if newval == oldval&^_Gscan {
-                       success = atomic.Cas(&gp.atomicstatus, oldval, newval)
+                       success = gp.atomicstatus.CompareAndSwap(oldval, newval)
                }
        }
        if !success {
@@ -940,7 +1075,7 @@ func castogscanstatus(gp *g, oldval, newval uint32) bool {
                _Gwaiting,
                _Gsyscall:
                if newval == oldval|_Gscan {
-                       r := atomic.Cas(&gp.atomicstatus, oldval, newval)
+                       r := gp.atomicstatus.CompareAndSwap(oldval, newval)
                        if r {
                                acquireLockRank(lockRankGscan)
                        }
@@ -953,6 +1088,10 @@ func castogscanstatus(gp *g, oldval, newval uint32) bool {
        panic("not reached")
 }
 
+// casgstatusAlwaysTrack is a debug flag that causes casgstatus to always track
+// various latencies on every transition instead of sampling them.
+var casgstatusAlwaysTrack = false
+
 // If asked to move to or from a Gscanstatus this will throw. Use the castogscanstatus
 // and casfrom_Gscanstatus instead.
 // casgstatus will loop if the g->atomicstatus is in a Gscan status until the routine that
@@ -976,15 +1115,15 @@ func casgstatus(gp *g, oldval, newval uint32) {
 
        // loop if gp->atomicstatus is in a scan state giving
        // GC time to finish and change the state to oldval.
-       for i := 0; !atomic.Cas(&gp.atomicstatus, oldval, newval); i++ {
-               if oldval == _Gwaiting && gp.atomicstatus == _Grunnable {
+       for i := 0; !gp.atomicstatus.CompareAndSwap(oldval, newval); i++ {
+               if oldval == _Gwaiting && gp.atomicstatus.Load() == _Grunnable {
                        throw("casgstatus: waiting for Gwaiting but is Grunnable")
                }
                if i == 0 {
                        nextYield = nanotime() + yieldDelay
                }
                if nanotime() < nextYield {
-                       for x := 0; x < 10 && gp.atomicstatus != oldval; x++ {
+                       for x := 0; x < 10 && gp.atomicstatus.Load() != oldval; x++ {
                                procyield(1)
                        }
                } else {
@@ -993,39 +1132,77 @@ func casgstatus(gp *g, oldval, newval uint32) {
                }
        }
 
-       // Handle tracking for scheduling latencies.
        if oldval == _Grunning {
-               // Track every 8th time a goroutine transitions out of running.
-               if gp.trackingSeq%gTrackingPeriod == 0 {
+               // Track every gTrackingPeriod-th time a goroutine transitions out of running.
+               if casgstatusAlwaysTrack || gp.trackingSeq%gTrackingPeriod == 0 {
                        gp.tracking = true
                }
                gp.trackingSeq++
        }
-       if gp.tracking {
-               if oldval == _Grunnable {
-                       // We transitioned out of runnable, so measure how much
-                       // time we spent in this state and add it to
-                       // runnableTime.
-                       now := nanotime()
-                       gp.runnableTime += now - gp.runnableStamp
-                       gp.runnableStamp = 0
+       if !gp.tracking {
+               return
+       }
+
+       // Handle various kinds of tracking.
+       //
+       // Currently:
+       // - Time spent in runnable.
+       // - Time spent blocked on a sync.Mutex or sync.RWMutex.
+       switch oldval {
+       case _Grunnable:
+               // We transitioned out of runnable, so measure how much
+               // time we spent in this state and add it to
+               // runnableTime.
+               now := nanotime()
+               gp.runnableTime += now - gp.trackingStamp
+               gp.trackingStamp = 0
+       case _Gwaiting:
+               if !gp.waitreason.isMutexWait() {
+                       // Not blocking on a lock.
+                       break
                }
-               if newval == _Grunnable {
-                       // We just transitioned into runnable, so record what
-                       // time that happened.
-                       now := nanotime()
-                       gp.runnableStamp = now
-               } else if newval == _Grunning {
-                       // We're transitioning into running, so turn off
-                       // tracking and record how much time we spent in
-                       // runnable.
-                       gp.tracking = false
-                       sched.timeToRun.record(gp.runnableTime)
-                       gp.runnableTime = 0
+               // Blocking on a lock, measure it. Note that because we're
+               // sampling, we have to multiply by our sampling period to get
+               // a more representative estimate of the absolute value.
+               // gTrackingPeriod also represents an accurate sampling period
+               // because we can only enter this state from _Grunning.
+               now := nanotime()
+               sched.totalMutexWaitTime.Add((now - gp.trackingStamp) * gTrackingPeriod)
+               gp.trackingStamp = 0
+       }
+       switch newval {
+       case _Gwaiting:
+               if !gp.waitreason.isMutexWait() {
+                       // Not blocking on a lock.
+                       break
                }
+               // Blocking on a lock. Write down the timestamp.
+               now := nanotime()
+               gp.trackingStamp = now
+       case _Grunnable:
+               // We just transitioned into runnable, so record what
+               // time that happened.
+               now := nanotime()
+               gp.trackingStamp = now
+       case _Grunning:
+               // We're transitioning into running, so turn off
+               // tracking and record how much time we spent in
+               // runnable.
+               gp.tracking = false
+               sched.timeToRun.record(gp.runnableTime)
+               gp.runnableTime = 0
        }
 }
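
A quick worked example of the scaling above: tracking is enabled for only every gTrackingPeriod-th (currently 8th) transition out of _Grunning, so each sampled mutex-wait interval is multiplied by gTrackingPeriod before being added to sched.totalMutexWaitTime; a single sampled 50µs block on a sync.Mutex therefore contributes an estimated 400µs to the total. The runnable-time samples are not scaled here because sched.timeToRun records a distribution of individual latencies rather than a running sum.
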
 
+// casGToWaiting transitions gp from old to _Gwaiting, and sets the wait reason.
+//
+// Use this over casgstatus when possible to ensure that a waitreason is set.
+func casGToWaiting(gp *g, old uint32, reason waitReason) {
+       // Set the wait reason before calling casgstatus, because casgstatus will use it.
+       gp.waitreason = reason
+       casgstatus(gp, old, _Gwaiting)
+}
+
 // casgstatus(gp, oldstatus, Gcopystack), assuming oldstatus is Gwaiting or Grunnable.
 // Returns old status. Cannot call casgstatus directly, because we are racing with an
 // async wakeup that might come in from netpoll. If we see Gwaiting from the readgstatus,
@@ -1039,7 +1216,7 @@ func casgcopystack(gp *g) uint32 {
                if oldstatus != _Gwaiting && oldstatus != _Grunnable {
                        throw("copystack: bad status, not Gwaiting or Grunnable")
                }
-               if atomic.Cas(&gp.atomicstatus, oldstatus, _Gcopystack) {
+               if gp.atomicstatus.CompareAndSwap(oldstatus, _Gcopystack) {
                        return oldstatus
                }
        }
@@ -1054,7 +1231,7 @@ func casGToPreemptScan(gp *g, old, new uint32) {
                throw("bad g transition")
        }
        acquireLockRank(lockRankGscan)
-       for !atomic.Cas(&gp.atomicstatus, _Grunning, _Gscan|_Gpreempted) {
+       for !gp.atomicstatus.CompareAndSwap(_Grunning, _Gscan|_Gpreempted) {
        }
 }
 
@@ -1065,7 +1242,61 @@ func casGFromPreempted(gp *g, old, new uint32) bool {
        if old != _Gpreempted || new != _Gwaiting {
                throw("bad g transition")
        }
-       return atomic.Cas(&gp.atomicstatus, _Gpreempted, _Gwaiting)
+       gp.waitreason = waitReasonPreempted
+       return gp.atomicstatus.CompareAndSwap(_Gpreempted, _Gwaiting)
+}
+
+// stwReason is an enumeration of reasons the world is stopping.
+type stwReason uint8
+
+// Reasons to stop-the-world.
+//
+// Avoid reusing reasons and add new ones instead.
+const (
+       stwUnknown                     stwReason = iota // "unknown"
+       stwGCMarkTerm                                   // "GC mark termination"
+       stwGCSweepTerm                                  // "GC sweep termination"
+       stwWriteHeapDump                                // "write heap dump"
+       stwGoroutineProfile                             // "goroutine profile"
+       stwGoroutineProfileCleanup                      // "goroutine profile cleanup"
+       stwAllGoroutinesStack                           // "all goroutines stack trace"
+       stwReadMemStats                                 // "read mem stats"
+       stwAllThreadsSyscall                            // "AllThreadsSyscall"
+       stwGOMAXPROCS                                   // "GOMAXPROCS"
+       stwStartTrace                                   // "start trace"
+       stwStopTrace                                    // "stop trace"
+       stwForTestCountPagesInUse                       // "CountPagesInUse (test)"
+       stwForTestReadMetricsSlow                       // "ReadMetricsSlow (test)"
+       stwForTestReadMemStatsSlow                      // "ReadMemStatsSlow (test)"
+       stwForTestPageCachePagesLeaked                  // "PageCachePagesLeaked (test)"
+       stwForTestResetDebugLog                         // "ResetDebugLog (test)"
+)
+
+func (r stwReason) String() string {
+       return stwReasonStrings[r]
+}
+
+// If you add to this list, also add it to src/internal/trace/parser.go.
+// If you change the values of any of the stw* constants, bump the trace
+// version number and make a copy of this.
+var stwReasonStrings = [...]string{
+       stwUnknown:                     "unknown",
+       stwGCMarkTerm:                  "GC mark termination",
+       stwGCSweepTerm:                 "GC sweep termination",
+       stwWriteHeapDump:               "write heap dump",
+       stwGoroutineProfile:            "goroutine profile",
+       stwGoroutineProfileCleanup:     "goroutine profile cleanup",
+       stwAllGoroutinesStack:          "all goroutines stack trace",
+       stwReadMemStats:                "read mem stats",
+       stwAllThreadsSyscall:           "AllThreadsSyscall",
+       stwGOMAXPROCS:                  "GOMAXPROCS",
+       stwStartTrace:                  "start trace",
+       stwStopTrace:                   "stop trace",
+       stwForTestCountPagesInUse:      "CountPagesInUse (test)",
+       stwForTestReadMetricsSlow:      "ReadMetricsSlow (test)",
+       stwForTestReadMemStatsSlow:     "ReadMemStatsSlow (test)",
+       stwForTestPageCachePagesLeaked: "PageCachePagesLeaked (test)",
+       stwForTestResetDebugLog:        "ResetDebugLog (test)",
 }
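
The indexed composite-literal keys above keep each string paired with its iota constant even if entries are reordered. A self-contained example of the same stringer pattern:

    package main

    import "fmt"

    type phase uint8

    const (
    	phaseIdle phase = iota // "idle"
    	phaseMark              // "mark"
    	phaseSweep             // "sweep"
    )

    // A missing entry would show up as "" rather than silently borrowing a
    // neighboring constant's name.
    var phaseStrings = [...]string{
    	phaseIdle:  "idle",
    	phaseMark:  "mark",
    	phaseSweep: "sweep",
    }

    func (p phase) String() string { return phaseStrings[p] }

    func main() {
    	fmt.Println(phaseMark) // fmt uses the Stringer and prints "mark"
    }
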
 
 // stopTheWorld stops all P's from executing goroutines, interrupting
@@ -1082,10 +1313,10 @@ func casGFromPreempted(gp *g, old, new uint32) bool {
 // This is also used by routines that do stack dumps. If the system is
 // in panic or being exited, this may not reliably stop all
 // goroutines.
-func stopTheWorld(reason string) {
+func stopTheWorld(reason stwReason) {
        semacquire(&worldsema)
        gp := getg()
-       gp.m.preemptoff = reason
+       gp.m.preemptoff = reason.String()
        systemstack(func() {
                // Mark the goroutine which called stopTheWorld preemptible so its
                // stack may be scanned.
@@ -1097,15 +1328,16 @@ func stopTheWorld(reason string) {
                // must have preempted all goroutines, including any attempting
                // to scan our stack, in which case, any stack shrinking will
                // have already completed by the time we exit.
-               casgstatus(gp, _Grunning, _Gwaiting)
-               stopTheWorldWithSema()
+               // Don't provide a wait reason because we're still executing.
+               casGToWaiting(gp, _Grunning, waitReasonStoppingTheWorld)
+               stopTheWorldWithSema(reason)
                casgstatus(gp, _Gwaiting, _Grunning)
        })
 }
 
 // startTheWorld undoes the effects of stopTheWorld.
 func startTheWorld() {
-       systemstack(func() { startTheWorldWithSema(false) })
+       systemstack(func() { startTheWorldWithSema() })
 
        // worldsema must be held over startTheWorldWithSema to ensure
        // gomaxprocs cannot change while worldsema is held.
@@ -1131,7 +1363,7 @@ func startTheWorld() {
 // stopTheWorldGC has the same effect as stopTheWorld, but blocks
 // until the GC is not running. It also blocks a GC from starting
 // until startTheWorldGC is called.
-func stopTheWorldGC(reason string) {
+func stopTheWorldGC(reason stwReason) {
        semacquire(&gcsema)
        stopTheWorld(reason)
 }
@@ -1175,7 +1407,12 @@ var gcsema uint32 = 1
 // startTheWorldWithSema and stopTheWorldWithSema.
 // Holding worldsema causes any other goroutines invoking
 // stopTheWorld to block.
-func stopTheWorldWithSema() {
+func stopTheWorldWithSema(reason stwReason) {
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.STWStart(reason)
+               traceRelease(trace)
+       }
        gp := getg()
 
        // If we hold a lock, then we won't be able to stop another M
@@ -1186,23 +1423,28 @@ func stopTheWorldWithSema() {
 
        lock(&sched.lock)
        sched.stopwait = gomaxprocs
-       atomic.Store(&sched.gcwaiting, 1)
+       sched.gcwaiting.Store(true)
        preemptall()
        // stop current P
        gp.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic.
        sched.stopwait--
        // try to retake all P's in Psyscall status
+       trace = traceAcquire()
        for _, pp := range allp {
                s := pp.status
                if s == _Psyscall && atomic.Cas(&pp.status, s, _Pgcstop) {
-                       if trace.enabled {
-                               traceGoSysBlock(pp)
-                               traceProcStop(pp)
+                       if trace.ok() {
+                               trace.GoSysBlock(pp)
+                               trace.ProcStop(pp)
                        }
                        pp.syscalltick++
                        sched.stopwait--
                }
        }
+       if trace.ok() {
+               traceRelease(trace)
+       }
+
        // stop idle P's
        now := nanotime()
        for {
@@ -1239,7 +1481,7 @@ func stopTheWorldWithSema() {
                        }
                }
        }
-       if atomic.Load(&freezing) != 0 {
+       if freezing.Load() {
                // Some other thread is panicking. This can cause the
                // sanity checks above to fail if the panic happens in
                // the signal handler on a stopped thread. Either way,
@@ -1254,13 +1496,14 @@ func stopTheWorldWithSema() {
        worldStopped()
 }
 
-func startTheWorldWithSema(emitTraceEvent bool) int64 {
+func startTheWorldWithSema() int64 {
        assertWorldStopped()
 
        mp := acquirem() // disable preemption because it can be holding p in a local var
        if netpollinited() {
-               list := netpoll(0) // non-blocking
+               list, delta := netpoll(0) // non-blocking
                injectglist(&list)
+               netpollAdjustWaiters(delta)
        }
        lock(&sched.lock)
 
@@ -1270,9 +1513,9 @@ func startTheWorldWithSema(emitTraceEvent bool) int64 {
                newprocs = 0
        }
        p1 := procresize(procs)
-       sched.gcwaiting = 0
-       if sched.sysmonwait != 0 {
-               sched.sysmonwait = 0
+       sched.gcwaiting.Store(false)
+       if sched.sysmonwait.Load() {
+               sched.sysmonwait.Store(false)
                notewakeup(&sched.sysmonnote)
        }
        unlock(&sched.lock)
@@ -1298,8 +1541,10 @@ func startTheWorldWithSema(emitTraceEvent bool) int64 {
 
        // Capture start-the-world time before doing clean-up tasks.
        startTime := nanotime()
-       if emitTraceEvent {
-               traceGCSTWDone()
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.STWDone()
+               traceRelease(trace)
        }
 
        // Wakeup an additional proc in case we have excessive runnable goroutines
@@ -1319,7 +1564,7 @@ func usesLibcall() bool {
        case "aix", "darwin", "illumos", "ios", "solaris", "windows":
                return true
        case "openbsd":
-               return GOARCH == "386" || GOARCH == "amd64" || GOARCH == "arm" || GOARCH == "arm64"
+               return GOARCH != "mips64"
        }
        return false
 }
@@ -1331,10 +1576,7 @@ func mStackIsSystemAllocated() bool {
        case "aix", "darwin", "plan9", "illumos", "ios", "solaris", "windows":
                return true
        case "openbsd":
-               switch GOARCH {
-               case "386", "amd64", "arm", "arm64":
-                       return true
-               }
+               return GOARCH != "mips64"
        }
        return false
 }
@@ -1367,14 +1609,14 @@ func mstart0() {
                // but is somewhat arbitrary.
                size := gp.stack.hi
                if size == 0 {
-                       size = 8192 * sys.StackGuardMultiplier
+                       size = 16384 * sys.StackGuardMultiplier
                }
                gp.stack.hi = uintptr(noescape(unsafe.Pointer(&size)))
                gp.stack.lo = gp.stack.hi - size + 1024
        }
        // Initialize stack guard so that we can start calling regular
        // Go code.
-       gp.stackguard0 = gp.stack.lo + _StackGuard
+       gp.stackguard0 = gp.stack.lo + stackGuard
        // This is the g0, so we can also call go:systemstack
        // functions, which check stackguard1.
        gp.stackguard1 = gp.stackguard0
@@ -1514,19 +1756,18 @@ func mexit(osStack bool) {
        }
        throw("m not found in allm")
 found:
-       if !osStack {
-               // Delay reaping m until it's done with the stack.
-               //
-               // If this is using an OS stack, the OS will free it
-               // so there's no need for reaping.
-               atomic.Store(&mp.freeWait, 1)
-               // Put m on the free list, though it will not be reaped until
-               // freeWait is 0. Note that the free list must not be linked
-               // through alllink because some functions walk allm without
-               // locking, so may be using alllink.
-               mp.freelink = sched.freem
-               sched.freem = mp
-       }
+       // Delay reaping m until it's done with the stack.
+       //
+       // Put mp on the free list, though it will not be reaped while freeWait
+       // is freeMWait. mp is no longer reachable via allm, so even if it is
+       // on an OS stack, we must keep a reference to mp alive so that the GC
+       // doesn't free mp while we are still using it.
+       //
+       // Note that the free list must not be linked through alllink because
+       // some functions walk allm without locking, so may be using alllink.
+       mp.freeWait.Store(freeMWait)
+       mp.freelink = sched.freem
+       sched.freem = mp
        unlock(&sched.lock)
 
        atomic.Xadd64(&ncgocall, int64(mp.ncgocall))
@@ -1546,7 +1787,7 @@ found:
        if GOOS == "darwin" || GOOS == "ios" {
                // Make sure pendingPreemptSignals is correct when an M exits.
                // For #41702.
-               if atomic.Load(&mp.signalPending) != 0 {
+               if mp.signalPending.Load() != 0 {
                        pendingPreemptSignals.Add(-1)
                }
        }
@@ -1556,6 +1797,9 @@ found:
        mdestroy(mp)
 
        if osStack {
+               // No more uses of mp, so it is safe to drop the reference.
+               mp.freeWait.Store(freeMRef)
+
                // Return from mstart and let the system thread
                // library free the g0 stack and terminate the thread.
                return
@@ -1619,17 +1863,21 @@ func forEachP(fn func(*p)) {
 
        // Force Ps currently in _Psyscall into _Pidle and hand them
        // off to induce safe point function execution.
+       trace := traceAcquire()
        for _, p2 := range allp {
                s := p2.status
                if s == _Psyscall && p2.runSafePointFn == 1 && atomic.Cas(&p2.status, s, _Pidle) {
-                       if trace.enabled {
-                               traceGoSysBlock(p2)
-                               traceProcStop(p2)
+                       if trace.ok() {
+                               trace.GoSysBlock(p2)
+                               trace.ProcStop(p2)
                        }
                        p2.syscalltick++
                        handoffp(p2)
                }
        }
+       if trace.ok() {
+               traceRelease(trace)
+       }
 
        // Wait for remaining Ps to run fn.
        if wait {
@@ -1727,19 +1975,25 @@ func allocm(pp *p, fn func(), id int64) *m {
                lock(&sched.lock)
                var newList *m
                for freem := sched.freem; freem != nil; {
-                       if freem.freeWait != 0 {
+                       wait := freem.freeWait.Load()
+                       if wait == freeMWait {
                                next := freem.freelink
                                freem.freelink = newList
                                newList = freem
                                freem = next
                                continue
                        }
-                       // stackfree must be on the system stack, but allocm is
-                       // reachable off the system stack transitively from
-                       // startm.
-                       systemstack(func() {
-                               stackfree(freem.g0.stack)
-                       })
+                       // Free the stack if needed. For freeMRef, there is
+                       // nothing to do except drop freem from the sched.freem
+                       // list.
+                       if wait == freeMStack {
+                               // stackfree must be on the system stack, but allocm is
+                               // reachable off the system stack transitively from
+                               // startm.
+                               systemstack(func() {
+                                       stackfree(freem.g0.stack)
+                               })
+                       }
                        freem = freem.freelink
                }
                sched.freem = newList
@@ -1755,7 +2009,7 @@ func allocm(pp *p, fn func(), id int64) *m {
        if iscgo || mStackIsSystemAllocated() {
                mp.g0 = malg(-1)
        } else {
-               mp.g0 = malg(8192 * sys.StackGuardMultiplier)
+               mp.g0 = malg(16384 * sys.StackGuardMultiplier)
        }
        mp.g0.m = mp
 
@@ -1799,11 +2053,15 @@ func allocm(pp *p, fn func(), id int64) *m {
 // pressed into service as the scheduling stack and current
 // goroutine for the duration of the cgo callback.
 //
-// When the callback is done with the m, it calls dropm to
-// put the m back on the list.
+// It calls dropm to put the m back on the list,
+// 1. when the callback is done with the m in non-pthread platforms,
+// 2. or when the C thread is exiting on pthread platforms.
+//
+// The signal argument indicates whether we're called from a signal
+// handler.
 //
 //go:nosplit
-func needm() {
+func needm(signal bool) {
        if (iscgo || GOOS == "windows") && !cgoHasExtraM {
                // Can happen if C/C++ code calls Go from a global ctor.
                // Can also happen on Windows if a global ctor uses a
@@ -1811,7 +2069,7 @@ func needm() {
                // for details.
                //
                // Can not throw, because scheduler is not initialized yet.
-               write(2, unsafe.Pointer(&earlycgocallback[0]), int32(len(earlycgocallback)))
+               writeErrStr("fatal error: cgo callback before cgo call\n")
                exit(1)
        }
 
@@ -1827,11 +2085,10 @@ func needm() {
        sigsave(&sigmask)
        sigblock(false)
 
-       // Lock extra list, take head, unlock popped list.
-       // nilokay=false is safe here because of the invariant above,
+       // getExtraM is safe here because of the invariant above,
        // that the extra list always contains or will soon contain
        // at least one m.
-       mp := lockextra(false)
+       mp, last := getExtraM()
 
        // Set needextram when we've just emptied the list,
        // so that the eventual call into cgocallbackg will
@@ -1840,9 +2097,7 @@ func needm() {
        // after exitsyscall makes sure it is okay to be
        // running at all (that is, there's no garbage collection
        // running right now).
-       mp.needextram = mp.schedlink == 0
-       extraMCount--
-       unlockextra(mp.schedlink.ptr())
+       mp.needextram = last
 
        // Store the original signal mask for use by minit.
        mp.sigmask = sigmask
@@ -1852,15 +2107,15 @@ func needm() {
        osSetupTLS(mp)
 
        // Install g (= m->g0) and set the stack bounds
-       // to match the current stack. We don't actually know
-       // how big the stack is, like we don't know how big any
-       // scheduling stack is, but we assume there's at least 32 kB,
-       // which is more than enough for us.
+       // to match the current stack.
        setg(mp.g0)
-       gp := getg()
-       gp.stack.hi = getcallersp() + 1024
-       gp.stack.lo = getcallersp() - 32*1024
-       gp.stackguard0 = gp.stack.lo + _StackGuard
+       sp := getcallersp()
+       callbackUpdateSystemStack(mp, sp, signal)
+
+       // Mark that we are already in Go now.
+       // Otherwise, we might call needm again when we get a signal, before cgocallbackg1,
+       // which means the extra M list may be empty; that would cause a deadlock.
+       mp.isExtraInC = false
 
        // Initialize this thread to use the m.
        asminit()
@@ -1871,24 +2126,29 @@ func needm() {
        sched.ngsys.Add(-1)
 }
 
-var earlycgocallback = []byte("fatal error: cgo callback before cgo call\n")
+// Acquire an extra m and bind it to the C thread when a pthread key has been created.
+//
+//go:nosplit
+func needAndBindM() {
+       needm(false)
+
+       if _cgo_pthread_key_created != nil && *(*uintptr)(_cgo_pthread_key_created) != 0 {
+               cgoBindM()
+       }
+}
 
 // newextram allocates m's and puts them on the extra list.
 // It is called with a working local m, so that it can do things
 // like call schedlock and allocate.
 func newextram() {
-       c := atomic.Xchg(&extraMWaiters, 0)
+       c := extraMWaiters.Swap(0)
        if c > 0 {
                for i := uint32(0); i < c; i++ {
                        oneNewExtraM()
                }
-       } else {
+       } else if extraMLength.Load() == 0 {
                // Make sure there is at least one extra M.
-               mp := lockextra(true)
-               unlockextra(mp)
-               if mp == nil {
-                       oneNewExtraM()
-               }
+               oneNewExtraM()
        }
 }
 
@@ -1916,6 +2176,9 @@ func oneNewExtraM() {
        casgstatus(gp, _Gidle, _Gdead)
        gp.m = mp
        mp.curg = gp
+       mp.isextra = true
+       // mark we are in C by default.
+       mp.isExtraInC = true
        mp.lockedInt++
        mp.lockedg.set(gp)
        gp.lockedm.set(mp)
@@ -1923,6 +2186,11 @@ func oneNewExtraM() {
        if raceenabled {
                gp.racectx = racegostart(abi.FuncPCABIInternal(newextram) + sys.PCQuantum)
        }
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.OneNewExtraM(gp)
+               traceRelease(trace)
+       }
        // put on allg for garbage collector
        allgadd(gp)
 
@@ -1933,15 +2201,14 @@ func oneNewExtraM() {
        sched.ngsys.Add(1)
 
        // Add m to the extra list.
-       mnext := lockextra(true)
-       mp.schedlink.set(mnext)
-       extraMCount++
-       unlockextra(mp)
+       addExtraM(mp)
 }
 
+// dropm puts the current m back onto the extra list.
+//
+// 1. On systems without pthreads, like Windows
 // dropm is called when a cgo callback has called needm but is now
 // done with the callback and returning back into the non-Go thread.
-// It puts the current m back onto the extra list.
 //
 // The main expense here is the call to signalstack to release the
 // m's signal stack, and then the call to needm on the next callback
@@ -1953,15 +2220,23 @@ func oneNewExtraM() {
 // call. These should typically not be scheduling operations, just a few
 // atomics, so the cost should be small.
 //
-// TODO(rsc): An alternative would be to allocate a dummy pthread per-thread
-// variable using pthread_key_create. Unlike the pthread keys we already use
-// on OS X, this dummy key would never be read by Go code. It would exist
-// only so that we could register at thread-exit-time destructor.
-// That destructor would put the m back onto the extra list.
-// This is purely a performance optimization. The current version,
-// in which dropm happens on each cgo call, is still correct too.
-// We may have to keep the current version on systems with cgo
-// but without pthreads, like Windows.
+// 2. On systems with pthreads
+// dropm is called while a non-Go thread is exiting.
+// We allocate a pthread per-thread variable using pthread_key_create,
+// to register a thread-exit-time destructor.
+// We store the g into a thread-specific value associated with the pthread key
+// when we first return back to C, so that the destructor invokes dropm while
+// the non-Go thread is exiting.
+// This is much faster since it avoids expensive signal-related syscalls.
+//
+// This always runs without a P, so //go:nowritebarrierrec is required.
+//
+// This may run with a different stack than was recorded in g0 (there is no
+// call to callbackUpdateSystemStack prior to dropm), so this must be
+// //go:nosplit to avoid the stack bounds check.
+//
+//go:nowritebarrierrec
+//go:nosplit
 func dropm() {
        // Clear m and g, and return m to the extra list.
        // After the call to setg we can only call nosplit functions
@@ -1981,26 +2256,75 @@ func dropm() {
        sigblock(false)
        unminit()
 
-       mnext := lockextra(true)
-       extraMCount++
-       mp.schedlink.set(mnext)
-
        setg(nil)
 
-       // Commit the release of mp.
-       unlockextra(mp)
+       // Clear g0 stack bounds to ensure that needm always refreshes the
+       // bounds when reusing this M.
+       g0 := mp.g0
+       g0.stack.hi = 0
+       g0.stack.lo = 0
+       g0.stackguard0 = 0
+       g0.stackguard1 = 0
+
+       putExtraM(mp)
 
        msigrestore(sigmask)
 }
 
+// cgoBindM stores the g0 of the current m into a thread-specific value.
+//
+// We allocate a pthread per-thread variable using pthread_key_create,
+// to register a thread-exit-time destructor.
+// Here we set the thread-specific value of the pthread key, to enable the
+// destructor, so that pthread_key_destructor calls dropm while the C thread
+// is exiting.
+//
+// The saved g will be used in pthread_key_destructor: on some platforms the
+// g stored in the TLS by Go might be cleared before the destructor is
+// invoked, so we restore g from the saved value before calling dropm.
+//
+// We store g0 instead of m, to make the assembly code simpler,
+// since we need to restore g0 in runtime.cgocallback.
+//
+// On systems without pthreads, like Windows, bindm shouldn't be used.
+//
+// NOTE: this always runs without a P, so, nowritebarrierrec required.
+//
+//go:nosplit
+//go:nowritebarrierrec
+func cgoBindM() {
+       if GOOS == "windows" || GOOS == "plan9" {
+               fatal("bindm in unexpected GOOS")
+       }
+       g := getg()
+       if g.m.g0 != g {
+               fatal("the current g is not g0")
+       }
+       if _cgo_bindm != nil {
+               asmcgocall(_cgo_bindm, unsafe.Pointer(g))
+       }
+}
+
 // A helper function for EnsureDropM.
 func getm() uintptr {
        return uintptr(unsafe.Pointer(getg().m))
 }
 
-var extram uintptr
-var extraMCount uint32 // Protected by lockextra
-var extraMWaiters uint32
+var (
+       // Locking linked list of extra M's, via mp.schedlink. Must be accessed
+       // only via lockextra/unlockextra.
+       //
+       // Can't be atomic.Pointer[m] because we use an invalid pointer as a
+       // "locked" sentinel value. M's on this list remain visible to the GC
+       // because their mp.curg is on allgs.
+       extraM atomic.Uintptr
+       // Number of M's in the extraM list.
+       extraMLength atomic.Uint32
+       // Number of waiters in lockextra.
+       extraMWaiters atomic.Uint32
+
+       // Number of extra M's in use by threads.
+       extraMInUse atomic.Uint32
+)
 
 // lockextra locks the extra list and returns the list head.
 // The caller must unlock the list by storing a new list head
@@ -2014,7 +2338,7 @@ func lockextra(nilokay bool) *m {
 
        incr := false
        for {
-               old := atomic.Loaduintptr(&extram)
+               old := extraM.Load()
                if old == locked {
                        osyield_no_g()
                        continue
@@ -2024,13 +2348,13 @@ func lockextra(nilokay bool) *m {
                                // Add 1 to the number of threads
                                // waiting for an M.
                                // This is cleared by newextram.
-                               atomic.Xadd(&extraMWaiters, 1)
+                               extraMWaiters.Add(1)
                                incr = true
                        }
                        usleep_no_g(1)
                        continue
                }
-               if atomic.Casuintptr(&extram, old, locked) {
+               if extraM.CompareAndSwap(old, locked) {
                        return (*m)(unsafe.Pointer(old))
                }
                osyield_no_g()
@@ -2039,8 +2363,41 @@ func lockextra(nilokay bool) *m {
 }
 
 //go:nosplit
-func unlockextra(mp *m) {
-       atomic.Storeuintptr(&extram, uintptr(unsafe.Pointer(mp)))
+func unlockextra(mp *m, delta int32) {
+       extraMLength.Add(delta)
+       extraM.Store(uintptr(unsafe.Pointer(mp)))
+}
+
+// Return an M from the extra M list. Returns last == true if the list becomes
+// empty because of this call.
+//
+// Spins waiting for an extra M, so caller must ensure that the list always
+// contains or will soon contain at least one M.
+//
+//go:nosplit
+func getExtraM() (mp *m, last bool) {
+       mp = lockextra(false)
+       extraMInUse.Add(1)
+       unlockextra(mp.schedlink.ptr(), -1)
+       return mp, mp.schedlink.ptr() == nil
+}
+
+// putExtraM returns an extra M to the list. mp must have come from
+// getExtraM. Newly allocated M's should use addExtraM.
+//
+//go:nosplit
+func putExtraM(mp *m) {
+       extraMInUse.Add(-1)
+       addExtraM(mp)
+}
+
+// Adds a newly allocated M to the extra M list.
+//
+//go:nosplit
+func addExtraM(mp *m) {
+       mnext := lockextra(true)
+       mp.schedlink.set(mnext)
+       unlockextra(mp, 1)
 }
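
For readers unfamiliar with the pattern used by lockextra/unlockextra above, here is a minimal standalone sketch of a list head that doubles as a spinlock via a sentinel value. The node/lockList/unlockList/push/pop names are invented for illustration; the runtime stores an invalid uintptr rather than a sentinel node, but the shape is the same.

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
)

// node is an element of a singly linked list whose head pointer doubles as
// a spinlock: storing the sentinel in head means "list is locked".
type node struct {
	val  int
	next *node
}

var (
	locked = new(node) // sentinel; the runtime uses an invalid uintptr instead
	head   atomic.Pointer[node]
)

// lockList spins until it can swap the current head for the sentinel and
// returns the head it displaced.
func lockList() *node {
	for {
		old := head.Load()
		if old == locked {
			runtime.Gosched()
			continue
		}
		if head.CompareAndSwap(old, locked) {
			return old
		}
	}
}

// unlockList publishes a new head, releasing the "lock".
func unlockList(n *node) { head.Store(n) }

func push(v int) {
	h := lockList()
	unlockList(&node{val: v, next: h})
}

func pop() (int, bool) {
	h := lockList()
	if h == nil {
		unlockList(nil)
		return 0, false
	}
	unlockList(h.next)
	return h.val, true
}

func main() {
	push(1)
	push(2)
	v, ok := pop()
	fmt.Println(v, ok) // 2 true
}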
 
 var (
@@ -2055,6 +2412,13 @@ var (
        execLock rwmutex
 )
 
+// These errors are reported (via writeErrStr) by some OS-specific
+// versions of newosproc and newosproc0.
+const (
+       failthreadcreate  = "runtime: failed to create new OS thread\n"
+       failallocatestack = "runtime: failed to allocate stack for the new OS thread\n"
+)
+
 // newmHandoff contains a list of m structures that need new OS threads.
 // This is used by newm in situations where newm itself can't safely
 // start an OS thread.
@@ -2247,16 +2611,21 @@ func mspinning() {
 // Schedules some M to run the p (creates an M if necessary).
 // If p==nil, tries to get an idle P, if no idle P's does nothing.
 // May run with m.p==nil, so write barriers are not allowed.
-// If spinning is set, the caller has incremented nmspinning and startm will
-// either decrement nmspinning or set m.spinning in the newly started M.
+// If spinning is set, the caller has incremented nmspinning and must provide a
+// P. startm will set m.spinning in the newly started M.
 //
 // Callers passing a non-nil P must call from a non-preemptible context. See
 // comment on acquirem below.
 //
+// Argument lockheld indicates whether the caller already acquired the
+// scheduler lock. Callers holding the lock when making the call must pass
+// true. The lock might be temporarily dropped, but will be reacquired before
+// returning.
+//
 // Must not have write barriers because this may be called without a P.
 //
 //go:nowritebarrierrec
-func startm(pp *p, spinning bool) {
+func startm(pp *p, spinning, lockheld bool) {
        // Disable preemption.
        //
        // Every owned P must have an owner that will eventually stop it in the
@@ -2274,17 +2643,20 @@ func startm(pp *p, spinning bool) {
        // startm. Callers passing a nil P may be preemptible, so we must
        // disable preemption before acquiring a P from pidleget below.
        mp := acquirem()
-       lock(&sched.lock)
+       if !lockheld {
+               lock(&sched.lock)
+       }
        if pp == nil {
+               if spinning {
+                       // TODO(prattmic): All remaining calls to this function
+                       // with pp == nil could be cleaned up to find a P
+                       // before calling startm.
+                       throw("startm: P required for spinning=true")
+               }
                pp, _ = pidleget(0)
                if pp == nil {
-                       unlock(&sched.lock)
-                       if spinning {
-                               // The caller incremented nmspinning, but there are no idle Ps,
-                               // so it's okay to just undo the increment and give up.
-                               if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 {
-                                       throw("startm: negative nmspinning")
-                               }
+                       if !lockheld {
+                               unlock(&sched.lock)
                        }
                        releasem(mp)
                        return
@@ -2299,6 +2671,8 @@ func startm(pp *p, spinning bool) {
                // could find no idle P while checkdead finds a runnable G but
                // no running M's because this new M hasn't started yet, thus
                // throwing in an apparent deadlock.
+               // This apparent deadlock is possible when startm is called
+               // from sysmon, which doesn't count as a running M.
                //
                // Avoid this situation by pre-allocating the ID for the new M,
                // thus marking it as 'running' before we drop sched.lock. This
@@ -2313,12 +2687,18 @@ func startm(pp *p, spinning bool) {
                        fn = mspinning
                }
                newm(fn, pp, id)
+
+               if lockheld {
+                       lock(&sched.lock)
+               }
                // Ownership transfer of pp committed by start in newm.
                // Preemption is now safe.
                releasem(mp)
                return
        }
-       unlock(&sched.lock)
+       if !lockheld {
+               unlock(&sched.lock)
+       }
        if nmp.spinning {
                throw("startm: m is spinning")
        }
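
A small standalone sketch of the lockheld convention added to startm above, using sync.Mutex in place of the runtime's sched.lock; doWork and mu are illustrative names only. The function leaves the lock in whatever state the caller established.

package main

import (
	"fmt"
	"sync"
)

var mu sync.Mutex // stand-in for sched.lock

// doWork follows the lockheld convention: callers that already hold mu pass
// lockheld=true, and the function leaves the lock in the state it found it.
func doWork(lockheld bool) {
	if !lockheld {
		mu.Lock()
	}
	// ... operate on state guarded by mu ...
	if !lockheld {
		mu.Unlock()
	}
}

func main() {
	doWork(false) // caller does not hold mu

	mu.Lock()
	doWork(true) // caller already holds mu
	mu.Unlock()

	fmt.Println("ok")
}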
@@ -2347,27 +2727,28 @@ func handoffp(pp *p) {
 
        // if it has local work, start it straight away
        if !runqempty(pp) || sched.runqsize != 0 {
-               startm(pp, false)
+               startm(pp, false, false)
                return
        }
        // if there's trace work to do, start it straight away
-       if (trace.enabled || trace.shutdown) && traceReaderAvailable() != nil {
-               startm(pp, false)
+       if (traceEnabled() || traceShuttingDown()) && traceReaderAvailable() != nil {
+               startm(pp, false, false)
                return
        }
        // if it has GC work, start it straight away
        if gcBlackenEnabled != 0 && gcMarkWorkAvailable(pp) {
-               startm(pp, false)
+               startm(pp, false, false)
                return
        }
        // no local work, check that there are no spinning/idle M's,
        // otherwise our help is not required
-       if int32(atomic.Load(&sched.nmspinning))+sched.npidle.Load() == 0 && atomic.Cas(&sched.nmspinning, 0, 1) { // TODO: fast atomic
-               startm(pp, true)
+       if sched.nmspinning.Load()+sched.npidle.Load() == 0 && sched.nmspinning.CompareAndSwap(0, 1) { // TODO: fast atomic
+               sched.needspinning.Store(0)
+               startm(pp, true, false)
                return
        }
        lock(&sched.lock)
-       if sched.gcwaiting != 0 {
+       if sched.gcwaiting.Load() {
                pp.status = _Pgcstop
                sched.stopwait--
                if sched.stopwait == 0 {
@@ -2385,14 +2766,14 @@ func handoffp(pp *p) {
        }
        if sched.runqsize != 0 {
                unlock(&sched.lock)
-               startm(pp, false)
+               startm(pp, false, false)
                return
        }
        // If this is the last running P and nobody is polling network,
        // need to wakeup another M to poll network.
        if sched.npidle.Load() == gomaxprocs-1 && sched.lastpoll.Load() != 0 {
                unlock(&sched.lock)
-               startm(pp, false)
+               startm(pp, false, false)
                return
        }
 
@@ -2409,15 +2790,41 @@ func handoffp(pp *p) {
 
 // Tries to add one more P to execute G's.
 // Called when a G is made runnable (newproc, ready).
+// Must be called with a P.
 func wakep() {
-       if sched.npidle.Load() == 0 {
+       // Be conservative about spinning threads, only start one if none exist
+       // already.
+       if sched.nmspinning.Load() != 0 || !sched.nmspinning.CompareAndSwap(0, 1) {
                return
        }
-       // be conservative about spinning threads
-       if atomic.Load(&sched.nmspinning) != 0 || !atomic.Cas(&sched.nmspinning, 0, 1) {
+
+       // Disable preemption until ownership of pp transfers to the next M in
+       // startm. Otherwise preemption here would leave pp stuck waiting to
+       // enter _Pgcstop.
+       //
+       // See preemption comment on acquirem in startm for more details.
+       mp := acquirem()
+
+       var pp *p
+       lock(&sched.lock)
+       pp, _ = pidlegetSpinning(0)
+       if pp == nil {
+               if sched.nmspinning.Add(-1) < 0 {
+                       throw("wakep: negative nmspinning")
+               }
+               unlock(&sched.lock)
+               releasem(mp)
                return
        }
-       startm(nil, true)
+       // Since we always have a P, the race in the "No M is available"
+       // comment in startm doesn't apply during the small window between the
+       // unlock here and lock in startm. A checkdead in between will always
+       // see at least one running M (ours).
+       unlock(&sched.lock)
+
+       startm(pp, true, false)
+
+       releasem(mp)
 }
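
The nmspinning.CompareAndSwap(0, 1) check in wakep above is an at-most-one-waker pattern: many wakeups, but only one extra spinning worker is started at a time. A rough user-level analogy follows; wakerPending and wake are invented names, not scheduler code.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// wakerPending plays the role of sched.nmspinning: the CAS ensures at most
// one extra "spinning" worker is started per burst of wakeups.
var (
	wakerPending atomic.Int32
	wg           sync.WaitGroup
)

func wake() {
	// Be conservative: only start a worker if none is pending already.
	if wakerPending.Load() != 0 || !wakerPending.CompareAndSwap(0, 1) {
		return
	}
	wg.Add(1)
	go func() {
		defer wg.Done()
		// ... look for work here ...
		wakerPending.Add(-1) // stop "spinning"
	}()
}

func main() {
	for i := 0; i < 100; i++ {
		wake() // many wakeups, at most one pending worker at a time
	}
	wg.Wait()
	fmt.Println("done")
}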
 
 // Stops execution of the current m that is locked to a g until the g is runnable again.
@@ -2471,14 +2878,14 @@ func startlockedm(gp *g) {
 func gcstopm() {
        gp := getg()
 
-       if sched.gcwaiting == 0 {
+       if !sched.gcwaiting.Load() {
                throw("gcstopm: not waiting for gc")
        }
        if gp.m.spinning {
                gp.m.spinning = false
                // OK to just drop nmspinning here,
                // startTheWorld will unpark threads as necessary.
-               if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 {
+               if sched.nmspinning.Add(-1) < 0 {
                        throw("gcstopm: negative nmspinning")
                }
        }
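
Many hunks in this change replace atomic.Xadd/atomic.Load/atomic.Store on plain integer fields with typed atomics that carry their own methods, as in sched.nmspinning.Add(-1) above. A tiny sketch of the target style using sync/atomic's typed values; schedStats is an invented stand-in, not the runtime's schedt.

package main

import (
	"fmt"
	"sync/atomic"
)

// schedStats stands in for the converted fields: typed atomics provide
// Load/Store/Add/CompareAndSwap methods instead of free functions.
type schedStats struct {
	nmspinning atomic.Int32 // was a uint32 updated with atomic.Xadd
	gcwaiting  atomic.Bool  // was a uint32 updated with atomic.Store
}

func main() {
	var s schedStats

	s.nmspinning.Add(1)
	if s.nmspinning.Add(-1) < 0 {
		panic("negative nmspinning")
	}
	s.gcwaiting.Store(true)
	fmt.Println(s.gcwaiting.Load(), s.nmspinning.Load()) // true 0
}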
@@ -2519,7 +2926,7 @@ func execute(gp *g, inheritTime bool) {
        casgstatus(gp, _Grunnable, _Grunning)
        gp.waitsince = 0
        gp.preempt = false
-       gp.stackguard0 = gp.stack.lo + _StackGuard
+       gp.stackguard0 = gp.stack.lo + stackGuard
        if !inheritTime {
                mp.p.ptr().schedtick++
        }
@@ -2530,13 +2937,15 @@ func execute(gp *g, inheritTime bool) {
                setThreadCPUProfiler(hz)
        }
 
-       if trace.enabled {
+       trace := traceAcquire()
+       if trace.ok() {
                // GoSysExit has to happen when we have a P, but before GoStart.
                // So we emit it here.
-               if gp.syscallsp != 0 && gp.sysblocktraced {
-                       traceGoSysExit(gp.sysexitticks)
+               if gp.syscallsp != 0 {
+                       trace.GoSysExit()
                }
-               traceGoStart()
+               trace.GoStart()
+               traceRelease(trace)
        }
 
        gogo(&gp.sched)
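
The recurring traceAcquire / trace.ok() / traceRelease shape introduced above is the point of this change: tracer usage now reads like taking and releasing a lock around event emission. A toy sketch of an API with that shape follows; traceLocker, the counters, and GoUnpark here are invented and far simpler than the runtime's real tracer.

package main

import (
	"fmt"
	"sync/atomic"
)

// traceLocker is a toy handle in the acquire/ok/release shape.
type traceLocker struct{ enabled bool }

var (
	traceEnabledFlag atomic.Bool  // whether tracing is on at all
	traceUsers       atomic.Int32 // how many emitters currently hold the tracer
)

func traceAcquire() traceLocker {
	if !traceEnabledFlag.Load() {
		return traceLocker{} // ok() will report false
	}
	traceUsers.Add(1)
	return traceLocker{enabled: true}
}

func (t traceLocker) ok() bool { return t.enabled }

// GoUnpark emits an event; it is only valid between acquire and release.
func (t traceLocker) GoUnpark(goid uint64) {
	fmt.Println("GoUnpark", goid)
}

func traceRelease(t traceLocker) {
	if t.enabled {
		traceUsers.Add(-1)
	}
}

func main() {
	traceEnabledFlag.Store(true)
	trace := traceAcquire()
	if trace.ok() {
		trace.GoUnpark(42)
		traceRelease(trace)
	}
}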
@@ -2555,7 +2964,7 @@ func findRunnable() (gp *g, inheritTime, tryWakeP bool) {
 
 top:
        pp := mp.p.ptr()
-       if sched.gcwaiting != 0 {
+       if sched.gcwaiting.Load() {
                gcstopm()
                goto top
        }
@@ -2570,11 +2979,15 @@ top:
        now, pollUntil, _ := checkTimers(pp, 0)
 
        // Try to schedule the trace reader.
-       if trace.enabled || trace.shutdown {
+       if traceEnabled() || traceShuttingDown() {
                gp := traceReader()
                if gp != nil {
+                       trace := traceAcquire()
                        casgstatus(gp, _Gwaiting, _Grunnable)
-                       traceGoUnpark(gp, 0)
+                       if trace.ok() {
+                               trace.GoUnpark(gp, 0)
+                               traceRelease(trace)
+                       }
                        return gp, false, true
                }
        }
@@ -2601,7 +3014,7 @@ top:
        }
 
        // Wake up the finalizer G.
-       if fingwait && fingwake {
+       if fingStatus.Load()&(fingWait|fingWake) == fingWait|fingWake {
                if gp := wakefing(); gp != nil {
                        ready(gp, 0, true)
                }
@@ -2632,13 +3045,16 @@ top:
        // blocked thread (e.g. it has already returned from netpoll, but does
        // not set lastpoll yet), this thread will do blocking netpoll below
        // anyway.
-       if netpollinited() && atomic.Load(&netpollWaiters) > 0 && sched.lastpoll.Load() != 0 {
-               if list := netpoll(0); !list.empty() { // non-blocking
+       if netpollinited() && netpollAnyWaiters() && sched.lastpoll.Load() != 0 {
+               if list, delta := netpoll(0); !list.empty() { // non-blocking
                        gp := list.pop()
                        injectglist(&list)
+                       netpollAdjustWaiters(delta)
+                       trace := traceAcquire()
                        casgstatus(gp, _Gwaiting, _Grunnable)
-                       if trace.enabled {
-                               traceGoUnpark(gp, 0)
+                       if trace.ok() {
+                               trace.GoUnpark(gp, 0)
+                               traceRelease(trace)
                        }
                        return gp, false, false
                }
@@ -2649,10 +3065,9 @@ top:
        // Limit the number of spinning Ms to half the number of busy Ps.
        // This is necessary to prevent excessive CPU consumption when
        // GOMAXPROCS>>1 but the program parallelism is low.
-       if mp.spinning || int32(2*atomic.Load(&sched.nmspinning)) < gomaxprocs-sched.npidle.Load() {
+       if mp.spinning || 2*sched.nmspinning.Load() < gomaxprocs-sched.npidle.Load() {
                if !mp.spinning {
-                       mp.spinning = true
-                       atomic.Xadd(&sched.nmspinning, 1)
+                       mp.becomeSpinning()
                }
 
                gp, inheritTime, tnow, w, newWork := stealWork(now)
@@ -2682,9 +3097,12 @@ top:
                if node != nil {
                        pp.gcMarkWorkerMode = gcMarkWorkerIdleMode
                        gp := node.gp.ptr()
+
+                       trace := traceAcquire()
                        casgstatus(gp, _Gwaiting, _Grunnable)
-                       if trace.enabled {
-                               traceGoUnpark(gp, 0)
+                       if trace.ok() {
+                               trace.GoUnpark(gp, 0)
+                               traceRelease(trace)
                        }
                        return gp, false, false
                }
@@ -2697,9 +3115,11 @@ top:
        // until a callback was triggered.
        gp, otherReady := beforeIdle(now, pollUntil)
        if gp != nil {
+               trace := traceAcquire()
                casgstatus(gp, _Gwaiting, _Grunnable)
-               if trace.enabled {
-                       traceGoUnpark(gp, 0)
+               if trace.ok() {
+                       trace.GoUnpark(gp, 0)
+                       traceRelease(trace)
                }
                return gp, false, false
        }
@@ -2719,7 +3139,7 @@ top:
 
        // return P and block
        lock(&sched.lock)
-       if sched.gcwaiting != 0 || pp.runSafePointFn != 0 {
+       if sched.gcwaiting.Load() || pp.runSafePointFn != 0 {
                unlock(&sched.lock)
                goto top
        }
@@ -2728,6 +3148,12 @@ top:
                unlock(&sched.lock)
                return gp, false, false
        }
+       if !mp.spinning && sched.needspinning.Load() == 1 {
+               // See "Delicate dance" comment below.
+               mp.becomeSpinning()
+               unlock(&sched.lock)
+               goto top
+       }
        if releasep() != pp {
                throw("findrunnable: wrong p")
        }
@@ -2744,35 +3170,70 @@ top:
        //
        // This applies to the following sources of work:
        //
-       // * Goroutines added to a per-P run queue.
+       // * Goroutines added to the global or a per-P run queue.
        // * New/modified-earlier timers on a per-P timer heap.
        // * Idle-priority GC work (barring golang.org/issue/19112).
        //
-       // If we discover new work below, we need to restore m.spinning as a signal
-       // for resetspinning to unpark a new worker thread (because there can be more
-       // than one starving goroutine). However, if after discovering new work
-       // we also observe no idle Ps it is OK to skip unparking a new worker
-       // thread: the system is fully loaded so no spinning threads are required.
-       // Also see "Worker thread parking/unparking" comment at the top of the file.
+       // If we discover new work below, we need to restore m.spinning as a
+       // signal for resetspinning to unpark a new worker thread (because
+       // there can be more than one starving goroutine).
+       //
+       // However, if after discovering new work we also observe no idle Ps
+       // (either here or in resetspinning), we have a problem. We may be
+       // racing with a non-spinning M in the block above, having found no
+       // work and preparing to release its P and park. Allowing that P to go
+       // idle will result in loss of work conservation (idle P while there is
+       // runnable work). This could result in complete deadlock in the
+       // unlikely event that we discover new work (from netpoll) right as we
+       // are racing with _all_ other Ps going idle.
+       //
+       // We use sched.needspinning to synchronize with non-spinning Ms going
+       // idle. If needspinning is set when they are about to drop their P,
+       // they abort the drop and instead become a new spinning M on our
+       // behalf. If we are not racing and the system is truly fully loaded
+       // then no spinning threads are required, and the next thread to
+       // naturally become spinning will clear the flag.
+       //
+       // Also see "Worker thread parking/unparking" comment at the top of the
+       // file.
        wasSpinning := mp.spinning
        if mp.spinning {
                mp.spinning = false
-               if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 {
+               if sched.nmspinning.Add(-1) < 0 {
                        throw("findrunnable: negative nmspinning")
                }
 
                // Note that for correctness, only the last M transitioning from
                // spinning to non-spinning must perform these rechecks to
-               // ensure no missed work. We are performing it on every M that
-               // transitions as a conservative change to monitor effects on
-               // latency. See golang.org/issue/43997.
+               // ensure no missed work. However, the runtime has some cases
+               // of transient increments of nmspinning that are decremented
+               // without going through this path, so we must be conservative
+               // and perform the check on all spinning Ms.
+               //
+               // See https://go.dev/issue/43997.
+
+               // Check global and P runqueues again.
+
+               lock(&sched.lock)
+               if sched.runqsize != 0 {
+                       pp, _ := pidlegetSpinning(0)
+                       if pp != nil {
+                               gp := globrunqget(pp, 0)
+                               if gp == nil {
+                                       throw("global runq empty with non-zero runqsize")
+                               }
+                               unlock(&sched.lock)
+                               acquirep(pp)
+                               mp.becomeSpinning()
+                               return gp, false, false
+                       }
+               }
+               unlock(&sched.lock)
 
-               // Check all runqueues once again.
                pp := checkRunqsNoP(allpSnapshot, idlepMaskSnapshot)
                if pp != nil {
                        acquirep(pp)
-                       mp.spinning = true
-                       atomic.Xadd(&sched.nmspinning, 1)
+                       mp.becomeSpinning()
                        goto top
                }
 
@@ -2780,14 +3241,15 @@ top:
                pp, gp := checkIdleGCNoP()
                if pp != nil {
                        acquirep(pp)
-                       mp.spinning = true
-                       atomic.Xadd(&sched.nmspinning, 1)
+                       mp.becomeSpinning()
 
                        // Run the idle worker.
                        pp.gcMarkWorkerMode = gcMarkWorkerIdleMode
+                       trace := traceAcquire()
                        casgstatus(gp, _Gwaiting, _Grunnable)
-                       if trace.enabled {
-                               traceGoUnpark(gp, 0)
+                       if trace.ok() {
+                               trace.GoUnpark(gp, 0)
+                               traceRelease(trace)
                        }
                        return gp, false, false
                }
@@ -2802,7 +3264,7 @@ top:
        }
 
        // Poll network until next timer.
-       if netpollinited() && (atomic.Load(&netpollWaiters) > 0 || pollUntil != 0) && sched.lastpoll.Swap(0) != 0 {
+       if netpollinited() && (netpollAnyWaiters() || pollUntil != 0) && sched.lastpoll.Swap(0) != 0 {
                sched.pollUntil.Store(pollUntil)
                if mp.p != 0 {
                        throw("findrunnable: netpoll with p")
@@ -2810,10 +3272,11 @@ top:
                if mp.spinning {
                        throw("findrunnable: netpoll with spinning")
                }
-               // Refresh now.
-               now = nanotime()
                delay := int64(-1)
                if pollUntil != 0 {
+                       if now == 0 {
+                               now = nanotime()
+                       }
                        delay = pollUntil - now
                        if delay < 0 {
                                delay = 0
@@ -2823,7 +3286,9 @@ top:
                        // When using fake time, just poll.
                        delay = 0
                }
-               list := netpoll(delay) // block until new work is available
+               list, delta := netpoll(delay) // block until new work is available
+               // Refresh now again, after potentially blocking.
+               now = nanotime()
                sched.pollUntil.Store(0)
                sched.lastpoll.Store(now)
                if faketime != 0 && list.empty() {
@@ -2837,20 +3302,23 @@ top:
                unlock(&sched.lock)
                if pp == nil {
                        injectglist(&list)
+                       netpollAdjustWaiters(delta)
                } else {
                        acquirep(pp)
                        if !list.empty() {
                                gp := list.pop()
                                injectglist(&list)
+                               netpollAdjustWaiters(delta)
+                               trace := traceAcquire()
                                casgstatus(gp, _Gwaiting, _Grunnable)
-                               if trace.enabled {
-                                       traceGoUnpark(gp, 0)
+                               if trace.ok() {
+                                       trace.GoUnpark(gp, 0)
+                                       traceRelease(trace)
                                }
                                return gp, false, false
                        }
                        if wasSpinning {
-                               mp.spinning = true
-                               atomic.Xadd(&sched.nmspinning, 1)
+                               mp.becomeSpinning()
                        }
                        goto top
                }
@@ -2876,9 +3344,10 @@ func pollWork() bool {
        if !runqempty(p) {
                return true
        }
-       if netpollinited() && atomic.Load(&netpollWaiters) > 0 && sched.lastpoll.Load() != 0 {
-               if list := netpoll(0); !list.empty() {
+       if netpollinited() && netpollAnyWaiters() && sched.lastpoll.Load() != 0 {
+               if list, delta := netpoll(0); !list.empty() {
                        injectglist(&list)
+                       netpollAdjustWaiters(delta)
                        return true
                }
        }
@@ -2901,7 +3370,7 @@ func stealWork(now int64) (gp *g, inheritTime bool, rnow, pollUntil int64, newWo
                stealTimersOrRunNextG := i == stealTries-1
 
                for enum := stealOrder.start(fastrand()); !enum.done(); enum.next() {
-                       if sched.gcwaiting != 0 {
+                       if sched.gcwaiting.Load() {
                                // GC work may be available.
                                return nil, false, now, pollUntil, true
                        }
@@ -2969,17 +3438,18 @@ func checkRunqsNoP(allpSnapshot []*p, idlepMaskSnapshot pMask) *p {
        for id, p2 := range allpSnapshot {
                if !idlepMaskSnapshot.read(uint32(id)) && !runqempty(p2) {
                        lock(&sched.lock)
-                       pp, _ := pidleget(0)
-                       unlock(&sched.lock)
-                       if pp != nil {
-                               return pp
+                       pp, _ := pidlegetSpinning(0)
+                       if pp == nil {
+                               // Can't get a P, don't bother checking remaining Ps.
+                               unlock(&sched.lock)
+                               return nil
                        }
-
-                       // Can't get a P, don't bother checking remaining Ps.
-                       break
+                       unlock(&sched.lock)
+                       return pp
                }
        }
 
+       // No work available.
        return nil
 }
 
@@ -3035,7 +3505,7 @@ func checkIdleGCNoP() (*p, *g) {
        // the assumption in gcControllerState.findRunnableGCWorker that an
        // empty gcBgMarkWorkerPool is only possible if gcMarkDone is running.
        lock(&sched.lock)
-       pp, now := pidleget(0)
+       pp, now := pidlegetSpinning(0)
        if pp == nil {
                unlock(&sched.lock)
                return nil, nil
@@ -3089,8 +3559,8 @@ func resetspinning() {
                throw("resetspinning: not a spinning m")
        }
        gp.m.spinning = false
-       nmspinning := atomic.Xadd(&sched.nmspinning, -1)
-       if int32(nmspinning) < 0 {
+       nmspinning := sched.nmspinning.Add(-1)
+       if nmspinning < 0 {
                throw("findrunnable: negative nmspinning")
        }
        // M wakeup policy is deliberately somewhat conservative, so check if we
@@ -3111,10 +3581,12 @@ func injectglist(glist *gList) {
        if glist.empty() {
                return
        }
-       if trace.enabled {
+       trace := traceAcquire()
+       if trace.ok() {
                for gp := glist.head.ptr(); gp != nil; gp = gp.schedlink.ptr() {
-                       traceGoUnpark(gp, 0)
+                       trace.GoUnpark(gp, 0)
                }
+               traceRelease(trace)
        }
 
        // Mark all the goroutines as runnable before we put them
@@ -3135,8 +3607,20 @@ func injectglist(glist *gList) {
        *glist = gList{}
 
        startIdle := func(n int) {
-               for ; n != 0 && sched.npidle.Load() != 0; n-- {
-                       startm(nil, false)
+               for i := 0; i < n; i++ {
+                       mp := acquirem() // See comment in startm.
+                       lock(&sched.lock)
+
+                       pp, _ := pidlegetSpinning(0)
+                       if pp == nil {
+                               unlock(&sched.lock)
+                               releasem(mp)
+                               break
+                       }
+
+                       startm(pp, false, true)
+                       unlock(&sched.lock)
+                       releasem(mp)
                }
        }
 
@@ -3202,6 +3686,18 @@ top:
 
        gp, inheritTime, tryWakeP := findRunnable() // blocks until work is available
 
+       if debug.dontfreezetheworld > 0 && freezing.Load() {
+               // See comment in freezetheworld. We don't want to perturb
+               // scheduler state, so we didn't gcstopm in findRunnable, but
+               // also don't want to allow new goroutines to run.
+               //
+               // Deadlock here rather than in the findRunnable loop so if
+               // findRunnable is stuck in a loop we don't perturb that
+               // either.
+               lock(&deadlock)
+               lock(&deadlock)
+       }
+
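The double lock(&deadlock) above parks the M forever without perturbing any other scheduler state. A user-level analogy with sync.Mutex, whose second Lock on the same mutex also blocks the caller forever, is sketched below; parkForever is an invented name, and this idiom belongs only in a deliberate "stop here forever" path.

package main

import (
	"fmt"
	"sync"
)

// parkForever blocks the calling goroutine permanently by locking the same
// mutex twice, mirroring the double lock(&deadlock) idiom.
func parkForever() {
	var deadlock sync.Mutex
	deadlock.Lock()
	deadlock.Lock() // self-deadlock: blocks forever
}

func main() {
	go parkForever() // that goroutine never returns
	fmt.Println("main continues and exits")
}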
        // This thread is going to run a goroutine and is not spinning anymore,
        // so if it was marked as spinning we need to reset it now and potentially
        // start a new spinning M.
@@ -3268,8 +3764,8 @@ func dropg() {
 func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) {
        // If it's not yet time for the first timer, or the first adjusted
        // timer, then there is nothing to do.
-       next := int64(atomic.Load64(&pp.timer0When))
-       nextAdj := int64(atomic.Load64(&pp.timerModifiedEarliest))
+       next := pp.timer0When.Load()
+       nextAdj := pp.timerModifiedEarliest.Load()
        if next == 0 || (nextAdj != 0 && nextAdj < next) {
                next = nextAdj
        }
@@ -3287,7 +3783,7 @@ func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) {
                // if we would clear deleted timers.
                // This corresponds to the condition below where
                // we decide whether to call clearDeletedTimers.
-               if pp != getg().m.p.ptr() || int(atomic.Load(&pp.deletedTimers)) <= int(atomic.Load(&pp.numTimers)/4) {
+               if pp != getg().m.p.ptr() || int(pp.deletedTimers.Load()) <= int(pp.numTimers.Load()/4) {
                        return now, next, false
                }
        }
@@ -3312,7 +3808,7 @@ func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) {
        // If this is the local P, and there are a lot of deleted timers,
        // clear them out. We only do this for the local P to reduce
        // lock contention on timersLock.
-       if pp == getg().m.p.ptr() && int(atomic.Load(&pp.deletedTimers)) > len(pp.timers)/4 {
+       if pp == getg().m.p.ptr() && int(pp.deletedTimers.Load()) > len(pp.timers)/4 {
                clearDeletedTimers(pp)
        }
 
@@ -3330,11 +3826,16 @@ func parkunlock_c(gp *g, lock unsafe.Pointer) bool {
 func park_m(gp *g) {
        mp := getg().m
 
-       if trace.enabled {
-               traceGoPark(mp.waittraceev, mp.waittraceskip)
-       }
+       trace := traceAcquire()
 
+       // N.B. Not using casGToWaiting here because the waitreason is
+       // set by park_m's caller.
        casgstatus(gp, _Grunning, _Gwaiting)
+       if trace.ok() {
+               trace.GoPark(mp.waitTraceBlockReason, mp.waitTraceSkip)
+               traceRelease(trace)
+       }
+
        dropg()
 
        if fn := mp.waitunlockf; fn != nil {
@@ -3342,72 +3843,73 @@ func park_m(gp *g) {
                mp.waitunlockf = nil
                mp.waitlock = nil
                if !ok {
-                       if trace.enabled {
-                               traceGoUnpark(gp, 2)
-                       }
+                       trace := traceAcquire()
                        casgstatus(gp, _Gwaiting, _Grunnable)
+                       if trace.ok() {
+                               trace.GoUnpark(gp, 2)
+                               traceRelease(trace)
+                       }
                        execute(gp, true) // Schedule it back, never returns.
                }
        }
        schedule()
 }
 
-func goschedImpl(gp *g) {
+func goschedImpl(gp *g, preempted bool) {
+       trace := traceAcquire()
        status := readgstatus(gp)
        if status&^_Gscan != _Grunning {
                dumpgstatus(gp)
                throw("bad g status")
        }
        casgstatus(gp, _Grunning, _Grunnable)
+       if trace.ok() {
+               if preempted {
+                       trace.GoPreempt()
+               } else {
+                       trace.GoSched()
+               }
+               traceRelease(trace)
+       }
+
        dropg()
        lock(&sched.lock)
        globrunqput(gp)
        unlock(&sched.lock)
 
+       if mainStarted {
+               wakep()
+       }
+
        schedule()
 }
 
 // Gosched continuation on g0.
 func gosched_m(gp *g) {
-       if trace.enabled {
-               traceGoSched()
-       }
-       goschedImpl(gp)
+       goschedImpl(gp, false)
 }
 
-// goschedguarded is a forbidden-states-avoided version of gosched_m
+// goschedguarded is a forbidden-states-avoided version of gosched_m.
 func goschedguarded_m(gp *g) {
-
        if !canPreemptM(gp.m) {
                gogo(&gp.sched) // never return
        }
-
-       if trace.enabled {
-               traceGoSched()
-       }
-       goschedImpl(gp)
+       goschedImpl(gp, false)
 }
 
 func gopreempt_m(gp *g) {
-       if trace.enabled {
-               traceGoPreempt()
-       }
-       goschedImpl(gp)
+       goschedImpl(gp, true)
 }
 
 // preemptPark parks gp and puts it in _Gpreempted.
 //
 //go:systemstack
 func preemptPark(gp *g) {
-       if trace.enabled {
-               traceGoPark(traceEvGoBlock, 0)
-       }
        status := readgstatus(gp)
        if status&^_Gscan != _Grunning {
                dumpgstatus(gp)
                throw("bad g status")
        }
-       gp.waitreason = waitReasonPreempted
 
        if gp.asyncSafePoint {
                // Double-check that async preemption does not
@@ -3417,7 +3919,7 @@ func preemptPark(gp *g) {
                if !f.valid() {
                        throw("preempt at unknown pc")
                }
-               if f.flag&funcFlag_SPWRITE != 0 {
+               if f.flag&abi.FuncFlagSPWrite != 0 {
                        println("runtime: unexpected SPWRITE function", funcname(f), "in async preempt")
                        throw("preempt SPWRITE")
                }
@@ -3431,7 +3933,30 @@ func preemptPark(gp *g) {
        // transitions until we can dropg.
        casGToPreemptScan(gp, _Grunning, _Gscan|_Gpreempted)
        dropg()
+
+       // Be careful about how we trace this next event. The ordering
+       // is subtle.
+       //
+       // The moment we CAS into _Gpreempted, suspendG could CAS to
+       // _Gwaiting, do its work, and ready the goroutine. All of
+       // this could happen before we even get the chance to emit
+       // an event. The end result is that the events could appear
+       // out of order, and the tracer generally assumes the scheduler
+       // takes care of the ordering between GoPark and GoUnpark.
+       //
+       // The answer here is simple: emit the event while we still hold
+       // the _Gscan bit on the goroutine. We still need to traceAcquire
+       // and traceRelease across the CAS because the tracer could be
+       // what's calling suspendG in the first place, and we want the
+       // CAS and event emission to appear atomic to the tracer.
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.GoPark(traceBlockPreempted, 0)
+       }
        casfrom_Gscanstatus(gp, _Gscan|_Gpreempted, _Gpreempted)
+       if trace.ok() {
+               traceRelease(trace)
+       }
        schedule()
 }
 
@@ -3444,11 +3969,13 @@ func goyield() {
 }
 
 func goyield_m(gp *g) {
-       if trace.enabled {
-               traceGoPreempt()
-       }
+       trace := traceAcquire()
        pp := gp.m.p.ptr()
        casgstatus(gp, _Grunning, _Grunnable)
+       if trace.ok() {
+               trace.GoPreempt()
+               traceRelease(trace)
+       }
        dropg()
        runqput(pp, gp, false)
        schedule()
@@ -3459,8 +3986,10 @@ func goexit1() {
        if raceenabled {
                racegoend()
        }
-       if trace.enabled {
-               traceGoEnd()
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.GoEnd()
+               traceRelease(trace)
        }
        mcall(goexit0)
 }
@@ -3484,7 +4013,7 @@ func goexit0(gp *g) {
        gp._defer = nil // should be true already but just in case.
        gp._panic = nil // non-nil for Goexit during panic. points at stack-allocated data.
        gp.writebuf = nil
-       gp.waitreason = 0
+       gp.waitreason = waitReasonZero
        gp.param = nil
        gp.labels = nil
        gp.timer = nil
@@ -3599,6 +4128,7 @@ func save(pc, sp uintptr) {
 //
 //go:nosplit
 func reentersyscall(pc, sp uintptr) {
+       trace := traceAcquire()
        gp := getg()
 
        // Disable preemption because during this function g is in Gsyscall status,
@@ -3617,6 +4147,11 @@ func reentersyscall(pc, sp uintptr) {
        gp.syscallsp = sp
        gp.syscallpc = pc
        casgstatus(gp, _Grunning, _Gsyscall)
+       if staticLockRanking {
+               // When doing static lock ranking casgstatus can call
+               // systemstack which clobbers g.sched.
+               save(pc, sp)
+       }
        if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp {
                systemstack(func() {
                        print("entersyscall inconsistent ", hex(gp.syscallsp), " [", hex(gp.stack.lo), ",", hex(gp.stack.hi), "]\n")
@@ -3624,15 +4159,18 @@ func reentersyscall(pc, sp uintptr) {
                })
        }
 
-       if trace.enabled {
-               systemstack(traceGoSysCall)
+       if trace.ok() {
+               systemstack(func() {
+                       trace.GoSysCall()
+                       traceRelease(trace)
+               })
                // systemstack itself clobbers g.sched.{pc,sp} and we might
                // need them later when the G is genuinely blocked in a
                // syscall
                save(pc, sp)
        }
 
-       if atomic.Load(&sched.sysmonwait) != 0 {
+       if sched.sysmonwait.Load() {
                systemstack(entersyscall_sysmon)
                save(pc, sp)
        }
@@ -3644,13 +4182,12 @@ func reentersyscall(pc, sp uintptr) {
        }
 
        gp.m.syscalltick = gp.m.p.ptr().syscalltick
-       gp.sysblocktraced = true
        pp := gp.m.p.ptr()
        pp.m = 0
        gp.m.oldp.set(pp)
        gp.m.p = 0
        atomic.Store(&pp.status, _Psyscall)
-       if sched.gcwaiting != 0 {
+       if sched.gcwaiting.Load() {
                systemstack(entersyscall_gcwait)
                save(pc, sp)
        }
@@ -3670,8 +4207,8 @@ func entersyscall() {
 
 func entersyscall_sysmon() {
        lock(&sched.lock)
-       if atomic.Load(&sched.sysmonwait) != 0 {
-               atomic.Store(&sched.sysmonwait, 0)
+       if sched.sysmonwait.Load() {
+               sched.sysmonwait.Store(false)
                notewakeup(&sched.sysmonnote)
        }
        unlock(&sched.lock)
@@ -3683,9 +4220,11 @@ func entersyscall_gcwait() {
 
        lock(&sched.lock)
        if sched.stopwait > 0 && atomic.Cas(&pp.status, _Psyscall, _Pgcstop) {
-               if trace.enabled {
-                       traceGoSysBlock(pp)
-                       traceProcStop(pp)
+               trace := traceAcquire()
+               if trace.ok() {
+                       trace.GoSysBlock(pp)
+                       trace.ProcStop(pp)
+                       traceRelease(trace)
                }
                pp.syscalltick++
                if sched.stopwait--; sched.stopwait == 0 {
@@ -3705,7 +4244,6 @@ func entersyscallblock() {
        gp.throwsplit = true
        gp.stackguard0 = stackPreempt // see comment in entersyscall
        gp.m.syscalltick = gp.m.p.ptr().syscalltick
-       gp.sysblocktraced = true
        gp.m.p.ptr().syscalltick++
 
        // Leave SP around for GC and traceback.
@@ -3740,9 +4278,11 @@ func entersyscallblock() {
 }
 
 func entersyscallblock_handoff() {
-       if trace.enabled {
-               traceGoSysCall()
-               traceGoSysBlock(getg().m.p.ptr())
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.GoSysCall()
+               trace.GoSysBlock(getg().m.p.ptr())
+               traceRelease(trace)
        }
        handoffp(releasep())
 }
@@ -3781,15 +4321,21 @@ func exitsyscall() {
                                tryRecordGoroutineProfileWB(gp)
                        })
                }
-               if trace.enabled {
+               trace := traceAcquire()
+               if trace.ok() {
                        if oldp != gp.m.p.ptr() || gp.m.syscalltick != gp.m.p.ptr().syscalltick {
-                               systemstack(traceGoStart)
+                               systemstack(func() {
+                                       trace.GoStart()
+                               })
                        }
                }
                // There's a cpu for us, so we can run.
                gp.m.p.ptr().syscalltick++
                // We need to cas the status and scan before resuming...
                casgstatus(gp, _Gsyscall, _Grunning)
+               if trace.ok() {
+                       traceRelease(trace)
+               }
 
                // Garbage collector isn't running (since we are),
                // so okay to clear syscallsp.
@@ -3799,8 +4345,8 @@ func exitsyscall() {
                        // restore the preemption request in case we've cleared it in newstack
                        gp.stackguard0 = stackPreempt
                } else {
-                       // otherwise restore the real _StackGuard, we've spoiled it in entersyscall/entersyscallblock
-                       gp.stackguard0 = gp.stack.lo + _StackGuard
+                       // otherwise restore the real stackGuard, we've spoiled it in entersyscall/entersyscallblock
+                       gp.stackguard0 = gp.stack.lo + stackGuard
                }
                gp.throwsplit = false
 
@@ -3812,8 +4358,8 @@ func exitsyscall() {
                return
        }
 
-       gp.sysexitticks = 0
-       if trace.enabled {
+       trace := traceAcquire()
+       if trace.ok() {
                // Wait till traceGoSysBlock event is emitted.
                // This ensures consistency of the trace (the goroutine is started after it is blocked).
                for oldp != nil && oldp.syscalltick == gp.m.syscalltick {
@@ -3823,7 +4369,8 @@ func exitsyscall() {
                // Tracing code can invoke write barriers that cannot run without a P.
                // So instead we remember the syscall exit time and emit the event
                // in execute when we have a P.
-               gp.sysexitticks = cputicks()
+               gp.trace.sysExitTime = traceClockNow()
+               traceRelease(trace)
        }
 
        gp.m.locks--
@@ -3864,15 +4411,19 @@ func exitsyscallfast(oldp *p) bool {
                var ok bool
                systemstack(func() {
                        ok = exitsyscallfast_pidle()
-                       if ok && trace.enabled {
-                               if oldp != nil {
-                                       // Wait till traceGoSysBlock event is emitted.
-                                       // This ensures consistency of the trace (the goroutine is started after it is blocked).
-                                       for oldp.syscalltick == gp.m.syscalltick {
-                                               osyield()
+                       if ok {
+                               trace := traceAcquire()
+                               if trace.ok() {
+                                       if oldp != nil {
+                                               // Wait till traceGoSysBlock event is emitted.
+                                               // This ensures consistency of the trace (the goroutine is started after it is blocked).
+                                               for oldp.syscalltick == gp.m.syscalltick {
+                                                       osyield()
+                                               }
                                        }
+                                       trace.GoSysExit()
+                                       traceRelease(trace)
                                }
-                               traceGoSysExit(0)
                        }
                })
                if ok {
@@ -3890,15 +4441,17 @@ func exitsyscallfast(oldp *p) bool {
 func exitsyscallfast_reacquired() {
        gp := getg()
        if gp.m.syscalltick != gp.m.p.ptr().syscalltick {
-               if trace.enabled {
+               trace := traceAcquire()
+               if trace.ok() {
                        // The p was retaken and then entered a syscall again (since gp.m.syscalltick has changed).
                        // traceGoSysBlock for this syscall was already emitted,
                        // but here we effectively retake the p from the new syscall running on the same p.
                        systemstack(func() {
                                // Denote blocking of the new syscall.
-                               traceGoSysBlock(gp.m.p.ptr())
+                               trace.GoSysBlock(gp.m.p.ptr())
                                // Denote completion of the current syscall.
-                               traceGoSysExit(0)
+                               trace.GoSysExit()
+                               traceRelease(trace)
                        })
                }
                gp.m.p.ptr().syscalltick++
@@ -3908,8 +4461,8 @@ func exitsyscallfast_reacquired() {
 func exitsyscallfast_pidle() bool {
        lock(&sched.lock)
        pp, _ := pidleget(0)
-       if pp != nil && atomic.Load(&sched.sysmonwait) != 0 {
-               atomic.Store(&sched.sysmonwait, 0)
+       if pp != nil && sched.sysmonwait.Load() {
+               sched.sysmonwait.Store(false)
                notewakeup(&sched.sysmonnote)
        }
        unlock(&sched.lock)
@@ -3944,8 +4497,8 @@ func exitsyscall0(gp *g) {
                // could race with another M transitioning gp from unlocked to
                // locked.
                locked = gp.lockedm != 0
-       } else if atomic.Load(&sched.sysmonwait) != 0 {
-               atomic.Store(&sched.sysmonwait, 0)
+       } else if sched.sysmonwait.Load() {
+               sched.sysmonwait.Store(false)
                notewakeup(&sched.sysmonnote)
        }
        unlock(&sched.lock)
@@ -3981,7 +4534,7 @@ func syscall_runtime_BeforeFork() {
 
        // This function is called before fork in syscall package.
        // Code between fork and exec must not allocate memory nor even try to grow stack.
-       // Here we spoil g->_StackGuard to reliably detect any attempts to grow stack.
+       // Here we spoil g.stackguard0 to reliably detect any attempts to grow stack.
        // runtime_AfterFork will undo this in parent process, but not in child.
        gp.stackguard0 = stackFork
 }
@@ -3994,7 +4547,7 @@ func syscall_runtime_AfterFork() {
        gp := getg().m.curg
 
        // See the comments in beforefork.
-       gp.stackguard0 = gp.stack.lo + _StackGuard
+       gp.stackguard0 = gp.stack.lo + stackGuard
 
        msigrestore(gp.m.sigmask)
 
@@ -4064,11 +4617,11 @@ func syscall_runtime_AfterExec() {
 func malg(stacksize int32) *g {
        newg := new(g)
        if stacksize >= 0 {
-               stacksize = round2(_StackSystem + stacksize)
+               stacksize = round2(stackSystem + stacksize)
                systemstack(func() {
                        newg.stack = stackalloc(uint32(stacksize))
                })
-               newg.stackguard0 = newg.stack.lo + _StackGuard
+               newg.stackguard0 = newg.stack.lo + stackGuard
                newg.stackguard1 = ^uintptr(0)
                // Clear the bottom word of the stack. We record g
                // there on gsignal stack during VDSO on ARM and ARM64.
@@ -4107,7 +4660,7 @@ func newproc1(fn *funcval, callergp *g, callerpc uintptr) *g {
        pp := mp.p.ptr()
        newg := gfget(pp)
        if newg == nil {
-               newg = malg(_StackMin)
+               newg = malg(stackMin)
                casgstatus(newg, _Gidle, _Gdead)
                allgadd(newg) // publishes with a g->status of Gdead so GC scanner doesn't look at uninitialized stack.
        }
@@ -4122,12 +4675,14 @@ func newproc1(fn *funcval, callergp *g, callerpc uintptr) *g {
        totalSize := uintptr(4*goarch.PtrSize + sys.MinFrameSize) // extra space in case of reads slightly beyond frame
        totalSize = alignUp(totalSize, sys.StackAlign)
        sp := newg.stack.hi - totalSize
-       spArg := sp
        if usesLR {
                // caller's LR
                *(*uintptr)(unsafe.Pointer(sp)) = 0
                prepGoExitFrame(sp)
-               spArg += sys.MinFrameSize
+       }
+       if GOARCH == "arm64" {
+               // caller's FP
+               *(*uintptr)(unsafe.Pointer(sp - goarch.PtrSize)) = 0
        }
 
        memclrNoHeapPointers(unsafe.Pointer(&newg.sched), unsafe.Sizeof(newg.sched))
@@ -4136,6 +4691,7 @@ func newproc1(fn *funcval, callergp *g, callerpc uintptr) *g {
        newg.sched.pc = abi.FuncPCABI0(goexit) + sys.PCQuantum // +PCQuantum so that previous instruction is in same function
        newg.sched.g = guintptr(unsafe.Pointer(newg))
        gostartcallfn(&newg.sched, fn)
+       newg.parentGoid = callergp.goid
        newg.gopc = callerpc
        newg.ancestors = saveAncestors(callergp)
        newg.startpc = fn.fn
@@ -4160,9 +4716,11 @@ func newproc1(fn *funcval, callergp *g, callerpc uintptr) *g {
        if newg.trackingSeq%gTrackingPeriod == 0 {
                newg.tracking = true
        }
-       casgstatus(newg, _Gdead, _Grunnable)
        gcController.addScannableStack(pp, int64(newg.stack.hi-newg.stack.lo))
 
+       // Get a goid and switch to runnable. Make all this atomic to the tracer.
+       trace := traceAcquire()
+       casgstatus(newg, _Gdead, _Grunnable)
        if pp.goidcache == pp.goidcacheend {
                // Sched.goidgen is the last allocated id,
                // this batch must be [sched.goidgen+1, sched.goidgen+GoidCacheBatch].
@@ -4173,24 +4731,28 @@ func newproc1(fn *funcval, callergp *g, callerpc uintptr) *g {
        }
        newg.goid = pp.goidcache
        pp.goidcache++
+       if trace.ok() {
+               trace.GoCreate(newg, newg.startpc)
+               traceRelease(trace)
+       }
+
+       // Set up race context.
        if raceenabled {
                newg.racectx = racegostart(callerpc)
+               newg.raceignore = 0
                if newg.labels != nil {
                        // See note in proflabel.go on labelSync's role in synchronizing
                        // with the reads in the signal handler.
                        racereleasemergeg(newg, unsafe.Pointer(&labelSync))
                }
        }
-       if trace.enabled {
-               traceGoCreate(newg, newg.startpc)
-       }
        releasem(mp)
 
        return newg
 }
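
The goidcache logic in newproc1 above hands out goroutine IDs from per-P batches that are refilled from sched.goidgen, so most allocations avoid an atomic operation. A standalone sketch of the same batching idea; worker, nextID, and the batch size are invented for illustration.

package main

import (
	"fmt"
	"sync/atomic"
)

const batch = 16 // stand-in for the goid cache batch size

// goidgen holds the last allocated ID, as sched.goidgen does.
var goidgen atomic.Uint64

// worker caches a batch of IDs so that handing out one ID usually needs no
// atomic operation at all.
type worker struct {
	cache, cacheEnd uint64
}

func (w *worker) nextID() uint64 {
	if w.cache == w.cacheEnd {
		// goidgen is the last allocated ID; this batch is
		// [last-batch+1, last], so cacheEnd is one past it.
		last := goidgen.Add(batch)
		w.cache = last - batch + 1
		w.cacheEnd = last + 1
	}
	id := w.cache
	w.cache++
	return id
}

func main() {
	var w worker
	for i := 0; i < 3; i++ {
		fmt.Println(w.nextID()) // 1, 2, 3
	}
}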
 
 // saveAncestors copies previous ancestors of the given caller g and
-// includes infor for the current caller into a new set of tracebacks for
+// includes info for the current caller into a new set of tracebacks for
 // a g being created.
 func saveAncestors(callergp *g) *[]ancestorInfo {
        // Copy all prior info, except for the root goroutine (goid 0).
@@ -4208,7 +4770,7 @@ func saveAncestors(callergp *g) *[]ancestorInfo {
        ancestors := make([]ancestorInfo, n)
        copy(ancestors[1:], callerAncestors)
 
-       var pcs [_TracebackMaxFrames]uintptr
+       var pcs [tracebackInnerFrames]uintptr
        npcs := gcallers(callergp, 0, pcs[:])
        ipcs := make([]uintptr, npcs)
        copy(ipcs, pcs[:])
@@ -4310,7 +4872,7 @@ retry:
                systemstack(func() {
                        gp.stack = stackalloc(startingStackSize)
                })
-               gp.stackguard0 = gp.stack.lo + _StackGuard
+               gp.stackguard0 = gp.stack.lo + stackGuard
        } else {
                if raceenabled {
                        racemalloc(unsafe.Pointer(gp.stack.lo), gp.stack.hi-gp.stack.lo)
@@ -4368,8 +4930,6 @@ func dolockOSThread() {
        gp.lockedm.set(gp.m)
 }
 
-//go:nosplit
-
 // LockOSThread wires the calling goroutine to its current operating system thread.
 // The calling goroutine will always execute in that thread,
 // and no other goroutine will execute in it,
@@ -4384,6 +4944,8 @@ func dolockOSThread() {
 //
 // A goroutine should call LockOSThread before calling OS services or
 // non-Go library functions that depend on per-thread state.
+//
+//go:nosplit
 func LockOSThread() {
        if atomic.Load(&newmHandoff.haveTemplateThread) == 0 && GOOS != "plan9" {
                // If we need to start a new thread from the locked
@@ -4423,8 +4985,6 @@ func dounlockOSThread() {
        gp.lockedm = 0
 }
 
-//go:nosplit
-
 // UnlockOSThread undoes an earlier call to LockOSThread.
 // If this drops the number of active LockOSThread calls on the
 // calling goroutine to zero, it unwires the calling goroutine from
@@ -4437,6 +4997,8 @@ func dounlockOSThread() {
 // other goroutines, it should not call this function and thus leave
 // the goroutine locked to the OS thread until the goroutine (and
 // hence the thread) exits.
+//
+//go:nosplit
 func UnlockOSThread() {
        gp := getg()
        if gp.m.lockedExt == 0 {
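
As a usage note for the exported API documented above, a goroutine typically brackets thread-state-dependent work with LockOSThread/UnlockOSThread, as in this small example (the work inside is a placeholder):

package main

import (
	"fmt"
	"runtime"
)

func main() {
	done := make(chan struct{})
	go func() {
		// Wire this goroutine to its OS thread for the duration of
		// work that depends on per-thread state (thread-local storage,
		// some C libraries, OS-level thread properties).
		runtime.LockOSThread()
		defer runtime.UnlockOSThread()
		fmt.Println("running on a dedicated OS thread")
		close(done)
	}()
	<-done
}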
@@ -4479,8 +5041,11 @@ func mcount() int32 {
 }
 
 var prof struct {
-       signalLock uint32
-       hz         int32
+       signalLock atomic.Uint32
+
+       // Must hold signalLock to write. Reads may be lock-free, but
+       // signalLock should be taken to synchronize with changes.
+       hz atomic.Int32
 }
 
 func _System()                    { _System() }
@@ -4495,7 +5060,7 @@ func _VDSO()                      { _VDSO() }
 //
 //go:nowritebarrierrec
 func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
-       if prof.hz == 0 {
+       if prof.hz.Load() == 0 {
                return
        }
 
@@ -4536,6 +5101,7 @@ func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
        // See golang.org/issue/17165.
        getg().m.mallocing++
 
+       var u unwinder
        var stk [maxCPUProfStack]uintptr
        n := 0
        if mp.ncgo > 0 && mp.curg != nil && mp.curg.syscallpc != 0 && mp.curg.syscallsp != 0 {
@@ -4545,54 +5111,48 @@ func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
                // cgoCallers.  We are running in a signal handler
                // with all signals blocked, so we don't have to worry
                // about any other code interrupting us.
-               if atomic.Load(&mp.cgoCallersUse) == 0 && mp.cgoCallers != nil && mp.cgoCallers[0] != 0 {
+               if mp.cgoCallersUse.Load() == 0 && mp.cgoCallers != nil && mp.cgoCallers[0] != 0 {
                        for cgoOff < len(mp.cgoCallers) && mp.cgoCallers[cgoOff] != 0 {
                                cgoOff++
                        }
-                       copy(stk[:], mp.cgoCallers[:cgoOff])
+                       n += copy(stk[:], mp.cgoCallers[:cgoOff])
                        mp.cgoCallers[0] = 0
                }
 
                // Collect Go stack that leads to the cgo call.
-               n = gentraceback(mp.curg.syscallpc, mp.curg.syscallsp, 0, mp.curg, 0, &stk[cgoOff], len(stk)-cgoOff, nil, nil, 0)
-               if n > 0 {
-                       n += cgoOff
-               }
+               u.initAt(mp.curg.syscallpc, mp.curg.syscallsp, 0, mp.curg, unwindSilentErrors)
+       } else if usesLibcall() && mp.libcallg != 0 && mp.libcallpc != 0 && mp.libcallsp != 0 {
+               // Libcall, i.e. runtime syscall on windows.
+               // Collect Go stack that leads to the call.
+               u.initAt(mp.libcallpc, mp.libcallsp, 0, mp.libcallg.ptr(), unwindSilentErrors)
+       } else if mp != nil && mp.vdsoSP != 0 {
+               // VDSO call, e.g. nanotime1 on Linux.
+               // Collect Go stack that leads to the call.
+               u.initAt(mp.vdsoPC, mp.vdsoSP, 0, gp, unwindSilentErrors|unwindJumpStack)
        } else {
-               n = gentraceback(pc, sp, lr, gp, 0, &stk[0], len(stk), nil, nil, _TraceTrap|_TraceJumpStack)
+               u.initAt(pc, sp, lr, gp, unwindSilentErrors|unwindTrap|unwindJumpStack)
        }
+       n += tracebackPCs(&u, 0, stk[n:])
 
        if n <= 0 {
                // Normal traceback is impossible or has failed.
-               // See if it falls into several common cases.
-               n = 0
-               if usesLibcall() && mp.libcallg != 0 && mp.libcallpc != 0 && mp.libcallsp != 0 {
-                       // Libcall, i.e. runtime syscall on windows.
-                       // Collect Go stack that leads to the call.
-                       n = gentraceback(mp.libcallpc, mp.libcallsp, 0, mp.libcallg.ptr(), 0, &stk[0], len(stk), nil, nil, 0)
-               }
-               if n == 0 && mp != nil && mp.vdsoSP != 0 {
-                       n = gentraceback(mp.vdsoPC, mp.vdsoSP, 0, gp, 0, &stk[0], len(stk), nil, nil, _TraceTrap|_TraceJumpStack)
-               }
-               if n == 0 {
-                       // If all of the above has failed, account it against abstract "System" or "GC".
-                       n = 2
-                       if inVDSOPage(pc) {
-                               pc = abi.FuncPCABIInternal(_VDSO) + sys.PCQuantum
-                       } else if pc > firstmoduledata.etext {
-                               // "ExternalCode" is better than "etext".
-                               pc = abi.FuncPCABIInternal(_ExternalCode) + sys.PCQuantum
-                       }
-                       stk[0] = pc
-                       if mp.preemptoff != "" {
-                               stk[1] = abi.FuncPCABIInternal(_GC) + sys.PCQuantum
-                       } else {
-                               stk[1] = abi.FuncPCABIInternal(_System) + sys.PCQuantum
-                       }
+               // Account it against abstract "System" or "GC".
+               n = 2
+               if inVDSOPage(pc) {
+                       pc = abi.FuncPCABIInternal(_VDSO) + sys.PCQuantum
+               } else if pc > firstmoduledata.etext {
+                       // "ExternalCode" is better than "etext".
+                       pc = abi.FuncPCABIInternal(_ExternalCode) + sys.PCQuantum
+               }
+               stk[0] = pc
+               if mp.preemptoff != "" {
+                       stk[1] = abi.FuncPCABIInternal(_GC) + sys.PCQuantum
+               } else {
+                       stk[1] = abi.FuncPCABIInternal(_System) + sys.PCQuantum
                }
        }
 
-       if prof.hz != 0 {
+       if prof.hz.Load() != 0 {
                // Note: it can happen on Windows that we interrupted a system thread
                // with no g, so gp could be nil. The other nil checks are done out of
                // caution, but not expected to be nil in practice.
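
The rewritten sigprof seeds a single unwinder from whichever context applies (cgo syscall, libcall, VDSO, or the interrupted frame) and keeps one running count n for everything written into stk, folding the old &stk[cgoOff] / n += cgoOff bookkeeping into n += copy(...) and stk[n:]. A rough standalone illustration of that accumulation, with runtime.Callers standing in for the internal unwinder/tracebackPCs and cgoPCs as a made-up stand-in for mp.cgoCallers:

package main

import (
	"fmt"
	"runtime"
)

const maxStack = 64

func main() {
	// Pretend these PCs were captured by C code, the way mp.cgoCallers is
	// filled in the runtime (the values here are fake).
	cgoPCs := []uintptr{0x1234, 0x5678}

	var stk [maxStack]uintptr
	n := 0

	// Count the copied prefix up front; the patch folds the old
	// &stk[cgoOff] / n += cgoOff bookkeeping into this one running count.
	n += copy(stk[:], cgoPCs)

	// Append the Go frames after the prefix, adding to the same count.
	// runtime.Callers plays the role of tracebackPCs(&u, 0, stk[n:]).
	n += runtime.Callers(1, stk[n:])

	fmt.Println("total frames:", n)
}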
@@ -4633,14 +5193,14 @@ func setcpuprofilerate(hz int32) {
        // it would deadlock.
        setThreadCPUProfiler(0)
 
-       for !atomic.Cas(&prof.signalLock, 0, 1) {
+       for !prof.signalLock.CompareAndSwap(0, 1) {
                osyield()
        }
-       if prof.hz != hz {
+       if prof.hz.Load() != hz {
                setProcessCPUProfiler(hz)
-               prof.hz = hz
+               prof.hz.Store(hz)
        }
-       atomic.Store(&prof.signalLock, 0)
+       prof.signalLock.Store(0)
 
        lock(&sched.lock)
        sched.profilehz = hz
@@ -4721,9 +5281,9 @@ func (pp *p) destroy() {
                lock(&pp.timersLock)
                moveTimers(plocal, pp.timers)
                pp.timers = nil
-               pp.numTimers = 0
-               pp.deletedTimers = 0
-               atomic.Store64(&pp.timer0When, 0)
+               pp.numTimers.Store(0)
+               pp.deletedTimers.Store(0)
+               pp.timer0When.Store(0)
                unlock(&pp.timersLock)
                unlock(&plocal.timersLock)
        }
@@ -4736,6 +5296,7 @@ func (pp *p) destroy() {
                pp.sudogbuf[i] = nil
        }
        pp.sudogcache = pp.sudogbuf[:0]
+       pp.pinnerCache = nil
        for j := range pp.deferpoolbuf {
                pp.deferpoolbuf[j] = nil
        }
@@ -4793,8 +5354,10 @@ func procresize(nprocs int32) *p {
        if old < 0 || nprocs <= 0 {
                throw("procresize: invalid arg")
        }
-       if trace.enabled {
-               traceGomaxprocs(nprocs)
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.Gomaxprocs(nprocs)
+               traceRelease(trace)
        }
 
        // update statistics
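
This hunk shows the calling convention the whole CL converts the runtime to: instead of peeking at trace.enabled, a call site acquires the tracer, checks ok(), emits its events through the returned handle, and releases it, so tracer shutdown can synchronize with in-flight event writers much like a lock. A toy standalone mock of that convention (traceLocker here is invented and only snapshots an enabled flag; the real handle also pins the tracer's lifetime):

package main

import (
	"fmt"
	"sync/atomic"
)

// tracerEnabled plays the role of the global "is tracing on?" flag.
var tracerEnabled atomic.Bool

// traceLocker is the handle a call site holds while emitting events; ok
// reports whether tracing was enabled when the handle was acquired.
type traceLocker struct{ enabled bool }

func traceAcquire() traceLocker {
	// The real runtime also keeps the tracer from stopping while the
	// handle is held; this mock only snapshots the flag.
	return traceLocker{enabled: tracerEnabled.Load()}
}

func (t traceLocker) ok() bool { return t.enabled }

// Gomaxprocs is a stand-in for one of the event-emitting methods.
func (t traceLocker) Gomaxprocs(n int32) { fmt.Println("event: gomaxprocs", n) }

func traceRelease(t traceLocker) {
	// Nothing to undo in the mock; the runtime drops its hold here.
}

func main() {
	tracerEnabled.Store(true)

	// The acquire / ok / emit / release shape every converted site follows.
	trace := traceAcquire()
	if trace.ok() {
		trace.Gomaxprocs(8)
		traceRelease(trace)
	}
}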
@@ -4859,12 +5422,14 @@ func procresize(nprocs int32) *p {
                // because p.destroy itself has write barriers, so we
                // need to do that from a valid P.
                if gp.m.p != 0 {
-                       if trace.enabled {
+                       trace := traceAcquire()
+                       if trace.ok() {
                                // Pretend that we were descheduled
                                // and then scheduled again to keep
                                // the trace sane.
-                               traceGoSched()
-                               traceProcStop(gp.m.p.ptr())
+                               trace.GoSched()
+                               trace.ProcStop(gp.m.p.ptr())
+                               traceRelease(trace)
                        }
                        gp.m.p.ptr().m = 0
                }
@@ -4873,8 +5438,10 @@ func procresize(nprocs int32) *p {
                pp.m = 0
                pp.status = _Pidle
                acquirep(pp)
-               if trace.enabled {
-                       traceGoStart()
+               trace := traceAcquire()
+               if trace.ok() {
+                       trace.GoStart()
+                       traceRelease(trace)
                }
        }
 
@@ -4938,8 +5505,10 @@ func acquirep(pp *p) {
        // from a potentially stale mcache.
        pp.mcache.prepareForSweep()
 
-       if trace.enabled {
-               traceProcStart()
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.ProcStart()
+               traceRelease(trace)
        }
 }
 
@@ -4980,8 +5549,10 @@ func releasep() *p {
                print("releasep: m=", gp.m, " m->p=", gp.m.p.ptr(), " p->m=", hex(pp.m), " p->status=", pp.status, "\n")
                throw("releasep: invalid p state")
        }
-       if trace.enabled {
-               traceProcStop(gp.m.p.ptr())
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.ProcStop(gp.m.p.ptr())
+               traceRelease(trace)
        }
        gp.m.p = 0
        pp.m = 0
@@ -5024,13 +5595,8 @@ func checkdead() {
        // accommodate callbacks created by syscall.NewCallback. See issue #6751
        // for details.)
        var run0 int32
-       if !iscgo && cgoHasExtraM {
-               mp := lockextra(true)
-               haveExtraM := extraMCount > 0
-               unlockextra(mp)
-               if haveExtraM {
-                       run0 = 1
-               }
+       if !iscgo && cgoHasExtraM && extraMLength.Load() > 0 {
+               run0 = 1
        }
 
        run := mcount() - sched.nmidle - sched.nmidlelocked - sched.nmsys
@@ -5039,6 +5605,7 @@ func checkdead() {
        }
        if run < 0 {
                print("runtime: checkdead: nmidle=", sched.nmidle, " nmidlelocked=", sched.nmidlelocked, " mcount=", mcount(), " nmsys=", sched.nmsys, "\n")
+               unlock(&sched.lock)
                throw("checkdead: inconsistent counts")
        }
 
@@ -5056,6 +5623,7 @@ func checkdead() {
                        _Grunning,
                        _Gsyscall:
                        print("runtime: checkdead: find g ", gp.goid, " in status ", s, "\n")
+                       unlock(&sched.lock)
                        throw("checkdead: runnable g")
                }
        })
@@ -5074,18 +5642,20 @@ func checkdead() {
                        if pp == nil {
                                // There should always be a free P since
                                // nothing is running.
+                               unlock(&sched.lock)
                                throw("checkdead: no p for timer")
                        }
                        mp := mget()
                        if mp == nil {
                                // There should always be a free M since
                                // nothing is running.
+                               unlock(&sched.lock)
                                throw("checkdead: no m for timer")
                        }
                        // M must be spinning to steal. We set this to be
                        // explicit, but since this is the only M it would
                        // become spinning on its own anyways.
-                       atomic.Xadd(&sched.nmspinning, 1)
+                       sched.nmspinning.Add(1)
                        mp.spinning = true
                        mp.nextp.set(pp)
                        notewakeup(&mp.park)
@@ -5155,13 +5725,13 @@ func sysmon() {
                // from a timer to avoid adding system load to applications that spend
                // most of their time sleeping.
                now := nanotime()
-               if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || sched.npidle.Load() == gomaxprocs) {
+               if debug.schedtrace <= 0 && (sched.gcwaiting.Load() || sched.npidle.Load() == gomaxprocs) {
                        lock(&sched.lock)
-                       if atomic.Load(&sched.gcwaiting) != 0 || sched.npidle.Load() == gomaxprocs {
+                       if sched.gcwaiting.Load() || sched.npidle.Load() == gomaxprocs {
                                syscallWake := false
                                next := timeSleepUntil()
                                if next > now {
-                                       atomic.Store(&sched.sysmonwait, 1)
+                                       sched.sysmonwait.Store(true)
                                        unlock(&sched.lock)
                                        // Make wake-up period small enough
                                        // for the sampling to be correct.
@@ -5178,7 +5748,7 @@ func sysmon() {
                                                osRelax(false)
                                        }
                                        lock(&sched.lock)
-                                       atomic.Store(&sched.sysmonwait, 0)
+                                       sched.sysmonwait.Store(false)
                                        noteclear(&sched.sysmonnote)
                                }
                                if syscallWake {
@@ -5202,7 +5772,7 @@ func sysmon() {
                lastpoll := sched.lastpoll.Load()
                if netpollinited() && lastpoll != 0 && lastpoll+10*1000*1000 < now {
                        sched.lastpoll.CompareAndSwap(lastpoll, now)
-                       list := netpoll(0) // non-blocking - returns list of goroutines
+                       list, delta := netpoll(0) // non-blocking - returns list of goroutines
                        if !list.empty() {
                                // Need to decrement number of idle locked M's
                                // (pretending that one more is running) before injectglist.
@@ -5214,6 +5784,7 @@ func sysmon() {
                                incidlelocked(-1)
                                injectglist(&list)
                                incidlelocked(1)
+                               netpollAdjustWaiters(delta)
                        }
                }
                if GOOS == "netbsd" && needSysmonWorkaround {
@@ -5233,7 +5804,7 @@ func sysmon() {
                        // See issue 42515 and
                        // https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=50094.
                        if next := timeSleepUntil(); next < now {
-                               startm(nil, false)
+                               startm(nil, false, false)
                        }
                }
                if scavenger.sysmonWake.Load() != 0 {
@@ -5248,9 +5819,9 @@ func sysmon() {
                        idle++
                }
                // check if we need to force a GC
-               if t := (gcTrigger{kind: gcTriggerTime, now: now}); t.test() && atomic.Load(&forcegc.idle) != 0 {
+               if t := (gcTrigger{kind: gcTriggerTime, now: now}); t.test() && forcegc.idle.Load() {
                        lock(&forcegc.lock)
-                       forcegc.idle = 0
+                       forcegc.idle.Store(false)
                        var list gList
                        list.push(forcegc.g)
                        injectglist(&list)
@@ -5317,7 +5888,7 @@ func retake(now int64) uint32 {
                        // On the one hand we don't want to retake Ps if there is no other work to do,
                        // but on the other hand we want to retake them eventually
                        // because they can prevent the sysmon thread from deep sleep.
-                       if runqempty(pp) && atomic.Load(&sched.nmspinning)+uint32(sched.npidle.Load()) > 0 && pd.syscallwhen+10*1000*1000 > now {
+                       if runqempty(pp) && sched.nmspinning.Load()+sched.npidle.Load() > 0 && pd.syscallwhen+10*1000*1000 > now {
                                continue
                        }
                        // Drop allpLock so we can take sched.lock.
@@ -5328,9 +5899,11 @@ func retake(now int64) uint32 {
                        // increment nmidle and report deadlock.
                        incidlelocked(-1)
                        if atomic.Cas(&pp.status, s, _Pidle) {
-                               if trace.enabled {
-                                       traceGoSysBlock(pp)
-                                       traceProcStop(pp)
+                               trace := traceAcquire()
+                               if trace.ok() {
+                                       trace.GoSysBlock(pp)
+                                       trace.ProcStop(pp)
+                                       traceRelease(trace)
                                }
                                n++
                                pp.syscalltick++
@@ -5408,9 +5981,9 @@ func schedtrace(detailed bool) {
        }
 
        lock(&sched.lock)
-       print("SCHED ", (now-starttime)/1e6, "ms: gomaxprocs=", gomaxprocs, " idleprocs=", sched.npidle.Load(), " threads=", mcount(), " spinningthreads=", sched.nmspinning, " idlethreads=", sched.nmidle, " runqueue=", sched.runqsize)
+       print("SCHED ", (now-starttime)/1e6, "ms: gomaxprocs=", gomaxprocs, " idleprocs=", sched.npidle.Load(), " threads=", mcount(), " spinningthreads=", sched.nmspinning.Load(), " needspinning=", sched.needspinning.Load(), " idlethreads=", sched.nmidle, " runqueue=", sched.runqsize)
        if detailed {
-               print(" gcwaiting=", sched.gcwaiting, " nmidlelocked=", sched.nmidlelocked, " stopwait=", sched.stopwait, " sysmonwait=", sched.sysmonwait, "\n")
+               print(" gcwaiting=", sched.gcwaiting.Load(), " nmidlelocked=", sched.nmidlelocked, " stopwait=", sched.stopwait, " sysmonwait=", sched.sysmonwait.Load(), "\n")
        }
        // We must be careful while reading data from P's, M's and G's.
        // Even if we hold schedlock, most data can be changed concurrently.
@@ -5505,7 +6078,7 @@ func schedEnableUser(enable bool) {
                globrunqputbatch(&sched.disable.runnable, n)
                unlock(&sched.lock)
                for ; n != 0 && sched.npidle.Load() != 0; n-- {
-                       startm(nil, false)
+                       startm(nil, false, false)
                }
        } else {
                unlock(&sched.lock)
@@ -5674,7 +6247,7 @@ func (p pMask) clear(id int32) {
 // TODO(prattmic): Additional targeted updates may improve the above cases.
 // e.g., updating the mask when stealing a timer.
 func updateTimerPMask(pp *p) {
-       if atomic.Load(&pp.numTimers) > 0 {
+       if pp.numTimers.Load() > 0 {
                return
        }
 
@@ -5682,7 +6255,7 @@ func updateTimerPMask(pp *p) {
        // decrement numTimers when handling a timerModified timer in
        // checkTimers. We must take timersLock to serialize with these changes.
        lock(&pp.timersLock)
-       if atomic.Load(&pp.numTimers) == 0 {
+       if pp.numTimers.Load() == 0 {
                timerpMask.clear(pp.id)
        }
        unlock(&pp.timersLock)
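
updateTimerPMask keeps its shape, a lock-free fast check followed by a re-check under timersLock before clearing the mask, and only the counter access changes to atomic.Uint32 methods. A small standalone sketch of that check / lock / re-check pattern, with invented names (idleTracker, maybeClear) in place of the runtime's types:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// idleTracker clears a "has work" flag only when its counter is observed to
// be zero both before and after taking the lock, mirroring how
// updateTimerPMask re-checks numTimers under timersLock before clearing the
// P's bit in timerpMask.
type idleTracker struct {
	mu      sync.Mutex
	count   atomic.Uint32
	hasWork atomic.Bool
}

func (t *idleTracker) maybeClear() {
	if t.count.Load() > 0 {
		return // lock-free fast path
	}
	t.mu.Lock()
	// Re-check: an increment may have raced in between the Load and Lock.
	if t.count.Load() == 0 {
		t.hasWork.Store(false)
	}
	t.mu.Unlock()
}

func main() {
	var t idleTracker
	t.hasWork.Store(true)
	t.maybeClear()
	fmt.Println("hasWork:", t.hasWork.Load())
}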
@@ -5744,6 +6317,31 @@ func pidleget(now int64) (*p, int64) {
        return pp, now
 }
 
+// pidlegetSpinning tries to get a p from the _Pidle list, acquiring ownership.
+// This is called by spinning Ms (or callers that need a spinning M) that have
+// found work. If no P is available, this must synchronize with non-spinning
+// Ms that may be preparing to drop their P without discovering this work.
+//
+// sched.lock must be held.
+//
+// May run during STW, so write barriers are not allowed.
+//
+//go:nowritebarrierrec
+func pidlegetSpinning(now int64) (*p, int64) {
+       assertLockHeld(&sched.lock)
+
+       pp, now := pidleget(now)
+       if pp == nil {
+               // See "Delicate dance" comment in findrunnable. We found work
+               // that we cannot take; we must synchronize with non-spinning
+               // Ms that may be preparing to drop their P.
+               sched.needspinning.Store(1)
+               return nil, now
+       }
+
+       return pp, now
+}
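
The needspinning store is one half of a handshake: an M that found work but no free P leaves a note, and a non-spinning M that is about to drop its P and park must consume that note and keep running instead. A standalone cartoon of that handshake under invented names (submitWork, aboutToPark); the real dance in the scheduler also rechecks the run queues:

package main

import (
	"fmt"
	"sync/atomic"
)

// needSpinning is the note left by a submitter that found work but no idle
// worker to hand it to, mirroring sched.needspinning.
var needSpinning atomic.Uint32

// submitWork is the pidlegetSpinning side: no idle P was available, so
// record that somebody must keep spinning to pick the work up.
func submitWork() {
	needSpinning.Store(1)
}

// aboutToPark is the non-spinning worker side: before parking, consume the
// note if present and stay runnable instead of going idle.
func aboutToPark() (keepRunning bool) {
	return needSpinning.CompareAndSwap(1, 0)
}

func main() {
	submitWork()
	if aboutToPark() {
		fmt.Println("worker stays awake to take the new work")
	} else {
		fmt.Println("worker parks")
	}
}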
+
 // runqempty reports whether pp has no Gs on its local run queue.
 // It never returns true spuriously.
 func runqempty(pp *p) bool {
@@ -5894,7 +6492,7 @@ func runqget(pp *p) (gp *g, inheritTime bool) {
        next := pp.runnext
        // If the runnext is non-0 and the CAS fails, it could only have been stolen by another P,
        // because other Ps can race to set runnext to 0, but only the current P can set it to non-0.
-       // Hence, there's no need to retry this CAS if it falls.
+       // Hence, there's no need to retry this CAS if it fails.
        if next != 0 && pp.runnext.cas(next, 0) {
                return next.ptr(), true
        }
@@ -6193,7 +6791,7 @@ func sync_runtime_canSpin(i int) bool {
        // GOMAXPROCS>1 and there is at least one other running P and local runq is empty.
        // As opposed to runtime mutex we don't do passive spinning here,
        // because there can be work on global runq or on other Ps.
-       if i >= active_spin || ncpu <= 1 || gomaxprocs <= sched.npidle.Load()+int32(sched.nmspinning)+1 {
+       if i >= active_spin || ncpu <= 1 || gomaxprocs <= sched.npidle.Load()+sched.nmspinning.Load()+1 {
                return false
        }
        if p := getg().m.p.ptr(); !runqempty(p) {
@@ -6265,14 +6863,11 @@ func gcd(a, b uint32) uint32 {
 }
 
 // An initTask represents the set of initializations that need to be done for a package.
-// Keep in sync with ../../test/initempty.go:initTask
+// Keep in sync with ../../test/noinit.go:initTask
 type initTask struct {
-       // TODO: pack the first 3 fields more tightly?
-       state uintptr // 0 = uninitialized, 1 = in progress, 2 = done
-       ndeps uintptr
-       nfns  uintptr
-       // followed by ndeps instances of an *initTask, one per package depended on
-       // followed by nfns pcs, one per init function to run
+       state uint32 // 0 = uninitialized, 1 = in progress, 2 = done
+       nfns  uint32
+       // followed by nfns pcs, uintptr sized, one per init function to run
 }
 
 // inittrace stores statistics for init functions which are
@@ -6286,7 +6881,13 @@ type tracestat struct {
        bytes  uint64 // heap allocated bytes
 }
 
-func doInit(t *initTask) {
+func doInit(ts []*initTask) {
+       for _, t := range ts {
+               doInit1(t)
+       }
+}
+
+func doInit1(t *initTask) {
        switch t.state {
        case 2: // fully initialized
                return
@@ -6295,17 +6896,6 @@ func doInit(t *initTask) {
        default: // not initialized yet
                t.state = 1 // initialization in progress
 
-               for i := uintptr(0); i < t.ndeps; i++ {
-                       p := add(unsafe.Pointer(t), (3+i)*goarch.PtrSize)
-                       t2 := *(**initTask)(p)
-                       doInit(t2)
-               }
-
-               if t.nfns == 0 {
-                       t.state = 2 // initialization done
-                       return
-               }
-
                var (
                        start  int64
                        before tracestat
@@ -6317,9 +6907,14 @@ func doInit(t *initTask) {
                        before = inittrace
                }
 
-               firstFunc := add(unsafe.Pointer(t), (3+t.ndeps)*goarch.PtrSize)
-               for i := uintptr(0); i < t.nfns; i++ {
-                       p := add(firstFunc, i*goarch.PtrSize)
+               if t.nfns == 0 {
+                       // We should have pruned all of these in the linker.
+                       throw("inittask with no functions")
+               }
+
+               firstFunc := add(unsafe.Pointer(t), 8)
+               for i := uint32(0); i < t.nfns; i++ {
+                       p := add(firstFunc, uintptr(i)*goarch.PtrSize)
                        f := *(*func())(unsafe.Pointer(&p))
                        f()
                }
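
With the dependency list gone, an initTask is just two 32-bit words followed by nfns pointer-sized PCs, which is why doInit1 can hard-code an offset of 8 for the first function. A standalone sketch of walking that kind of packed header, assuming a hand-built blob whose trailing slots hold indices into a Go slice rather than real PCs (task, fns, and slots are invented for illustration):

package main

import (
	"fmt"
	"unsafe"
)

// task mimics the new initTask header: two 32-bit words, then nfns
// pointer-sized slots. The slots here hold indices into fns rather than
// real PCs, so the walk stays safe outside the runtime.
type task struct {
	state uint32
	nfns  uint32
	slots [3]uintptr // stand-in for the linker-emitted trailing PCs
}

var fns = []func(){
	func() { fmt.Println("init a") },
	func() { fmt.Println("init b") },
	func() { fmt.Println("init c") },
}

func main() {
	t := &task{nfns: 3, slots: [3]uintptr{0, 1, 2}}

	// The first slot sits right after the two uint32 fields, i.e. at a
	// fixed offset of 8 bytes -- the same 8 that doInit1 hard-codes.
	firstFunc := unsafe.Add(unsafe.Pointer(t), 8)
	for i := uint32(0); i < t.nfns; i++ {
		p := unsafe.Add(firstFunc, uintptr(i)*unsafe.Sizeof(uintptr(0)))
		fns[*(*uintptr)(p)]()
	}
	t.state = 2 // done, like doInit1's final state update
}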