runtime: refactor runtime->tracer API to appear more like a lock
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index ae218da513f3168a3e81ec7e7d5eac72a8ae77d7..ae2562a5b76caa70e5a6ca3b71eb53d1909f3c4b 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -84,7 +84,7 @@ var modinfo string
 // semi-persistent CPU underutilization.
 //
 // The general pattern for submission is:
-// 1. Submit work to the local run queue, timer heap, or GC state.
+// 1. Submit work to the local or global run queue, timer heap, or GC state.
 // 2. #StoreLoad-style memory barrier.
 // 3. Check sched.nmspinning.
 //
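To make the three-step submission protocol above concrete, here is a minimal, self-contained sketch; the names (work, nspinning, wake, submit, stopSpinning) are illustrative stand-ins rather than runtime internals, and the real scheduler uses per-P run queues and an explicit StoreLoad-style barrier rather than the plain sequentially consistent atomics used here.

package main

import (
	"fmt"
	"sync/atomic"
)

var (
	work      atomic.Int64 // stands in for the run queues
	nspinning atomic.Int64 // stands in for sched.nmspinning
	wake      = make(chan struct{}, 1)
)

// submit publishes the work, then checks for spinning consumers. Go's
// sync/atomic operations are sequentially consistent, so the RMW on work
// followed by the load of nspinning provides the StoreLoad-style ordering
// described above.
func submit() {
	work.Add(1)                // 1. submit work
	if nspinning.Load() == 0 { // 2+3. barrier, then check for spinners
		select {
		case wake <- struct{}{}: // nobody is spinning: wake an idle consumer
		default:
		}
	}
}

// stopSpinning mirrors the consumer side: announce that spinning stopped,
// then re-check for work so a submission racing with the decrement is not
// lost (the same lost-wakeup argument made in the comment above).
func stopSpinning() bool {
	nspinning.Add(-1)
	return work.Load() > 0
}

func main() {
	nspinning.Add(1)
	submit()
	fmt.Println(stopSpinning()) // true: the submitted work is observed
}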
@@ -115,6 +115,7 @@ var (
        g0           g
        mcache0      *mcache
        raceprocctx0 uintptr
+       raceFiniLock mutex
 )
 
 // This slice records the initializing tasks that need to be
@@ -208,6 +209,10 @@ func main() {
 
        main_init_done = make(chan bool)
        if iscgo {
+               if _cgo_pthread_key_created == nil {
+                       throw("_cgo_pthread_key_created missing")
+               }
+
                if _cgo_thread_start == nil {
                        throw("_cgo_thread_start missing")
                }
@@ -222,6 +227,13 @@ func main() {
                if _cgo_notify_runtime_init_done == nil {
                        throw("_cgo_notify_runtime_init_done missing")
                }
+
+               // Set the x_crosscall2_ptr C function pointer variable to point to crosscall2.
+               if set_crosscall2 == nil {
+                       throw("set_crosscall2 missing")
+               }
+               set_crosscall2()
+
                // Start the template thread in case we enter Go from
                // a C-created thread and need to create a new thread.
                startTemplateThread()
@@ -232,8 +244,10 @@ func main() {
        // list can arrive a few different ways, but it will always
        // contain the init tasks computed by the linker for all the
        // packages in the program (excluding those added at runtime
-       // by package plugin).
-       for _, m := range activeModules() {
+       // by package plugin). Run through the modules in dependency
+       // order (the order they are initialized by the dynamic
+       // loader, i.e. they are added to the moduledata linked list).
+       for m := &firstmoduledata; m != nil; m = m.next {
                doInit(m.inittasks)
        }
 
@@ -272,7 +286,7 @@ func main() {
                }
        }
        if panicking.Load() != 0 {
-               gopark(nil, nil, waitReasonPanicWait, traceEvGoStop, 1)
+               gopark(nil, nil, waitReasonPanicWait, traceBlockForever, 1)
        }
        runExitHooks(0)
 
@@ -307,7 +321,7 @@ func forcegchelper() {
                        throw("forcegc: phase error")
                }
                forcegc.idle.Store(true)
-               goparkunlock(&forcegc.lock, waitReasonForceGCIdle, traceEvGoBlock, 1)
+               goparkunlock(&forcegc.lock, waitReasonForceGCIdle, traceBlockSystemGoroutine, 1)
                // this goroutine is explicitly resumed by sysmon
                if debug.gctrace > 0 {
                        println("GC forced")
@@ -366,7 +380,7 @@ func goschedIfBusy() {
 // Reason explains why the goroutine has been parked. It is displayed in stack
 // traces and heap dumps. Reasons should be unique and descriptive. Do not
 // re-use reasons, add new ones.
-func gopark(unlockf func(*g, unsafe.Pointer) bool, lock unsafe.Pointer, reason waitReason, traceEv byte, traceskip int) {
+func gopark(unlockf func(*g, unsafe.Pointer) bool, lock unsafe.Pointer, reason waitReason, traceReason traceBlockReason, traceskip int) {
        if reason != waitReasonSleep {
                checkTimeouts() // timeouts may expire while two goroutines keep the scheduler busy
        }
@@ -379,8 +393,8 @@ func gopark(unlockf func(*g, unsafe.Pointer) bool, lock unsafe.Pointer, reason w
        mp.waitlock = lock
        mp.waitunlockf = unlockf
        gp.waitreason = reason
-       mp.waittraceev = traceEv
-       mp.waittraceskip = traceskip
+       mp.waitTraceBlockReason = traceReason
+       mp.waitTraceSkip = traceskip
        releasem(mp)
        // can't do anything that might move the G between Ms here.
        mcall(park_m)
@@ -388,8 +402,8 @@ func gopark(unlockf func(*g, unsafe.Pointer) bool, lock unsafe.Pointer, reason w
 
 // Puts the current goroutine into a waiting state and unlocks the lock.
 // The goroutine can be made runnable again by calling goready(gp).
-func goparkunlock(lock *mutex, reason waitReason, traceEv byte, traceskip int) {
-       gopark(parkunlock_c, unsafe.Pointer(lock), reason, traceEv, traceskip)
+func goparkunlock(lock *mutex, reason waitReason, traceReason traceBlockReason, traceskip int) {
+       gopark(parkunlock_c, unsafe.Pointer(lock), reason, traceReason, traceskip)
 }
 
 func goready(gp *g, traceskip int) {
@@ -502,7 +516,20 @@ func badreflectcall() {
 //go:nosplit
 //go:nowritebarrierrec
 func badmorestackg0() {
-       writeErrStr("fatal: morestack on g0\n")
+       if !crashStackImplemented {
+               writeErrStr("fatal: morestack on g0\n")
+               return
+       }
+
+       g := getg()
+       switchToCrashStack(func() {
+               print("runtime: morestack on g0, stack [", hex(g.stack.lo), " ", hex(g.stack.hi), "], sp=", hex(g.sched.sp), ", called from\n")
+               g.m.traceback = 2 // include pc and sp in stack trace
+               traceback1(g.sched.pc, g.sched.sp, g.sched.lr, g, 0)
+               print("\n")
+
+               throw("morestack on g0")
+       })
 }
 
 //go:nosplit
@@ -516,6 +543,42 @@ func badctxt() {
        throw("ctxt != 0")
 }
 
+// gcrash is a fake g that can be used when crashing due to bad
+// stack conditions.
+var gcrash g
+
+var crashingG atomic.Pointer[g]
+
+// Switch to crashstack and call fn, with special handling of
+// concurrent and recursive cases.
+//
+// Nosplit as it is called in a bad stack condition (we know
+// morestack would fail).
+//
+//go:nosplit
+//go:nowritebarrierrec
+func switchToCrashStack(fn func()) {
+       me := getg()
+       if crashingG.CompareAndSwapNoWB(nil, me) {
+               switchToCrashStack0(fn) // should never return
+               abort()
+       }
+       if crashingG.Load() == me {
+               // recursive crashing. too bad.
+               writeErrStr("fatal: recursive switchToCrashStack\n")
+               abort()
+       }
+       // Another g is crashing. Give it some time, hopefully it will finish traceback.
+       usleep_no_g(100)
+       writeErrStr("fatal: concurrent switchToCrashStack\n")
+       abort()
+}
+
+const crashStackImplemented = GOARCH == "amd64" || GOARCH == "arm64" || GOARCH == "mips64" || GOARCH == "mips64le" || GOARCH == "riscv64"
+
+//go:noescape
+func switchToCrashStack0(fn func()) // in assembly
+
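As a rough, self-contained illustration of the first/recursive/concurrent discrimination that switchToCrashStack performs with its compare-and-swap, consider the following sketch (illustrative types and messages; the real code uses a no-write-barrier CAS and aborts rather than returning):

package main

import (
	"fmt"
	"sync/atomic"
)

type worker struct{ id int }

var crashing atomic.Pointer[worker]

// crash reports which of the three cases the caller hit: first crasher,
// recursive crash on the same worker, or a concurrent crash elsewhere.
func crash(me *worker) string {
	if crashing.CompareAndSwap(nil, me) {
		return "first crasher: run the crash handler"
	}
	if crashing.Load() == me {
		return "recursive crash on the same worker"
	}
	return "concurrent crash: another worker is already handling it"
}

func main() {
	a, b := &worker{id: 1}, &worker{id: 2}
	fmt.Println(crash(a)) // first
	fmt.Println(crash(a)) // recursive
	fmt.Println(crash(b)) // concurrent
}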
 func lockedOSThread() bool {
        gp := getg()
        return gp.lockedm != 0 && gp.m.lockedg != 0
@@ -690,11 +753,8 @@ func schedinit() {
        lockInit(&allpLock, lockRankAllp)
        lockInit(&reflectOffs.lock, lockRankReflectOffs)
        lockInit(&finlock, lockRankFin)
-       lockInit(&trace.bufLock, lockRankTraceBuf)
-       lockInit(&trace.stringsLock, lockRankTraceStrings)
-       lockInit(&trace.lock, lockRankTrace)
        lockInit(&cpuprof.lock, lockRankCpuprof)
-       lockInit(&trace.stackTab.lock, lockRankTraceStackTab)
+       traceLockInit()
        // Enforce that this lock is always a leaf lock.
        // All of this lock's critical sections should be
        // extremely short.
@@ -731,9 +791,17 @@ func schedinit() {
 
        goargs()
        goenvs()
+       secure()
+       checkfds()
        parsedebugvars()
        gcinit()
 
+       // Allocate stack space that can be used when crashing due to bad stack
+       // conditions, e.g. morestack on g0.
+       gcrash.stack = stackalloc(16384)
+       gcrash.stackguard0 = gcrash.stack.lo + 1000
+       gcrash.stackguard1 = gcrash.stack.lo + 1000
+
        // if disableMemoryProfiling is set, update MemProfileRate to 0 to turn off memprofile.
        // Note: parsedebugvars may update MemProfileRate, but when disableMemoryProfiling is
        // set to true by the linker, it means that nothing is consuming the profile, it is
@@ -778,7 +846,16 @@ func dumpgstatus(gp *g) {
 func checkmcount() {
        assertLockHeld(&sched.lock)
 
-       if mcount() > sched.maxmcount {
+       // Exclude extra M's, which are used for cgocallback from threads
+       // created in C.
+       //
+       // The purpose of the SetMaxThreads limit is to avoid an accidental fork
+       // bomb from something like millions of goroutines blocking on system
+       // calls, causing the runtime to create millions of threads. By
+       // definition, this isn't a problem for threads created in C, so we
+       // exclude them from the limit. See https://go.dev/issue/60004.
+       count := mcount() - int32(extraMInUse.Load()) - int32(extraMLength.Load())
+       if count > sched.maxmcount {
                print("runtime: program exceeds ", sched.maxmcount, "-thread limit\n")
                throw("thread exhaustion")
        }
@@ -832,7 +909,7 @@ func mcommoninit(mp *m, id int64) {
 
        mpreinit(mp)
        if mp.gsignal != nil {
-               mp.gsignal.stackguard1 = mp.gsignal.stack.lo + _StackGuard
+               mp.gsignal.stackguard1 = mp.gsignal.stack.lo + stackGuard
        }
 
        // Add to allm so garbage collector doesn't free g->m
@@ -856,8 +933,8 @@ func (mp *m) becomeSpinning() {
        sched.needspinning.Store(0)
 }
 
-func (mp *m) incgocallback() bool {
-       return (!mp.incgo && mp.ncgo > 0) || mp.isextra
+func (mp *m) hasCgoOnStack() bool {
+       return mp.ncgo > 0 || mp.isextra
 }
 
 var fastrandseed uintptr
@@ -869,10 +946,6 @@ func fastrandinit() {
 
 // Mark gp ready to run.
 func ready(gp *g, traceskip int, next bool) {
-       if trace.enabled {
-               traceGoUnpark(gp, traceskip)
-       }
-
        status := readgstatus(gp)
 
        // Mark runnable.
@@ -883,7 +956,12 @@ func ready(gp *g, traceskip int, next bool) {
        }
 
        // status is Gwaiting or Gscanwaiting, make Grunnable and put on runq
+       trace := traceAcquire()
        casgstatus(gp, _Gwaiting, _Grunnable)
+       if trace.ok() {
+               trace.GoUnpark(gp, traceskip)
+               traceRelease(trace)
+       }
        runqput(mp.p.ptr(), gp, next)
        wakep()
        releasem(mp)
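The hunk above shows the call-site shape this change introduces everywhere: traceAcquire before the status transition, ok() to test whether tracing is active, event emission, then traceRelease. The following self-contained sketch models that lock-like bracketing with stand-in types; tracer and traceLocker here are assumptions for illustration, not the runtime's actual API.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

type tracer struct {
	mu      sync.Mutex
	enabled atomic.Bool
	events  []string
}

// traceLocker is the value returned by acquire; a nil tracer means
// tracing was disabled at acquisition time.
type traceLocker struct{ t *tracer }

func (t *tracer) acquire() traceLocker {
	if !t.enabled.Load() {
		return traceLocker{} // disabled: ok() reports false, nothing to release
	}
	t.mu.Lock()
	return traceLocker{t}
}

func (l traceLocker) ok() bool { return l.t != nil }

func (l traceLocker) goUnpark(goid int) {
	l.t.events = append(l.t.events, fmt.Sprintf("GoUnpark g%d", goid))
}

func (l traceLocker) release() { l.t.mu.Unlock() }

func main() {
	t := &tracer{}
	t.enabled.Store(true)

	// The call-site shape used throughout the diff: acquire, transition,
	// emit if ok, release.
	trace := t.acquire()
	// ... a status transition such as casgstatus would happen here ...
	if trace.ok() {
		trace.goUnpark(1)
		trace.release()
	}
	fmt.Println(t.events)
}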
@@ -902,6 +980,35 @@ var freezing atomic.Bool
 // This function must not lock any mutexes.
 func freezetheworld() {
        freezing.Store(true)
+       if debug.dontfreezetheworld > 0 {
+               // Don't preempt Ps to stop goroutines. That will perturb
+               // scheduler state, making debugging more difficult. Instead,
+               // allow goroutines to continue execution.
+               //
+               // fatalpanic will tracebackothers to trace all goroutines. It
+               // is unsafe to trace a running goroutine, so tracebackothers
+               // will skip running goroutines. That is OK and expected; we
+               // expect users of dontfreezetheworld to use core files anyway.
+               //
+               // However, allowing the scheduler to continue running free
+               // introduces a race: a goroutine may be stopped when
+               // tracebackothers checks its status, and then start running
+               // later when we are in the middle of traceback, potentially
+               // causing a crash.
+               //
+               // To mitigate this, when an M naturally enters the scheduler,
+               // schedule checks if freezing is set and if so stops
+               // execution. This guarantees that while Gs can transition from
+               // running to stopped, they can never transition from stopped
+               // to running.
+               //
+               // The sleep here allows racing Ms that missed freezing and are
+               // about to run a G to complete the transition to running
+               // before we start traceback.
+               usleep(1000)
+               return
+       }
+
        // stopwait and preemption requests can be lost
        // due to races with concurrently executing threads,
        // so try several times
@@ -1139,6 +1246,59 @@ func casGFromPreempted(gp *g, old, new uint32) bool {
        return gp.atomicstatus.CompareAndSwap(_Gpreempted, _Gwaiting)
 }
 
+// stwReason is an enumeration of reasons the world is stopping.
+type stwReason uint8
+
+// Reasons to stop-the-world.
+//
+// Avoid reusing reasons and add new ones instead.
+const (
+       stwUnknown                     stwReason = iota // "unknown"
+       stwGCMarkTerm                                   // "GC mark termination"
+       stwGCSweepTerm                                  // "GC sweep termination"
+       stwWriteHeapDump                                // "write heap dump"
+       stwGoroutineProfile                             // "goroutine profile"
+       stwGoroutineProfileCleanup                      // "goroutine profile cleanup"
+       stwAllGoroutinesStack                           // "all goroutines stack trace"
+       stwReadMemStats                                 // "read mem stats"
+       stwAllThreadsSyscall                            // "AllThreadsSyscall"
+       stwGOMAXPROCS                                   // "GOMAXPROCS"
+       stwStartTrace                                   // "start trace"
+       stwStopTrace                                    // "stop trace"
+       stwForTestCountPagesInUse                       // "CountPagesInUse (test)"
+       stwForTestReadMetricsSlow                       // "ReadMetricsSlow (test)"
+       stwForTestReadMemStatsSlow                      // "ReadMemStatsSlow (test)"
+       stwForTestPageCachePagesLeaked                  // "PageCachePagesLeaked (test)"
+       stwForTestResetDebugLog                         // "ResetDebugLog (test)"
+)
+
+func (r stwReason) String() string {
+       return stwReasonStrings[r]
+}
+
+// If you add to this list, also add it to src/internal/trace/parser.go.
+// If you change the values of any of the stw* constants, bump the trace
+// version number and make a copy of this.
+var stwReasonStrings = [...]string{
+       stwUnknown:                     "unknown",
+       stwGCMarkTerm:                  "GC mark termination",
+       stwGCSweepTerm:                 "GC sweep termination",
+       stwWriteHeapDump:               "write heap dump",
+       stwGoroutineProfile:            "goroutine profile",
+       stwGoroutineProfileCleanup:     "goroutine profile cleanup",
+       stwAllGoroutinesStack:          "all goroutines stack trace",
+       stwReadMemStats:                "read mem stats",
+       stwAllThreadsSyscall:           "AllThreadsSyscall",
+       stwGOMAXPROCS:                  "GOMAXPROCS",
+       stwStartTrace:                  "start trace",
+       stwStopTrace:                   "stop trace",
+       stwForTestCountPagesInUse:      "CountPagesInUse (test)",
+       stwForTestReadMetricsSlow:      "ReadMetricsSlow (test)",
+       stwForTestReadMemStatsSlow:     "ReadMemStatsSlow (test)",
+       stwForTestPageCachePagesLeaked: "PageCachePagesLeaked (test)",
+       stwForTestResetDebugLog:        "ResetDebugLog (test)",
+}
+
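The stwReason block above is an instance of a common Go pattern: an iota-based enum whose String method indexes a parallel table, so adding a constant requires adding a matching table entry. A small self-contained sketch of the same pattern, with illustrative names:

package main

import "fmt"

type reason uint8

const (
	reasonUnknown reason = iota // "unknown"
	reasonGC                    // "GC"
	reasonProfile               // "profile"
)

// The table is indexed by the constant's value, so its order must match
// the const block exactly.
var reasonStrings = [...]string{
	reasonUnknown: "unknown",
	reasonGC:      "GC",
	reasonProfile: "profile",
}

func (r reason) String() string { return reasonStrings[r] }

func main() {
	fmt.Println(reasonGC) // fmt uses the Stringer interface and prints "GC"
}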
 // stopTheWorld stops all P's from executing goroutines, interrupting
 // all goroutines at GC safe points and records reason as the reason
 // for the stop. On return, only the current goroutine's P is running.
@@ -1153,10 +1313,10 @@ func casGFromPreempted(gp *g, old, new uint32) bool {
 // This is also used by routines that do stack dumps. If the system is
 // in panic or being exited, this may not reliably stop all
 // goroutines.
-func stopTheWorld(reason string) {
+func stopTheWorld(reason stwReason) {
        semacquire(&worldsema)
        gp := getg()
-       gp.m.preemptoff = reason
+       gp.m.preemptoff = reason.String()
        systemstack(func() {
                // Mark the goroutine which called stopTheWorld preemptible so its
                // stack may be scanned.
@@ -1170,14 +1330,14 @@ func stopTheWorld(reason string) {
                // have already completed by the time we exit.
                // Don't provide a wait reason because we're still executing.
                casGToWaiting(gp, _Grunning, waitReasonStoppingTheWorld)
-               stopTheWorldWithSema()
+               stopTheWorldWithSema(reason)
                casgstatus(gp, _Gwaiting, _Grunning)
        })
 }
 
 // startTheWorld undoes the effects of stopTheWorld.
 func startTheWorld() {
-       systemstack(func() { startTheWorldWithSema(false) })
+       systemstack(func() { startTheWorldWithSema() })
 
        // worldsema must be held over startTheWorldWithSema to ensure
        // gomaxprocs cannot change while worldsema is held.
@@ -1203,7 +1363,7 @@ func startTheWorld() {
 // stopTheWorldGC has the same effect as stopTheWorld, but blocks
 // until the GC is not running. It also blocks a GC from starting
 // until startTheWorldGC is called.
-func stopTheWorldGC(reason string) {
+func stopTheWorldGC(reason stwReason) {
        semacquire(&gcsema)
        stopTheWorld(reason)
 }
@@ -1247,7 +1407,12 @@ var gcsema uint32 = 1
 // startTheWorldWithSema and stopTheWorldWithSema.
 // Holding worldsema causes any other goroutines invoking
 // stopTheWorld to block.
-func stopTheWorldWithSema() {
+func stopTheWorldWithSema(reason stwReason) {
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.STWStart(reason)
+               traceRelease(trace)
+       }
        gp := getg()
 
        // If we hold a lock, then we won't be able to stop another M
@@ -1264,17 +1429,22 @@ func stopTheWorldWithSema() {
        gp.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic.
        sched.stopwait--
        // try to retake all P's in Psyscall status
+       trace = traceAcquire()
        for _, pp := range allp {
                s := pp.status
                if s == _Psyscall && atomic.Cas(&pp.status, s, _Pgcstop) {
-                       if trace.enabled {
-                               traceGoSysBlock(pp)
-                               traceProcStop(pp)
+                       if trace.ok() {
+                               trace.GoSysBlock(pp)
+                               trace.ProcStop(pp)
                        }
                        pp.syscalltick++
                        sched.stopwait--
                }
        }
+       if trace.ok() {
+               traceRelease(trace)
+       }
+
        // stop idle P's
        now := nanotime()
        for {
@@ -1326,13 +1496,14 @@ func stopTheWorldWithSema() {
        worldStopped()
 }
 
-func startTheWorldWithSema(emitTraceEvent bool) int64 {
+func startTheWorldWithSema() int64 {
        assertWorldStopped()
 
        mp := acquirem() // disable preemption because it can be holding p in a local var
        if netpollinited() {
-               list := netpoll(0) // non-blocking
+               list, delta := netpoll(0) // non-blocking
                injectglist(&list)
+               netpollAdjustWaiters(delta)
        }
        lock(&sched.lock)
 
@@ -1370,8 +1541,10 @@ func startTheWorldWithSema(emitTraceEvent bool) int64 {
 
        // Capture start-the-world time before doing clean-up tasks.
        startTime := nanotime()
-       if emitTraceEvent {
-               traceGCSTWDone()
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.STWDone()
+               traceRelease(trace)
        }
 
        // Wakeup an additional proc in case we have excessive runnable goroutines
@@ -1391,7 +1564,7 @@ func usesLibcall() bool {
        case "aix", "darwin", "illumos", "ios", "solaris", "windows":
                return true
        case "openbsd":
-               return GOARCH == "386" || GOARCH == "amd64" || GOARCH == "arm" || GOARCH == "arm64"
+               return GOARCH != "mips64"
        }
        return false
 }
@@ -1403,10 +1576,7 @@ func mStackIsSystemAllocated() bool {
        case "aix", "darwin", "plan9", "illumos", "ios", "solaris", "windows":
                return true
        case "openbsd":
-               switch GOARCH {
-               case "386", "amd64", "arm", "arm64":
-                       return true
-               }
+               return GOARCH != "mips64"
        }
        return false
 }
@@ -1439,14 +1609,14 @@ func mstart0() {
                // but is somewhat arbitrary.
                size := gp.stack.hi
                if size == 0 {
-                       size = 8192 * sys.StackGuardMultiplier
+                       size = 16384 * sys.StackGuardMultiplier
                }
                gp.stack.hi = uintptr(noescape(unsafe.Pointer(&size)))
                gp.stack.lo = gp.stack.hi - size + 1024
        }
        // Initialize stack guard so that we can start calling regular
        // Go code.
-       gp.stackguard0 = gp.stack.lo + _StackGuard
+       gp.stackguard0 = gp.stack.lo + stackGuard
        // This is the g0, so we can also call go:systemstack
        // functions, which check stackguard1.
        gp.stackguard1 = gp.stackguard0
@@ -1693,17 +1863,21 @@ func forEachP(fn func(*p)) {
 
        // Force Ps currently in _Psyscall into _Pidle and hand them
        // off to induce safe point function execution.
+       trace := traceAcquire()
        for _, p2 := range allp {
                s := p2.status
                if s == _Psyscall && p2.runSafePointFn == 1 && atomic.Cas(&p2.status, s, _Pidle) {
-                       if trace.enabled {
-                               traceGoSysBlock(p2)
-                               traceProcStop(p2)
+                       if trace.ok() {
+                               trace.GoSysBlock(p2)
+                               trace.ProcStop(p2)
                        }
                        p2.syscalltick++
                        handoffp(p2)
                }
        }
+       if trace.ok() {
+               traceRelease(trace)
+       }
 
        // Wait for remaining Ps to run fn.
        if wait {
@@ -1835,7 +2009,7 @@ func allocm(pp *p, fn func(), id int64) *m {
        if iscgo || mStackIsSystemAllocated() {
                mp.g0 = malg(-1)
        } else {
-               mp.g0 = malg(8192 * sys.StackGuardMultiplier)
+               mp.g0 = malg(16384 * sys.StackGuardMultiplier)
        }
        mp.g0.m = mp
 
@@ -1879,11 +2053,15 @@ func allocm(pp *p, fn func(), id int64) *m {
 // pressed into service as the scheduling stack and current
 // goroutine for the duration of the cgo callback.
 //
-// When the callback is done with the m, it calls dropm to
-// put the m back on the list.
+// It calls dropm to put the m back on the list,
+// 1. when the callback is done with the m on non-pthread platforms,
+// 2. or when the C thread is exiting on pthread platforms.
+//
+// The signal argument indicates whether we're called from a signal
+// handler.
 //
 //go:nosplit
-func needm() {
+func needm(signal bool) {
        if (iscgo || GOOS == "windows") && !cgoHasExtraM {
                // Can happen if C/C++ code calls Go from a global ctor.
                // Can also happen on Windows if a global ctor uses a
@@ -1907,11 +2085,10 @@ func needm() {
        sigsave(&sigmask)
        sigblock(false)
 
-       // Lock extra list, take head, unlock popped list.
-       // nilokay=false is safe here because of the invariant above,
+       // getExtraM is safe here because of the invariant above,
        // that the extra list always contains or will soon contain
        // at least one m.
-       mp := lockextra(false)
+       mp, last := getExtraM()
 
        // Set needextram when we've just emptied the list,
        // so that the eventual call into cgocallbackg will
@@ -1920,9 +2097,7 @@ func needm() {
        // after exitsyscall makes sure it is okay to be
        // running at all (that is, there's no garbage collection
        // running right now).
-       mp.needextram = mp.schedlink == 0
-       extraMCount--
-       unlockextra(mp.schedlink.ptr())
+       mp.needextram = last
 
        // Store the original signal mask for use by minit.
        mp.sigmask = sigmask
@@ -1932,15 +2107,15 @@ func needm() {
        osSetupTLS(mp)
 
        // Install g (= m->g0) and set the stack bounds
-       // to match the current stack. We don't actually know
-       // how big the stack is, like we don't know how big any
-       // scheduling stack is, but we assume there's at least 32 kB,
-       // which is more than enough for us.
+       // to match the current stack.
        setg(mp.g0)
-       gp := getg()
-       gp.stack.hi = getcallersp() + 1024
-       gp.stack.lo = getcallersp() - 32*1024
-       gp.stackguard0 = gp.stack.lo + _StackGuard
+       sp := getcallersp()
+       callbackUpdateSystemStack(mp, sp, signal)
+
+       // Mark that we are already in Go now.
+       // Otherwise, if we get a signal before cgocallbackg1 runs, we may call needm
+       // again while the extra M list is empty, which would cause a deadlock.
+       mp.isExtraInC = false
 
        // Initialize this thread to use the m.
        asminit()
@@ -1951,6 +2126,17 @@ func needm() {
        sched.ngsys.Add(-1)
 }
 
+// Acquire an extra m and bind it to the C thread when a pthread key has been created.
+//
+//go:nosplit
+func needAndBindM() {
+       needm(false)
+
+       if _cgo_pthread_key_created != nil && *(*uintptr)(_cgo_pthread_key_created) != 0 {
+               cgoBindM()
+       }
+}
+
 // newextram allocates m's and puts them on the extra list.
 // It is called with a working local m, so that it can do things
 // like call schedlock and allocate.
@@ -1960,13 +2146,9 @@ func newextram() {
                for i := uint32(0); i < c; i++ {
                        oneNewExtraM()
                }
-       } else {
+       } else if extraMLength.Load() == 0 {
                // Make sure there is at least one extra M.
-               mp := lockextra(true)
-               unlockextra(mp)
-               if mp == nil {
-                       oneNewExtraM()
-               }
+               oneNewExtraM()
        }
 }
 
@@ -1995,21 +2177,19 @@ func oneNewExtraM() {
        gp.m = mp
        mp.curg = gp
        mp.isextra = true
+       // Mark that we are in C by default.
+       mp.isExtraInC = true
        mp.lockedInt++
        mp.lockedg.set(gp)
        gp.lockedm.set(mp)
        gp.goid = sched.goidgen.Add(1)
-       gp.sysblocktraced = true
        if raceenabled {
                gp.racectx = racegostart(abi.FuncPCABIInternal(newextram) + sys.PCQuantum)
        }
-       if trace.enabled {
-               // Trigger two trace events for the locked g in the extra m,
-               // since the next event of the g will be traceEvGoSysExit in exitsyscall,
-               // while calling from C thread to Go.
-               traceGoCreate(gp, 0) // no start pc
-               gp.traceseq++
-               traceEvent(traceEvGoInSyscall, -1, gp.goid)
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.OneNewExtraM(gp)
+               traceRelease(trace)
        }
        // put on allg for garbage collector
        allgadd(gp)
@@ -2021,15 +2201,14 @@ func oneNewExtraM() {
        sched.ngsys.Add(1)
 
        // Add m to the extra list.
-       mnext := lockextra(true)
-       mp.schedlink.set(mnext)
-       extraMCount++
-       unlockextra(mp)
+       addExtraM(mp)
 }
 
+// dropm puts the current m back onto the extra list.
+//
+// 1. On systems without pthreads, like Windows,
 // dropm is called when a cgo callback has called needm but is now
 // done with the callback and returning back into the non-Go thread.
-// It puts the current m back onto the extra list.
 //
 // The main expense here is the call to signalstack to release the
 // m's signal stack, and then the call to needm on the next callback
@@ -2041,15 +2220,23 @@ func oneNewExtraM() {
 // call. These should typically not be scheduling operations, just a few
 // atomics, so the cost should be small.
 //
-// TODO(rsc): An alternative would be to allocate a dummy pthread per-thread
-// variable using pthread_key_create. Unlike the pthread keys we already use
-// on OS X, this dummy key would never be read by Go code. It would exist
-// only so that we could register at thread-exit-time destructor.
-// That destructor would put the m back onto the extra list.
-// This is purely a performance optimization. The current version,
-// in which dropm happens on each cgo call, is still correct too.
-// We may have to keep the current version on systems with cgo
-// but without pthreads, like Windows.
+// 2. On systems with pthreads,
+// dropm is called while a non-Go thread is exiting.
+// We allocate a pthread per-thread variable using pthread_key_create,
+// to register a thread-exit-time destructor.
+// The g is stored in a thread-specific value associated with the pthread key
+// when we first return back to C,
+// so that the destructor invokes dropm while the non-Go thread is exiting.
+// This is much faster since it avoids expensive signal-related syscalls.
+//
+// This always runs without a P, so //go:nowritebarrierrec is required.
+//
+// This may run with a different stack than was recorded in g0 (there is no
+// call to callbackUpdateSystemStack prior to dropm), so this must be
+// //go:nosplit to avoid the stack bounds check.
+//
+//go:nowritebarrierrec
+//go:nosplit
 func dropm() {
        // Clear m and g, and return m to the extra list.
        // After the call to setg we can only call nosplit functions
@@ -2069,26 +2256,75 @@ func dropm() {
        sigblock(false)
        unminit()
 
-       mnext := lockextra(true)
-       extraMCount++
-       mp.schedlink.set(mnext)
-
        setg(nil)
 
-       // Commit the release of mp.
-       unlockextra(mp)
+       // Clear g0 stack bounds to ensure that needm always refreshes the
+       // bounds when reusing this M.
+       g0 := mp.g0
+       g0.stack.hi = 0
+       g0.stack.lo = 0
+       g0.stackguard0 = 0
+       g0.stackguard1 = 0
+
+       putExtraM(mp)
 
        msigrestore(sigmask)
 }
 
+// bindm stores the g0 of the current m into a thread-specific value.
+//
+// We allocate a pthread per-thread variable using pthread_key_create,
+// to register a thread-exit-time destructor.
+// Here we set the thread-specific value of the pthread key to enable that destructor,
+// so that the pthread key destructor invokes dropm while the C thread is exiting.
+//
+// The saved g is then used by the pthread key destructor:
+// on some platforms the g stored in TLS by Go may already be cleared before the
+// destructor is invoked, so we restore g from the stored value before calling dropm.
+//
+// We store g0 instead of m, to make the assembly code simpler,
+// since we need to restore g0 in runtime.cgocallback.
+//
+// On systems without pthreads, like Windows, bindm shouldn't be used.
+//
+// NOTE: this always runs without a P, so //go:nowritebarrierrec is required.
+//
+//go:nosplit
+//go:nowritebarrierrec
+func cgoBindM() {
+       if GOOS == "windows" || GOOS == "plan9" {
+               fatal("bindm in unexpected GOOS")
+       }
+       g := getg()
+       if g.m.g0 != g {
+               fatal("the current g is not g0")
+       }
+       if _cgo_bindm != nil {
+               asmcgocall(_cgo_bindm, unsafe.Pointer(g))
+       }
+}
+
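To see the pthread_key_create destructor mechanism that dropm and cgoBindM rely on in isolation, here is a minimal cgo sketch, assuming a POSIX system; it is not the runtime's actual wiring, only a demonstration that a value stored with pthread_setspecific is handed to the registered destructor when the C thread exits.

package main

/*
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_key_t key;

// Runs automatically when a thread that called pthread_setspecific exits.
static void destructor(void *v) {
	printf("thread exiting, destructor received %p\n", v);
	free(v);
}

static void *worker(void *arg) {
	(void)arg;
	pthread_setspecific(key, malloc(16)); // arm the destructor for this thread
	return NULL;
}

static void demo(void) {
	pthread_t t;
	pthread_key_create(&key, destructor);
	pthread_create(&t, NULL, worker, NULL);
	pthread_join(t, NULL);
}
*/
import "C"

func main() {
	C.demo() // the destructor fires when the worker thread exits
}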
 // A helper function for EnsureDropM.
 func getm() uintptr {
        return uintptr(unsafe.Pointer(getg().m))
 }
 
-var extram atomic.Uintptr
-var extraMCount uint32 // Protected by lockextra
-var extraMWaiters atomic.Uint32
+var (
+       // Locking linked list of extra M's, via mp.schedlink. Must be accessed
+       // only via lockextra/unlockextra.
+       //
+       // Can't be atomic.Pointer[m] because we use an invalid pointer as a
+       // "locked" sentinel value. M's on this list remain visible to the GC
+       // because their mp.curg is on allgs.
+       extraM atomic.Uintptr
+       // Number of M's in the extraM list.
+       extraMLength atomic.Uint32
+       // Number of waiters in lockextra.
+       extraMWaiters atomic.Uint32
+
+       // Number of extra M's in use by threads.
+       extraMInUse atomic.Uint32
+)
 
 // lockextra locks the extra list and returns the list head.
 // The caller must unlock the list by storing a new list head
@@ -2102,7 +2338,7 @@ func lockextra(nilokay bool) *m {
 
        incr := false
        for {
-               old := extram.Load()
+               old := extraM.Load()
                if old == locked {
                        osyield_no_g()
                        continue
@@ -2118,7 +2354,7 @@ func lockextra(nilokay bool) *m {
                        usleep_no_g(1)
                        continue
                }
-               if extram.CompareAndSwap(old, locked) {
+               if extraM.CompareAndSwap(old, locked) {
                        return (*m)(unsafe.Pointer(old))
                }
                osyield_no_g()
@@ -2127,8 +2363,41 @@ func lockextra(nilokay bool) *m {
 }
 
 //go:nosplit
-func unlockextra(mp *m) {
-       extram.Store(uintptr(unsafe.Pointer(mp)))
+func unlockextra(mp *m, delta int32) {
+       extraMLength.Add(delta)
+       extraM.Store(uintptr(unsafe.Pointer(mp)))
+}
+
+// Return an M from the extra M list. Returns last == true if the list becomes
+// empty because of this call.
+//
+// Spins waiting for an extra M, so caller must ensure that the list always
+// contains or will soon contain at least one M.
+//
+//go:nosplit
+func getExtraM() (mp *m, last bool) {
+       mp = lockextra(false)
+       extraMInUse.Add(1)
+       unlockextra(mp.schedlink.ptr(), -1)
+       return mp, mp.schedlink.ptr() == nil
+}
+
+// Returns an extra M back to the list. mp must be from getExtraM. Newly
+// allocated M's should use addExtraM.
+//
+//go:nosplit
+func putExtraM(mp *m) {
+       extraMInUse.Add(-1)
+       addExtraM(mp)
+}
+
+// Adds a newly allocated M to the extra M list.
+//
+//go:nosplit
+func addExtraM(mp *m) {
+       mnext := lockextra(true)
+       mp.schedlink.set(mnext)
+       unlockextra(mp, 1)
 }
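The extraM head above doubles as the list's lock: an invalid sentinel value in the atomic word means "locked", and unlock publishes the new head. A self-contained sketch of that technique follows (illustrative types; the keepAlive slice plays the role allgs plays for the real list, keeping nodes visible to the GC even though head stores them as a bare uintptr):

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
	"unsafe"
)

type node struct {
	val  int
	next *node
}

const locked = uintptr(1) // invalid pointer value used as the "locked" sentinel

var (
	head      atomic.Uintptr
	keepAlive []*node // keeps nodes visible to the GC; head hides them in a uintptr
)

// lockList spins until it can swap the head for the sentinel, then returns
// the old head. The head word is both the list and its lock.
func lockList() *node {
	for {
		old := head.Load()
		if old == locked {
			runtime.Gosched() // another goroutine holds the "lock"
			continue
		}
		if head.CompareAndSwap(old, locked) {
			return (*node)(unsafe.Pointer(old))
		}
	}
}

// unlockList publishes the new head, releasing the "lock".
func unlockList(n *node) {
	head.Store(uintptr(unsafe.Pointer(n)))
}

func push(v int) {
	n := &node{val: v}
	keepAlive = append(keepAlive, n)
	n.next = lockList()
	unlockList(n)
}

func main() {
	push(1)
	push(2)
	list := lockList()
	for n := list; n != nil; n = n.next {
		fmt.Println(n.val) // 2, then 1
	}
	unlockList(list) // put the list back
}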
 
 var (
@@ -2348,10 +2617,15 @@ func mspinning() {
 // Callers passing a non-nil P must call from a non-preemptible context. See
 // comment on acquirem below.
 //
+// Argument lockheld indicates whether the caller already acquired the
+// scheduler lock. Callers holding the lock when making the call must pass
+// true. The lock might be temporarily dropped, but will be reacquired before
+// returning.
+//
 // Must not have write barriers because this may be called without a P.
 //
 //go:nowritebarrierrec
-func startm(pp *p, spinning bool) {
+func startm(pp *p, spinning, lockheld bool) {
        // Disable preemption.
        //
        // Every owned P must have an owner that will eventually stop it in the
@@ -2369,7 +2643,9 @@ func startm(pp *p, spinning bool) {
        // startm. Callers passing a nil P may be preemptible, so we must
        // disable preemption before acquiring a P from pidleget below.
        mp := acquirem()
-       lock(&sched.lock)
+       if !lockheld {
+               lock(&sched.lock)
+       }
        if pp == nil {
                if spinning {
                        // TODO(prattmic): All remaining calls to this function
@@ -2379,7 +2655,9 @@ func startm(pp *p, spinning bool) {
                }
                pp, _ = pidleget(0)
                if pp == nil {
-                       unlock(&sched.lock)
+                       if !lockheld {
+                               unlock(&sched.lock)
+                       }
                        releasem(mp)
                        return
                }
@@ -2393,6 +2671,8 @@ func startm(pp *p, spinning bool) {
                // could find no idle P while checkdead finds a runnable G but
                // no running M's because this new M hasn't started yet, thus
                // throwing in an apparent deadlock.
+               // This apparent deadlock is possible when startm is called
+               // from sysmon, which doesn't count as a running M.
                //
                // Avoid this situation by pre-allocating the ID for the new M,
                // thus marking it as 'running' before we drop sched.lock. This
@@ -2407,12 +2687,18 @@ func startm(pp *p, spinning bool) {
                        fn = mspinning
                }
                newm(fn, pp, id)
+
+               if lockheld {
+                       lock(&sched.lock)
+               }
                // Ownership transfer of pp committed by start in newm.
                // Preemption is now safe.
                releasem(mp)
                return
        }
-       unlock(&sched.lock)
+       if !lockheld {
+               unlock(&sched.lock)
+       }
        if nmp.spinning {
                throw("startm: m is spinning")
        }
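A self-contained sketch of the lockheld contract documented above, using a plain sync.Mutex and illustrative names rather than startm and sched.lock: the callee may drop the lock for a slow path but always restores the caller's expectation before returning.

package main

import (
	"fmt"
	"sync"
)

var mu sync.Mutex

// doWork mirrors the lockheld contract: callers already holding mu pass
// lockheld=true; the function may drop mu for a slow path but always
// restores the caller's expectation before returning.
func doWork(lockheld, slow bool) {
	if !lockheld {
		mu.Lock()
	}
	// ... fast-path work under mu ...
	if slow {
		mu.Unlock() // the slow path must run without the lock
		fmt.Println("slow path")
		if lockheld {
			mu.Lock() // reacquire so the caller still holds mu on return
		}
		return
	}
	if !lockheld {
		mu.Unlock()
	}
}

func main() {
	mu.Lock()
	doWork(true, true) // caller holds mu across the call
	mu.Unlock()

	doWork(false, false) // callee manages mu itself
}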
@@ -2441,24 +2727,24 @@ func handoffp(pp *p) {
 
        // if it has local work, start it straight away
        if !runqempty(pp) || sched.runqsize != 0 {
-               startm(pp, false)
+               startm(pp, false, false)
                return
        }
        // if there's trace work to do, start it straight away
-       if (trace.enabled || trace.shutdown) && traceReaderAvailable() != nil {
-               startm(pp, false)
+       if (traceEnabled() || traceShuttingDown()) && traceReaderAvailable() != nil {
+               startm(pp, false, false)
                return
        }
        // if it has GC work, start it straight away
        if gcBlackenEnabled != 0 && gcMarkWorkAvailable(pp) {
-               startm(pp, false)
+               startm(pp, false, false)
                return
        }
        // no local work, check that there are no spinning/idle M's,
        // otherwise our help is not required
        if sched.nmspinning.Load()+sched.npidle.Load() == 0 && sched.nmspinning.CompareAndSwap(0, 1) { // TODO: fast atomic
                sched.needspinning.Store(0)
-               startm(pp, true)
+               startm(pp, true, false)
                return
        }
        lock(&sched.lock)
@@ -2480,14 +2766,14 @@ func handoffp(pp *p) {
        }
        if sched.runqsize != 0 {
                unlock(&sched.lock)
-               startm(pp, false)
+               startm(pp, false, false)
                return
        }
        // If this is the last running P and nobody is polling network,
        // need to wakeup another M to poll network.
        if sched.npidle.Load() == gomaxprocs-1 && sched.lastpoll.Load() != 0 {
                unlock(&sched.lock)
-               startm(pp, false)
+               startm(pp, false, false)
                return
        }
 
@@ -2536,7 +2822,7 @@ func wakep() {
        // see at least one running M (ours).
        unlock(&sched.lock)
 
-       startm(pp, true)
+       startm(pp, true, false)
 
        releasem(mp)
 }
@@ -2640,7 +2926,7 @@ func execute(gp *g, inheritTime bool) {
        casgstatus(gp, _Grunnable, _Grunning)
        gp.waitsince = 0
        gp.preempt = false
-       gp.stackguard0 = gp.stack.lo + _StackGuard
+       gp.stackguard0 = gp.stack.lo + stackGuard
        if !inheritTime {
                mp.p.ptr().schedtick++
        }
@@ -2651,13 +2937,15 @@ func execute(gp *g, inheritTime bool) {
                setThreadCPUProfiler(hz)
        }
 
-       if trace.enabled {
+       trace := traceAcquire()
+       if trace.ok() {
                // GoSysExit has to happen when we have a P, but before GoStart.
                // So we emit it here.
-               if gp.syscallsp != 0 && gp.sysblocktraced {
-                       traceGoSysExit(gp.sysexitticks)
+               if gp.syscallsp != 0 {
+                       trace.GoSysExit()
                }
-               traceGoStart()
+               trace.GoStart()
+               traceRelease(trace)
        }
 
        gogo(&gp.sched)
@@ -2691,11 +2979,15 @@ top:
        now, pollUntil, _ := checkTimers(pp, 0)
 
        // Try to schedule the trace reader.
-       if trace.enabled || trace.shutdown {
+       if traceEnabled() || traceShuttingDown() {
                gp := traceReader()
                if gp != nil {
+                       trace := traceAcquire()
                        casgstatus(gp, _Gwaiting, _Grunnable)
-                       traceGoUnpark(gp, 0)
+                       if trace.ok() {
+                               trace.GoUnpark(gp, 0)
+                               traceRelease(trace)
+                       }
                        return gp, false, true
                }
        }
@@ -2753,13 +3045,16 @@ top:
        // blocked thread (e.g. it has already returned from netpoll, but does
        // not set lastpoll yet), this thread will do blocking netpoll below
        // anyway.
-       if netpollinited() && netpollWaiters.Load() > 0 && sched.lastpoll.Load() != 0 {
-               if list := netpoll(0); !list.empty() { // non-blocking
+       if netpollinited() && netpollAnyWaiters() && sched.lastpoll.Load() != 0 {
+               if list, delta := netpoll(0); !list.empty() { // non-blocking
                        gp := list.pop()
                        injectglist(&list)
+                       netpollAdjustWaiters(delta)
+                       trace := traceAcquire()
                        casgstatus(gp, _Gwaiting, _Grunnable)
-                       if trace.enabled {
-                               traceGoUnpark(gp, 0)
+                       if trace.ok() {
+                               trace.GoUnpark(gp, 0)
+                               traceRelease(trace)
                        }
                        return gp, false, false
                }
@@ -2802,9 +3097,12 @@ top:
                if node != nil {
                        pp.gcMarkWorkerMode = gcMarkWorkerIdleMode
                        gp := node.gp.ptr()
+
+                       trace := traceAcquire()
                        casgstatus(gp, _Gwaiting, _Grunnable)
-                       if trace.enabled {
-                               traceGoUnpark(gp, 0)
+                       if trace.ok() {
+                               trace.GoUnpark(gp, 0)
+                               traceRelease(trace)
                        }
                        return gp, false, false
                }
@@ -2817,9 +3115,11 @@ top:
        // until a callback was triggered.
        gp, otherReady := beforeIdle(now, pollUntil)
        if gp != nil {
+               trace := traceAcquire()
                casgstatus(gp, _Gwaiting, _Grunnable)
-               if trace.enabled {
-                       traceGoUnpark(gp, 0)
+               if trace.ok() {
+                       trace.GoUnpark(gp, 0)
+                       traceRelease(trace)
                }
                return gp, false, false
        }
@@ -2870,7 +3170,7 @@ top:
        //
        // This applies to the following sources of work:
        //
-       // * Goroutines added to a per-P run queue.
+       // * Goroutines added to the global or a per-P run queue.
        // * New/modified-earlier timers on a per-P timer heap.
        // * Idle-priority GC work (barring golang.org/issue/19112).
        //
@@ -2912,7 +3212,24 @@ top:
                //
                // See https://go.dev/issue/43997.
 
-               // Check all runqueues once again.
+               // Check global and P runqueues again.
+
+               lock(&sched.lock)
+               if sched.runqsize != 0 {
+                       pp, _ := pidlegetSpinning(0)
+                       if pp != nil {
+                               gp := globrunqget(pp, 0)
+                               if gp == nil {
+                                       throw("global runq empty with non-zero runqsize")
+                               }
+                               unlock(&sched.lock)
+                               acquirep(pp)
+                               mp.becomeSpinning()
+                               return gp, false, false
+                       }
+               }
+               unlock(&sched.lock)
+
                pp := checkRunqsNoP(allpSnapshot, idlepMaskSnapshot)
                if pp != nil {
                        acquirep(pp)
@@ -2928,9 +3245,11 @@ top:
 
                        // Run the idle worker.
                        pp.gcMarkWorkerMode = gcMarkWorkerIdleMode
+                       trace := traceAcquire()
                        casgstatus(gp, _Gwaiting, _Grunnable)
-                       if trace.enabled {
-                               traceGoUnpark(gp, 0)
+                       if trace.ok() {
+                               trace.GoUnpark(gp, 0)
+                               traceRelease(trace)
                        }
                        return gp, false, false
                }
@@ -2945,7 +3264,7 @@ top:
        }
 
        // Poll network until next timer.
-       if netpollinited() && (netpollWaiters.Load() > 0 || pollUntil != 0) && sched.lastpoll.Swap(0) != 0 {
+       if netpollinited() && (netpollAnyWaiters() || pollUntil != 0) && sched.lastpoll.Swap(0) != 0 {
                sched.pollUntil.Store(pollUntil)
                if mp.p != 0 {
                        throw("findrunnable: netpoll with p")
@@ -2953,10 +3272,11 @@ top:
                if mp.spinning {
                        throw("findrunnable: netpoll with spinning")
                }
-               // Refresh now.
-               now = nanotime()
                delay := int64(-1)
                if pollUntil != 0 {
+                       if now == 0 {
+                               now = nanotime()
+                       }
                        delay = pollUntil - now
                        if delay < 0 {
                                delay = 0
@@ -2966,7 +3286,9 @@ top:
                        // When using fake time, just poll.
                        delay = 0
                }
-               list := netpoll(delay) // block until new work is available
+               list, delta := netpoll(delay) // block until new work is available
+               // Refresh now again, after potentially blocking.
+               now = nanotime()
                sched.pollUntil.Store(0)
                sched.lastpoll.Store(now)
                if faketime != 0 && list.empty() {
@@ -2980,14 +3302,18 @@ top:
                unlock(&sched.lock)
                if pp == nil {
                        injectglist(&list)
+                       netpollAdjustWaiters(delta)
                } else {
                        acquirep(pp)
                        if !list.empty() {
                                gp := list.pop()
                                injectglist(&list)
+                               netpollAdjustWaiters(delta)
+                               trace := traceAcquire()
                                casgstatus(gp, _Gwaiting, _Grunnable)
-                               if trace.enabled {
-                                       traceGoUnpark(gp, 0)
+                               if trace.ok() {
+                                       trace.GoUnpark(gp, 0)
+                                       traceRelease(trace)
                                }
                                return gp, false, false
                        }
@@ -3018,9 +3344,10 @@ func pollWork() bool {
        if !runqempty(p) {
                return true
        }
-       if netpollinited() && netpollWaiters.Load() > 0 && sched.lastpoll.Load() != 0 {
-               if list := netpoll(0); !list.empty() {
+       if netpollinited() && netpollAnyWaiters() && sched.lastpoll.Load() != 0 {
+               if list, delta := netpoll(0); !list.empty() {
                        injectglist(&list)
+                       netpollAdjustWaiters(delta)
                        return true
                }
        }
@@ -3254,10 +3581,12 @@ func injectglist(glist *gList) {
        if glist.empty() {
                return
        }
-       if trace.enabled {
+       trace := traceAcquire()
+       if trace.ok() {
                for gp := glist.head.ptr(); gp != nil; gp = gp.schedlink.ptr() {
-                       traceGoUnpark(gp, 0)
+                       trace.GoUnpark(gp, 0)
                }
+               traceRelease(trace)
        }
 
        // Mark all the goroutines as runnable before we put them
@@ -3289,8 +3618,8 @@ func injectglist(glist *gList) {
                                break
                        }
 
+                       startm(pp, false, true)
                        unlock(&sched.lock)
-                       startm(pp, false)
                        releasem(mp)
                }
        }
@@ -3357,6 +3686,18 @@ top:
 
        gp, inheritTime, tryWakeP := findRunnable() // blocks until work is available
 
+       if debug.dontfreezetheworld > 0 && freezing.Load() {
+               // See comment in freezetheworld. We don't want to perturb
+               // scheduler state, so we didn't gcstopm in findRunnable, but
+               // also don't want to allow new goroutines to run.
+               //
+               // Deadlock here rather than in the findRunnable loop so if
+               // findRunnable is stuck in a loop we don't perturb that
+               // either.
+               lock(&deadlock)
+               lock(&deadlock)
+       }
+
        // This thread is going to run a goroutine and is not spinning anymore,
        // so if it was marked as spinning we need to reset it now and potentially
        // start a new spinning M.
@@ -3485,13 +3826,16 @@ func parkunlock_c(gp *g, lock unsafe.Pointer) bool {
 func park_m(gp *g) {
        mp := getg().m
 
-       if trace.enabled {
-               traceGoPark(mp.waittraceev, mp.waittraceskip)
-       }
+       trace := traceAcquire()
 
        // N.B. Not using casGToWaiting here because the waitreason is
        // set by park_m's caller.
        casgstatus(gp, _Grunning, _Gwaiting)
+       if trace.ok() {
+               trace.GoPark(mp.waitTraceBlockReason, mp.waitTraceSkip)
+               traceRelease(trace)
+       }
+
        dropg()
 
        if fn := mp.waitunlockf; fn != nil {
@@ -3499,66 +3843,68 @@ func park_m(gp *g) {
                mp.waitunlockf = nil
                mp.waitlock = nil
                if !ok {
-                       if trace.enabled {
-                               traceGoUnpark(gp, 2)
-                       }
+                       trace := traceAcquire()
                        casgstatus(gp, _Gwaiting, _Grunnable)
+                       if trace.ok() {
+                               trace.GoUnpark(gp, 2)
+                               traceRelease(trace)
+                       }
                        execute(gp, true) // Schedule it back, never returns.
                }
        }
        schedule()
 }
 
-func goschedImpl(gp *g) {
+func goschedImpl(gp *g, preempted bool) {
+       trace := traceAcquire()
        status := readgstatus(gp)
        if status&^_Gscan != _Grunning {
                dumpgstatus(gp)
                throw("bad g status")
        }
        casgstatus(gp, _Grunning, _Grunnable)
+       if trace.ok() {
+               if preempted {
+                       trace.GoPreempt()
+               } else {
+                       trace.GoSched()
+               }
+               traceRelease(trace)
+       }
+
        dropg()
        lock(&sched.lock)
        globrunqput(gp)
        unlock(&sched.lock)
 
+       if mainStarted {
+               wakep()
+       }
+
        schedule()
 }
 
 // Gosched continuation on g0.
 func gosched_m(gp *g) {
-       if trace.enabled {
-               traceGoSched()
-       }
-       goschedImpl(gp)
+       goschedImpl(gp, false)
 }
 
 // goschedguarded is a forbidden-states-avoided version of gosched_m.
 func goschedguarded_m(gp *g) {
-
        if !canPreemptM(gp.m) {
                gogo(&gp.sched) // never return
        }
-
-       if trace.enabled {
-               traceGoSched()
-       }
-       goschedImpl(gp)
+       goschedImpl(gp, false)
 }
 
 func gopreempt_m(gp *g) {
-       if trace.enabled {
-               traceGoPreempt()
-       }
-       goschedImpl(gp)
+       goschedImpl(gp, true)
 }
 
 // preemptPark parks gp and puts it in _Gpreempted.
 //
 //go:systemstack
 func preemptPark(gp *g) {
-       if trace.enabled {
-               traceGoPark(traceEvGoBlock, 0)
-       }
        status := readgstatus(gp)
        if status&^_Gscan != _Grunning {
                dumpgstatus(gp)
@@ -3573,7 +3919,7 @@ func preemptPark(gp *g) {
                if !f.valid() {
                        throw("preempt at unknown pc")
                }
-               if f.flag&funcFlag_SPWRITE != 0 {
+               if f.flag&abi.FuncFlagSPWrite != 0 {
                        println("runtime: unexpected SPWRITE function", funcname(f), "in async preempt")
                        throw("preempt SPWRITE")
                }
@@ -3587,7 +3933,30 @@ func preemptPark(gp *g) {
        // transitions until we can dropg.
        casGToPreemptScan(gp, _Grunning, _Gscan|_Gpreempted)
        dropg()
+
+       // Be careful about how we trace this next event. The ordering
+       // is subtle.
+       //
+       // The moment we CAS into _Gpreempted, suspendG could CAS to
+       // _Gwaiting, do its work, and ready the goroutine. All of
+       // this could happen before we even get the chance to emit
+       // an event. The end result is that the events could appear
+       // out of order, and the tracer generally assumes the scheduler
+       // takes care of the ordering between GoPark and GoUnpark.
+       //
+       // The answer here is simple: emit the event while we still hold
+       // the _Gscan bit on the goroutine. We still need to traceAcquire
+       // and traceRelease across the CAS because the tracer could be
+       // what's calling suspendG in the first place, and we want the
+       // CAS and event emission to appear atomic to the tracer.
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.GoPark(traceBlockPreempted, 0)
+       }
        casfrom_Gscanstatus(gp, _Gscan|_Gpreempted, _Gpreempted)
+       if trace.ok() {
+               traceRelease(trace)
+       }
        schedule()
 }
 
@@ -3600,11 +3969,13 @@ func goyield() {
 }
 
 func goyield_m(gp *g) {
-       if trace.enabled {
-               traceGoPreempt()
-       }
+       trace := traceAcquire()
        pp := gp.m.p.ptr()
        casgstatus(gp, _Grunning, _Grunnable)
+       if trace.ok() {
+               trace.GoPreempt()
+               traceRelease(trace)
+       }
        dropg()
        runqput(pp, gp, false)
        schedule()
@@ -3615,8 +3986,10 @@ func goexit1() {
        if raceenabled {
                racegoend()
        }
-       if trace.enabled {
-               traceGoEnd()
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.GoEnd()
+               traceRelease(trace)
        }
        mcall(goexit0)
 }
@@ -3755,6 +4128,7 @@ func save(pc, sp uintptr) {
 //
 //go:nosplit
 func reentersyscall(pc, sp uintptr) {
+       trace := traceAcquire()
        gp := getg()
 
        // Disable preemption because during this function g is in Gsyscall status,
@@ -3773,6 +4147,11 @@ func reentersyscall(pc, sp uintptr) {
        gp.syscallsp = sp
        gp.syscallpc = pc
        casgstatus(gp, _Grunning, _Gsyscall)
+       if staticLockRanking {
+               // When doing static lock ranking casgstatus can call
+               // systemstack which clobbers g.sched.
+               save(pc, sp)
+       }
        if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp {
                systemstack(func() {
                        print("entersyscall inconsistent ", hex(gp.syscallsp), " [", hex(gp.stack.lo), ",", hex(gp.stack.hi), "]\n")
@@ -3780,8 +4159,11 @@ func reentersyscall(pc, sp uintptr) {
                })
        }
 
-       if trace.enabled {
-               systemstack(traceGoSysCall)
+       if trace.ok() {
+               systemstack(func() {
+                       trace.GoSysCall()
+                       traceRelease(trace)
+               })
                // systemstack itself clobbers g.sched.{pc,sp} and we might
                // need them later when the G is genuinely blocked in a
                // syscall
@@ -3800,7 +4182,6 @@ func reentersyscall(pc, sp uintptr) {
        }
 
        gp.m.syscalltick = gp.m.p.ptr().syscalltick
-       gp.sysblocktraced = true
        pp := gp.m.p.ptr()
        pp.m = 0
        gp.m.oldp.set(pp)
@@ -3839,9 +4220,11 @@ func entersyscall_gcwait() {
 
        lock(&sched.lock)
        if sched.stopwait > 0 && atomic.Cas(&pp.status, _Psyscall, _Pgcstop) {
-               if trace.enabled {
-                       traceGoSysBlock(pp)
-                       traceProcStop(pp)
+               trace := traceAcquire()
+               if trace.ok() {
+                       trace.GoSysBlock(pp)
+                       trace.ProcStop(pp)
+                       traceRelease(trace)
                }
                pp.syscalltick++
                if sched.stopwait--; sched.stopwait == 0 {
@@ -3861,7 +4244,6 @@ func entersyscallblock() {
        gp.throwsplit = true
        gp.stackguard0 = stackPreempt // see comment in entersyscall
        gp.m.syscalltick = gp.m.p.ptr().syscalltick
-       gp.sysblocktraced = true
        gp.m.p.ptr().syscalltick++
 
        // Leave SP around for GC and traceback.
@@ -3896,9 +4278,11 @@ func entersyscallblock() {
 }
 
 func entersyscallblock_handoff() {
-       if trace.enabled {
-               traceGoSysCall()
-               traceGoSysBlock(getg().m.p.ptr())
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.GoSysCall()
+               trace.GoSysBlock(getg().m.p.ptr())
+               traceRelease(trace)
        }
        handoffp(releasep())
 }
@@ -3937,15 +4321,21 @@ func exitsyscall() {
                                tryRecordGoroutineProfileWB(gp)
                        })
                }
-               if trace.enabled {
+               trace := traceAcquire()
+               if trace.ok() {
                        if oldp != gp.m.p.ptr() || gp.m.syscalltick != gp.m.p.ptr().syscalltick {
-                               systemstack(traceGoStart)
+                               systemstack(func() {
+                                       trace.GoStart()
+                               })
                        }
                }
                // There's a cpu for us, so we can run.
                gp.m.p.ptr().syscalltick++
                // We need to cas the status and scan before resuming...
                casgstatus(gp, _Gsyscall, _Grunning)
+               if trace.ok() {
+                       traceRelease(trace)
+               }
 
                // Garbage collector isn't running (since we are),
                // so okay to clear syscallsp.
@@ -3955,8 +4345,8 @@ func exitsyscall() {
                        // restore the preemption request in case we've cleared it in newstack
                        gp.stackguard0 = stackPreempt
                } else {
-                       // otherwise restore the real _StackGuard, we've spoiled it in entersyscall/entersyscallblock
-                       gp.stackguard0 = gp.stack.lo + _StackGuard
+                       // otherwise restore the real stackGuard, we've spoiled it in entersyscall/entersyscallblock
+                       gp.stackguard0 = gp.stack.lo + stackGuard
                }
                gp.throwsplit = false
 
@@ -3968,8 +4358,8 @@ func exitsyscall() {
                return
        }
 
-       gp.sysexitticks = 0
-       if trace.enabled {
+       trace := traceAcquire()
+       if trace.ok() {
                // Wait until the traceGoSysBlock event is emitted.
                // This ensures consistency of the trace (the goroutine is started after it is blocked).
                for oldp != nil && oldp.syscalltick == gp.m.syscalltick {
@@ -3979,7 +4369,8 @@ func exitsyscall() {
                // Tracing code can invoke write barriers that cannot run without a P.
                // So instead we remember the syscall exit time and emit the event
                // in execute when we have a P.
-               gp.sysexitticks = cputicks()
+               gp.trace.sysExitTime = traceClockNow()
+               traceRelease(trace)
        }
 
        gp.m.locks--
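
The syscall-exit paths above lean on two small tricks: spin (osyield) until the corresponding block event has been published, keyed on syscalltick, and, when the exit event cannot be written yet, stamp the time now (gp.trace.sysExitTime) and emit it later from execute. A rough, self-contained sketch of the ordering trick follows; every identifier in it is invented for illustration, and it is not how the runtime implements the loop.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

func main() {
	var (
		mu   sync.Mutex
		log  []string
		tick atomic.Uint32 // rough analogue of p.syscalltick
	)
	emit := func(ev string) {
		mu.Lock()
		log = append(log, ev)
		mu.Unlock()
	}

	var wg sync.WaitGroup
	wg.Add(2)

	// One side publishes the "block" event and only then advances the tick.
	go func() {
		defer wg.Done()
		emit("SysBlock")
		tick.Add(1)
	}()

	// The other side refuses to publish "exit" until the tick has moved,
	// so the two events can never appear out of order in the log.
	go func() {
		defer wg.Done()
		for tick.Load() == 0 {
			// busy-wait, like the osyield loop above
		}
		emit("SysExit")
	}()

	wg.Wait()
	fmt.Println(log) // always [SysBlock SysExit]
}
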
@@ -4020,15 +4411,19 @@ func exitsyscallfast(oldp *p) bool {
                var ok bool
                systemstack(func() {
                        ok = exitsyscallfast_pidle()
-                       if ok && trace.enabled {
-                               if oldp != nil {
-                                       // Wait till traceGoSysBlock event is emitted.
-                                       // This ensures consistency of the trace (the goroutine is started after it is blocked).
-                                       for oldp.syscalltick == gp.m.syscalltick {
-                                               osyield()
+                       if ok {
+                               trace := traceAcquire()
+                               if trace.ok() {
+                                       if oldp != nil {
+                                               // Wait until the traceGoSysBlock event is emitted.
+                                               // This ensures consistency of the trace (the goroutine is started after it is blocked).
+                                               for oldp.syscalltick == gp.m.syscalltick {
+                                                       osyield()
+                                               }
                                        }
+                                       trace.GoSysExit()
+                                       traceRelease(trace)
                                }
-                               traceGoSysExit(0)
                        }
                })
                if ok {
@@ -4046,15 +4441,17 @@ func exitsyscallfast(oldp *p) bool {
 func exitsyscallfast_reacquired() {
        gp := getg()
        if gp.m.syscalltick != gp.m.p.ptr().syscalltick {
-               if trace.enabled {
+               trace := traceAcquire()
+               if trace.ok() {
                        // The p was retaken and then entered a syscall again (since gp.m.syscalltick has changed).
                        // traceGoSysBlock for this syscall was already emitted,
                        // but here we effectively retake the p from the new syscall running on the same p.
                        systemstack(func() {
                                // Denote blocking of the new syscall.
-                               traceGoSysBlock(gp.m.p.ptr())
+                               trace.GoSysBlock(gp.m.p.ptr())
                                // Denote completion of the current syscall.
-                               traceGoSysExit(0)
+                               trace.GoSysExit()
+                               traceRelease(trace)
                        })
                }
                gp.m.p.ptr().syscalltick++
@@ -4137,7 +4534,7 @@ func syscall_runtime_BeforeFork() {
 
        // This function is called before fork in syscall package.
        // Code between fork and exec must not allocate memory nor even try to grow the stack.
-       // Here we spoil g->_StackGuard to reliably detect any attempts to grow stack.
+       // Here we spoil g.stackguard0 to reliably detect any attempts to grow the stack.
        // runtime_AfterFork will undo this in parent process, but not in child.
        gp.stackguard0 = stackFork
 }
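
Spoiling stackguard0 here is an instance of a general pattern: rather than auditing every path between fork and exec, poison the guard so that any stack-growth attempt fails loudly at the point of violation. A toy version of the idea, with invented names and an ordinary buffer in place of the goroutine stack:

package main

import "fmt"

// guardedBuf hands out write access, but can be "poisoned" around a
// critical window so that any growth attempt is caught immediately
// instead of silently misbehaving.
type guardedBuf struct {
	data     []byte
	poisoned bool
}

func (b *guardedBuf) write(p []byte) {
	if b.poisoned && len(b.data)+len(p) > cap(b.data) {
		panic("growth attempted inside the no-grow window")
	}
	b.data = append(b.data, p...)
}

func main() {
	b := &guardedBuf{data: make([]byte, 0, 8)}
	b.write([]byte("ok")) // fits within capacity: allowed even when poisoned

	b.poisoned = true // like setting gp.stackguard0 = stackFork above
	defer func() { fmt.Println("recovered:", recover()) }()
	b.write([]byte("this needs more room")) // would grow: trips the guard
}
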
@@ -4150,7 +4547,7 @@ func syscall_runtime_AfterFork() {
        gp := getg().m.curg
 
        // See the comments in beforefork.
-       gp.stackguard0 = gp.stack.lo + _StackGuard
+       gp.stackguard0 = gp.stack.lo + stackGuard
 
        msigrestore(gp.m.sigmask)
 
@@ -4220,11 +4617,11 @@ func syscall_runtime_AfterExec() {
 func malg(stacksize int32) *g {
        newg := new(g)
        if stacksize >= 0 {
-               stacksize = round2(_StackSystem + stacksize)
+               stacksize = round2(stackSystem + stacksize)
                systemstack(func() {
                        newg.stack = stackalloc(uint32(stacksize))
                })
-               newg.stackguard0 = newg.stack.lo + _StackGuard
+               newg.stackguard0 = newg.stack.lo + stackGuard
                newg.stackguard1 = ^uintptr(0)
                // Clear the bottom word of the stack. We record g
                // there on gsignal stack during VDSO on ARM and ARM64.
@@ -4263,7 +4660,7 @@ func newproc1(fn *funcval, callergp *g, callerpc uintptr) *g {
        pp := mp.p.ptr()
        newg := gfget(pp)
        if newg == nil {
-               newg = malg(_StackMin)
+               newg = malg(stackMin)
                casgstatus(newg, _Gidle, _Gdead)
                allgadd(newg) // publishes with a g->status of Gdead so GC scanner doesn't look at uninitialized stack.
        }
@@ -4278,12 +4675,14 @@ func newproc1(fn *funcval, callergp *g, callerpc uintptr) *g {
        totalSize := uintptr(4*goarch.PtrSize + sys.MinFrameSize) // extra space in case of reads slightly beyond frame
        totalSize = alignUp(totalSize, sys.StackAlign)
        sp := newg.stack.hi - totalSize
-       spArg := sp
        if usesLR {
                // caller's LR
                *(*uintptr)(unsafe.Pointer(sp)) = 0
                prepGoExitFrame(sp)
-               spArg += sys.MinFrameSize
+       }
+       if GOARCH == "arm64" {
+               // caller's FP
+               *(*uintptr)(unsafe.Pointer(sp - goarch.PtrSize)) = 0
        }
 
        memclrNoHeapPointers(unsafe.Pointer(&newg.sched), unsafe.Sizeof(newg.sched))
@@ -4317,9 +4716,11 @@ func newproc1(fn *funcval, callergp *g, callerpc uintptr) *g {
        if newg.trackingSeq%gTrackingPeriod == 0 {
                newg.tracking = true
        }
-       casgstatus(newg, _Gdead, _Grunnable)
        gcController.addScannableStack(pp, int64(newg.stack.hi-newg.stack.lo))
 
+       // Get a goid and switch to runnable. Make all this atomic to the tracer.
+       trace := traceAcquire()
+       casgstatus(newg, _Gdead, _Grunnable)
        if pp.goidcache == pp.goidcacheend {
                // Sched.goidgen is the last allocated id,
                // this batch must be [sched.goidgen+1, sched.goidgen+GoidCacheBatch].
@@ -4330,17 +4731,21 @@ func newproc1(fn *funcval, callergp *g, callerpc uintptr) *g {
        }
        newg.goid = pp.goidcache
        pp.goidcache++
+       if trace.ok() {
+               trace.GoCreate(newg, newg.startpc)
+               traceRelease(trace)
+       }
+
+       // Set up race context.
        if raceenabled {
                newg.racectx = racegostart(callerpc)
+               newg.raceignore = 0
                if newg.labels != nil {
                        // See note in proflabel.go on labelSync's role in synchronizing
                        // with the reads in the signal handler.
                        racereleasemergeg(newg, unsafe.Pointer(&labelSync))
                }
        }
-       if trace.enabled {
-               traceGoCreate(newg, newg.startpc)
-       }
        releasem(mp)
 
        return newg
@@ -4467,7 +4872,7 @@ retry:
                systemstack(func() {
                        gp.stack = stackalloc(startingStackSize)
                })
-               gp.stackguard0 = gp.stack.lo + _StackGuard
+               gp.stackguard0 = gp.stack.lo + stackGuard
        } else {
                if raceenabled {
                        racemalloc(unsafe.Pointer(gp.stack.lo), gp.stack.hi-gp.stack.lo)
@@ -4891,6 +5296,7 @@ func (pp *p) destroy() {
                pp.sudogbuf[i] = nil
        }
        pp.sudogcache = pp.sudogbuf[:0]
+       pp.pinnerCache = nil
        for j := range pp.deferpoolbuf {
                pp.deferpoolbuf[j] = nil
        }
@@ -4948,8 +5354,10 @@ func procresize(nprocs int32) *p {
        if old < 0 || nprocs <= 0 {
                throw("procresize: invalid arg")
        }
-       if trace.enabled {
-               traceGomaxprocs(nprocs)
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.Gomaxprocs(nprocs)
+               traceRelease(trace)
        }
 
        // update statistics
@@ -5014,12 +5422,14 @@ func procresize(nprocs int32) *p {
                // because p.destroy itself has write barriers, so we
                // need to do that from a valid P.
                if gp.m.p != 0 {
-                       if trace.enabled {
+                       trace := traceAcquire()
+                       if trace.ok() {
                                // Pretend that we were descheduled
                                // and then scheduled again to keep
                                // the trace sane.
-                               traceGoSched()
-                               traceProcStop(gp.m.p.ptr())
+                               trace.GoSched()
+                               trace.ProcStop(gp.m.p.ptr())
+                               traceRelease(trace)
                        }
                        gp.m.p.ptr().m = 0
                }
@@ -5028,8 +5438,10 @@ func procresize(nprocs int32) *p {
                pp.m = 0
                pp.status = _Pidle
                acquirep(pp)
-               if trace.enabled {
-                       traceGoStart()
+               trace := traceAcquire()
+               if trace.ok() {
+                       trace.GoStart()
+                       traceRelease(trace)
                }
        }
 
@@ -5093,8 +5505,10 @@ func acquirep(pp *p) {
        // from a potentially stale mcache.
        pp.mcache.prepareForSweep()
 
-       if trace.enabled {
-               traceProcStart()
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.ProcStart()
+               traceRelease(trace)
        }
 }
 
@@ -5135,8 +5549,10 @@ func releasep() *p {
                print("releasep: m=", gp.m, " m->p=", gp.m.p.ptr(), " p->m=", hex(pp.m), " p->status=", pp.status, "\n")
                throw("releasep: invalid p state")
        }
-       if trace.enabled {
-               traceProcStop(gp.m.p.ptr())
+       trace := traceAcquire()
+       if trace.ok() {
+               trace.ProcStop(gp.m.p.ptr())
+               traceRelease(trace)
        }
        gp.m.p = 0
        pp.m = 0
@@ -5179,13 +5595,8 @@ func checkdead() {
        // accommodate callbacks created by syscall.NewCallback. See issue #6751
        // for details.)
        var run0 int32
-       if !iscgo && cgoHasExtraM {
-               mp := lockextra(true)
-               haveExtraM := extraMCount > 0
-               unlockextra(mp)
-               if haveExtraM {
-                       run0 = 1
-               }
+       if !iscgo && cgoHasExtraM && extraMLength.Load() > 0 {
+               run0 = 1
        }
 
        run := mcount() - sched.nmidle - sched.nmidlelocked - sched.nmsys
@@ -5194,6 +5605,7 @@ func checkdead() {
        }
        if run < 0 {
                print("runtime: checkdead: nmidle=", sched.nmidle, " nmidlelocked=", sched.nmidlelocked, " mcount=", mcount(), " nmsys=", sched.nmsys, "\n")
+               unlock(&sched.lock)
                throw("checkdead: inconsistent counts")
        }
 
@@ -5211,6 +5623,7 @@ func checkdead() {
                        _Grunning,
                        _Gsyscall:
                        print("runtime: checkdead: find g ", gp.goid, " in status ", s, "\n")
+                       unlock(&sched.lock)
                        throw("checkdead: runnable g")
                }
        })
@@ -5229,12 +5642,14 @@ func checkdead() {
                        if pp == nil {
                                // There should always be a free P since
                                // nothing is running.
+                               unlock(&sched.lock)
                                throw("checkdead: no p for timer")
                        }
                        mp := mget()
                        if mp == nil {
                                // There should always be a free M since
                                // nothing is running.
+                               unlock(&sched.lock)
                                throw("checkdead: no m for timer")
                        }
                        // M must be spinning to steal. We set this to be
@@ -5357,7 +5772,7 @@ func sysmon() {
                lastpoll := sched.lastpoll.Load()
                if netpollinited() && lastpoll != 0 && lastpoll+10*1000*1000 < now {
                        sched.lastpoll.CompareAndSwap(lastpoll, now)
-                       list := netpoll(0) // non-blocking - returns list of goroutines
+                       list, delta := netpoll(0) // non-blocking - returns list of goroutines
                        if !list.empty() {
                                // Need to decrement number of idle locked M's
                                // (pretending that one more is running) before injectglist.
@@ -5369,6 +5784,7 @@ func sysmon() {
                                incidlelocked(-1)
                                injectglist(&list)
                                incidlelocked(1)
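+                               // netpoll also reported a change in the number of
+                               // goroutines blocked in the poller; apply that
+                               // adjustment now that the ready list is injected.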
+                               netpollAdjustWaiters(delta)
                        }
                }
                if GOOS == "netbsd" && needSysmonWorkaround {
@@ -5388,7 +5804,7 @@ func sysmon() {
                        // See issue 42515 and
                        // https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=50094.
                        if next := timeSleepUntil(); next < now {
-                               startm(nil, false)
+                               startm(nil, false, false)
                        }
                }
                if scavenger.sysmonWake.Load() != 0 {
@@ -5483,9 +5899,11 @@ func retake(now int64) uint32 {
                        // increment nmidle and report deadlock.
                        incidlelocked(-1)
                        if atomic.Cas(&pp.status, s, _Pidle) {
-                               if trace.enabled {
-                                       traceGoSysBlock(pp)
-                                       traceProcStop(pp)
+                               trace := traceAcquire()
+                               if trace.ok() {
+                                       trace.GoSysBlock(pp)
+                                       trace.ProcStop(pp)
+                                       traceRelease(trace)
                                }
                                n++
                                pp.syscalltick++
@@ -5660,7 +6078,7 @@ func schedEnableUser(enable bool) {
                globrunqputbatch(&sched.disable.runnable, n)
                unlock(&sched.lock)
                for ; n != 0 && sched.npidle.Load() != 0; n-- {
-                       startm(nil, false)
+                       startm(nil, false, false)
                }
        } else {
                unlock(&sched.lock)