// list can arrive a few different ways, but it will always
// contain the init tasks computed by the linker for all the
// packages in the program (excluding those added at runtime
- // by package plugin).
- for _, m := range activeModules() {
+ // by package plugin). Run through the modules in dependency
+ // order (the order they are initialized by the dynamic loader,
+ // i.e. the order in which they are added to the moduledata linked
+ // list).
+ for m := &firstmoduledata; m != nil; m = m.next {
doInit(m.inittasks)
}
//go:nosplit
//go:nowritebarrierrec
func badmorestackg0() {
- writeErrStr("fatal: morestack on g0\n")
+ if !crashStackImplemented {
+ writeErrStr("fatal: morestack on g0\n")
+ return
+ }
+
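+ // Capture g before switching stacks: once on the crash stack, getg()
+ // returns gcrash, and the traceback below must describe the
+ // overflowing g0.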
+ g := getg()
+ switchToCrashStack(func() {
+ print("runtime: morestack on g0, stack [", hex(g.stack.lo), " ", hex(g.stack.hi), "], sp=", hex(g.sched.sp), ", called from\n")
+ g.m.traceback = 2 // include pc and sp in stack trace
+ traceback1(g.sched.pc, g.sched.sp, g.sched.lr, g, 0)
+ print("\n")
+
+ throw("morestack on g0")
+ })
}
//go:nosplit
throw("ctxt != 0")
}
+// gcrash is a fake g that can be used when crashing due to bad
+// stack conditions.
+var gcrash g
+
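+// crashingG holds the g that currently owns the crash stack, if any.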
+var crashingG atomic.Pointer[g]
+
+// Switch to crashstack and call fn, with special handling of
+// concurrent and recursive cases.
+//
+// Nosplit as it is called in a bad stack condition (we know
+// morestack would fail).
+//
+//go:nosplit
+//go:nowritebarrierrec
+func switchToCrashStack(fn func()) {
+ me := getg()
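+ // Only the first crashing g wins this CAS and gets to use the crash
+ // stack; later arrivals fall through to the cases below.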
+ if crashingG.CompareAndSwapNoWB(nil, me) {
+ switchToCrashStack0(fn) // should never return
+ abort()
+ }
+ if crashingG.Load() == me {
+  // Recursive crashing. Too bad.
+ writeErrStr("fatal: recursive switchToCrashStack\n")
+ abort()
+ }
+ // Another g is crashing. Give it some time; hopefully it will finish its traceback.
+ usleep_no_g(100)
+ writeErrStr("fatal: concurrent switchToCrashStack\n")
+ abort()
+}
+
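+// crashStackImplemented reports whether switchToCrashStack0 has an
+// assembly implementation for this architecture; on other GOARCHes
+// badmorestackg0 falls back to the plain error message.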
+const crashStackImplemented = GOARCH == "amd64" || GOARCH == "arm64" || GOARCH == "mips64" || GOARCH == "mips64le" || GOARCH == "riscv64"
+
+//go:noescape
+func switchToCrashStack0(fn func()) // in assembly
+
func lockedOSThread() bool {
gp := getg()
return gp.lockedm != 0 && gp.m.lockedg != 0
parsedebugvars()
gcinit()
+ // Allocate stack space that can be used when crashing due to bad stack
+ // conditions, e.g. morestack on g0.
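+ // The guards sit just 1000 bytes above stack.lo, leaving almost the
+ // entire 16 KB usable before a stack check trips.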
+ gcrash.stack = stackalloc(16384)
+ gcrash.stackguard0 = gcrash.stack.lo + 1000
+ gcrash.stackguard1 = gcrash.stack.lo + 1000
+
// if disableMemoryProfiling is set, update MemProfileRate to 0 to turn off memprofile.
// Note: parsedebugvars may update MemProfileRate, but when disableMemoryProfiling is
// set to true by the linker, it means that nothing is consuming the profile, it is
// Mark gp ready to run.
func ready(gp *g, traceskip int, next bool) {
- if traceEnabled() {
- traceGoUnpark(gp, traceskip)
- }
-
status := readgstatus(gp)
// Mark runnable.
}
// status is Gwaiting or Gscanwaiting, make Grunnable and put on runq
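+ // Acquire the tracer before the status CAS so the transition and its
+ // GoUnpark event appear atomic to the tracer.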
+ trace := traceAcquire()
casgstatus(gp, _Gwaiting, _Grunnable)
+ if trace.ok() {
+ trace.GoUnpark(gp, traceskip)
+ traceRelease(trace)
+ }
runqput(mp.p.ptr(), gp, next)
wakep()
releasem(mp)
// Holding worldsema causes any other goroutines invoking
// stopTheWorld to block.
func stopTheWorldWithSema(reason stwReason) {
- if traceEnabled() {
- traceSTWStart(reason)
+ trace := traceAcquire()
+ if trace.ok() {
+ trace.STWStart(reason)
+ traceRelease(trace)
}
gp := getg()
gp.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic.
sched.stopwait--
// try to retake all P's in Psyscall status
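+ // Hold the tracer across the whole loop so each P's GoSysBlock and
+ // ProcStop events are emitted under a single acquisition.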
+ trace = traceAcquire()
for _, pp := range allp {
s := pp.status
if s == _Psyscall && atomic.Cas(&pp.status, s, _Pgcstop) {
- if traceEnabled() {
- traceGoSysBlock(pp)
- traceProcStop(pp)
+ if trace.ok() {
+ trace.GoSysBlock(pp)
+ trace.ProcStop(pp)
}
pp.syscalltick++
sched.stopwait--
}
}
+ if trace.ok() {
+ traceRelease(trace)
+ }
+
// stop idle P's
now := nanotime()
for {
// Capture start-the-world time before doing clean-up tasks.
startTime := nanotime()
- if traceEnabled() {
- traceSTWDone()
+ trace := traceAcquire()
+ if trace.ok() {
+ trace.STWDone()
+ traceRelease(trace)
}
// Wakeup an additional proc in case we have excessive runnable goroutines
// but is somewhat arbitrary.
size := gp.stack.hi
if size == 0 {
- size = 8192 * sys.StackGuardMultiplier
+ size = 16384 * sys.StackGuardMultiplier
}
gp.stack.hi = uintptr(noescape(unsafe.Pointer(&size)))
gp.stack.lo = gp.stack.hi - size + 1024
// Force Ps currently in _Psyscall into _Pidle and hand them
// off to induce safe point function execution.
+ trace := traceAcquire()
for _, p2 := range allp {
s := p2.status
if s == _Psyscall && p2.runSafePointFn == 1 && atomic.Cas(&p2.status, s, _Pidle) {
- if traceEnabled() {
- traceGoSysBlock(p2)
- traceProcStop(p2)
+ if trace.ok() {
+ trace.GoSysBlock(p2)
+ trace.ProcStop(p2)
}
p2.syscalltick++
handoffp(p2)
}
}
+ if trace.ok() {
+ traceRelease(trace)
+ }
// Wait for remaining Ps to run fn.
if wait {
if iscgo || mStackIsSystemAllocated() {
mp.g0 = malg(-1)
} else {
- mp.g0 = malg(8192 * sys.StackGuardMultiplier)
+ mp.g0 = malg(16384 * sys.StackGuardMultiplier)
}
mp.g0.m = mp
osSetupTLS(mp)
// Install g (= m->g0) and set the stack bounds
- // to match the current stack. If we don't actually know
- // how big the stack is, like we don't know how big any
- // scheduling stack is, but we assume there's at least 32 kB.
- // If we can get a more accurate stack bound from pthread,
- // use that.
+ // to match the current stack.
setg(mp.g0)
- gp := getg()
- gp.stack.hi = getcallersp() + 1024
- gp.stack.lo = getcallersp() - 32*1024
- if !signal && _cgo_getstackbound != nil {
- // Don't adjust if called from the signal handler.
- // We are on the signal stack, not the pthread stack.
- // (We could get the stack bounds from sigaltstack, but
- // we're getting out of the signal handler very soon
- // anyway. Not worth it.)
- var bounds [2]uintptr
- asmcgocall(_cgo_getstackbound, unsafe.Pointer(&bounds))
- // getstackbound is an unsupported no-op on Windows.
- if bounds[0] != 0 {
- gp.stack.lo = bounds[0]
- gp.stack.hi = bounds[1]
- }
- }
- gp.stackguard0 = gp.stack.lo + stackGuard
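+ // callbackUpdateSystemStack derives g0's stack bounds from the current
+ // SP, consulting _cgo_getstackbound where available, replacing the
+ // fixed 32 kB guess previously hard-coded here.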
+ sp := getcallersp()
+ callbackUpdateSystemStack(mp, sp, signal)
// Should mark we are already in Go now.
// Otherwise, we may call needm again when we get a signal, before cgocallbackg1,
if raceenabled {
gp.racectx = racegostart(abi.FuncPCABIInternal(newextram) + sys.PCQuantum)
}
- if traceEnabled() {
- traceOneNewExtraM(gp)
+ trace := traceAcquire()
+ if trace.ok() {
+ trace.OneNewExtraM(gp)
+ traceRelease(trace)
}
// put on allg for garbage collector
allgadd(gp)
// So that the destructor would invoke dropm while the non-Go thread is exiting.
// This is much faster since it avoids expensive signal-related syscalls.
//
-// NOTE: this always runs without a P, so, nowritebarrierrec required.
+// This always runs without a P, so //go:nowritebarrierrec is required.
+//
+// This may run with a different stack than was recorded in g0 (there is no
+// call to callbackUpdateSystemStack prior to dropm), so this must be
+// //go:nosplit to avoid the stack bounds check.
//
//go:nowritebarrierrec
+//go:nosplit
func dropm() {
// Clear m and g, and return m to the extra list.
// After the call to setg we can only call nosplit functions
setg(nil)
+ // Clear g0 stack bounds to ensure that needm always refreshes the
+ // bounds when reusing this M.
+ g0 := mp.g0
+ g0.stack.hi = 0
+ g0.stack.lo = 0
+ g0.stackguard0 = 0
+ g0.stackguard1 = 0
+
putExtraM(mp)
msigrestore(sigmask)
setThreadCPUProfiler(hz)
}
- if traceEnabled() {
+ trace := traceAcquire()
+ if trace.ok() {
// GoSysExit has to happen when we have a P, but before GoStart.
// So we emit it here.
if gp.syscallsp != 0 {
- traceGoSysExit()
+ trace.GoSysExit()
}
- traceGoStart()
+ trace.GoStart()
+ traceRelease(trace)
}
gogo(&gp.sched)
if traceEnabled() || traceShuttingDown() {
gp := traceReader()
if gp != nil {
+ trace := traceAcquire()
casgstatus(gp, _Gwaiting, _Grunnable)
- traceGoUnpark(gp, 0)
+ if trace.ok() {
+ trace.GoUnpark(gp, 0)
+ traceRelease(trace)
+ }
return gp, false, true
}
}
gp := list.pop()
injectglist(&list)
netpollAdjustWaiters(delta)
+ trace := traceAcquire()
casgstatus(gp, _Gwaiting, _Grunnable)
- if traceEnabled() {
- traceGoUnpark(gp, 0)
+ if trace.ok() {
+ trace.GoUnpark(gp, 0)
+ traceRelease(trace)
}
return gp, false, false
}
if node != nil {
pp.gcMarkWorkerMode = gcMarkWorkerIdleMode
gp := node.gp.ptr()
+
+ trace := traceAcquire()
casgstatus(gp, _Gwaiting, _Grunnable)
- if traceEnabled() {
- traceGoUnpark(gp, 0)
+ if trace.ok() {
+ trace.GoUnpark(gp, 0)
+ traceRelease(trace)
}
return gp, false, false
}
// until a callback was triggered.
gp, otherReady := beforeIdle(now, pollUntil)
if gp != nil {
+ trace := traceAcquire()
casgstatus(gp, _Gwaiting, _Grunnable)
- if traceEnabled() {
- traceGoUnpark(gp, 0)
+ if trace.ok() {
+ trace.GoUnpark(gp, 0)
+ traceRelease(trace)
}
return gp, false, false
}
// Run the idle worker.
pp.gcMarkWorkerMode = gcMarkWorkerIdleMode
+ trace := traceAcquire()
casgstatus(gp, _Gwaiting, _Grunnable)
- if traceEnabled() {
- traceGoUnpark(gp, 0)
+ if trace.ok() {
+ trace.GoUnpark(gp, 0)
+ traceRelease(trace)
}
return gp, false, false
}
gp := list.pop()
injectglist(&list)
netpollAdjustWaiters(delta)
+ trace := traceAcquire()
casgstatus(gp, _Gwaiting, _Grunnable)
- if traceEnabled() {
- traceGoUnpark(gp, 0)
+ if trace.ok() {
+ trace.GoUnpark(gp, 0)
+ traceRelease(trace)
}
return gp, false, false
}
if glist.empty() {
return
}
- if traceEnabled() {
+ trace := traceAcquire()
+ if trace.ok() {
for gp := glist.head.ptr(); gp != nil; gp = gp.schedlink.ptr() {
- traceGoUnpark(gp, 0)
+ trace.GoUnpark(gp, 0)
}
+ traceRelease(trace)
}
// Mark all the goroutines as runnable before we put them
func park_m(gp *g) {
mp := getg().m
- if traceEnabled() {
- traceGoPark(mp.waitTraceBlockReason, mp.waitTraceSkip)
- }
+ trace := traceAcquire()
// N.B. Not using casGToWaiting here because the waitreason is
// set by park_m's caller.
casgstatus(gp, _Grunning, _Gwaiting)
+ if trace.ok() {
+ trace.GoPark(mp.waitTraceBlockReason, mp.waitTraceSkip)
+ traceRelease(trace)
+ }
+
dropg()
if fn := mp.waitunlockf; fn != nil {
mp.waitunlockf = nil
mp.waitlock = nil
if !ok {
- if traceEnabled() {
- traceGoUnpark(gp, 2)
- }
+ trace := traceAcquire()
casgstatus(gp, _Gwaiting, _Grunnable)
+ if trace.ok() {
+ trace.GoUnpark(gp, 2)
+ traceRelease(trace)
+ }
execute(gp, true) // Schedule it back, never returns.
}
}
schedule()
}
-func goschedImpl(gp *g) {
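+// goschedImpl is the common yield path for Gosched and preemption;
+// preempted selects which trace event is emitted (GoPreempt or GoSched).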
+func goschedImpl(gp *g, preempted bool) {
+ trace := traceAcquire()
status := readgstatus(gp)
if status&^_Gscan != _Grunning {
dumpgstatus(gp)
throw("bad g status")
}
casgstatus(gp, _Grunning, _Grunnable)
+ if trace.ok() {
+ if preempted {
+ trace.GoPreempt()
+ } else {
+ trace.GoSched()
+ }
+ traceRelease(trace)
+ }
+
dropg()
lock(&sched.lock)
globrunqput(gp)
// Gosched continuation on g0.
func gosched_m(gp *g) {
- if traceEnabled() {
- traceGoSched()
- }
- goschedImpl(gp)
+ goschedImpl(gp, false)
}
// goschedguarded is a forbidden-states-avoided version of gosched_m.
func goschedguarded_m(gp *g) {
-
if !canPreemptM(gp.m) {
gogo(&gp.sched) // never return
}
-
- if traceEnabled() {
- traceGoSched()
- }
- goschedImpl(gp)
+ goschedImpl(gp, false)
}
func gopreempt_m(gp *g) {
- if traceEnabled() {
- traceGoPreempt()
- }
- goschedImpl(gp)
+ goschedImpl(gp, true)
}
// preemptPark parks gp and puts it in _Gpreempted.
//
//go:systemstack
func preemptPark(gp *g) {
- if traceEnabled() {
- traceGoPark(traceBlockPreempted, 0)
- }
status := readgstatus(gp)
if status&^_Gscan != _Grunning {
dumpgstatus(gp)
// transitions until we can dropg.
casGToPreemptScan(gp, _Grunning, _Gscan|_Gpreempted)
dropg()
+
+ // Be careful about how we trace this next event. The ordering
+ // is subtle.
+ //
+ // The moment we CAS into _Gpreempted, suspendG could CAS to
+ // _Gwaiting, do its work, and ready the goroutine. All of
+ // this could happen before we even get the chance to emit
+ // an event. The end result is that the events could appear
+ // out of order, and the tracer generally assumes the scheduler
+ // takes care of the ordering between GoPark and GoUnpark.
+ //
+ // The answer here is simple: emit the event while we still hold
+ // the _Gscan bit on the goroutine. We still need to traceAcquire
+ // and traceRelease across the CAS because the tracer could be
+ // what's calling suspendG in the first place, and we want the
+ // CAS and event emission to appear atomic to the tracer.
+ trace := traceAcquire()
+ if trace.ok() {
+ trace.GoPark(traceBlockPreempted, 0)
+ }
casfrom_Gscanstatus(gp, _Gscan|_Gpreempted, _Gpreempted)
+ if trace.ok() {
+ traceRelease(trace)
+ }
schedule()
}
}
func goyield_m(gp *g) {
- if traceEnabled() {
- traceGoPreempt()
- }
+ trace := traceAcquire()
pp := gp.m.p.ptr()
casgstatus(gp, _Grunning, _Grunnable)
+ if trace.ok() {
+ trace.GoPreempt()
+ traceRelease(trace)
+ }
dropg()
runqput(pp, gp, false)
schedule()
if raceenabled {
racegoend()
}
- if traceEnabled() {
- traceGoEnd()
+ trace := traceAcquire()
+ if trace.ok() {
+ trace.GoEnd()
+ traceRelease(trace)
}
mcall(goexit0)
}
//
//go:nosplit
func reentersyscall(pc, sp uintptr) {
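+ // Acquire the tracer up front; it is released below once GoSysCall has
+ // been emitted on the system stack.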
+ trace := traceAcquire()
gp := getg()
// Disable preemption because during this function g is in Gsyscall status,
})
}
- if traceEnabled() {
- systemstack(traceGoSysCall)
+ if trace.ok() {
+ systemstack(func() {
+ trace.GoSysCall()
+ traceRelease(trace)
+ })
// systemstack itself clobbers g.sched.{pc,sp} and we might
// need them later when the G is genuinely blocked in a
// syscall
lock(&sched.lock)
if sched.stopwait > 0 && atomic.Cas(&pp.status, _Psyscall, _Pgcstop) {
- if traceEnabled() {
- traceGoSysBlock(pp)
- traceProcStop(pp)
+ trace := traceAcquire()
+ if trace.ok() {
+ trace.GoSysBlock(pp)
+ trace.ProcStop(pp)
+ traceRelease(trace)
}
pp.syscalltick++
if sched.stopwait--; sched.stopwait == 0 {
}
func entersyscallblock_handoff() {
- if traceEnabled() {
- traceGoSysCall()
- traceGoSysBlock(getg().m.p.ptr())
+ trace := traceAcquire()
+ if trace.ok() {
+ trace.GoSysCall()
+ trace.GoSysBlock(getg().m.p.ptr())
+ traceRelease(trace)
}
handoffp(releasep())
}
tryRecordGoroutineProfileWB(gp)
})
}
- if traceEnabled() {
+ trace := traceAcquire()
+ if trace.ok() {
if oldp != gp.m.p.ptr() || gp.m.syscalltick != gp.m.p.ptr().syscalltick {
- systemstack(traceGoStart)
+ systemstack(func() {
+ trace.GoStart()
+ })
}
}
// There's a cpu for us, so we can run.
gp.m.p.ptr().syscalltick++
// We need to cas the status and scan before resuming...
casgstatus(gp, _Gsyscall, _Grunning)
+ if trace.ok() {
+ traceRelease(trace)
+ }
// Garbage collector isn't running (since we are),
// so okay to clear syscallsp.
return
}
- if traceEnabled() {
+ trace := traceAcquire()
+ if trace.ok() {
// Wait till traceGoSysBlock event is emitted.
// This ensures consistency of the trace (the goroutine is started after it is blocked).
for oldp != nil && oldp.syscalltick == gp.m.syscalltick {
// So instead we remember the syscall exit time and emit the event
// in execute when we have a P.
gp.trace.sysExitTime = traceClockNow()
+ traceRelease(trace)
}
gp.m.locks--
var ok bool
systemstack(func() {
ok = exitsyscallfast_pidle()
- if ok && traceEnabled() {
- if oldp != nil {
- // Wait till traceGoSysBlock event is emitted.
- // This ensures consistency of the trace (the goroutine is started after it is blocked).
- for oldp.syscalltick == gp.m.syscalltick {
- osyield()
+ if ok {
+ trace := traceAcquire()
+ if trace.ok() {
+ if oldp != nil {
+ // Wait till traceGoSysBlock event is emitted.
+ // This ensures consistency of the trace (the goroutine is started after it is blocked).
+ for oldp.syscalltick == gp.m.syscalltick {
+ osyield()
+ }
}
+ trace.GoSysExit()
+ traceRelease(trace)
}
- traceGoSysExit()
}
})
if ok {
func exitsyscallfast_reacquired() {
gp := getg()
if gp.m.syscalltick != gp.m.p.ptr().syscalltick {
- if traceEnabled() {
+ trace := traceAcquire()
+ if trace.ok() {
// The p was retaken and then entered a syscall again (since gp.m.syscalltick has changed).
// traceGoSysBlock for this syscall was already emitted,
// but here we effectively retake the p from the new syscall running on the same p.
systemstack(func() {
// Denote blocking of the new syscall.
- traceGoSysBlock(gp.m.p.ptr())
+ trace.GoSysBlock(gp.m.p.ptr())
// Denote completion of the current syscall.
- traceGoSysExit()
+ trace.GoSysExit()
+ traceRelease(trace)
})
}
gp.m.p.ptr().syscalltick++
totalSize := uintptr(4*goarch.PtrSize + sys.MinFrameSize) // extra space in case of reads slightly beyond frame
totalSize = alignUp(totalSize, sys.StackAlign)
sp := newg.stack.hi - totalSize
- spArg := sp
if usesLR {
// caller's LR
*(*uintptr)(unsafe.Pointer(sp)) = 0
prepGoExitFrame(sp)
- spArg += sys.MinFrameSize
+ }
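+ // On arm64 the frame pointer slot sits one word below SP; zero it so
+ // frame-pointer unwinding terminates cleanly at the goroutine's root
+ // frame.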
+ if GOARCH == "arm64" {
+ // caller's FP
+ *(*uintptr)(unsafe.Pointer(sp - goarch.PtrSize)) = 0
}
memclrNoHeapPointers(unsafe.Pointer(&newg.sched), unsafe.Sizeof(newg.sched))
if newg.trackingSeq%gTrackingPeriod == 0 {
newg.tracking = true
}
- casgstatus(newg, _Gdead, _Grunnable)
gcController.addScannableStack(pp, int64(newg.stack.hi-newg.stack.lo))
+ // Get a goid and switch to runnable. Make all this atomic to the tracer.
+ trace := traceAcquire()
+ casgstatus(newg, _Gdead, _Grunnable)
if pp.goidcache == pp.goidcacheend {
// Sched.goidgen is the last allocated id,
// this batch must be [sched.goidgen+1, sched.goidgen+GoidCacheBatch].
}
newg.goid = pp.goidcache
pp.goidcache++
+ if trace.ok() {
+ trace.GoCreate(newg, newg.startpc)
+ traceRelease(trace)
+ }
+
+ // Set up race context.
if raceenabled {
newg.racectx = racegostart(callerpc)
newg.raceignore = 0
racereleasemergeg(newg, unsafe.Pointer(&labelSync))
}
}
- if traceEnabled() {
- traceGoCreate(newg, newg.startpc)
- }
releasem(mp)
return newg
if old < 0 || nprocs <= 0 {
throw("procresize: invalid arg")
}
- if traceEnabled() {
- traceGomaxprocs(nprocs)
+ trace := traceAcquire()
+ if trace.ok() {
+ trace.Gomaxprocs(nprocs)
+ traceRelease(trace)
}
// update statistics
// because p.destroy itself has write barriers, so we
// need to do that from a valid P.
if gp.m.p != 0 {
- if traceEnabled() {
+ trace := traceAcquire()
+ if trace.ok() {
// Pretend that we were descheduled
// and then scheduled again to keep
// the trace sane.
- traceGoSched()
- traceProcStop(gp.m.p.ptr())
+ trace.GoSched()
+ trace.ProcStop(gp.m.p.ptr())
+ traceRelease(trace)
}
gp.m.p.ptr().m = 0
}
pp.m = 0
pp.status = _Pidle
acquirep(pp)
- if traceEnabled() {
- traceGoStart()
+ trace := traceAcquire()
+ if trace.ok() {
+ trace.GoStart()
+ traceRelease(trace)
}
}
// from a potentially stale mcache.
pp.mcache.prepareForSweep()
- if traceEnabled() {
- traceProcStart()
+ trace := traceAcquire()
+ if trace.ok() {
+ trace.ProcStart()
+ traceRelease(trace)
}
}
print("releasep: m=", gp.m, " m->p=", gp.m.p.ptr(), " p->m=", hex(pp.m), " p->status=", pp.status, "\n")
throw("releasep: invalid p state")
}
- if traceEnabled() {
- traceProcStop(gp.m.p.ptr())
+ trace := traceAcquire()
+ if trace.ok() {
+ trace.ProcStop(gp.m.p.ptr())
+ traceRelease(trace)
}
gp.m.p = 0
pp.m = 0
// increment nmidle and report deadlock.
incidlelocked(-1)
if atomic.Cas(&pp.status, s, _Pidle) {
- if traceEnabled() {
- traceGoSysBlock(pp)
- traceProcStop(pp)
+ trace := traceAcquire()
+ if trace.ok() {
+ trace.GoSysBlock(pp)
+ trace.ProcStop(pp)
+ traceRelease(trace)
}
n++
pp.syscalltick++