]> Cypherpunks.ru repositories - gostls13.git/commitdiff
runtime: resolve checkdead panic by refining `startm` lock handling in caller context
authorLucien Coffe <lucien.coffe@botify.com>
Fri, 21 Apr 2023 11:44:35 +0000 (13:44 +0200)
committerMichael Pratt <mpratt@google.com>
Fri, 28 Apr 2023 15:57:36 +0000 (15:57 +0000)
This change addresses a `checkdead` panic caused by a race condition between
`sysmon->startm` and `checkdead` callers, due to prematurely releasing the
scheduler lock. The solution involves allowing a `startm` caller to acquire the
scheduler lock and call `startm` in this context. A new `lockheld` bool
argument is added to `startm`, which manages all lock and unlock calls within
the function. The`startIdle` function variable in `injectglist` is updated to
call `startm` with the lock held, ensuring proper lock handling in this
specific case. This refined lock handling resolves the observed race condition
issue.

Fixes #59600

Change-Id: I11663a15536c10c773fc2fde291d959099aa71be
Reviewed-on: https://go-review.googlesource.com/c/go/+/487316
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Pratt <mpratt@google.com>

src/runtime/proc.go

index cef9680db2efa83763ed1e92aa03737c935e0379..d2901e3aa0257bc0a57982bc4c3a0647a53418b7 100644 (file)
@@ -2435,10 +2435,15 @@ func mspinning() {
 // Callers passing a non-nil P must call from a non-preemptible context. See
 // comment on acquirem below.
 //
+// Argument lockheld indicates whether the caller already acquired the
+// scheduler lock. Callers holding the lock when making the call must pass
+// true. The lock might be temporarily dropped, but will be reacquired before
+// returning.
+//
 // Must not have write barriers because this may be called without a P.
 //
 //go:nowritebarrierrec
-func startm(pp *p, spinning bool) {
+func startm(pp *p, spinning, lockheld bool) {
        // Disable preemption.
        //
        // Every owned P must have an owner that will eventually stop it in the
@@ -2456,7 +2461,9 @@ func startm(pp *p, spinning bool) {
        // startm. Callers passing a nil P may be preemptible, so we must
        // disable preemption before acquiring a P from pidleget below.
        mp := acquirem()
-       lock(&sched.lock)
+       if !lockheld {
+               lock(&sched.lock)
+       }
        if pp == nil {
                if spinning {
                        // TODO(prattmic): All remaining calls to this function
@@ -2466,7 +2473,9 @@ func startm(pp *p, spinning bool) {
                }
                pp, _ = pidleget(0)
                if pp == nil {
-                       unlock(&sched.lock)
+                       if !lockheld {
+                               unlock(&sched.lock)
+                       }
                        releasem(mp)
                        return
                }
@@ -2480,6 +2489,8 @@ func startm(pp *p, spinning bool) {
                // could find no idle P while checkdead finds a runnable G but
                // no running M's because this new M hasn't started yet, thus
                // throwing in an apparent deadlock.
+               // This apparent deadlock is possible when startm is called
+               // from sysmon, which doesn't count as a running M.
                //
                // Avoid this situation by pre-allocating the ID for the new M,
                // thus marking it as 'running' before we drop sched.lock. This
@@ -2494,12 +2505,18 @@ func startm(pp *p, spinning bool) {
                        fn = mspinning
                }
                newm(fn, pp, id)
+
+               if lockheld {
+                       lock(&sched.lock)
+               }
                // Ownership transfer of pp committed by start in newm.
                // Preemption is now safe.
                releasem(mp)
                return
        }
-       unlock(&sched.lock)
+       if !lockheld {
+               unlock(&sched.lock)
+       }
        if nmp.spinning {
                throw("startm: m is spinning")
        }
@@ -2528,24 +2545,24 @@ func handoffp(pp *p) {
 
        // if it has local work, start it straight away
        if !runqempty(pp) || sched.runqsize != 0 {
-               startm(pp, false)
+               startm(pp, false, false)
                return
        }
        // if there's trace work to do, start it straight away
        if (trace.enabled || trace.shutdown) && traceReaderAvailable() != nil {
-               startm(pp, false)
+               startm(pp, false, false)
                return
        }
        // if it has GC work, start it straight away
        if gcBlackenEnabled != 0 && gcMarkWorkAvailable(pp) {
-               startm(pp, false)
+               startm(pp, false, false)
                return
        }
        // no local work, check that there are no spinning/idle M's,
        // otherwise our help is not required
        if sched.nmspinning.Load()+sched.npidle.Load() == 0 && sched.nmspinning.CompareAndSwap(0, 1) { // TODO: fast atomic
                sched.needspinning.Store(0)
-               startm(pp, true)
+               startm(pp, true, false)
                return
        }
        lock(&sched.lock)
@@ -2567,14 +2584,14 @@ func handoffp(pp *p) {
        }
        if sched.runqsize != 0 {
                unlock(&sched.lock)
-               startm(pp, false)
+               startm(pp, false, false)
                return
        }
        // If this is the last running P and nobody is polling network,
        // need to wakeup another M to poll network.
        if sched.npidle.Load() == gomaxprocs-1 && sched.lastpoll.Load() != 0 {
                unlock(&sched.lock)
-               startm(pp, false)
+               startm(pp, false, false)
                return
        }
 
@@ -2623,7 +2640,7 @@ func wakep() {
        // see at least one running M (ours).
        unlock(&sched.lock)
 
-       startm(pp, true)
+       startm(pp, true, false)
 
        releasem(mp)
 }
@@ -3376,8 +3393,8 @@ func injectglist(glist *gList) {
                                break
                        }
 
+                       startm(pp, false, true)
                        unlock(&sched.lock)
-                       startm(pp, false)
                        releasem(mp)
                }
        }
@@ -5484,7 +5501,7 @@ func sysmon() {
                        // See issue 42515 and
                        // https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=50094.
                        if next := timeSleepUntil(); next < now {
-                               startm(nil, false)
+                               startm(nil, false, false)
                        }
                }
                if scavenger.sysmonWake.Load() != 0 {
@@ -5756,7 +5773,7 @@ func schedEnableUser(enable bool) {
                globrunqputbatch(&sched.disable.runnable, n)
                unlock(&sched.lock)
                for ; n != 0 && sched.npidle.Load() != 0; n-- {
-                       startm(nil, false)
+                       startm(nil, false, false)
                }
        } else {
                unlock(&sched.lock)