runtime: split mprof locks
author     Rhys Hiltner <rhys@justin.tv>
           Fri, 1 Apr 2022 19:56:49 +0000 (12:56 -0700)
committer  Michael Knyszek <mknyszek@google.com>
           Tue, 3 May 2022 20:49:59 +0000 (20:49 +0000)

The profiles for memory allocations, sync.Mutex contention, and general
blocking store their data in a shared hash table. The bookkeeping work
at the end of a garbage collection cycle involves maintenance on each
memory allocation record. Previously, a single lock guarded access to
the hash table and the contents of all records. When a program has
allocated memory at a large number of unique call stacks, the
maintenance following every garbage collection can hold that lock for
several milliseconds. That can prevent progress on all other goroutines
by delaying acquirep's call to mcache.prepareForSweep, which needs the
lock in mProf_Free to report when a profiled allocation is no longer in
use. With no user goroutines making progress, it is in effect a
multi-millisecond GC-related stop-the-world pause.

Split the lock so the call to mProf_Flush no longer delays each P's call
to mProf_Free: mProf_Free uses a lock on the memory records' N+1 cycle,
and mProf_Flush uses locks on the memory records' accumulator and their
N cycle. mProf_Malloc also no longer competes with mProf_Flush, as it
uses a lock on the memory records' N+2 cycle. The profiles for
sync.Mutex contention and general blocking now share a separate lock,
and another lock guards insertions to the shared hash table (uncommon in
the steady-state). Consumers of each type of profile take the matching
accumulator lock, so they will observe consistent count and magnitude
values for each record.
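
To make the N/N+1/N+2 split concrete, here is a small, self-contained Go
sketch of which lock each path takes. It is a toy model, not runtime code:
memActiveLock, memFutureLock, and futureIndex are illustrative stand-ins for
profMemActiveLock, profMemFutureLock, and the cycle-to-slot mapping, and
futureLen mirrors len(memRecord{}.future), which is 3.

package main

import (
	"fmt"
	"sync"
)

const futureLen = 3 // mirrors len(memRecord{}.future)

var (
	memActiveLock sync.Mutex            // stand-in for profMemActiveLock
	memFutureLock [futureLen]sync.Mutex // stand-in for profMemFutureLock
)

// futureIndex maps a heap-profile cycle number to its slot in the future
// ring, and therefore to the lock that guards that slot.
func futureIndex(cycle uint32) uint32 { return cycle % futureLen }

func main() {
	const cycle = 7 // pretend mProfCycle.read() returned this

	// mProf_Malloc records into cycle N+2, mProf_Free into cycle N+1, and
	// mProf_Flush drains cycle N. With a ring of three these are always
	// three distinct slots, so the three paths never share a future lock.
	fmt.Println("mProf_Malloc locks future slot", futureIndex(cycle+2)) // 0
	fmt.Println("mProf_Free   locks future slot", futureIndex(cycle+1)) // 2
	fmt.Println("mProf_Flush  locks future slot", futureIndex(cycle))   // 1

	// Flushers and readers also hold the accumulator lock, taken before the
	// per-cycle lock (matching the lockrank.go ordering below, where
	// profMemFuture may be acquired while profMemActive is held).
	idx := futureIndex(cycle)
	memActiveLock.Lock()
	memFutureLock[idx].Lock()
	// ... move future[idx] into the active accumulator here ...
	memFutureLock[idx].Unlock()
	memActiveLock.Unlock()
}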

For #45894

Change-Id: I615ff80618d10e71025423daa64b0b7f9dc57daa
Reviewed-on: https://go-review.googlesource.com/c/go/+/399956
Reviewed-by: Carlos Amedee <carlos@golang.org>
Run-TryBot: Rhys Hiltner <rhys@justin.tv>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>

src/runtime/lockrank.go
src/runtime/malloc.go
src/runtime/mprof.go

diff --git a/src/runtime/lockrank.go b/src/runtime/lockrank.go
index 4a16bc0ddb6e7c091fd95b0163230971461d8276..f6e7ea98801998f060a5f1b20a6416597db2eb82 100644
--- a/src/runtime/lockrank.go
+++ b/src/runtime/lockrank.go
@@ -56,7 +56,10 @@ const (
        lockRankNotifyList
        lockRankTraceStrings
        lockRankMspanSpecial
-       lockRankProf
+       lockRankProfInsert
+       lockRankProfBlock
+       lockRankProfMemActive
+       lockRankProfMemFuture
        lockRankGcBitsArenas
        lockRankRoot
        lockRankTrace
@@ -137,7 +140,10 @@ var lockNames = []string{
        lockRankNotifyList:    "notifyList",
        lockRankTraceStrings:  "traceStrings",
        lockRankMspanSpecial:  "mspanSpecial",
-       lockRankProf:          "prof",
+       lockRankProfInsert:    "profInsert",
+       lockRankProfBlock:     "profBlock",
+       lockRankProfMemActive: "profMemActive",
+       lockRankProfMemFuture: "profMemFuture",
        lockRankGcBitsArenas:  "gcBitsArenas",
        lockRankRoot:          "root",
        lockRankTrace:         "trace",
@@ -215,7 +221,10 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
        lockRankNotifyList:    {},
        lockRankTraceStrings:  {lockRankTraceBuf},
        lockRankMspanSpecial:  {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
-       lockRankProf:          {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
+       lockRankProfInsert:    {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
+       lockRankProfBlock:     {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
+       lockRankProfMemActive: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
+       lockRankProfMemFuture: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings, lockRankProfMemActive},
        lockRankGcBitsArenas:  {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
        lockRankRoot:          {},
        lockRankTrace:         {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankHchan, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot},
@@ -226,15 +235,15 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
        lockRankRwmutexR: {lockRankSysmon, lockRankRwmutexW},
 
        lockRankSpanSetSpine:  {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
-       lockRankGscan:         {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankSpanSetSpine},
-       lockRankStackpool:     {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankSpanSetSpine, lockRankGscan},
-       lockRankStackLarge:    {lockRankSysmon, lockRankAssistQueue, lockRankSched, lockRankItab, lockRankHchan, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan},
+       lockRankGscan:         {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankSpanSetSpine},
+       lockRankStackpool:     {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankSpanSetSpine, lockRankGscan},
+       lockRankStackLarge:    {lockRankSysmon, lockRankAssistQueue, lockRankSched, lockRankItab, lockRankHchan, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan},
        lockRankDefer:         {},
        lockRankSudog:         {lockRankHchan, lockRankNotifyList},
-       lockRankWbufSpans:     {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankRoot, lockRankGscan, lockRankDefer, lockRankSudog},
-       lockRankMheap:         {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans},
+       lockRankWbufSpans:     {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankRoot, lockRankGscan, lockRankDefer, lockRankSudog},
+       lockRankMheap:         {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans},
        lockRankMheapSpecial:  {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
-       lockRankGlobalAlloc:   {lockRankProf, lockRankSpanSetSpine, lockRankMheap, lockRankMheapSpecial},
+       lockRankGlobalAlloc:   {lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankSpanSetSpine, lockRankMheap, lockRankMheapSpecial},
        lockRankPageAllocScav: {lockRankMheap},
 
        lockRankGFree:     {lockRankSched},
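
Among the four new ranks the only ordering constraint is that profMemFuture
may be acquired while profMemActive is held, never the reverse; that is
exactly the order mProf_Flush, mProf_PostSweep, and MemProfile use below. The
following single-goroutine Go sketch shows the kind of check these tables
drive; it is illustrative only (the real checker lives in the runtime's
lockrank_on.go), and rank, allowedWhileHeld, acquire, and release are made-up
names.

package main

import "fmt"

type rank int

const (
	rankProfInsert rank = iota
	rankProfBlock
	rankProfMemActive
	rankProfMemFuture
)

// allowedWhileHeld mirrors the new lockPartialOrder entries, restricted to
// the profiling ranks: the value lists ranks that may already be held when
// the key is acquired. Only profMemFuture admits another profiling rank.
var allowedWhileHeld = map[rank][]rank{
	rankProfInsert:    nil,
	rankProfBlock:     nil,
	rankProfMemActive: nil,
	rankProfMemFuture: {rankProfMemActive},
}

var held []rank // ranks currently held by this (single) goroutine

func acquire(r rank) {
	for _, h := range held {
		ok := false
		for _, a := range allowedWhileHeld[r] {
			if h == a {
				ok = true
				break
			}
		}
		if !ok {
			panic(fmt.Sprintf("bad lock order: rank %d held while acquiring rank %d", h, r))
		}
	}
	held = append(held, r)
}

func release() { held = held[:len(held)-1] }

func main() {
	// mProf_Flush's order: accumulator lock first, then the per-cycle lock.
	acquire(rankProfMemActive)
	acquire(rankProfMemFuture)
	release()
	release()

	// The reverse order is flagged, since it could deadlock against a flusher.
	acquire(rankProfMemFuture)
	acquire(rankProfMemActive) // panics
}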
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index f65be2bc749af73c967702df21608b39b5e18a7c..14bf9a583f4b143015a7bc852eefc79a869217d7 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -413,7 +413,12 @@ func mallocinit() {
        mheap_.init()
        mcache0 = allocmcache()
        lockInit(&gcBitsArenas.lock, lockRankGcBitsArenas)
-       lockInit(&proflock, lockRankProf)
+       lockInit(&profInsertLock, lockRankProfInsert)
+       lockInit(&profBlockLock, lockRankProfBlock)
+       lockInit(&profMemActiveLock, lockRankProfMemActive)
+       for i := range profMemFutureLock {
+               lockInit(&profMemFutureLock[i], lockRankProfMemFuture)
+       }
        lockInit(&globalAlloc.mutex, lockRankGlobalAlloc)
 
        // Create initial arena growth hints.
diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index 5137db201537f70bad506e3b40b9baa51a2cc69a..cd63bafebb84a73ea03873a5bf79d8cd49591bbc 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -14,7 +14,17 @@ import (
 )
 
 // NOTE(rsc): Everything here could use cas if contention became an issue.
-var proflock mutex
+var (
+       // profInsertLock protects changes to the start of all *bucket linked lists
+       profInsertLock mutex
+       // profBlockLock protects the contents of every blockRecord struct
+       profBlockLock mutex
+       // profMemActiveLock protects the active field of every memRecord struct
+       profMemActiveLock mutex
+       // profMemFutureLock is a set of locks that protect the respective elements
+       // of the future array of every memRecord struct
+       profMemFutureLock [len(memRecord{}.future)]mutex
+)
 
 // All memory allocations are local and do not escape outside of the profiler.
 // The profiler is forbidden from referring to garbage-collected memory.
@@ -43,6 +53,9 @@ type bucketType int
 // Per-call-stack profiling information.
 // Lookup by hashing call stack into a linked-list hash table.
 //
+// None of the fields in this bucket header are modified after
+// creation, including its next and allnext links.
+//
 // No heap pointers.
 //
 //go:notinheap
@@ -139,26 +152,64 @@ type blockRecord struct {
 }
 
 var (
-       mbuckets  *bucket // memory profile buckets
-       bbuckets  *bucket // blocking profile buckets
-       xbuckets  *bucket // mutex profile buckets
-       buckhash  *[buckHashSize]*bucket
-       bucketmem uintptr
-
-       mProf struct {
-               // All fields in mProf are protected by proflock.
-
-               // cycle is the global heap profile cycle. This wraps
-               // at mProfCycleWrap.
-               cycle uint32
-               // flushed indicates that future[cycle] in all buckets
-               // has been flushed to the active profile.
-               flushed bool
-       }
+       mbuckets atomic.UnsafePointer // *bucket, memory profile buckets
+       bbuckets atomic.UnsafePointer // *bucket, blocking profile buckets
+       xbuckets atomic.UnsafePointer // *bucket, mutex profile buckets
+       buckhash atomic.UnsafePointer // *buckhashArray
+
+       mProfCycle mProfCycleHolder
 )
 
+type buckhashArray [buckHashSize]atomic.UnsafePointer // *bucket
+
 const mProfCycleWrap = uint32(len(memRecord{}.future)) * (2 << 24)
 
+// mProfCycleHolder holds the global heap profile cycle number (wrapped at
+// mProfCycleWrap, stored starting at bit 1), and a flag (stored at bit 0) to
+// indicate whether future[cycle] in all buckets has been queued to flush into
+// the active profile.
+type mProfCycleHolder struct {
+       value atomic.Uint32
+}
+
+// read returns the current cycle count.
+func (c *mProfCycleHolder) read() (cycle uint32) {
+       v := c.value.Load()
+       cycle = v >> 1
+       return cycle
+}
+
+// setFlushed sets the flushed flag. It returns the current cycle count and the
+// previous value of the flushed flag.
+func (c *mProfCycleHolder) setFlushed() (cycle uint32, alreadyFlushed bool) {
+       for {
+               prev := c.value.Load()
+               cycle = prev >> 1
+               alreadyFlushed = (prev & 0x1) != 0
+               next := prev | 0x1
+               if c.value.CompareAndSwap(prev, next) {
+                       return cycle, alreadyFlushed
+               }
+       }
+}
+
+// increment increases the cycle count by one, wrapping the value at
+// mProfCycleWrap. It clears the flushed flag.
+func (c *mProfCycleHolder) increment() {
+       // We explicitly wrap mProfCycle rather than depending on
+       // uint wraparound because the memRecord.future ring does not
+       // itself wrap at a power of two.
+       for {
+               prev := c.value.Load()
+               cycle := prev >> 1
+               cycle = (cycle + 1) % mProfCycleWrap
+               next := cycle << 1
+               if c.value.CompareAndSwap(prev, next) {
+                       break
+               }
+       }
+}
+
 // newBucket allocates a bucket with the given type and number of stack entries.
 func newBucket(typ bucketType, nstk int) *bucket {
        size := unsafe.Sizeof(bucket{}) + uintptr(nstk)*unsafe.Sizeof(uintptr(0))
@@ -172,7 +223,6 @@ func newBucket(typ bucketType, nstk int) *bucket {
        }
 
        b := (*bucket)(persistentalloc(size, 0, &memstats.buckhash_sys))
-       bucketmem += size
        b.typ = typ
        b.nstk = uintptr(nstk)
        return b
@@ -204,11 +254,19 @@ func (b *bucket) bp() *blockRecord {
 
 // Return the bucket for stk[0:nstk], allocating new bucket if needed.
 func stkbucket(typ bucketType, size uintptr, stk []uintptr, alloc bool) *bucket {
-       if buckhash == nil {
-               buckhash = (*[buckHashSize]*bucket)(sysAlloc(unsafe.Sizeof(*buckhash), &memstats.buckhash_sys))
-               if buckhash == nil {
-                       throw("runtime: cannot allocate memory")
+       bh := (*buckhashArray)(buckhash.Load())
+       if bh == nil {
+               lock(&profInsertLock)
+               // check again under the lock
+               bh = (*buckhashArray)(buckhash.Load())
+               if bh == nil {
+                       bh = (*buckhashArray)(sysAlloc(unsafe.Sizeof(buckhashArray{}), &memstats.buckhash_sys))
+                       if bh == nil {
+                               throw("runtime: cannot allocate memory")
+                       }
+                       buckhash.StoreNoWB(unsafe.Pointer(bh))
                }
+               unlock(&profInsertLock)
        }
 
        // Hash stack.
@@ -227,7 +285,8 @@ func stkbucket(typ bucketType, size uintptr, stk []uintptr, alloc bool) *bucket
        h ^= h >> 11
 
        i := int(h % buckHashSize)
-       for b := buckhash[i]; b != nil; b = b.next {
+       // first check optimistically, without the lock
+       for b := (*bucket)(bh[i].Load()); b != nil; b = b.next {
                if b.typ == typ && b.hash == h && b.size == size && eqslice(b.stk(), stk) {
                        return b
                }
@@ -237,23 +296,37 @@ func stkbucket(typ bucketType, size uintptr, stk []uintptr, alloc bool) *bucket
                return nil
        }
 
+       lock(&profInsertLock)
+       // check again under the insertion lock
+       for b := (*bucket)(bh[i].Load()); b != nil; b = b.next {
+               if b.typ == typ && b.hash == h && b.size == size && eqslice(b.stk(), stk) {
+                       unlock(&profInsertLock)
+                       return b
+               }
+       }
+
        // Create new bucket.
        b := newBucket(typ, len(stk))
        copy(b.stk(), stk)
        b.hash = h
        b.size = size
-       b.next = buckhash[i]
-       buckhash[i] = b
+
+       var allnext *atomic.UnsafePointer
        if typ == memProfile {
-               b.allnext = mbuckets
-               mbuckets = b
+               allnext = &mbuckets
        } else if typ == mutexProfile {
-               b.allnext = xbuckets
-               xbuckets = b
+               allnext = &xbuckets
        } else {
-               b.allnext = bbuckets
-               bbuckets = b
+               allnext = &bbuckets
        }
+
+       b.next = (*bucket)(bh[i].Load())
+       b.allnext = (*bucket)(allnext.Load())
+
+       bh[i].StoreNoWB(unsafe.Pointer(b))
+       allnext.StoreNoWB(unsafe.Pointer(b))
+
+       unlock(&profInsertLock)
        return b
 }
 
@@ -278,13 +351,7 @@ func eqslice(x, y []uintptr) bool {
 // frees after the world is started again count towards a new heap
 // profiling cycle.
 func mProf_NextCycle() {
-       lock(&proflock)
-       // We explicitly wrap mProf.cycle rather than depending on
-       // uint wraparound because the memRecord.future ring does not
-       // itself wrap at a power of two.
-       mProf.cycle = (mProf.cycle + 1) % mProfCycleWrap
-       mProf.flushed = false
-       unlock(&proflock)
+       mProfCycle.increment()
 }
 
 // mProf_Flush flushes the events from the current heap profiling
@@ -295,22 +362,33 @@ func mProf_NextCycle() {
 // contrast with mProf_NextCycle, this is somewhat expensive, but safe
 // to do concurrently.
 func mProf_Flush() {
-       lock(&proflock)
-       if !mProf.flushed {
-               mProf_FlushLocked()
-               mProf.flushed = true
+       cycle, alreadyFlushed := mProfCycle.setFlushed()
+       if alreadyFlushed {
+               return
        }
-       unlock(&proflock)
+
+       index := cycle % uint32(len(memRecord{}.future))
+       lock(&profMemActiveLock)
+       lock(&profMemFutureLock[index])
+       mProf_FlushLocked(index)
+       unlock(&profMemFutureLock[index])
+       unlock(&profMemActiveLock)
 }
 
-func mProf_FlushLocked() {
-       c := mProf.cycle
-       for b := mbuckets; b != nil; b = b.allnext {
+// mProf_FlushLocked flushes the events from the heap profiling cycle at index
+// into the active profile. The caller must hold the lock for the active profile
+// (profMemActiveLock) and for the profiling cycle at index
+// (profMemFutureLock[index]).
+func mProf_FlushLocked(index uint32) {
+       assertLockHeld(&profMemActiveLock)
+       assertLockHeld(&profMemFutureLock[index])
+       head := (*bucket)(mbuckets.Load())
+       for b := head; b != nil; b = b.allnext {
                mp := b.mp()
 
                // Flush cycle C into the published profile and clear
                // it for reuse.
-               mpc := &mp.future[c%uint32(len(mp.future))]
+               mpc := &mp.future[index]
                mp.active.add(mpc)
                *mpc = memRecordCycle{}
        }
@@ -321,39 +399,41 @@ func mProf_FlushLocked() {
 // snapshot as of the last mark termination without advancing the heap
 // profile cycle.
 func mProf_PostSweep() {
-       lock(&proflock)
        // Flush cycle C+1 to the active profile so everything as of
        // the last mark termination becomes visible. *Don't* advance
        // the cycle, since we're still accumulating allocs in cycle
        // C+2, which have to become C+1 in the next mark termination
        // and so on.
-       c := mProf.cycle
-       for b := mbuckets; b != nil; b = b.allnext {
-               mp := b.mp()
-               mpc := &mp.future[(c+1)%uint32(len(mp.future))]
-               mp.active.add(mpc)
-               *mpc = memRecordCycle{}
-       }
-       unlock(&proflock)
+       cycle := mProfCycle.read() + 1
+
+       index := cycle % uint32(len(memRecord{}.future))
+       lock(&profMemActiveLock)
+       lock(&profMemFutureLock[index])
+       mProf_FlushLocked(index)
+       unlock(&profMemFutureLock[index])
+       unlock(&profMemActiveLock)
 }
 
 // Called by malloc to record a profiled block.
 func mProf_Malloc(p unsafe.Pointer, size uintptr) {
        var stk [maxStack]uintptr
        nstk := callers(4, stk[:])
-       lock(&proflock)
+
+       index := (mProfCycle.read() + 2) % uint32(len(memRecord{}.future))
+
        b := stkbucket(memProfile, size, stk[:nstk], true)
-       c := mProf.cycle
        mp := b.mp()
-       mpc := &mp.future[(c+2)%uint32(len(mp.future))]
+       mpc := &mp.future[index]
+
+       lock(&profMemFutureLock[index])
        mpc.allocs++
        mpc.alloc_bytes += size
-       unlock(&proflock)
+       unlock(&profMemFutureLock[index])
 
-       // Setprofilebucket locks a bunch of other mutexes, so we call it outside of proflock.
-       // This reduces potential contention and chances of deadlocks.
-       // Since the object must be alive during call to mProf_Malloc,
-       // it's fine to do this non-atomically.
+       // Setprofilebucket locks a bunch of other mutexes, so we call it outside of
+       // the profiler locks. This reduces potential contention and chances of
+       // deadlocks. Since the object must be alive during the call to
+       // mProf_Malloc, it's fine to do this non-atomically.
        systemstack(func() {
                setprofilebucket(p, b)
        })
@@ -361,13 +441,15 @@ func mProf_Malloc(p unsafe.Pointer, size uintptr) {
 
 // Called when freeing a profiled block.
 func mProf_Free(b *bucket, size uintptr) {
-       lock(&proflock)
-       c := mProf.cycle
+       index := (mProfCycle.read() + 1) % uint32(len(memRecord{}.future))
+
        mp := b.mp()
-       mpc := &mp.future[(c+1)%uint32(len(mp.future))]
+       mpc := &mp.future[index]
+
+       lock(&profMemFutureLock[index])
        mpc.frees++
        mpc.free_bytes += size
-       unlock(&proflock)
+       unlock(&profMemFutureLock[index])
 }
 
 var blockprofilerate uint64 // in CPU ticks
@@ -424,18 +506,19 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
        } else {
                nstk = gcallers(gp.m.curg, skip, stk[:])
        }
-       lock(&proflock)
        b := stkbucket(which, 0, stk[:nstk], true)
+       bp := b.bp()
 
+       lock(&profBlockLock)
        if which == blockProfile && cycles < rate {
                // Remove sampling bias, see discussion on http://golang.org/cl/299991.
-               b.bp().count += float64(rate) / float64(cycles)
-               b.bp().cycles += rate
+               bp.count += float64(rate) / float64(cycles)
+               bp.cycles += rate
        } else {
-               b.bp().count++
-               b.bp().cycles += cycles
+               bp.count++
+               bp.cycles += cycles
        }
-       unlock(&proflock)
+       unlock(&profBlockLock)
 }
 
 var mutexprofilerate uint64 // fraction sampled
@@ -567,13 +650,18 @@ func (r *MemProfileRecord) Stack() []uintptr {
 // the testing package's -test.memprofile flag instead
 // of calling MemProfile directly.
 func MemProfile(p []MemProfileRecord, inuseZero bool) (n int, ok bool) {
-       lock(&proflock)
+       cycle := mProfCycle.read()
        // If we're between mProf_NextCycle and mProf_Flush, take care
        // of flushing to the active profile so we only have to look
        // at the active profile below.
-       mProf_FlushLocked()
+       index := cycle % uint32(len(memRecord{}.future))
+       lock(&profMemActiveLock)
+       lock(&profMemFutureLock[index])
+       mProf_FlushLocked(index)
+       unlock(&profMemFutureLock[index])
        clear := true
-       for b := mbuckets; b != nil; b = b.allnext {
+       head := (*bucket)(mbuckets.Load())
+       for b := head; b != nil; b = b.allnext {
                mp := b.mp()
                if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes {
                        n++
@@ -588,11 +676,13 @@ func MemProfile(p []MemProfileRecord, inuseZero bool) (n int, ok bool) {
                // garbage collection is disabled from the beginning of execution,
                // accumulate all of the cycles, and recount buckets.
                n = 0
-               for b := mbuckets; b != nil; b = b.allnext {
+               for b := head; b != nil; b = b.allnext {
                        mp := b.mp()
                        for c := range mp.future {
+                               lock(&profMemFutureLock[c])
                                mp.active.add(&mp.future[c])
                                mp.future[c] = memRecordCycle{}
+                               unlock(&profMemFutureLock[c])
                        }
                        if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes {
                                n++
@@ -602,7 +692,7 @@ func MemProfile(p []MemProfileRecord, inuseZero bool) (n int, ok bool) {
        if n <= len(p) {
                ok = true
                idx := 0
-               for b := mbuckets; b != nil; b = b.allnext {
+               for b := head; b != nil; b = b.allnext {
                        mp := b.mp()
                        if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes {
                                record(&p[idx], b)
@@ -610,7 +700,7 @@ func MemProfile(p []MemProfileRecord, inuseZero bool) (n int, ok bool) {
                        }
                }
        }
-       unlock(&proflock)
+       unlock(&profMemActiveLock)
        return
 }
 
@@ -637,12 +727,13 @@ func record(r *MemProfileRecord, b *bucket) {
 }
 
 func iterate_memprof(fn func(*bucket, uintptr, *uintptr, uintptr, uintptr, uintptr)) {
-       lock(&proflock)
-       for b := mbuckets; b != nil; b = b.allnext {
+       lock(&profMemActiveLock)
+       head := (*bucket)(mbuckets.Load())
+       for b := head; b != nil; b = b.allnext {
                mp := b.mp()
                fn(b, b.nstk, &b.stk()[0], b.size, mp.active.allocs, mp.active.frees)
        }
-       unlock(&proflock)
+       unlock(&profMemActiveLock)
 }
 
 // BlockProfileRecord describes blocking events originated
@@ -661,13 +752,14 @@ type BlockProfileRecord struct {
 // the testing package's -test.blockprofile flag instead
 // of calling BlockProfile directly.
 func BlockProfile(p []BlockProfileRecord) (n int, ok bool) {
-       lock(&proflock)
-       for b := bbuckets; b != nil; b = b.allnext {
+       lock(&profBlockLock)
+       head := (*bucket)(bbuckets.Load())
+       for b := head; b != nil; b = b.allnext {
                n++
        }
        if n <= len(p) {
                ok = true
-               for b := bbuckets; b != nil; b = b.allnext {
+               for b := head; b != nil; b = b.allnext {
                        bp := b.bp()
                        r := &p[0]
                        r.Count = int64(bp.count)
@@ -693,7 +785,7 @@ func BlockProfile(p []BlockProfileRecord) (n int, ok bool) {
                        p = p[1:]
                }
        }
-       unlock(&proflock)
+       unlock(&profBlockLock)
        return
 }
 
@@ -704,13 +796,14 @@ func BlockProfile(p []BlockProfileRecord) (n int, ok bool) {
 // Most clients should use the runtime/pprof package
 // instead of calling MutexProfile directly.
 func MutexProfile(p []BlockProfileRecord) (n int, ok bool) {
-       lock(&proflock)
-       for b := xbuckets; b != nil; b = b.allnext {
+       lock(&profBlockLock)
+       head := (*bucket)(xbuckets.Load())
+       for b := head; b != nil; b = b.allnext {
                n++
        }
        if n <= len(p) {
                ok = true
-               for b := xbuckets; b != nil; b = b.allnext {
+               for b := head; b != nil; b = b.allnext {
                        bp := b.bp()
                        r := &p[0]
                        r.Count = int64(bp.count)
@@ -722,7 +815,7 @@ func MutexProfile(p []BlockProfileRecord) (n int, ok bool) {
                        p = p[1:]
                }
        }
-       unlock(&proflock)
+       unlock(&profBlockLock)
        return
 }
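
For completeness, a small program that drives the consumer paths touched
above through the public API; nothing here is new, it only enables the block
and mutex profiles, allocates enough to populate a few heap-profile buckets,
and dumps the three profiles.

package main

import (
	"os"
	"runtime"
	"runtime/pprof"
)

var sink [][]byte // keep allocations reachable so they stay in use

func main() {
	// Block and mutex events now funnel through saveblockevent under the
	// shared profBlockLock.
	runtime.SetBlockProfileRate(1)
	runtime.SetMutexProfileFraction(1)

	// Allocate enough at one call stack for mProf_Malloc to sample it.
	for i := 0; i < 64; i++ {
		sink = append(sink, make([]byte, 64<<10))
	}

	// Readers take the matching accumulator lock (profMemActiveLock for the
	// heap profile, profBlockLock for mutex and block), so each record's
	// count and byte/cycle totals are mutually consistent.
	pprof.Lookup("heap").WriteTo(os.Stdout, 1)
	pprof.Lookup("mutex").WriteTo(os.Stdout, 1)
	pprof.Lookup("block").WriteTo(os.Stdout, 1)
}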