--- /dev/null
- node.next, _ = lfstackUnpack(old)
+ // Copyright 2012 The Go Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style
+ // license that can be found in the LICENSE file.
+
+ // Lock-free stack.
+ // The following code runs only on g0 stack.
+
+ package runtime
+
+ import "unsafe"
+
+ func lfstackpush(head *uint64, node *lfnode) {
+ node.pushcnt++
+ new := lfstackPack(node, node.pushcnt)
+ for {
+ old := atomicload64(head)
- node2 := (*lfnode)(atomicloadp(unsafe.Pointer(&node.next)))
- new := uint64(0)
- if node2 != nil {
- new = lfstackPack(node2, node2.pushcnt)
- }
- if cas64(head, old, new) {
++ node.next = old
+ if cas64(head, old, new) {
+ break
+ }
+ }
+ }
+
+ func lfstackpop(head *uint64) unsafe.Pointer {
+ for {
+ old := atomicload64(head)
+ if old == 0 {
+ return nil
+ }
+ node, _ := lfstackUnpack(old)
++ next := atomicload64(&node.next)
++ if cas64(head, old, next) {
+ return unsafe.Pointer(node)
+ }
+ }
+ }
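
The lfstackPack/lfstackUnpack helpers are runtime-internal. As a standalone sketch of the ABA-avoidance idea above — the pointer and its push counter packed into one uint64, assuming amd64-style 48-bit significant addresses and a 16-bit counter — something like this:

package main

import "fmt"

// Hypothetical pack/unpack in the spirit of lfstackPack/lfstackUnpack:
// the push counter travels with the pointer, so a CAS against a stale
// head fails even if the same node address was pushed again (ABA).
func pack(ptr, cnt uintptr) uint64 {
	return uint64(ptr)<<16 | uint64(cnt&(1<<16-1))
}

func unpack(v uint64) (ptr, cnt uintptr) {
	return uintptr(v >> 16), uintptr(v & (1<<16 - 1))
}

func main() {
	p, c := unpack(pack(0xc420001234, 7))
	fmt.Printf("%#x %d\n", p, c) // 0xc420001234 7
}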
}
}
marked:
- mp := acquirem()
- mp.ptrarg[0] = x
- onM(gcmarknewobject_m)
- releasem(mp)
+
+ // GCmarktermination allocates black.
+ // All slots hold nil so no scanning is needed.
+ // This may be racing with GC so do it atomically if there can be
+ // a race marking the bit.
+ if gcphase == _GCmarktermination {
++ systemstack(func() {
++ gcmarknewobject_m(uintptr(x))
++ })
+ }
+
if raceenabled {
racemalloc(x, size)
}
return x
}
- mp := acquirem()
- mp.ptrarg[0] = unsafe.Pointer(typ)
- onM(unrollgcprog_m)
- releasem(mp)
+func loadPtrMask(typ *_type) []uint8 {
+ var ptrmask *uint8
+ nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize
+ if typ.kind&kindGCProg != 0 {
+ masksize := nptr
+ if masksize%2 != 0 {
+ masksize *= 2 // repeated
+ }
+ masksize = masksize * pointersPerByte / 8 // 4 bits per word
+ masksize++ // unroll flag in the beginning
+ if masksize > maxGCMask && typ.gc[1] != 0 {
+ // write barriers have not been updated to deal with this case yet.
+ gothrow("maxGCMask too small for now")
+ }
+ ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0])))
+ // Check whether the program is already unrolled
+ // by checking if the unroll flag byte is set
+ maskword := uintptr(atomicloadp(unsafe.Pointer(ptrmask)))
+ if *(*uint8)(unsafe.Pointer(&maskword)) == 0 {
++ systemstack(func() {
++ unrollgcprog_m(typ)
++ })
+ }
+ ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
+ } else {
+ ptrmask = (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask
+ }
+ return (*[1 << 30]byte)(unsafe.Pointer(ptrmask))[:(nptr+1)/2]
+}
+
// implementation of new builtin
func newobject(typ *_type) unsafe.Pointer {
flags := uint32(0)
mp = acquirem()
mp.gcing = 1
releasem(mp)
- onM(stoptheworld)
- onM(finishsweep_m) // finish sweep before we start concurrent scan.
- if false { // To turn on concurrent scan and mark set to true...
- onM(starttheworld)
+
- onM(gcscan_m)
- onM(stoptheworld)
- onM(gcinstallmarkwb_m)
- onM(starttheworld)
- onM(gcmark_m)
- onM(stoptheworld)
- onM(gcinstalloffwb_m)
+ systemstack(stoptheworld)
++ systemstack(finishsweep_m) // finish sweep before we start concurrent scan.
++ if false { // To turn on concurrent scan and mark set to true...
++ systemstack(starttheworld)
+ // Do a concurrent heap scan before we stop the world.
++ systemstack(gcscan_m)
++ systemstack(stoptheworld)
++ systemstack(gcinstallmarkwb_m)
++ systemstack(starttheworld)
++ systemstack(gcmark_m)
++ systemstack(stoptheworld)
++ systemstack(gcinstalloffwb_m)
+ }
++
if mp != acquirem() {
gothrow("gogc: rescheduled")
}
if debug.gctrace > 1 {
n = 2
}
++ eagersweep := force >= 2
for i := 0; i < n; i++ {
if i > 0 {
startTime = nanotime()
}
// switch to g0, call gc, then switch back
- mp.scalararg[0] = uintptr(uint32(startTime)) // low 32 bits
- mp.scalararg[1] = uintptr(startTime >> 32) // high 32 bits
- if force >= 2 {
- mp.scalararg[2] = 1 // eagersweep
- } else {
- mp.scalararg[2] = 0
- }
- onM(gc_m)
- eagersweep := force >= 2
+ systemstack(func() {
+ gc_m(startTime, eagersweep)
+ })
}
- onM(gccheckmark_m)
++ systemstack(func() {
++ gccheckmark_m(startTime, eagersweep)
++ })
+
// all done
mp.gcing = 0
semrelease(&worldsema)
}
}
- onM(gccheckmarkenable_m)
+func GCcheckmarkenable() {
- onM(gccheckmarkdisable_m)
++ systemstack(gccheckmarkenable_m)
+}
+
+func GCcheckmarkdisable() {
++ systemstack(gccheckmarkdisable_m)
+}
+
// GC runs a garbage collection.
func GC() {
gogc(2)
--- /dev/null
- gcworkbuf unsafe.Pointer
-
+ // Copyright 2009 The Go Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style
+ // license that can be found in the LICENSE file.
+
+ package runtime
+
+ import "unsafe"
+
+ // Memory allocator, based on tcmalloc.
+ // http://goog-perftools.sourceforge.net/doc/tcmalloc.html
+
+ // The main allocator works in runs of pages.
+ // Small allocation sizes (up to and including 32 kB) are
+ // rounded to one of about 100 size classes, each of which
+ // has its own free list of objects of exactly that size.
+ // Any free page of memory can be split into a set of objects
+ // of one size class, which are then managed using free list
+ // allocators.
+ //
+ // The allocator's data structures are:
+ //
+ // FixAlloc: a free-list allocator for fixed-size objects,
+ // used to manage storage used by the allocator.
+ // MHeap: the malloc heap, managed at page (8192-byte) granularity.
+ // MSpan: a run of pages managed by the MHeap.
+ // MCentral: a shared free list for a given size class.
+ // MCache: a per-thread (in Go, per-P) cache for small objects.
+ // MStats: allocation statistics.
+ //
+ // Allocating a small object proceeds up a hierarchy of caches:
+ //
+ // 1. Round the size up to one of the small size classes
+ // and look in the corresponding MCache free list.
+ // If the list is not empty, allocate an object from it.
+ // This can all be done without acquiring a lock.
+ //
+ // 2. If the MCache free list is empty, replenish it by
+ // taking a bunch of objects from the MCentral free list.
+ // Moving a bunch amortizes the cost of acquiring the MCentral lock.
+ //
+ // 3. If the MCentral free list is empty, replenish it by
+ // allocating a run of pages from the MHeap and then
+ // chopping that memory into objects of the given size.
+ // Allocating many objects amortizes the cost of locking
+ // the heap.
+ //
+ // 4. If the MHeap is empty or has no page runs large enough,
+ // allocate a new group of pages (at least 1MB) from the
+ // operating system. Allocating a large run of pages
+ // amortizes the cost of talking to the operating system.
+ //
+ // Freeing a small object proceeds up the same hierarchy:
+ //
+ // 1. Look up the size class for the object and add it to
+ // the MCache free list.
+ //
+ // 2. If the MCache free list is too long or the MCache has
+ // too much memory, return some to the MCentral free lists.
+ //
+ // 3. If all the objects in a given span have returned to
+ // the MCentral list, return that span to the page heap.
+ //
+ // 4. If the heap has too much memory, return some to the
+ // operating system.
+ //
+ // TODO(rsc): Step 4 is not implemented.
+ //
+ // Allocating and freeing a large object uses the page heap
+ // directly, bypassing the MCache and MCentral free lists.
+ //
+ // The small objects on the MCache and MCentral free lists
+ // may or may not be zeroed. They are zeroed if and only if
+ // the second word of the object is zero. A span in the
+ // page heap is zeroed unless s->needzero is set. When a span
+ // is allocated to break into small objects, it is zeroed if needed
+ // and s->needzero is set. There are two main benefits to delaying the
+ // zeroing this way:
+ //
+ // 1. stack frames allocated from the small object lists
+ // or the page heap can avoid zeroing altogether.
+ // 2. the cost of zeroing when reusing a small object is
+ // charged to the mutator, not the garbage collector.
+ //
+ // This C code was written with an eye toward translating to Go
+ // in the future. Methods have the form Type_Method(Type *t, ...).
+
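A toy model of the refill path in steps 1-3 above (illustrative names only, not the runtime's data structures; each int stands in for one free object):

package main

import "fmt"

type heap struct{ pages int }

func (h *heap) grow() []int { // step 3: take pages, chop into objects
	h.pages--
	return make([]int, 8) // pretend one span yields 8 objects
}

type central struct{ list []int }

type cache struct{ list []int }

func (mc *cache) alloc(c *central, h *heap) int {
	if len(mc.list) == 0 { // step 2: take a batch from the central list,
		if len(c.list) == 0 { // amortizing the cost of its lock
			c.list = h.grow()
		}
		mc.list, c.list = c.list, nil
	}
	obj := mc.list[len(mc.list)-1] // step 1: lock-free pop from the cache
	mc.list = mc.list[:len(mc.list)-1]
	return obj
}

func main() {
	h := &heap{pages: 4}
	var ce central
	var mc cache
	for i := 0; i < 10; i++ {
		mc.alloc(&ce, h)
	}
	fmt.Println("pages used:", 4-h.pages) // 2
}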
+ const (
+ _PageShift = 13
+ _PageSize = 1 << _PageShift
+ _PageMask = _PageSize - 1
+ )
+
+ const (
+ // _64bit = 1 on 64-bit systems, 0 on 32-bit systems
+ _64bit = 1 << (^uintptr(0) >> 63) / 2
+
+ // Computed constant. The definition of MaxSmallSize and the
+ // algorithm in msize.c produce some number of different allocation
+ // size classes. NumSizeClasses is that number. It's needed here
+ // because there are static arrays of this length; when msize runs its
+ // size choosing algorithm it double-checks that NumSizeClasses agrees.
+ _NumSizeClasses = 67
+
+ // Tunable constants.
+ _MaxSmallSize = 32 << 10
+
+ // Tiny allocator parameters, see "Tiny allocator" comment in malloc.goc.
+ _TinySize = 16
+ _TinySizeClass = 2
+
+ _FixAllocChunk = 16 << 10 // Chunk size for FixAlloc
+ _MaxMHeapList = 1 << (20 - _PageShift) // Maximum page length for fixed-size list in MHeap.
+ _HeapAllocChunk = 1 << 20 // Chunk size for heap growth
+
+ // Per-P, per order stack segment cache size.
+ _StackCacheSize = 32 * 1024
+
+ // Number of orders that get caching. Order 0 is FixedStack
+ // and each successive order is twice as large.
+ _NumStackOrders = 3
+
+ // Number of bits in page to span calculations (8k pages).
+ // On Windows 64-bit we limit the arena to 32GB or 35 bits.
+ // Windows counts memory used by page table into committed memory
+ // of the process, so we can't reserve too much memory.
+ // See http://golang.org/issue/5402 and http://golang.org/issue/5236.
+ // On other 64-bit platforms, we limit the arena to 128GB, or 37 bits.
+ // On 32-bit, we don't bother limiting anything, so we use the full 32-bit address.
+ _MHeapMap_TotalBits = (_64bit*_Windows)*35 + (_64bit*(1-_Windows))*37 + (1-_64bit)*32
+ _MHeapMap_Bits = _MHeapMap_TotalBits - _PageShift
+
+ _MaxMem = uintptr(1<<_MHeapMap_TotalBits - 1)
+
+ // Max number of threads to run garbage collection.
+ // 2, 3, and 4 are all plausible maximums depending
+ // on the hardware details of the machine. The garbage
+ // collector scales well to 32 cpus.
+ _MaxGcproc = 32
+ )
+
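A back-of-the-envelope check of what these constants imply (plain arithmetic, not runtime API):

package main

import "fmt"

func main() {
	const pageShift = 13 // pages are 1<<13 = 8192 bytes
	// 64-bit non-Windows: 37 arena bits => 128GB; 64-bit Windows: 35 => 32GB.
	for _, totalBits := range []uint{37, 35} {
		arena := uint64(1) << totalBits
		fmt.Println(arena>>30, "GB arena,", arena>>pageShift, "pages to map")
	}
}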
+ // A generic linked list of blocks. (Typically the block is bigger than sizeof(MLink).)
+ type mlink struct {
+ next *mlink
+ }
+
+ // sysAlloc obtains a large chunk of zeroed memory from the
+ // operating system, typically on the order of a hundred kilobytes
+ // or a megabyte.
+ // NOTE: sysAlloc returns OS-aligned memory, but the heap allocator
+ // may use larger alignment, so the caller must be careful to realign the
+ // memory obtained by sysAlloc.
+ //
+ // SysUnused notifies the operating system that the contents
+ // of the memory region are no longer needed and can be reused
+ // for other purposes.
+ // SysUsed notifies the operating system that the contents
+ // of the memory region are needed again.
+ //
+ // SysFree returns it unconditionally; this is only used if
+ // an out-of-memory error has been detected midway through
+ // an allocation. It is okay if SysFree is a no-op.
+ //
+ // SysReserve reserves address space without allocating memory.
+ // If the pointer passed to it is non-nil, the caller wants the
+ // reservation there, but SysReserve can still choose another
+ // location if that one is unavailable. On some systems and in some
+ // cases SysReserve will simply check that the address space is
+ // available and not actually reserve it. If SysReserve returns
+ // non-nil, it sets *reserved to true if the address space is
+ // reserved, false if it has merely been checked.
+ // NOTE: SysReserve returns OS-aligned memory, but the heap allocator
+ // may use larger alignment, so the caller must be careful to realign the
+ // memory obtained by sysAlloc.
+ //
+ // SysMap maps previously reserved address space for use.
+ // The reserved argument is true if the address space was really
+ // reserved, not merely checked.
+ //
+ // SysFault marks a (already sysAlloc'd) region to fault
+ // if accessed. Used only for debugging the runtime.
+
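On Unix systems the reserve/map pair is conventionally built from mmap and mprotect. A Linux-oriented sketch of the same protocol (the general technique, not the runtime's exact implementation):

package main

import (
	"fmt"
	"syscall"
)

func main() {
	const size = 1 << 20
	// "Reserve": take address space with no access rights; the OS
	// commits no memory behind it yet.
	mem, err := syscall.Mmap(-1, 0, size,
		syscall.PROT_NONE, syscall.MAP_ANON|syscall.MAP_PRIVATE)
	if err != nil {
		panic(err)
	}
	// "Map for use": make the reserved range readable and writable.
	if err := syscall.Mprotect(mem, syscall.PROT_READ|syscall.PROT_WRITE); err != nil {
		panic(err)
	}
	mem[0] = 42
	fmt.Println("committed", size, "bytes; first byte =", mem[0])
}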
+ // FixAlloc is a simple free-list allocator for fixed size objects.
+ // Malloc uses a FixAlloc wrapped around sysAlloc to manage its
+ // MCache and MSpan objects.
+ //
+ // Memory returned by FixAlloc_Alloc is not zeroed.
+ // The caller is responsible for locking around FixAlloc calls.
+ // Callers can keep state in the object but the first word is
+ // smashed by freeing and reallocating.
+ type fixalloc struct {
+ size uintptr
+ first unsafe.Pointer // go func(unsafe.pointer, unsafe.pointer); f(arg, p) called first time p is returned
+ arg unsafe.Pointer
+ list *mlink
+ chunk *byte
+ nchunk uint32
+ inuse uintptr // in-use bytes now
+ stat *uint64
+ }
+
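The same idea in miniature (a sketch only; the real fixalloc carves new blocks from OS-provided chunks rather than calling new):

package main

import "fmt"

// A block's first word doubles as the free-list link, which is why the
// comment above warns that the first word is smashed by freeing.
type block struct {
	next *block
	data [56]byte
}

type fixAlloc struct{ free *block }

func (f *fixAlloc) alloc() *block {
	if f.free == nil {
		return new(block) // stand-in for carving from a fresh chunk
	}
	b := f.free
	f.free, b.next = b.next, nil
	return b
}

func (f *fixAlloc) release(b *block) {
	b.next, f.free = f.free, b // smashes the first word of b
}

func main() {
	var f fixAlloc
	b := f.alloc()
	f.release(b)
	fmt.Println(f.alloc() == b) // true: recycled from the free list
}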
+ // Statistics.
+ // Shared with Go: if you edit this structure, also edit type MemStats in mem.go.
+ type mstats struct {
+ // General statistics.
+ alloc uint64 // bytes allocated and still in use
+ total_alloc uint64 // bytes allocated (even if freed)
+ sys uint64 // bytes obtained from system (should be sum of xxx_sys below, no locking, approximate)
+ nlookup uint64 // number of pointer lookups
+ nmalloc uint64 // number of mallocs
+ nfree uint64 // number of frees
+
+ // Statistics about malloc heap.
+ // protected by mheap.lock
+ heap_alloc uint64 // bytes allocated and still in use
+ heap_sys uint64 // bytes obtained from system
+ heap_idle uint64 // bytes in idle spans
+ heap_inuse uint64 // bytes in non-idle spans
+ heap_released uint64 // bytes released to the os
+ heap_objects uint64 // total number of allocated objects
+
+ // Statistics about allocation of low-level fixed-size structures.
+ // Protected by FixAlloc locks.
+ stacks_inuse uint64 // this number is included in heap_inuse above
+ stacks_sys uint64 // always 0 in mstats
+ mspan_inuse uint64 // mspan structures
+ mspan_sys uint64
+ mcache_inuse uint64 // mcache structures
+ mcache_sys uint64
+ buckhash_sys uint64 // profiling bucket hash table
+ gc_sys uint64
+ other_sys uint64
+
+ // Statistics about garbage collector.
+ // Protected by mheap or stopping the world during GC.
+ next_gc uint64 // next gc (in heap_alloc time)
+ last_gc uint64 // last gc (in absolute time)
+ pause_total_ns uint64
+ pause_ns [256]uint64 // circular buffer of recent gc pause lengths
+ pause_end [256]uint64 // circular buffer of recent gc end times (nanoseconds since 1970)
+ numgc uint32
+ enablegc bool
+ debuggc bool
+
+ // Statistics about allocation size classes.
+
+ by_size [_NumSizeClasses]struct {
+ size uint32
+ nmalloc uint64
+ nfree uint64
+ }
+
+ tinyallocs uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly
+ }
+
+ var memstats mstats
+
+ // Size classes. Computed and initialized by InitSizes.
+ //
+ // SizeToClass(0 <= n <= MaxSmallSize) returns the size class,
+ // 1 <= sizeclass < NumSizeClasses, for n.
+ // Size class 0 is reserved to mean "not small".
+ //
+ // class_to_size[i] = largest size in class i
+ // class_to_allocnpages[i] = number of pages to allocate when
+ // making new objects in class i
+
+ var class_to_size [_NumSizeClasses]int32
+ var class_to_allocnpages [_NumSizeClasses]int32
+ var size_to_class8 [1024/8 + 1]int8
+ var size_to_class128 [(_MaxSmallSize-1024)/128 + 1]int8
+
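The lookup these tables support, sketched: 8-byte steps up to 1024 bytes, 128-byte steps above (here the tables are left zeroed, whereas InitSizes fills them in):

package main

import "fmt"

const maxSmallSize = 32 << 10

var sizeToClass8 [1024/8 + 1]int8
var sizeToClass128 [(maxSmallSize-1024)/128 + 1]int8

// sizeToClass mirrors the two-level indexing scheme: small sizes index
// an 8-byte-granularity table, larger ones a 128-byte-granularity table.
func sizeToClass(size int) int8 {
	if size > maxSmallSize {
		return 0 // class 0 means "not small"
	}
	if size > 1024 {
		return sizeToClass128[(size-1024+127)>>7]
	}
	return sizeToClass8[(size+7)>>3]
}

func main() {
	fmt.Println(sizeToClass(8), sizeToClass(1024), sizeToClass(1025))
}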
+ type mcachelist struct {
+ list *mlink
+ nlist uint32
+ }
+
+ type stackfreelist struct {
+ list *mlink // linked list of free stacks
+ size uintptr // total size of stacks in list
+ }
+
+ // Per-thread (in Go, per-P) cache for small objects.
+ // No locking needed because it is per-thread (per-P).
+ type mcache struct {
+ // The following members are accessed on every malloc,
+ // so they are grouped here for better caching.
+ next_sample int32 // trigger heap sample after allocating this many bytes
+ local_cachealloc intptr // bytes allocated (or freed) from cache since last lock of heap
+ // Allocator cache for tiny objects w/o pointers.
+ // See "Tiny allocator" comment in malloc.goc.
+ tiny *byte
+ tinysize uintptr
+ local_tinyallocs uintptr // number of tiny allocs not counted in other stats
+
+ // The rest is not accessed on every malloc.
+ alloc [_NumSizeClasses]*mspan // spans to allocate from
+
+ stackcache [_NumStackOrders]stackfreelist
+
+ sudogcache *sudog
+
+ // Local allocator stats, flushed during GC.
+ local_nlookup uintptr // number of pointer lookups
+ local_largefree uintptr // bytes freed for large objects (>maxsmallsize)
+ local_nlargefree uintptr // number of frees for large objects (>maxsmallsize)
+ local_nsmallfree [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize)
+ }
+
+ const (
+ _KindSpecialFinalizer = 1
+ _KindSpecialProfile = 2
+ // Note: The finalizer special must be first because if we're freeing
+ // an object, a finalizer special will cause the freeing operation
+ // to abort, and we want to keep the other special records around
+ // if that happens.
+ )
+
+ type special struct {
+ next *special // linked list in span
+ offset uint16 // span offset of object
+ kind byte // kind of special
+ }
+
+ // The described object has a finalizer set for it.
+ type specialfinalizer struct {
+ special special
+ fn *funcval
+ nret uintptr
+ fint *_type
+ ot *ptrtype
+ }
+
+ // The described object is being heap profiled.
+ type specialprofile struct {
+ special special
+ b *bucket
+ }
+
+ // An MSpan is a run of pages.
+ const (
+ _MSpanInUse = iota // allocated for garbage collected heap
+ _MSpanStack // allocated for use by stack allocator
+ _MSpanFree
+ _MSpanListHead
+ _MSpanDead
+ )
+
+ type mspan struct {
+ next *mspan // in a span linked list
+ prev *mspan // in a span linked list
+ start pageID // starting page number
+ npages uintptr // number of pages in span
+ freelist *mlink // list of free objects
+ // sweep generation:
+ // if sweepgen == h->sweepgen - 2, the span needs sweeping
+ // if sweepgen == h->sweepgen - 1, the span is currently being swept
+ // if sweepgen == h->sweepgen, the span is swept and ready to use
+ // h->sweepgen is incremented by 2 after every GC
+ sweepgen uint32
+ ref uint16 // capacity - number of objects in freelist
+ sizeclass uint8 // size class
+ incache bool // being used by an mcache
+ state uint8 // mspaninuse etc
+ needzero uint8 // needs to be zeroed before allocation
+ elemsize uintptr // computed from sizeclass or from npages
+ unusedsince int64 // first time spotted by gc in mspanfree state
+ npreleased uintptr // number of pages released to the os
+ limit uintptr // end of data in span
+ speciallock mutex // guards specials list
+ specials *special // linked list of special records sorted by offset.
+ }
+
+ // Every MSpan is in one doubly-linked list,
+ // either one of the MHeap's free lists or one of the
+ // MCentral's span lists. We use empty MSpan structures as list heads.
+
+ // Central list of free objects of a given size.
+ type mcentral struct {
+ lock mutex
+ sizeclass int32
+ nonempty mspan // list of spans with a free object
+ empty mspan // list of spans with no free objects (or cached in an mcache)
+ }
+
+ // Main malloc heap.
+ // The heap itself is the "free[]" and "large" arrays,
+ // but all the other global data is here too.
+ type mheap struct {
+ lock mutex
+ free [_MaxMHeapList]mspan // free lists of given length
+ freelarge mspan // free lists length >= _MaxMHeapList
+ busy [_MaxMHeapList]mspan // busy lists of large objects of given length
+ busylarge mspan // busy lists of large objects length >= _MaxMHeapList
+ allspans **mspan // all spans out there
+ gcspans **mspan // copy of allspans referenced by gc marker or sweeper
+ nspan uint32
+ sweepgen uint32 // sweep generation, see comment in mspan
+ sweepdone uint32 // all spans are swept
+
+ // span lookup
+ spans **mspan
+ spans_mapped uintptr
+
+ // range of addresses we might see in the heap
+ bitmap uintptr
+ bitmap_mapped uintptr
+ arena_start uintptr
+ arena_used uintptr
+ arena_end uintptr
+ arena_reserved bool
+
+ // central free lists for small size classes.
+ // the padding makes sure that the MCentrals are
+ // spaced CacheLineSize bytes apart, so that each MCentral.lock
+ // gets its own cache line.
+ central [_NumSizeClasses]struct {
+ mcentral mcentral
+ pad [_CacheLineSize]byte
+ }
+
+ spanalloc fixalloc // allocator for span*
+ cachealloc fixalloc // allocator for mcache*
+ specialfinalizeralloc fixalloc // allocator for specialfinalizer*
+ specialprofilealloc fixalloc // allocator for specialprofile*
+ speciallock mutex // lock for special record allocators.
+
+ // Malloc stats.
+ largefree uint64 // bytes freed for large objects (>maxsmallsize)
+ nlargefree uint64 // number of frees for large objects (>maxsmallsize)
+ nsmallfree [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize)
+ }
+
+ var mheap_ mheap
+
+ const (
+ // flags to malloc
+ _FlagNoScan = 1 << 0 // GC doesn't have to scan object
+ _FlagNoZero = 1 << 1 // don't zero memory
+ )
+
+ // NOTE: Layout known to queuefinalizer.
+ type finalizer struct {
+ fn *funcval // function to call
+ arg unsafe.Pointer // ptr to object
+ nret uintptr // bytes of return values from fn
+ fint *_type // type of first argument of fn
+ ot *ptrtype // type of ptr to object
+ }
+
+ type finblock struct {
+ alllink *finblock
+ next *finblock
+ cnt int32
+ cap int32
+ fin [1]finalizer
+ }
+
+ // Information from the compiler about the layout of stack frames.
+ type bitvector struct {
+ n int32 // # of bits
+ bytedata *uint8
+ }
+
+ type stackmap struct {
+ n int32 // number of bitmaps
+ nbit int32 // number of bits in each bitmap
+ bytedata [0]byte // bitmaps, each starting on a 32-bit boundary
+ }
+
+ // Returns pointer map data for the given stackmap index
+ // (the index is encoded in PCDATA_StackMapIndex).
+
+ // defined in mgc0.go
--- /dev/null
- gcworkbuffree(c.gcworkbuf)
+ // Copyright 2009 The Go Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style
+ // license that can be found in the LICENSE file.
+
+ // Per-P malloc cache for small objects.
+ //
+ // See malloc.h for an overview.
+
+ package runtime
+
+ import "unsafe"
+
+ // dummy MSpan that contains no free objects.
+ var emptymspan mspan
+
+ func allocmcache() *mcache {
+ lock(&mheap_.lock)
+ c := (*mcache)(fixAlloc_Alloc(&mheap_.cachealloc))
+ unlock(&mheap_.lock)
+ memclr(unsafe.Pointer(c), unsafe.Sizeof(*c))
+ for i := 0; i < _NumSizeClasses; i++ {
+ c.alloc[i] = &emptymspan
+ }
+
+ // Set first allocation sample size.
+ rate := MemProfileRate
+ if rate > 0x3fffffff { // make 2*rate not overflow
+ rate = 0x3fffffff
+ }
+ if rate != 0 {
+ c.next_sample = int32(int(fastrand1()) % (2 * rate))
+ }
+
+ return c
+ }
+
+ func freemcache(c *mcache) {
+ systemstack(func() {
+ mCache_ReleaseAll(c)
+ stackcache_clear(c)
++
++ // NOTE(rsc,rlh): If gcworkbuffree comes back, we need to coordinate
++ // with the stealing of gcworkbufs during garbage collection to avoid
++ // a race where the workbuf is double-freed.
++ // gcworkbuffree(c.gcworkbuf)
++
+ lock(&mheap_.lock)
+ purgecachedstats(c)
+ fixAlloc_Free(&mheap_.cachealloc, unsafe.Pointer(c))
+ unlock(&mheap_.lock)
+ })
+ }
+
+ // Gets a span that has a free object in it and assigns it
+ // to be the cached span for the given sizeclass. Returns this span.
+ func mCache_Refill(c *mcache, sizeclass int32) *mspan {
+ _g_ := getg()
+
+ _g_.m.locks++
+ // Return the current cached span to the central lists.
+ s := c.alloc[sizeclass]
+ if s.freelist != nil {
+ gothrow("refill on a nonempty span")
+ }
+ if s != &emptymspan {
+ s.incache = false
+ }
+
+ // Get a new cached span from the central lists.
+ s = mCentral_CacheSpan(&mheap_.central[sizeclass].mcentral)
+ if s == nil {
+ gothrow("out of memory")
+ }
+ if s.freelist == nil {
+ println(s.ref, (s.npages<<_PageShift)/s.elemsize)
+ gothrow("empty span")
+ }
+ c.alloc[sizeclass] = s
+ _g_.m.locks--
+ return s
+ }
+
+ func mCache_ReleaseAll(c *mcache) {
+ for i := 0; i < _NumSizeClasses; i++ {
+ s := c.alloc[i]
+ if s != &emptymspan {
+ mCentral_UncacheSpan(&mheap_.central[i].mcentral, s)
+ c.alloc[i] = &emptymspan
+ }
+ }
+ }
--- /dev/null
-// GC is:
-// - mark&sweep
-// - mostly precise (with the exception of some C-allocated objects, assembly frames/arguments, etc)
-// - parallel (up to MaxGcproc threads)
-// - partially concurrent (mark is stop-the-world, while sweep is concurrent)
-// - non-moving/non-compacting
-// - full (non-partial)
+ // Copyright 2009 The Go Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style
+ // license that can be found in the LICENSE file.
+
+ // TODO(rsc): The code having to do with the heap bitmap needs very serious cleanup.
+ // It has gotten completely out of control.
+
+ // Garbage collector (GC).
+ //
-// GC rate.
-// Next GC is after we've allocated an extra amount of memory proportional to
-// the amount already in use. The proportion is controlled by GOGC environment variable
-// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
-// (this mark is tracked in next_gc variable). This keeps the GC cost in linear
-// proportion to the allocation cost. Adjusting GOGC just changes the linear constant
-// (and also the amount of extra memory used).
++// The GC runs concurrently with mutator threads, is type accurate (aka precise), allows multiple GC
++// threads to run in parallel. It is a concurrent mark and sweep that uses a write barrier. It is
++// non-generational and non-compacting. Allocation is done using size segregated per P allocation
++// areas to minimize fragmentation while eliminating locks in the common case.
+ //
-// Initialized from $GOGC. GOGC=off means no gc.
++// The algorithm decomposes into several steps.
++// This is a high level description of the algorithm being used. For an overview of GC a good
++// place to start is Richard Jones' gchandbook.org.
++//
++// The algorithm's intellectual heritage includes Dijkstra's on-the-fly algorithm, see
++// Edsger W. Dijkstra, Leslie Lamport, A. J. Martin, C. S. Scholten, and E. F. M. Steffens. 1978.
++// On-the-fly garbage collection: an exercise in cooperation. Commun. ACM 21, 11 (November 1978), 966-975.
++// For journal quality proofs that these steps are complete, correct, and terminate see
++// Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world.
++// Concurrency and Computation: Practice and Experience 15(3-5), 2003.
+ //
++// 0. Set phase = GCscan from GCoff.
++// 1. Wait for all P's to acknowledge phase change.
++// At this point all goroutines have passed through a GC safepoint and
++// know we are in the GCscan phase.
++// 2. GC scans all goroutine stacks, marks and enqueues all encountered pointers
++// (marking avoids most duplicate enqueuing but races may produce duplication which is benign).
++// Preempted goroutines are scanned before P schedules next goroutine.
++// 3. Set phase = GCmark.
++// 4. Wait for all P's to acknowledge phase change.
++// 5. Now write barrier marks and enqueues black, grey, or white to white pointers.
++// Malloc still allocates white (non-marked) objects.
++// 6. Meanwhile GC transitively walks the heap marking reachable objects.
++// 7. When GC finishes marking heap, it preempts P's one-by-one and
++// retakes partial wbufs (filled by write barrier or during a stack scan of the goroutine
++// currently scheduled on the P).
++// 8. Once the GC has exhausted all available marking work it sets phase = marktermination.
++// 9. Wait for all P's to acknowledge phase change.
++// 10. Malloc now allocates black objects, so number of unmarked reachable objects
++// monotonically decreases.
++// 11. GC preempts P's one-by-one taking partial wbufs and marks all unmarked yet reachable objects.
++// 12. When GC completes a full cycle over P's and discovers no new grey
++// objects (which means all reachable objects are marked), set phase = GCsweep.
++// 13. Wait for all P's to acknowledge phase change.
++// 14. Now malloc allocates white (but sweeps spans before use).
++// Write barrier becomes nop.
++// 15. GC does background sweeping, see description below.
++// 16. When sweeping is complete set phase to GCoff.
++// 17. When sufficient allocation has taken place replay the sequence starting at 0 above,
++// see discussion of GC rate below.
++
++// Changing phases.
++// Phases are changed by setting the gcphase to the next phase and possibly calling ackgcphase.
++// All phase action must be benign in the presence of a change.
++// Starting with GCoff
++// GCoff to GCscan
++// GCscan scans stacks and globals, greying them, and never marks an object black.
++// Once all the P's are aware of the new phase they will scan gs on preemption.
++// This means that the scanning of preempted gs can't start until all the Ps
++// have acknowledged.
++// GCscan to GCmark
++// GCMark turns on the write barrier which also only greys objects. No scanning
++// of objects (making them black) can happen until all the Ps have acknowledged
++// the phase change.
++// GCmark to GCmarktermination
++// The only change here is that we start allocating black so the Ps must acknowledge
++// the change before we begin the termination algorithm.
++// GCmarktermination to GCsweep
++// Objects currently on the freelist must be marked black for this to work.
++// Are things on the free lists black or white? How does the sweep phase work?
++
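A schematic of that handshake, with channels standing in for the per-P acknowledgement (hypothetical structure; the runtime coordinates through gcphase and per-P state, not channels):

package main

import "fmt"

const (
	gcOff = iota
	gcScan
	gcMark
	gcMarkTermination
	gcSweep
)

// setPhase publishes the new phase; each "P" acknowledges it at its
// next safepoint before the collector proceeds.
func setPhase(phase int, ps []chan int, acks chan struct{}) {
	for _, p := range ps {
		p <- phase
	}
	for range ps {
		<-acks // wait for every P to acknowledge
	}
}

func main() {
	acks := make(chan struct{})
	ps := make([]chan int, 2)
	for i := range ps {
		ps[i] = make(chan int)
		go func(p chan int) {
			for range p { // a real P would rescan stacks, enable the wb, etc.
				acks <- struct{}{}
			}
		}(ps[i])
	}
	for _, phase := range []int{gcScan, gcMark, gcMarkTermination, gcSweep, gcOff} {
		setPhase(phase, ps, acks)
	}
	fmt.Println("completed one cycle of phase transitions")
}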
+ // Concurrent sweep.
+ // The sweep phase proceeds concurrently with normal program execution.
+ // The heap is swept span-by-span both lazily (when a goroutine needs another span)
+ // and concurrently in a background goroutine (this helps programs that are not CPU bound).
+ // However, at the end of the stop-the-world GC phase we don't know the size of the live heap,
+ // and so next_gc calculation is tricky and happens as follows.
+ // At the end of the stop-the-world phase next_gc is conservatively set based on total
+ // heap size; all spans are marked as "needs sweeping".
+ // Whenever a span is swept, next_gc is decremented by GOGC*newly_freed_memory.
+ // The background sweeper goroutine simply sweeps spans one-by-one bringing next_gc
+ // closer to the target value. However, this is not enough to avoid over-allocating memory.
+ // Consider that a goroutine wants to allocate a new span for a large object and
+ // there are no free swept spans, but there are small-object unswept spans.
+ // If the goroutine naively allocates a new span, it can surpass the yet-unknown
+ // target next_gc value. In order to prevent such cases (1) when a goroutine needs
+ // to allocate a new small-object span, it sweeps small-object spans for the same
+ // object size until it frees at least one object; (2) when a goroutine needs to
+ // allocate large-object span from heap, it sweeps spans until it frees at least
+ // that many pages into heap. Together these two measures ensure that we don't surpass
+ // target next_gc value by a large margin. There is an exception: if a goroutine sweeps
+ // and frees two nonadjacent one-page spans to the heap, it will allocate a new two-page span,
+ // but there can still be other one-page unswept spans which could be combined into a two-page span.
+ // It's critical to ensure that no operations proceed on unswept spans (that would corrupt
+ // mark bits in GC bitmap). During GC all mcaches are flushed into the central cache,
+ // so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
+ // When a goroutine explicitly frees an object or sets a finalizer, it ensures that
+ // the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
+ // The finalizer goroutine is kicked off only when all spans are swept.
+ // When the next GC starts, it sweeps all not-yet-swept spans (if any).
+
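Rule (1) sketched on toy types (illustrative only): before growing the heap for a small-object span, sweep same-class spans until something is freed:

package main

import "fmt"

type span struct {
	swept    bool
	reclaims int // objects a sweep of this span would free
}

// allocSpan models rule (1): sweep unswept spans of the same size class
// until at least one object comes free; only then grow the heap.
func allocSpan(sameClass []*span) string {
	for _, s := range sameClass {
		if s.swept {
			continue
		}
		s.swept = true
		if s.reclaims > 0 {
			return "freed enough; safe to allocate without overshooting next_gc"
		}
	}
	return "took a new span from the heap"
}

func main() {
	fmt.Println(allocSpan([]*span{{reclaims: 0}, {reclaims: 3}}))
}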
++// GC rate.
++// Next GC is after we've allocated an extra amount of memory proportional to
++// the amount already in use. The proportion is controlled by GOGC environment variable
++// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
++// (this mark is tracked in next_gc variable). This keeps the GC cost in linear
++// proportion to the allocation cost. Adjusting GOGC just changes the linear constant
++// (and also the amount of extra memory used).
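
The trigger as a formula (a sketch of the proportional rule only, ignoring the per-sweep next_gc adjustments described above):

package main

import "fmt"

// With GOGC=100, next_gc = live + live*100/100 = 2*live:
// a 4MB live heap schedules the next GC at 8MB.
func nextGC(liveHeap, gogc uint64) uint64 {
	return liveHeap + liveHeap*gogc/100
}

func main() {
	fmt.Println(nextGC(4<<20, 100)>>20, "MB") // 8 MB
}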
++
+ package runtime
+
+ import "unsafe"
+
+ const (
+ _DebugGC = 0
+ _DebugGCPtrs = false // if true, print trace of every pointer load during GC
+ _ConcurrentSweep = true
+
+ _WorkbufSize = 4 * 1024
+ _FinBlockSize = 4 * 1024
+ _RootData = 0
+ _RootBss = 1
+ _RootFinalizers = 2
+ _RootSpans = 3
+ _RootFlushCaches = 4
+ _RootCount = 5
+ )
+
+ // ptrmask for an allocation containing a single pointer.
+ var oneptr = [...]uint8{bitsPointer}
+
-// scanblock scans a block of n bytes starting at pointer b for references
-// to other objects, scanning any it finds recursively until there are no
-// unscanned objects left. Instead of using an explicit recursion, it keeps
-// a work list in the Workbuf* structures and loops in the main function
-// body. Keeping an explicit work list is easier on the stack allocator and
-// more efficient.
-func scanblock(b, n uintptr, ptrmask *uint8) {
- // Cache memory arena parameters in local vars.
- arena_start := mheap_.arena_start
- arena_used := mheap_.arena_used
-
- wbuf := getempty(nil)
- nobj := wbuf.nobj
- wp := &wbuf.obj[nobj]
- keepworking := b == 0
++// Initialized from $GOGC. GOGC=off means no GC.
+ var gcpercent int32
+
+ // Holding worldsema grants an M the right to try to stop the world.
+ // The procedure is:
+ //
+ // semacquire(&worldsema);
+ // m.gcing = 1;
+ // stoptheworld();
+ //
+ // ... do stuff ...
+ //
+ // m.gcing = 0;
+ // semrelease(&worldsema);
+ // starttheworld();
+ //
+ var worldsema uint32 = 1
+
++// It is a bug if bits does not have bitBoundary set, but
++// there are still some cases where this happens related
++// to stack spans.
++type markbits struct {
++ bitp *byte // pointer to the byte holding xbits
++ shift uintptr // bits xbits needs to be shifted to get bits
++ xbits byte // byte holding all the bits from *bitp
++ bits byte // mark and boundary bits relevant to corresponding slot.
++ tbits byte // pointer||scalar bits relevant to corresponding slot.
++}
++
+ type workbuf struct {
+ node lfnode // must be first
+ nobj uintptr
+ obj [(_WorkbufSize - unsafe.Sizeof(lfnode{}) - ptrSize) / ptrSize]uintptr
+ }
+
+ var data, edata, bss, ebss, gcdata, gcbss struct{}
+
+ var finlock mutex // protects the following variables
+ var fing *g // goroutine that runs finalizers
+ var finq *finblock // list of finalizers that are to be executed
+ var finc *finblock // cache of free blocks
+ var finptrmask [_FinBlockSize / ptrSize / pointersPerByte]byte
+ var fingwait bool
+ var fingwake bool
+ var allfin *finblock // list of all blocks
+
+ var gcdatamask bitvector
+ var gcbssmask bitvector
+
+ var gclock mutex
+
+ var badblock [1024]uintptr
+ var nbadblock int32
+
+ type workdata struct {
+ full uint64 // lock-free list of full blocks
+ empty uint64 // lock-free list of empty blocks
++ partial uint64 // lock-free list of partially filled blocks
+ pad0 [_CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait
+ nproc uint32
+ tstart int64
+ nwait uint32
+ ndone uint32
+ alldone note
+ markfor *parfor
+
+ // Copy of mheap.allspans for marker or sweeper.
+ spans []*mspan
+ }
+
+ var work workdata
+
+ //go:linkname weak_cgo_allocate go.weak.runtime._cgo_allocate_internal
+ var weak_cgo_allocate byte
+
+ // Is _cgo_allocate linked into the binary?
+ func have_cgo_allocate() bool {
+ return &weak_cgo_allocate != nil
+ }
+
- var ptrbitp unsafe.Pointer
++// To help debug the concurrent GC we remark with the world
++// stopped ensuring that any object encountered has its normal
++// mark bit set. To do this we use an orthogonal bit
++// pattern to indicate the object is marked. The following pattern
++// uses the upper two bits in the object's boundary nibble.
++// 01: scalar not marked
++// 10: pointer not marked
++// 11: pointer marked
++// 00: scalar marked
++// Xoring with 01 will flip the pattern from marked to unmarked and vice versa.
++// The higher bit is 1 for pointers and 0 for scalars, whether the object
++// is marked or not.
++// The first nibble no longer holds the bitsDead pattern indicating that
++// there are no more pointers in the object. This information is held
++// in the second nibble.
++
++// When marking an object, if the bool checkmark is true one uses the above
++// encoding; otherwise one uses the bitMarked bit in the lower two bits
++// of the nibble.
++var (
++ checkmark = false
++ gccheckmarkenable = true
++)
+
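The pattern table above can be checked directly: xoring with the 01 mask flips marked and unmarked while preserving the pointer/scalar high bit (constant values inferred from the table, shown as a standalone sketch):

package main

import "fmt"

const (
	bitsScalarMarked  = 0 // 00: scalar, marked
	bitsScalar        = 1 // 01: scalar, not marked
	bitsPointer       = 2 // 10: pointer, not marked
	bitsPointerMarked = 3 // 11: pointer, marked
	bitsCheckMarkXor  = 1 // xor with 01 flips marked <-> unmarked
)

func main() {
	fmt.Println(bitsScalar^bitsCheckMarkXor == bitsScalarMarked)   // true
	fmt.Println(bitsPointer^bitsCheckMarkXor == bitsPointerMarked) // true
	// The high bit (pointer vs scalar) is unchanged by the flip.
	fmt.Println(bitsPointerMarked&2 == bitsPointer&2) // true
}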
- // ptrmask can have 2 possible values:
- // 1. nil - obtain pointer mask from GC bitmap.
- // 2. pointer to a compact mask (for stacks and data).
- goto_scanobj := b != 0
++// Is address b in the known heap? If it doesn't have a valid gcmap
++// it returns false. For example, pointers into stacks will return false.
++func inheap(b uintptr) bool {
++ if b == 0 || b < mheap_.arena_start || b >= mheap_.arena_used {
++ return false
++ }
++ // Not a beginning of a block, consult span table to find the block beginning.
++ k := b >> _PageShift
++ x := k
++ x -= mheap_.arena_start >> _PageShift
++ s := h_spans[x]
++ if s == nil || pageID(k) < s.start || b >= s.limit || s.state != mSpanInUse {
++ return false
++ }
++ return true
++}
+
- if goto_scanobj {
- goto_scanobj = false
- } else {
- if nobj == 0 {
- // Out of work in workbuf.
- if !keepworking {
- putempty(wbuf)
- return
- }
++// Given an address in the heap return the relevant byte from the gcmap. This routine
++// can be used on addresses to the start of an object or to the interior of an object.
++func slottombits(obj uintptr, mbits *markbits) {
++ off := (obj&^(ptrSize-1) - mheap_.arena_start) / ptrSize
++ mbits.bitp = (*byte)(unsafe.Pointer(mheap_.arena_start - off/wordsPerBitmapByte - 1))
++ mbits.shift = off % wordsPerBitmapByte * gcBits
++ mbits.xbits = *mbits.bitp
++ mbits.bits = (mbits.xbits >> mbits.shift) & bitMask
++ mbits.tbits = ((mbits.xbits >> mbits.shift) & bitPtrMask) >> 2
++}
+
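The address arithmetic above in isolation (constants as defined for this heap layout: two words per bitmap byte, with the bitmap growing down from arena_start):

package main

import "fmt"

const (
	ptrSize            = 8
	wordsPerBitmapByte = 2
	gcBits             = 4
)

// bitAddr mirrors slottombits: the bitmap sits just below arena_start
// and grows downward, one nibble of bits per heap word.
func bitAddr(obj, arenaStart uintptr) (bitp, shift uintptr) {
	off := (obj&^(ptrSize-1) - arenaStart) / ptrSize
	return arenaStart - off/wordsPerBitmapByte - 1, off % wordsPerBitmapByte * gcBits
}

func main() {
	const arenaStart = 0x10000
	for _, obj := range []uintptr{arenaStart, arenaStart + 8, arenaStart + 16} {
		bitp, shift := bitAddr(obj, arenaStart)
		fmt.Printf("obj %#x -> bitmap byte %#x, shift %d\n", obj, bitp, shift)
	}
}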
++// b is a pointer into the heap.
++// Find the start of the object referred to by b.
++// Set mbits to the associated bits from the bit map.
++// If b is not a valid heap object return nil and
++// undefined values in mbits.
++func objectstart(b uintptr, mbits *markbits) uintptr {
++ obj := b &^ (ptrSize - 1)
+ for {
- // Refill workbuf from global queue.
- wbuf = getfull(wbuf)
- if wbuf == nil {
- return
- }
- nobj = wbuf.nobj
- if nobj < uintptr(len(wbuf.obj)) {
- wp = &wbuf.obj[nobj]
- } else {
- wp = nil
- }
++ slottombits(obj, mbits)
++ if mbits.bits&bitBoundary == bitBoundary {
++ break
++ }
+
- // If another proc wants a pointer, give it some.
- if work.nwait > 0 && nobj > 4 && work.full == 0 {
- wbuf.nobj = nobj
- wbuf = handoff(wbuf)
- nobj = wbuf.nobj
- if nobj < uintptr(len(wbuf.obj)) {
- wp = &wbuf.obj[nobj]
++ // Not a beginning of a block, consult span table to find the block beginning.
++ k := b >> _PageShift
++ x := k
++ x -= mheap_.arena_start >> _PageShift
++ s := h_spans[x]
++ if s == nil || pageID(k) < s.start || b >= s.limit || s.state != mSpanInUse {
++ if s != nil && s.state == _MSpanStack {
++ return 0 // This is legit.
+ }
+
- wp = nil
++ // The following ensures that we are rigorous about what data
++ // structures hold valid pointers.
++ if false {
++ // Still happens sometimes. We don't know why.
++ printlock()
++ print("runtime:objectstart Span weird: obj=", hex(obj), " k=", hex(k))
++ if s == nil {
++ print(" s=nil\n")
+ } else {
-
- nobj--
- wp = &wbuf.obj[nobj]
- b = *wp
- n = arena_used - uintptr(b)
- ptrmask = nil // use GC bitmap for pointer info
++ print(" s.start=", hex(s.start<<_PageShift), " s.limit=", hex(s.limit), " s.state=", s.state, "\n")
+ }
++ printunlock()
++ gothrow("objectstart: bad pointer in unexpected span")
+ }
- if _DebugGCPtrs {
- print("scanblock ", b, " +", hex(n), " ", ptrmask, "\n")
++ return 0
+ }
+
-
- // Find bits of the beginning of the object.
- if ptrmask == nil {
- off := (uintptr(b) - arena_start) / ptrSize
- ptrbitp = unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1)
++ p := uintptr(s.start) << _PageShift
++ if s.sizeclass != 0 {
++ size := s.elemsize
++ idx := (obj - p) / size
++ p = p + idx*size
+ }
- var i uintptr
- for i = 0; i < n; i += ptrSize {
- // Find bits for this word.
- var bits uintptr
- if ptrmask == nil {
- // Check if we have reached end of span.
- if (uintptr(b)+i)%_PageSize == 0 &&
- h_spans[(uintptr(b)-arena_start)>>_PageShift] != h_spans[(uintptr(b)+i-arena_start)>>_PageShift] {
- break
- }
++ if p == obj {
++ print("runtime: failed to find block beginning for ", hex(p), " s=", hex(s.start*_PageSize), " s.limit=", s.limit, "\n")
++ gothrow("failed to find block beginning")
+ }
++ obj = p
++ }
+
- // Consult GC bitmap.
- bits = uintptr(*(*byte)(ptrbitp))
++ // If size(obj.firstfield) < ptrSize, &obj.secondfield could map to the boundary bit.
++ // Clear any low bits to get to the start of the object.
++ // greyobject depends on this.
++ return obj
++}
+
- if wordsPerBitmapByte != 2 {
- gothrow("alg doesn't work for wordsPerBitmapByte != 2")
- }
- j := (uintptr(b) + i) / ptrSize & 1
- ptrbitp = add(ptrbitp, -j)
- bits >>= gcBits * j
++// Slow for now as we serialize this, but since this is on a debug path
++// speed is not critical at this point.
++var andlock mutex
+
- if bits&bitBoundary != 0 && i != 0 {
- break // reached beginning of the next object
- }
- bits = (bits >> 2) & bitsMask
- if bits == bitsDead {
- break // reached no-scan part of the object
- }
- } else {
- // dense mask (stack or data)
- bits = (uintptr(*(*byte)(add(unsafe.Pointer(ptrmask), (i/ptrSize)/4))) >> (((i / ptrSize) % 4) * bitsPerPointer)) & bitsMask
- }
++func atomicand8(src *byte, val byte) {
++ lock(&andlock)
++ *src &= val
++ unlock(&andlock)
++}
+
- if bits <= _BitsScalar { // BitsScalar || BitsDead
- continue
- }
++// Mark using the checkmark scheme.
++func docheckmark(mbits *markbits) {
++ // xor 01 moves 01(scalar unmarked) to 00(scalar marked)
++ // and 10(pointer unmarked) to 11(pointer marked)
++ if mbits.tbits == _BitsScalar {
++ atomicand8(mbits.bitp, ^byte(_BitsCheckMarkXor<<mbits.shift<<2))
++ } else if mbits.tbits == _BitsPointer {
++ atomicor8(mbits.bitp, byte(_BitsCheckMarkXor<<mbits.shift<<2))
++ }
+
- if bits != _BitsPointer {
- gothrow("unexpected garbage collection bits")
- }
++ // reload bits for ischeckmarked
++ mbits.xbits = *mbits.bitp
++ mbits.bits = (mbits.xbits >> mbits.shift) & bitMask
++ mbits.tbits = ((mbits.xbits >> mbits.shift) & bitPtrMask) >> 2
++}
+
- obj := *(*uintptr)(unsafe.Pointer(b + i))
- obj0 := obj
++// In the default scheme, does mbits refer to a marked object?
++func ismarked(mbits *markbits) bool {
++ if mbits.bits&bitBoundary != bitBoundary {
++ gothrow("ismarked: bits should have boundary bit set")
++ }
++ return mbits.bits&bitMarked == bitMarked
++}
+
- markobj:
- var s *mspan
- var off, bitp, shift, xbits uintptr
++// In the checkmark scheme does mbits refer to a marked object.
++func ischeckmarked(mbits *markbits) bool {
++ if mbits.bits&bitBoundary != bitBoundary {
++ gothrow("ischeckmarked: bits should have boundary bit set")
++ }
++ return mbits.tbits == _BitsScalarMarked || mbits.tbits == _BitsPointerMarked
++}
+
- // At this point we have extracted the next potential pointer.
- // Check if it points into heap.
- if obj == 0 {
- continue
- }
- if obj < arena_start || arena_used <= obj {
- if uintptr(obj) < _PhysPageSize && invalidptr != 0 {
- s = nil
- goto badobj
- }
- continue
- }
++// When in the GCmarktermination phase we allocate black.
++func gcmarknewobject_m(obj uintptr) {
++ if gcphase != _GCmarktermination {
++ gothrow("marking new object while not in mark termination phase")
++ }
++ if checkmark { // The world should be stopped so this should not happen.
++ gothrow("gcmarknewobject called while doing checkmark")
++ }
+
- // Mark the object.
- obj &^= ptrSize - 1
- off = (obj - arena_start) / ptrSize
- bitp = arena_start - off/wordsPerBitmapByte - 1
- shift = (off % wordsPerBitmapByte) * gcBits
- xbits = uintptr(*(*byte)(unsafe.Pointer(bitp)))
- bits = (xbits >> shift) & bitMask
- if (bits & bitBoundary) == 0 {
- // Not a beginning of a block, consult span table to find the block beginning.
- k := pageID(obj >> _PageShift)
- x := k
- x -= pageID(arena_start >> _PageShift)
- s = h_spans[x]
- if s == nil || k < s.start || s.limit <= obj || s.state != mSpanInUse {
- // Stack pointers lie within the arena bounds but are not part of the GC heap.
- // Ignore them.
- if s != nil && s.state == _MSpanStack {
- continue
- }
- goto badobj
- }
- p := uintptr(s.start) << _PageShift
- if s.sizeclass != 0 {
- size := s.elemsize
- idx := (obj - p) / size
- p = p + idx*size
- }
- if p == obj {
- print("runtime: failed to find block beginning for ", hex(p), " s=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), "\n")
- gothrow("failed to find block beginning")
++ var mbits markbits
++ slottombits(obj, &mbits)
++ if mbits.bits&bitMarked != 0 {
++ return
++ }
+
- obj = p
- goto markobj
++ // Each byte of GC bitmap holds info for two words.
++ // If the current object is larger than two words, or if the object is one word
++ // but the object it shares the byte with is already marked,
++ // then all the possible concurrent updates are trying to set the same bit,
++ // so we can use a non-atomic update.
++ if mbits.xbits&(bitMask|(bitMask<<gcBits)) != bitBoundary|bitBoundary<<gcBits || work.nproc == 1 {
++ *mbits.bitp = mbits.xbits | bitMarked<<mbits.shift
++ } else {
++ atomicor8(mbits.bitp, bitMarked<<mbits.shift)
++ }
++}
++
++// obj is the start of an object with mark mbits.
++// If it isn't already marked, mark it and enqueue into workbuf.
++// Return possibly new workbuf to use.
++func greyobject(obj uintptr, mbits *markbits, wbuf *workbuf) *workbuf {
++ // obj should be start of allocation, and so must be at least pointer-aligned.
++ if obj&(ptrSize-1) != 0 {
++ gothrow("greyobject: obj not pointer-aligned")
++ }
++
++ if checkmark {
++ if !ismarked(mbits) {
++ print("runtime:greyobject: checkmarks finds unexpected unmarked object obj=", hex(obj), ", mbits->bits=", hex(mbits.bits), " *mbits->bitp=", hex(*mbits.bitp), "\n")
++
++ k := obj >> _PageShift
++ x := k
++ x -= mheap_.arena_start >> _PageShift
++ s := h_spans[x]
++ printlock()
++ print("runtime:greyobject Span: obj=", hex(obj), " k=", hex(k))
++ if s == nil {
++ print(" s=nil\n")
++ } else {
++ print(" s.start=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), " s.sizeclass=", s.sizeclass, " s.elemsize=", s.elemsize, "\n")
++ // NOTE(rsc): This code is using s.sizeclass as an approximation of the
++ // number of pointer-sized words in an object. Perhaps not what was intended.
++ for i := 0; i < int(s.sizeclass); i++ {
++ print(" *(obj+", i*ptrSize, ") = ", hex(*(*uintptr)(unsafe.Pointer(obj + uintptr(i)*ptrSize))), "\n")
+ }
- if _DebugGCPtrs {
- print("scan *", hex(b+i), " = ", hex(obj0), " => base ", hex(obj), "\n")
- }
+ }
++ gothrow("checkmark found unmarked object")
++ }
++ if ischeckmarked(mbits) {
++ return wbuf
++ }
++ docheckmark(mbits)
++ if !ischeckmarked(mbits) {
++ print("mbits xbits=", hex(mbits.xbits), " bits=", hex(mbits.bits), " tbits=", hex(mbits.tbits), " shift=", mbits.shift, "\n")
++ gothrow("docheckmark and ischeckmarked disagree")
++ }
++ } else {
++ // If marked we have nothing to do.
++ if mbits.bits&bitMarked != 0 {
++ return wbuf
++ }
+
- if nbadblock > 0 && obj == badblock[nbadblock-1] {
- // Running garbage collection again because
- // we want to find the path from a root to a bad pointer.
- // Found possible next step; extend or finish path.
- for j := int32(0); j < nbadblock; j++ {
- if badblock[j] == b {
- goto AlreadyBad
- }
- }
- print("runtime: found *(", hex(b), "+", hex(i), ") = ", hex(obj0), "+", hex(obj-obj0), "\n")
- if ptrmask != nil {
- gothrow("bad pointer")
- }
- if nbadblock >= int32(len(badblock)) {
- gothrow("badblock trace too long")
- }
- badblock[nbadblock] = uintptr(b)
- nbadblock++
- AlreadyBad:
++ // Each byte of GC bitmap holds info for two words.
++ // If the current object is larger than two words, or if the object is one word
++ // but the object it shares the byte with is already marked,
++ // then all the possible concurrent updates are trying to set the same bit,
++ // so we can use a non-atomic update.
++ if mbits.xbits&(bitMask|bitMask<<gcBits) != bitBoundary|bitBoundary<<gcBits || work.nproc == 1 {
++ *mbits.bitp = mbits.xbits | bitMarked<<mbits.shift
++ } else {
++ atomicor8(mbits.bitp, bitMarked<<mbits.shift)
++ }
++ }
+
- // Now we have bits, bitp, and shift correct for
- // obj pointing at the base of the object.
- // Only care about not marked objects.
- if bits&bitMarked != 0 {
- continue
++ if !checkmark && (mbits.xbits>>(mbits.shift+2))&_BitsMask == _BitsDead {
++ return wbuf // noscan object
++ }
++
++ // Queue the obj for scanning. The PREFETCH(obj) logic has been removed but
++ // seems like a nice optimization that can be added back in.
++ // There needs to be time between the PREFETCH and the use.
++ // Previously we put the obj in an 8 element buffer that is drained at a rate
++ // to give the PREFETCH time to do its work.
++ // Use of PREFETCHNTA might be more appropriate than PREFETCH
++
++ // If workbuf is full, obtain an empty one.
++ if wbuf.nobj >= uintptr(len(wbuf.obj)) {
++ wbuf = getempty(wbuf)
++ }
++
++ wbuf.obj[wbuf.nobj] = obj
++ wbuf.nobj++
++ return wbuf
++}
++
++// Scan the object b of size n, adding pointers to wbuf.
++// Return possibly new wbuf to use.
++// If ptrmask != nil, it specifies where pointers are in b.
++// If ptrmask == nil, the GC bitmap should be consulted.
++// In this case, n may be an overestimate of the size; the GC bitmap
++// must also be used to make sure the scan stops at the end of b.
++func scanobject(b, n uintptr, ptrmask *uint8, wbuf *workbuf) *workbuf {
++ arena_start := mheap_.arena_start
++ arena_used := mheap_.arena_used
++
++ // Find bits of the beginning of the object.
++ var ptrbitp unsafe.Pointer
++ var mbits markbits
++ if ptrmask == nil {
++ b = objectstart(b, &mbits)
++ if b == 0 {
++ return wbuf
++ }
++ ptrbitp = unsafe.Pointer(mbits.bitp)
++ }
++ for i := uintptr(0); i < n; i += ptrSize {
++ // Find bits for this word.
++ var bits uintptr
++ if ptrmask != nil {
++ // dense mask (stack or data)
++ bits = (uintptr(*(*byte)(add(unsafe.Pointer(ptrmask), (i/ptrSize)/4))) >> (((i / ptrSize) % 4) * bitsPerPointer)) & bitsMask
++ } else {
++ // Check if we have reached end of span.
++ // n is an overestimate of the size of the object.
++ if (b+i)%_PageSize == 0 && h_spans[(b-arena_start)>>_PageShift] != h_spans[(b+i-arena_start)>>_PageShift] {
++ break
+ }
+
- // If obj size is greater than 8, then each byte of GC bitmap
- // contains info for at most one object. In such case we use
- // non-atomic byte store to mark the object. This can lead
- // to double enqueue of the object for scanning, but scanning
- // is an idempotent operation, so it is OK. This cannot lead
- // to bitmap corruption because the single marked bit is the
- // only thing that can change in the byte.
- // For 8-byte objects we use non-atomic store, if the other
- // quadruple is already marked. Otherwise we resort to CAS
- // loop for marking.
- if xbits&(bitMask|bitMask<<gcBits) != bitBoundary|bitBoundary<<gcBits || work.nproc == 1 {
- *(*byte)(unsafe.Pointer(bitp)) = uint8(xbits | bitMarked<<shift)
- } else {
- atomicor8((*byte)(unsafe.Pointer(bitp)), bitMarked<<shift)
++ // Consult GC bitmap.
++ bits = uintptr(*(*byte)(ptrbitp))
++ if wordsPerBitmapByte != 2 {
++ gothrow("alg doesn't work for wordsPerBitmapByte != 2")
++ }
++ j := (uintptr(b) + i) / ptrSize & 1 // j indicates upper nibble or lower nibble
++ bits >>= gcBits * j
++ if i == 0 {
++ bits &^= bitBoundary
+ }
++ ptrbitp = add(ptrbitp, -j)
+
- if (xbits>>(shift+2))&bitsMask == bitsDead {
- continue // noscan object
++ if bits&bitBoundary != 0 && i != 0 {
++ break // reached beginning of the next object
+ }
++ bits = (bits & bitPtrMask) >> 2 // bits refer to the type bits.
+
- // Queue the obj for scanning.
- // TODO: PREFETCH here.
++ if i != 0 && bits == bitsDead { // BitsDead in first nibble not valid during checkmark
++ break // reached no-scan part of the object
+ }
++ }
+
- // If workbuf is full, obtain an empty one.
- if nobj >= uintptr(len(wbuf.obj)) {
- wbuf.nobj = nobj
- wbuf = getempty(wbuf)
- nobj = wbuf.nobj
- wp = &wbuf.obj[nobj]
- }
- *wp = obj
- nobj++
- if nobj < uintptr(len(wbuf.obj)) {
- wp = &wbuf.obj[nobj]
- } else {
- wp = nil
- }
++ if bits <= _BitsScalar { // _BitsScalar, _BitsDead, _BitsScalarMarked
++ continue
++ }
+
- badobj:
- // If cgo_allocate is linked into the binary, it can allocate
- // memory as []unsafe.Pointer that may not contain actual
- // pointers and must be scanned conservatively.
- // In this case alone, allow the bad pointer.
- if have_cgo_allocate() && ptrmask == nil {
- continue
- }
++ if bits&_BitsPointer != _BitsPointer {
++ print("gc checkmark=", checkmark, " b=", hex(b), " ptrmask=", ptrmask, " mbits.bitp=", mbits.bitp, " mbits.xbits=", hex(mbits.xbits), " bits=", hex(bits), "\n")
++ gothrow("unexpected garbage collection bits")
++ }
++
++ obj := *(*uintptr)(unsafe.Pointer(b + i))
++
++ // At this point we have extracted the next potential pointer.
++ // Check if it points into heap.
++ if obj == 0 || obj < arena_start || obj >= arena_used {
+ continue
++ }
+
- // Anything else indicates a bug somewhere.
- // If we're in the middle of chasing down a different bad pointer,
- // don't confuse the trace by printing about this one.
- if nbadblock > 0 {
- continue
++ // Mark the object, returning some important bits.
++ // If we combine the following two routines we don't have to pass mbits or obj around.
++ var mbits markbits
++ obj = objectstart(obj, &mbits)
++ if obj == 0 {
++ continue
++ }
++ wbuf = greyobject(obj, &mbits, wbuf)
++ }
++ return wbuf
++}
+
- print("runtime: garbage collector found invalid heap pointer *(", hex(b), "+", hex(i), ")=", hex(obj))
- if s == nil {
- print(" s=nil\n")
- } else {
- print(" span=", uintptr(s.start)<<_PageShift, "-", s.limit, "-", (uintptr(s.start)+s.npages)<<_PageShift, " state=", s.state, "\n")
++// scanblock starts by scanning b as scanobject would.
++// If the gcphase is GCscan, that's all scanblock does.
++// Otherwise it traverses some fraction of the pointers it found in b, recursively.
++// As a special case, scanblock(nil, 0, nil) means to scan previously queued work,
++// stopping only when no work is left in the system.
++func scanblock(b, n uintptr, ptrmask *uint8) {
++ wbuf := getpartialorempty()
++ if b != 0 {
++ wbuf = scanobject(b, n, ptrmask, wbuf)
++ if gcphase == _GCscan {
++ if inheap(b) && ptrmask == nil {
++ // b is in heap, we are in GCscan so there should be a ptrmask.
++ gothrow("scanblock: In GCscan phase and inheap is true.")
+ }
++ // GCscan only goes one level deep since the mark write barrier is not turned on.
++ putpartial(wbuf)
++ return
++ }
++ }
++ if gcphase == _GCscan {
++ gothrow("scanblock: In GCscan phase but no b passed in.")
++ }
+
- if ptrmask != nil {
- gothrow("invalid heap pointer")
++ keepworking := b == 0
++
++ // ptrmask can have 2 possible values:
++ // 1. nil - obtain pointer mask from GC bitmap.
++ // 2. pointer to a compact mask (for stacks and data).
++ for {
++ if wbuf.nobj == 0 {
++ if !keepworking {
++ putempty(wbuf)
++ return
+ }
- // Add to badblock list, which will cause the garbage collection
- // to keep repeating until it has traced the chain of pointers
- // leading to obj all the way back to a root.
- if nbadblock == 0 {
- badblock[nbadblock] = uintptr(b)
- nbadblock++
++ // Refill workbuf from global queue.
++ wbuf = getfull(wbuf)
++ if wbuf == nil { // nil means out of work barrier reached
++ return
+ }
- if _DebugGCPtrs {
- print("end scanblock ", hex(b), " +", hex(n), " ", ptrmask, "\n")
- }
- if _DebugGC > 0 && ptrmask == nil {
- // For heap objects ensure that we did not overscan.
- var p, n uintptr
- if mlookup(b, &p, &n, nil) == 0 || b != p || i > n {
- print("runtime: scanned (", hex(b), "+", hex(i), "), heap object (", hex(p), "+", hex(n), ")\n")
- gothrow("scanblock: scanned invalid object")
- }
++
++ if wbuf.nobj <= 0 {
++ gothrow("runtime:scanblock getfull returns empty buffer")
+ }
+ }
- if s.sweepgen != sg {
++
++ // If another proc wants a pointer, give it some.
++ if work.nwait > 0 && wbuf.nobj > 4 && work.full == 0 {
++ wbuf = handoff(wbuf)
+ }
++
++ // This might be a good place to add prefetch code...
++ // if(wbuf->nobj > 4) {
++ // PREFETCH(wbuf->obj[wbuf->nobj - 3]);
++ // }
++ wbuf.nobj--
++ b = wbuf.obj[wbuf.nobj]
++ wbuf = scanobject(b, mheap_.arena_used-b, nil, wbuf)
+ }
+ }
+
+ func markroot(desc *parfor, i uint32) {
+ // Note: if you add a case here, please also update heapdump.c:dumproots.
+ switch i {
+ case _RootData:
+ scanblock(uintptr(unsafe.Pointer(&data)), uintptr(unsafe.Pointer(&edata))-uintptr(unsafe.Pointer(&data)), gcdatamask.bytedata)
+
+ case _RootBss:
+ scanblock(uintptr(unsafe.Pointer(&bss)), uintptr(unsafe.Pointer(&ebss))-uintptr(unsafe.Pointer(&bss)), gcbssmask.bytedata)
+
+ case _RootFinalizers:
+ for fb := allfin; fb != nil; fb = fb.alllink {
+ scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), uintptr(fb.cnt)*unsafe.Sizeof(fb.fin[0]), &finptrmask[0])
+ }
+
+ case _RootSpans:
+ // mark MSpan.specials
+ sg := mheap_.sweepgen
+ for spanidx := uint32(0); spanidx < uint32(len(work.spans)); spanidx++ {
+ s := work.spans[spanidx]
+ if s.state != mSpanInUse {
+ continue
+ }
- scanblock(p, s.elemsize, nil)
++ if !checkmark && s.sweepgen != sg {
++ // sweepgen was updated (+2) during non-checkmark GC pass
+ print("sweep ", s.sweepgen, " ", sg, "\n")
+ gothrow("gc: unswept span")
+ }
+ for sp := s.specials; sp != nil; sp = sp.next {
+ if sp.kind != _KindSpecialFinalizer {
+ continue
+ }
+ // don't mark finalized object, but scan it so we
+ // retain everything it points to.
+ spf := (*specialfinalizer)(unsafe.Pointer(sp))
+ // A finalizer can be set for an inner byte of an object, find object beginning.
+ p := uintptr(s.start<<_PageShift) + uintptr(spf.special.offset)/s.elemsize*s.elemsize
- flushallmcaches()
++ if gcphase != _GCscan {
++ scanblock(p, s.elemsize, nil) // scanned during mark phase
++ }
+ scanblock(uintptr(unsafe.Pointer(&spf.fn)), ptrSize, &oneptr[0])
+ }
+ }
+
+ case _RootFlushCaches:
- status := readgstatus(gp)
++ if gcphase != _GCscan { // Do not flush mcaches during GCscan phase.
++ flushallmcaches()
++ }
+
+ default:
+ // the rest is scanning goroutine stacks
+ if uintptr(i-_RootCount) >= allglen {
+ gothrow("markroot: bad index")
+ }
+ gp := allgs[i-_RootCount]
++
+ // remember when we've first observed the G blocked
+ // needed only to output in traceback
- // Shrink a stack if not much of it is being used.
- shrinkstack(gp)
++ status := readgstatus(gp) // We are not in a scan state
+ if (status == _Gwaiting || status == _Gsyscall) && gp.waitsince == 0 {
+ gp.waitsince = work.tstart
+ }
- scanstack(gp)
++
++ // Shrink a stack if not much of it is being used but not in the scan phase.
++ if gcphase != _GCscan { // Do not shrink during GCscan phase.
++ shrinkstack(gp)
++ }
+ if readgstatus(gp) == _Gdead {
+ gp.gcworkdone = true
+ } else {
+ gp.gcworkdone = false
+ }
+ restart := stopg(gp)
- _g_ := getg()
++
++ // goroutine will scan its own stack when it stops running.
++ // Wait until it has.
++ for readgstatus(gp) == _Grunning && !gp.gcworkdone {
++ }
++
++ // scanstack(gp) is done as part of gcphasework
++ // But to make sure we finished we need to make sure that
++ // the stack traps have all responded, so drop into
++ // this loop until they respond.
++ for !gp.gcworkdone {
++ status = readgstatus(gp)
++ if status == _Gdead {
++ gp.gcworkdone = true // scan is a noop
++ break
++ }
++ if status == _Gwaiting || status == _Grunnable {
++ restart = stopg(gp)
++ }
++ }
+ if restart {
+ restartg(gp)
+ }
+ }
+ }
+
+ // Get an empty work buffer off the work.empty list,
+ // allocating new buffers as needed.
+ func getempty(b *workbuf) *workbuf {
- lfstackpush(&work.full, &b.node)
+ if b != nil {
- b = nil
- c := _g_.m.mcache
- if c.gcworkbuf != nil {
- b = (*workbuf)(c.gcworkbuf)
- c.gcworkbuf = nil
- }
- if b == nil {
++ putfull(b)
++ b = nil
+ }
- b.nobj = 0
++ if work.empty != 0 {
+ b = (*workbuf)(lfstackpop(&work.empty))
+ }
++ if b != nil && b.nobj != 0 {
++ _g_ := getg()
++ print("m", _g_.m.id, ": getempty: popped b=", b, " with non-zero b.nobj=", b.nobj, "\n")
++ gothrow("getempty: workbuffer not empty, b->nobj not 0")
++ }
+ if b == nil {
+ b = (*workbuf)(persistentalloc(unsafe.Sizeof(*b), _CacheLineSize, &memstats.gc_sys))
++ b.nobj = 0
+ }
- _g_ := getg()
- c := _g_.m.mcache
- if c.gcworkbuf == nil {
- c.gcworkbuf = (unsafe.Pointer)(b)
- return
+ return b
+ }
+
+ func putempty(b *workbuf) {
-func gcworkbuffree(b unsafe.Pointer) {
- if b != nil {
- putempty((*workbuf)(b))
++ if b.nobj != 0 {
++ gothrow("putempty: b->nobj not 0")
+ }
+ lfstackpush(&work.empty, &b.node)
+ }
+
-// Get a full work buffer off the work.full list, or return nil.
++func putfull(b *workbuf) {
++ if b.nobj <= 0 {
++ gothrow("putfull: b->nobj <= 0")
++ }
++ lfstackpush(&work.full, &b.node)
++}
++
++// Get a partially empty work buffer;
++// if none are available, get an empty one.
++func getpartialorempty() *workbuf {
++ b := (*workbuf)(lfstackpop(&work.partial))
++ if b == nil {
++ b = getempty(nil)
+ }
++ return b
+ }
+
- lfstackpush(&work.empty, &b.node)
++func putpartial(b *workbuf) {
++ if b.nobj == 0 {
++ lfstackpush(&work.empty, &b.node)
++ } else if b.nobj < uintptr(len(b.obj)) {
++ lfstackpush(&work.partial, &b.node)
++ } else if b.nobj == uintptr(len(b.obj)) {
++ lfstackpush(&work.full, &b.node)
++ } else {
++ print("b=", b, " b.nobj=", b.nobj, " len(b.obj)=", len(b.obj), "\n")
++ gothrow("putpartial: bad Workbuf b.nobj")
++ }
++}
++
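++// Aside (illustration only, not part of this change): putpartial above routes
++// a buffer onto one of three lock-free stacks purely by fill level, e.g.:
++//
++//	func route(nobj, max int) string {
++//		switch {
++//		case nobj == 0:
++//			return "work.empty"
++//		case nobj < max:
++//			return "work.partial"
++//		case nobj == max:
++//			return "work.full"
++//		}
++//		return "corrupt buffer" // nobj > max: throw
++//	}
++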
++// Get a full work buffer off the work.full or a partially
++// filled one off the work.partial list. If nothing is available
++// wait until all the other gc helpers have finished and then
++// return nil.
++// getfull acts as a barrier for work.nproc helpers. As long as one
++// gchelper is actively marking objects it
++// may create a workbuffer that the other helpers can work on.
++// The for loop either exits when a work buffer is found
++// or when _all_ of the work.nproc GC helpers are in the loop
++// looking for work and thus not capable of creating new work.
++// This is in fact the termination condition for the STW mark
++// phase.
+ func getfull(b *workbuf) *workbuf {
+ if b != nil {
- gothrow("mark - world not stopped")
++ putempty(b)
+ }
++
+ b = (*workbuf)(lfstackpop(&work.full))
++ if b == nil {
++ b = (*workbuf)(lfstackpop(&work.partial))
++ }
+ if b != nil || work.nproc == 1 {
+ return b
+ }
+
+ xadd(&work.nwait, +1)
+ for i := 0; ; i++ {
+ if work.full != 0 {
+ xadd(&work.nwait, -1)
+ b = (*workbuf)(lfstackpop(&work.full))
++ if b == nil {
++ b = (*workbuf)(lfstackpop(&work.partial))
++ }
+ if b != nil {
+ return b
+ }
+ xadd(&work.nwait, +1)
+ }
+ if work.nwait == work.nproc {
+ return nil
+ }
+ _g_ := getg()
+ if i < 10 {
+ _g_.m.gcstats.nprocyield++
+ procyield(20)
+ } else if i < 20 {
+ _g_.m.gcstats.nosyield++
+ osyield()
+ } else {
+ _g_.m.gcstats.nsleep++
+ usleep(100)
+ }
+ }
+ }
+
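++// Aside (illustration only, not part of this change): the nwait accounting in
++// getfull is a distributed termination detector. A helper increments
++// work.nwait before it starts hunting for work and decrements it whenever it
++// takes a buffer; when nwait == nproc every helper is idle, no new buffers
++// can appear, and returning nil safely announces that marking is complete.
++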
+ func handoff(b *workbuf) *workbuf {
+ // Make new buffer with half of b's pointers.
+ b1 := getempty(nil)
+ n := b.nobj / 2
+ b.nobj -= n
+ b1.nobj = n
+ memmove(unsafe.Pointer(&b1.obj[0]), unsafe.Pointer(&b.obj[b.nobj]), n*unsafe.Sizeof(b1.obj[0]))
+ _g_ := getg()
+ _g_.m.gcstats.nhandoff++
+ _g_.m.gcstats.nhandoffcnt += uint64(n)
+
+ // Put b on full list - let first half of b get stolen.
+ lfstackpush(&work.full, &b.node)
+ return b1
+ }
+
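++// Aside (worked example, not part of this change): handoff publishes the
++// first half of b and keeps scanning the second half. With b.nobj = 7:
++// n = 3, b keeps obj[0:4] and is pushed on work.full for a thief, while
++// b1 receives obj[4:7].
++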
+ func stackmapdata(stkmap *stackmap, n int32) bitvector {
+ if n < 0 || n >= stkmap.n {
+ gothrow("stackmapdata: index out of range")
+ }
+ return bitvector{stkmap.nbit, (*byte)(add(unsafe.Pointer(&stkmap.bytedata), uintptr(n*((stkmap.nbit+31)/32*4))))}
+ }
+
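++// Aside (worked arithmetic, not part of this change): each stack map record
++// is nbit bits rounded up to whole 32-bit words, so e.g. nbit = 50 gives
++// (50+31)/32*4 = 8 bytes per record, and record n starts n*8 bytes into
++// bytedata.
++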
+ // Scan a stack frame: local variables and function arguments/results.
+ func scanframe(frame *stkframe, unused unsafe.Pointer) bool {
+
+ f := frame.fn
+ targetpc := frame.continpc
+ if targetpc == 0 {
+ // Frame is dead.
+ return true
+ }
+ if _DebugGC > 1 {
+ print("scanframe ", gofuncname(f), "\n")
+ }
+ if targetpc != f.entry {
+ targetpc--
+ }
+ pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
+ if pcdata == -1 {
+ // We do not have a valid pcdata value but there might be a
+ // stackmap for this function. It is likely that we are looking
+ // at the function prologue, assume so and hope for the best.
+ pcdata = 0
+ }
+
+ // Scan local variables if stack frame has been allocated.
+ size := frame.varp - frame.sp
+ var minsize uintptr
+ if thechar != '6' && thechar != '8' {
+ minsize = ptrSize
+ } else {
+ minsize = 0
+ }
+ if size > minsize {
+ stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
+ if stkmap == nil || stkmap.n <= 0 {
+ print("runtime: frame ", gofuncname(f), " untyped locals ", hex(frame.varp-size), "+", hex(size), "\n")
+ gothrow("missing stackmap")
+ }
+
+ // Locals bitmap information, scan just the pointers in locals.
+ if pcdata < 0 || pcdata >= stkmap.n {
+ // don't know where we are
+ print("runtime: pcdata is ", pcdata, " and ", stkmap.n, " locals stack map entries for ", gofuncname(f), " (targetpc=", targetpc, ")\n")
+ gothrow("scanframe: bad symbol table")
+ }
+ bv := stackmapdata(stkmap, pcdata)
+ size = (uintptr(bv.n) * ptrSize) / bitsPerPointer
+ scanblock(frame.varp-size, uintptr(bv.n)/bitsPerPointer*ptrSize, bv.bytedata)
+ }
+
+ // Scan arguments.
+ if frame.arglen > 0 {
+ var bv bitvector
+ if frame.argmap != nil {
+ bv = *frame.argmap
+ } else {
+ stkmap := (*stackmap)(funcdata(f, _FUNCDATA_ArgsPointerMaps))
+ if stkmap == nil || stkmap.n <= 0 {
+ print("runtime: frame ", gofuncname(f), " untyped args ", hex(frame.argp), "+", hex(frame.arglen), "\n")
+ gothrow("missing stackmap")
+ }
+ if pcdata < 0 || pcdata >= stkmap.n {
+ // don't know where we are
+ print("runtime: pcdata is ", pcdata, " and ", stkmap.n, " args stack map entries for ", gofuncname(f), " (targetpc=", targetpc, ")\n")
+ gothrow("scanframe: bad symbol table")
+ }
+ bv = stackmapdata(stkmap, pcdata)
+ }
+ scanblock(frame.argp, uintptr(bv.n)/bitsPerPointer*ptrSize, bv.bytedata)
+ }
+ return true
+ }
+
+ func scanstack(gp *g) {
+ // TODO(rsc): Due to a precedence error, this was never checked in the original C version.
+ // If you enable the check, the gothrow happens.
+ /*
+ if readgstatus(gp)&_Gscan == 0 {
+ print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
+ gothrow("mark - bad status")
+ }
+ */
+
+ switch readgstatus(gp) &^ _Gscan {
+ default:
+ print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
+ gothrow("mark - bad status")
+ case _Gdead:
+ return
+ case _Grunning:
+ print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
-// The gp has been moved to a gc safepoint. If there is gcphase specific
-// work it is done here.
++ gothrow("scanstack: goroutine not stopped")
+ case _Grunnable, _Gsyscall, _Gwaiting:
+ // ok
+ }
+
+ if gp == getg() {
+ gothrow("can't scan our own stack")
+ }
+ mp := gp.m
+ if mp != nil && mp.helpgc != 0 {
+ gothrow("can't scan gchelper stack")
+ }
+
+ gentraceback(^uintptr(0), ^uintptr(0), 0, gp, 0, nil, 0x7fffffff, scanframe, nil, 0)
+ tracebackdefers(gp, scanframe, nil)
+ }
+
- // No work for now.
++// If the slot is grey or black return true, if white return false.
++// If the slot is not in the known heap and thus does not have a valid GC bitmap then
++// it is considered grey. Globals and stacks can hold such slots.
++// The slot is grey if its mark bit is set and it is enqueued to be scanned.
++// The slot is black if it has already been scanned.
++// It is white if it has a valid mark bit and the bit is not set.
++func shaded(slot uintptr) bool {
++ if !inheap(slot) { // non-heap slots considered grey
++ return true
++ }
++
++ var mbits markbits
++ valid := objectstart(slot, &mbits)
++ if valid == 0 {
++ return true
++ }
++
++ if checkmark {
++ return ischeckmarked(&mbits)
++ }
++
++ return mbits.bits&bitMarked != 0
++}
++
++// Shade the object if it isn't already.
++// The object is not nil and known to be in the heap.
++func shade(b uintptr) {
++ if !inheap(b) {
++ gothrow("shade: passed an address not in the heap")
++ }
++
++ wbuf := getpartialorempty()
++ // Mark the object, return some important bits.
++ // If we combine the following two routines we don't have to pass mbits or obj around.
++ var mbits markbits
++ obj := objectstart(b, &mbits)
++ if obj != 0 {
++ wbuf = greyobject(obj, &mbits, wbuf) // augments the wbuf
++ }
++ putpartial(wbuf)
++}
++
++// This is the Dijkstra barrier coarsened to always shade the ptr (dst) object.
++// The original Dijkstra barrier only shaded ptrs being placed in black slots.
++//
++// Shade indicates that it has seen a white pointer by adding the referent
++// to wbuf as well as marking it.
++//
++// slot is the destination (dst) in the Go code
++// ptr is the value that goes into the slot (src) in the Go code
++//
++// Dijkstra pointed out that maintaining the no-black-to-white-pointers
++// invariant means that white-to-white pointers need not
++// be noted by the write barrier. Furthermore if either
++// white object dies before it is reached by the
++// GC then the object can be collected during this GC cycle
++// instead of waiting for the next cycle. Unfortunately the cost of
++// ensuring that the object holding the slot doesn't concurrently
++// change to black without the mutator noticing seems prohibitive.
++//
++// Consider the following example where the mutator writes into
++// a slot and then loads the slot's mark bit while the GC thread
++// writes to the slot's mark bit and then as part of scanning reads
++// the slot.
++//
++// Initially both [slot] and [slotmark] are 0 (nil)
++// Mutator thread GC thread
++// st [slot], ptr st [slotmark], 1
++//
++// ld r1, [slotmark] ld r2, [slot]
++//
++// This is a classic example of independent reads of independent writes,
++// aka IRIW. The question is if r1==r2==0 is allowed and for most HW the
++// answer is yes without inserting memory barriers between the st and the ld.
++// These barriers are expensive so we have decided that we will
++// always grey the ptr object regardless of the slot's color.
++func gcmarkwb_m(slot *uintptr, ptr uintptr) {
++ switch gcphase {
++ default:
++ gothrow("gcphasework in bad gcphase")
++
++ case _GCoff, _GCquiesce, _GCstw, _GCsweep, _GCscan:
++ // ok
++
++ case _GCmark, _GCmarktermination:
++ if ptr != 0 && inheap(ptr) {
++ shade(ptr)
++ }
++ }
++}
++
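++// Aside (illustration only, not part of this change): in barrier terms, a
++// mutator store "*slot = ptr" performed during _GCmark or _GCmarktermination
++// becomes
++//
++//	*slot = ptr // the store itself, already applied by the caller
++//	shade(ptr)  // grey the referent, regardless of the slot's color
++//
++// (see writebarrierptr_nostore below, which funnels stores here via
++// systemstack).
++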
++// The gp has been moved to a GC safepoint. GC phase specific
++// work is done here.
+ func gcphasework(gp *g) {
+ switch gcphase {
+ default:
+ gothrow("gcphasework in bad gcphase")
+ case _GCoff, _GCquiesce, _GCstw, _GCsweep:
- // Disabled until concurrent GC is implemented
- // but indicate the scan has been done.
- // scanstack(gp);
++ // No work.
++ case _GCscan:
++ // scan the stack, mark the objects, put pointers in work buffers
++ // hanging off the P where this is being run.
++ scanstack(gp)
+ case _GCmark:
- // parallel mark for over gc roots
++ // No work.
++ case _GCmarktermination:
++ scanstack(gp)
++ // All available mark work will be emptied before returning.
+ }
+ gp.gcworkdone = true
+ }
+
+ var finalizer1 = [...]byte{
+ // Each Finalizer is 5 words, ptr ptr uintptr ptr ptr.
+ // Each byte describes 4 words.
+ // Need 4 Finalizers described by 5 bytes before pattern repeats:
+ // ptr ptr uintptr ptr ptr
+ // ptr ptr uintptr ptr ptr
+ // ptr ptr uintptr ptr ptr
+ // ptr ptr uintptr ptr ptr
+ // aka
+ // ptr ptr uintptr ptr
+ // ptr ptr ptr uintptr
+ // ptr ptr ptr ptr
+ // uintptr ptr ptr ptr
+ // ptr uintptr ptr ptr
+ // Assumptions about Finalizer layout checked below.
+ bitsPointer | bitsPointer<<2 | bitsScalar<<4 | bitsPointer<<6,
+ bitsPointer | bitsPointer<<2 | bitsPointer<<4 | bitsScalar<<6,
+ bitsPointer | bitsPointer<<2 | bitsPointer<<4 | bitsPointer<<6,
+ bitsScalar | bitsPointer<<2 | bitsPointer<<4 | bitsPointer<<6,
+ bitsPointer | bitsScalar<<2 | bitsPointer<<4 | bitsPointer<<6,
+ }
+
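++// Aside (worked arithmetic): each Finalizer is 5 words and each mask byte
++// describes 4 words, so the pattern repeats every lcm(5,4) = 20 words,
++// i.e. every 4 Finalizers, which is exactly the 5 bytes listed above.
++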
+ func queuefinalizer(p unsafe.Pointer, fn *funcval, nret uintptr, fint *_type, ot *ptrtype) {
+ lock(&finlock)
+ if finq == nil || finq.cnt == finq.cap {
+ if finc == nil {
+ finc = (*finblock)(persistentalloc(_FinBlockSize, 0, &memstats.gc_sys))
+ finc.cap = int32((_FinBlockSize-unsafe.Sizeof(finblock{}))/unsafe.Sizeof(finalizer{}) + 1)
+ finc.alllink = allfin
+ allfin = finc
+ if finptrmask[0] == 0 {
+ // Build pointer mask for Finalizer array in block.
+ // Check assumptions made in finalizer1 array above.
+ if (unsafe.Sizeof(finalizer{}) != 5*ptrSize ||
+ unsafe.Offsetof(finalizer{}.fn) != 0 ||
+ unsafe.Offsetof(finalizer{}.arg) != ptrSize ||
+ unsafe.Offsetof(finalizer{}.nret) != 2*ptrSize ||
+ unsafe.Offsetof(finalizer{}.fint) != 3*ptrSize ||
+ unsafe.Offsetof(finalizer{}.ot) != 4*ptrSize ||
+ bitsPerPointer != 2) {
+ gothrow("finalizer out of sync")
+ }
+ for i := range finptrmask {
+ finptrmask[i] = finalizer1[i%len(finalizer1)]
+ }
+ }
+ }
+ block := finc
+ finc = block.next
+ block.next = finq
+ finq = block
+ }
+ f := (*finalizer)(add(unsafe.Pointer(&finq.fin[0]), uintptr(finq.cnt)*unsafe.Sizeof(finq.fin[0])))
+ finq.cnt++
+ f.fn = fn
+ f.nret = nret
+ f.fint = fint
+ f.ot = ot
+ f.arg = p
+ fingwake = true
+ unlock(&finlock)
+ }
+
+ func iterate_finq(callback func(*funcval, unsafe.Pointer, uintptr, *_type, *ptrtype)) {
+ for fb := allfin; fb != nil; fb = fb.alllink {
+ for i := int32(0); i < fb.cnt; i++ {
+ f := &fb.fin[i]
+ callback(f.fn, f.arg, f.nret, f.fint, f.ot)
+ }
+ }
+ }
+
++// Returns only when span s has been swept.
+ func mSpan_EnsureSwept(s *mspan) {
+ // Caller must disable preemption.
+ // Otherwise when this function returns the span can become unswept again
+ // (if GC is triggered on another goroutine).
+ _g_ := getg()
+ if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 {
+ gothrow("MSpan_EnsureSwept: m is not locked")
+ }
+
+ sg := mheap_.sweepgen
+ if atomicload(&s.sweepgen) == sg {
+ return
+ }
++ // The caller must be sure that the span is an MSpanInUse span.
+ if cas(&s.sweepgen, sg-2, sg-1) {
+ mSpan_Sweep(s, false)
+ return
+ }
+ // unfortunate condition, and we don't have efficient means to wait
+ for atomicload(&s.sweepgen) != sg {
+ osyield()
+ }
+ }
+
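++// For reference, the sweepgen convention used above and in sweepone:
++//
++//	// sweepgen == h.sweepgen - 2: the span needs sweeping
++//	// sweepgen == h.sweepgen - 1: the span is being swept (cas claims it)
++//	// sweepgen == h.sweepgen:     the span is swept and ready to use
++//
++// h.sweepgen is incremented by 2 each sweep round (see gc below).
++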
+ // Sweep frees or collects finalizers for blocks not marked in the mark phase.
+ // It clears the mark bits in preparation for the next GC round.
+ // Returns true if the span was returned to heap.
+ // If preserve=true, don't return it to heap nor relink in MCentral lists;
+ // caller takes care of it.
+ func mSpan_Sweep(s *mspan, preserve bool) bool {
++ if checkmark {
++ gothrow("MSpan_Sweep: checkmark only runs in STW and after the sweep")
++ }
++
+ // It's critical that we enter this function with preemption disabled,
+ // GC must not start while we are in the middle of this function.
+ _g_ := getg()
+ if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 {
+ gothrow("MSpan_Sweep: m is not locked")
+ }
+ sweepgen := mheap_.sweepgen
+ if s.state != mSpanInUse || s.sweepgen != sweepgen-1 {
+ print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
+ gothrow("MSpan_Sweep: bad span state")
+ }
+ arena_start := mheap_.arena_start
+ cl := s.sizeclass
+ size := s.elemsize
+ var n int32
+ var npages int32
+ if cl == 0 {
+ n = 1
+ } else {
+ // Chunk full of small blocks.
+ npages = class_to_allocnpages[cl]
+ n = (npages << _PageShift) / int32(size)
+ }
+ res := false
+ nfree := 0
+ var head mlink
+ end := &head
+ c := _g_.m.mcache
+ sweepgenset := false
+
+ // Mark any free objects in this span so we don't collect them.
+ for link := s.freelist; link != nil; link = link.next {
+ off := (uintptr(unsafe.Pointer(link)) - arena_start) / ptrSize
+ bitp := arena_start - off/wordsPerBitmapByte - 1
+ shift := (off % wordsPerBitmapByte) * gcBits
+ *(*byte)(unsafe.Pointer(bitp)) |= bitMarked << shift
+ }
+
+ // Unlink & free special records for any objects we're about to free.
+ specialp := &s.specials
+ special := *specialp
+ for special != nil {
+ // A finalizer can be set for an inner byte of an object, find object beginning.
+ p := uintptr(s.start<<_PageShift) + uintptr(special.offset)/size*size
+ off := (p - arena_start) / ptrSize
+ bitp := arena_start - off/wordsPerBitmapByte - 1
+ shift := (off % wordsPerBitmapByte) * gcBits
+ bits := (*(*byte)(unsafe.Pointer(bitp)) >> shift) & bitMask
+ if bits&bitMarked == 0 {
+ // Find the exact byte for which the special was setup
+ // (as opposed to object beginning).
+ p := uintptr(s.start<<_PageShift) + uintptr(special.offset)
+ // about to free object: splice out special record
+ y := special
+ special = special.next
+ *specialp = special
+ if !freespecial(y, unsafe.Pointer(p), size, false) {
+ // stop freeing of object if it has a finalizer
+ *(*byte)(unsafe.Pointer(bitp)) |= bitMarked << shift
+ }
+ } else {
+ // object is still live: keep special record
+ specialp = &special.next
+ special = *specialp
+ }
+ }
+
+ // Sweep through n objects of given size starting at p.
+ // This thread owns the span now, so it can manipulate
+ // the block bitmap without atomic operations.
+ p := uintptr(s.start << _PageShift)
+ off := (p - arena_start) / ptrSize
+ bitp := arena_start - off/wordsPerBitmapByte - 1
+ shift := uint(0)
+ step := size / (ptrSize * wordsPerBitmapByte)
+ // Rewind to the previous quadruple as we move to the next
+ // at the beginning of the loop.
+ bitp += step
+ if step == 0 {
+ // 8-byte objects.
+ bitp++
+ shift = gcBits
+ }
+ for ; n > 0; n, p = n-1, p+size {
+ bitp -= step
+ if step == 0 {
+ if shift != 0 {
+ bitp--
+ }
+ shift = gcBits - shift
+ }
+
+ xbits := *(*byte)(unsafe.Pointer(bitp))
+ bits := (xbits >> shift) & bitMask
+
+ // Allocated and marked object, reset bits to allocated.
+ if bits&bitMarked != 0 {
+ *(*byte)(unsafe.Pointer(bitp)) &^= bitMarked << shift
+ continue
+ }
+
+ // At this point we know that we are looking at garbage object
+ // that needs to be collected.
+ if debug.allocfreetrace != 0 {
+ tracefree(unsafe.Pointer(p), size)
+ }
+
+ // Reset to allocated+noscan.
+ *(*byte)(unsafe.Pointer(bitp)) = uint8(uintptr(xbits&^((bitMarked|bitsMask<<2)<<shift)) | uintptr(bitsDead)<<(shift+2))
+ if cl == 0 {
+ // Free large span.
+ if preserve {
+ gothrow("can't preserve large span")
+ }
+ unmarkspan(p, s.npages<<_PageShift)
+ s.needzero = 1
+
+ // important to set sweepgen before returning it to heap
+ atomicstore(&s.sweepgen, sweepgen)
+ sweepgenset = true
+
+ // NOTE(rsc,dvyukov): The original implementation of efence
+ // in CL 22060046 used SysFree instead of SysFault, so that
+ // the operating system would eventually give the memory
+ // back to us again, so that an efence program could run
+ // longer without running out of memory. Unfortunately,
+ // calling SysFree here without any kind of adjustment of the
+ // heap data structures means that when the memory does
+ // come back to us, we have the wrong metadata for it, either in
+ // the MSpan structures or in the garbage collection bitmap.
+ // Using SysFault here means that the program will run out of
+ // memory fairly quickly in efence mode, but at least it won't
+ // have mysterious crashes due to confused memory reuse.
+ // It should be possible to switch back to SysFree if we also
+ // implement and then call some kind of MHeap_DeleteSpan.
+ if debug.efence > 0 {
+ s.limit = 0 // prevent mlookup from finding this span
+ sysFault(unsafe.Pointer(p), size)
+ } else {
+ mHeap_Free(&mheap_, s, 1)
+ }
+ c.local_nlargefree++
+ c.local_largefree += size
+ xadd64(&memstats.next_gc, -int64(size)*int64(gcpercent+100)/100)
+ res = true
+ } else {
+ // Free small object.
+ if size > 2*ptrSize {
+ *(*uintptr)(unsafe.Pointer(p + ptrSize)) = uintptrMask & 0xdeaddeaddeaddead // mark as "needs to be zeroed"
+ } else if size > ptrSize {
+ *(*uintptr)(unsafe.Pointer(p + ptrSize)) = 0
+ }
+ end.next = (*mlink)(unsafe.Pointer(p))
+ end = end.next
+ nfree++
+ }
+ }
+
+ // We need to set s.sweepgen = h.sweepgen only when all blocks are swept,
+ // because of the potential for a concurrent free/SetFinalizer.
+ // But we need to set it before we make the span available for allocation
+ // (return it to heap or mcentral), because allocation code assumes that a
+ // span is already swept if available for allocation.
+ if !sweepgenset && nfree == 0 {
+ // The span must be in our exclusive ownership until we update sweepgen,
+ // check for potential races.
+ if s.state != mSpanInUse || s.sweepgen != sweepgen-1 {
+ print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
+ gothrow("MSpan_Sweep: bad span state after sweep")
+ }
+ atomicstore(&s.sweepgen, sweepgen)
+ }
+ if nfree > 0 {
+ c.local_nsmallfree[cl] += uintptr(nfree)
+ c.local_cachealloc -= intptr(uintptr(nfree) * size)
+ xadd64(&memstats.next_gc, -int64(nfree)*int64(size)*int64(gcpercent+100)/100)
+ res = mCentral_FreeSpan(&mheap_.central[cl].mcentral, s, int32(nfree), head.next, end, preserve)
+ // MCentral_FreeSpan updates sweepgen
+ }
+ return res
+ }
+
+ // State of background sweep.
+ // Protected by gclock.
+ type sweepdata struct {
+ g *g
+ parked bool
+ started bool
+
+ spanidx uint32 // background sweeper position
+
+ nbgsweep uint32
+ npausesweep uint32
+ }
+
+ var sweep sweepdata
+
+ // sweeps one span
+ // returns number of pages returned to heap, or ^uintptr(0) if there is nothing to sweep
+ func sweepone() uintptr {
+ _g_ := getg()
+
+ // increment locks to ensure that the goroutine is not preempted
+ // in the middle of sweep thus leaving the span in an inconsistent state for next GC
+ _g_.m.locks++
+ sg := mheap_.sweepgen
+ for {
+ idx := xadd(&sweep.spanidx, 1) - 1
+ if idx >= uint32(len(work.spans)) {
+ mheap_.sweepdone = 1
+ _g_.m.locks--
+ return ^uintptr(0)
+ }
+ s := work.spans[idx]
+ if s.state != mSpanInUse {
+ s.sweepgen = sg
+ continue
+ }
+ if s.sweepgen != sg-2 || !cas(&s.sweepgen, sg-2, sg-1) {
+ continue
+ }
+ npages := s.npages
+ if !mSpan_Sweep(s, false) {
+ npages = 0
+ }
+ _g_.m.locks--
+ return npages
+ }
+ }
+
+ func gosweepone() uintptr {
+ var ret uintptr
+ systemstack(func() {
+ ret = sweepone()
+ })
+ return ret
+ }
+
+ func gosweepdone() bool {
+ return mheap_.sweepdone != 0
+ }
+
+ func gchelper() {
+ _g_ := getg()
+ _g_.m.traceback = 2
+ gchelperstart()
+
-
- // help other threads scan secondary blocks
- scanblock(0, 0, nil)
++ // parallel mark over GC roots
+ parfordo(work.markfor)
- if nbadblock > 0 {
- // Work out path from root to bad block.
- for {
- gc(start_time, eagersweep)
- if nbadblock >= int32(len(badblock)) {
- gothrow("cannot find path to bad pointer")
++ if gcphase != _GCscan {
++ scanblock(0, 0, nil) // blocks in getfull
++ }
+
+ nproc := work.nproc // work.nproc can change right after we increment work.ndone
+ if xadd(&work.ndone, +1) == nproc-1 {
+ notewakeup(&work.alldone)
+ }
+ _g_.m.traceback = 0
+ }
+
+ func cachestats() {
+ for i := 0; ; i++ {
+ p := allp[i]
+ if p == nil {
+ break
+ }
+ c := p.mcache
+ if c == nil {
+ continue
+ }
+ purgecachedstats(c)
+ }
+ }
+
+ func flushallmcaches() {
+ for i := 0; ; i++ {
+ p := allp[i]
+ if p == nil {
+ break
+ }
+ c := p.mcache
+ if c == nil {
+ continue
+ }
+ mCache_ReleaseAll(c)
+ stackcache_clear(c)
+ }
+ }
+
+ func updatememstats(stats *gcstats) {
+ if stats != nil {
+ *stats = gcstats{}
+ }
+ for mp := allm; mp != nil; mp = mp.alllink {
+ if stats != nil {
+ src := (*[unsafe.Sizeof(gcstats{}) / 8]uint64)(unsafe.Pointer(&mp.gcstats))
+ dst := (*[unsafe.Sizeof(gcstats{}) / 8]uint64)(unsafe.Pointer(stats))
+ for i, v := range src {
+ dst[i] += v
+ }
+ mp.gcstats = gcstats{}
+ }
+ }
+
+ memstats.mcache_inuse = uint64(mheap_.cachealloc.inuse)
+ memstats.mspan_inuse = uint64(mheap_.spanalloc.inuse)
+ memstats.sys = memstats.heap_sys + memstats.stacks_sys + memstats.mspan_sys +
+ memstats.mcache_sys + memstats.buckhash_sys + memstats.gc_sys + memstats.other_sys
+
+ // Calculate memory allocator stats.
+ // During program execution we only count number of frees and amount of freed memory.
+ // Current number of alive object in the heap and amount of alive heap memory
+ // are calculated by scanning all spans.
+ // Total number of mallocs is calculated as number of frees plus number of alive objects.
+ // Similarly, total amount of allocated memory is calculated as amount of freed memory
+ // plus amount of alive heap memory.
+ memstats.alloc = 0
+ memstats.total_alloc = 0
+ memstats.nmalloc = 0
+ memstats.nfree = 0
+ for i := 0; i < len(memstats.by_size); i++ {
+ memstats.by_size[i].nmalloc = 0
+ memstats.by_size[i].nfree = 0
+ }
+
+ // Flush MCache's to MCentral.
+ systemstack(flushallmcaches)
+
+ // Aggregate local stats.
+ cachestats()
+
+ // Scan all spans and count number of alive objects.
+ lock(&mheap_.lock)
+ for i := uint32(0); i < mheap_.nspan; i++ {
+ s := h_allspans[i]
+ if s.state != mSpanInUse {
+ continue
+ }
+ if s.sizeclass == 0 {
+ memstats.nmalloc++
+ memstats.alloc += uint64(s.elemsize)
+ } else {
+ memstats.nmalloc += uint64(s.ref)
+ memstats.by_size[s.sizeclass].nmalloc += uint64(s.ref)
+ memstats.alloc += uint64(s.ref) * uint64(s.elemsize)
+ }
+ }
+ unlock(&mheap_.lock)
+
+ // Aggregate by size class.
+ smallfree := uint64(0)
+ memstats.nfree = mheap_.nlargefree
+ for i := 0; i < len(memstats.by_size); i++ {
+ memstats.nfree += mheap_.nsmallfree[i]
+ memstats.by_size[i].nfree = mheap_.nsmallfree[i]
+ memstats.by_size[i].nmalloc += mheap_.nsmallfree[i]
+ smallfree += uint64(mheap_.nsmallfree[i]) * uint64(class_to_size[i])
+ }
+ memstats.nfree += memstats.tinyallocs
+ memstats.nmalloc += memstats.nfree
+
+ // Calculate derived stats.
+ memstats.total_alloc = uint64(memstats.alloc) + uint64(mheap_.largefree) + smallfree
+ memstats.heap_alloc = memstats.alloc
+ memstats.heap_objects = memstats.nmalloc - memstats.nfree
+ }
+
+ func gcinit() {
+ if unsafe.Sizeof(workbuf{}) != _WorkbufSize {
+ gothrow("runtime: size of Workbuf is suboptimal")
+ }
+
+ work.markfor = parforalloc(_MaxGcproc)
+ gcpercent = readgogc()
+ gcdatamask = unrollglobgcprog((*byte)(unsafe.Pointer(&gcdata)), uintptr(unsafe.Pointer(&edata))-uintptr(unsafe.Pointer(&data)))
+ gcbssmask = unrollglobgcprog((*byte)(unsafe.Pointer(&gcbss)), uintptr(unsafe.Pointer(&ebss))-uintptr(unsafe.Pointer(&bss)))
+ }
+
++// Called from malloc.go using onM; stopping and starting the world is handled by the caller.
+ func gc_m(start_time int64, eagersweep bool) {
+ _g_ := getg()
+ gp := _g_.m.curg
+ casgstatus(gp, _Grunning, _Gwaiting)
+ gp.waitreason = "garbage collection"
+
+ gc(start_time, eagersweep)
++ casgstatus(gp, _Gwaiting, _Grunning)
++}
++
++// Similar to clearcheckmarkbits but works on a single span.
++// It performs two tasks.
++// 1. When used before the checkmark phase it converts BitsDead (00) to BitsScalar (01)
++// for nibbles with the BoundaryBit set.
++// 2. When used after the checkmark phase it converts BitsPointerMark (11) to BitsPointer (10) and
++// BitsScalarMark (00) to BitsScalar (01), thus clearing the checkmark mark encoding.
++// For the second case it is possible to restore the BitsDead pattern but since
++// clearmark is a debug tool performance has a lower priority than simplicity.
++// The span is MSpanInUse and the world is stopped.
++func clearcheckmarkbitsspan(s *mspan) {
++ if s.state != _MSpanInUse {
++ print("runtime:clearcheckmarkbitsspan: state=", s.state, "\n")
++ gothrow("clearcheckmarkbitsspan: bad span state")
++ }
+
- casgstatus(gp, _Gwaiting, _Grunning)
++ arena_start := mheap_.arena_start
++ cl := s.sizeclass
++ size := s.elemsize
++ var n int32
++ if cl == 0 {
++ n = 1
++ } else {
++ // Chunk full of small blocks
++ npages := class_to_allocnpages[cl]
++ n = npages << _PageShift / int32(size)
++ }
++
++ // MSpan_Sweep has similar code but instead of overloading and
++ // complicating that routine we do a simpler walk here.
++ // Sweep through n objects of given size starting at p.
++ // This thread owns the span now, so it can manipulate
++ // the block bitmap without atomic operations.
++ p := uintptr(s.start) << _PageShift
++
++ // Find bits for the beginning of the span.
++ off := (p - arena_start) / ptrSize
++ bitp := (*byte)(unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1))
++ step := size / (ptrSize * wordsPerBitmapByte)
++
++ // The type bit values are:
++ // 00 - BitsDead, for us BitsScalarMarked
++ // 01 - BitsScalar
++ // 10 - BitsPointer
++ // 11 - unused, for us BitsPointerMarked
++ //
++ // When called to prepare for the checkmark phase (checkmark==1),
++ // we change BitsDead to BitsScalar, so that there are no BitsScalarMarked
++ // type bits anywhere.
++ //
++ // The checkmark phase marks by changing BitsScalar to BitsScalarMarked
++ // and BitsPointer to BitsPointerMarked.
++ //
++ // When called to clean up after the checkmark phase (checkmark==0),
++ // we unmark by changing BitsScalarMarked back to BitsScalar and
++ // BitsPointerMarked back to BitsPointer.
++ //
++ // There are two problems with the scheme as just described.
++ // First, the setup rewrites BitsDead to BitsScalar, but the type bits
++ // following a BitsDead are uninitialized and must not be used.
++ // Second, objects that are free are expected to have their type
++ // bits zeroed (BitsDead), so in the cleanup we need to restore
++ // any BitsDeads that were there originally.
++ //
++ // In a one-word object (8-byte allocation on 64-bit system),
++ // there is no difference between BitsScalar and BitsDead, because
++ // neither is a pointer and there are no more words in the object,
++ // so using BitsScalar during the checkmark is safe and mapping
++ // both back to BitsDead during cleanup is also safe.
++ //
++ // In a larger object, we need to be more careful. During setup,
++ // if the type of the first word is BitsDead, we change it to BitsScalar
++ // (as we must) but also initialize the type of the second
++ // word to BitsDead, so that a scan during the checkmark phase
++ // will still stop before seeing the uninitialized type bits in the
++ // rest of the object. The sequence 'BitsScalar BitsDead' never
++ // happens in real type bitmaps - BitsDead is always as early
++ // as possible, so immediately after the last BitsPointer.
++ // During cleanup, if we see a BitsScalar, we can check to see if it
++ // is followed by BitsDead. If so, it was originally BitsDead and
++ // we can change it back.
++
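++ // Worked example (illustration): with checkmark set, a first word whose
++ // type is BitsDead (00) becomes BitsScalar (01) and the second word is
++ // forced to BitsDead, i.e. 'Dead ?' -> 'Scalar Dead'. On cleanup the pair
++ // 'Scalar Dead' is folded back to Dead, which is safe precisely because
++ // 'BitsScalar BitsDead' never occurs in real type bitmaps.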
++ if step == 0 {
++ // updating top and bottom nibbles, all boundaries
++ for i := int32(0); i < n/2; i, bitp = i+1, addb(bitp, uintptrMask&-1) {
++ if *bitp&bitBoundary == 0 {
++ gothrow("missing bitBoundary")
++ }
++ b := (*bitp & bitPtrMask) >> 2
++ if !checkmark && (b == _BitsScalar || b == _BitsScalarMarked) {
++ *bitp &^= 0x0c // convert to _BitsDead
++ } else if b == _BitsScalarMarked || b == _BitsPointerMarked {
++ *bitp &^= _BitsCheckMarkXor << 2
++ }
++
++ if (*bitp>>gcBits)&bitBoundary == 0 {
++ gothrow("missing bitBoundary")
++ }
++ b = ((*bitp >> gcBits) & bitPtrMask) >> 2
++ if !checkmark && (b == _BitsScalar || b == _BitsScalarMarked) {
++ *bitp &^= 0xc0 // convert to _BitsDead
++ } else if b == _BitsScalarMarked || b == _BitsPointerMarked {
++ *bitp &^= _BitsCheckMarkXor << (2 + gcBits)
++ }
++ }
++ } else {
++ // updating bottom nibble for first word of each object
++ for i := int32(0); i < n; i, bitp = i+1, addb(bitp, -step) {
++ if *bitp&bitBoundary == 0 {
++ gothrow("missing bitBoundary")
++ }
++ b := (*bitp & bitPtrMask) >> 2
++
++ if checkmark && b == _BitsDead {
++ // move BitsDead into second word.
++ // set bits to BitsScalar in preparation for checkmark phase.
++ *bitp &^= 0xc0
++ *bitp |= _BitsScalar << 2
++ } else if !checkmark && (b == _BitsScalar || b == _BitsScalarMarked) && *bitp&0xc0 == 0 {
++ // Cleaning up after checkmark phase.
++ // First word is scalar or dead (we forgot)
++ // and second word is dead.
++ // First word might as well be dead too.
++ *bitp &^= 0x0c
++ } else if b == _BitsScalarMarked || b == _BitsPointerMarked {
++ *bitp ^= _BitsCheckMarkXor << 2
+ }
+ }
+ }
++}
+
- // Sweep what is not sweeped by bgsweep.
- for sweepone() != ^uintptr(0) {
- sweep.npausesweep++
++// clearcheckmarkbits performs two tasks.
++// 1. When used before the checkmark phase it converts BitsDead (00) to BitsScalar (01)
++// for nibbles with the BoundaryBit set.
++// 2. When used after the checkmark phase it converts BitsPointerMark (11) to BitsPointer (10) and
++// BitsScalarMark (00) to BitsScalar (01), thus clearing the checkmark mark encoding.
++// This is a bit expensive but preserves the BitsDead encoding during the normal marking.
++// BitsDead remains valid for every nibble except the ones with BitsBoundary set.
++func clearcheckmarkbits() {
++ for _, s := range work.spans {
++ if s.state == _MSpanInUse {
++ clearcheckmarkbitsspan(s)
++ }
++ }
++}
++
++// Called from malloc.go using onM.
++// The world is stopped. Rerun the scan and mark phases
++// using the bitMarkedCheck bit instead of the
++// bitMarked bit. If the checkmark marking reaches an object
++// whose bitMarked bit is not set, we throw.
++func gccheckmark_m(startTime int64, eagersweep bool) {
++ if !gccheckmarkenable {
++ return
++ }
++
++ if checkmark {
++ gothrow("gccheckmark_m, entered with checkmark already true")
++ }
++
++ checkmark = true
++ clearcheckmarkbits() // Converts BitsDead to BitsScalar.
++ gc_m(startTime, eagersweep) // turns off checkmark
++ // Work done, fix up the GC bitmap to remove the checkmark bits.
++ clearcheckmarkbits()
++}
++
++func gccheckmarkenable_m() {
++ gccheckmarkenable = true
++}
++
++func gccheckmarkdisable_m() {
++ gccheckmarkenable = false
++}
++
++func finishsweep_m() {
++ // The world is stopped so we should be able to complete the sweeps
++ // quickly.
++ for sweepone() != ^uintptr(0) {
++ sweep.npausesweep++
++ }
++
++ // There may be some other spans being swept concurrently that
++ // we need to wait for. If finishsweep_m is done with the world stopped
++ // this code is not required.
++ sg := mheap_.sweepgen
++ for _, s := range work.spans {
++ if s.sweepgen != sg && s.state == _MSpanInUse {
++ mSpan_EnsureSwept(s)
++ }
++ }
++}
++
++// Scan all of the stacks, greying (or graying if in America) the referents
++// but not blackening them since the mark write barrier isn't installed.
++func gcscan_m() {
++ _g_ := getg()
++
++ // Grab the g that called us and potentially allow rescheduling.
++ // This allows it to be scanned like other goroutines.
++ mastergp := _g_.m.curg
++ casgstatus(mastergp, _Grunning, _Gwaiting)
++ mastergp.waitreason = "garbage collection scan"
++
++ // Span sweeping has been done by finishsweep_m.
++ // Long term we will want to make this goroutine runnable
++ // by placing it onto a scanenqueue state and then calling
++ // runtime·restartg(mastergp) to make it Grunnable.
++ // At the bottom we will want to return this p back to the scheduler.
++ oldphase := gcphase
++
++ // Prepare flag indicating that the scan has not been completed.
++ lock(&allglock)
++ local_allglen := allglen
++ for i := uintptr(0); i < local_allglen; i++ {
++ gp := allgs[i]
++ gp.gcworkdone = false // set to true in gcphasework
++ }
++ unlock(&allglock)
++
++ work.nwait = 0
++ work.ndone = 0
++ work.nproc = 1 // For now do not do this in parallel.
++ gcphase = _GCscan
++ // ackgcphase is not needed since we are not scanning running goroutines.
++ parforsetup(work.markfor, work.nproc, uint32(_RootCount+local_allglen), nil, false, markroot)
++ parfordo(work.markfor)
++
++ lock(&allglock)
++ // Check that gc work is done.
++ for i := uintptr(0); i < local_allglen; i++ {
++ gp := allgs[i]
++ if !gp.gcworkdone {
++ gothrow("scan missed a g")
++ }
++ }
++ unlock(&allglock)
++
++ gcphase = oldphase
++ casgstatus(mastergp, _Gwaiting, _Grunning)
++ // Let the g that called us continue to run.
++}
++
++// Mark all objects that are known about.
++func gcmark_m() {
++ scanblock(0, 0, nil)
++}
++
++// For now this must be bracketed with a stoptheworld and a starttheworld to ensure
++// all goroutines see the new barrier.
++func gcinstallmarkwb_m() {
++ gcphase = _GCmark
++}
++
++// For now this must be bracketed with a stoptheworld and a starttheworld to ensure
++// all goroutines see the new barrier.
++func gcinstalloffwb_m() {
++ gcphase = _GCoff
+ }
+
+ func gc(start_time int64, eagersweep bool) {
+ if _DebugGCPtrs {
+ print("GC start\n")
+ }
+
+ if debug.allocfreetrace > 0 {
+ tracegc()
+ }
+
+ _g_ := getg()
+ _g_.m.traceback = 2
+ t0 := start_time
+ work.tstart = start_time
+
+ var t1 int64
+ if debug.gctrace > 0 {
+ t1 = nanotime()
+ }
+
++ if !checkmark {
++ finishsweep_m() // skip during checkmark debug phase.
+ }
+
+ // Cache runtime.mheap_.allspans in work.spans to avoid conflicts with
+ // resizing/freeing allspans.
+ // New spans can be created while GC progresses, but they are not garbage for
+ // this round:
+ // - new stack spans can be created even while the world is stopped.
+ // - new malloc spans can be created during the concurrent sweep
+
+ // Even if this is stop-the-world, a concurrent exitsyscall can allocate a stack from heap.
+ lock(&mheap_.lock)
+ // Free the old cached sweep array if necessary.
+ if work.spans != nil && &work.spans[0] != &h_allspans[0] {
+ sysFree(unsafe.Pointer(&work.spans[0]), uintptr(len(work.spans))*unsafe.Sizeof(work.spans[0]), &memstats.other_sys)
+ }
+ // Cache the current array for marking.
+ mheap_.gcspans = mheap_.allspans
+ work.spans = h_allspans
+ unlock(&mheap_.lock)
++ oldphase := gcphase
+
+ work.nwait = 0
+ work.ndone = 0
+ work.nproc = uint32(gcprocs())
++ gcphase = _GCmarktermination
++
++ // World is stopped so allglen will not change.
++ for i := uintptr(0); i < allglen; i++ {
++ gp := allgs[i]
++ gp.gcworkdone = false // set to true in gcphasework
++ }
++
+ parforsetup(work.markfor, work.nproc, uint32(_RootCount+allglen), nil, false, markroot)
+ if work.nproc > 1 {
+ noteclear(&work.alldone)
+ helpgc(int32(work.nproc))
+ }
+
+ var t2 int64
+ if debug.gctrace > 0 {
+ t2 = nanotime()
+ }
+
+ gchelperstart()
+ parfordo(work.markfor)
+ scanblock(0, 0, nil)
+
++ if work.full != 0 {
++ gothrow("work.full != 0")
++ }
++ if work.partial != 0 {
++ gothrow("work.partial != 0")
++ }
++
++ gcphase = oldphase
+ var t3 int64
+ if debug.gctrace > 0 {
+ t3 = nanotime()
+ }
+
+ if work.nproc > 1 {
+ notesleep(&work.alldone)
+ }
+
+ shrinkfinish()
+
+ cachestats()
+ // next_gc calculation is tricky with concurrent sweep since we don't know size of live heap
+ // estimate what was live heap size after previous GC (for printing only)
+ heap0 := memstats.next_gc * 100 / (uint64(gcpercent) + 100)
+ // conservatively set next_gc to high value assuming that everything is live
+ // concurrent/lazy sweep will reduce this number while discovering new garbage
+ memstats.next_gc = memstats.heap_alloc + memstats.heap_alloc*uint64(gcpercent)/100
+
+ t4 := nanotime()
+ atomicstore64(&memstats.last_gc, uint64(unixnanotime())) // must be Unix time to make sense to user
+ memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(t4 - t0)
+ memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(t4)
+ memstats.pause_total_ns += uint64(t4 - t0)
+ memstats.numgc++
+ if memstats.debuggc {
+ print("pause ", t4-t0, "\n")
+ }
+
+ if debug.gctrace > 0 {
+ heap1 := memstats.heap_alloc
+ var stats gcstats
+ updatememstats(&stats)
+ if heap1 != memstats.heap_alloc {
+ print("runtime: mstats skew: heap=", heap1, "/", memstats.heap_alloc, "\n")
+ gothrow("mstats skew")
+ }
+ obj := memstats.nmalloc - memstats.nfree
+
+ stats.nprocyield += work.markfor.nprocyield
+ stats.nosyield += work.markfor.nosyield
+ stats.nsleep += work.markfor.nsleep
+
+ print("gc", memstats.numgc, "(", work.nproc, "): ",
+ (t1-t0)/1000, "+", (t2-t1)/1000, "+", (t3-t2)/1000, "+", (t4-t3)/1000, " us, ",
+ heap0>>20, " -> ", heap1>>20, " MB, ",
+ obj, " (", memstats.nmalloc, "-", memstats.nfree, ") objects, ",
+ gcount(), " goroutines, ",
+ len(work.spans), "/", sweep.nbgsweep, "/", sweep.npausesweep, " sweeps, ",
+ stats.nhandoff, "(", stats.nhandoffcnt, ") handoff, ",
+ work.markfor.nsteal, "(", work.markfor.nstealcnt, ") steal, ",
+ stats.nprocyield, "/", stats.nosyield, "/", stats.nsleep, " yields\n")
+ sweep.nbgsweep = 0
+ sweep.npausesweep = 0
+ }
+
+ // See the comment in the beginning of this function as to why we need the following.
+ // Even if this is still stop-the-world, a concurrent exitsyscall can allocate a stack from heap.
+ lock(&mheap_.lock)
+ // Free the old cached mark array if necessary.
+ if work.spans != nil && &work.spans[0] != &h_allspans[0] {
+ sysFree(unsafe.Pointer(&work.spans[0]), uintptr(len(work.spans))*unsafe.Sizeof(work.spans[0]), &memstats.other_sys)
+ }
+
++ if gccheckmarkenable {
++ if !checkmark {
++ // first half of two-pass; don't set up sweep
++ unlock(&mheap_.lock)
++ return
++ }
++ checkmark = false // done checking marks
++ }
++
+ // Cache the current array for sweeping.
+ mheap_.gcspans = mheap_.allspans
+ mheap_.sweepgen += 2
+ mheap_.sweepdone = 0
+ work.spans = h_allspans
+ sweep.spanidx = 0
+ unlock(&mheap_.lock)
+
+ if _ConcurrentSweep && !eagersweep {
+ lock(&gclock)
+ if !sweep.started {
+ go bgsweep()
+ sweep.started = true
+ } else if sweep.parked {
+ sweep.parked = false
+ ready(sweep.g)
+ }
+ unlock(&gclock)
+ } else {
+ // Sweep all spans eagerly.
+ for sweepone() != ^uintptr(0) {
+ sweep.npausesweep++
+ }
+ // Do an additional mProf_GC, because all 'free' events are now real as well.
+ mProf_GC()
+ }
+
+ mProf_GC()
+ _g_.m.traceback = 0
+
+ if _DebugGCPtrs {
+ print("GC end\n")
+ }
+ }
+
+ func readmemstats_m(stats *MemStats) {
+ updatememstats(nil)
+
+ // Size of the trailing by_size array differs between Go and C,
+ // NumSizeClasses was changed, but we cannot change the Go struct because of backward compatibility.
+ memmove(unsafe.Pointer(stats), unsafe.Pointer(&memstats), sizeof_C_MStats)
+
+ // Stack numbers are part of the heap numbers, separate those out for user consumption
+ stats.StackSys = stats.StackInuse
+ stats.HeapInuse -= stats.StackInuse
+ stats.HeapSys -= stats.StackInuse
+ }
+
+ //go:linkname readGCStats runtime/debug.readGCStats
+ func readGCStats(pauses *[]uint64) {
+ systemstack(func() {
+ readGCStats_m(pauses)
+ })
+ }
+
+ func readGCStats_m(pauses *[]uint64) {
+ p := *pauses
+ // Calling code in runtime/debug should make the slice large enough.
+ if cap(p) < len(memstats.pause_ns)+3 {
+ gothrow("runtime: short slice passed to readGCStats")
+ }
+
+ // Pass back: pauses, pause ends, last gc (absolute time), number of gc, total pause ns.
+ lock(&mheap_.lock)
+
+ n := memstats.numgc
+ if n > uint32(len(memstats.pause_ns)) {
+ n = uint32(len(memstats.pause_ns))
+ }
+
+ // The pause buffer is circular. The most recent pause is at
+ // pause_ns[(numgc-1)%len(pause_ns)], and then backward
+ // from there to go back farther in time. We deliver the times
+ // most recent first (in p[0]).
+ p = p[:cap(p)]
+ for i := uint32(0); i < n; i++ {
+ j := (memstats.numgc - 1 - i) % uint32(len(memstats.pause_ns))
+ p[i] = memstats.pause_ns[j]
+ p[n+i] = memstats.pause_end[j]
+ }
+
+ p[n+n] = memstats.last_gc
+ p[n+n+1] = uint64(memstats.numgc)
+ p[n+n+2] = memstats.pause_total_ns
+ unlock(&mheap_.lock)
+ *pauses = p[:n+n+3]
+ }
+
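++// Aside (worked arithmetic): the pause buffer is a ring indexed modulo
++// len(pause_ns); e.g. with numgc = 260 and a 256-entry buffer, i = 0 reads
++// slot (260-1-0)%256 = 3 (the most recent pause), i = 1 reads slot 2, and
++// so on backward in time.
++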
+ func setGCPercent(in int32) (out int32) {
+ lock(&mheap_.lock)
+ out = gcpercent
+ if in < 0 {
+ in = -1
+ }
+ gcpercent = in
+ unlock(&mheap_.lock)
+ return out
+ }
+
+ func gchelperstart() {
+ _g_ := getg()
+
+ if _g_.m.helpgc < 0 || _g_.m.helpgc >= _MaxGcproc {
+ gothrow("gchelperstart: bad m->helpgc")
+ }
+ if _g_ != _g_.m.g0 {
+ gothrow("gchelper not running on g0 stack")
+ }
+ }
+
+ func wakefing() *g {
+ var res *g
+ lock(&finlock)
+ if fingwait && fingwake {
+ fingwait = false
+ fingwake = false
+ res = fing
+ }
+ unlock(&finlock)
+ return res
+ }
+
+ func addb(p *byte, n uintptr) *byte {
+ return (*byte)(add(unsafe.Pointer(p), n))
+ }
+
+ // Recursively unrolls GC program in prog.
+ // mask is where to store the result.
+ // ppos is a pointer to position in mask, in bits.
+ // sparse says to generate 4-bits per word mask for heap (2-bits for data/bss otherwise).
+ func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace, sparse bool) *byte {
+ arena_start := mheap_.arena_start
+ pos := *ppos
+ mask := (*[1 << 30]byte)(unsafe.Pointer(maskp))
+ for {
+ switch *prog {
+ default:
+ gothrow("unrollgcprog: unknown instruction")
+
+ case insData:
+ prog = addb(prog, 1)
+ siz := int(*prog)
+ prog = addb(prog, 1)
+ p := (*[1 << 30]byte)(unsafe.Pointer(prog))
+ for i := 0; i < siz; i++ {
+ v := p[i/_PointersPerByte]
+ v >>= (uint(i) % _PointersPerByte) * _BitsPerPointer
+ v &= _BitsMask
+ if inplace {
+ // Store directly into GC bitmap.
+ off := (uintptr(unsafe.Pointer(&mask[pos])) - arena_start) / ptrSize
+ bitp := (*byte)(unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1))
+ shift := (off % wordsPerBitmapByte) * gcBits
+ if shift == 0 {
+ *bitp = 0
+ }
+ *bitp |= v << (shift + 2)
+ pos += ptrSize
+ } else if sparse {
+ // 4-bits per word
+ v <<= (pos % 8) + 2
+ mask[pos/8] |= v
+ pos += gcBits
+ } else {
+ // 2-bits per word
+ v <<= pos % 8
+ mask[pos/8] |= v
+ pos += _BitsPerPointer
+ }
+ }
+ prog = addb(prog, round(uintptr(siz)*_BitsPerPointer, 8)/8)
+
+ case insArray:
+ prog = (*byte)(add(unsafe.Pointer(prog), 1))
+ siz := uintptr(0)
+ for i := uintptr(0); i < ptrSize; i++ {
+ siz = (siz << 8) + uintptr(*(*byte)(add(unsafe.Pointer(prog), ptrSize-i-1)))
+ }
+ prog = (*byte)(add(unsafe.Pointer(prog), ptrSize))
+ var prog1 *byte
+ for i := uintptr(0); i < siz; i++ {
+ prog1 = unrollgcprog1(&mask[0], prog, &pos, inplace, sparse)
+ }
+ if *prog1 != insArrayEnd {
+ gothrow("unrollgcprog: array does not end with insArrayEnd")
+ }
+ prog = (*byte)(add(unsafe.Pointer(prog1), 1))
+
+ case insArrayEnd, insEnd:
+ *ppos = pos
+ return prog
+ }
+ }
+ }
+
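++// Aside (example retained from the old C header comment): for the type
++//
++//	struct { x []byte; y [20]struct{ z int; w *byte } }
++//
++// the program is
++//
++//	insData 3 (BitsMultiWord BitsSlice BitsScalar)
++//	insArray 20 insData 2 (BitsScalar BitsPointer) insArrayEnd insEnd
++//
++// so a 20-element array costs O(1) program bytes instead of O(n) mask bytes.
++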
+ // Unrolls GC program prog for data/bss, returns dense GC mask.
+ func unrollglobgcprog(prog *byte, size uintptr) bitvector {
+ masksize := round(round(size, ptrSize)/ptrSize*bitsPerPointer, 8) / 8
+ mask := (*[1 << 30]byte)(persistentalloc(masksize+1, 0, &memstats.gc_sys))
+ mask[masksize] = 0xa1
+ pos := uintptr(0)
+ prog = unrollgcprog1(&mask[0], prog, &pos, false, false)
+ if pos != size/ptrSize*bitsPerPointer {
+ print("unrollglobgcprog: bad program size, got ", pos, ", expect ", size/ptrSize*bitsPerPointer, "\n")
+ gothrow("unrollglobgcprog: bad program size")
+ }
+ if *prog != insEnd {
+ gothrow("unrollglobgcprog: program does not end with insEnd")
+ }
+ if mask[masksize] != 0xa1 {
+ gothrow("unrollglobgcprog: overflow")
+ }
+ return bitvector{int32(masksize * 8), &mask[0]}
+ }
+
+ func unrollgcproginplace_m(v unsafe.Pointer, typ *_type, size, size0 uintptr) {
+ pos := uintptr(0)
+ prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1])))
+ for pos != size0 {
+ unrollgcprog1((*byte)(v), prog, &pos, true, true)
+ }
+
+ // Mark first word with bitBoundary.
+ arena_start := mheap_.arena_start
+ off := (uintptr(v) - arena_start) / ptrSize
+ bitp := (*byte)(unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1))
+ shift := (off % wordsPerBitmapByte) * gcBits
+ *bitp |= bitBoundary << shift
+
+ // Mark word after last as BitsDead.
+ if size0 < size {
+ off := (uintptr(v) + size0 - arena_start) / ptrSize
+ bitp := (*byte)(unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1))
+ shift := (off % wordsPerBitmapByte) * gcBits
+ *bitp &= uint8(^(bitPtrMask << shift) | uintptr(bitsDead)<<(shift+2))
+ }
+ }
+
+ var unroll mutex
+
+ // Unrolls GC program in typ.gc[1] into typ.gc[0]
+ func unrollgcprog_m(typ *_type) {
+ lock(&unroll)
+ mask := (*byte)(unsafe.Pointer(uintptr(typ.gc[0])))
+ if *mask == 0 {
+ pos := uintptr(8) // skip the unroll flag
+ prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1])))
+ prog = unrollgcprog1(mask, prog, &pos, false, true)
+ if *prog != insEnd {
+ gothrow("unrollgcprog: program does not end with insEnd")
+ }
+ if typ.size/ptrSize%2 != 0 {
+ // repeat the program
+ prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1])))
+ unrollgcprog1(mask, prog, &pos, false, true)
+ }
+
+ // atomic way to say mask[0] = 1
+ atomicor8(mask, 1)
+ }
+ unlock(&unroll)
+ }
+
+ // mark the span of memory at v as having n blocks of the given size.
+ // if leftover is true, there is left over space at the end of the span.
+ func markspan(v unsafe.Pointer, size uintptr, n uintptr, leftover bool) {
+ if uintptr(v)+size*n > mheap_.arena_used || uintptr(v) < mheap_.arena_start {
+ gothrow("markspan: bad pointer")
+ }
+
+ // Find bits of the beginning of the span.
+ off := (uintptr(v) - uintptr(mheap_.arena_start)) / ptrSize
+ if off%wordsPerBitmapByte != 0 {
+ gothrow("markspan: unaligned length")
+ }
+ b := mheap_.arena_start - off/wordsPerBitmapByte - 1
+
+ // Okay to use non-atomic ops here, because we control
+ // the entire span, and each bitmap byte has bits for only
+ // one span, so no other goroutines are changing these bitmap words.
+
+ if size == ptrSize {
+ // Possible only on 64-bits (minimal size class is 8 bytes).
+ // Set memory to 0x11.
+ if (bitBoundary|bitsDead)<<gcBits|bitBoundary|bitsDead != 0x11 {
+ gothrow("markspan: bad bits")
+ }
+ if n%(wordsPerBitmapByte*ptrSize) != 0 {
+ gothrow("markspan: unaligned length")
+ }
+ b = b - n/wordsPerBitmapByte + 1 // find first byte
+ if b%ptrSize != 0 {
+ gothrow("markspan: unaligned pointer")
+ }
+ for i := uintptr(0); i < n; i, b = i+wordsPerBitmapByte*ptrSize, b+ptrSize {
+ *(*uintptr)(unsafe.Pointer(b)) = uintptrMask & 0x1111111111111111 // bitBoundary | bitsDead, repeated
+ }
+ return
+ }
+
+ if leftover {
+ n++ // mark a boundary just past end of last block too
+ }
+ step := size / (ptrSize * wordsPerBitmapByte)
+ for i := uintptr(0); i < n; i, b = i+1, b-step {
+ *(*byte)(unsafe.Pointer(b)) = bitBoundary | bitsDead<<2
+ }
+ }
+
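++// Aside (worked arithmetic, 64-bit): wordsPerBitmapByte = 8/gcBits = 2, so a
++// 48-byte size class gives step = 48/(8*2) = 3, and consecutive object
++// boundaries land 3 bitmap bytes apart walking downward (b - step).
++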
+ // unmark the span of memory at v of length n bytes.
+ func unmarkspan(v, n uintptr) {
+ if v+n > mheap_.arena_used || v < mheap_.arena_start {
+ gothrow("markspan: bad pointer")
+ }
+
+ off := (v - mheap_.arena_start) / ptrSize // word offset
+ if off%(ptrSize*wordsPerBitmapByte) != 0 {
+ gothrow("markspan: unaligned pointer")
+ }
+
+ b := mheap_.arena_start - off/wordsPerBitmapByte - 1
+ n /= ptrSize
+ if n%(ptrSize*wordsPerBitmapByte) != 0 {
+ gothrow("unmarkspan: unaligned length")
+ }
+
+ // Okay to use non-atomic ops here, because we control
+ // the entire span, and each bitmap word has bits for only
+ // one span, so no other goroutines are changing these
+ // bitmap words.
+ n /= wordsPerBitmapByte
+ memclr(unsafe.Pointer(b-n+1), n)
+ }
+
+ func mHeap_MapBits(h *mheap) {
+ // Caller has added extra mappings to the arena.
+ // Add extra mappings of bitmap words as needed.
+ // We allocate extra bitmap pieces in chunks of bitmapChunk.
+ const bitmapChunk = 8192
+
+ n := (h.arena_used - h.arena_start) / (ptrSize * wordsPerBitmapByte)
+ n = round(n, bitmapChunk)
+ n = round(n, _PhysPageSize)
+ if h.bitmap_mapped >= n {
+ return
+ }
+
+ sysMap(unsafe.Pointer(h.arena_start-n), n-h.bitmap_mapped, h.arena_reserved, &memstats.gc_sys)
+ h.bitmap_mapped = n
+ }
+
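++// Aside (worked arithmetic, 64-bit): one bitmap byte covers
++// ptrSize*wordsPerBitmapByte = 16 bytes of arena, so the bitmap occupies
++// 1/16 of the arena and grows downward from arena_start (hence the sysMap
++// at arena_start-n above).
++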
+ func getgcmaskcb(frame *stkframe, ctxt unsafe.Pointer) bool {
+ target := (*stkframe)(ctxt)
+ if frame.sp <= target.sp && target.sp < frame.varp {
+ *target = *frame
+ return false
+ }
+ return true
+ }
+
+ // Returns GC type info for object p for testing.
+ func getgcmask(p unsafe.Pointer, t *_type, mask **byte, len *uintptr) {
+ *mask = nil
+ *len = 0
+
+ // data
+ if uintptr(unsafe.Pointer(&data)) <= uintptr(p) && uintptr(p) < uintptr(unsafe.Pointer(&edata)) {
+ n := (*ptrtype)(unsafe.Pointer(t)).elem.size
+ *len = n / ptrSize
+ *mask = &make([]byte, *len)[0]
+ for i := uintptr(0); i < n; i += ptrSize {
+ off := (uintptr(p) + i - uintptr(unsafe.Pointer(&data))) / ptrSize
+ bits := (*(*byte)(add(unsafe.Pointer(gcdatamask.bytedata), off/pointersPerByte)) >> ((off % pointersPerByte) * bitsPerPointer)) & bitsMask
+ *(*byte)(add(unsafe.Pointer(*mask), i/ptrSize)) = bits
+ }
+ return
+ }
+
+ // bss
+ if uintptr(unsafe.Pointer(&bss)) <= uintptr(p) && uintptr(p) < uintptr(unsafe.Pointer(&ebss)) {
+ n := (*ptrtype)(unsafe.Pointer(t)).elem.size
+ *len = n / ptrSize
+ *mask = &make([]byte, *len)[0]
+ for i := uintptr(0); i < n; i += ptrSize {
+ off := (uintptr(p) + i - uintptr(unsafe.Pointer(&bss))) / ptrSize
+ bits := (*(*byte)(add(unsafe.Pointer(gcbssmask.bytedata), off/pointersPerByte)) >> ((off % pointersPerByte) * bitsPerPointer)) & bitsMask
+ *(*byte)(add(unsafe.Pointer(*mask), i/ptrSize)) = bits
+ }
+ return
+ }
+
+ // heap
+ var n uintptr
+ var base uintptr
+ if mlookup(uintptr(p), &base, &n, nil) != 0 {
+ *len = n / ptrSize
+ *mask = &make([]byte, *len)[0]
+ for i := uintptr(0); i < n; i += ptrSize {
+ off := (uintptr(base) + i - mheap_.arena_start) / ptrSize
+ b := mheap_.arena_start - off/wordsPerBitmapByte - 1
+ shift := (off % wordsPerBitmapByte) * gcBits
+ bits := (*(*byte)(unsafe.Pointer(b)) >> (shift + 2)) & bitsMask
+ *(*byte)(add(unsafe.Pointer(*mask), i/ptrSize)) = bits
+ }
+ return
+ }
+
+ // stack
+ var frame stkframe
+ frame.sp = uintptr(p)
+ _g_ := getg()
+ gentraceback(_g_.m.curg.sched.pc, _g_.m.curg.sched.sp, 0, _g_.m.curg, 0, nil, 1000, getgcmaskcb, noescape(unsafe.Pointer(&frame)), 0)
+ if frame.fn != nil {
+ f := frame.fn
+ targetpc := frame.continpc
+ if targetpc == 0 {
+ return
+ }
+ if targetpc != f.entry {
+ targetpc--
+ }
+ pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
+ if pcdata == -1 {
+ return
+ }
+ stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
+ if stkmap == nil || stkmap.n <= 0 {
+ return
+ }
+ bv := stackmapdata(stkmap, pcdata)
+ size := uintptr(bv.n) / bitsPerPointer * ptrSize
+ n := (*ptrtype)(unsafe.Pointer(t)).elem.size
+ *len = n / ptrSize
+ *mask = &make([]byte, *len)[0]
+ for i := uintptr(0); i < n; i += ptrSize {
+ off := (uintptr(p) + i - frame.varp + size) / ptrSize
+ bits := ((*(*byte)(add(unsafe.Pointer(bv.bytedata), off*bitsPerPointer/8))) >> ((off * bitsPerPointer) % 8)) & bitsMask
+ *(*byte)(add(unsafe.Pointer(*mask), i/ptrSize)) = bits
+ }
+ }
+ }
+
+ func unixnanotime() int64 {
+ var now int64
+ gc_unixnanotime(&now)
+ return now
+ }
//go:nosplit
func writebarrierptr(dst *uintptr, src uintptr) {
*dst = src
- onM(func() { gothrow("bad pointer in write barrier") })
+ writebarrierptr_nostore(dst, src)
+}
+
+// Like writebarrierptr, but the store has already been applied.
+// Do not reapply.
+//go:nosplit
+func writebarrierptr_nostore(dst *uintptr, src uintptr) {
+ if getg() == nil { // very low-level startup
+ return
+ }
+
+ if src != 0 && (src < _PageSize || src == _PoisonGC || src == _PoisonStack) {
- oldscalar0 := mp.scalararg[0]
- oldscalar1 := mp.scalararg[1]
- mp.scalararg[0] = uintptr(unsafe.Pointer(dst))
- mp.scalararg[1] = src
- onM_signalok(gcmarkwb_m)
- mp.scalararg[0] = oldscalar0
- mp.scalararg[1] = oldscalar1
++ systemstack(func() { gothrow("bad pointer in write barrier") })
+ }
+
+ mp := acquirem()
+ if mp.inwb || mp.dying > 0 {
+ releasem(mp)
+ return
+ }
+ mp.inwb = true
++ systemstack(func() {
++ gcmarkwb_m(dst, src)
++ })
+ mp.inwb = false
+ releasem(mp)
}
//go:nosplit
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
- // Garbage collector (GC)
+ // Used by cmd/gc.
enum {
- // Four bits per word (see #defines below).
gcBits = 4,
- wordsPerBitmapByte = 8/gcBits,
-
- // GC type info programs.
- // The programs allow to store type info required for GC in a compact form.
- // Most importantly arrays take O(1) space instead of O(n).
- // The program grammar is:
- //
- // Program = {Block} "insEnd"
- // Block = Data | Array
- // Data = "insData" DataSize DataBlock
- // DataSize = int // size of the DataBlock in bit pairs, 1 byte
- // DataBlock = binary // dense GC mask (2 bits per word) of size ]DataSize/4[ bytes
- // Array = "insArray" ArrayLen Block "insArrayEnd"
- // ArrayLen = int // length of the array, 8 bytes (4 bytes for 32-bit arch)
- //
- // Each instruction (insData, insArray, etc) is 1 byte.
- // For example, for type struct { x []byte; y [20]struct{ z int; w *byte }; }
- // the program looks as:
- //
- // insData 3 (BitsMultiWord BitsSlice BitsScalar)
- // insArray 20 insData 2 (BitsScalar BitsPointer) insArrayEnd insEnd
- //
- // Total size of the program is 17 bytes (13 bytes on 32-bits).
- // The corresponding GC mask would take 43 bytes (it would be repeated
- // because the type has odd number of words).
+ BitsPerPointer = 2,
+ BitsDead = 0,
+ BitsScalar = 1,
+ BitsPointer = 2,
+ BitsMask = 3,
+ PointersPerByte = 8/BitsPerPointer,
- MaxGCMask = 64,
insData = 1,
insArray,
insArrayEnd,
insEnd,
- // Pointer map
- BitsPerPointer = 2,
- BitsMask = (1<<BitsPerPointer)-1,
- PointersPerByte = 8/BitsPerPointer,
-
- // If you change these, also change scanblock.
- // scanblock does "if(bits == BitsScalar || bits == BitsDead)" as "if(bits <= BitsScalar)".
- BitsDead = 0,
- BitsScalar = 1, // 01
- BitsPointer = 2, // 10
- BitsCheckMarkXor = 1, // 10
- BitsScalarMarked = BitsScalar ^ BitsCheckMarkXor, // 00
- BitsPointerMarked = BitsPointer ^ BitsCheckMarkXor, // 11
-
- BitsMultiWord = 3,
- // BitsMultiWord will be set for the first word of a multi-word item.
- // When it is set, one of the following will be set for the second word.
- // NOT USED ANYMORE: BitsString = 0,
- // NOT USED ANYMORE: BitsSlice = 1,
- BitsIface = 2,
- BitsEface = 3,
-
+
+ // 64 bytes cover objects of size 1024/512 on 64/32 bits, respectively.
+ MaxGCMask = 65536, // TODO(rsc): change back to 64
};
-
- // Bits in per-word bitmap.
- // #defines because we shift the values beyond 32 bits.
- //
- // Each word in the bitmap describes wordsPerBitmapWord words
- // of heap memory. There are 4 bitmap bits dedicated to each heap word,
- // so on a 64-bit system there is one bitmap word per 16 heap words.
- //
- // The bitmap starts at mheap.arena_start and extends *backward* from
- // there. On a 64-bit system the off'th word in the arena is tracked by
- // the off/16+1'th word before mheap.arena_start. (On a 32-bit system,
- // the only difference is that the divisor is 8.)
- enum {
- bitBoundary = 1, // boundary of an object
- bitMarked = 2, // marked object
- bitMask = bitBoundary | bitMarked,
- bitPtrMask = BitsMask<<2,
- };
--- /dev/null
- _BitsDead = 0
- _BitsScalar = 1
- _BitsPointer = 2
+ // Copyright 2012 The Go Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style
+ // license that can be found in the LICENSE file.
+
+ // Garbage collector (GC)
+
+ package runtime
+
+ const (
+ // Four bits per word (see #defines below).
+ gcBits = 4
+ wordsPerBitmapByte = 8 / gcBits
+ )
+
+ const (
+ // GC type info programs.
+ // The programs allow storing the type info required for GC in a compact form.
+ // Most importantly, arrays take O(1) space instead of O(n).
+ // The program grammar is:
+ //
+ // Program = {Block} "insEnd"
+ // Block = Data | Array
+ // Data = "insData" DataSize DataBlock
+ // DataSize = int // size of the DataBlock in bit pairs, 1 byte
+ // DataBlock = binary // dense GC mask (2 bits per word) of size ]DataSize/4[ bytes
+ // Array = "insArray" ArrayLen Block "insArrayEnd"
+ // ArrayLen = int // length of the array, 8 bytes (4 bytes for 32-bit arch)
+ //
+ // Each instruction (insData, insArray, etc) is 1 byte.
+ // For example, for type struct { x []byte; y [20]struct{ z int; w *byte }; }
+ // the program looks like:
+ //
+ // insData 3 (BitsPointer BitsScalar BitsScalar)
+ // insArray 20 insData 2 (BitsScalar BitsPointer) insArrayEnd insEnd
+ //
+ // Total size of the program is 17 bytes (13 bytes on 32-bit systems).
+ // The corresponding GC mask would take 43 bytes (it would be repeated
+ // because the type has an odd number of words).
+ insData = 1 + iota
+ insArray
+ insArrayEnd
+ insEnd
+ )
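
As an illustration of this grammar, a decoder is a small recursive walker. A minimal sketch, with hypothetical decodeBlock/decodeProgram helpers, assuming a 64-bit little-endian ArrayLen, LSB-first bit-pair packing, and ArrayLen >= 1:

	package gcprog

	import "encoding/binary"

	const (
		insData = 1 + iota
		insArray
		insArrayEnd
		insEnd
	)

	// decodeBlock appends the 2-bit entries of one Block to mask and
	// returns the offset just past that Block.
	func decodeBlock(prog []byte, off int, mask []byte) (int, []byte) {
		switch prog[off] {
		case insData:
			n := int(prog[off+1]) // DataSize: count of bit pairs
			data := prog[off+2:]
			for i := 0; i < n; i++ {
				mask = append(mask, (data[i/4]>>(uint(i%4)*2))&3)
			}
			return off + 2 + (n+3)/4, mask
		case insArray:
			count := int(binary.LittleEndian.Uint64(prog[off+1:])) // ArrayLen: 8 bytes
			start := off + 9
			end := start
			for i := 0; i < count; i++ {
				end, mask = decodeBlock(prog, start, mask) // same Block, repeated
			}
			return end + 1, mask // skip insArrayEnd
		}
		panic("gcprog: unexpected instruction")
	}

	// decodeProgram decodes Program = {Block} "insEnd" into a flat
	// 2-bit-per-word mask.
	func decodeProgram(prog []byte) []byte {
		var mask []byte
		off := 0
		for prog[off] != insEnd {
			off, mask = decodeBlock(prog, off, mask)
		}
		return mask
	}

For the example program above, decodeProgram would yield 3 + 20*2 = 43 entries, matching the 43-byte expanded mask quoted in the comment.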
+
+ const (
+ // Pointer map
+ _BitsPerPointer = 2
+ _BitsMask = (1 << _BitsPerPointer) - 1
+ _PointersPerByte = 8 / _BitsPerPointer
+
+ // If you change these, also change scanblock.
+ // scanblock does "if(bits == BitsScalar || bits == BitsDead)" as "if(bits <= BitsScalar)".
- _MaxGCMask = 64
++ _BitsDead = 0
++ _BitsScalar = 1 // 01
++ _BitsPointer = 2 // 10
++ _BitsCheckMarkXor = 1 // 10
++ _BitsScalarMarked = _BitsScalar ^ _BitsCheckMarkXor // 00
++ _BitsPointerMarked = _BitsPointer ^ _BitsCheckMarkXor // 11
+
+ // 64 bytes cover objects of size 1024/512 on 64/32 bits, respectively.
++ _MaxGCMask = 65536 // TODO(rsc): change back to 64
+ )
+
+ // Bits in per-word bitmap.
+ // These were #defines in the C version because the values are shifted beyond 32 bits.
+ //
+ // Each byte in the bitmap describes wordsPerBitmapByte words
+ // of heap memory. There are 4 bitmap bits dedicated to each heap word,
+ // so on a 64-bit system there is one 8-byte bitmap word per 16 heap words.
+ //
+ // The bitmap starts at mheap.arena_start and extends *backward* from
+ // there. On a 64-bit system the off'th word in the arena is tracked by
+ // the off/16+1'th word before mheap.arena_start. (On a 32-bit system,
+ // the only difference is that the divisor is 8.)
+ const (
+ bitBoundary = 1 // boundary of an object
+ bitMarked = 2 // marked object
+ bitMask = bitBoundary | bitMarked
+ bitPtrMask = _BitsMask << 2
+ )
--- /dev/null
- ts.set_sec(timediv(ns, 1000000000, (*int32)(unsafe.Pointer(&ts.tv_nsec))))
+ // Copyright 2011 The Go Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style
+ // license that can be found in the LICENSE file.
+
+ package runtime
+
+ import "unsafe"
+
+ // From FreeBSD's <sys/sysctl.h>
+ const (
+ _CTL_HW = 6
+ _HW_NCPU = 3
+ )
+
+ var sigset_none = sigset{}
+ var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)}}
+
+ func getncpu() int32 {
+ mib := [2]uint32{_CTL_HW, _HW_NCPU}
+ out := uint32(0)
+ nout := unsafe.Sizeof(out)
+ ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
+ if ret >= 0 {
+ return int32(out)
+ }
+ return 1
+ }
+
+ // FreeBSD's umtx_op syscall is effectively the same as Linux's futex, and
+ // thus the code is largely similar. See Linux implementation
+ // and lock_futex.c for comments.
+
+ //go:nosplit
+ func futexsleep(addr *uint32, val uint32, ns int64) {
+ systemstack(func() {
+ futexsleep1(addr, val, ns)
+ })
+ }
+
+ func futexsleep1(addr *uint32, val uint32, ns int64) {
+ var tsp *timespec
+ if ns >= 0 {
+ var ts timespec
+ ts.tv_nsec = 0
++ ts.set_sec(int64(timediv(ns, 1000000000, (*int32)(unsafe.Pointer(&ts.tv_nsec)))))
+ tsp = &ts
+ }
+ ret := sys_umtx_op(addr, _UMTX_OP_WAIT_UINT_PRIVATE, val, nil, tsp)
+ if ret >= 0 || ret == -_EINTR {
+ return
+ }
+ print("umtx_wait addr=", addr, " val=", val, " ret=", ret, "\n")
+ *(*int32)(unsafe.Pointer(uintptr(0x1005))) = 0x1005
+ }
+
+ //go:nosplit
+ func futexwakeup(addr *uint32, cnt uint32) {
+ ret := sys_umtx_op(addr, _UMTX_OP_WAKE_PRIVATE, cnt, nil, nil)
+ if ret >= 0 {
+ return
+ }
+
+ systemstack(func() {
+ print("umtx_wake_addr=", addr, " ret=", ret, "\n")
+ })
+ }
+
+ func thr_start()
+
+ func newosproc(mp *m, stk unsafe.Pointer) {
+ if false {
+ print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " thr_start=", funcPC(thr_start), " id=", mp.id, "/", mp.tls[0], " ostk=", &mp, "\n")
+ }
+
+ // NOTE(rsc): This code is confused. stackbase is the top of the stack
+ // and is equal to stk. However, it's working, so I'm not changing it.
+ param := thrparam{
+ start_func: funcPC(thr_start),
+ arg: unsafe.Pointer(mp),
+ stack_base: mp.g0.stack.hi,
+ stack_size: uintptr(stk) - mp.g0.stack.hi,
+ child_tid: unsafe.Pointer(&mp.procid),
+ parent_tid: nil,
+ tls_base: unsafe.Pointer(&mp.tls[0]),
+ tls_size: unsafe.Sizeof(mp.tls),
+ }
+ mp.tls[0] = uintptr(mp.id) // so 386 asm can find it
+
+ var oset sigset
+ sigprocmask(&sigset_all, &oset)
+ thr_new(¶m, int32(unsafe.Sizeof(param)))
+ sigprocmask(&oset, nil)
+ }
+
+ func osinit() {
+ ncpu = getncpu()
+ }
+
+ var urandom_data [_HashRandomBytes]byte
+ var urandom_dev = []byte("/dev/random\x00")
+
+ //go:nosplit
+ func get_random_data(rnd *unsafe.Pointer, rnd_len *int32) {
+ fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
+ if read(fd, unsafe.Pointer(&urandom_data), _HashRandomBytes) == _HashRandomBytes {
+ *rnd = unsafe.Pointer(&urandom_data[0])
+ *rnd_len = _HashRandomBytes
+ } else {
+ *rnd = nil
+ *rnd_len = 0
+ }
+ close(fd)
+ }
+
+ func goenvs() {
+ goenvs_unix()
+ }
+
+ // Called to initialize a new m (including the bootstrap m).
+ // Called on the parent thread (main thread in case of bootstrap), can allocate memory.
+ func mpreinit(mp *m) {
+ mp.gsignal = malg(32 * 1024)
+ mp.gsignal.m = mp
+ }
+
+ // Called to initialize a new m (including the bootstrap m).
+ // Called on the new thread, can not allocate memory.
+ func minit() {
+ _g_ := getg()
+
+ // m.procid is a uint64, but thr_new writes a uint32 on 32-bit systems.
+ // Fix it up. (Only matters on big-endian, but be clean anyway.)
+ if ptrSize == 4 {
+ _g_.m.procid = uint64(*(*uint32)(unsafe.Pointer(&_g_.m.procid)))
+ }
+
+ // Initialize signal handling.
+ signalstack((*byte)(unsafe.Pointer(_g_.m.gsignal.stack.lo)), 32*1024)
+ sigprocmask(&sigset_none, nil)
+ }
+
+ // Called from dropm to undo the effect of an minit.
+ func unminit() {
+ signalstack(nil, 0)
+ }
+
+ func memlimit() uintptr {
+ /*
+ TODO: Convert to Go when something actually uses the result.
+ Rlimit rl;
+ extern byte runtime·text[], runtime·end[];
+ uintptr used;
+
+ if(runtime·getrlimit(RLIMIT_AS, &rl) != 0)
+ return 0;
+ if(rl.rlim_cur >= 0x7fffffff)
+ return 0;
+
+ // Estimate our VM footprint excluding the heap.
+ // Not an exact science: use size of binary plus
+ // some room for thread stacks.
+ used = runtime·end - runtime·text + (64<<20);
+ if(used >= rl.rlim_cur)
+ return 0;
+
+ // If there's not at least 16 MB left, we're probably
+ // not going to be able to do much. Treat as no limit.
+ rl.rlim_cur -= used;
+ if(rl.rlim_cur < (16<<20))
+ return 0;
+
+ return rl.rlim_cur - used;
+ */
+
+ return 0
+ }
+
+ func sigtramp()
+
+ type sigactiont struct {
+ sa_handler uintptr
+ sa_flags int32
+ sa_mask sigset
+ }
+
+ func setsig(i int32, fn uintptr, restart bool) {
+ var sa sigactiont
+ sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK
+ if restart {
+ sa.sa_flags |= _SA_RESTART
+ }
+ sa.sa_mask = sigset_all
+ if fn == funcPC(sighandler) {
+ fn = funcPC(sigtramp)
+ }
+ sa.sa_handler = fn
+ sigaction(i, &sa, nil)
+ }
+
+ func getsig(i int32) uintptr {
+ var sa sigactiont
+ sigaction(i, nil, &sa)
+ if sa.sa_handler == funcPC(sigtramp) {
+ return funcPC(sighandler)
+ }
+ return sa.sa_handler
+ }
+
+ func signalstack(p *byte, n int32) {
+ var st stackt
+ st.ss_sp = uintptr(unsafe.Pointer(p))
+ st.ss_size = uintptr(n)
+ st.ss_flags = 0
+ if p == nil {
+ st.ss_flags = _SS_DISABLE
+ }
+ sigaltstack(&st, nil)
+ }
+
+ func unblocksignals() {
+ sigprocmask(&sigset_none, nil)
+ }
--- /dev/null
- ts.set_sec(int32(ns / 1000000000))
+ // Copyright 2009 The Go Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style
+ // license that can be found in the LICENSE file.
+
+ package runtime
+
+ import "unsafe"
+
+ var sigset_none sigset
+ var sigset_all sigset = sigset{^uint32(0), ^uint32(0)}
+
+ // Linux futex.
+ //
+ // futexsleep(uint32 *addr, uint32 val)
+ // futexwakeup(uint32 *addr)
+ //
+ // Futexsleep atomically checks if *addr == val and if so, sleeps on addr.
+ // Futexwakeup wakes up threads sleeping on addr.
+ // Futexsleep is allowed to wake up spuriously.
+
+ const (
+ _FUTEX_WAIT = 0
+ _FUTEX_WAKE = 1
+ )
+
+ // Atomically,
+ // if(*addr == val) sleep
+ // Might be woken up spuriously; that's allowed.
+ // Don't sleep longer than ns; ns < 0 means forever.
+ //go:nosplit
+ func futexsleep(addr *uint32, val uint32, ns int64) {
+ var ts timespec
+
+ // Some Linux kernels have a bug where futex of
+ // FUTEX_WAIT returns an internal error code
+ // as an errno. Libpthread ignores the return value
+ // here, and so can we: as it says a few lines up,
+ // spurious wakeups are allowed.
+ if ns < 0 {
+ futex(unsafe.Pointer(addr), _FUTEX_WAIT, val, nil, nil, 0)
+ return
+ }
+
+ // It's difficult to live within the no-split stack limits here.
+ // On ARM and 386, a 64-bit divide invokes a general software routine
+ // that needs more stack than we can afford. So we use timediv instead.
+ // But on real 64-bit systems, where words are larger but the stack limit
+ // is not, even timediv is too heavy, and we really need to use just an
+ // ordinary machine instruction.
+ if ptrSize == 8 {
- ts.set_sec(timediv(ns, 1000000000, (*int32)(unsafe.Pointer(&ts.tv_nsec))))
++ ts.set_sec(ns / 1000000000)
+ ts.set_nsec(int32(ns % 1000000000))
+ } else {
+ ts.tv_nsec = 0
++ ts.set_sec(int64(timediv(ns, 1000000000, (*int32)(unsafe.Pointer(&ts.tv_nsec)))))
+ }
+ futex(unsafe.Pointer(addr), _FUTEX_WAIT, val, unsafe.Pointer(&ts), nil, 0)
+ }
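
timediv, used above, avoids the 64-bit divide routine by doing shift-and-subtract division. A minimal sketch of the idea (the real runtime helper also clamps results that overflow an int32; timedivSketch is a hypothetical name):

	// timedivSketch computes v/div, storing v%div through rem, using
	// only shifts and 64-bit subtraction.
	func timedivSketch(v int64, div int32, rem *int32) int32 {
		res := int32(0)
		for bit := 30; bit >= 0; bit-- {
			if v >= int64(div)<<uint(bit) {
				v -= int64(div) << uint(bit)
				res += 1 << uint(bit)
			}
		}
		if rem != nil {
			*rem = int32(v)
		}
		return res
	}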
+
+ // If any procs are sleeping on addr, wake up at most cnt.
+ //go:nosplit
+ func futexwakeup(addr *uint32, cnt uint32) {
+ ret := futex(unsafe.Pointer(addr), _FUTEX_WAKE, cnt, nil, nil, 0)
+ if ret >= 0 {
+ return
+ }
+
+ // I don't know that futex wakeup can return
+ // EAGAIN or EINTR, but if it does, it would be
+ // safe to loop and call futex again.
+ systemstack(func() {
+ print("futexwakeup addr=", addr, " returned ", ret, "\n")
+ })
+
+ *(*int32)(unsafe.Pointer(uintptr(0x1006))) = 0x1006 // crash; the distinctive fault address marks this failure site
+ }
+
+ func getproccount() int32 {
+ var buf [16]uintptr
+ r := sched_getaffinity(0, unsafe.Sizeof(buf), &buf[0])
+ n := int32(0)
+ for _, v := range buf[:r/ptrSize] {
+ for i := 0; i < 64; i++ {
+ n += int32(v & 1)
+ v >>= 1
+ }
+ }
+ if n == 0 {
+ n = 1
+ }
+ return n
+ }
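
The inner loop is a hand-rolled population count. Outside the runtime, which cannot import packages at this layer, the same tally could use math/bits; a sketch with a hypothetical countAffinity helper:

	package main

	import "math/bits"

	// countAffinity tallies the set bits of a CPU affinity mask, as
	// getproccount does by hand above.
	func countAffinity(mask []uint64) int32 {
		n := 0
		for _, v := range mask {
			n += bits.OnesCount64(v)
		}
		if n == 0 {
			n = 1 // always report at least one CPU
		}
		return int32(n)
	}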
+
+ // Clone, the Linux rfork.
+ const (
+ _CLONE_VM = 0x100
+ _CLONE_FS = 0x200
+ _CLONE_FILES = 0x400
+ _CLONE_SIGHAND = 0x800
+ _CLONE_PTRACE = 0x2000
+ _CLONE_VFORK = 0x4000
+ _CLONE_PARENT = 0x8000
+ _CLONE_THREAD = 0x10000
+ _CLONE_NEWNS = 0x20000
+ _CLONE_SYSVSEM = 0x40000
+ _CLONE_SETTLS = 0x80000
+ _CLONE_PARENT_SETTID = 0x100000
+ _CLONE_CHILD_CLEARTID = 0x200000
+ _CLONE_UNTRACED = 0x800000
+ _CLONE_CHILD_SETTID = 0x1000000
+ _CLONE_STOPPED = 0x2000000
+ _CLONE_NEWUTS = 0x4000000
+ _CLONE_NEWIPC = 0x8000000
+ )
+
+ func newosproc(mp *m, stk unsafe.Pointer) {
+ /*
+ * note: strace gets confused if we use CLONE_PTRACE here.
+ */
+ var flags int32 = _CLONE_VM | /* share memory */
+ _CLONE_FS | /* share cwd, etc */
+ _CLONE_FILES | /* share fd table */
+ _CLONE_SIGHAND | /* share sig handler table */
+ _CLONE_THREAD /* revisit - okay for now */
+
+ mp.tls[0] = uintptr(mp.id) // so 386 asm can find it
+ if false {
+ print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " clone=", funcPC(clone), " id=", mp.id, "/", mp.tls[0], " ostk=", &mp, "\n")
+ }
+
+ // Disable signals during clone, so that the new thread starts
+ // with signals disabled. It will enable them in minit.
+ var oset sigset
+ rtsigprocmask(_SIG_SETMASK, &sigset_all, &oset, int32(unsafe.Sizeof(oset)))
+ ret := clone(flags, stk, unsafe.Pointer(mp), unsafe.Pointer(mp.g0), unsafe.Pointer(funcPC(mstart)))
+ rtsigprocmask(_SIG_SETMASK, &oset, nil, int32(unsafe.Sizeof(oset)))
+
+ if ret < 0 {
+ print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", -ret, ")\n")
+ gothrow("newosproc")
+ }
+ }
+
+ func osinit() {
+ ncpu = getproccount()
+ }
+
+ // Random bytes initialized at startup. These come
+ // from the ELF AT_RANDOM auxiliary vector (vdso_linux_amd64.c).
+ // byte* runtime·startup_random_data;
+ // uint32 runtime·startup_random_data_len;
+
+ var urandom_data [_HashRandomBytes]byte
+ var urandom_dev = []byte("/dev/random\x00")
+
+ //go:nosplit
+ func get_random_data(rnd *unsafe.Pointer, rnd_len *int32) {
+ if startup_random_data != nil {
+ *rnd = unsafe.Pointer(startup_random_data)
+ *rnd_len = int32(startup_random_data_len)
+ return
+ }
+ fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
+ if read(fd, unsafe.Pointer(&urandom_data), _HashRandomBytes) == _HashRandomBytes {
+ *rnd = unsafe.Pointer(&urandom_data[0])
+ *rnd_len = _HashRandomBytes
+ } else {
+ *rnd = nil
+ *rnd_len = 0
+ }
+ close(fd)
+ }
+
+ func goenvs() {
+ goenvs_unix()
+ }
+
+ // Called to initialize a new m (including the bootstrap m).
+ // Called on the parent thread (main thread in case of bootstrap), can allocate memory.
+ func mpreinit(mp *m) {
+ mp.gsignal = malg(32 * 1024) // Linux wants >= 2K
+ mp.gsignal.m = mp
+ }
+
+ // Called to initialize a new m (including the bootstrap m).
+ // Called on the new thread, can not allocate memory.
+ func minit() {
+ // Initialize signal handling.
+ _g_ := getg()
+ signalstack((*byte)(unsafe.Pointer(_g_.m.gsignal.stack.lo)), 32*1024)
+ rtsigprocmask(_SIG_SETMASK, &sigset_none, nil, int32(unsafe.Sizeof(sigset_none)))
+ }
+
+ // Called from dropm to undo the effect of an minit.
+ func unminit() {
+ signalstack(nil, 0)
+ }
+
+ func memlimit() uintptr {
+ /*
+ TODO: Convert to Go when something actually uses the result.
+
+ Rlimit rl;
+ extern byte runtime·text[], runtime·end[];
+ uintptr used;
+
+ if(runtime·getrlimit(RLIMIT_AS, &rl) != 0)
+ return 0;
+ if(rl.rlim_cur >= 0x7fffffff)
+ return 0;
+
+ // Estimate our VM footprint excluding the heap.
+ // Not an exact science: use size of binary plus
+ // some room for thread stacks.
+ used = runtime·end - runtime·text + (64<<20);
+ if(used >= rl.rlim_cur)
+ return 0;
+
+ // If there's not at least 16 MB left, we're probably
+ // not going to be able to do much. Treat as no limit.
+ rl.rlim_cur -= used;
+ if(rl.rlim_cur < (16<<20))
+ return 0;
+
+ return rl.rlim_cur - used;
+ */
+
+ return 0
+ }
+
+ //#ifdef GOARCH_386
+ //#define sa_handler k_sa_handler
+ //#endif
+
+ func sigreturn()
+ func sigtramp()
+
+ func setsig(i int32, fn uintptr, restart bool) {
+ var sa sigactiont
+ memclr(unsafe.Pointer(&sa), unsafe.Sizeof(sa))
+ sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK | _SA_RESTORER
+ if restart {
+ sa.sa_flags |= _SA_RESTART
+ }
+ sa.sa_mask = ^uint64(0)
+ // Although the Linux manpage says the "sa_restorer element is obsolete and
+ // should not be used", the x86_64 kernel requires it. Only use it on
+ // x86.
+ if GOARCH == "386" || GOARCH == "amd64" {
+ sa.sa_restorer = funcPC(sigreturn)
+ }
+ if fn == funcPC(sighandler) {
+ fn = funcPC(sigtramp)
+ }
+ sa.sa_handler = fn
+ if rt_sigaction(uintptr(i), &sa, nil, unsafe.Sizeof(sa.sa_mask)) != 0 {
+ gothrow("rt_sigaction failure")
+ }
+ }
+
+ func getsig(i int32) uintptr {
+ var sa sigactiont
+
+ memclr(unsafe.Pointer(&sa), unsafe.Sizeof(sa))
+ if rt_sigaction(uintptr(i), nil, &sa, unsafe.Sizeof(sa.sa_mask)) != 0 {
+ gothrow("rt_sigaction read failure")
+ }
+ if sa.sa_handler == funcPC(sigtramp) {
+ return funcPC(sighandler)
+ }
+ return sa.sa_handler
+ }
+
+ func signalstack(p *byte, n int32) {
+ var st sigaltstackt
+ st.ss_sp = p
+ st.ss_size = uintptr(n)
+ st.ss_flags = 0
+ if p == nil {
+ st.ss_flags = _SS_DISABLE
+ }
+ sigaltstack(&st, nil)
+ }
+
+ func unblocksignals() {
+ rtsigprocmask(_SIG_SETMASK, &sigset_none, nil, int32(unsafe.Sizeof(sigset_none)))
+ }
--- /dev/null
- ts.set_sec(timediv(ns, 1000000000, &nsec))
+ // Copyright 2011 The Go Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style
+ // license that can be found in the LICENSE file.
+
+ package runtime
+
+ import "unsafe"
+
+ const (
+ ESRCH = 3
+ EAGAIN = 35
+ EWOULDBLOCK = EAGAIN
+ ENOTSUP = 91
+
+ // From OpenBSD's sys/time.h
+ CLOCK_REALTIME = 0
+ CLOCK_VIRTUAL = 1
+ CLOCK_PROF = 2
+ CLOCK_MONOTONIC = 3
+ )
+
+ var sigset_none = uint32(0)
+ var sigset_all = ^sigset_none
+
+ // From OpenBSD's <sys/sysctl.h>
+ const (
+ CTL_HW = 6
+ HW_NCPU = 3
+ )
+
+ func getncpu() int32 {
+ mib := [2]uint32{CTL_HW, HW_NCPU}
+ out := uint32(0)
+ nout := unsafe.Sizeof(out)
+
+ // Fetch hw.ncpu via sysctl.
+ ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
+ if ret >= 0 {
+ return int32(out)
+ }
+ return 1
+ }
+
+ //go:nosplit
+ func semacreate() uintptr {
+ return 1
+ }
+
+ //go:nosplit
+ func semasleep(ns int64) int32 {
+ _g_ := getg()
+
+ // Compute sleep deadline.
+ var tsp *timespec
+ if ns >= 0 {
+ var ts timespec
+ var nsec int32
+ ns += nanotime()
++ ts.set_sec(int64(timediv(ns, 1000000000, &nsec)))
+ ts.set_nsec(nsec)
+ tsp = &ts
+ }
+
+ for {
+ // spin-mutex lock
+ for {
+ if xchg(&_g_.m.waitsemalock, 1) == 0 {
+ break
+ }
+ osyield()
+ }
+
+ if _g_.m.waitsemacount != 0 {
+ // semaphore is available.
+ _g_.m.waitsemacount--
+ // spin-mutex unlock
+ atomicstore(&_g_.m.waitsemalock, 0)
+ return 0 // semaphore acquired
+ }
+
+ // sleep until semaphore != 0 or timeout.
+ // thrsleep unlocks m.waitsemalock.
+ ret := thrsleep((uintptr)(unsafe.Pointer(&_g_.m.waitsemacount)), CLOCK_MONOTONIC, tsp, (uintptr)(unsafe.Pointer(&_g_.m.waitsemalock)), (*int32)(unsafe.Pointer(&_g_.m.waitsemacount)))
+ if ret == EWOULDBLOCK {
+ return -1
+ }
+ }
+ }
+
+ //go:nosplit
+ func semawakeup(mp *m) {
+ // spin-mutex lock
+ for {
+ if xchg(&mp.waitsemalock, 1) == 0 {
+ break
+ }
+ osyield()
+ }
+ mp.waitsemacount++
+ ret := thrwakeup(uintptr(unsafe.Pointer(&mp.waitsemacount)), 1)
+ if ret != 0 && ret != ESRCH {
+ // semawakeup can be called on signal stack.
+ systemstack(func() {
+ print("thrwakeup addr=", &mp.waitsemacount, " sem=", mp.waitsemacount, " ret=", ret, "\n")
+ })
+ }
+ // spin-mutex unlock
+ atomicstore(&mp.waitsemalock, 0)
+ }
+
+ func newosproc(mp *m, stk unsafe.Pointer) {
+ if false {
+ print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " id=", mp.id, "/", int32(mp.tls[0]), " ostk=", &mp, "\n")
+ }
+
+ mp.tls[0] = uintptr(mp.id) // so 386 asm can find it
+
+ param := tforkt{
+ tf_tcb: unsafe.Pointer(&mp.tls[0]),
+ tf_tid: (*int32)(unsafe.Pointer(&mp.procid)),
+ tf_stack: uintptr(stk),
+ }
+
+ oset := sigprocmask(_SIG_SETMASK, sigset_all)
+ ret := tfork(¶m, unsafe.Sizeof(param), mp, mp.g0, funcPC(mstart))
+ sigprocmask(_SIG_SETMASK, oset)
+
+ if ret < 0 {
+ print("runtime: failed to create new OS thread (have ", mcount()-1, " already; errno=", -ret, ")\n")
+ if ret == -ENOTSUP {
+ print("runtime: is kern.rthreads disabled?\n")
+ }
+ gothrow("runtime.newosproc")
+ }
+ }
+
+ func osinit() {
+ ncpu = getncpu()
+ }
+
+ var urandom_data [_HashRandomBytes]byte
+ var urandom_dev = []byte("/dev/urandom\x00")
+
+ //go:nosplit
+ func get_random_data(rnd *unsafe.Pointer, rnd_len *int32) {
+ fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
+ if read(fd, unsafe.Pointer(&urandom_data), _HashRandomBytes) == _HashRandomBytes {
+ *rnd = unsafe.Pointer(&urandom_data[0])
+ *rnd_len = _HashRandomBytes
+ } else {
+ *rnd = nil
+ *rnd_len = 0
+ }
+ close(fd)
+ }
+
+ func goenvs() {
+ goenvs_unix()
+ }
+
+ // Called to initialize a new m (including the bootstrap m).
+ // Called on the parent thread (main thread in case of bootstrap), can allocate memory.
+ func mpreinit(mp *m) {
+ mp.gsignal = malg(32 * 1024)
+ mp.gsignal.m = mp
+ }
+
+ // Called to initialize a new m (including the bootstrap m).
+ // Called on the new thread, can not allocate memory.
+ func minit() {
+ _g_ := getg()
+
+ // m.procid is a uint64, but tfork writes an int32. Fix it up.
+ _g_.m.procid = uint64(*(*int32)(unsafe.Pointer(&_g_.m.procid)))
+
+ // Initialize signal handling
+ signalstack((*byte)(unsafe.Pointer(_g_.m.gsignal.stack.lo)), 32*1024)
+ sigprocmask(_SIG_SETMASK, sigset_none)
+ }
+
+ // Called from dropm to undo the effect of an minit.
+ func unminit() {
+ signalstack(nil, 0)
+ }
+
+ func memlimit() uintptr {
+ return 0
+ }
+
+ func sigtramp()
+
+ type sigactiont struct {
+ sa_sigaction uintptr
+ sa_mask uint32
+ sa_flags int32
+ }
+
+ func setsig(i int32, fn uintptr, restart bool) {
+ var sa sigactiont
+ sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK
+ if restart {
+ sa.sa_flags |= _SA_RESTART
+ }
+ sa.sa_mask = sigset_all
+ if fn == funcPC(sighandler) {
+ fn = funcPC(sigtramp)
+ }
+ sa.sa_sigaction = fn
+ sigaction(i, &sa, nil)
+ }
+
+ func getsig(i int32) uintptr {
+ var sa sigactiont
+ sigaction(i, nil, &sa)
+ if sa.sa_sigaction == funcPC(sigtramp) {
+ return funcPC(sighandler)
+ }
+ return sa.sa_sigaction
+ }
+
+ func signalstack(p *byte, n int32) {
+ var st stackt
+
+ st.ss_sp = uintptr(unsafe.Pointer(p))
+ st.ss_size = uintptr(n)
+ st.ss_flags = 0
+ if p == nil {
+ st.ss_flags = _SS_DISABLE
+ }
+ sigaltstack(&st, nil)
+ }
+
+ func unblocksignals() {
+ sigprocmask(_SIG_SETMASK, sigset_none)
+ }
--- /dev/null
-//go:nosplit
-func linux_setup_vdso(argc int32, argv **byte) {
+ // Copyright 2009 The Go Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style
+ // license that can be found in the LICENSE file.
+
+ package runtime
+
+ import "unsafe"
+
+ const (
+ _AT_NULL = 0
+ _AT_RANDOM = 25
+ _AT_SYSINFO = 32
+ )
+
+ var _vdso uint32
+
++func sysargs(argc int32, argv **byte) {
+ // skip over argv, envv to get to auxv
+ n := argc + 1
+ for argv_index(argv, n) != nil {
+ n++
+ }
+ n++
+ auxv := (*[1 << 28]uint32)(add(unsafe.Pointer(argv), uintptr(n)*ptrSize))
+
+ for i := 0; auxv[i] != _AT_NULL; i += 2 {
+ switch auxv[i] {
+ case _AT_SYSINFO:
+ _vdso = auxv[i+1]
+
+ case _AT_RANDOM:
+ startup_random_data = (*byte)(unsafe.Pointer(uintptr(auxv[i+1])))
+ startup_random_data_len = 16
+ }
+ }
+ }
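
The same AT_* key/value records that this loop scans are exposed by Linux in /proc/self/auxv. A standalone reader, for comparison; this sketch assumes a 64-bit little-endian layout, unlike the 32-bit uint32 pairs above:

	package main

	import (
		"encoding/binary"
		"fmt"
		"os"
	)

	func main() {
		raw, err := os.ReadFile("/proc/self/auxv")
		if err != nil {
			return
		}
		for i := 0; i+16 <= len(raw); i += 16 {
			key := binary.LittleEndian.Uint64(raw[i:])
			val := binary.LittleEndian.Uint64(raw[i+8:])
			if key == 0 { // AT_NULL terminates the vector
				break
			}
			fmt.Printf("AT_%d = %#x\n", key, val)
		}
	}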
--- /dev/null
- // Help GC if needed.
- if gp.preemptscan && !gp.gcworkdone && (oldval == _Grunning || oldval == _Gsyscall) {
- gp.preemptscan = false
- systemstack(func() {
- gcphasework(gp)
- })
- }
+ // Copyright 2009 The Go Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style
+ // license that can be found in the LICENSE file.
+
+ package runtime
+
+ import "unsafe"
+
+ var (
+ m0 m
+ g0 g
+ )
+
+ // Goroutine scheduler
+ // The scheduler's job is to distribute ready-to-run goroutines over worker threads.
+ //
+ // The main concepts are:
+ // G - goroutine.
+ // M - worker thread, or machine.
+ // P - processor, a resource that is required to execute Go code.
+ // M must have an associated P to execute Go code, however it can be
+ // blocked or in a syscall w/o an associated P.
+ //
+ // Design doc at http://golang.org/s/go11sched.
+
+ const (
+ // Number of goroutine ids to grab from sched.goidgen to local per-P cache at once.
+ // 16 seems to provide enough amortization, but other than that it's a mostly arbitrary number.
+ _GoidCacheBatch = 16
+ )
+
+ /*
+ SchedT sched;
+ int32 gomaxprocs;
+ uint32 needextram;
+ bool iscgo;
+ M m0;
+ G g0; // idle goroutine for m0
+ G* lastg;
+ M* allm;
+ M* extram;
+ P* allp[MaxGomaxprocs+1];
+ int8* goos;
+ int32 ncpu;
+ int32 newprocs;
+
+ Mutex allglock; // the following vars are protected by this lock or by stoptheworld
+ G** allg;
+ Slice allgs;
+ uintptr allglen;
+ ForceGCState forcegc;
+
+ void mstart(void);
+ static void runqput(P*, G*);
+ static G* runqget(P*);
+ static bool runqputslow(P*, G*, uint32, uint32);
+ static G* runqsteal(P*, P*);
+ static void mput(M*);
+ static M* mget(void);
+ static void mcommoninit(M*);
+ static void schedule(void);
+ static void procresize(int32);
+ static void acquirep(P*);
+ static P* releasep(void);
+ static void newm(void(*)(void), P*);
+ static void stopm(void);
+ static void startm(P*, bool);
+ static void handoffp(P*);
+ static void wakep(void);
+ static void stoplockedm(void);
+ static void startlockedm(G*);
+ static void sysmon(void);
+ static uint32 retake(int64);
+ static void incidlelocked(int32);
+ static void checkdead(void);
+ static void exitsyscall0(G*);
+ void park_m(G*);
+ static void goexit0(G*);
+ static void gfput(P*, G*);
+ static G* gfget(P*);
+ static void gfpurge(P*);
+ static void globrunqput(G*);
+ static void globrunqputbatch(G*, G*, int32);
+ static G* globrunqget(P*, int32);
+ static P* pidleget(void);
+ static void pidleput(P*);
+ static void injectglist(G*);
+ static bool preemptall(void);
+ static bool preemptone(P*);
+ static bool exitsyscallfast(void);
+ static bool haveexperiment(int8*);
+ void allgadd(G*);
+ static void dropg(void);
+
+ extern String buildVersion;
+ */
+
+ // The bootstrap sequence is:
+ //
+ // call osinit
+ // call schedinit
+ // make & queue new G
+ // call runtime·mstart
+ //
+ // The new G calls runtime·main.
+ func schedinit() {
+ // raceinit must be the first call to race detector.
+ // In particular, it must be done before mallocinit below calls racemapshadow.
+ _g_ := getg()
+ if raceenabled {
+ _g_.racectx = raceinit()
+ }
+
+ sched.maxmcount = 10000
+
+ tracebackinit()
+ symtabinit()
+ stackinit()
+ mallocinit()
+ mcommoninit(_g_.m)
+
+ goargs()
+ goenvs()
+ parsedebugvars()
+ gcinit()
+
+ sched.lastpoll = uint64(nanotime())
+ procs := 1
+ if n := goatoi(gogetenv("GOMAXPROCS")); n > 0 {
+ if n > _MaxGomaxprocs {
+ n = _MaxGomaxprocs
+ }
+ procs = n
+ }
+ procresize(int32(procs))
+
+ if buildVersion == "" {
+ // Condition should never trigger. This code just serves
+ // to ensure runtime·buildVersion is kept in the resulting binary.
+ buildVersion = "unknown"
+ }
+ }
+
+ func newsysmon() {
+ _newm(sysmon, nil)
+ }
+
+ func dumpgstatus(gp *g) {
+ _g_ := getg()
+ print("runtime: gp: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
+ print("runtime: g: g=", _g_, ", goid=", _g_.goid, ", g->atomicstatus=", readgstatus(_g_), "\n")
+ }
+
+ func checkmcount() {
+ // sched lock is held
+ if sched.mcount > sched.maxmcount {
+ print("runtime: program exceeds ", sched.maxmcount, "-thread limit\n")
+ gothrow("thread exhaustion")
+ }
+ }
+
+ func mcommoninit(mp *m) {
+ _g_ := getg()
+
+ // g0 stack won't make sense for user (and is not necessarily unwindable).
+ if _g_ != _g_.m.g0 {
+ callers(1, &mp.createstack[0], len(mp.createstack))
+ }
+
+ mp.fastrand = 0x49f6428a + uint32(mp.id) + uint32(cputicks())
+ if mp.fastrand == 0 {
+ mp.fastrand = 0x49f6428a
+ }
+
+ lock(&sched.lock)
+ mp.id = sched.mcount
+ sched.mcount++
+ checkmcount()
+ mpreinit(mp)
+ if mp.gsignal != nil {
+ mp.gsignal.stackguard1 = mp.gsignal.stack.lo + _StackGuard
+ }
+
+ // Add to allm so garbage collector doesn't free g->m
+ // when it is just in a register or thread-local storage.
+ mp.alllink = allm
+
+ // NumCgoCall() iterates over allm w/o schedlock,
+ // so we need to publish it safely.
+ atomicstorep(unsafe.Pointer(&allm), unsafe.Pointer(mp))
+ unlock(&sched.lock)
+ }
+
+ // Mark gp ready to run.
+ func ready(gp *g) {
+ status := readgstatus(gp)
+
+ // Mark runnable.
+ _g_ := getg()
+ _g_.m.locks++ // disable preemption because it can be holding p in a local var
+ if status&^_Gscan != _Gwaiting {
+ dumpgstatus(gp)
+ gothrow("bad g->status in ready")
+ }
+
+ // status is Gwaiting or Gscanwaiting, make Grunnable and put on runq
+ casgstatus(gp, _Gwaiting, _Grunnable)
+ runqput(_g_.m.p, gp)
+ if atomicload(&sched.npidle) != 0 && atomicload(&sched.nmspinning) == 0 { // TODO: fast atomic
+ wakep()
+ }
+ _g_.m.locks--
+ if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack
+ _g_.stackguard0 = stackPreempt
+ }
+ }
+
+ func gcprocs() int32 {
+ // Figure out how many CPUs to use during GC.
+ // Limited by gomaxprocs, number of actual CPUs, and MaxGcproc.
+ lock(&sched.lock)
+ n := gomaxprocs
+ if n > ncpu {
+ n = ncpu
+ }
+ if n > _MaxGcproc {
+ n = _MaxGcproc
+ }
+ if n > sched.nmidle+1 { // one M is currently running
+ n = sched.nmidle + 1
+ }
+ unlock(&sched.lock)
+ return n
+ }
+
+ func needaddgcproc() bool {
+ lock(&sched.lock)
+ n := gomaxprocs
+ if n > ncpu {
+ n = ncpu
+ }
+ if n > _MaxGcproc {
+ n = _MaxGcproc
+ }
+ n -= sched.nmidle + 1 // one M is currently running
+ unlock(&sched.lock)
+ return n > 0
+ }
+
+ func helpgc(nproc int32) {
+ _g_ := getg()
+ lock(&sched.lock)
+ pos := 0
+ for n := int32(1); n < nproc; n++ { // one M is currently running
+ if allp[pos].mcache == _g_.m.mcache {
+ pos++
+ }
+ mp := mget()
+ if mp == nil {
+ gothrow("gcprocs inconsistency")
+ }
+ mp.helpgc = n
+ mp.mcache = allp[pos].mcache
+ pos++
+ notewakeup(&mp.park)
+ }
+ unlock(&sched.lock)
+ }
+
+ // Similar to stoptheworld but best-effort and can be called several times.
+ // There is no reverse operation, used during crashing.
+ // This function must not lock any mutexes.
+ func freezetheworld() {
+ if gomaxprocs == 1 {
+ return
+ }
+ // stopwait and preemption requests can be lost
+ // due to races with concurrently executing threads,
+ // so try several times
+ for i := 0; i < 5; i++ {
+ // this should tell the scheduler to not start any new goroutines
+ sched.stopwait = 0x7fffffff
+ atomicstore(&sched.gcwaiting, 1)
+ // this should stop running goroutines
+ if !preemptall() {
+ break // no running goroutines
+ }
+ usleep(1000)
+ }
+ // to be sure
+ usleep(1000)
+ preemptall()
+ usleep(1000)
+ }
+
+ func isscanstatus(status uint32) bool {
+ if status == _Gscan {
+ gothrow("isscanstatus: Bad status Gscan")
+ }
+ return status&_Gscan == _Gscan
+ }
+
+ // All reads and writes of g's status go through readgstatus, casgstatus
+ // castogscanstatus, casfrom_Gscanstatus.
+ //go:nosplit
+ func readgstatus(gp *g) uint32 {
+ return atomicload(&gp.atomicstatus)
+ }
+
+ // The Gscanstatuses are acting like locks and this releases them.
+ // If it proves to be a performance hit we should be able to make these
+ // simple atomic stores but for now we are going to throw if
+ // we see an inconsistent state.
+ func casfrom_Gscanstatus(gp *g, oldval, newval uint32) {
+ success := false
+
+ // Check that transition is valid.
+ switch oldval {
+ case _Gscanrunnable,
+ _Gscanwaiting,
+ _Gscanrunning,
+ _Gscansyscall:
+ if newval == oldval&^_Gscan {
+ success = cas(&gp.atomicstatus, oldval, newval)
+ }
+ case _Gscanenqueue:
+ if newval == _Gwaiting {
+ success = cas(&gp.atomicstatus, oldval, newval)
+ }
+ }
+ if !success {
+ print("runtime: casfrom_Gscanstatus failed gp=", gp, ", oldval=", hex(oldval), ", newval=", hex(newval), "\n")
+ dumpgstatus(gp)
+ gothrow("casfrom_Gscanstatus: gp->status is not in scan state")
+ }
+ }
+
+ // This will return false if the gp is not in the expected status and the cas fails.
+ // This acts like a lock acquire while the casfromgstatus acts like a lock release.
+ func castogscanstatus(gp *g, oldval, newval uint32) bool {
+ switch oldval {
+ case _Grunnable,
+ _Gwaiting,
+ _Gsyscall:
+ if newval == oldval|_Gscan {
+ return cas(&gp.atomicstatus, oldval, newval)
+ }
+ case _Grunning:
+ if newval == _Gscanrunning || newval == _Gscanenqueue {
+ return cas(&gp.atomicstatus, oldval, newval)
+ }
+ }
+ print("runtime: castogscanstatus oldval=", hex(oldval), " newval=", hex(newval), "\n")
+ gothrow("castogscanstatus")
+ panic("not reached")
+ }
+
+ // If asked to move to or from a Gscanstatus this will throw. Use the castogscanstatus
+ // and casfrom_Gscanstatus instead.
+ // casgstatus will loop if the g->atomicstatus is in a Gscan status until the routine that
+ // put it in the Gscan state is finished.
+ //go:nosplit
+ func casgstatus(gp *g, oldval, newval uint32) {
+ if (oldval&_Gscan != 0) || (newval&_Gscan != 0) || oldval == newval {
+ systemstack(func() {
+ print("casgstatus: oldval=", hex(oldval), " newval=", hex(newval), "\n")
+ gothrow("casgstatus: bad incoming values")
+ })
+ }
+
+ // loop if gp->atomicstatus is in a scan state giving
+ // GC time to finish and change the state to oldval.
+ for !cas(&gp.atomicstatus, oldval, newval) {
- activeglen := len(allgs)
+ }
+ }
+
+ // stopg ensures that gp is stopped at a GC safe point where its stack can be scanned
+ // or in the context of a moving collector the pointers can be flipped from pointing
+ // to old object to pointing to new objects.
+ // If stopg returns true, the caller knows gp is at a GC safe point and will remain there until
+ // the caller calls restartg.
+ // If stopg returns false, the caller is not responsible for calling restartg. This can happen
+ // if another thread, either the gp itself or another GC thread, is taking responsibility
+ // for the GC work related to this thread.
+ func stopg(gp *g) bool {
+ for {
+ if gp.gcworkdone {
+ return false
+ }
+
+ switch s := readgstatus(gp); s {
+ default:
+ dumpgstatus(gp)
+ gothrow("stopg: gp->atomicstatus is not valid")
+
+ case _Gdead:
+ return false
+
+ case _Gcopystack:
+ // Loop until a new stack is in place.
+
+ case _Grunnable,
+ _Gsyscall,
+ _Gwaiting:
+ // Claim goroutine by setting scan bit.
+ if !castogscanstatus(gp, s, s|_Gscan) {
+ break
+ }
+ // In scan state, do work.
+ gcphasework(gp)
+ return true
+
+ case _Gscanrunnable,
+ _Gscanwaiting,
+ _Gscansyscall:
+ // Goroutine already claimed by another GC helper.
+ return false
+
+ case _Grunning:
++ if gcphase == _GCscan {
++ // Running goroutines are not scanned during the
++ // GCscan phase; only non-running goroutines are scanned.
++ gp.gcworkdone = true
++ return false
++ }
++
+ // Claim goroutine, so we aren't racing with a status
+ // transition away from Grunning.
+ if !castogscanstatus(gp, _Grunning, _Gscanrunning) {
+ break
+ }
+
+ // Mark gp for preemption.
+ if !gp.gcworkdone {
+ gp.preemptscan = true
+ gp.preempt = true
+ gp.stackguard0 = stackPreempt
+ }
+
+ // Unclaim.
+ casfrom_Gscanstatus(gp, _Gscanrunning, _Grunning)
+ return false
+ }
+ }
+ }
+
+ // The GC requests that this routine be moved from a scanmumble state to a mumble state.
+ func restartg(gp *g) {
+ s := readgstatus(gp)
+ switch s {
+ default:
+ dumpgstatus(gp)
+ gothrow("restartg: unexpected status")
+
+ case _Gdead:
+ // ok
+
+ case _Gscanrunnable,
+ _Gscanwaiting,
+ _Gscansyscall:
+ casfrom_Gscanstatus(gp, s, s&^_Gscan)
+
+ // Scan is now completed.
+ // Goroutine now needs to be made runnable.
+ // We put it on the global run queue; ready blocks on the global scheduler lock.
+ case _Gscanenqueue:
+ casfrom_Gscanstatus(gp, _Gscanenqueue, _Gwaiting)
+ if gp != getg().m.curg {
+ gothrow("processing Gscanenqueue on wrong m")
+ }
+ dropg()
+ ready(gp)
+ }
+ }
+
+ func stopscanstart(gp *g) {
+ _g_ := getg()
+ if _g_ == gp {
+ gothrow("GC not moved to G0")
+ }
+ if stopg(gp) {
+ if !isscanstatus(readgstatus(gp)) {
+ dumpgstatus(gp)
+ gothrow("GC not in scan state")
+ }
+ restartg(gp)
+ }
+ }
+
+ // Runs on g0 and does the actual work after putting the g back on the run queue.
+ func mquiesce(gpmaster *g) {
- _g_.sched.g = _g_
+ // enqueue the calling goroutine.
+ restartg(gpmaster)
++
++ activeglen := len(allgs)
+ for i := 0; i < activeglen; i++ {
+ gp := allgs[i]
+ if readgstatus(gp) == _Gdead {
+ gp.gcworkdone = true // noop scan.
+ } else {
+ gp.gcworkdone = false
+ }
+ stopscanstart(gp)
+ }
+
+ // Check that the G's gcwork (such as scanning) has been done. If not do it now.
+ // You can end up doing work here if the page trap on a Grunning goroutine has
+ // not been sprung, or in some race situations. For example, a runnable goroutine goes dead
+ // and is started up again with gp->gcworkdone set to false.
+ for i := 0; i < activeglen; i++ {
+ gp := allgs[i]
+ for !gp.gcworkdone {
+ status := readgstatus(gp)
+ if status == _Gdead {
+ // do nothing; scan not needed.
+ gp.gcworkdone = true // scan is a noop
+ break
+ }
+ if status == _Grunning && gp.stackguard0 == uintptr(stackPreempt) && notetsleep(&sched.stopnote, 100*1000) { // nanosecond arg
+ noteclear(&sched.stopnote)
+ } else {
+ stopscanstart(gp)
+ }
+ }
+ }
+
+ for i := 0; i < activeglen; i++ {
+ gp := allgs[i]
+ status := readgstatus(gp)
+ if isscanstatus(status) {
+ print("mstopandscang:bottom: post scan bad status gp=", gp, " has status ", hex(status), "\n")
+ dumpgstatus(gp)
+ }
+ if !gp.gcworkdone && status != _Gdead {
+ print("mstopandscang:bottom: post scan gp=", gp, "->gcworkdone still false\n")
+ dumpgstatus(gp)
+ }
+ }
+
+ schedule() // Never returns.
+ }
+
+ // quiesce moves all the goroutines to a GC safepoint, which for now is at a preemption point.
+ // If the global gcphase is GCmark, quiesce will ensure that all of the goroutines' stacks
+ // have been scanned before it returns.
+ func quiesce(mastergp *g) {
+ castogscanstatus(mastergp, _Grunning, _Gscanenqueue)
+ // Now move this to the g0 (aka m) stack.
+ // g0 will potentially scan this thread and put mastergp on the runqueue
+ mcall(mquiesce)
+ }
+
+ // This is used by the GC as well as the routines that do stack dumps. In the case
+ // of GC, all the routines can be reliably stopped. This is not always the case
+ // when the system is in panic or being exited.
+ func stoptheworld() {
+ _g_ := getg()
+
+ // If we hold a lock, then we won't be able to stop another M
+ // that is blocked trying to acquire the lock.
+ if _g_.m.locks > 0 {
+ gothrow("stoptheworld: holding locks")
+ }
+
+ lock(&sched.lock)
+ sched.stopwait = gomaxprocs
+ atomicstore(&sched.gcwaiting, 1)
+ preemptall()
+ // stop current P
+ _g_.m.p.status = _Pgcstop // Pgcstop is only diagnostic.
+ sched.stopwait--
+ // try to retake all P's in Psyscall status
+ for i := 0; i < int(gomaxprocs); i++ {
+ p := allp[i]
+ s := p.status
+ if s == _Psyscall && cas(&p.status, s, _Pgcstop) {
+ sched.stopwait--
+ }
+ }
+ // stop idle P's
+ for {
+ p := pidleget()
+ if p == nil {
+ break
+ }
+ p.status = _Pgcstop
+ sched.stopwait--
+ }
+ wait := sched.stopwait > 0
+ unlock(&sched.lock)
+
+ // wait for remaining P's to stop voluntarily
+ if wait {
+ for {
+ // wait for 100us, then try to re-preempt in case of any races
+ if notetsleep(&sched.stopnote, 100*1000) {
+ noteclear(&sched.stopnote)
+ break
+ }
+ preemptall()
+ }
+ }
+ if sched.stopwait != 0 {
+ gothrow("stoptheworld: not stopped")
+ }
+ for i := 0; i < int(gomaxprocs); i++ {
+ p := allp[i]
+ if p.status != _Pgcstop {
+ gothrow("stoptheworld: not stopped")
+ }
+ }
+ }
+
+ func mhelpgc() {
+ _g_ := getg()
+ _g_.m.helpgc = -1
+ }
+
+ func starttheworld() {
+ _g_ := getg()
+
+ _g_.m.locks++ // disable preemption because it can be holding p in a local var
+ gp := netpoll(false) // non-blocking
+ injectglist(gp)
+ add := needaddgcproc()
+ lock(&sched.lock)
+ if newprocs != 0 {
+ procresize(newprocs)
+ newprocs = 0
+ } else {
+ procresize(gomaxprocs)
+ }
+ sched.gcwaiting = 0
+
+ var p1 *p
+ for {
+ p := pidleget()
+ if p == nil {
+ break
+ }
+ // procresize() puts p's with work at the beginning of the list.
+ // Once we reach a p without a run queue, the rest don't have one either.
+ if p.runqhead == p.runqtail {
+ pidleput(p)
+ break
+ }
+ p.m = mget()
+ p.link = p1
+ p1 = p
+ }
+ if sched.sysmonwait != 0 {
+ sched.sysmonwait = 0
+ notewakeup(&sched.sysmonnote)
+ }
+ unlock(&sched.lock)
+
+ for p1 != nil {
+ p := p1
+ p1 = p1.link
+ if p.m != nil {
+ mp := p.m
+ p.m = nil
+ if mp.nextp != nil {
+ gothrow("starttheworld: inconsistent mp->nextp")
+ }
+ mp.nextp = p
+ notewakeup(&mp.park)
+ } else {
+ // Start M to run P. Do not start another M below.
+ _newm(nil, p)
+ add = false
+ }
+ }
+
+ if add {
+ // If GC could have used another helper proc, start one now,
+ // in the hope that it will be available next time.
+ // It would have been even better to start it before the collection,
+ // but doing so requires allocating memory, so it's tricky to
+ // coordinate. This lazy approach works out in practice:
+ // we don't mind if the first couple gc rounds don't have quite
+ // the maximum number of procs.
+ _newm(mhelpgc, nil)
+ }
+ _g_.m.locks--
+ if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack
+ _g_.stackguard0 = stackPreempt
+ }
+ }
+
+ // Called to start an M.
+ //go:nosplit
+ func mstart() {
+ _g_ := getg()
+
+ if _g_.stack.lo == 0 {
+ // Initialize stack bounds from system stack.
+ // Cgo may have left stack size in stack.hi.
+ size := _g_.stack.hi
+ if size == 0 {
+ size = 8192
+ }
+ _g_.stack.hi = uintptr(noescape(unsafe.Pointer(&size)))
+ _g_.stack.lo = _g_.stack.hi - size + 1024
+ }
+ // Initialize stack guards so that we can start calling
+ // both Go and C functions with stack growth prologues.
+ _g_.stackguard0 = _g_.stack.lo + _StackGuard
+ _g_.stackguard1 = _g_.stackguard0
+ mstart1()
+ }
+
+ func mstart1() {
+ _g_ := getg()
+
+ if _g_ != _g_.m.g0 {
+ gothrow("bad runtime·mstart")
+ }
+
+ // Record top of stack for use by mcall.
+ // Once we call schedule we're never coming back,
+ // so other calls can reuse this stack space.
+ gosave(&_g_.m.g0.sched)
+ _g_.m.g0.sched.pc = ^uintptr(0) // make sure it is never used
+ asminit()
+ minit()
+
+ // Install signal handlers; after minit so that minit can
+ // prepare the thread to be able to handle the signals.
+ if _g_.m == &m0 {
+ initsig()
+ }
+
+ if _g_.m.mstartfn != nil {
+ fn := *(*func())(unsafe.Pointer(&_g_.m.mstartfn))
+ fn()
+ }
+
+ if _g_.m.helpgc != 0 {
+ _g_.m.helpgc = 0
+ stopm()
+ } else if _g_.m != &m0 {
+ acquirep(_g_.m.nextp)
+ _g_.m.nextp = nil
+ }
+ schedule()
+
+ // TODO(brainman): This point is never reached, because scheduler
+ // does not release os threads at the moment. But once this path
+ // is enabled, we must remove our seh here.
+ }
+
+ // When running with cgo, we call _cgo_thread_start
+ // to start threads for us so that we can play nicely with
+ // foreign code.
+ var cgoThreadStart unsafe.Pointer
+
+ type cgothreadstart struct {
+ g *g
+ tls *uint64
+ fn unsafe.Pointer
+ }
+
+ // Allocate a new m unassociated with any thread.
+ // Can use p for allocation context if needed.
+ func allocm(_p_ *p) *m {
+ _g_ := getg()
+ _g_.m.locks++ // disable GC because it can be called from sysmon
+ if _g_.m.p == nil {
+ acquirep(_p_) // temporarily borrow p for mallocs in this function
+ }
+ mp := newM()
+ mcommoninit(mp)
+
+ // In case of cgo or Solaris, pthread_create will make us a stack.
+ // Windows and Plan 9 will layout sched stack on OS stack.
+ if iscgo || GOOS == "solaris" || GOOS == "windows" || GOOS == "plan9" {
+ mp.g0 = malg(-1)
+ } else {
+ mp.g0 = malg(8192)
+ }
+ mp.g0.m = mp
+
+ if _p_ == _g_.m.p {
+ releasep()
+ }
+ _g_.m.locks--
+ if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack
+ _g_.stackguard0 = stackPreempt
+ }
+
+ return mp
+ }
+
+ func allocg() *g {
+ return newG()
+ }
+
+ // needm is called when a cgo callback happens on a
+ // thread without an m (a thread not created by Go).
+ // In this case, needm is expected to find an m to use
+ // and return with m, g initialized correctly.
+ // Since m and g are not set now (likely nil, but see below)
+ // needm is limited in what routines it can call. In particular
+ // it can only call nosplit functions (textflag 7) and cannot
+ // do any scheduling that requires an m.
+ //
+ // In order to avoid needing heavy lifting here, we adopt
+ // the following strategy: there is a stack of available m's
+ // that can be stolen. Using compare-and-swap
+ // to pop from the stack has ABA races, so we simulate
+ // a lock by doing an exchange (via casp) to steal the stack
+ // head and replace the top pointer with MLOCKED (1).
+ // This serves as a simple spin lock that we can use even
+ // without an m. The thread that locks the stack in this way
+ // unlocks the stack by storing a valid stack head pointer.
+ //
+ // In order to make sure that there is always an m structure
+ // available to be stolen, we maintain the invariant that there
+ // is always one more than needed. At the beginning of the
+ // program (if cgo is in use) the list is seeded with a single m.
+ // If needm finds that it has taken the last m off the list, its job
+ // is - once it has installed its own m so that it can do things like
+ // allocate memory - to create a spare m and put it on the list.
+ //
+ // Each of these extra m's also has a g0 and a curg that are
+ // pressed into service as the scheduling stack and current
+ // goroutine for the duration of the cgo callback.
+ //
+ // When the callback is done with the m, it calls dropm to
+ // put the m back on the list.
+ //go:nosplit
+ func needm(x byte) {
+ if needextram != 0 {
+ // Can happen if C/C++ code calls Go from a global ctor.
+ // Cannot throw, because the scheduler is not initialized yet.
+ // XXX
+ // write(2, unsafe.Pointer("fatal error: cgo callback before cgo call\n"), sizeof("fatal error: cgo callback before cgo call\n") - 1)
+ exit(1)
+ }
+
+ // Lock extra list, take head, unlock popped list.
+ // nilokay=false is safe here because of the invariant above,
+ // that the extra list always contains or will soon contain
+ // at least one m.
+ mp := lockextra(false)
+
+ // Set needextram when we've just emptied the list,
+ // so that the eventual call into cgocallbackg will
+ // allocate a new m for the extra list. We delay the
+ // allocation until then so that it can be done
+ // after exitsyscall makes sure it is okay to be
+ // running at all (that is, there's no garbage collection
+ // running right now).
+ mp.needextram = mp.schedlink == nil
+ unlockextra(mp.schedlink)
+
+ // Install g (= m->g0) and set the stack bounds
+ // to match the current stack. We don't actually know
+ // how big the stack is, like we don't know how big any
+ // scheduling stack is, but we assume there's at least 32 kB,
+ // which is more than enough for us.
+ setg(mp.g0)
+ _g_ := getg()
+ _g_.stack.hi = uintptr(noescape(unsafe.Pointer(&x))) + 1024
+ _g_.stack.lo = uintptr(noescape(unsafe.Pointer(&x))) - 32*1024
+ _g_.stackguard0 = _g_.stack.lo + _StackGuard
+
+ // Initialize this thread to use the m.
+ asminit()
+ minit()
+ }
+
+ // newextram allocates an m and puts it on the extra list.
+ // It is called with a working local m, so that it can do things
+ // like call schedlock and allocate.
+ func newextram() {
+ // Create extra goroutine locked to extra m.
+ // The goroutine is the context in which the cgo callback will run.
+ // The sched.pc will never be returned to, but setting it to
+ // goexit makes clear to the traceback routines where
+ // the goroutine stack ends.
+ mp := allocm(nil)
+ gp := malg(4096)
+ gp.sched.pc = funcPC(goexit) + _PCQuantum
+ gp.sched.sp = gp.stack.hi
+ gp.sched.sp -= 4 * regSize // extra space in case of reads slightly beyond frame
+ gp.sched.lr = 0
+ gp.sched.g = gp
+ gp.syscallpc = gp.sched.pc
+ gp.syscallsp = gp.sched.sp
+ // malg returns status as Gidle, change to Gsyscall before adding to allg
+ // where GC will see it.
+ casgstatus(gp, _Gidle, _Gsyscall)
+ gp.m = mp
+ mp.curg = gp
+ mp.locked = _LockInternal
+ mp.lockedg = gp
+ gp.lockedm = mp
+ gp.goid = int64(xadd64(&sched.goidgen, 1))
+ if raceenabled {
+ gp.racectx = racegostart(funcPC(newextram))
+ }
+ // put on allg for garbage collector
+ allgadd(gp)
+
+ // Add m to the extra list.
+ mnext := lockextra(true)
+ mp.schedlink = mnext
+ unlockextra(mp)
+ }
+
+ // dropm is called when a cgo callback has called needm but is now
+ // done with the callback and returning back into the non-Go thread.
+ // It puts the current m back onto the extra list.
+ //
+ // The main expense here is the call to signalstack to release the
+ // m's signal stack, and then the call to needm on the next callback
+ // from this thread. It is tempting to try to save the m for next time,
+ // which would eliminate both these costs, but there might not be
+ // a next time: the current thread (which Go does not control) might exit.
+ // If we saved the m for that thread, there would be an m leak each time
+ // such a thread exited. Instead, we acquire and release an m on each
+ // call. These should typically not be scheduling operations, just a few
+ // atomics, so the cost should be small.
+ //
+ // TODO(rsc): An alternative would be to allocate a dummy pthread per-thread
+ // variable using pthread_key_create. Unlike the pthread keys we already use
+ // on OS X, this dummy key would never be read by Go code. It would exist
+ // only so that we could register at thread-exit-time destructor.
+ // That destructor would put the m back onto the extra list.
+ // This is purely a performance optimization. The current version,
+ // in which dropm happens on each cgo call, is still correct too.
+ // We may have to keep the current version on systems with cgo
+ // but without pthreads, like Windows.
+ func dropm() {
+ // Undo whatever initialization minit did during needm.
+ unminit()
+
+ // Clear m and g, and return m to the extra list.
+ // After the call to setmg we can only call nosplit functions.
+ mp := getg().m
+ setg(nil)
+
+ mnext := lockextra(true)
+ mp.schedlink = mnext
+ unlockextra(mp)
+ }
+
+ var extram uintptr
+
+ // lockextra locks the extra list and returns the list head.
+ // The caller must unlock the list by storing a new list head
+ // to extram. If nilokay is true, then lockextra will
+ // return a nil list head if that's what it finds. If nilokay is false,
+ // lockextra will keep waiting until the list head is no longer nil.
+ //go:nosplit
+ func lockextra(nilokay bool) *m {
+ const locked = 1
+
+ for {
+ old := atomicloaduintptr(&extram)
+ if old == locked {
+ yield := osyield
+ yield()
+ continue
+ }
+ if old == 0 && !nilokay {
+ usleep(1)
+ continue
+ }
+ if casuintptr(&extram, old, locked) {
+ return (*m)(unsafe.Pointer(old))
+ }
+ yield := osyield
+ yield()
+ continue
+ }
+ }
+
+ //go:nosplit
+ func unlockextra(mp *m) {
+ atomicstoreuintptr(&extram, uintptr(unsafe.Pointer(mp)))
+ }
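
The exchange-to-lock discipline that the needm comment describes is worth seeing in isolation. A self-contained sketch of the same trick over a simple list; note that storing pointers in a uintptr is only legal inside the runtime, which controls the GC, so do not use this pattern in ordinary Go code:

	package main

	import (
		"sync/atomic"
		"unsafe"
	)

	type node struct{ next *node }

	const locked = 1

	var head uintptr // 0 = empty list, 1 = locked, otherwise a *node

	// pop claims the whole list by swapping in the sentinel, takes the
	// head node, then unlocks by publishing the successor.
	func pop() *node {
		for {
			old := atomic.LoadUintptr(&head)
			if old == locked {
				continue // another thread owns the list; spin
			}
			if atomic.CompareAndSwapUintptr(&head, old, locked) {
				n := (*node)(unsafe.Pointer(old))
				var next uintptr
				if n != nil {
					next = uintptr(unsafe.Pointer(n.next))
				}
				atomic.StoreUintptr(&head, next) // unlock: store a valid head
				return n
			}
		}
	}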
+
+ // Create a new m. It will start off with a call to fn, or else the scheduler.
+ func _newm(fn func(), _p_ *p) {
+ mp := allocm(_p_)
+ mp.nextp = _p_
+ mp.mstartfn = *(*unsafe.Pointer)(unsafe.Pointer(&fn))
+
+ if iscgo {
+ var ts cgothreadstart
+ if _cgo_thread_start == nil {
+ gothrow("_cgo_thread_start missing")
+ }
+ ts.g = mp.g0
+ ts.tls = (*uint64)(unsafe.Pointer(&mp.tls[0]))
+ ts.fn = unsafe.Pointer(funcPC(mstart))
+ asmcgocall(_cgo_thread_start, unsafe.Pointer(&ts))
+ return
+ }
+ newosproc(mp, unsafe.Pointer(mp.g0.stack.hi))
+ }
+
+ // Stops execution of the current m until new work is available.
+ // Returns with acquired P.
+ func stopm() {
+ _g_ := getg()
+
+ if _g_.m.locks != 0 {
+ gothrow("stopm holding locks")
+ }
+ if _g_.m.p != nil {
+ gothrow("stopm holding p")
+ }
+ if _g_.m.spinning {
+ _g_.m.spinning = false
+ xadd(&sched.nmspinning, -1)
+ }
+
+ retry:
+ lock(&sched.lock)
+ mput(_g_.m)
+ unlock(&sched.lock)
+ notesleep(&_g_.m.park)
+ noteclear(&_g_.m.park)
+ if _g_.m.helpgc != 0 {
+ gchelper()
+ _g_.m.helpgc = 0
+ _g_.m.mcache = nil
+ goto retry
+ }
+ acquirep(_g_.m.nextp)
+ _g_.m.nextp = nil
+ }
+
+ func mspinning() {
+ getg().m.spinning = true
+ }
+
+ // Schedules some M to run the p (creates an M if necessary).
+ // If p==nil, tries to get an idle P, if no idle P's does nothing.
+ func startm(_p_ *p, spinning bool) {
+ lock(&sched.lock)
+ if _p_ == nil {
+ _p_ = pidleget()
+ if _p_ == nil {
+ unlock(&sched.lock)
+ if spinning {
+ xadd(&sched.nmspinning, -1)
+ }
+ return
+ }
+ }
+ mp := mget()
+ unlock(&sched.lock)
+ if mp == nil {
+ var fn func()
+ if spinning {
+ fn = mspinning
+ }
+ _newm(fn, _p_)
+ return
+ }
+ if mp.spinning {
+ gothrow("startm: m is spinning")
+ }
+ if mp.nextp != nil {
+ gothrow("startm: m has p")
+ }
+ mp.spinning = spinning
+ mp.nextp = _p_
+ notewakeup(&mp.park)
+ }
+
+ // Hands off P from syscall or locked M.
+ func handoffp(_p_ *p) {
+ // if it has local work, start it straight away
+ if _p_.runqhead != _p_.runqtail || sched.runqsize != 0 {
+ startm(_p_, false)
+ return
+ }
+ // no local work, check that there are no spinning/idle M's,
+ // otherwise our help is not required
+ if atomicload(&sched.nmspinning)+atomicload(&sched.npidle) == 0 && cas(&sched.nmspinning, 0, 1) { // TODO: fast atomic
+ startm(_p_, true)
+ return
+ }
+ lock(&sched.lock)
+ if sched.gcwaiting != 0 {
+ _p_.status = _Pgcstop
+ sched.stopwait--
+ if sched.stopwait == 0 {
+ notewakeup(&sched.stopnote)
+ }
+ unlock(&sched.lock)
+ return
+ }
+ if sched.runqsize != 0 {
+ unlock(&sched.lock)
+ startm(_p_, false)
+ return
+ }
+ // If this is the last running P and nobody is polling network,
+ // need to wakeup another M to poll network.
+ if sched.npidle == uint32(gomaxprocs-1) && atomicload64(&sched.lastpoll) != 0 {
+ unlock(&sched.lock)
+ startm(_p_, false)
+ return
+ }
+ pidleput(_p_)
+ unlock(&sched.lock)
+ }
+
+ // Tries to add one more P to execute G's.
+ // Called when a G is made runnable (newproc, ready).
+ func wakep() {
+ // be conservative about spinning threads
+ if !cas(&sched.nmspinning, 0, 1) {
+ return
+ }
+ startm(nil, true)
+ }
+
+ // Stops execution of the current m that is locked to a g until the g is runnable again.
+ // Returns with acquired P.
+ func stoplockedm() {
+ _g_ := getg()
+
+ if _g_.m.lockedg == nil || _g_.m.lockedg.lockedm != _g_.m {
+ gothrow("stoplockedm: inconsistent locking")
+ }
+ if _g_.m.p != nil {
+ // Schedule another M to run this p.
+ _p_ := releasep()
+ handoffp(_p_)
+ }
+ incidlelocked(1)
+ // Wait until another thread schedules lockedg again.
+ notesleep(&_g_.m.park)
+ noteclear(&_g_.m.park)
+ status := readgstatus(_g_.m.lockedg)
+ if status&^_Gscan != _Grunnable {
+ print("runtime:stoplockedm: g is not Grunnable or Gscanrunnable\n")
+ dumpgstatus(_g_)
+ gothrow("stoplockedm: not runnable")
+ }
+ acquirep(_g_.m.nextp)
+ _g_.m.nextp = nil
+ }
+
+ // Schedules the locked m to run the locked gp.
+ func startlockedm(gp *g) {
+ _g_ := getg()
+
+ mp := gp.lockedm
+ if mp == _g_.m {
+ gothrow("startlockedm: locked to me")
+ }
+ if mp.nextp != nil {
+ gothrow("startlockedm: m has p")
+ }
+ // directly handoff current P to the locked m
+ incidlelocked(-1)
+ _p_ := releasep()
+ mp.nextp = _p_
+ notewakeup(&mp.park)
+ stopm()
+ }
+
+ // Stops the current m for stoptheworld.
+ // Returns when the world is restarted.
+ func gcstopm() {
+ _g_ := getg()
+
+ if sched.gcwaiting == 0 {
+ gothrow("gcstopm: not waiting for gc")
+ }
+ if _g_.m.spinning {
+ _g_.m.spinning = false
+ xadd(&sched.nmspinning, -1)
+ }
+ _p_ := releasep()
+ lock(&sched.lock)
+ _p_.status = _Pgcstop
+ sched.stopwait--
+ if sched.stopwait == 0 {
+ notewakeup(&sched.stopnote)
+ }
+ unlock(&sched.lock)
+ stopm()
+ }
+
+ // Schedules gp to run on the current M.
+ // Never returns.
+ func execute(gp *g) {
+ _g_ := getg()
+
+ casgstatus(gp, _Grunnable, _Grunning)
+ gp.waitsince = 0
+ gp.preempt = false
+ gp.stackguard0 = gp.stack.lo + _StackGuard
+ _g_.m.p.schedtick++
+ _g_.m.curg = gp
+ gp.m = _g_.m
+
+ // Check whether the profiler needs to be turned on or off.
+ hz := sched.profilehz
+ if _g_.m.profilehz != hz {
+ resetcpuprofiler(hz)
+ }
+
+ gogo(&gp.sched)
+ }
+
+ // Finds a runnable goroutine to execute.
+ // Tries to steal from other P's, get g from global queue, poll network.
+ func findrunnable() *g {
+ _g_ := getg()
+
+ top:
+ if sched.gcwaiting != 0 {
+ gcstopm()
+ goto top
+ }
+ if fingwait && fingwake {
+ if gp := wakefing(); gp != nil {
+ ready(gp)
+ }
+ }
+
+ // local runq
+ if gp := runqget(_g_.m.p); gp != nil {
+ return gp
+ }
+
+ // global runq
+ if sched.runqsize != 0 {
+ lock(&sched.lock)
+ gp := globrunqget(_g_.m.p, 0)
+ unlock(&sched.lock)
+ if gp != nil {
+ return gp
+ }
+ }
+
+ // poll network - returns list of goroutines
+ if gp := netpoll(false); gp != nil { // non-blocking
+ injectglist(gp.schedlink)
+ casgstatus(gp, _Gwaiting, _Grunnable)
+ return gp
+ }
+
+ // If number of spinning M's >= number of busy P's, block.
+ // This is necessary to prevent excessive CPU consumption
+ // when GOMAXPROCS>>1 but the program parallelism is low.
+ if !_g_.m.spinning && 2*atomicload(&sched.nmspinning) >= uint32(gomaxprocs)-atomicload(&sched.npidle) { // TODO: fast atomic
+ goto stop
+ }
+ if !_g_.m.spinning {
+ _g_.m.spinning = true
+ xadd(&sched.nmspinning, 1)
+ }
+ // random steal from other P's
+ for i := 0; i < int(2*gomaxprocs); i++ {
+ if sched.gcwaiting != 0 {
+ goto top
+ }
+ _p_ := allp[fastrand1()%uint32(gomaxprocs)]
+ var gp *g
+ if _p_ == _g_.m.p {
+ gp = runqget(_p_)
+ } else {
+ gp = runqsteal(_g_.m.p, _p_)
+ }
+ if gp != nil {
+ return gp
+ }
+ }
+ stop:
+
+ // return P and block
+ lock(&sched.lock)
+ if sched.gcwaiting != 0 {
+ unlock(&sched.lock)
+ goto top
+ }
+ if sched.runqsize != 0 {
+ gp := globrunqget(_g_.m.p, 0)
+ unlock(&sched.lock)
+ return gp
+ }
+ _p_ := releasep()
+ pidleput(_p_)
+ unlock(&sched.lock)
+ if _g_.m.spinning {
+ _g_.m.spinning = false
+ xadd(&sched.nmspinning, -1)
+ }
+
+ // check all runqueues once again
+ for i := 0; i < int(gomaxprocs); i++ {
+ _p_ := allp[i]
+ if _p_ != nil && _p_.runqhead != _p_.runqtail {
+ lock(&sched.lock)
+ _p_ = pidleget()
+ unlock(&sched.lock)
+ if _p_ != nil {
+ acquirep(_p_)
+ goto top
+ }
+ break
+ }
+ }
+
+ // poll network
+ if xchg64(&sched.lastpoll, 0) != 0 {
+ if _g_.m.p != nil {
+ gothrow("findrunnable: netpoll with p")
+ }
+ if _g_.m.spinning {
+ gothrow("findrunnable: netpoll with spinning")
+ }
+ gp := netpoll(true) // block until new work is available
+ atomicstore64(&sched.lastpoll, uint64(nanotime()))
+ if gp != nil {
+ lock(&sched.lock)
+ _p_ = pidleget()
+ unlock(&sched.lock)
+ if _p_ != nil {
+ acquirep(_p_)
+ injectglist(gp.schedlink)
+ casgstatus(gp, _Gwaiting, _Grunnable)
+ return gp
+ }
+ injectglist(gp)
+ }
+ }
+ stopm()
+ goto top
+ }
+
+ func resetspinning() {
+ _g_ := getg()
+
+ var nmspinning uint32
+ if _g_.m.spinning {
+ _g_.m.spinning = false
+ nmspinning = xadd(&sched.nmspinning, -1)
+ // nmspinning is unsigned, so check for underflow via a signed view.
+ if int32(nmspinning) < 0 {
+ gothrow("resetspinning: negative nmspinning")
+ }
+ } else {
+ nmspinning = atomicload(&sched.nmspinning)
+ }
+
+ // M wakeup policy is deliberately somewhat conservative (see nmspinning handling),
+ // so see if we need to wakeup another P here.
+ if nmspinning == 0 && atomicload(&sched.npidle) > 0 {
+ wakep()
+ }
+ }
+
+ // Injects the list of runnable G's into the scheduler.
+ // Can run concurrently with GC.
+ func injectglist(glist *g) {
+ if glist == nil {
+ return
+ }
+ lock(&sched.lock)
+ var n int
+ for n = 0; glist != nil; n++ {
+ gp := glist
+ glist = gp.schedlink
+ casgstatus(gp, _Gwaiting, _Grunnable)
+ globrunqput(gp)
+ }
+ unlock(&sched.lock)
+ for ; n != 0 && sched.npidle != 0; n-- {
+ startm(nil, false)
+ }
+ }
+
+ // One round of scheduler: find a runnable goroutine and execute it.
+ // Never returns.
+ func schedule() {
+ _g_ := getg()
+
+ if _g_.m.locks != 0 {
+ gothrow("schedule: holding locks")
+ }
+
+ if _g_.m.lockedg != nil {
+ stoplockedm()
+ execute(_g_.m.lockedg) // Never returns.
+ }
+
+ top:
+ if sched.gcwaiting != 0 {
+ gcstopm()
+ goto top
+ }
+
+ var gp *g
+ // Check the global runnable queue once in a while to ensure fairness.
+ // Otherwise two goroutines can completely occupy the local runqueue
+ // by constantly respawning each other.
+ tick := _g_.m.p.schedtick
+ // This is a fancy way to say tick%61==0,
+ // it uses 2 MUL instructions instead of a single DIV and so is faster on modern processors.
+ if uint64(tick)-((uint64(tick)*0x4325c53f)>>36)*61 == 0 && sched.runqsize > 0 {
+ lock(&sched.lock)
+ gp = globrunqget(_g_.m.p, 1)
+ unlock(&sched.lock)
+ if gp != nil {
+ resetspinning()
+ }
+ }
+ if gp == nil {
+ gp = runqget(_g_.m.p)
+ if gp != nil && _g_.m.spinning {
+ gothrow("schedule: spinning with local work")
+ }
+ }
+ if gp == nil {
+ gp = findrunnable() // blocks until work is available
+ resetspinning()
+ }
+
+ if gp.lockedm != nil {
+ // Hands off own p to the locked m,
+ // then blocks waiting for a new p.
+ startlockedm(gp)
+ goto top
+ }
+
+ execute(gp)
+ }
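+
+ // The magic constant in schedule is ceil(2^36/61) = 0x4325c53f, so
+ // (uint64(tick)*0x4325c53f)>>36 equals tick/61 for any uint32 tick;
+ // for example tick=122 gives quotient 2 and 122-2*61 == 0.
+ // A hypothetical self-check of that identity, illustrative only:
+ func checkMod61Sketch(tick uint32) {
+ q := (uint64(tick) * 0x4325c53f) >> 36 // tick/61 via multiply-shift
+ if uint64(tick)-q*61 != uint64(tick)%61 {
+ gothrow("mod-61 magic constant is wrong")
+ }
+ }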
+
+ // dropg removes the association between m and the current goroutine m->curg (gp for short).
+ // Typically a caller sets gp's status away from Grunning and then
+ // immediately calls dropg to finish the job. The caller is also responsible
+ // for arranging that gp will be restarted using ready at an
+ // appropriate time. After calling dropg and arranging for gp to be
+ // readied later, the caller can do other work but eventually should
+ // call schedule to restart the scheduling of goroutines on this m.
+ func dropg() {
+ _g_ := getg()
+
+ if _g_.m.lockedg == nil {
+ _g_.m.curg.m = nil
+ _g_.m.curg = nil
+ }
+ }
+
+ // Puts the current goroutine into a waiting state and calls unlockf.
+ // If unlockf returns false, the goroutine is resumed.
+ func park(unlockf func(*g, unsafe.Pointer) bool, lock unsafe.Pointer, reason string) {
+ _g_ := getg()
+
+ _g_.m.waitlock = lock
+ _g_.m.waitunlockf = *(*unsafe.Pointer)(unsafe.Pointer(&unlockf))
+ _g_.waitreason = reason
+ mcall(park_m)
+ }
+
+ func parkunlock_c(gp *g, lock unsafe.Pointer) bool {
+ unlock((*mutex)(lock))
+ return true
+ }
+
+ // Puts the current goroutine into a waiting state and unlocks the lock.
+ // The goroutine can be made runnable again by calling ready(gp).
+ func parkunlock(lock *mutex, reason string) {
+ park(parkunlock_c, unsafe.Pointer(lock), reason)
+ }
+
+ // park continuation on g0.
+ func park_m(gp *g) {
+ _g_ := getg()
+
+ casgstatus(gp, _Grunning, _Gwaiting)
+ dropg()
+
+ if _g_.m.waitunlockf != nil {
+ fn := *(*func(*g, unsafe.Pointer) bool)(unsafe.Pointer(&_g_.m.waitunlockf))
+ ok := fn(gp, _g_.m.waitlock)
+ _g_.m.waitunlockf = nil
+ _g_.m.waitlock = nil
+ if !ok {
+ casgstatus(gp, _Gwaiting, _Grunnable)
+ execute(gp) // Schedule it back, never returns.
+ }
+ }
+ schedule()
+ }
+
+ // Gosched continuation on g0.
+ func gosched_m(gp *g) {
+ status := readgstatus(gp)
+ if status&^_Gscan != _Grunning {
+ dumpgstatus(gp)
+ gothrow("bad g status")
+ }
+ casgstatus(gp, _Grunning, _Grunnable)
+ dropg()
+ lock(&sched.lock)
+ globrunqput(gp)
+ unlock(&sched.lock)
+
+ schedule()
+ }
+
+ // Finishes execution of the current goroutine.
+ // Must be NOSPLIT because it is called from Go. (TODO - probably not anymore)
+ //go:nosplit
+ func goexit1() {
+ if raceenabled {
+ racegoend()
+ }
+ mcall(goexit0)
+ }
+
+ // goexit continuation on g0.
+ func goexit0(gp *g) {
+ _g_ := getg()
+
+ casgstatus(gp, _Grunning, _Gdead)
+ gp.m = nil
+ gp.lockedm = nil
+ _g_.m.lockedg = nil
+ gp.paniconfault = false
+ gp._defer = nil // should be true already but just in case.
+ gp._panic = nil // non-nil for Goexit during panic. points at stack-allocated data.
+ gp.writebuf = nil
+ gp.waitreason = ""
+ gp.param = nil
+
+ dropg()
+
+ if _g_.m.locked&^_LockExternal != 0 {
+ print("invalid m->locked = ", _g_.m.locked, "\n")
+ gothrow("internal lockOSThread error")
+ }
+ _g_.m.locked = 0
+ gfput(_g_.m.p, gp)
+ schedule()
+ }
+
+ //go:nosplit
+ func save(pc, sp uintptr) {
+ _g_ := getg()
+
+ _g_.sched.pc = pc
+ _g_.sched.sp = sp
+ _g_.sched.lr = 0
+ _g_.sched.ret = 0
+ _g_.sched.ctxt = nil
- systemstack(entersyscall_bad)
++ // write as uintptr to avoid write barrier, which will smash _g_.sched.
++ *(*uintptr)(unsafe.Pointer(&_g_.sched.g)) = uintptr(unsafe.Pointer(_g_))
+ }
+
+ // The goroutine g is about to enter a system call.
+ // Record that it's not using the cpu anymore.
+ // This is called only from the go syscall library and cgocall,
+ // not from the low-level system calls used by the runtime.
+ //
+ // Entersyscall cannot split the stack: the gosave must
+ // make g->sched refer to the caller's stack segment, because
+ // entersyscall is going to return immediately after.
+ //
+ // Nothing entersyscall calls can split the stack either.
+ // We cannot safely move the stack during an active call to syscall,
+ // because we do not know which of the uintptr arguments are
+ // really pointers (back into the stack).
+ // In practice, this means that we make the fast path run through
+ // entersyscall doing no-split things, and the slow path has to use systemstack
+ // to run bigger things on the system stack.
+ //
+ // reentersyscall is the entry point used by cgo callbacks, where explicitly
+ // saved SP and PC are restored. This is needed when exitsyscall will be called
+ // from a function further up in the call stack than the parent, as g->syscallsp
+ // must always point to a valid stack frame. entersyscall below is the normal
+ // entry point for syscalls, which obtains the SP and PC from the caller.
+ //go:nosplit
+ func reentersyscall(pc, sp uintptr) {
+ _g_ := getg()
+
+ // Disable preemption because during this function g is in Gsyscall status,
+ // but can have inconsistent g->sched, do not let GC observe it.
+ _g_.m.locks++
+
+ // Entersyscall must not call any function that might split/grow the stack.
+ // (See details in comment above.)
+ // Catch calls that might, by replacing the stack guard with something that
+ // will trip any stack check and leaving a flag to tell newstack to die.
+ _g_.stackguard0 = stackPreempt
+ _g_.throwsplit = true
+
+ // Leave SP around for GC and traceback.
+ save(pc, sp)
+ _g_.syscallsp = sp
+ _g_.syscallpc = pc
+ casgstatus(_g_, _Grunning, _Gsyscall)
+ if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
-func entersyscall_bad() {
- var gp *g
- gp = getg().m.curg
- print("entersyscall inconsistent ", hex(gp.syscallsp), " [", hex(gp.stack.lo), ",", hex(gp.stack.hi), "]\n")
- gothrow("entersyscall")
-}
-
++ systemstack(func() {
++ print("entersyscall inconsistent ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
++ gothrow("entersyscall")
++ })
+ }
+
+ if atomicload(&sched.sysmonwait) != 0 { // TODO: fast atomic
+ systemstack(entersyscall_sysmon)
+ save(pc, sp)
+ }
+
+ _g_.m.mcache = nil
+ _g_.m.p.m = nil
+ atomicstore(&_g_.m.p.status, _Psyscall)
+ if sched.gcwaiting != 0 {
+ systemstack(entersyscall_gcwait)
+ save(pc, sp)
+ }
+
+ // Goroutines must not split stacks in Gsyscall status (it would corrupt g->sched).
+ // We set _StackGuard to StackPreempt so that first split stack check calls morestack.
+ // Morestack detects this case and throws.
+ _g_.stackguard0 = stackPreempt
+ _g_.m.locks--
+ }
+
+ // Standard syscall entry used by the go syscall library and normal cgo calls.
+ //go:nosplit
+ func entersyscall(dummy int32) {
+ reentersyscall(getcallerpc(unsafe.Pointer(&dummy)), getcallersp(unsafe.Pointer(&dummy)))
+ }
+
- save(getcallerpc(unsafe.Pointer(&dummy)), getcallersp(unsafe.Pointer(&dummy)))
+ func entersyscall_sysmon() {
+ lock(&sched.lock)
+ if atomicload(&sched.sysmonwait) != 0 {
+ atomicstore(&sched.sysmonwait, 0)
+ notewakeup(&sched.sysmonnote)
+ }
+ unlock(&sched.lock)
+ }
+
+ func entersyscall_gcwait() {
+ _g_ := getg()
+
+ lock(&sched.lock)
+ if sched.stopwait > 0 && cas(&_g_.m.p.status, _Psyscall, _Pgcstop) {
+ if sched.stopwait--; sched.stopwait == 0 {
+ notewakeup(&sched.stopnote)
+ }
+ }
+ unlock(&sched.lock)
+ }
+
+ // The same as entersyscall(), but with a hint that the syscall is blocking.
+ //go:nosplit
+ func entersyscallblock(dummy int32) {
+ _g_ := getg()
+
+ _g_.m.locks++ // see comment in entersyscall
+ _g_.throwsplit = true
+ _g_.stackguard0 = stackPreempt // see comment in entersyscall
+
+ // Leave SP around for GC and traceback.
- systemstack(entersyscall_bad)
++ pc := getcallerpc(unsafe.Pointer(&dummy))
++ sp := getcallersp(unsafe.Pointer(&dummy))
++ save(pc, sp)
+ _g_.syscallsp = _g_.sched.sp
+ _g_.syscallpc = _g_.sched.pc
++ if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
++ sp1 := sp
++ sp2 := _g_.sched.sp
++ sp3 := _g_.syscallsp
++ systemstack(func() {
++ print("entersyscallblock inconsistent ", hex(sp1), " ", hex(sp2), " ", hex(sp3), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
++ gothrow("entersyscallblock")
++ })
++ }
+ casgstatus(_g_, _Grunning, _Gsyscall)
+ if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
++ systemstack(func() {
++ print("entersyscallblock inconsistent ", hex(sp), " ", hex(_g_.sched.sp), " ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
++ gothrow("entersyscallblock")
++ })
+ }
+
+ systemstack(entersyscallblock_handoff)
+
+ // Resave for traceback during blocked call.
+ save(getcallerpc(unsafe.Pointer(&dummy)), getcallersp(unsafe.Pointer(&dummy)))
+
+ _g_.m.locks--
+ }
+
+ func entersyscallblock_handoff() {
+ handoffp(releasep())
+ }
+
+ // The goroutine g exited its system call.
+ // Arrange for it to run on a cpu again.
+ // This is called only from the go syscall library, not
+ // from the low-level system calls used by the runtime.
+ //go:nosplit
+ func exitsyscall(dummy int32) {
+ _g_ := getg()
+
+ _g_.m.locks++ // see comment in entersyscall
+ if getcallersp(unsafe.Pointer(&dummy)) > _g_.syscallsp {
+ gothrow("exitsyscall: syscall frame is no longer valid")
+ }
+
+ _g_.waitsince = 0
+ if exitsyscallfast() {
+ if _g_.m.mcache == nil {
+ gothrow("lost mcache")
+ }
+ // There's a cpu for us, so we can run.
+ _g_.m.p.syscalltick++
+ // We need to cas the status and scan before resuming...
+ casgstatus(_g_, _Gsyscall, _Grunning)
+
+ // Garbage collector isn't running (since we are),
+ // so okay to clear syscallsp.
+ _g_.syscallsp = 0
+ _g_.m.locks--
+ if _g_.preempt {
+ // restore the preemption request in case we've cleared it in newstack
+ _g_.stackguard0 = stackPreempt
+ } else {
+ // otherwise restore the real _StackGuard, we've spoiled it in entersyscall/entersyscallblock
+ _g_.stackguard0 = _g_.stack.lo + _StackGuard
+ }
+ _g_.throwsplit = false
+ return
+ }
+
+ _g_.m.locks--
+
+ // Call the scheduler.
+ mcall(exitsyscall0)
+
+ if _g_.m.mcache == nil {
+ gothrow("lost mcache")
+ }
+
+ // Scheduler returned, so we're allowed to run now.
+ // Delete the syscallsp information that we left for
+ // the garbage collector during the system call.
+ // Must wait until now because until gosched returns
+ // we don't know for sure that the garbage collector
+ // is not running.
+ _g_.syscallsp = 0
+ _g_.m.p.syscalltick++
+ _g_.throwsplit = false
+ }
+
+ //go:nosplit
+ func exitsyscallfast() bool {
+ _g_ := getg()
+
+ // Freezetheworld sets stopwait but does not retake P's.
+ if sched.stopwait != 0 {
++ _g_.m.mcache = nil
+ _g_.m.p = nil
+ return false
+ }
+
+ // Try to re-acquire the last P.
+ if _g_.m.p != nil && _g_.m.p.status == _Psyscall && cas(&_g_.m.p.status, _Psyscall, _Prunning) {
+ // There's a cpu for us, so we can run.
+ _g_.m.mcache = _g_.m.p.mcache
+ _g_.m.p.m = _g_.m
+ return true
+ }
+
+ // Try to get any other idle P.
++ _g_.m.mcache = nil
+ _g_.m.p = nil
+ if sched.pidle != nil {
+ var ok bool
+ systemstack(func() {
+ ok = exitsyscallfast_pidle()
+ })
+ if ok {
+ return true
+ }
+ }
+ return false
+ }
+
+ func exitsyscallfast_pidle() bool {
+ lock(&sched.lock)
+ _p_ := pidleget()
+ if _p_ != nil && atomicload(&sched.sysmonwait) != 0 {
+ atomicstore(&sched.sysmonwait, 0)
+ notewakeup(&sched.sysmonnote)
+ }
+ unlock(&sched.lock)
+ if _p_ != nil {
+ acquirep(_p_)
+ return true
+ }
+ return false
+ }
+
+ // exitsyscall slow path on g0.
+ // Failed to acquire P, enqueue gp as runnable.
+ func exitsyscall0(gp *g) {
+ _g_ := getg()
+
+ casgstatus(gp, _Gsyscall, _Grunnable)
+ dropg()
+ lock(&sched.lock)
+ _p_ := pidleget()
+ if _p_ == nil {
+ globrunqput(gp)
+ } else if atomicload(&sched.sysmonwait) != 0 {
+ atomicstore(&sched.sysmonwait, 0)
+ notewakeup(&sched.sysmonnote)
+ }
+ unlock(&sched.lock)
+ if _p_ != nil {
+ acquirep(_p_)
+ execute(gp) // Never returns.
+ }
+ if _g_.m.lockedg != nil {
+ // Wait until another thread schedules gp and so m again.
+ stoplockedm()
+ execute(gp) // Never returns.
+ }
+ stopm()
+ schedule() // Never returns.
+ }
+
+ func beforefork() {
+ gp := getg().m.curg
+
+ // Fork can hang if preempted with signals frequently enough (see issue 5517).
+ // Ensure that we stay on the same M where we disable profiling.
+ gp.m.locks++
+ if gp.m.profilehz != 0 {
+ resetcpuprofiler(0)
+ }
+
+ // This function is called before fork in syscall package.
+ // Code between fork and exec must not allocate memory nor even try to grow stack.
+ // Here we spoil g->_StackGuard to reliably detect any attempts to grow stack.
+ // runtime_AfterFork will undo this in parent process, but not in child.
+ gp.stackguard0 = stackFork
+ }
+
+ // Called from syscall package before fork.
+ //go:nosplit
+ func syscall_BeforeFork() {
+ systemstack(beforefork)
+ }
+
+ func afterfork() {
+ gp := getg().m.curg
+
+ // See the comment in beforefork.
+ gp.stackguard0 = gp.stack.lo + _StackGuard
+
+ hz := sched.profilehz
+ if hz != 0 {
+ resetcpuprofiler(hz)
+ }
+ gp.m.locks--
+ }
+
+ // Called from syscall package after fork in parent.
+ //go:nosplit
+ func syscall_AfterFork() {
+ systemstack(afterfork)
+ }
+
+ // Allocate a new g, with a stack big enough for stacksize bytes.
+ func malg(stacksize int32) *g {
+ newg := allocg()
+ if stacksize >= 0 {
+ stacksize = round2(_StackSystem + stacksize)
+ systemstack(func() {
+ newg.stack = stackalloc(uint32(stacksize))
+ })
+ newg.stackguard0 = newg.stack.lo + _StackGuard
+ newg.stackguard1 = ^uintptr(0)
+ }
+ return newg
+ }
+
+ // Create a new g running fn with siz bytes of arguments.
+ // Put it on the queue of g's waiting to run.
+ // The compiler turns a go statement into a call to this.
+ // Cannot split the stack because it assumes that the arguments
+ // are available sequentially after &fn; they would not be
+ // copied if a stack split occurred.
+ //go:nosplit
+ func newproc(siz int32, fn *funcval) {
+ argp := add(unsafe.Pointer(&fn), ptrSize)
+ if hasLinkRegister {
+ argp = add(argp, ptrSize) // skip caller's saved LR
+ }
+
+ pc := getcallerpc(unsafe.Pointer(&siz))
+ systemstack(func() {
+ newproc1(fn, (*uint8)(argp), siz, 0, pc)
+ })
+ }
+
+ // Create a new g running fn with narg bytes of arguments starting
+ // at argp and returning nret bytes of results. callerpc is the
+ // address of the go statement that created this. The new g is put
+ // on the queue of g's waiting to run.
+ func newproc1(fn *funcval, argp *uint8, narg int32, nret int32, callerpc uintptr) *g {
+ _g_ := getg()
+
+ if fn == nil {
+ _g_.m.throwing = -1 // do not dump full stacks
+ gothrow("go of nil func value")
+ }
+ _g_.m.locks++ // disable preemption because it can be holding p in a local var
+ siz := narg + nret
+ siz = (siz + 7) &^ 7
+
+ // We could allocate a larger initial stack if necessary.
+ // Not worth it: this is almost always an error.
+ // 4*sizeof(uintreg): extra space added below
+ // sizeof(uintreg): caller's LR (arm) or return address (x86, in gostartcall).
+ if siz >= _StackMin-4*regSize-regSize {
+ gothrow("newproc: function arguments too large for new goroutine")
+ }
+
+ _p_ := _g_.m.p
+ newg := gfget(_p_)
+ if newg == nil {
+ newg = malg(_StackMin)
+ casgstatus(newg, _Gidle, _Gdead)
+ allgadd(newg) // publishes with a g->status of Gdead so GC scanner doesn't look at uninitialized stack.
+ }
+ if newg.stack.hi == 0 {
+ gothrow("newproc1: newg missing stack")
+ }
+
+ if readgstatus(newg) != _Gdead {
+ gothrow("newproc1: new g is not Gdead")
+ }
+
+ sp := newg.stack.hi
+ sp -= 4 * regSize // extra space in case of reads slightly beyond frame
+ sp -= uintptr(siz)
+ memmove(unsafe.Pointer(sp), unsafe.Pointer(argp), uintptr(narg))
+ if hasLinkRegister {
+ // caller's LR
+ sp -= ptrSize
+ *(*unsafe.Pointer)(unsafe.Pointer(sp)) = nil
+ }
+
+ memclr(unsafe.Pointer(&newg.sched), unsafe.Sizeof(newg.sched))
+ newg.sched.sp = sp
+ newg.sched.pc = funcPC(goexit) + _PCQuantum // +PCQuantum so that previous instruction is in same function
+ newg.sched.g = newg
+ gostartcallfn(&newg.sched, fn)
+ newg.gopc = callerpc
+ casgstatus(newg, _Gdead, _Grunnable)
+
+ if _p_.goidcache == _p_.goidcacheend {
+ // Sched.goidgen is the last allocated id,
+ // this batch must be [sched.goidgen+1, sched.goidgen+GoidCacheBatch].
+ // At startup sched.goidgen=0, so main goroutine receives goid=1.
+ _p_.goidcache = xadd64(&sched.goidgen, _GoidCacheBatch)
+ _p_.goidcache -= _GoidCacheBatch - 1
+ _p_.goidcacheend = _p_.goidcache + _GoidCacheBatch
+ }
+ newg.goid = int64(_p_.goidcache)
+ _p_.goidcache++
+ if raceenabled {
+ newg.racectx = racegostart(callerpc)
+ }
+ runqput(_p_, newg)
+
+ if atomicload(&sched.npidle) != 0 && atomicload(&sched.nmspinning) == 0 && unsafe.Pointer(fn.fn) != unsafe.Pointer(funcPC(main)) { // TODO: fast atomic
+ wakep()
+ }
+ _g_.m.locks--
+ if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack
+ _g_.stackguard0 = stackPreempt
+ }
+ return newg
+ }
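+
+ // The goid cache above trades one xadd64 per _GoidCacheBatch
+ // goroutines for a lock-free id space: if the batch size were 16,
+ // the first refill would move sched.goidgen from 0 to 16 and hand
+ // out ids 1..16, the next refill (by any P) would hand out 17..32,
+ // and so on, so ids stay unique across P's without taking
+ // sched.lock in newproc1. (Batch size 16 is only an example here.)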
+
+ // Put on gfree list.
+ // If local list is too long, transfer a batch to the global list.
+ func gfput(_p_ *p, gp *g) {
+ if readgstatus(gp) != _Gdead {
+ gothrow("gfput: bad status (not Gdead)")
+ }
+
+ stksize := gp.stack.hi - gp.stack.lo
+
+ if stksize != _FixedStack {
+ // non-standard stack size - free it.
+ stackfree(gp.stack)
+ gp.stack.lo = 0
+ gp.stack.hi = 0
+ gp.stackguard0 = 0
+ }
+
+ gp.schedlink = _p_.gfree
+ _p_.gfree = gp
+ _p_.gfreecnt++
+ if _p_.gfreecnt >= 64 {
+ lock(&sched.gflock)
+ for _p_.gfreecnt >= 32 {
+ _p_.gfreecnt--
+ gp = _p_.gfree
+ _p_.gfree = gp.schedlink
+ gp.schedlink = sched.gfree
+ sched.gfree = gp
+ sched.ngfree++
+ }
+ unlock(&sched.gflock)
+ }
+ }
+
+ // Get from gfree list.
+ // If local list is empty, grab a batch from global list.
+ func gfget(_p_ *p) *g {
+ retry:
+ gp := _p_.gfree
+ if gp == nil && sched.gfree != nil {
+ lock(&sched.gflock)
+ for _p_.gfreecnt < 32 && sched.gfree != nil {
+ _p_.gfreecnt++
+ gp = sched.gfree
+ sched.gfree = gp.schedlink
+ sched.ngfree--
+ gp.schedlink = _p_.gfree
+ _p_.gfree = gp
+ }
+ unlock(&sched.gflock)
+ goto retry
+ }
+ if gp != nil {
+ _p_.gfree = gp.schedlink
+ _p_.gfreecnt--
+ if gp.stack.lo == 0 {
+ // Stack was deallocated in gfput. Allocate a new one.
+ systemstack(func() {
+ gp.stack = stackalloc(_FixedStack)
+ })
+ gp.stackguard0 = gp.stack.lo + _StackGuard
+ } else {
+ if raceenabled {
+ racemalloc(unsafe.Pointer(gp.stack.lo), gp.stack.hi-gp.stack.lo)
+ }
+ }
+ }
+ return gp
+ }
+
+ // Purge all cached G's from gfree list to the global list.
+ func gfpurge(_p_ *p) {
+ lock(&sched.gflock)
+ for _p_.gfreecnt != 0 {
+ _p_.gfreecnt--
+ gp := _p_.gfree
+ _p_.gfree = gp.schedlink
+ gp.schedlink = sched.gfree
+ sched.gfree = gp
+ sched.ngfree++
+ }
+ unlock(&sched.gflock)
+ }
+
+ // Breakpoint executes a breakpoint trap.
+ func Breakpoint() {
+ breakpoint()
+ }
+
+ // dolockOSThread is called by LockOSThread and lockOSThread below
+ // after they modify m.locked. Do not allow preemption during this call,
+ // or else the m might be different in this function than in the caller.
+ //go:nosplit
+ func dolockOSThread() {
+ _g_ := getg()
+ _g_.m.lockedg = _g_
+ _g_.lockedm = _g_.m
+ }
+
+ //go:nosplit
+
+ // LockOSThread wires the calling goroutine to its current operating system thread.
+ // Until the calling goroutine exits or calls UnlockOSThread, it will always
+ // execute in that thread, and no other goroutine can.
+ func LockOSThread() {
+ getg().m.locked |= _LockExternal
+ dolockOSThread()
+ }
+
+ //go:nosplit
+ func lockOSThread() {
+ getg().m.locked += _LockInternal
+ dolockOSThread()
+ }
+
+ // dounlockOSThread is called by UnlockOSThread and unlockOSThread below
+ // after they update m->locked. Do not allow preemption during this call,
+ // or else the m might be different in this function than in the caller.
+ //go:nosplit
+ func dounlockOSThread() {
+ _g_ := getg()
+ if _g_.m.locked != 0 {
+ return
+ }
+ _g_.m.lockedg = nil
+ _g_.lockedm = nil
+ }
+
+ //go:nosplit
+
+ // UnlockOSThread unwires the calling goroutine from its fixed operating system thread.
+ // If the calling goroutine has not called LockOSThread, UnlockOSThread is a no-op.
+ func UnlockOSThread() {
+ getg().m.locked &^= _LockExternal
+ dounlockOSThread()
+ }
+
+ //go:nosplit
+ func unlockOSThread() {
+ _g_ := getg()
+ if _g_.m.locked < _LockInternal {
+ systemstack(badunlockosthread)
+ }
+ _g_.m.locked -= _LockInternal
+ dounlockOSThread()
+ }
+
+ func badunlockosthread() {
+ gothrow("runtime: internal error: misuse of lockOSThread/unlockOSThread")
+ }
+
+ func gcount() int32 {
+ n := int32(allglen) - sched.ngfree
+ for i := 0; ; i++ {
+ _p_ := allp[i]
+ if _p_ == nil {
+ break
+ }
+ n -= _p_.gfreecnt
+ }
+
+ // All these variables can be changed concurrently, so the result can be inconsistent.
+ // But at least the current goroutine is running.
+ if n < 1 {
+ n = 1
+ }
+ return n
+ }
+
+ func mcount() int32 {
+ return sched.mcount
+ }
+
+ var prof struct {
+ lock uint32
+ hz int32
+ }
+
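+ // Fake functions: their addresses (plus _PCQuantum) stand in for
+ // profiling samples that sigprof below cannot attribute to real Go code.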
+ func _System() { _System() }
+ func _ExternalCode() { _ExternalCode() }
+ func _GC() { _GC() }
+
+ var etext struct{}
+
+ // Called if we receive a SIGPROF signal.
+ func sigprof(pc *uint8, sp *uint8, lr *uint8, gp *g, mp *m) {
+ var n int32
+ var traceback bool
+ var stk [100]uintptr
+
+ if prof.hz == 0 {
+ return
+ }
+
+ // Profiling runs concurrently with GC, so it must not allocate.
+ mp.mallocing++
+
+ // Define that a "user g" is a user-created goroutine, and a "system g"
+ // is one that is m->g0 or m->gsignal. We've only made sure that we
+ // can unwind user g's, so exclude the system g's.
+ //
+ // It is not quite as easy as testing gp == m->curg (the current user g)
+ // because we might be interrupted for profiling halfway through a
+ // goroutine switch. The switch involves updating three (or four) values:
+ // g, PC, SP, and (on arm) LR. The PC must be the last to be updated,
+ // because once it gets updated the new g is running.
+ //
+ // When switching from a user g to a system g, LR is not considered live,
+ // so the update only affects g, SP, and PC. Since PC must be last,
+ // the possible partial transitions in ordinary execution are (1) g alone is updated,
+ // (2) both g and SP are updated, and (3) SP alone is updated.
+ // If g is updated, we'll see a system g and not look closer.
+ // If SP alone is updated, we can detect the partial transition by checking
+ // whether the SP is within g's stack bounds. (We could also require that SP
+ // be changed only after g, but the stack bounds check is needed by other
+ // cases, so there is no need to impose an additional requirement.)
+ //
+ // There is one exceptional transition to a system g, not in ordinary execution.
+ // When a signal arrives, the operating system starts the signal handler running
+ // with an updated PC and SP. The g is updated last, at the beginning of the
+ // handler. There are two reasons this is okay. First, until g is updated the
+ // g and SP do not match, so the stack bounds check detects the partial transition.
+ // Second, signal handlers currently run with signals disabled, so a profiling
+ // signal cannot arrive during the handler.
+ //
+ // When switching from a system g to a user g, there are three possibilities.
+ //
+ // First, it may be that the g switch has no PC update, because the SP
+ // either corresponds to a user g throughout (as in asmcgocall)
+ // or because it has been arranged to look like a user g frame
+ // (as in cgocallback_gofunc). In this case, since the entire
+ // transition is a g+SP update, a partial transition updating just one of
+ // those will be detected by the stack bounds check.
+ //
+ // Second, when returning from a signal handler, the PC and SP updates
+ // are performed by the operating system in an atomic update, so the g
+ // update must be done before them. The stack bounds check detects
+ // the partial transition here, and (again) signal handlers run with signals
+ // disabled, so a profiling signal cannot arrive then anyway.
+ //
+ // Third, the common case: it may be that the switch updates g, SP, and PC
+ // separately, as in gogo.
+ //
+ // Because gogo is the only instance, we check whether the PC lies
+ // within that function, and if so, do not ask for a traceback. This approach
+ // requires knowing the size of the gogo function, which we
+ // record in arch_*.h and check in runtime_test.go.
+ //
+ // There is another apparently viable approach, recorded here in case
+ // the "PC within gogo" check turns out not to be usable.
+ // It would be possible to delay the update of either g or SP until immediately
+ // before the PC update instruction. Then, because of the stack bounds check,
+ // the only problematic interrupt point is just before that PC update instruction,
+ // and the sigprof handler can detect that instruction and simulate stepping past
+ // it in order to reach a consistent state. On ARM, the update of g must be made
+ // in two places (in R10 and also in a TLS slot), so the delayed update would
+ // need to be the SP update. The sigprof handler must read the instruction at
+ // the current PC and if it was the known instruction (for example, JMP BX or
+ // MOV R2, PC), use that other register in place of the PC value.
+ // The biggest drawback to this solution is that it requires that we can tell
+ // whether it's safe to read from the memory pointed at by PC.
+ // In a correct program, we can test PC == nil and otherwise read,
+ // but if a profiling signal happens at the instant that a program executes
+ // a bad jump (before the program manages to handle the resulting fault)
+ // the profiling handler could fault trying to read nonexistent memory.
+ //
+ // To recap, there are no constraints on the assembly being used for the
+ // transition. We simply require that g and SP match and that the PC is not
+ // in gogo.
+ traceback = true
+ usp := uintptr(unsafe.Pointer(sp))
+ gogo := funcPC(gogo)
+ if gp == nil || gp != mp.curg ||
+ usp < gp.stack.lo || gp.stack.hi < usp ||
+ (gogo <= uintptr(unsafe.Pointer(pc)) && uintptr(unsafe.Pointer(pc)) < gogo+_RuntimeGogoBytes) {
+ traceback = false
+ }
+
+ n = 0
+ if traceback {
+ n = int32(gentraceback(uintptr(unsafe.Pointer(pc)), uintptr(unsafe.Pointer(sp)), uintptr(unsafe.Pointer(lr)), gp, 0, &stk[0], len(stk), nil, nil, _TraceTrap))
+ }
+ if !traceback || n <= 0 {
+ // Normal traceback is impossible or has failed.
+ // See if it falls into several common cases.
+ n = 0
+ if mp.ncgo > 0 && mp.curg != nil && mp.curg.syscallpc != 0 && mp.curg.syscallsp != 0 {
+ // Cgo, we can't unwind and symbolize arbitrary C code,
+ // so instead collect Go stack that leads to the cgo call.
+ // This is especially important on windows, since all syscalls are cgo calls.
+ n = int32(gentraceback(mp.curg.syscallpc, mp.curg.syscallsp, 0, mp.curg, 0, &stk[0], len(stk), nil, nil, 0))
+ }
+ if GOOS == "windows" && n == 0 && mp.libcallg != nil && mp.libcallpc != 0 && mp.libcallsp != 0 {
+ // Libcall, i.e. runtime syscall on windows.
+ // Collect Go stack that leads to the call.
+ n = int32(gentraceback(mp.libcallpc, mp.libcallsp, 0, mp.libcallg, 0, &stk[0], len(stk), nil, nil, 0))
+ }
+ if n == 0 {
+ // If all of the above has failed, account it against abstract "System" or "GC".
+ n = 2
+ // "ExternalCode" is better than "etext".
+ if uintptr(unsafe.Pointer(pc)) > uintptr(unsafe.Pointer(&etext)) {
+ pc = (*uint8)(unsafe.Pointer(uintptr(funcPC(_ExternalCode) + _PCQuantum)))
+ }
+ stk[0] = uintptr(unsafe.Pointer(pc))
+ if mp.gcing != 0 || mp.helpgc != 0 {
+ stk[1] = funcPC(_GC) + _PCQuantum
+ } else {
+ stk[1] = funcPC(_System) + _PCQuantum
+ }
+ }
+ }
+
+ if prof.hz != 0 {
+ // Simple cas-lock to coordinate with setcpuprofilerate.
+ for !cas(&prof.lock, 0, 1) {
+ osyield()
+ }
+ if prof.hz != 0 {
+ cpuproftick(&stk[0], n)
+ }
+ atomicstore(&prof.lock, 0)
+ }
+ mp.mallocing--
+ }
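+
+ // prof.lock above is a minimal CAS spin lock shared between sigprof
+ // and setcpuprofilerate_m. The same pattern in isolation, as an
+ // illustrative sketch only:
+ func caslockSketch(l *uint32) {
+ for !cas(l, 0, 1) { // acquire
+ osyield()
+ }
+ // ... read or update the fields l protects ...
+ atomicstore(l, 0) // release
+ }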
+
+ // Arrange to call fn with a traceback hz times a second.
+ func setcpuprofilerate_m(hz int32) {
+ // Force sane arguments.
+ if hz < 0 {
+ hz = 0
+ }
+
+ // Disable preemption, otherwise we can be rescheduled to another thread
+ // that has profiling enabled.
+ _g_ := getg()
+ _g_.m.locks++
+
+ // Stop profiler on this thread so that it is safe to lock prof.
+ // if a profiling signal came in while we had prof locked,
+ // it would deadlock.
+ resetcpuprofiler(0)
+
+ for !cas(&prof.lock, 0, 1) {
+ osyield()
+ }
+ prof.hz = hz
+ atomicstore(&prof.lock, 0)
+
+ lock(&sched.lock)
+ sched.profilehz = hz
+ unlock(&sched.lock)
+
+ if hz != 0 {
+ resetcpuprofiler(hz)
+ }
+
+ _g_.m.locks--
+ }
+
+ // Change number of processors. The world is stopped, sched is locked.
++// gcworkbufs are not being modified by either the GC or
++// the write barrier code.
+ func procresize(new int32) {
+ old := gomaxprocs
+ if old < 0 || old > _MaxGomaxprocs || new <= 0 || new > _MaxGomaxprocs {
+ gothrow("procresize: invalid arg")
+ }
+
+ // initialize new P's
+ for i := int32(0); i < new; i++ {
+ p := allp[i]
+ if p == nil {
+ p = newP()
+ p.id = i
+ p.status = _Pgcstop
+ atomicstorep(unsafe.Pointer(&allp[i]), unsafe.Pointer(p))
+ }
+ if p.mcache == nil {
+ if old == 0 && i == 0 {
+ if getg().m.mcache == nil {
+ gothrow("missing mcache?")
+ }
+ p.mcache = getg().m.mcache // bootstrap
+ } else {
+ p.mcache = allocmcache()
+ }
+ }
+ }
+
+ // redistribute runnable G's evenly
+ // collect all runnable goroutines in global queue preserving FIFO order
+ // FIFO order is required to ensure fairness even during frequent GCs
+ // see http://golang.org/issue/7126
+ empty := false
+ for !empty {
+ empty = true
+ for i := int32(0); i < old; i++ {
+ p := allp[i]
+ if p.runqhead == p.runqtail {
+ continue
+ }
+ empty = false
+ // pop from tail of local queue
+ p.runqtail--
+ gp := p.runq[p.runqtail%uint32(len(p.runq))]
+ // push onto head of global queue
+ gp.schedlink = sched.runqhead
+ sched.runqhead = gp
+ if sched.runqtail == nil {
+ sched.runqtail = gp
+ }
+ sched.runqsize++
+ }
+ }
+
+ // fill local queues with at most len(p.runq)/2 goroutines
+ // start at 1 because current M already executes some G and will acquire allp[0] below,
+ // so if we have a spare G we want to put it into allp[1].
+ var _p_ p
+ for i := int32(1); i < new*int32(len(_p_.runq))/2 && sched.runqsize > 0; i++ {
+ gp := sched.runqhead
+ sched.runqhead = gp.schedlink
+ if sched.runqhead == nil {
+ sched.runqtail = nil
+ }
+ sched.runqsize--
+ runqput(allp[i%new], gp)
+ }
+
+ // free unused P's
+ for i := new; i < old; i++ {
+ p := allp[i]
+ freemcache(p.mcache)
+ p.mcache = nil
+ gfpurge(p)
+ p.status = _Pdead
+ // can't free P itself because it can be referenced by an M in syscall
+ }
+
+ _g_ := getg()
+ if _g_.m.p != nil {
+ _g_.m.p.m = nil
+ }
+ _g_.m.p = nil
+ _g_.m.mcache = nil
+ p := allp[0]
+ p.m = nil
+ p.status = _Pidle
+ acquirep(p)
+ for i := new - 1; i > 0; i-- {
+ p := allp[i]
+ p.status = _Pidle
+ pidleput(p)
+ }
+ var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32
+ atomicstore((*uint32)(unsafe.Pointer(int32p)), uint32(new))
+ }
+
+ // Associate p and the current m.
+ func acquirep(_p_ *p) {
+ _g_ := getg()
+
+ if _g_.m.p != nil || _g_.m.mcache != nil {
+ gothrow("acquirep: already in go")
+ }
+ if _p_.m != nil || _p_.status != _Pidle {
+ id := int32(0)
+ if _p_.m != nil {
+ id = _p_.m.id
+ }
+ print("acquirep: p->m=", _p_.m, "(", id, ") p->status=", _p_.status, "\n")
+ gothrow("acquirep: invalid p state")
+ }
+ _g_.m.mcache = _p_.mcache
+ _g_.m.p = _p_
+ _p_.m = _g_.m
+ _p_.status = _Prunning
+ }
+
+ // Disassociate p and the current m.
+ func releasep() *p {
+ _g_ := getg()
+
+ if _g_.m.p == nil || _g_.m.mcache == nil {
+ gothrow("releasep: invalid arg")
+ }
+ _p_ := _g_.m.p
+ if _p_.m != _g_.m || _p_.mcache != _g_.m.mcache || _p_.status != _Prunning {
+ print("releasep: m=", _g_.m, " m->p=", _g_.m.p, " p->m=", _p_.m, " m->mcache=", _g_.m.mcache, " p->mcache=", _p_.mcache, " p->status=", _p_.status, "\n")
+ gothrow("releasep: invalid p state")
+ }
+ _g_.m.p = nil
+ _g_.m.mcache = nil
+ _p_.m = nil
+ _p_.status = _Pidle
+ return _p_
+ }
+
+ func incidlelocked(v int32) {
+ lock(&sched.lock)
+ sched.nmidlelocked += v
+ if v > 0 {
+ checkdead()
+ }
+ unlock(&sched.lock)
+ }
+
+ // Check for deadlock situation.
+ // The check is based on the number of running M's; if that is 0, we have a deadlock.
+ func checkdead() {
+ // If we are dying because of a signal caught on an already idle thread,
+ // freezetheworld will cause all running threads to block.
+ // And runtime will essentially enter into deadlock state,
+ // except that there is a thread that will call exit soon.
+ if panicking > 0 {
+ return
+ }
+
+ // -1 for sysmon
+ run := sched.mcount - sched.nmidle - sched.nmidlelocked - 1
+ if run > 0 {
+ return
+ }
+ if run < 0 {
+ print("runtime: checkdead: nmidle=", sched.nmidle, " nmidlelocked=", sched.nmidlelocked, " mcount=", sched.mcount, "\n")
+ gothrow("checkdead: inconsistent counts")
+ }
+
+ grunning := 0
+ lock(&allglock)
+ for i := 0; i < len(allgs); i++ {
+ gp := allgs[i]
+ if gp.issystem {
+ continue
+ }
+ s := readgstatus(gp)
+ switch s &^ _Gscan {
+ case _Gwaiting:
+ grunning++
+ case _Grunnable,
+ _Grunning,
+ _Gsyscall:
+ unlock(&allglock)
+ print("runtime: checkdead: find g ", gp.goid, " in status ", s, "\n")
+ gothrow("checkdead: runnable g")
+ }
+ }
+ unlock(&allglock)
+ if grunning == 0 { // possible if main goroutine calls runtime·Goexit()
+ gothrow("no goroutines (main called runtime.Goexit) - deadlock!")
+ }
+
+ // Maybe jump time forward for playground.
+ gp := timejump()
+ if gp != nil {
+ casgstatus(gp, _Gwaiting, _Grunnable)
+ globrunqput(gp)
+ _p_ := pidleget()
+ if _p_ == nil {
+ gothrow("checkdead: no p for timer")
+ }
+ mp := mget()
+ if mp == nil {
+ _newm(nil, _p_)
+ } else {
+ mp.nextp = _p_
+ notewakeup(&mp.park)
+ }
+ return
+ }
+
+ getg().m.throwing = -1 // do not dump full stacks
+ gothrow("all goroutines are asleep - deadlock!")
+ }
+
+ func sysmon() {
+ // If we go two minutes without a garbage collection, force one to run.
+ forcegcperiod := int64(2 * 60 * 1e9)
+
+ // If a heap span goes unused for 5 minutes after a garbage collection,
+ // we hand it back to the operating system.
+ scavengelimit := int64(5 * 60 * 1e9)
+
+ if debug.scavenge > 0 {
+ // Scavenge-a-lot for testing.
+ forcegcperiod = 10 * 1e6
+ scavengelimit = 20 * 1e6
+ }
+
+ lastscavenge := nanotime()
+ nscavenge := 0
+
+ // Make wake-up period small enough for the sampling to be correct.
+ maxsleep := forcegcperiod / 2
+ if scavengelimit < forcegcperiod {
+ maxsleep = scavengelimit / 2
+ }
+
+ lasttrace := int64(0)
+ idle := 0 // how many cycles in succession we have not woken anybody up
+ delay := uint32(0)
+ for {
+ if idle == 0 { // start with 20us sleep...
+ delay = 20
+ } else if idle > 50 { // start doubling the sleep after 1ms...
+ delay *= 2
+ }
+ if delay > 10*1000 { // up to 10ms
+ delay = 10 * 1000
+ }
+ usleep(delay)
+ if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomicload(&sched.npidle) == uint32(gomaxprocs)) { // TODO: fast atomic
+ lock(&sched.lock)
+ if atomicload(&sched.gcwaiting) != 0 || atomicload(&sched.npidle) == uint32(gomaxprocs) {
+ atomicstore(&sched.sysmonwait, 1)
+ unlock(&sched.lock)
+ notetsleep(&sched.sysmonnote, maxsleep)
+ lock(&sched.lock)
+ atomicstore(&sched.sysmonwait, 0)
+ noteclear(&sched.sysmonnote)
+ idle = 0
+ delay = 20
+ }
+ unlock(&sched.lock)
+ }
+ // poll network if not polled for more than 10ms
+ lastpoll := int64(atomicload64(&sched.lastpoll))
+ now := nanotime()
+ unixnow := unixnanotime()
+ if lastpoll != 0 && lastpoll+10*1000*1000 < now {
+ cas64(&sched.lastpoll, uint64(lastpoll), uint64(now))
+ gp := netpoll(false) // non-blocking - returns list of goroutines
+ if gp != nil {
+ // Need to decrement number of idle locked M's
+ // (pretending that one more is running) before injectglist.
+ // Otherwise it can lead to the following situation:
+ // injectglist grabs all P's but before it starts M's to run the P's,
+ // another M returns from syscall, finishes running its G,
+ // observes that there is no work to do and no other running M's
+ // and reports deadlock.
+ incidlelocked(-1)
+ injectglist(gp)
+ incidlelocked(1)
+ }
+ }
+ // retake P's blocked in syscalls
+ // and preempt long running G's
+ if retake(now) != 0 {
+ idle = 0
+ } else {
+ idle++
+ }
+ // check if we need to force a GC
+ lastgc := int64(atomicload64(&memstats.last_gc))
+ if lastgc != 0 && unixnow-lastgc > forcegcperiod && atomicload(&forcegc.idle) != 0 {
+ lock(&forcegc.lock)
+ forcegc.idle = 0
+ forcegc.g.schedlink = nil
+ injectglist(forcegc.g)
+ unlock(&forcegc.lock)
+ }
+ // scavenge heap once in a while
+ if lastscavenge+scavengelimit/2 < now {
+ mHeap_Scavenge(int32(nscavenge), uint64(now), uint64(scavengelimit))
+ lastscavenge = now
+ nscavenge++
+ }
+ if debug.schedtrace > 0 && lasttrace+int64(debug.schedtrace*1000000) <= now {
+ lasttrace = now
+ schedtrace(debug.scheddetail > 0)
+ }
+ }
+ }
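+
+ // The sleep schedule above starts at 20us, holds that for the first
+ // 50 idle cycles (about 1ms of total sleep), then doubles the delay
+ // on each idle cycle (40us, 80us, ...), capped at 10ms, so a fully
+ // idle process wakes sysmon at most ~100 times per second.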
+
+ var pdesc [_MaxGomaxprocs]struct {
+ schedtick uint32
+ schedwhen int64
+ syscalltick uint32
+ syscallwhen int64
+ }
+
+ func retake(now int64) uint32 {
+ n := 0
+ for i := int32(0); i < gomaxprocs; i++ {
+ _p_ := allp[i]
+ if _p_ == nil {
+ continue
+ }
+ pd := &pdesc[i]
+ s := _p_.status
+ if s == _Psyscall {
+ // Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us).
+ t := int64(_p_.syscalltick)
+ if int64(pd.syscalltick) != t {
+ pd.syscalltick = uint32(t)
+ pd.syscallwhen = now
+ continue
+ }
+ // On the one hand we don't want to retake Ps if there is no other work to do,
+ // but on the other hand we want to retake them eventually
+ // because they can prevent the sysmon thread from deep sleep.
+ if _p_.runqhead == _p_.runqtail && atomicload(&sched.nmspinning)+atomicload(&sched.npidle) > 0 && pd.syscallwhen+10*1000*1000 > now {
+ continue
+ }
+ // Need to decrement number of idle locked M's
+ // (pretending that one more is running) before the CAS.
+ // Otherwise the M from which we retake can exit the syscall,
+ // increment nmidle and report deadlock.
+ incidlelocked(-1)
+ if cas(&_p_.status, s, _Pidle) {
+ n++
+ handoffp(_p_)
+ }
+ incidlelocked(1)
+ } else if s == _Prunning {
+ // Preempt G if it's running for more than 10ms.
+ t := int64(_p_.schedtick)
+ if int64(pd.schedtick) != t {
+ pd.schedtick = uint32(t)
+ pd.schedwhen = now
+ continue
+ }
+ if pd.schedwhen+10*1000*1000 > now {
+ continue
+ }
+ preemptone(_p_)
+ }
+ }
+ return uint32(n)
+ }
+
+ // Tell all goroutines that they have been preempted and they should stop.
+ // This function is purely best-effort. It can fail to inform a goroutine if a
+ // processor just started running it.
+ // No locks need to be held.
+ // Returns true if preemption request was issued to at least one goroutine.
+ func preemptall() bool {
+ res := false
+ for i := int32(0); i < gomaxprocs; i++ {
+ _p_ := allp[i]
+ if _p_ == nil || _p_.status != _Prunning {
+ continue
+ }
+ if preemptone(_p_) {
+ res = true
+ }
+ }
+ return res
+ }
+
+ // Tell the goroutine running on processor P to stop.
+ // This function is purely best-effort. It can incorrectly fail to inform the
+ // goroutine. It can inform the wrong goroutine. Even if it informs the
+ // correct goroutine, that goroutine might ignore the request if it is
+ // simultaneously executing newstack.
+ // No lock needs to be held.
+ // Returns true if preemption request was issued.
+ // The actual preemption will happen at some point in the future
+ // and will be indicated by the gp->status no longer being
+ // Grunning.
+ func preemptone(_p_ *p) bool {
+ mp := _p_.m
+ if mp == nil || mp == getg().m {
+ return false
+ }
+ gp := mp.curg
+ if gp == nil || gp == mp.g0 {
+ return false
+ }
+
+ gp.preempt = true
+
+ // Every call in a goroutine checks for stack overflow by
+ // comparing the current stack pointer to gp->stackguard0.
+ // Setting gp->stackguard0 to StackPreempt folds
+ // preemption into the normal stack overflow check.
+ gp.stackguard0 = stackPreempt
+ return true
+ }
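+
+ // The request lands at the next stack check: every splittable
+ // function prologue does, roughly,
+ //
+ // if sp < g.stackguard0 { morestack() }
+ //
+ // and stackPreempt is larger than any real stack pointer, so the
+ // next prologue the goroutine executes calls morestack, which parks
+ // it instead of growing the stack.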
+
+ var starttime int64
+
+ func schedtrace(detailed bool) {
+ now := nanotime()
+ if starttime == 0 {
+ starttime = now
+ }
+
+ lock(&sched.lock)
+ print("SCHED ", (now-starttime)/1e6, "ms: gomaxprocs=", gomaxprocs, " idleprocs=", sched.npidle, " threads=", sched.mcount, " spinningthreads=", sched.nmspinning, " idlethreads=", sched.nmidle, " runqueue=", sched.runqsize)
+ if detailed {
+ print(" gcwaiting=", sched.gcwaiting, " nmidlelocked=", sched.nmidlelocked, " stopwait=", sched.stopwait, " sysmonwait=", sched.sysmonwait, "\n")
+ }
+ // We must be careful while reading data from P's, M's and G's.
+ // Even if we hold schedlock, most data can be changed concurrently.
+ // E.g. (p->m ? p->m->id : -1) can crash if p->m changes from non-nil to nil.
+ for i := int32(0); i < gomaxprocs; i++ {
+ _p_ := allp[i]
+ if _p_ == nil {
+ continue
+ }
+ mp := _p_.m
+ h := atomicload(&_p_.runqhead)
+ t := atomicload(&_p_.runqtail)
+ if detailed {
+ id := int32(-1)
+ if mp != nil {
+ id = mp.id
+ }
+ print(" P", i, ": status=", _p_.status, " schedtick=", _p_.schedtick, " syscalltick=", _p_.syscalltick, " m=", id, " runqsize=", t-h, " gfreecnt=", _p_.gfreecnt, "\n")
+ } else {
+ // In non-detailed mode format lengths of per-P run queues as:
+ // [len1 len2 len3 len4]
+ print(" ")
+ if i == 0 {
+ print("[")
+ }
+ print(t - h)
+ if i == gomaxprocs-1 {
+ print("]\n")
+ }
+ }
+ }
+
+ if !detailed {
+ unlock(&sched.lock)
+ return
+ }
+
+ for mp := allm; mp != nil; mp = mp.alllink {
+ _p_ := mp.p
+ gp := mp.curg
+ lockedg := mp.lockedg
+ id1 := int32(-1)
+ if _p_ != nil {
+ id1 = _p_.id
+ }
+ id2 := int64(-1)
+ if gp != nil {
+ id2 = gp.goid
+ }
+ id3 := int64(-1)
+ if lockedg != nil {
+ id3 = lockedg.goid
+ }
+ print(" M", mp.id, ": p=", id1, " curg=", id2, " mallocing=", mp.mallocing, " throwing=", mp.throwing, " gcing=", mp.gcing, ""+" locks=", mp.locks, " dying=", mp.dying, " helpgc=", mp.helpgc, " spinning=", mp.spinning, " blocked=", getg().m.blocked, " lockedg=", id3, "\n")
+ }
+
+ lock(&allglock)
+ for gi := 0; gi < len(allgs); gi++ {
+ gp := allgs[gi]
+ mp := gp.m
+ lockedm := gp.lockedm
+ id1 := int32(-1)
+ if mp != nil {
+ id1 = mp.id
+ }
+ id2 := int32(-1)
+ if lockedm != nil {
+ id2 = lockedm.id
+ }
+ print(" G", gp.goid, ": status=", readgstatus(gp), "(", gp.waitreason, ") m=", id1, " lockedm=", id2, "\n")
+ }
+ unlock(&allglock)
+ unlock(&sched.lock)
+ }
+
+ // Put mp on midle list.
+ // Sched must be locked.
+ func mput(mp *m) {
+ mp.schedlink = sched.midle
+ sched.midle = mp
+ sched.nmidle++
+ checkdead()
+ }
+
+ // Try to get an m from midle list.
+ // Sched must be locked.
+ func mget() *m {
+ mp := sched.midle
+ if mp != nil {
+ sched.midle = mp.schedlink
+ sched.nmidle--
+ }
+ return mp
+ }
+
+ // Put gp on the global runnable queue.
+ // Sched must be locked.
+ func globrunqput(gp *g) {
+ gp.schedlink = nil
+ if sched.runqtail != nil {
+ sched.runqtail.schedlink = gp
+ } else {
+ sched.runqhead = gp
+ }
+ sched.runqtail = gp
+ sched.runqsize++
+ }
+
+ // Put a batch of runnable goroutines on the global runnable queue.
+ // Sched must be locked.
+ func globrunqputbatch(ghead *g, gtail *g, n int32) {
+ gtail.schedlink = nil
+ if sched.runqtail != nil {
+ sched.runqtail.schedlink = ghead
+ } else {
+ sched.runqhead = ghead
+ }
+ sched.runqtail = gtail
+ sched.runqsize += n
+ }
+
+ // Try to get a batch of G's from the global runnable queue.
+ // Sched must be locked.
+ func globrunqget(_p_ *p, max int32) *g {
+ if sched.runqsize == 0 {
+ return nil
+ }
+
+ n := sched.runqsize/gomaxprocs + 1
+ if n > sched.runqsize {
+ n = sched.runqsize
+ }
+ if max > 0 && n > max {
+ n = max
+ }
+ if n > int32(len(_p_.runq))/2 {
+ n = int32(len(_p_.runq)) / 2
+ }
+
+ sched.runqsize -= n
+ if sched.runqsize == 0 {
+ sched.runqtail = nil
+ }
+
+ gp := sched.runqhead
+ sched.runqhead = gp.schedlink
+ n--
+ for ; n > 0; n-- {
+ gp1 := sched.runqhead
+ sched.runqhead = gp1.schedlink
+ runqput(_p_, gp1)
+ }
+ return gp
+ }
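+
+ // For example, with runqsize=100 and gomaxprocs=4 the batch above is
+ // 100/4+1 = 26 G's, then clamped by max and by len(_p_.runq)/2, so
+ // one P can neither drain the global queue nor overflow its local one.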
+
+ // Put p on the _Pidle list.
+ // Sched must be locked.
+ func pidleput(_p_ *p) {
+ _p_.link = sched.pidle
+ sched.pidle = _p_
+ xadd(&sched.npidle, 1) // TODO: fast atomic
+ }
+
+ // Try to get a p from the _Pidle list.
+ // Sched must be locked.
+ func pidleget() *p {
+ _p_ := sched.pidle
+ if _p_ != nil {
+ sched.pidle = _p_.link
+ xadd(&sched.npidle, -1) // TODO: fast atomic
+ }
+ return _p_
+ }
+
+ // Try to put g on local runnable queue.
+ // If it's full, put onto global queue.
+ // Executed only by the owner P.
+ func runqput(_p_ *p, gp *g) {
+ retry:
+ h := atomicload(&_p_.runqhead) // load-acquire, synchronize with consumers
+ t := _p_.runqtail
+ if t-h < uint32(len(_p_.runq)) {
+ _p_.runq[t%uint32(len(_p_.runq))] = gp
+ atomicstore(&_p_.runqtail, t+1) // store-release, makes the item available for consumption
+ return
+ }
+ if runqputslow(_p_, gp, h, t) {
+ return
+ }
+ // the queue is not full, so the put above must succeed
+ goto retry
+ }
+
+ // Put g and a batch of work from local runnable queue on global queue.
+ // Executed only by the owner P.
+ func runqputslow(_p_ *p, gp *g, h, t uint32) bool {
+ var batch [len(_p_.runq)/2 + 1]*g
+
+ // First, grab a batch from local queue.
+ n := t - h
+ n = n / 2
+ if n != uint32(len(_p_.runq)/2) {
+ gothrow("runqputslow: queue is not full")
+ }
+ for i := uint32(0); i < n; i++ {
+ batch[i] = _p_.runq[(h+i)%uint32(len(_p_.runq))]
+ }
+ if !cas(&_p_.runqhead, h, h+n) { // cas-release, commits consume
+ return false
+ }
+ batch[n] = gp
+
+ // Link the goroutines.
+ for i := uint32(0); i < n; i++ {
+ batch[i].schedlink = batch[i+1]
+ }
+
+ // Now put the batch on global queue.
+ lock(&sched.lock)
+ globrunqputbatch(batch[0], batch[n], int32(n+1))
+ unlock(&sched.lock)
+ return true
+ }
+
+ // Get g from local runnable queue.
+ // Executed only by the owner P.
+ func runqget(_p_ *p) *g {
+ for {
+ h := atomicload(&_p_.runqhead) // load-acquire, synchronize with other consumers
+ t := _p_.runqtail
+ if t == h {
+ return nil
+ }
+ gp := _p_.runq[h%uint32(len(_p_.runq))]
+ if cas(&_p_.runqhead, h, h+1) { // cas-release, commits consume
+ return gp
+ }
+ }
+ }
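+
+ // runqput and runqget form a fixed-size single-producer ring: only
+ // the owner P stores runqtail, any P may CAS runqhead forward, and
+ // t-h is the queue length (the uint32 indices wrap harmlessly
+ // because slots are addressed modulo len(runq)). A toy sketch of
+ // the same discipline on hypothetical local state, illustrative only:
+ func ringSketch(gp *g) *g {
+ var ring [4]*g
+ var head, tail uint32
+ if tail-head < uint32(len(ring)) { // not full
+ ring[tail%uint32(len(ring))] = gp
+ atomicstore(&tail, tail+1) // store-release: publish the slot
+ }
+ h := atomicload(&head)
+ if h == atomicload(&tail) {
+ return nil // empty
+ }
+ gp2 := ring[h%uint32(len(ring))]
+ if cas(&head, h, h+1) { // cas-release: claim the slot
+ return gp2
+ }
+ return nil
+ }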
+
+ // Grabs a batch of goroutines from local runnable queue.
+ // batch array must be of size len(p->runq)/2. Returns number of grabbed goroutines.
+ // Can be executed by any P.
+ func runqgrab(_p_ *p, batch []*g) uint32 {
+ for {
+ h := atomicload(&_p_.runqhead) // load-acquire, synchronize with other consumers
+ t := atomicload(&_p_.runqtail) // load-acquire, synchronize with the producer
+ n := t - h
+ n = n - n/2
+ if n == 0 {
+ return 0
+ }
+ if n > uint32(len(_p_.runq)/2) { // read inconsistent h and t
+ continue
+ }
+ for i := uint32(0); i < n; i++ {
+ batch[i] = _p_.runq[(h+i)%uint32(len(_p_.runq))]
+ }
+ if cas(&_p_.runqhead, h, h+n) { // cas-release, commits consume
+ return n
+ }
+ }
+ }
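+
+ // n = n - n/2 above rounds the steal up: from 5 runnable G's the
+ // thief grabs 3 and leaves 2, and from a single G it takes that one,
+ // so even a victim with one G feeds an otherwise idle P.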
+
+ // Steal half of elements from local runnable queue of p2
+ // and put onto local runnable queue of p.
+ // Returns one of the stolen elements (or nil if failed).
+ func runqsteal(_p_, p2 *p) *g {
+ var batch [len(_p_.runq) / 2]*g
+
+ n := runqgrab(p2, batch[:])
+ if n == 0 {
+ return nil
+ }
+ n--
+ gp := batch[n]
+ if n == 0 {
+ return gp
+ }
+ h := atomicload(&_p_.runqhead) // load-acquire, synchronize with consumers
+ t := _p_.runqtail
+ if t-h+n >= uint32(len(_p_.runq)) {
+ gothrow("runqsteal: runq overflow")
+ }
+ for i := uint32(0); i < n; i++ {
+ _p_.runq[(t+i)%uint32(len(_p_.runq))] = batch[i]
+ }
+ atomicstore(&_p_.runqtail, t+n) // store-release, makes the item available for consumption
+ return gp
+ }
+
+ func testSchedLocalQueue() {
+ _p_ := new(p)
+ gs := make([]g, len(_p_.runq))
+ for i := 0; i < len(_p_.runq); i++ {
+ if runqget(_p_) != nil {
+ gothrow("runq is not empty initially")
+ }
+ for j := 0; j < i; j++ {
+ runqput(_p_, &gs[i])
+ }
+ for j := 0; j < i; j++ {
+ if runqget(_p_) != &gs[i] {
+ print("bad element at iter ", i, "/", j, "\n")
+ gothrow("bad element")
+ }
+ }
+ if runqget(_p_) != nil {
+ gothrow("runq is not empty afterwards")
+ }
+ }
+ }
+
+ func testSchedLocalQueueSteal() {
+ p1 := new(p)
+ p2 := new(p)
+ gs := make([]g, len(p1.runq))
+ for i := 0; i < len(p1.runq); i++ {
+ for j := 0; j < i; j++ {
+ gs[j].sig = 0
+ runqput(p1, &gs[j])
+ }
+ gp := runqsteal(p2, p1)
+ s := 0
+ if gp != nil {
+ s++
+ gp.sig++
+ }
+ for {
+ gp = runqget(p2)
+ if gp == nil {
+ break
+ }
+ s++
+ gp.sig++
+ }
+ for {
+ gp = runqget(p1)
+ if gp == nil {
+ break
+ }
+ gp.sig++
+ }
+ for j := 0; j < i; j++ {
+ if gs[j].sig != 1 {
+ print("bad element ", j, "(", gs[j].sig, ") at iter ", i, "\n")
+ gothrow("bad element")
+ }
+ }
+ if s != i/2 && s != i/2+1 {
+ print("bad steal ", s, ", want ", i/2, " or ", i/2+1, ", iter ", i, "\n")
+ gothrow("bad steal")
+ }
+ }
+ }
+
+ func setMaxThreads(in int) (out int) {
+ lock(&sched.lock)
+ out = int(sched.maxmcount)
+ sched.maxmcount = int32(in)
+ checkmcount()
+ unlock(&sched.lock)
+ return
+ }
+
+ var goexperiment string = "GOEXPERIMENT" // TODO: defined in zaexperiment.h
+
+ func haveexperiment(name string) bool {
+ x := goexperiment
+ for x != "" {
+ xname := ""
+ i := index(x, ",")
+ if i < 0 {
+ xname, x = x, ""
+ } else {
+ xname, x = x[:i], x[i+1:]
+ }
+ if xname == name {
+ return true
+ }
+ }
+ return false
+ }
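
The hand-rolled scan exists because the runtime cannot depend on the strings package. In ordinary Go the same membership test is a few lines; haveExperiment below is a hypothetical stand-in showing the equivalent behavior.

	package main

	import (
		"fmt"
		"strings"
	)

	// haveExperiment reports whether name appears in the comma-separated list.
	func haveExperiment(goexperiment, name string) bool {
		for _, x := range strings.Split(goexperiment, ",") {
			if x == name {
				return true
			}
		}
		return false
	}

	func main() {
		fmt.Println(haveExperiment("framepointer,fieldtrack", "fieldtrack")) // true
		fmt.Println(haveExperiment("framepointer,fieldtrack", "ssa"))        // false
	}
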
+
+ //go:nosplit
+ func sync_procPin() int {
+ _g_ := getg()
+ mp := _g_.m
+
+ mp.locks++
+ return int(mp.p.id)
+ }
+
+ //go:nosplit
+ func sync_procUnpin() {
+ _g_ := getg()
+ _g_.m.locks--
+ }
LEAL 12(SP), BX
MOVL AX, 0(SP)
MOVL BX, 4(SP)
-- CALL runtime·linux_setup_vdso(SB)
CALL main(SB)
INT $3
--- /dev/null
- next *lfnode
+ // Copyright 2009 The Go Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style
+ // license that can be found in the LICENSE file.
+
+ package runtime
+
+ import "unsafe"
+
+ /*
+ * defined constants
+ */
+ const (
+ // G status
+ //
+ // If you add to this list, add to the list
+ // of "okay during garbage collection" status
+ // in mgc0.c too.
+ _Gidle = iota // 0
+ _Grunnable // 1 runnable and on a run queue
+ _Grunning // 2
+ _Gsyscall // 3
+ _Gwaiting // 4
+ _Gmoribund_unused // 5 currently unused, but hardcoded in gdb scripts
+ _Gdead // 6
+ _Genqueue // 7 Only the Gscanenqueue form is used.
+ _Gcopystack // 8 in this state when newstack is moving the stack
+ // the following encode that the GC is scanning the stack and what to do when it is done
+ _Gscan = 0x1000 // atomicstatus&~Gscan = the non-scan state,
+ // _Gscanidle = _Gscan + _Gidle, // Not used. Gidle only used with newly malloced gs
+ _Gscanrunnable = _Gscan + _Grunnable // 0x1001 When scanning completes, make it Grunnable (it is already on a run queue)
+ _Gscanrunning = _Gscan + _Grunning // 0x1002 Used to tell preemption newstack routine to scan preempted stack.
+ _Gscansyscall = _Gscan + _Gsyscall // 0x1003 When scanning completes make it Gsyscall
+ _Gscanwaiting = _Gscan + _Gwaiting // 0x1004 When scanning completes make it Gwaiting
+ // _Gscanmoribund_unused, // not possible
+ // _Gscandead, // not possible
+ _Gscanenqueue = _Gscan + _Genqueue // When scanning completes make it Grunnable and put on runqueue
+ )
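
The scan variants are simply the base status with the _Gscan bit OR'ed in, so the base state can be recovered by masking the bit off. A small sketch of that encoding, using copies of the constants above:

	package main

	import "fmt"

	const (
		_Grunnable     = 1
		_Gwaiting      = 4
		_Gscan         = 0x1000
		_Gscanrunnable = _Gscan + _Grunnable // 0x1001
	)

	func main() {
		status := uint32(_Gscanrunnable)
		base := status &^ _Gscan // strip the scan bit: back to _Grunnable
		fmt.Printf("status=%#x base=%#x scanning=%v\n",
			status, base, status&_Gscan != 0)
		// Output: status=0x1001 base=0x1 scanning=true
	}
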
+
+ const (
+ // P status
+ _Pidle = iota
+ _Prunning
+ _Psyscall
+ _Pgcstop
+ _Pdead
+ )
+
+ // XXX inserting below here
+
+ type mutex struct {
+ // Futex-based impl treats it as uint32 key,
+ // while sema-based impl as M* waitm.
+ // Used to be a union, but unions break precise GC.
+ key uintptr
+ }
+
+ type note struct {
+ // Futex-based impl treats it as uint32 key,
+ // while sema-based impl as M* waitm.
+ // Used to be a union, but unions break precise GC.
+ key uintptr
+ }
+
+ type _string struct {
+ str *byte
+ len int
+ }
+
+ type funcval struct {
+ fn uintptr
+ // variable-size, fn-specific data here
+ }
+
+ type iface struct {
+ tab *itab
+ data unsafe.Pointer
+ }
+
+ type eface struct {
+ _type *_type
+ data unsafe.Pointer
+ }
+
+ type slice struct {
+ array *byte // actual data
+ len uint // number of elements
+ cap uint // allocated number of elements
+ }
+
+ type gobuf struct {
+ // The offsets of sp, pc, and g are known to (hard-coded in) libmach.
+ sp uintptr
+ pc uintptr
+ g *g
+ ctxt unsafe.Pointer // this has to be a pointer so that gc scans it
+ ret uintreg
+ lr uintptr
+ }
+
+ // Known to compiler.
+ // Changes here must also be made in src/cmd/gc/select.c's selecttype.
+ type sudog struct {
+ g *g
+ selectdone *uint32
+ next *sudog
+ prev *sudog
+ elem unsafe.Pointer // data element
+ releasetime int64
+ nrelease int32 // -1 for acquire
+ waitlink *sudog // g.waiting list
+ }
+
+ type gcstats struct {
+ // the struct must consist of only uint64's,
+ // because it is cast to uint64[].
+ nhandoff uint64
+ nhandoffcnt uint64
+ nprocyield uint64
+ nosyield uint64
+ nsleep uint64
+ }
+
+ type libcall struct {
+ fn uintptr
+ n uintptr // number of parameters
+ args uintptr // parameters
+ r1 uintptr // return values
+ r2 uintptr
+ err uintptr // error number
+ }
+
+ // describes how to handle callback
+ type wincallbackcontext struct {
+ gobody unsafe.Pointer // go function to call
+ argsize uintptr // callback arguments size (in bytes)
+ restorestack uintptr // adjust stack on return by (in bytes) (386 only)
+ cleanstack bool
+ }
+
+ // Stack describes a Go execution stack.
+ // The bounds of the stack are exactly [lo, hi),
+ // with no implicit data structures on either side.
+ type stack struct {
+ lo uintptr
+ hi uintptr
+ }
+
+ type g struct {
+ // Stack parameters.
+ // stack describes the actual stack memory: [stack.lo, stack.hi).
+ // stackguard0 is the stack pointer compared in the Go stack growth prologue.
+ // It is stack.lo+StackGuard normally, but can be StackPreempt to trigger a preemption.
+ // stackguard1 is the stack pointer compared in the C stack growth prologue.
+ // It is stack.lo+StackGuard on g0 and gsignal stacks.
+ // It is ~0 on other goroutine stacks, to trigger a call to morestackc (and crash).
+ stack stack // offset known to runtime/cgo
+ stackguard0 uintptr // offset known to liblink
+ stackguard1 uintptr // offset known to liblink
+
+ _panic *_panic // innermost panic - offset known to liblink
+ _defer *_defer // innermost defer
+ sched gobuf
+ syscallsp uintptr // if status==gsyscall, syscallsp = sched.sp to use during gc
+ syscallpc uintptr // if status==gsyscall, syscallpc = sched.pc to use during gc
+ param unsafe.Pointer // passed parameter on wakeup
+ atomicstatus uint32
+ goid int64
+ waitsince int64 // approx time when the g became blocked
+ waitreason string // if status==gwaiting
+ schedlink *g
+ issystem bool // do not output in stack dump, ignore in deadlock detector
+ preempt bool // preemption signal, duplicates stackguard0 = stackpreempt
+ paniconfault bool // panic (instead of crash) on unexpected fault address
+ preemptscan bool // preempted g does scan for gc
+ gcworkdone bool // debug: cleared at beginning of gc work phase cycle, set by gcphasework, tested at end of cycle
+ throwsplit bool // must not split stack
+ raceignore int8 // ignore race detection events
+ m *m // for debuggers, but offset not hard-coded
+ lockedm *m
+ sig uint32
+ writebuf []byte
+ sigcode0 uintptr
+ sigcode1 uintptr
+ sigpc uintptr
+ gopc uintptr // pc of go statement that created this goroutine
+ racectx uintptr
+ waiting *sudog // sudog structures this g is waiting on (that have a valid elem ptr)
+ end [0]byte
+ }
+
+ type mts struct {
+ tv_sec int64
+ tv_nsec int64
+ }
+
+ type mscratch struct {
+ v [6]uintptr
+ }
+
+ type m struct {
+ g0 *g // goroutine with scheduling stack
+ morebuf gobuf // gobuf arg to morestack
+
+ // Fields not known to debuggers.
+ procid uint64 // for debuggers, but offset not hard-coded
+ gsignal *g // signal-handling g
+ tls [4]uintptr // thread-local storage (for x86 extern register)
+ mstartfn unsafe.Pointer // todo go func()
+ curg *g // current running goroutine
+ caughtsig *g // goroutine running during fatal signal
+ p *p // attached p for executing go code (nil if not executing go code)
+ nextp *p
+ id int32
+ mallocing int32
+ throwing int32
+ gcing int32
+ locks int32
+ softfloat int32
+ dying int32
+ profilehz int32
+ helpgc int32
+ spinning bool // m is out of work and is actively looking for work
+ blocked bool // m is blocked on a note
++ inwb bool // m is executing a write barrier
++ printlock int8
+ fastrand uint32
+ ncgocall uint64 // number of cgo calls in total
+ ncgo int32 // number of cgo calls currently in progress
+ cgomal *cgomal
+ park note
+ alllink *m // on allm
+ schedlink *m
+ machport uint32 // return address for mach ipc (os x)
+ mcache *mcache
+ lockedg *g
+ createstack [32]uintptr // stack that created this thread.
+ freglo [16]uint32 // d[i] lsb and f[i]
+ freghi [16]uint32 // d[i] msb and f[i+16]
+ fflag uint32 // floating point compare flags
+ locked uint32 // tracking for lockosthread
+ nextwaitm *m // next m waiting for lock
+ waitsema uintptr // semaphore for parking on locks
+ waitsemacount uint32
+ waitsemalock uint32
+ gcstats gcstats
+ needextram bool
+ traceback uint8
+ waitunlockf unsafe.Pointer // todo go func(*g, unsafe.pointer) bool
+ waitlock unsafe.Pointer
+ //#ifdef GOOS_windows
+ thread uintptr // thread handle
+ // these are here because they are too large to be on the stack
+ // of low-level NOSPLIT functions.
+ libcall libcall
+ libcallpc uintptr // for cpu profiler
+ libcallsp uintptr
+ libcallg *g
+ //#endif
+ //#ifdef GOOS_solaris
+ perrno *int32 // pointer to tls errno
+ // these are here because they are too large to be on the stack
+ // of low-level NOSPLIT functions.
+ //LibCall libcall;
+ ts mts
+ scratch mscratch
+ //#endif
+ //#ifdef GOOS_plan9
+ notesig *int8
+ errstr *byte
+ //#endif
+ end [0]byte
+ }
+
+ type p struct {
+ lock mutex
+
+ id int32
+ status uint32 // one of pidle/prunning/...
+ link *p
+ schedtick uint32 // incremented on every scheduler call
+ syscalltick uint32 // incremented on every system call
+ m *m // back-link to associated m (nil if idle)
+ mcache *mcache
+ deferpool [5]*_defer // pool of available defer structs of different sizes (see panic.c)
+
+ // Cache of goroutine ids, amortizes accesses to runtime·sched.goidgen.
+ goidcache uint64
+ goidcacheend uint64
+
+ // Queue of runnable goroutines.
+ runqhead uint32
+ runqtail uint32
+ runq [256]*g
+
+ // Available G's (status == Gdead)
+ gfree *g
+ gfreecnt int32
+
+ pad [64]byte
+ }
+
+ const (
+ // The max value of GOMAXPROCS.
+ // There are no fundamental restrictions on the value.
+ _MaxGomaxprocs = 1 << 8
+ )
+
+ type schedt struct {
+ lock mutex
+
+ goidgen uint64
+
+ midle *m // idle m's waiting for work
+ nmidle int32 // number of idle m's waiting for work
+ nmidlelocked int32 // number of locked m's waiting for work
+ mcount int32 // number of m's that have been created
+ maxmcount int32 // maximum number of m's allowed (or die)
+
+ pidle *p // idle p's
+ npidle uint32
+ nmspinning uint32
+
+ // Global runnable queue.
+ runqhead *g
+ runqtail *g
+ runqsize int32
+
+ // Global cache of dead G's.
+ gflock mutex
+ gfree *g
+ ngfree int32
+
+ gcwaiting uint32 // gc is waiting to run
+ stopwait int32
+ stopnote note
+ sysmonwait uint32
+ sysmonnote note
+ lastpoll uint64
+
+ profilehz int32 // cpu profiling rate
+ }
+
+ // The m->locked word holds two pieces of state counting active calls to LockOSThread/lockOSThread.
+ // The low bit (LockExternal) is a boolean reporting whether any LockOSThread call is active.
+ // External locks are not recursive; a second lock is silently ignored.
+ // The upper bits of m->locked record the nesting depth of calls to lockOSThread
+ // (counting up by LockInternal), popped by unlockOSThread (counting down by LockInternal).
+ // Internal locks can be recursive. For instance, a lock for cgo can occur while the main
+ // goroutine is holding the lock during the initialization phase.
+ const (
+ _LockExternal = 1
+ _LockInternal = 2
+ )
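
The encoding packs a boolean in the low bit and a nesting counter in the remaining bits of a single word. A small sketch of how the external flag and internal count coexist (locked is a stand-in for m.locked):

	package main

	import "fmt"

	const (
		_LockExternal = 1
		_LockInternal = 2
	)

	func main() {
		var locked uint32

		locked |= _LockExternal // LockOSThread: set the flag (not recursive)
		locked += _LockInternal // lockOSThread: bump the nesting count
		locked += _LockInternal // nested internal lock

		fmt.Println("external:", locked&_LockExternal != 0)  // true
		fmt.Println("internal depth:", locked/_LockInternal) // 2; integer division drops the flag bit

		locked -= _LockInternal // unlockOSThread pops one level
		fmt.Println("still locked:", locked != 0) // true: flag and one level remain
	}
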
+
+ type sigtabtt struct {
+ flags int32
+ name *int8
+ }
+
+ const (
+ _SigNotify = 1 << 0 // let signal.Notify have signal, even if from kernel
+ _SigKill = 1 << 1 // if signal.Notify doesn't take it, exit quietly
+ _SigThrow = 1 << 2 // if signal.Notify doesn't take it, exit loudly
+ _SigPanic = 1 << 3 // if the signal is from the kernel, panic
+ _SigDefault = 1 << 4 // if the signal isn't explicitly requested, don't monitor it
+ _SigHandling = 1 << 5 // our signal handler is registered
+ _SigIgnored = 1 << 6 // the signal was ignored before we registered for it
+ _SigGoExit = 1 << 7 // cause all runtime procs to exit (only used on Plan 9).
+ )
+
+ // Layout of in-memory per-function information prepared by linker
+ // See http://golang.org/s/go12symtab.
+ // Keep in sync with linker and with ../../libmach/sym.c
+ // and with package debug/gosym and with symtab.go in package runtime.
+ type _func struct {
+ entry uintptr // start pc
+ nameoff int32 // function name
+
+ args int32 // in/out args size
+ frame int32 // legacy frame size; use pcsp if possible
+
+ pcsp int32
+ pcfile int32
+ pcln int32
+ npcdata int32
+ nfuncdata int32
+ }
+
+ // layout of Itab known to compilers
+ // allocated in non-garbage-collected memory
+ type itab struct {
+ inter *interfacetype
+ _type *_type
+ link *itab
+ bad int32
+ unused int32
+ fun [0]uintptr
+ }
+
+ const (
+ // TODO: Generate in cmd/dist.
+ _NaCl = 0
+ _Windows = 0
+ _Solaris = 0
+ _Plan9 = 0
+ )
+
+ // Lock-free stack node.
++ // Also known to export_test.go.
+ type lfnode struct {
- _GCoff = iota // stop and start nop
- _GCquiesce // stop and start nop
- _GCstw // stop the ps nop
- _GCmark // scan the stacks and start no white to black
- _GCsweep // stop and start nop
++ next uint64
+ pushcnt uintptr
+ }
+
+ // Parallel for descriptor.
+ type parfor struct {
+ body unsafe.Pointer // go func(*parfor, uint32), executed for each element
+ done uint32 // number of idle threads
+ nthr uint32 // total number of threads
+ nthrmax uint32 // maximum number of threads
+ thrseq uint32 // thread id sequencer
+ cnt uint32 // iteration space [0, cnt)
+ ctx unsafe.Pointer // arbitrary user context
+ wait bool // if true, wait while all threads finish processing,
+ // otherwise parfor may return while other threads are still working
+ thr *parforthread // array of thread descriptors
+ pad uint32 // to align parforthread.pos for 64-bit atomic operations
+ // stats
+ nsteal uint64
+ nstealcnt uint64
+ nprocyield uint64
+ nosyield uint64
+ nsleep uint64
+ }
+
+ // Track memory allocated by code not written in Go during a cgo call,
+ // so that the garbage collector can see them.
+ type cgomal struct {
+ next *cgomal
+ alloc unsafe.Pointer
+ }
+
+ // Holds variables parsed from GODEBUG env var.
+ type debugvars struct {
+ allocfreetrace int32
+ efence int32
+ gctrace int32
+ gcdead int32
+ scheddetail int32
+ schedtrace int32
+ scavenge int32
+ }
+
+ // Indicates the write barrier and synchronization task to perform.
+ const (
++ _GCoff = iota // GC not running, write barrier disabled
++ _GCquiesce // unused state
++ _GCstw // unused state
++ _GCscan // GC collecting roots into workbufs, write barrier disabled
++ _GCmark // GC marking from workbufs, write barrier ENABLED
++ _GCmarktermination // GC mark termination: allocate black, P's help GC, write barrier ENABLED
++ _GCsweep // GC mark completed; sweeping in background, write barrier disabled
+ )
+
+ type forcegcstate struct {
+ lock mutex
+ g *g
+ idle uint32
+ }
+
+ var gcphase uint32
+
+ /*
+ * known to compiler
+ */
+ const (
+ _Structrnd = regSize
+ )
+
+ var startup_random_data *byte
+ var startup_random_data_len uint32
+
+ var invalidptr int32
+
+ const (
+ // hashinit wants this many random bytes
+ _HashRandomBytes = 32
+ )
+
+ /*
+ * deferred subroutine calls
+ */
+ type _defer struct {
+ siz int32
+ started bool
+ argp uintptr // where args were copied from
+ pc uintptr
+ fn *funcval
+ _panic *_panic // panic that is running defer
+ link *_defer
+ }
+
+ /*
+ * panics
+ */
+ type _panic struct {
+ argp unsafe.Pointer // pointer to arguments of deferred call run during panic; cannot move - known to liblink
+ arg interface{} // argument to panic
+ link *_panic // link to earlier panic
+ recovered bool // whether this panic is over
+ aborted bool // the panic was aborted
+ }
+
+ /*
+ * stack traces
+ */
+
+ type stkframe struct {
+ fn *_func // function being run
+ pc uintptr // program counter within fn
+ continpc uintptr // program counter where execution can continue, or 0 if not
+ lr uintptr // program counter at caller aka link register
+ sp uintptr // stack pointer at pc
+ fp uintptr // stack pointer at caller aka frame pointer
+ varp uintptr // top of local variables
+ argp uintptr // pointer to function arguments
+ arglen uintptr // number of bytes at argp
+ argmap *bitvector // force use of this argmap
+ }
+
+ const (
+ _TraceRuntimeFrames = 1 << 0 // include frames for internal runtime functions.
+ _TraceTrap = 1 << 1 // the initial PC, SP are from a trap, not a return PC from a call
+ )
+
+ const (
+ // The maximum number of frames we print for a traceback
+ _TracebackMaxFrames = 100
+ )
+
+ var (
+ emptystring string
+ allg **g
+ allglen uintptr
+ lastg *g
+ allm *m
+ allp [_MaxGomaxprocs + 1]*p
+ gomaxprocs int32
+ needextram uint32
+ panicking uint32
+ goos *int8
+ ncpu int32
+ iscgo bool
+ cpuid_ecx uint32
+ cpuid_edx uint32
+ debug debugvars
+ signote note
+ forcegc forcegcstate
+ sched schedt
+ newprocs int32
+ )
+
+ /*
+ * mutual exclusion locks. in the uncontended case,
+ * as fast as spin locks (just a few user-level instructions),
+ * but on the contention path they sleep in the kernel.
+ * a zeroed Mutex is unlocked (no need to initialize each lock).
+ */
+
+ /*
+ * sleep and wakeup on one-time events.
+ * before any calls to notesleep or notewakeup,
+ * must call noteclear to initialize the Note.
+ * then, exactly one thread can call notesleep
+ * and exactly one thread can call notewakeup (once).
+ * once notewakeup has been called, the notesleep
+ * will return. future notesleep will return immediately.
+ * subsequent noteclear must be called only after
+ * previous notesleep has returned, e.g. it's disallowed
+ * to call noteclear straight after notewakeup.
+ *
+ * notetsleep is like notesleep but wakes up after
+ * a given number of nanoseconds even if the event
+ * has not yet happened. if a goroutine uses notetsleep to
+ * wake up early, it must wait to call noteclear until it
+ * can be sure that no other goroutine is calling
+ * notewakeup.
+ *
+ * notesleep/notetsleep are generally called on g0,
+ * notetsleepg is similar to notetsleep but is called on user g.
+ */
+ // bool runtime·notetsleep(Note*, int64); // false - timeout
+ // bool runtime·notetsleepg(Note*, int64); // false - timeout
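
Outside the runtime, the one-time-event contract described above maps naturally onto a closed channel: notewakeup corresponds to close, and notesleep to a receive that unblocks once the close happens and returns immediately ever after. A sketch under that analogy (event is not a runtime type):

	package main

	import (
		"fmt"
		"time"
	)

	type event struct{ ch chan struct{} }

	func newEvent() *event   { return &event{ch: make(chan struct{})} } // noteclear
	func (e *event) wakeup() { close(e.ch) }                            // notewakeup: exactly once
	func (e *event) sleep()  { <-e.ch }                                 // notesleep: blocks until wakeup

	// tsleep is the notetsleep analogue: false means the timeout fired first.
	func (e *event) tsleep(d time.Duration) bool {
		select {
		case <-e.ch:
			return true
		case <-time.After(d):
			return false
		}
	}

	func main() {
		e := newEvent()
		go func() {
			time.Sleep(10 * time.Millisecond)
			e.wakeup()
		}()
		fmt.Println(e.tsleep(time.Second)) // true: woken before the timeout
		e.sleep()                          // returns immediately after wakeup
	}
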
+
+ /*
+ * Lock-free stack.
+ * Initialize uint64 head to 0, compare with 0 to test for emptiness.
+ * The stack does not keep pointers to nodes,
+ * so they can be garbage collected if there are no other pointers to nodes.
+ */
+
+ /*
+ * Parallel for over [0, n).
+ * body() is executed for each iteration.
+ * nthr - total number of worker threads.
+ * ctx - arbitrary user context.
+ * if wait=true, threads return from parfor() when all work is done;
+ * otherwise, threads can return while other threads are still finishing processing.
+ */
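
The contract is that body(i) runs exactly once for each i in [0, cnt), distributed across nthr workers. The runtime version adds work stealing and spin/yield statistics; the sketch below shows only the interface contract, using an atomic claim counter (parallelFor is illustrative, not the runtime's implementation):

	package main

	import (
		"fmt"
		"sync"
		"sync/atomic"
	)

	func parallelFor(nthr int, cnt uint32, body func(uint32)) {
		var next uint32 // next unclaimed iteration index
		var wg sync.WaitGroup
		for t := 0; t < nthr; t++ {
			wg.Add(1)
			go func() {
				defer wg.Done()
				for {
					i := atomic.AddUint32(&next, 1) - 1 // claim one iteration
					if i >= cnt {
						return
					}
					body(i)
				}
			}()
		}
		wg.Wait() // the wait=true behavior: return only once all work is done
	}

	func main() {
		var sum uint64
		parallelFor(4, 100, func(i uint32) {
			atomic.AddUint64(&sum, uint64(i))
		})
		fmt.Println(sum) // 0+1+...+99 = 4950
	}
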
+
+ // for mmap, we only pass the lower 32 bits of the file offset to the
+ // assembly routine; the higher bits (if required) should be provided
+ // by the assembly routine as 0.
--- /dev/null
- oldstatus := readgstatus(gp)
- oldstatus &^= _Gscan
- if oldstatus == _Gwaiting || oldstatus == _Grunnable {
- casgstatus(gp, oldstatus, _Gcopystack) // oldstatus is Gwaiting or Grunnable
- } else {
- gothrow("copystack: bad status, not Gwaiting or Grunnable")
- }
-
+ // Copyright 2013 The Go Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style
+ // license that can be found in the LICENSE file.
+
+ package runtime
+
+ import "unsafe"
+
+ const (
+ // StackDebug == 0: no logging
+ // == 1: logging of per-stack operations
+ // == 2: logging of per-frame operations
+ // == 3: logging of per-word updates
+ // == 4: logging of per-word reads
+ stackDebug = 0
+ stackFromSystem = 0 // allocate stacks from system memory instead of the heap
+ stackFaultOnFree = 0 // old stacks are mapped noaccess to detect use after free
+ stackPoisonCopy = 0 // fill stack that should not be accessed with garbage, to detect bad dereferences during copy
+
+ stackCache = 1
+ )
+
+ const (
+ uintptrMask = 1<<(8*ptrSize) - 1
+ poisonGC = uintptrMask & 0xf969696969696969
+ poisonStack = uintptrMask & 0x6868686868686868
+
+ // Goroutine preemption request.
+ // Stored into g->stackguard0 to cause split stack check failure.
+ // Must be greater than any real sp.
+ // 0xfffffade in hex.
+ stackPreempt = uintptrMask & -1314
+
+ // Thread is forking.
+ // Stored into g->stackguard0 to cause split stack check failure.
+ // Must be greater than any real sp.
+ stackFork = uintptrMask & -1234
+ )
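
Writing stackPreempt as uintptrMask & -1314 makes the same source correct for both 32- and 64-bit uintptrs: the negative constant has all high bits set, and the mask trims it to the word size. A quick check of the arithmetic, assuming a 64-bit build:

	package main

	import "fmt"

	func main() {
		const ptrSize = 8 // assume a 64-bit build for this demo
		const uintptrMask = 1<<(8*ptrSize) - 1
		const stackPreempt = uintptrMask & -1314

		v := uint64(stackPreempt)
		fmt.Printf("%#x\n", v)         // 0xfffffffffffffade
		fmt.Printf("%#x\n", uint32(v)) // 0xfffffade, matching the comment above
	}
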
+
+ // Global pool of spans that have free stacks.
+ // Stacks are assigned an order according to size.
+ // order = log_2(size/FixedStack)
+ // There is a free list for each order.
+ // TODO: one lock per order?
+ var stackpool [_NumStackOrders]mspan
+ var stackpoolmu mutex
+
+ var stackfreequeue stack
+
+ func stackinit() {
+ if _StackCacheSize&_PageMask != 0 {
+ gothrow("cache size must be a multiple of page size")
+ }
+ for i := range stackpool {
+ mSpanList_Init(&stackpool[i])
+ }
+ }
+
+ // Allocates a stack from the free pool. Must be called with
+ // stackpoolmu held.
+ func stackpoolalloc(order uint8) *mlink {
+ list := &stackpool[order]
+ s := list.next
+ if s == list {
+ // no free stacks. Allocate another span's worth.
+ s = mHeap_AllocStack(&mheap_, _StackCacheSize>>_PageShift)
+ if s == nil {
+ gothrow("out of memory")
+ }
+ if s.ref != 0 {
+ gothrow("bad ref")
+ }
+ if s.freelist != nil {
+ gothrow("bad freelist")
+ }
+ for i := uintptr(0); i < _StackCacheSize; i += _FixedStack << order {
+ x := (*mlink)(unsafe.Pointer(uintptr(s.start)<<_PageShift + i))
+ x.next = s.freelist
+ s.freelist = x
+ }
+ mSpanList_Insert(list, s)
+ }
+ x := s.freelist
+ if x == nil {
+ gothrow("span has no free stacks")
+ }
+ s.freelist = x.next
+ s.ref++
+ if s.freelist == nil {
+ // all stacks in s are allocated.
+ mSpanList_Remove(s)
+ }
+ return x
+ }
+
+ // Adds stack x to the free pool. Must be called with stackpoolmu held.
+ func stackpoolfree(x *mlink, order uint8) {
+ s := mHeap_Lookup(&mheap_, (unsafe.Pointer)(x))
+ if s.state != _MSpanStack {
+ gothrow("freeing stack not in a stack span")
+ }
+ if s.freelist == nil {
+ // s will now have a free stack
+ mSpanList_Insert(&stackpool[order], s)
+ }
+ x.next = s.freelist
+ s.freelist = x
+ s.ref--
+ if s.ref == 0 {
+ // span is completely free - return to heap
+ mSpanList_Remove(s)
+ s.freelist = nil
+ mHeap_FreeStack(&mheap_, s)
+ }
+ }
+
+ // stackcacherefill/stackcacherelease implement a global pool of stack segments.
+ // The pool is required to prevent unlimited growth of per-thread caches.
+ func stackcacherefill(c *mcache, order uint8) {
+ if stackDebug >= 1 {
+ print("stackcacherefill order=", order, "\n")
+ }
+
+ // Grab some stacks from the global cache.
+ // Grab half of the allowed capacity (to prevent thrashing).
+ var list *mlink
+ var size uintptr
+ lock(&stackpoolmu)
+ for size < _StackCacheSize/2 {
+ x := stackpoolalloc(order)
+ x.next = list
+ list = x
+ size += _FixedStack << order
+ }
+ unlock(&stackpoolmu)
+ c.stackcache[order].list = list
+ c.stackcache[order].size = size
+ }
+
+ func stackcacherelease(c *mcache, order uint8) {
+ if stackDebug >= 1 {
+ print("stackcacherelease order=", order, "\n")
+ }
+ x := c.stackcache[order].list
+ size := c.stackcache[order].size
+ lock(&stackpoolmu)
+ for size > _StackCacheSize/2 {
+ y := x.next
+ stackpoolfree(x, order)
+ x = y
+ size -= _FixedStack << order
+ }
+ unlock(&stackpoolmu)
+ c.stackcache[order].list = x
+ c.stackcache[order].size = size
+ }
+
+ func stackcache_clear(c *mcache) {
+ if stackDebug >= 1 {
+ print("stackcache clear\n")
+ }
+ lock(&stackpoolmu)
+ for order := uint8(0); order < _NumStackOrders; order++ {
+ x := c.stackcache[order].list
+ for x != nil {
+ y := x.next
+ stackpoolfree(x, order)
+ x = y
+ }
+ c.stackcache[order].list = nil
+ c.stackcache[order].size = 0
+ }
+ unlock(&stackpoolmu)
+ }
+
+ func stackalloc(n uint32) stack {
+ // Stackalloc must be called on the scheduler stack, so that we
+ // never try to grow the stack while stackalloc itself is running.
+ // Doing so would cause a deadlock (issue 1547).
+ thisg := getg()
+ if thisg != thisg.m.g0 {
+ gothrow("stackalloc not on scheduler stack")
+ }
+ if n&(n-1) != 0 {
+ gothrow("stack size not a power of 2")
+ }
+ if stackDebug >= 1 {
+ print("stackalloc ", n, "\n")
+ }
+
+ if debug.efence != 0 || stackFromSystem != 0 {
+ v := sysAlloc(round(uintptr(n), _PageSize), &memstats.stacks_sys)
+ if v == nil {
+ gothrow("out of memory (stackalloc)")
+ }
+ return stack{uintptr(v), uintptr(v) + uintptr(n)}
+ }
+
+ // Small stacks are allocated with a fixed-size free-list allocator.
+ // If we need a stack of a bigger size, we fall back on allocating
+ // a dedicated span.
+ var v unsafe.Pointer
+ if stackCache != 0 && n < _FixedStack<<_NumStackOrders && n < _StackCacheSize {
+ order := uint8(0)
+ n2 := n
+ for n2 > _FixedStack {
+ order++
+ n2 >>= 1
+ }
+ var x *mlink
+ c := thisg.m.mcache
+ if c == nil || thisg.m.gcing != 0 || thisg.m.helpgc != 0 {
+ // c == nil can happen in the guts of exitsyscall or
+ // procresize. Just get a stack from the global pool.
+ // Also don't touch stackcache during gc
+ // as it's flushed concurrently.
+ lock(&stackpoolmu)
+ x = stackpoolalloc(order)
+ unlock(&stackpoolmu)
+ } else {
+ x = c.stackcache[order].list
+ if x == nil {
+ stackcacherefill(c, order)
+ x = c.stackcache[order].list
+ }
+ c.stackcache[order].list = x.next
+ c.stackcache[order].size -= uintptr(n)
+ }
+ v = (unsafe.Pointer)(x)
+ } else {
+ s := mHeap_AllocStack(&mheap_, round(uintptr(n), _PageSize)>>_PageShift)
+ if s == nil {
+ gothrow("out of memory")
+ }
+ v = (unsafe.Pointer)(s.start << _PageShift)
+ }
+
+ if raceenabled {
+ racemalloc(v, uintptr(n))
+ }
+ if stackDebug >= 1 {
+ print(" allocated ", v, "\n")
+ }
+ return stack{uintptr(v), uintptr(v) + uintptr(n)}
+ }
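
The size-class loop in stackalloc (and again in stackfree) computes order = log2(n/_FixedStack) by repeated halving. A standalone check of that mapping, assuming _FixedStack = 2048 (the value varies by platform):

	package main

	import "fmt"

	const fixedStack = 2048 // assumed _FixedStack for this demo

	// stackOrder mirrors the halving loop: order = log2(n/fixedStack).
	func stackOrder(n uint32) uint8 {
		order := uint8(0)
		for n2 := n; n2 > fixedStack; n2 >>= 1 {
			order++
		}
		return order
	}

	func main() {
		for _, n := range []uint32{2048, 4096, 8192, 16384} {
			fmt.Printf("n=%5d order=%d\n", n, stackOrder(n))
		}
		// n= 2048 order=0
		// n= 4096 order=1
		// n= 8192 order=2
		// n=16384 order=3
	}
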
+
+ func stackfree(stk stack) {
+ gp := getg()
+ n := stk.hi - stk.lo
+ v := (unsafe.Pointer)(stk.lo)
+ if n&(n-1) != 0 {
+ gothrow("stack not a power of 2")
+ }
+ if stackDebug >= 1 {
+ println("stackfree", v, n)
+ memclr(v, n) // for testing, clobber stack data
+ }
+ if debug.efence != 0 || stackFromSystem != 0 {
+ if debug.efence != 0 || stackFaultOnFree != 0 {
+ sysFault(v, n)
+ } else {
+ sysFree(v, n, &memstats.stacks_sys)
+ }
+ return
+ }
+ if stackCache != 0 && n < _FixedStack<<_NumStackOrders && n < _StackCacheSize {
+ order := uint8(0)
+ n2 := n
+ for n2 > _FixedStack {
+ order++
+ n2 >>= 1
+ }
+ x := (*mlink)(v)
+ c := gp.m.mcache
+ if c == nil || gp.m.gcing != 0 || gp.m.helpgc != 0 {
+ lock(&stackpoolmu)
+ stackpoolfree(x, order)
+ unlock(&stackpoolmu)
+ } else {
+ if c.stackcache[order].size >= _StackCacheSize {
+ stackcacherelease(c, order)
+ }
+ x.next = c.stackcache[order].list
+ c.stackcache[order].list = x
+ c.stackcache[order].size += n
+ }
+ } else {
+ s := mHeap_Lookup(&mheap_, v)
+ if s.state != _MSpanStack {
+ println(hex(s.start<<_PageShift), v)
+ gothrow("bad span state")
+ }
+ mHeap_FreeStack(&mheap_, s)
+ }
+ }
+
+ var maxstacksize uintptr = 1 << 20 // enough until runtime.main sets it for real
+
+ var mapnames = []string{
+ _BitsDead: "---",
+ _BitsScalar: "scalar",
+ _BitsPointer: "ptr",
+ }
+
+ // Stack frame layout
+ //
+ // (x86)
+ // +------------------+
+ // | args from caller |
+ // +------------------+ <- frame->argp
+ // | return address |
+ // +------------------+ <- frame->varp
+ // | locals |
+ // +------------------+
+ // | args to callee |
+ // +------------------+ <- frame->sp
+ //
+ // (arm)
+ // +------------------+
+ // | args from caller |
+ // +------------------+ <- frame->argp
+ // | caller's retaddr |
+ // +------------------+ <- frame->varp
+ // | locals |
+ // +------------------+
+ // | args to callee |
+ // +------------------+
+ // | return address |
+ // +------------------+ <- frame->sp
+
+ type adjustinfo struct {
+ old stack
+ delta uintptr // ptr distance from old to new stack (newbase - oldbase)
+ }
+
+ // Adjustpointer checks whether *vpp is in the old stack described by adjinfo.
+ // If so, it rewrites *vpp to point into the new stack.
+ func adjustpointer(adjinfo *adjustinfo, vpp unsafe.Pointer) {
+ pp := (*unsafe.Pointer)(vpp)
+ p := *pp
+ if stackDebug >= 4 {
+ print(" ", pp, ":", p, "\n")
+ }
+ if adjinfo.old.lo <= uintptr(p) && uintptr(p) < adjinfo.old.hi {
+ *pp = add(p, adjinfo.delta)
+ if stackDebug >= 3 {
+ print(" adjust ptr ", pp, ":", p, " -> ", *pp, "\n")
+ }
+ }
+ }
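
The relocation step is plain interval arithmetic: a value that points into [old.lo, old.hi) moves by delta = new.hi - old.hi, and anything else is left alone. A tiny sketch with integers standing in for stack addresses:

	package main

	import "fmt"

	type span struct{ lo, hi uintptr }

	// adjust relocates p if it points into old, mirroring adjustpointer.
	func adjust(p uintptr, old span, delta uintptr) uintptr {
		if old.lo <= p && p < old.hi {
			return p + delta
		}
		return p // not a pointer into the old stack; leave it alone
	}

	func main() {
		old := span{lo: 0x1000, hi: 0x2000}
		nw := span{lo: 0x5000, hi: 0x6000}
		delta := nw.hi - old.hi

		fmt.Printf("%#x\n", adjust(0x1ff0, old, delta)) // 0x5ff0: moved
		fmt.Printf("%#x\n", adjust(0x9000, old, delta)) // 0x9000: untouched
	}
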
+
+ type gobitvector struct {
+ n uintptr
+ bytedata []uint8
+ }
+
+ func gobv(bv bitvector) gobitvector {
+ return gobitvector{
+ uintptr(bv.n),
+ (*[1 << 30]byte)(unsafe.Pointer(bv.bytedata))[:(bv.n+7)/8],
+ }
+ }
+
+ func ptrbits(bv *gobitvector, i uintptr) uint8 {
+ return (bv.bytedata[i/4] >> ((i & 3) * 2)) & 3
+ }
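
ptrbits unpacks 2-bit entries from a byte slice: four entries per byte, entry i living in byte i/4 at bit offset (i&3)*2. A quick demonstration with a hand-packed byte:

	package main

	import "fmt"

	func ptrbits(bytedata []uint8, i uintptr) uint8 {
		return (bytedata[i/4] >> ((i & 3) * 2)) & 3
	}

	func main() {
		// Pack entries 2, 0, 1, 3 (lowest bits first) into one byte:
		// 11 01 00 10 in binary = 0xd2
		b := []uint8{0xd2}
		for i := uintptr(0); i < 4; i++ {
			fmt.Println(ptrbits(b, i)) // 2, 0, 1, 3
		}
	}
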
+
+ // bv describes the memory starting at address scanp.
+ // Adjust any pointers contained therein.
+ func adjustpointers(scanp unsafe.Pointer, cbv *bitvector, adjinfo *adjustinfo, f *_func) {
+ bv := gobv(*cbv)
+ minp := adjinfo.old.lo
+ maxp := adjinfo.old.hi
+ delta := adjinfo.delta
+ num := uintptr(bv.n / _BitsPerPointer)
+ for i := uintptr(0); i < num; i++ {
+ if stackDebug >= 4 {
+ print(" ", add(scanp, i*ptrSize), ":", mapnames[ptrbits(&bv, i)], ":", hex(*(*uintptr)(add(scanp, i*ptrSize))), " # ", i, " ", bv.bytedata[i/4], "\n")
+ }
+ switch ptrbits(&bv, i) {
+ default:
+ gothrow("unexpected pointer bits")
+ case _BitsDead:
+ if debug.gcdead != 0 {
+ *(*unsafe.Pointer)(add(scanp, i*ptrSize)) = unsafe.Pointer(uintptr(poisonStack))
+ }
+ case _BitsScalar:
+ // ok
+ case _BitsPointer:
+ p := *(*unsafe.Pointer)(add(scanp, i*ptrSize))
+ up := uintptr(p)
+ if f != nil && 0 < up && up < _PageSize && invalidptr != 0 || up == poisonGC || up == poisonStack {
+ // Looks like a junk value in a pointer slot.
+ // Live analysis wrong?
+ getg().m.traceback = 2
+ print("runtime: bad pointer in frame ", gofuncname(f), " at ", add(scanp, i*ptrSize), ": ", p, "\n")
+ gothrow("invalid stack pointer")
+ }
+ if minp <= up && up < maxp {
+ if stackDebug >= 3 {
+ print("adjust ptr ", p, " ", gofuncname(f), "\n")
+ }
+ *(*unsafe.Pointer)(add(scanp, i*ptrSize)) = unsafe.Pointer(up + delta)
+ }
+ }
+ }
+ }
+
+ // Note: the argument/return area is adjusted by the callee.
+ func adjustframe(frame *stkframe, arg unsafe.Pointer) bool {
+ adjinfo := (*adjustinfo)(arg)
+ targetpc := frame.continpc
+ if targetpc == 0 {
+ // Frame is dead.
+ return true
+ }
+ f := frame.fn
+ if stackDebug >= 2 {
+ print(" adjusting ", funcname(f), " frame=[", hex(frame.sp), ",", hex(frame.fp), "] pc=", hex(frame.pc), " continpc=", hex(frame.continpc), "\n")
+ }
+ if f.entry == systemstack_switchPC {
+ // A special routine at the bottom of the stack of a goroutine that does a systemstack call.
+ // We will allow it to be copied even though we don't
+ // have full GC info for it (because it is written in asm).
+ return true
+ }
+ if targetpc != f.entry {
+ targetpc--
+ }
+ pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
+ if pcdata == -1 {
+ pcdata = 0 // in prologue
+ }
+
+ // Adjust local variables if stack frame has been allocated.
+ size := frame.varp - frame.sp
+ var minsize uintptr
+ if thechar != '6' && thechar != '8' {
+ minsize = ptrSize
+ } else {
+ minsize = 0
+ }
+ if size > minsize {
+ var bv bitvector
+ stackmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
+ if stackmap == nil || stackmap.n <= 0 {
+ print("runtime: frame ", funcname(f), " untyped locals ", hex(frame.varp-size), "+", hex(size), "\n")
+ gothrow("missing stackmap")
+ }
+ // Locals bitmap information, scan just the pointers in locals.
+ if pcdata < 0 || pcdata >= stackmap.n {
+ // don't know where we are
+ print("runtime: pcdata is ", pcdata, " and ", stackmap.n, " locals stack map entries for ", funcname(f), " (targetpc=", targetpc, ")\n")
+ gothrow("bad symbol table")
+ }
+ bv = stackmapdata(stackmap, pcdata)
+ size = (uintptr(bv.n) * ptrSize) / _BitsPerPointer
+ if stackDebug >= 3 {
+ print(" locals ", pcdata, "/", stackmap.n, " ", size/ptrSize, " words ", bv.bytedata, "\n")
+ }
+ adjustpointers(unsafe.Pointer(frame.varp-size), &bv, adjinfo, f)
+ }
+
+ // Adjust arguments.
+ if frame.arglen > 0 {
+ var bv bitvector
+ if frame.argmap != nil {
+ bv = *frame.argmap
+ } else {
+ stackmap := (*stackmap)(funcdata(f, _FUNCDATA_ArgsPointerMaps))
+ if stackmap == nil || stackmap.n <= 0 {
+ print("runtime: frame ", funcname(f), " untyped args ", frame.argp, "+", uintptr(frame.arglen), "\n")
+ gothrow("missing stackmap")
+ }
+ if pcdata < 0 || pcdata >= stackmap.n {
+ // don't know where we are
+ print("runtime: pcdata is ", pcdata, " and ", stackmap.n, " args stack map entries for ", funcname(f), " (targetpc=", targetpc, ")\n")
+ gothrow("bad symbol table")
+ }
+ bv = stackmapdata(stackmap, pcdata)
+ }
+ if stackDebug >= 3 {
+ print(" args\n")
+ }
+ adjustpointers(unsafe.Pointer(frame.argp), &bv, adjinfo, nil)
+ }
+ return true
+ }
+
+ func adjustctxt(gp *g, adjinfo *adjustinfo) {
+ adjustpointer(adjinfo, (unsafe.Pointer)(&gp.sched.ctxt))
+ }
+
+ func adjustdefers(gp *g, adjinfo *adjustinfo) {
+ // Adjust defer argument blocks the same way we adjust active stack frames.
+ tracebackdefers(gp, adjustframe, noescape(unsafe.Pointer(adjinfo)))
+
+ // Adjust pointers in the Defer structs.
+ // Defer structs themselves are never on the stack.
+ for d := gp._defer; d != nil; d = d.link {
+ adjustpointer(adjinfo, (unsafe.Pointer)(&d.fn))
+ adjustpointer(adjinfo, (unsafe.Pointer)(&d.argp))
+ adjustpointer(adjinfo, (unsafe.Pointer)(&d._panic))
+ }
+ }
+
+ func adjustpanics(gp *g, adjinfo *adjustinfo) {
+ // Panics are on stack and already adjusted.
+ // Update pointer to head of list in G.
+ adjustpointer(adjinfo, (unsafe.Pointer)(&gp._panic))
+ }
+
+ func adjustsudogs(gp *g, adjinfo *adjustinfo) {
+ // the data elements pointed to by a SudoG structure
+ // might be in the stack.
+ for s := gp.waiting; s != nil; s = s.waitlink {
+ adjustpointer(adjinfo, (unsafe.Pointer)(&s.elem))
+ adjustpointer(adjinfo, (unsafe.Pointer)(&s.selectdone))
+ }
+ }
+
+ func fillstack(stk stack, b byte) {
+ for p := stk.lo; p < stk.hi; p++ {
+ *(*byte)(unsafe.Pointer(p)) = b
+ }
+ }
+
+ // Copies gp's stack to a new stack of a different size.
++ // Caller must have changed gp status to Gcopystack.
+ func copystack(gp *g, newsize uintptr) {
+ if gp.syscallsp != 0 {
+ gothrow("stack growth not allowed in system call")
+ }
+ old := gp.stack
+ if old.lo == 0 {
+ gothrow("nil stackbase")
+ }
+ used := old.hi - gp.sched.sp
+
+ // allocate new stack
+ new := stackalloc(uint32(newsize))
+ if stackPoisonCopy != 0 {
+ fillstack(new, 0xfd)
+ }
+ if stackDebug >= 1 {
+ print("copystack gp=", gp, " [", hex(old.lo), " ", hex(old.hi-used), " ", hex(old.hi), "]/", old.hi-old.lo, " -> [", hex(new.lo), " ", hex(new.hi-used), " ", hex(new.hi), "]/", newsize, "\n")
+ }
+
+ // adjust pointers in the to-be-copied frames
+ var adjinfo adjustinfo
+ adjinfo.old = old
+ adjinfo.delta = new.hi - old.hi
+ gentraceback(^uintptr(0), ^uintptr(0), 0, gp, 0, nil, 0x7fffffff, adjustframe, noescape(unsafe.Pointer(&adjinfo)), 0)
+
+ // adjust other miscellaneous things that have pointers into stacks.
+ adjustctxt(gp, &adjinfo)
+ adjustdefers(gp, &adjinfo)
+ adjustpanics(gp, &adjinfo)
+ adjustsudogs(gp, &adjinfo)
+
+ // copy the stack to the new location
+ if stackPoisonCopy != 0 {
+ fillstack(new, 0xfb)
+ }
+ memmove(unsafe.Pointer(new.hi-used), unsafe.Pointer(old.hi-used), used)
+
- casgstatus(gp, _Gcopystack, oldstatus) // oldstatus is Gwaiting or Grunnable
-
+ // Swap out old stack for new one
+ gp.stack = new
+ gp.stackguard0 = new.lo + _StackGuard // NOTE: might clobber a preempt request
+ gp.sched.sp = new.hi - used
+
- // Note that the concurrent GC might be scanning the stack as we try to replace it.
- // copystack takes care of the appropriate coordination with the stack scanner.
+ // free old stack
+ if stackPoisonCopy != 0 {
+ fillstack(old, 0xfc)
+ }
+ if newsize > old.hi-old.lo {
+ // growing, free stack immediately
+ stackfree(old)
+ } else {
+ // shrinking, queue up free operation. We can't actually free the stack
+ // just yet because we might run into the following situation:
+ // 1) GC starts, scans a SudoG but does not yet mark the SudoG.elem pointer
+ // 2) The stack that pointer points to is shrunk
+ // 3) The old stack is freed
+ // 4) The containing span is marked free
+ // 5) GC attempts to mark the SudoG.elem pointer. The marking fails because
+ // the pointer looks like a pointer into a free span.
+ // By not freeing, we prevent step #4 until GC is done.
+ lock(&stackpoolmu)
+ *(*stack)(unsafe.Pointer(old.lo)) = stackfreequeue
+ stackfreequeue = old
+ unlock(&stackpoolmu)
+ }
+ }
+
+ // round x up to a power of 2.
+ func round2(x int32) int32 {
+ s := uint(0)
+ for 1<<s < x {
+ s++
+ }
+ return 1 << s
+ }
+
+ // Called from runtime·morestack when more stack is needed.
+ // Allocate larger stack and relocate to new stack.
+ // Stack growth is multiplicative, for constant amortized cost.
+ //
+ // g->atomicstatus will be Grunning or Gscanrunning upon entry.
+ // If the GC is trying to stop this g then it will set preemptscan to true.
+ func newstack() {
+ thisg := getg()
+ // TODO: double check all gp. shouldn't be getg().
+ if thisg.m.morebuf.g.stackguard0 == stackFork {
+ gothrow("stack growth after fork")
+ }
+ if thisg.m.morebuf.g != thisg.m.curg {
+ print("runtime: newstack called from g=", thisg.m.morebuf.g, "\n"+"\tm=", thisg.m, " m->curg=", thisg.m.curg, " m->g0=", thisg.m.g0, " m->gsignal=", thisg.m.gsignal, "\n")
+ morebuf := thisg.m.morebuf
+ traceback(morebuf.pc, morebuf.sp, morebuf.lr, morebuf.g)
+ gothrow("runtime: wrong goroutine in newstack")
+ }
+ if thisg.m.curg.throwsplit {
+ gp := thisg.m.curg
+ // Update syscallsp, syscallpc in case traceback uses them.
+ morebuf := thisg.m.morebuf
+ gp.syscallsp = morebuf.sp
+ gp.syscallpc = morebuf.pc
+ print("runtime: newstack sp=", hex(gp.sched.sp), " stack=[", hex(gp.stack.lo), ", ", hex(gp.stack.hi), "]\n",
+ "\tmorebuf={pc:", hex(morebuf.pc), " sp:", hex(morebuf.sp), " lr:", hex(morebuf.lr), "}\n",
+ "\tsched={pc:", hex(gp.sched.pc), " sp:", hex(gp.sched.sp), " lr:", hex(gp.sched.lr), " ctxt:", gp.sched.ctxt, "}\n")
+ gothrow("runtime: stack split at bad time")
+ }
+
+ // The goroutine must be executing in order to call newstack,
+ // so it must be Grunning or Gscanrunning.
+
+ gp := thisg.m.curg
+ morebuf := thisg.m.morebuf
+ thisg.m.morebuf.pc = 0
+ thisg.m.morebuf.lr = 0
+ thisg.m.morebuf.sp = 0
+ thisg.m.morebuf.g = nil
+
+ casgstatus(gp, _Grunning, _Gwaiting)
+ gp.waitreason = "stack growth"
+
+ rewindmorestack(&gp.sched)
+
+ if gp.stack.lo == 0 {
+ gothrow("missing stack in newstack")
+ }
+ sp := gp.sched.sp
+ if thechar == '6' || thechar == '8' {
+ // The call to morestack cost a word.
+ sp -= ptrSize
+ }
+ if stackDebug >= 1 || sp < gp.stack.lo {
+ print("runtime: newstack sp=", hex(sp), " stack=[", hex(gp.stack.lo), ", ", hex(gp.stack.hi), "]\n",
+ "\tmorebuf={pc:", hex(morebuf.pc), " sp:", hex(morebuf.sp), " lr:", hex(morebuf.lr), "}\n",
+ "\tsched={pc:", hex(gp.sched.pc), " sp:", hex(gp.sched.sp), " lr:", hex(gp.sched.lr), " ctxt:", gp.sched.ctxt, "}\n")
+ }
+ if sp < gp.stack.lo {
+ print("runtime: gp=", gp, ", gp->status=", hex(readgstatus(gp)), "\n ")
+ print("runtime: split stack overflow: ", hex(sp), " < ", hex(gp.stack.lo), "\n")
+ gothrow("runtime: split stack overflow")
+ }
+
++ if gp.sched.ctxt != nil {
++ // morestack wrote sched.ctxt on its way in here,
++ // without a write barrier. Run the write barrier now.
++ // It is not possible to be preempted between then
++ // and now, so it's okay.
++ writebarrierptr_nostore((*uintptr)(unsafe.Pointer(&gp.sched.ctxt)), uintptr(gp.sched.ctxt))
++ }
++
+ if gp.stackguard0 == stackPreempt {
+ if gp == thisg.m.g0 {
+ gothrow("runtime: preempt g0")
+ }
+ if thisg.m.p == nil && thisg.m.locks == 0 {
+ gothrow("runtime: g is running but p is not")
+ }
+ if gp.preemptscan {
+ gcphasework(gp)
+ casgstatus(gp, _Gwaiting, _Grunning)
+ gp.stackguard0 = gp.stack.lo + _StackGuard
+ gp.preempt = false
+ gp.preemptscan = false // Tells the GC that preemption was successful.
+ gogo(&gp.sched) // never return
+ }
+
+ // Be conservative about where we preempt.
+ // We are interested in preempting user Go code, not runtime code.
+ if thisg.m.locks != 0 || thisg.m.mallocing != 0 || thisg.m.gcing != 0 || thisg.m.p.status != _Prunning {
+ // Let the goroutine keep running for now.
+ // gp->preempt is set, so it will be preempted next time.
+ gp.stackguard0 = gp.stack.lo + _StackGuard
+ casgstatus(gp, _Gwaiting, _Grunning)
+ gogo(&gp.sched) // never return
+ }
+
+ // Act like goroutine called runtime.Gosched.
+ casgstatus(gp, _Gwaiting, _Grunning)
+ gosched_m(gp) // never return
+ }
+
+ // Allocate a bigger segment and move the stack.
+ oldsize := int(gp.stack.hi - gp.stack.lo)
+ newsize := oldsize * 2
+ if uintptr(newsize) > maxstacksize {
+ print("runtime: goroutine stack exceeds ", maxstacksize, "-byte limit\n")
+ gothrow("stack overflow")
+ }
+
- casgstatus(gp, _Gwaiting, _Grunning)
++ oldstatus := readgstatus(gp)
++ oldstatus &^= _Gscan
++ casgstatus(gp, oldstatus, _Gcopystack) // oldstatus is Gwaiting or Grunnable
++
++ // The concurrent GC will not scan the stack while we are doing the copy since
++ // the gp is in a Gcopystack status.
+ copystack(gp, uintptr(newsize))
+ if stackDebug >= 1 {
+ print("stack grow done\n")
+ }
-
- /* TODO
- if _Windows && gp.m != nil && gp.m.libcallsp != 0 {
++ casgstatus(gp, _Gcopystack, _Grunning)
+ gogo(&gp.sched)
+ }
+
+ //go:nosplit
+ func nilfunc() {
+ *(*uint8)(nil) = 0
+ }
+
+ // adjust Gobuf as if it executed a call to fn
+ // and then did an immediate gosave.
+ func gostartcallfn(gobuf *gobuf, fv *funcval) {
+ var fn unsafe.Pointer
+ if fv != nil {
+ fn = (unsafe.Pointer)(fv.fn)
+ } else {
+ fn = unsafe.Pointer(funcPC(nilfunc))
+ }
+ gostartcall(gobuf, fn, (unsafe.Pointer)(fv))
+ }
+
+ // Maybe shrink the stack being used by gp.
+ // Called at garbage collection time.
+ func shrinkstack(gp *g) {
+ if readgstatus(gp) == _Gdead {
+ if gp.stack.lo != 0 {
+ // Free whole stack - it will get reallocated
+ // if G is used again.
+ stackfree(gp.stack)
+ gp.stack.lo = 0
+ gp.stack.hi = 0
+ }
+ return
+ }
+ if gp.stack.lo == 0 {
+ gothrow("missing stack in shrinkstack")
+ }
+
+ oldsize := gp.stack.hi - gp.stack.lo
+ newsize := oldsize / 2
+ if newsize < _FixedStack {
+ return // don't shrink below the minimum-sized stack
+ }
+ used := gp.stack.hi - gp.sched.sp
+ if used >= oldsize/4 {
+ return // still using at least 1/4 of the segment.
+ }
+
+ // We can't copy the stack if we're in a syscall.
+ // The syscall might have pointers into the stack.
+ if gp.syscallsp != 0 {
+ return
+ }
- */
++ if _Windows != 0 && gp.m != nil && gp.m.libcallsp != 0 {
+ return
+ }
+
+ if stackDebug > 0 {
+ print("shrinking stack ", oldsize, "->", newsize, "\n")
+ }
++
++ // The world is stopped and this shrink was initiated by the GC, so the
++ // goroutine must be Gwaiting or Grunnable, and its status is not changing
++ // underfoot. Move it to Gcopystack so that a concurrent stack scan will
++ // not observe the copy in progress.
++ oldstatus := readgstatus(gp) &^ _Gscan
++ if oldstatus != _Gwaiting && oldstatus != _Grunnable {
++ gothrow("status is not Gwaiting or Grunnable")
++ }
++ casgstatus(gp, oldstatus, _Gcopystack)
+ copystack(gp, newsize)
++ casgstatus(gp, _Gcopystack, oldstatus)
+ }
+
+ // Do any delayed stack freeing that was queued up during GC.
+ func shrinkfinish() {
+ lock(&stackpoolmu)
+ s := stackfreequeue
+ stackfreequeue = stack{}
+ unlock(&stackpoolmu)
+ for s.lo != 0 {
+ t := *(*stack)(unsafe.Pointer(s.lo))
+ stackfree(s)
+ s = t
+ }
+ }
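
stackfreequeue threads the pending-free list through the freed stacks themselves: the first word of each queued stack holds the stack record of the next entry, so the queue costs no allocation. A memory-safe sketch of the same intrusive-list pattern, with a struct field in place of the raw word at old.lo:

	package main

	import "fmt"

	type block struct {
		id   int
		next *block // lives inside the freed block, like the word at old.lo
	}

	func main() {
		var queue *block
		for id := 1; id <= 3; id++ { // queue up three "stacks" for deferred free
			queue = &block{id: id, next: queue}
		}
		for b := queue; b != nil; b = b.next { // shrinkfinish's drain loop
			fmt.Println("freeing block", b.id) // 3, 2, 1
		}
	}
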
+
+ //go:nosplit
+ func morestackc() {
+ systemstack(func() {
+ gothrow("attempt to execute C code on Go stack")
+ })
+ }
--- /dev/null
+ // Copyright 2014 The Go Authors. All rights reserved.
+ // Use of this source code is governed by a BSD-style
+ // license that can be found in the LICENSE file.
+
+ // +build !linux !amd64
++ // +build !linux !386
+
+ package runtime
+
+ func sysargs(argc int32, argv **byte) {
+ }