// Copyright 2009 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Malloc profiling. // Patterned after tcmalloc's algorithms; shorter code. package runtime import ( "runtime/internal/atomic" "unsafe" ) // NOTE(rsc): Everything here could use cas if contention became an issue. var proflock mutex // All memory allocations are local and do not escape outside of the profiler. // The profiler is forbidden from referring to garbage-collected memory. const ( // profile types memProfile bucketType = 1 + iota blockProfile mutexProfile // size of bucket hash table buckHashSize = 179999 // max depth of stack to record in bucket maxStack = 32 ) type bucketType int // A bucket holds per-call-stack profiling information. // The representation is a bit sleazy, inherited from C. // This struct defines the bucket header. It is followed in // memory by the stack words and then the actual record // data, either a memRecord or a blockRecord. // // Per-call-stack profiling information. // Lookup by hashing call stack into a linked-list hash table. // // No heap pointers. // //go:notinheap type bucket struct { next *bucket allnext *bucket typ bucketType // memBucket or blockBucket (includes mutexProfile) hash uintptr size uintptr nstk uintptr } // A memRecord is the bucket data for a bucket of type memProfile, // part of the memory profile. type memRecord struct { // The following complex 3-stage scheme of stats accumulation // is required to obtain a consistent picture of mallocs and frees // for some point in time. // The problem is that mallocs come in real time, while frees // come only after a GC during concurrent sweeping. So if we would // naively count them, we would get a skew toward mallocs. // // Hence, we delay information to get consistent snapshots as // of mark termination. 
Allocations count toward the next mark // termination's snapshot, while sweep frees count toward the // previous mark termination's snapshot: // // MT MT MT MT // .·| .·| .·| .·| // .·˙ | .·˙ | .·˙ | .·˙ | // .·˙ | .·˙ | .·˙ | .·˙ | // .·˙ |.·˙ |.·˙ |.·˙ | // // alloc → ▲ ← free // ┠┅┅┅┅┅┅┅┅┅┅┅P // r_a → p_a → allocs // p_f → frees // // alloc → ▲ ← free // ┠┅┅┅┅┅┅┅┅┅┅┅P // r_a → p_a → alloc // p_f → frees // // Since we can't publish a consistent snapshot until all of // the sweep frees are accounted for, we wait until the next // mark termination ("MT" above) to publish the previous mark // termination's snapshot ("P" above). To do this, information // is delayed through "recent" and "prev" stages ("r_*" and // "p_*" above). Specifically: // // Mallocs are accounted in recent stats. // Explicit frees are accounted in recent stats. // GC frees are accounted in prev stats. // After GC prev stats are added to final stats and // recent stats are moved into prev stats. // active is the currently published profile. A profiling // cycle can be accumulated into active once its complete. active memRecordCycle // changes between next-to-last GC and last GC prev memRecordCycle // changes since last GC recent memRecordCycle } // memRecordCycle type memRecordCycle struct { allocs, frees uintptr alloc_bytes, free_bytes uintptr } // add accumulates b into a. It does not zero b. func (a *memRecordCycle) add(b *memRecordCycle) { a.allocs += b.allocs a.frees += b.frees a.alloc_bytes += b.alloc_bytes a.free_bytes += b.free_bytes } // A blockRecord is the bucket data for a bucket of type blockProfile, // which is used in blocking and mutex profiles. type blockRecord struct { count int64 cycles int64 } var ( mbuckets *bucket // memory profile buckets bbuckets *bucket // blocking profile buckets xbuckets *bucket // mutex profile buckets buckhash *[179999]*bucket bucketmem uintptr ) // newBucket allocates a bucket with the given type and number of stack entries. 
func newBucket(typ bucketType, nstk int) *bucket { size := unsafe.Sizeof(bucket{}) + uintptr(nstk)*unsafe.Sizeof(uintptr(0)) switch typ { default: throw("invalid profile bucket type") case memProfile: size += unsafe.Sizeof(memRecord{}) case blockProfile, mutexProfile: size += unsafe.Sizeof(blockRecord{}) } b := (*bucket)(persistentalloc(size, 0, &memstats.buckhash_sys)) bucketmem += size b.typ = typ b.nstk = uintptr(nstk) return b } // stk returns the slice in b holding the stack. func (b *bucket) stk() []uintptr { stk := (*[maxStack]uintptr)(add(unsafe.Pointer(b), unsafe.Sizeof(*b))) return stk[:b.nstk:b.nstk] } // mp returns the memRecord associated with the memProfile bucket b. func (b *bucket) mp() *memRecord { if b.typ != memProfile { throw("bad use of bucket.mp") } data := add(unsafe.Pointer(b), unsafe.Sizeof(*b)+b.nstk*unsafe.Sizeof(uintptr(0))) return (*memRecord)(data) } // bp returns the blockRecord associated with the blockProfile bucket b. func (b *bucket) bp() *blockRecord { if b.typ != blockProfile && b.typ != mutexProfile { throw("bad use of bucket.bp") } data := add(unsafe.Pointer(b), unsafe.Sizeof(*b)+b.nstk*unsafe.Sizeof(uintptr(0))) return (*blockRecord)(data) } // Return the bucket for stk[0:nstk], allocating new bucket if needed. func stkbucket(typ bucketType, size uintptr, stk []uintptr, alloc bool) *bucket { if buckhash == nil { buckhash = (*[buckHashSize]*bucket)(sysAlloc(unsafe.Sizeof(*buckhash), &memstats.buckhash_sys)) if buckhash == nil { throw("runtime: cannot allocate memory") } } // Hash stack. var h uintptr for _, pc := range stk { h += pc h += h << 10 h ^= h >> 6 } // hash in size h += size h += h << 10 h ^= h >> 6 // finalize h += h << 3 h ^= h >> 11 i := int(h % buckHashSize) for b := buckhash[i]; b != nil; b = b.next { if b.typ == typ && b.hash == h && b.size == size && eqslice(b.stk(), stk) { return b } } if !alloc { return nil } // Create new bucket. 
b := newBucket(typ, len(stk)) copy(b.stk(), stk) b.hash = h b.size = size b.next = buckhash[i] buckhash[i] = b if typ == memProfile { b.allnext = mbuckets mbuckets = b } else if typ == mutexProfile { b.allnext = xbuckets xbuckets = b } else { b.allnext = bbuckets bbuckets = b } return b } func eqslice(x, y []uintptr) bool { if len(x) != len(y) { return false } for i, xi := range x { if xi != y[i] { return false } } return true } func mprof_GC() { for b := mbuckets; b != nil; b = b.allnext { mp := b.mp() mp.active.add(&mp.prev) mp.prev = mp.recent mp.recent = memRecordCycle{} } } // Record that a gc just happened: all the 'recent' statistics are now real. func mProf_GC() { lock(&proflock) mprof_GC() unlock(&proflock) } // Called by malloc to record a profiled block. func mProf_Malloc(p unsafe.Pointer, size uintptr) { var stk [maxStack]uintptr nstk := callers(4, stk[:]) lock(&proflock) b := stkbucket(memProfile, size, stk[:nstk], true) mp := b.mp() mp.recent.allocs++ mp.recent.alloc_bytes += size unlock(&proflock) // Setprofilebucket locks a bunch of other mutexes, so we call it outside of proflock. // This reduces potential contention and chances of deadlocks. // Since the object must be alive during call to mProf_Malloc, // it's fine to do this non-atomically. systemstack(func() { setprofilebucket(p, b) }) } // Called when freeing a profiled block. func mProf_Free(b *bucket, size uintptr) { lock(&proflock) mp := b.mp() mp.prev.frees++ mp.prev.free_bytes += size unlock(&proflock) } var blockprofilerate uint64 // in CPU ticks // SetBlockProfileRate controls the fraction of goroutine blocking events // that are reported in the blocking profile. The profiler aims to sample // an average of one blocking event per rate nanoseconds spent blocked. // // To include every blocking event in the profile, pass rate = 1. // To turn off profiling entirely, pass rate <= 0. 
func SetBlockProfileRate(rate int) { var r int64 if rate <= 0 { r = 0 // disable profiling } else if rate == 1 { r = 1 // profile everything } else { // convert ns to cycles, use float64 to prevent overflow during multiplication r = int64(float64(rate) * float64(tickspersecond()) / (1000 * 1000 * 1000)) if r == 0 { r = 1 } } atomic.Store64(&blockprofilerate, uint64(r)) } func blockevent(cycles int64, skip int) { if cycles <= 0 { cycles = 1 } if blocksampled(cycles) { saveblockevent(cycles, skip+1, blockProfile) } } func blocksampled(cycles int64) bool { rate := int64(atomic.Load64(&blockprofilerate)) if rate <= 0 || (rate > cycles && int64(fastrand())%rate > cycles) { return false } return true } func saveblockevent(cycles int64, skip int, which bucketType) { gp := getg() var nstk int var stk [maxStack]uintptr if gp.m.curg == nil || gp.m.curg == gp { nstk = callers(skip, stk[:]) } else { nstk = gcallers(gp.m.curg, skip, stk[:]) } lock(&proflock) b := stkbucket(which, 0, stk[:nstk], true) b.bp().count++ b.bp().cycles += cycles unlock(&proflock) } var mutexprofilerate uint64 // fraction sampled // SetMutexProfileFraction controls the fraction of mutex contention events // that are reported in the mutex profile. On average 1/rate events are // reported. The previous rate is returned. // // To turn off profiling entirely, pass rate 0. // To just read the current rate, pass rate -1. // (For n>1 the details of sampling may change.) 
func SetMutexProfileFraction(rate int) int { if rate < 0 { return int(mutexprofilerate) } old := mutexprofilerate atomic.Store64(&mutexprofilerate, uint64(rate)) return int(old) } //go:linkname mutexevent sync.event func mutexevent(cycles int64, skip int) { if cycles < 0 { cycles = 0 } rate := int64(atomic.Load64(&mutexprofilerate)) // TODO(pjw): measure impact of always calling fastrand vs using something // like malloc.go:nextSample() if rate > 0 && int64(fastrand())%rate == 0 { saveblockevent(cycles, skip+1, mutexProfile) } } // Go interface to profile data. // A StackRecord describes a single execution stack. type StackRecord struct { Stack0 [32]uintptr // stack trace for this record; ends at first 0 entry } // Stack returns the stack trace associated with the record, // a prefix of r.Stack0. func (r *StackRecord) Stack() []uintptr { for i, v := range r.Stack0 { if v == 0 { return r.Stack0[0:i] } } return r.Stack0[0:] } // MemProfileRate controls the fraction of memory allocations // that are recorded and reported in the memory profile. // The profiler aims to sample an average of // one allocation per MemProfileRate bytes allocated. // // To include every allocated block in the profile, set MemProfileRate to 1. // To turn off profiling entirely, set MemProfileRate to 0. // // The tools that process the memory profiles assume that the // profile rate is constant across the lifetime of the program // and equal to the current value. Programs that change the // memory profiling rate should do so just once, as early as // possible in the execution of the program (for example, // at the beginning of main). var MemProfileRate int = 512 * 1024 // A MemProfileRecord describes the live objects allocated // by a particular call sequence (stack trace). 
type MemProfileRecord struct {
	AllocBytes   int64       // number of bytes allocated
	FreeBytes    int64       // number of bytes freed
	AllocObjects int64       // number of objects allocated
	FreeObjects  int64       // number of objects freed
	Stack0       [32]uintptr // stack trace for this record; ends at first 0 entry
}

// InUseBytes returns the number of bytes in use (AllocBytes - FreeBytes).
func (r *MemProfileRecord) InUseBytes() int64 {
	inUse := r.AllocBytes
	inUse -= r.FreeBytes
	return inUse
}

// InUseObjects returns the number of objects in use (AllocObjects - FreeObjects).
func (r *MemProfileRecord) InUseObjects() int64 {
	inUse := r.AllocObjects
	inUse -= r.FreeObjects
	return inUse
}

// Stack returns the stack trace associated with the record,
// a prefix of r.Stack0.
// The prefix stops just before the first zero entry, or covers the
// whole array when there is none. The result aliases r.Stack0.
func (r *MemProfileRecord) Stack() []uintptr {
	depth := 0
	for depth < len(r.Stack0) && r.Stack0[depth] != 0 {
		depth++
	}
	return r.Stack0[:depth]
}

// MemProfile returns a profile of memory allocated and freed per allocation
// site.
//
// MemProfile returns n, the number of records in the current memory profile.
// If len(p) >= n, MemProfile copies the profile into p and returns n, true.
// If len(p) < n, MemProfile does not change p and returns n, false.
//
// If inuseZero is true, the profile includes allocation records
// where r.AllocBytes > 0 but r.AllocBytes == r.FreeBytes.
// These are sites where memory was allocated, but it has all
// been released back to the runtime.
//
// The returned profile may be up to two garbage collection cycles old.
// This is to avoid skewing the profile toward allocations; because
// allocations happen in real time but frees are delayed until the garbage
// collector performs sweeping, the profile only accounts for allocations
// that have had a chance to be freed by the garbage collector.
//
// Most clients should use the runtime/pprof package or
// the testing package's -test.memprofile flag instead
// of calling MemProfile directly.
func MemProfile(p []MemProfileRecord, inuseZero bool) (n int, ok bool) { lock(&proflock) clear := true for b := mbuckets; b != nil; b = b.allnext { mp := b.mp() if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes { n++ } if mp.active.allocs != 0 || mp.active.frees != 0 { clear = false } } if clear { // Absolutely no data, suggesting that a garbage collection // has not yet happened. In order to allow profiling when // garbage collection is disabled from the beginning of execution, // accumulate stats as if a GC just happened, and recount buckets. mprof_GC() mprof_GC() n = 0 for b := mbuckets; b != nil; b = b.allnext { mp := b.mp() if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes { n++ } } } if n <= len(p) { ok = true idx := 0 for b := mbuckets; b != nil; b = b.allnext { mp := b.mp() if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes { record(&p[idx], b) idx++ } } } unlock(&proflock) return } // Write b's data to r. func record(r *MemProfileRecord, b *bucket) { mp := b.mp() r.AllocBytes = int64(mp.active.alloc_bytes) r.FreeBytes = int64(mp.active.free_bytes) r.AllocObjects = int64(mp.active.allocs) r.FreeObjects = int64(mp.active.frees) if raceenabled { racewriterangepc(unsafe.Pointer(&r.Stack0[0]), unsafe.Sizeof(r.Stack0), getcallerpc(unsafe.Pointer(&r)), funcPC(MemProfile)) } if msanenabled { msanwrite(unsafe.Pointer(&r.Stack0[0]), unsafe.Sizeof(r.Stack0)) } copy(r.Stack0[:], b.stk()) for i := int(b.nstk); i < len(r.Stack0); i++ { r.Stack0[i] = 0 } } func iterate_memprof(fn func(*bucket, uintptr, *uintptr, uintptr, uintptr, uintptr)) { lock(&proflock) for b := mbuckets; b != nil; b = b.allnext { mp := b.mp() fn(b, b.nstk, &b.stk()[0], b.size, mp.active.allocs, mp.active.frees) } unlock(&proflock) } // BlockProfileRecord describes blocking events originated // at a particular call sequence (stack trace). 
type BlockProfileRecord struct { Count int64 Cycles int64 StackRecord } // BlockProfile returns n, the number of records in the current blocking profile. // If len(p) >= n, BlockProfile copies the profile into p and returns n, true. // If len(p) < n, BlockProfile does not change p and returns n, false. // // Most clients should use the runtime/pprof package or // the testing package's -test.blockprofile flag instead // of calling BlockProfile directly. func BlockProfile(p []BlockProfileRecord) (n int, ok bool) { lock(&proflock) for b := bbuckets; b != nil; b = b.allnext { n++ } if n <= len(p) { ok = true for b := bbuckets; b != nil; b = b.allnext { bp := b.bp() r := &p[0] r.Count = bp.count r.Cycles = bp.cycles if raceenabled { racewriterangepc(unsafe.Pointer(&r.Stack0[0]), unsafe.Sizeof(r.Stack0), getcallerpc(unsafe.Pointer(&p)), funcPC(BlockProfile)) } if msanenabled { msanwrite(unsafe.Pointer(&r.Stack0[0]), unsafe.Sizeof(r.Stack0)) } i := copy(r.Stack0[:], b.stk()) for ; i < len(r.Stack0); i++ { r.Stack0[i] = 0 } p = p[1:] } } unlock(&proflock) return } // MutexProfile returns n, the number of records in the current mutex profile. // If len(p) >= n, MutexProfile copies the profile into p and returns n, true. // Otherwise, MutexProfile does not change p, and returns n, false. // // Most clients should use the runtime/pprof package // instead of calling MutexProfile directly. func MutexProfile(p []BlockProfileRecord) (n int, ok bool) { lock(&proflock) for b := xbuckets; b != nil; b = b.allnext { n++ } if n <= len(p) { ok = true for b := xbuckets; b != nil; b = b.allnext { bp := b.bp() r := &p[0] r.Count = int64(bp.count) r.Cycles = bp.cycles i := copy(r.Stack0[:], b.stk()) for ; i < len(r.Stack0); i++ { r.Stack0[i] = 0 } p = p[1:] } } unlock(&proflock) return } // ThreadCreateProfile returns n, the number of records in the thread creation profile. // If len(p) >= n, ThreadCreateProfile copies the profile into p and returns n, true. 
// If len(p) < n, ThreadCreateProfile does not change p and returns n, false.
//
// Most clients should use the runtime/pprof package instead
// of calling ThreadCreateProfile directly.
func ThreadCreateProfile(p []StackRecord) (n int, ok bool) {
	// Snapshot the head of the list of all Ms with an atomic load; both
	// passes below walk from this same head via alllink.
	first := (*m)(atomic.Loadp(unsafe.Pointer(&allm)))
	for mp := first; mp != nil; mp = mp.alllink {
		n++
	}
	if n <= len(p) {
		ok = true
		i := 0
		for mp := first; mp != nil; mp = mp.alllink {
			// Each record is the stack saved in the M's createstack field.
			p[i].Stack0 = mp.createstack
			i++
		}
	}
	return
}

// GoroutineProfile returns n, the number of records in the active goroutine stack profile.
// If len(p) >= n, GoroutineProfile copies the profile into p and returns n, true.
// If len(p) < n, GoroutineProfile does not change p and returns n, false.
//
// Most clients should use the runtime/pprof package instead
// of calling GoroutineProfile directly.
func GoroutineProfile(p []StackRecord) (n int, ok bool) {
	gp := getg()

	// isOK reports whether gp1 should get its own record: it must not be
	// the calling goroutine (which is recorded separately below), must not
	// be dead, and must not be a system goroutine.
	isOK := func(gp1 *g) bool {
		// Checking isSystemGoroutine here makes GoroutineProfile
		// consistent with both NumGoroutine and Stack.
		return gp1 != gp && readgstatus(gp1) != _Gdead && !isSystemGoroutine(gp1)
	}

	// The world stays stopped for both the count and the copy below.
	stopTheWorld("profile")

	// Start at 1 to account for the calling goroutine, which isOK excludes.
	n = 1
	for _, gp1 := range allgs {
		if isOK(gp1) {
			n++
		}
	}

	if n <= len(p) {
		ok = true
		// r is the not-yet-filled tail of p.
		r := p

		// Save current goroutine.
		// The pc/sp are taken from this frame's caller, and the traceback
		// itself runs on the system stack.
		sp := getcallersp(unsafe.Pointer(&p))
		pc := getcallerpc(unsafe.Pointer(&p))
		systemstack(func() {
			saveg(pc, sp, gp, &r[0])
		})
		r = r[1:]

		// Save other goroutines.
		for _, gp1 := range allgs {
			if isOK(gp1) {
				if len(r) == 0 {
					// Should be impossible, but better to return a
					// truncated profile than to crash the entire process.
					break
				}
				// All-ones pc and sp are passed for goroutines other than
				// the caller, which has no explicit pc/sp to supply here.
				saveg(^uintptr(0), ^uintptr(0), gp1, &r[0])
				r = r[1:]
			}
		}
	}

	startTheWorld()

	return n, ok
}

// saveg writes a traceback of gp, starting at pc/sp, into r.Stack0.
// If the traceback is shorter than the array, the entry after the last
// frame is set to 0 to mark the end of the stack.
func saveg(pc, sp uintptr, gp *g, r *StackRecord) {
	n := gentraceback(pc, sp, 0, gp, 0, &r.Stack0[0], len(r.Stack0), nil, nil, 0)
	if n < len(r.Stack0) {
		r.Stack0[n] = 0
	}
}

// Stack formats a stack trace of the calling goroutine into buf
// and returns the number of bytes written to buf.
// If all is true, Stack formats stack traces of all other goroutines
// into buf after the trace for the current goroutine.
func Stack(buf []byte, all bool) int {
	// The world is stopped only when tracing every goroutine.
	if all {
		stopTheWorld("stack trace")
	}

	n := 0
	// With an empty buffer nothing is formatted and 0 is returned.
	if len(buf) > 0 {
		gp := getg()
		// pc/sp of this frame's caller, captured before switching stacks.
		sp := getcallersp(unsafe.Pointer(&buf))
		pc := getcallerpc(unsafe.Pointer(&buf))
		systemstack(func() {
			g0 := getg()
			// Force traceback=1 to override GOTRACEBACK setting,
			// so that Stack's results are consistent.
			// GOTRACEBACK is only about crash dumps.
			g0.m.traceback = 1
			// Zero-length slice over buf with capacity len(buf); n is later
			// read back as len(g0.writebuf).
			g0.writebuf = buf[0:0:len(buf)]
			goroutineheader(gp)
			traceback(pc, sp, 0, gp)
			if all {
				tracebackothers(gp)
			}
			// Restore the traceback level and detach the write buffer.
			g0.m.traceback = 0
			n = len(g0.writebuf)
			g0.writebuf = nil
		})
	}

	if all {
		startTheWorld()
	}
	return n
}

// Tracing of alloc/free/gc.

// tracelock is held by tracealloc, tracefree and tracegc for the whole of
// their output.
var tracelock mutex

// tracealloc prints a "tracealloc(...)" line for the allocation at p of the
// given size (and type name, when typ is non-nil), followed by a goroutine
// header and stack trace. m.traceback is set to 2 while printing and reset
// to 0 afterwards.
func tracealloc(p unsafe.Pointer, size uintptr, typ *_type) {
	lock(&tracelock)
	gp := getg()
	gp.m.traceback = 2
	if typ == nil {
		print("tracealloc(", p, ", ", hex(size), ")\n")
	} else {
		print("tracealloc(", p, ", ", hex(size), ", ", typ.string(), ")\n")
	}
	if gp.m.curg == nil || gp == gp.m.curg {
		// There is no separate user goroutine (or we are it): trace the
		// current g from this frame's caller, on the system stack.
		goroutineheader(gp)
		pc := getcallerpc(unsafe.Pointer(&p))
		sp := getcallersp(unsafe.Pointer(&p))
		systemstack(func() {
			traceback(pc, sp, 0, gp)
		})
	} else {
		// Running on a different g than the user goroutine: trace curg.
		goroutineheader(gp.m.curg)
		traceback(^uintptr(0), ^uintptr(0), 0, gp.m.curg)
	}
	print("\n")
	gp.m.traceback = 0
	unlock(&tracelock)
}

// tracefree prints a "tracefree(...)" line for the block at p of the given
// size, followed by the current g's header and stack trace. m.traceback is
// set to 2 while printing and reset to 0 afterwards.
func tracefree(p unsafe.Pointer, size uintptr) {
	lock(&tracelock)
	gp := getg()
	gp.m.traceback = 2
	print("tracefree(", p, ", ", hex(size), ")\n")
	goroutineheader(gp)
	pc := getcallerpc(unsafe.Pointer(&p))
	sp := getcallersp(unsafe.Pointer(&p))
	systemstack(func() {
		traceback(pc, sp, 0, gp)
	})
	print("\n")
	gp.m.traceback = 0
	unlock(&tracelock)
}

// tracegc prints "tracegc()", the stacks produced by tracebackothers for
// the current g, and a closing "end tracegc" line. m.traceback is set to 2
// while printing and reset to 0 afterwards.
func tracegc() {
	lock(&tracelock)
	gp := getg()
	gp.m.traceback = 2
	print("tracegc()\n")
	// running on m->g0 stack; show all non-g0 goroutines
	tracebackothers(gp)
	print("end tracegc\n")
	print("\n")
	gp.m.traceback = 0
	unlock(&tracelock)
}