--- /dev/null
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.pagetrace
+
+// Page tracer.
+//
+// This file contains an implementation of page trace instrumentation for tracking
+// the way the Go runtime manages pages of memory. The trace may be enabled at program
+// startup with the GODEBUG option pagetrace.
+//
+// Each page trace event is either 8 or 16 bytes wide. The first
+// 8 bytes follow this format for non-sync events:
+//
+// [16 timestamp delta][35 base address][10 npages][1 isLarge][2 pageTraceEventType]
+//
+// If the "large" bit is set then the event is 16 bytes wide with the second 8 byte word
+// containing the full npages value (the npages bitfield is 0).
+//
+// The base address's bottom pageShift bits are always zero hence why we can pack other
+// data in there. We ignore the top 16 bits, assuming a 48 bit address space for the
+// heap.
+//
+// The timestamp delta is computed from the difference between the current nanotime
+// timestamp and the last sync event's timestamp. The bottom pageTraceTimeLostBits of
+// this delta is removed and only the next pageTraceTimeDeltaBits are kept.
+//
+// A sync event is emitted at the beginning of each trace buffer and whenever the
+// timestamp delta would not fit in an event.
+//
+// Sync events have the following structure:
+//
+// [61 timestamp or P ID][1 isPID][2 pageTraceSyncEvent]
+//
+// In essence, the "large" bit repurposed to indicate whether it's a timestamp or a P ID
+// (these are typically uint32). Note that we only have 61 bits for the 64-bit timestamp,
+// but like for the delta we drop the bottom pageTraceTimeLostBits here as well.
+
+package runtime
+
+import (
+ "runtime/internal/sys"
+ "unsafe"
+)
+
+// pageTraceAlloc records a page trace allocation event.
+// pp may be nil. Call only if debug.pagetracefd != 0.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func pageTraceAlloc(pp *p, now int64, base, npages uintptr) {
+ if pageTrace.enabled {
+ if now == 0 {
+ now = nanotime()
+ }
+ pageTraceEmit(pp, now, base, npages, pageTraceAllocEvent)
+ }
+}
+
+// pageTraceFree records a page trace free event.
+// pp may be nil. Call only if debug.pagetracefd != 0.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func pageTraceFree(pp *p, now int64, base, npages uintptr) {
+ if pageTrace.enabled {
+ if now == 0 {
+ now = nanotime()
+ }
+ pageTraceEmit(pp, now, base, npages, pageTraceFreeEvent)
+ }
+}
+
+// pageTraceScav records a page trace scavenge event.
+// pp may be nil. Call only if debug.pagetracefd != 0.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func pageTraceScav(pp *p, now int64, base, npages uintptr) {
+ if pageTrace.enabled {
+ if now == 0 {
+ now = nanotime()
+ }
+ pageTraceEmit(pp, now, base, npages, pageTraceScavEvent)
+ }
+}
+
+// pageTraceEventType is a page trace event type.
+type pageTraceEventType uint8
+
+const (
+ pageTraceSyncEvent pageTraceEventType = iota // Timestamp emission.
+ pageTraceAllocEvent // Allocation of pages.
+ pageTraceFreeEvent // Freeing pages.
+ pageTraceScavEvent // Scavenging pages.
+)
+
+// pageTraceEmit emits a page trace event.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func pageTraceEmit(pp *p, now int64, base, npages uintptr, typ pageTraceEventType) {
+ // Get a buffer.
+ var tbp *pageTraceBuf
+ pid := int32(-1)
+ if pp == nil {
+ // We have no P, so take the global buffer.
+ lock(&pageTrace.lock)
+ tbp = &pageTrace.buf
+ } else {
+ tbp = &pp.pageTraceBuf
+ pid = pp.id
+ }
+
+ // Initialize the buffer if necessary.
+ tb := *tbp
+ if tb.buf == nil {
+ tb.buf = (*pageTraceEvents)(sysAlloc(pageTraceBufSize, &memstats.other_sys))
+ tb = tb.writePid(pid)
+ }
+
+ // Handle timestamp and emit a sync event if necessary.
+ if now < tb.timeBase {
+ now = tb.timeBase
+ }
+ if now-tb.timeBase >= pageTraceTimeMaxDelta {
+ tb.timeBase = now
+ tb = tb.writeSync(pid)
+ }
+
+ // Emit the event.
+ tb = tb.writeEvent(pid, now, base, npages, typ)
+
+ // Write back the buffer.
+ *tbp = tb
+ if pp == nil {
+ unlock(&pageTrace.lock)
+ }
+}
+
+const (
+ pageTraceBufSize = 32 << 10
+
+ // These constants describe the per-event timestamp delta encoding.
+ pageTraceTimeLostBits = 7 // How many bits of precision we lose in the delta.
+ pageTraceTimeDeltaBits = 16 // Size of the delta in bits.
+ pageTraceTimeMaxDelta = 1 << (pageTraceTimeLostBits + pageTraceTimeDeltaBits)
+)
+
+// pageTraceEvents is the low-level buffer containing the trace data.
+type pageTraceEvents struct {
+ _ sys.NotInHeap
+ events [pageTraceBufSize / 8]uint64
+}
+
+// pageTraceBuf is a wrapper around pageTraceEvents that knows how to write events
+// to the buffer. It tracks state necessary to do so.
+type pageTraceBuf struct {
+ buf *pageTraceEvents
+ len int // How many events have been written so far.
+ timeBase int64 // The current timestamp base from which deltas are produced.
+ finished bool // Whether this trace buf should no longer flush anything out.
+}
+
+// writePid writes a P ID event indicating which P we're running on.
+//
+// Assumes there's always space in the buffer since this is only called at the
+// beginning of a new buffer.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func (tb pageTraceBuf) writePid(pid int32) pageTraceBuf {
+ e := uint64(int64(pid))<<3 | 0b100 | uint64(pageTraceSyncEvent)
+ tb.buf.events[tb.len] = e
+ tb.len++
+ return tb
+}
+
+// writeSync writes a sync event, which is just a timestamp. Handles flushing.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func (tb pageTraceBuf) writeSync(pid int32) pageTraceBuf {
+ if tb.len+1 > len(tb.buf.events) {
+ // N.B. flush will writeSync again.
+ return tb.flush(pid, tb.timeBase)
+ }
+ e := ((uint64(tb.timeBase) >> pageTraceTimeLostBits) << 3) | uint64(pageTraceSyncEvent)
+ tb.buf.events[tb.len] = e
+ tb.len++
+ return tb
+}
+
+// writeEvent handles writing all non-sync and non-pid events. Handles flushing if necessary.
+//
+// pid indicates the P we're currently running on. Necessary in case we need to flush.
+// now is the current nanotime timestamp.
+// base is the base address of whatever group of pages this event is happening to.
+// npages is the length of the group of pages this event is happening to.
+// typ is the event that's happening to these pages.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func (tb pageTraceBuf) writeEvent(pid int32, now int64, base, npages uintptr, typ pageTraceEventType) pageTraceBuf {
+ large := 0
+ np := npages
+ if npages >= 1024 {
+ large = 1
+ np = 0
+ }
+ if tb.len+1+large > len(tb.buf.events) {
+ tb = tb.flush(pid, now)
+ }
+ if base%pageSize != 0 {
+ throw("base address not page aligned")
+ }
+ e := uint64(base)
+ // The pageShift low-order bits are zero.
+ e |= uint64(typ) // 2 bits
+ e |= uint64(large) << 2 // 1 bit
+ e |= uint64(np) << 3 // 10 bits
+ // Write the timestamp delta in the upper pageTraceTimeDeltaBits.
+ e |= uint64((now-tb.timeBase)>>pageTraceTimeLostBits) << (64 - pageTraceTimeDeltaBits)
+ tb.buf.events[tb.len] = e
+ if large != 0 {
+ // npages doesn't fit in 10 bits, so write an additional word with that data.
+ tb.buf.events[tb.len+1] = uint64(npages)
+ }
+ tb.len += 1 + large
+ return tb
+}
+
+// flush writes out the contents of the buffer to pageTrace.fd and resets the buffer.
+// It then writes out a P ID event and the first sync event for the new buffer.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func (tb pageTraceBuf) flush(pid int32, now int64) pageTraceBuf {
+ if !tb.finished {
+ lock(&pageTrace.fdLock)
+ writeFull(uintptr(pageTrace.fd), (*byte)(unsafe.Pointer(&tb.buf.events[0])), tb.len*8)
+ unlock(&pageTrace.fdLock)
+ }
+ tb.len = 0
+ tb.timeBase = now
+ return tb.writePid(pid).writeSync(pid)
+}
+
+var pageTrace struct {
+ // enabled indicates whether tracing is enabled. If true, fd >= 0.
+ //
+ // Safe to read without synchronization because it's only set once
+ // at program initialization.
+ enabled bool
+
+ // buf is the page trace buffer used if there is no P.
+ //
+ // lock protects buf.
+ lock mutex
+ buf pageTraceBuf
+
+ // fdLock protects writing to fd.
+ //
+ // fd is the file to write the page trace to.
+ fdLock mutex
+ fd int32
+}
+
+// initPageTrace initializes the page tracing infrastructure from GODEBUG.
+//
+// env must be the value of the GODEBUG environment variable.
+func initPageTrace(env string) {
+ var value string
+ for env != "" {
+ elt, rest := env, ""
+ for i := 0; i < len(env); i++ {
+ if env[i] == ',' {
+ elt, rest = env[:i], env[i+1:]
+ break
+ }
+ }
+ env = rest
+ if hasPrefix(elt, "pagetrace=") {
+ value = elt[len("pagetrace="):]
+ break
+ }
+ }
+ pageTrace.fd = -1
+ if canCreateFile && value != "" {
+ var tmp [4096]byte
+ if len(value) != 0 && len(value) < 4096 {
+ copy(tmp[:], value)
+ pageTrace.fd = create(&tmp[0], 0o664)
+ }
+ }
+ pageTrace.enabled = pageTrace.fd >= 0
+}
+
+// finishPageTrace flushes all P's trace buffers and disables page tracing.
+func finishPageTrace() {
+ if !pageTrace.enabled {
+ return
+ }
+ // Grab worldsema as we're about to execute a ragged barrier.
+ semacquire(&worldsema)
+ systemstack(func() {
+ // Disable tracing. This isn't strictly necessary and it's best-effort.
+ pageTrace.enabled = false
+
+ // Execute a ragged barrier, flushing each trace buffer.
+ forEachP(func(pp *p) {
+ if pp.pageTraceBuf.buf != nil {
+ pp.pageTraceBuf = pp.pageTraceBuf.flush(pp.id, nanotime())
+ }
+ pp.pageTraceBuf.finished = true
+ })
+
+ // Write the global have-no-P buffer.
+ lock(&pageTrace.lock)
+ if pageTrace.buf.buf != nil {
+ pageTrace.buf = pageTrace.buf.flush(-1, nanotime())
+ }
+ pageTrace.buf.finished = true
+ unlock(&pageTrace.lock)
+
+ // Safely close the file as nothing else should be allowed to write to the fd.
+ lock(&pageTrace.fdLock)
+ closefd(pageTrace.fd)
+ pageTrace.fd = -1
+ unlock(&pageTrace.fdLock)
+ })
+ semrelease(&worldsema)
+}
+
+// writeFull ensures that a complete write of bn bytes from b is made to fd.
+func writeFull(fd uintptr, b *byte, bn int) {
+ for bn > 0 {
+ n := write(fd, unsafe.Pointer(b), int32(bn))
+ if n == -_EINTR || n == -_EAGAIN {
+ continue
+ }
+ if n < 0 {
+ print("errno=", -n, "\n")
+ throw("writeBytes: bad write")
+ }
+ bn -= int(n)
+ b = addb(b, uintptr(n))
+ }
+}
_GoidCacheBatch = 16
)
-// cpuinit extracts the environment variable GODEBUG from the environment on
-// Unix-like operating systems and calls internal/cpu.Initialize.
-func cpuinit() {
- const prefix = "GODEBUG="
- var env string
-
+// cpuinit sets up CPU feature flags and calls internal/cpu.Initialize. env should be the complete
+// value of the GODEBUG environment variable.
+func cpuinit(env string) {
switch GOOS {
case "aix", "darwin", "ios", "dragonfly", "freebsd", "netbsd", "openbsd", "illumos", "solaris", "linux":
cpu.DebugOptions = true
+ }
+ cpu.Initialize(env)
+
+ // Support cpu feature variables are used in code generated by the compiler
+ // to guard execution of instructions that can not be assumed to be always supported.
+ switch GOARCH {
+ case "386", "amd64":
+ x86HasPOPCNT = cpu.X86.HasPOPCNT
+ x86HasSSE41 = cpu.X86.HasSSE41
+ x86HasFMA = cpu.X86.HasFMA
+
+ case "arm":
+ armHasVFPv4 = cpu.ARM.HasVFPv4
+
+ case "arm64":
+ arm64HasATOMICS = cpu.ARM64.HasATOMICS
+ }
+}
+// getGodebugEarly extracts the environment variable GODEBUG from the environment on
+// Unix-like operating systems and returns it. This function exists to extract GODEBUG
+// early before much of the runtime is initialized.
+func getGodebugEarly() string {
+ const prefix = "GODEBUG="
+ var env string
+ switch GOOS {
+ case "aix", "darwin", "ios", "dragonfly", "freebsd", "netbsd", "openbsd", "illumos", "solaris", "linux":
// Similar to goenv_unix but extracts the environment value for
// GODEBUG directly.
// TODO(moehrmann): remove when general goenvs() can be called before cpuinit()
}
}
}
-
- cpu.Initialize(env)
-
- // Support cpu feature variables are used in code generated by the compiler
- // to guard execution of instructions that can not be assumed to be always supported.
- switch GOARCH {
- case "386", "amd64":
- x86HasPOPCNT = cpu.X86.HasPOPCNT
- x86HasSSE41 = cpu.X86.HasSSE41
- x86HasFMA = cpu.X86.HasFMA
-
- case "arm":
- armHasVFPv4 = cpu.ARM.HasVFPv4
-
- case "arm64":
- arm64HasATOMICS = cpu.ARM64.HasATOMICS
- }
+ return env
}
// The bootstrap sequence is:
moduledataverify()
stackinit()
mallocinit()
- cpuinit() // must run before alginit
- alginit() // maps, hash, fastrand must not be used before this call
- fastrandinit() // must run before mcommoninit
+ godebug := getGodebugEarly()
+ initPageTrace(godebug) // must run after mallocinit but before anything allocates
+ cpuinit(godebug) // must run before alginit
+ alginit() // maps, hash, fastrand must not be used before this call
+ fastrandinit() // must run before mcommoninit
mcommoninit(gp.m, -1)
modulesinit() // provides activeModules
typelinksinit() // uses maps, activeModules