From: Michael Anthony Knyszek
Date: Wed, 1 Nov 2023 20:00:27 +0000 (+0000)
Subject: runtime: improve tickspersecond
X-Git-Tag: go1.22rc1~357
X-Git-Url: http://www.git.cypherpunks.ru/?a=commitdiff_plain;h=130baf3d42f0d7e911e9cae7e7ded8e0c6dd6919;p=gostls13.git

runtime: improve tickspersecond

Currently tickspersecond forces a 100 millisecond sleep the first time
it's called. This isn't great for profiling short-lived programs, since
both CPU profiling and block profiling might call into it. 100
milliseconds is a long time, but it's chosen to try to capture a decent
estimate of the conversion on platforms with coarse-granularity clocks.
If the granularity is 15 ms, it'll only be 15% off at worst.

Let's try a different strategy. First, let's require 5 milliseconds of
time to have elapsed at a minimum. This should be plenty on platforms
with nanosecond time granularity from the system clock, provided the
caller of tickspersecond intends to use it for calculating durations,
not timestamps. Next, grab a timestamp as close to process start as
possible, so that we can cover some of that 5-millisecond minimum just
during runtime start. Finally, this function is only ever called from
normal goroutine contexts. Let's do a regular goroutine sleep instead
of a thread-level sleep under a runtime lock, which has all sorts of
nasty effects on preemption.

While we're here, let's also rename tickspersecond to ticksPerSecond.
Also, let's write down some explicit rules of thumb on when to use this
function. Clocks are hard, and using this for timestamp conversion is
likely to make lining up those timestamps with other clocks on the
system difficult if not impossible.

Note that while this improves ticksPerSecond on platforms with good
clocks, we still end up with a pretty coarse sleep on platforms with
coarse clocks, and a pretty coarse result. On these platforms, keep the
minimum required elapsed time at 100 ms. There's not much we can do
about these platforms except spin and try to catch the clock boundary,
but at 10+ ms of granularity, that might be a lot of spinning.

Fixes #63103.
Fixes #63078.
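To illustrate the approach outside the runtime, here is a minimal,
hypothetical user-space sketch of the same strategy (not part of this
change; readTicks is a stand-in for the runtime-internal cputicks, and
the 5 ms floor mirrors minTimeForTicksPerSecond on fine-grained
clocks):

	package main

	import (
		"fmt"
		"time"
	)

	// tickEstimator captures a (ticks, nanotime) reference pair as early
	// as possible, then computes ticks-per-second lazily once enough
	// wall-clock time has elapsed.
	type tickEstimator struct {
		startTicks int64
		startNanos int64
	}

	// readTicks is a hypothetical stand-in for the runtime-internal
	// cputicks; it fabricates a 3 GHz tick counter from the system clock.
	func readTicks() int64 { return 3 * time.Now().UnixNano() }

	// minElapsed mirrors the 5 ms minimum used on platforms with
	// fine-grained clocks.
	const minElapsed = 5 * time.Millisecond

	func newTickEstimator() *tickEstimator {
		return &tickEstimator{startTicks: readTicks(), startNanos: time.Now().UnixNano()}
	}

	// rate sleeps in one-millisecond increments (a regular sleep, not a
	// thread-level sleep under a lock) until the minimum time has
	// elapsed, then computes the conversion rate with float math to
	// avoid overflow.
	func (e *tickEstimator) rate() int64 {
		for {
			nowTicks := readTicks()
			nowNanos := time.Now().UnixNano()
			if d := nowNanos - e.startNanos; d > int64(minElapsed) && nowTicks > e.startTicks {
				r := int64(float64(nowTicks-e.startTicks) * 1e9 / float64(d))
				if r == 0 {
					r = 1 // callers divide by the result, so never return 0
				}
				return r
			}
			time.Sleep(time.Millisecond)
		}
	}

	func main() {
		e := newTickEstimator()
		fmt.Println("estimated ticks per second:", e.rate()) // ~3e9 with the fake clock
	}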
Change-Id: Ic32a4ba70a03bdf5c13cb80c2669c4064aa4cca2
Reviewed-on: https://go-review.googlesource.com/c/go/+/538898
Auto-Submit: Michael Knyszek
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Mauri de Souza Meneguzzo
Reviewed-by: Michael Pratt
---

diff --git a/src/runtime/cpuprof.go b/src/runtime/cpuprof.go
index 72f589944d..331484b1ff 100644
--- a/src/runtime/cpuprof.go
+++ b/src/runtime/cpuprof.go
@@ -211,7 +211,7 @@ func CPUProfile() []byte {
 
 //go:linkname runtime_pprof_runtime_cyclesPerSecond runtime/pprof.runtime_cyclesPerSecond
 func runtime_pprof_runtime_cyclesPerSecond() int64 {
-	return tickspersecond()
+	return ticksPerSecond()
 }
 
 // readProfile, provided to runtime/pprof, returns the next chunk of
diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index df939b6532..ef485a3cfc 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -471,7 +471,7 @@ func SetBlockProfileRate(rate int) {
 		r = 1 // profile everything
 	} else {
 		// convert ns to cycles, use float64 to prevent overflow during multiplication
-		r = int64(float64(rate) * float64(tickspersecond()) / (1000 * 1000 * 1000))
+		r = int64(float64(rate) * float64(ticksPerSecond()) / (1000 * 1000 * 1000))
 		if r == 0 {
 			r = 1
 		}
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 159c19caf3..d0cb998cff 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -8,6 +8,7 @@ import (
 	"internal/abi"
 	"internal/cpu"
 	"internal/goarch"
+	"internal/goos"
 	"runtime/internal/atomic"
 	"runtime/internal/sys"
 	"unsafe"
@@ -772,6 +773,7 @@ func schedinit() {
 	// The world starts stopped.
 	worldStopped()
 
+	ticks.init() // run as early as possible
 	moduledataverify()
 	stackinit()
 	mallocinit()
@@ -937,6 +939,20 @@ func (mp *m) hasCgoOnStack() bool {
 	return mp.ncgo > 0 || mp.isextra
 }
 
+const (
+	// osHasLowResTimer indicates that the platform's internal timer system has a low resolution,
+	// typically on the order of 1 ms or more.
+	osHasLowResTimer = GOOS == "windows" || GOOS == "openbsd" || GOOS == "netbsd"
+
+	// osHasLowResClockInt is osHasLowResClock but in integer form, so it can be used to create
+	// constants conditionally.
+	osHasLowResClockInt = goos.IsWindows
+
+	// osHasLowResClock indicates that timestamps produced by nanotime on the platform have a
+	// low resolution, typically on the order of 1 ms or more.
+	osHasLowResClock = osHasLowResClockInt > 0
+)
+
 var fastrandseed uintptr
 
 func fastrandinit() {
@@ -6599,7 +6615,7 @@ func runqgrab(pp *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool)
 				// between different Ps.
 				// A sync chan send/recv takes ~50ns as of time of
 				// writing, so 3us gives ~50x overshoot.
-				if GOOS != "windows" && GOOS != "openbsd" && GOOS != "netbsd" {
+				if !osHasLowResTimer {
 					usleep(3)
 				} else {
 					// On some platforms system timer granularity is
diff --git a/src/runtime/runtime.go b/src/runtime/runtime.go
index 0822d0e805..0829a84e43 100644
--- a/src/runtime/runtime.go
+++ b/src/runtime/runtime.go
@@ -17,34 +17,100 @@ import (
 
 var ticks ticksType
 
 type ticksType struct {
-	lock mutex
-	val  atomic.Int64
+	// lock protects access to start* and val.
+	lock       mutex
+	startTicks int64
+	startTime  int64
+	val        atomic.Int64
 }
 
-// Note: Called by runtime/pprof in addition to runtime code.
-func tickspersecond() int64 {
+// init initializes ticks to maximize the chance that we have a good ticksPerSecond reference.
+//
+// Must not run concurrently with ticksPerSecond.
+func (t *ticksType) init() {
+	lock(&ticks.lock)
+	t.startTime = nanotime()
+	t.startTicks = cputicks()
+	unlock(&ticks.lock)
+}
+
+// minTimeForTicksPerSecond is the minimum elapsed time we require to consider our ticksPerSecond
+// measurement to be of decent enough quality for profiling.
+//
+// There's a linear relationship here between minimum time and error from the true value.
+// The error from the true ticks-per-second in a linux/amd64 VM seems to be:
+// - 1 ms -> ~0.02% error
+// - 5 ms -> ~0.004% error
+// - 10 ms -> ~0.002% error
+// - 50 ms -> ~0.0003% error
+// - 100 ms -> ~0.0001% error
+//
+// We're willing to take 0.004% error here, because ticksPerSecond is intended to be used for
+// converting durations, not timestamps. Durations are usually going to be much larger, and so
+// the tiny error doesn't matter. The error is definitely going to be a problem when trying to
+// use this for timestamps, as it'll make those timestamps much less likely to line up.
+const minTimeForTicksPerSecond = 5_000_000*(1-osHasLowResClockInt) + 100_000_000*osHasLowResClockInt
+
+// ticksPerSecond returns a conversion rate between the cputicks clock and the nanotime clock.
+//
+// Note: Clocks are hard. Using this as an actual conversion rate for timestamps is ill-advised
+// and should be avoided when possible. Use only for durations, where a tiny error term isn't going
+// to make a meaningful difference in even a 1ms duration. If an accurate timestamp is needed,
+// use nanotime instead. (The entire Windows platform is a broad exception to this rule, where nanotime
+// produces timestamps on such a coarse granularity that the error from this conversion is actually
+// preferable.)
+//
+// The strategy for computing the conversion rate is to write down nanotime and cputicks as
+// early in process startup as possible. From then, we just need to wait until we get values
+// from nanotime that we can use (some platforms have a really coarse system time granularity).
+// We require some amount of time to pass to ensure that the conversion rate is fairly accurate
+// in aggregate. But because we compute this rate lazily, there's a pretty good chance a decent
+// amount of time has passed by the time we get here.
+//
+// Must be called from a normal goroutine context (running regular goroutine with a P).
+//
+// Called by runtime/pprof in addition to runtime code.
+//
+// TODO(mknyszek): This doesn't account for things like CPU frequency scaling. Consider
+// a more sophisticated and general approach in the future.
+func ticksPerSecond() int64 {
+	// Get the conversion rate if we've already computed it.
 	r := ticks.val.Load()
 	if r != 0 {
 		return r
 	}
-	lock(&ticks.lock)
-	r = ticks.val.Load()
-	if r == 0 {
-		t0 := nanotime()
-		c0 := cputicks()
-		usleep(100 * 1000)
-		t1 := nanotime()
-		c1 := cputicks()
-		if t1 == t0 {
-			t1++
+
+	// Compute the conversion rate.
+	for {
+		lock(&ticks.lock)
+		r = ticks.val.Load()
+		if r != 0 {
+			unlock(&ticks.lock)
+			return r
 		}
-		r = (c1 - c0) * 1000 * 1000 * 1000 / (t1 - t0)
-		if r == 0 {
-			r++
+
+		// Grab the current time in both clocks.
+		nowTime := nanotime()
+		nowTicks := cputicks()
+
+		// See if we can use these times.
+		if nowTicks > ticks.startTicks && nowTime-ticks.startTime > minTimeForTicksPerSecond {
+			// Perform the calculation with floats. We don't want to risk overflow.
+			r = int64(float64(nowTicks-ticks.startTicks) * 1e9 / float64(nowTime-ticks.startTime))
+			if r == 0 {
+				// Zero is both a sentinel value and it would be bad if callers used this as
+				// a divisor. We tried our best, so just make it 1.
+				r++
+			}
+			ticks.val.Store(r)
+			unlock(&ticks.lock)
+			break
 		}
-		ticks.val.Store(r)
+		unlock(&ticks.lock)
+
+		// Sleep in one millisecond increments until we have a reliable time.
+		timeSleep(1_000_000)
 	}
-	unlock(&ticks.lock)
 	return r
 }
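For reference, the duration-only usage that the new doc comment
prescribes could look like the following minimal sketch
(cyclesToNanoseconds is a hypothetical helper, not part of this change;
inside the runtime the rate would come from ticksPerSecond):

	package main

	import "fmt"

	// cyclesToNanoseconds converts an elapsed cycle count into nanoseconds
	// given a ticks-per-second rate. A small relative error in rate yields
	// the same small relative error in the duration, whereas converting
	// absolute timestamps would produce a skew that grows with the
	// timestamp's magnitude.
	func cyclesToNanoseconds(cycles, rate int64) int64 {
		return int64(float64(cycles) * 1e9 / float64(rate))
	}

	func main() {
		const rate = 3_000_000_000 // hypothetical 3 GHz tick rate
		fmt.Println(cyclesToNanoseconds(1_500_000, rate), "ns") // prints 500000 ns
	}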