runtime: improve tickspersecond

author Michael Anthony Knyszek <mknyszek@google.com>

Wed, 1 Nov 2023 20:00:27 +0000 (20:00 +0000)

committer Gopher Robot <gobot@golang.org>

Thu, 9 Nov 2023 22:38:24 +0000 (22:38 +0000)
author Michael Anthony Knyszek <mknyszek@google.com>
Wed, 1 Nov 2023 20:00:27 +0000 (20:00 +0000)
committer Gopher Robot <gobot@golang.org>
Thu, 9 Nov 2023 22:38:24 +0000 (22:38 +0000)
diff --git a/src/runtime/cpuprof.go b/src/runtime/cpuprof.go

index 72f589944d1a50c6df760b8aa1f120aaf058bdf3..331484b1ff2640b64336b321d4ab7a19fc926f5f 100644 (file)
--- a/src/runtime/cpuprof.go
+++ b/src/runtime/cpuprof.go
@@ -211,7 +211,7 @@ func CPUProfile() []byte {
  
  //go:linkname runtime_pprof_runtime_cyclesPerSecond runtime/pprof.runtime_cyclesPerSecond
  func runtime_pprof_runtime_cyclesPerSecond() int64 {
-       return tickspersecond()
+       return ticksPerSecond()
  }
  
  // readProfile, provided to runtime/pprof, returns the next chunk of
diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go

index df939b653217874069c763a806c45b61304148a7..ef485a3cfcdfb9d4ccbd28a038b481e8028eea81 100644 (file)
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -471,7 +471,7 @@ func SetBlockProfileRate(rate int) {
                 r = 1 // profile everything
         } else {
                 // convert ns to cycles, use float64 to prevent overflow during multiplication
-               r = int64(float64(rate) * float64(tickspersecond()) / (1000 * 1000 * 1000))
+               r = int64(float64(rate) * float64(ticksPerSecond()) / (1000 * 1000 * 1000))
                 if r == 0 {
                         r = 1
                 }
diff --git a/src/runtime/proc.go b/src/runtime/proc.go

index 159c19caf35dcacd8b81c659121d648b542daf06..d0cb998cffd89acaf5185d288a1c73fc8aaf6e5e 100644 (file)
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -8,6 +8,7 @@ import (
         "internal/abi"
         "internal/cpu"
         "internal/goarch"
+       "internal/goos"
         "runtime/internal/atomic"
         "runtime/internal/sys"
         "unsafe"
@@ -772,6 +773,7 @@ func schedinit() {
         // The world starts stopped.
         worldStopped()
  
+       ticks.init() // run as early as possible
         moduledataverify()
         stackinit()
         mallocinit()
@@ -937,6 +939,20 @@ func (mp *m) hasCgoOnStack() bool {
         return mp.ncgo > 0 || mp.isextra
  }
  
+const (
+       // osHasLowResTimer indicates that the platform's internal timer system has a low resolution,
+       // typically on the order of 1 ms or more.
+       osHasLowResTimer = GOOS == "windows" || GOOS == "openbsd" || GOOS == "netbsd"
+
+       // osHasLowResClockInt is osHasLowResClock but in integer form, so it can be used to create
+       // constants conditionally.
+       osHasLowResClockInt = goos.IsWindows
+
+       // osHasLowResClock indicates that timestamps produced by nanotime on the platform have a
+       // low resolution, typically on the order of 1 ms or more.
+       osHasLowResClock = osHasLowResClockInt > 0
+)
+
  var fastrandseed uintptr
  
  func fastrandinit() {
@@ -6599,7 +6615,7 @@ func runqgrab(pp *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool)
                                                 // between different Ps.
                                                 // A sync chan send/recv takes ~50ns as of time of
                                                 // writing, so 3us gives ~50x overshoot.
-                                               if GOOS != "windows" && GOOS != "openbsd" && GOOS != "netbsd" {
+                                               if !osHasLowResTimer {
                                                         usleep(3)
                                                 } else {
                                                         // On some platforms system timer granularity is
diff --git a/src/runtime/runtime.go b/src/runtime/runtime.go

index 0822d0e8054e7b53f6347f8bcdbb1228dd82d80a..0829a84e436cdc8412ec8a8d84d082adf94796d6 100644 (file)
--- a/src/runtime/runtime.go
+++ b/src/runtime/runtime.go
@@ -17,34 +17,100 @@ import (
  var ticks ticksType
  
  type ticksType struct {
-       lock mutex
-       val  atomic.Int64
+       // lock protects access to start* and val.
+       lock       mutex
+       startTicks int64
+       startTime  int64
+       val        atomic.Int64
  }
  
-// Note: Called by runtime/pprof in addition to runtime code.
-func tickspersecond() int64 {
+// init initializes ticks to maximize the chance that we have a good ticksPerSecond reference.
+//
+// Must not run concurrently with ticksPerSecond.
+func (t *ticksType) init() {
+       lock(&ticks.lock)
+       t.startTime = nanotime()
+       t.startTicks = cputicks()
+       unlock(&ticks.lock)
+}
+
+// minTimeForTicksPerSecond is the minimum elapsed time we require to consider our ticksPerSecond
+// measurement to be of decent enough quality for profiling.
+//
+// There's a linear relationship here between minimum time and error from the true value.
+// The error from the true ticks-per-second in a linux/amd64 VM seems to be:
+// -   1 ms -> ~0.02% error
+// -   5 ms -> ~0.004% error
+// -  10 ms -> ~0.002% error
+// -  50 ms -> ~0.0003% error
+// - 100 ms -> ~0.0001% error
+//
+// We're willing to take 0.004% error here, because ticksPerSecond is intended to be used for
+// converting durations, not timestamps. Durations are usually going to be much larger, and so
+// the tiny error doesn't matter. The error is definitely going to be a problem when trying to
+// use this for timestamps, as it'll make those timestamps much less likely to line up.
+const minTimeForTicksPerSecond = 5_000_000*(1-osHasLowResClockInt) + 100_000_000*osHasLowResClockInt
+
+// ticksPerSecond returns a conversion rate between the cputicks clock and the nanotime clock.
+//
+// Note: Clocks are hard. Using this as an actual conversion rate for timestamps is ill-advised
+// and should be avoided when possible. Use only for durations, where a tiny error term isn't going
+// to make a meaningful difference in even a 1ms duration. If an accurate timestamp is needed,
+// use nanotime instead. (The entire Windows platform is a broad exception to this rule, where nanotime
+// produces timestamps on such a coarse granularity that the error from this conversion is actually
+// preferable.)
+//
+// The strategy for computing the conversion rate is to write down nanotime and cputicks as
+// early in process startup as possible. From then, we just need to wait until we get values
+// from nanotime that we can use (some platforms have a really coarse system time granularity).
+// We require some amount of time to pass to ensure that the conversion rate is fairly accurate
+// in aggregate. But because we compute this rate lazily, there's a pretty good chance a decent
+// amount of time has passed by the time we get here.
+//
+// Must be called from a normal goroutine context (running regular goroutine with a P).
+//
+// Called by runtime/pprof in addition to runtime code.
+//
+// TODO(mknyszek): This doesn't account for things like CPU frequency scaling. Consider
+// a more sophisticated and general approach in the future.
+func ticksPerSecond() int64 {
+       // Get the conversion rate if we've already computed it.
         r := ticks.val.Load()
         if r != 0 {
                 return r
         }
-       lock(&ticks.lock)
-       r = ticks.val.Load()
-       if r == 0 {
-               t0 := nanotime()
-               c0 := cputicks()
-               usleep(100 * 1000)
-               t1 := nanotime()
-               c1 := cputicks()
-               if t1 == t0 {
-                       t1++
+
+       // Compute the conversion rate.
+       for {
+               lock(&ticks.lock)
+               r = ticks.val.Load()
+               if r != 0 {
+                       unlock(&ticks.lock)
+                       return r
                 }
-               r = (c1 - c0) * 1000 * 1000 * 1000 / (t1 - t0)
-               if r == 0 {
-                       r++
+
+               // Grab the current time in both clocks.
+               nowTime := nanotime()
+               nowTicks := cputicks()
+
+               // See if we can use these times.
+               if nowTicks > ticks.startTicks && nowTime-ticks.startTime > minTimeForTicksPerSecond {
+                       // Perform the calculation with floats. We don't want to risk overflow.
+                       r = int64(float64(nowTicks-ticks.startTicks) * 1e9 / float64(nowTime-ticks.startTime))
+                       if r == 0 {
+                               // Zero is both a sentinel value and it would be bad if callers used this as
+                               // a divisor. We tried out best, so just make it 1.
+                               r++
+                       }
+                       ticks.val.Store(r)
+                       unlock(&ticks.lock)
+                       break
                 }
-               ticks.val.Store(r)
+               unlock(&ticks.lock)
+
+               // Sleep in one millisecond increments until we have a reliable time.
+               timeSleep(1_000_000)
         }
-       unlock(&ticks.lock)
         return r
  }
author	Michael Anthony Knyszek <mknyszek@google.com>
	Wed, 1 Nov 2023 20:00:27 +0000 (20:00 +0000)
committer	Gopher Robot <gobot@golang.org>
	Thu, 9 Nov 2023 22:38:24 +0000 (22:38 +0000)
src/runtime/cpuprof.go		patch \| blob \| history
src/runtime/mprof.go		patch \| blob \| history
src/runtime/proc.go		patch \| blob \| history
src/runtime/runtime.go		patch \| blob \| history