func Div64(hi, lo, y uint64) (quo, rem uint64) {...}
math/bits.Div64 returns the quotient and remainder of (hi, lo) divided
by y. When hi is zero, we can directly return lo/y, lo%y, which can save
a lot of unnecessary calculations.
The performance is measured on arm64 and the changes will only affect
the arch that doesn't have the intrinsic.
name old time/op new time/op delta
DivWVW/1-10 4.62ns ± 1% 2.45ns ± 1% -46.97%
DivWVW/2-10 12.4ns ± 0% 12.2ns ± 0% -1.38%
DivWVW/3-10 17.4ns ± 1% 17.2ns ± 0% -0.88%
DivWVW/4-10 21.4ns ± 1% 21.6ns ± 0% +0.75%
DivWVW/5-10 22.1ns ± 1% 21.9ns ± 0% -0.69%
DivWVW/10-10 53.4ns ± 1% 53.0ns ± 1% -0.69%
DivWVW/100-10 641ns ± 1% 633ns ± 0% -1.26%
DivWVW/1000-10 5.52µs ± 1% 5.44µs ± 0% -1.39%
DivWVW/10000-10 54.9µs ± 1% 54.7µs ± 1% -0.54%
DivWVW/100000-10 646µs ± 1% 643µs ± 1% ~
name old speed new speed delta
DivWVW/1-10 13.8GB/s ± 1% 26.1GB/s ± 1% +88.57%
DivWVW/2-10 10.3GB/s ± 0% 10.5GB/s ± 0% +1.39%
DivWVW/3-10 11.1GB/s ± 1% 11.2GB/s ± 0% +0.90%
DivWVW/4-10 12.0GB/s ± 1% 11.9GB/s ± 0% -0.74%
DivWVW/5-10 14.5GB/s ± 1% 14.6GB/s ± 0% +0.69%
DivWVW/10-10 12.0GB/s ± 1% 12.1GB/s ± 1% +0.69%
DivWVW/100-10 10.0GB/s ± 1% 10.1GB/s ± 0% +1.28%
DivWVW/1000-10 11.6GB/s ± 1% 11.8GB/s ± 0% +1.41%
DivWVW/10000-10 11.6GB/s ± 1% 11.7GB/s ± 1% +0.54%
DivWVW/100000-10 9.91GB/s ± 1% 9.95GB/s ± 1% ~
Change-Id: I12014c2e2cdb2c91608079f7502592307af9e525
Reviewed-on: https://go-review.googlesource.com/c/go/+/449776
Reviewed-by: Keith Randall <khr@google.com>
Run-TryBot: Eric Fang <eric.fang@arm.com>
Reviewed-by: Robert Griesemer <gri@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
// half in parameter hi and the lower half in parameter lo.
// Div64 panics for y == 0 (division by zero) or y <= hi (quotient overflow).
func Div64(hi, lo, y uint64) (quo, rem uint64) {
- const (
- two32 = 1 << 32
- mask32 = two32 - 1
- )
if y == 0 {
panic(divideError)
}
panic(overflowError)
}
+ // If high part is zero, we can directly return the results.
+ if hi == 0 {
+ return lo / y, lo % y
+ }
+
s := uint(LeadingZeros64(y))
y <<= s
+ const (
+ two32 = 1 << 32
+ mask32 = two32 - 1
+ )
yn1 := y >> 32
yn0 := y & mask32
un32 := hi<<s | lo>>(64-s)