From f2e26372278eda8b272597c99be5fe1df8496896 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Tue, 6 Jun 2023 10:51:09 -0400 Subject: [PATCH] math/rand/v2: simplify Perm MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The compiler says Perm is being inlined into BenchmarkPerm, and yet BenchmarkPerm30ViaShuffle, which you'd think is the same code, still runs significantly faster. The benchmarks are mystifying but this is clearly still a step in the right direction, since BenchmarkPerm30ViaShuffle is still the fastest and we avoid having two copies of that logic. goos: linux goarch: amd64 pkg: math/rand/v2 cpu: AMD Ryzen 9 7950X 16-Core Processor │ e1bbe739fb.amd64 │ 8993506f2f.amd64 │ │ sec/op │ sec/op vs base │ SourceUint64-32 1.316n ± 2% 1.325n ± 1% ~ (p=0.208 n=20) GlobalInt64-32 2.048n ± 1% 2.240n ± 1% +9.38% (p=0.000 n=20) GlobalInt64Parallel-32 0.1037n ± 1% 0.1041n ± 1% ~ (p=0.774 n=20) GlobalUint64-32 2.039n ± 2% 2.072n ± 3% ~ (p=0.115 n=20) GlobalUint64Parallel-32 0.1013n ± 1% 0.1008n ± 1% ~ (p=0.417 n=20) Int64-32 1.692n ± 2% 1.716n ± 1% ~ (p=0.122 n=20) Uint64-32 1.643n ± 2% 1.665n ± 1% ~ (p=0.062 n=20) GlobalIntN1000-32 3.287n ± 1% 3.335n ± 1% ~ (p=0.147 n=20) IntN1000-32 2.678n ± 2% 2.484n ± 1% -7.24% (p=0.000 n=20) Int64N1000-32 2.684n ± 2% 2.502n ± 2% -6.80% (p=0.000 n=20) Int64N1e8-32 2.663n ± 2% 2.484n ± 2% -6.76% (p=0.000 n=20) Int64N1e9-32 2.633n ± 1% 2.502n ± 0% -4.98% (p=0.000 n=20) Int64N2e9-32 2.657n ± 1% 2.502n ± 0% -5.87% (p=0.000 n=20) Int64N1e18-32 3.125n ± 2% 3.201n ± 1% +2.43% (p=0.000 n=20) Int64N2e18-32 3.476n ± 1% 3.504n ± 1% +0.83% (p=0.009 n=20) Int64N4e18-32 4.795n ± 1% 4.873n ± 1% ~ (p=0.106 n=20) Int32N1000-32 2.485n ± 2% 2.639n ± 1% +6.20% (p=0.000 n=20) Int32N1e8-32 2.457n ± 1% 2.686n ± 2% +9.34% (p=0.000 n=20) Int32N1e9-32 2.452n ± 1% 2.636n ± 1% +7.52% (p=0.000 n=20) Int32N2e9-32 2.453n ± 1% 2.660n ± 1% +8.44% (p=0.000 n=20) Float32-32 2.254n ± 1% 2.261n ± 1% ~ (p=0.888 n=20) Float64-32 2.262n ± 1% 2.280n ± 1% ~ (p=0.040 n=20) ExpFloat64-32 3.777n ± 2% 3.891n ± 1% +3.03% (p=0.000 n=20) NormFloat64-32 3.606n ± 1% 3.711n ± 1% +2.91% (p=0.000 n=20) Perm3-32 33.12n ± 2% 32.60n ± 2% ~ (p=0.045 n=20) Perm30-32 176.1n ± 1% 204.2n ± 0% +15.96% (p=0.000 n=20) Perm30ViaShuffle-32 109.3n ± 1% 121.7n ± 2% +11.30% (p=0.000 n=20) ShuffleOverhead-32 112.5n ± 1% 106.2n ± 2% -5.56% (p=0.000 n=20) Concurrent-32 2.099n ± 0% 2.190n ± 5% +4.36% (p=0.001 n=20) goos: darwin goarch: arm64 pkg: math/rand/v2 cpu: Apple M1 │ e1bbe739fb.arm64 │ 8993506f2f.arm64 │ │ sec/op │ sec/op vs base │ SourceUint64-8 2.290n ± 1% 2.271n ± 0% ~ (p=0.015 n=20) GlobalInt64-8 2.180n ± 1% 2.161n ± 1% ~ (p=0.180 n=20) GlobalInt64Parallel-8 0.4294n ± 0% 0.4303n ± 0% +0.19% (p=0.001 n=20) GlobalUint64-8 2.170n ± 1% 2.164n ± 1% ~ (p=0.673 n=20) GlobalUint64Parallel-8 0.4283n ± 0% 0.4287n ± 0% ~ (p=0.128 n=20) Int64-8 2.481n ± 1% 2.478n ± 1% ~ (p=0.867 n=20) Uint64-8 2.464n ± 1% 2.460n ± 1% ~ (p=0.763 n=20) GlobalIntN1000-8 2.814n ± 0% 2.814n ± 2% ~ (p=0.969 n=20) IntN1000-8 2.934n ± 2% 3.003n ± 2% +2.35% (p=0.000 n=20) Int64N1000-8 2.957n ± 1% 2.954n ± 0% ~ (p=0.285 n=20) Int64N1e8-8 2.935n ± 2% 2.956n ± 0% +0.73% (p=0.002 n=20) Int64N1e9-8 2.935n ± 2% 3.325n ± 0% +13.29% (p=0.000 n=20) Int64N2e9-8 2.933n ± 4% 2.956n ± 2% ~ (p=0.163 n=20) Int64N1e18-8 3.781n ± 1% 3.780n ± 1% ~ (p=0.805 n=20) Int64N2e18-8 4.362n ± 0% 4.385n ± 0% ~ (p=0.077 n=20) Int64N4e18-8 6.576n ± 1% 6.527n ± 0% ~ (p=0.024 n=20) Int32N1000-8 2.942n ± 2% 2.964n ± 1% ~ (p=0.073 n=20) Int32N1e8-8 2.941n ± 1% 2.964n ± 1% ~ (p=0.058 n=20) Int32N1e9-8 2.938n ± 2% 2.963n ± 2% +0.87% (p=0.003 n=20) Int32N2e9-8 2.982n ± 2% 2.961n ± 2% ~ (p=0.056 n=20) Float32-8 3.441n ± 0% 3.442n ± 0% ~ (p=0.030 n=20) Float64-8 3.441n ± 0% 3.442n ± 0% +0.03% (p=0.001 n=20) ExpFloat64-8 4.472n ± 0% 4.472n ± 0% ~ (p=0.877 n=20) NormFloat64-8 4.716n ± 0% 4.734n ± 0% +0.38% (p=0.000 n=20) Perm3-8 26.66n ± 0% 26.55n ± 0% -0.39% (p=0.000 n=20) Perm30-8 143.3n ± 0% 181.9n ± 0% +26.97% (p=0.000 n=20) Perm30ViaShuffle-8 142.9n ± 0% 143.1n ± 0% ~ (p=0.669 n=20) ShuffleOverhead-8 121.1n ± 1% 120.6n ± 1% -0.41% (p=0.004 n=20) Concurrent-8 2.379n ± 2% 2.357n ± 2% ~ (p=0.337 n=20) goos: linux goarch: 386 pkg: math/rand/v2 cpu: AMD Ryzen 9 7950X 16-Core Processor │ e1bbe739fb.386 │ 8993506f2f.386 │ │ sec/op │ sec/op vs base │ SourceUint64-32 2.087n ± 1% 2.102n ± 2% ~ (p=0.507 n=20) GlobalInt64-32 3.538n ± 2% 3.542n ± 2% ~ (p=0.425 n=20) GlobalInt64Parallel-32 0.3207n ± 1% 0.3202n ± 0% ~ (p=0.963 n=20) GlobalUint64-32 3.543n ± 1% 3.507n ± 1% ~ (p=0.034 n=20) GlobalUint64Parallel-32 0.3170n ± 0% 0.3170n ± 1% ~ (p=0.920 n=20) Int64-32 2.548n ± 1% 2.516n ± 1% ~ (p=0.139 n=20) Uint64-32 2.565n ± 2% 2.544n ± 1% ~ (p=0.394 n=20) GlobalIntN1000-32 6.300n ± 1% 6.237n ± 1% ~ (p=0.029 n=20) IntN1000-32 4.750n ± 0% 4.670n ± 2% ~ (p=0.034 n=20) Int64N1000-32 5.515n ± 2% 5.412n ± 1% -1.86% (p=0.009 n=20) Int64N1e8-32 5.527n ± 0% 5.414n ± 2% -2.05% (p=0.002 n=20) Int64N1e9-32 5.531n ± 2% 5.473n ± 1% ~ (p=0.047 n=20) Int64N2e9-32 5.514n ± 2% 5.487n ± 1% ~ (p=0.298 n=20) Int64N1e18-32 9.059n ± 1% 8.901n ± 2% ~ (p=0.037 n=20) Int64N2e18-32 9.594n ± 1% 9.521n ± 1% ~ (p=0.051 n=20) Int64N4e18-32 12.05n ± 2% 11.92n ± 1% ~ (p=0.357 n=20) Int32N1000-32 4.840n ± 2% 4.785n ± 1% ~ (p=0.189 n=20) Int32N1e8-32 4.832n ± 2% 4.748n ± 1% ~ (p=0.042 n=20) Int32N1e9-32 4.815n ± 2% 4.810n ± 1% ~ (p=0.878 n=20) Int32N2e9-32 4.813n ± 1% 4.812n ± 1% ~ (p=0.542 n=20) Float32-32 10.90n ± 2% 10.48n ± 4% -3.85% (p=0.007 n=20) Float64-32 20.32n ± 4% 19.79n ± 3% ~ (p=0.553 n=20) ExpFloat64-32 12.95n ± 3% 12.91n ± 3% ~ (p=0.909 n=20) NormFloat64-32 7.570n ± 1% 7.462n ± 1% -1.44% (p=0.004 n=20) Perm3-32 37.80n ± 2% 35.98n ± 2% -4.79% (p=0.000 n=20) Perm30-32 214.0n ± 1% 241.5n ± 1% +12.85% (p=0.000 n=20) Perm30ViaShuffle-32 188.7n ± 2% 187.3n ± 2% ~ (p=0.029 n=20) ShuffleOverhead-32 160.8n ± 1% 160.2n ± 1% ~ (p=0.180 n=20) Concurrent-32 3.288n ± 0% 3.308n ± 3% ~ (p=0.037 n=20) For #61716. Change-Id: I342b611456c3569520d3c91c849d29eba325d87e Reviewed-on: https://go-review.googlesource.com/c/go/+/502504 Reviewed-by: Dmitri Shuralyov Auto-Submit: Russ Cox LUCI-TryBot-Result: Go LUCI Reviewed-by: Rob Pike --- src/math/rand/v2/example_test.go | 2 +- src/math/rand/v2/rand.go | 16 +++++----------- src/math/rand/v2/regress_test.go | 28 ++++++++++++++-------------- 3 files changed, 20 insertions(+), 26 deletions(-) diff --git a/src/math/rand/v2/example_test.go b/src/math/rand/v2/example_test.go index 55892097a8..0362111451 100644 --- a/src/math/rand/v2/example_test.go +++ b/src/math/rand/v2/example_test.go @@ -93,7 +93,7 @@ func Example_rand() { // IntN(10) 8 4 5 // Int32N(10) 1 8 5 // Int64N(10) 4 2 6 - // Perm [0 4 1 3 2] [4 0 1 3 2] [4 1 3 0 2] + // Perm [0 2 4 3 1] [0 4 2 3 1] [2 1 3 0 4] } func ExamplePerm() { diff --git a/src/math/rand/v2/rand.go b/src/math/rand/v2/rand.go index 7e8be1ac4f..c6030f77fc 100644 --- a/src/math/rand/v2/rand.go +++ b/src/math/rand/v2/rand.go @@ -230,18 +230,12 @@ func (r *Rand) Float32() float32 { // Perm returns, as a slice of n ints, a pseudo-random permutation of the integers // in the half-open interval [0,n). func (r *Rand) Perm(n int) []int { - m := make([]int, n) - // In the following loop, the iteration when i=0 always swaps m[0] with m[0]. - // A change to remove this useless iteration is to assign 1 to i in the init - // statement. But Perm also effects r. Making this change will affect - // the final state of r. So this change can't be made for compatibility - // reasons for Go 1. - for i := 0; i < n; i++ { - j := r.IntN(i + 1) - m[i] = m[j] - m[j] = i + p := make([]int, n) + for i := range p { + p[i] = i } - return m + r.Shuffle(len(p), func(i, j int) { p[i], p[j] = p[j], p[i] }) + return p } // Shuffle pseudo-randomizes the order of elements. diff --git a/src/math/rand/v2/regress_test.go b/src/math/rand/v2/regress_test.go index 0b9df9b379..541e9a7b18 100644 --- a/src/math/rand/v2/regress_test.go +++ b/src/math/rand/v2/regress_test.go @@ -437,24 +437,24 @@ var regressGolden = []any{ []int{}, // Perm(0) []int{0}, // Perm(1) - []int{0, 4, 2, 1, 3}, // Perm(5) - []int{2, 4, 5, 0, 7, 1, 3, 6}, // Perm(8) - []int{6, 4, 1, 5, 7, 3, 0, 8, 2}, // Perm(9) - []int{8, 0, 1, 2, 3, 9, 5, 4, 7, 6}, // Perm(10) - []int{0, 13, 14, 7, 1, 4, 15, 10, 11, 12, 9, 5, 3, 6, 8, 2}, // Perm(16) + []int{0, 4, 2, 3, 1}, // Perm(5) + []int{4, 5, 7, 0, 6, 3, 2, 1}, // Perm(8) + []int{2, 5, 4, 0, 7, 8, 1, 6, 3}, // Perm(9) + []int{9, 8, 7, 1, 3, 2, 5, 4, 0, 6}, // Perm(10) + []int{1, 5, 8, 11, 14, 2, 7, 10, 15, 9, 13, 6, 0, 3, 12, 4}, // Perm(16) []int{}, // Perm(0) []int{0}, // Perm(1) - []int{3, 2, 4, 0, 1}, // Perm(5) - []int{7, 1, 6, 4, 2, 3, 5, 0}, // Perm(8) - []int{1, 7, 2, 6, 3, 5, 8, 4, 0}, // Perm(9) - []int{1, 5, 7, 0, 3, 6, 4, 9, 2, 8}, // Perm(10) - []int{6, 13, 2, 11, 14, 7, 10, 12, 4, 5, 3, 0, 15, 9, 1, 8}, // Perm(16) + []int{4, 1, 2, 0, 3}, // Perm(5) + []int{7, 0, 3, 5, 4, 1, 2, 6}, // Perm(8) + []int{6, 7, 1, 2, 0, 5, 8, 3, 4}, // Perm(9) + []int{7, 2, 8, 6, 1, 5, 9, 0, 3, 4}, // Perm(10) + []int{11, 0, 5, 1, 12, 4, 13, 9, 7, 2, 15, 10, 8, 14, 6, 3}, // Perm(16) []int{}, // Perm(0) []int{0}, // Perm(1) - []int{0, 4, 2, 1, 3}, // Perm(5) - []int{0, 7, 1, 4, 3, 6, 2, 5}, // Perm(8) - []int{1, 3, 0, 4, 5, 2, 8, 7, 6}, // Perm(9) - []int{5, 4, 7, 9, 6, 1, 0, 3, 8, 2}, // Perm(10) + []int{2, 4, 0, 3, 1}, // Perm(5) + []int{4, 2, 5, 0, 6, 3, 1, 7}, // Perm(8) + []int{3, 2, 8, 6, 5, 7, 1, 4, 0}, // Perm(9) + []int{2, 0, 7, 5, 6, 1, 8, 3, 4, 9}, // Perm(10) uint32(1298498081), // Uint32() uint32(2019727887), // Uint32() -- 2.44.0