On amd64, Ctz must include special handling of zeros: BSF's result
is undefined when its input is zero, so the generic lowering guards
it with a conditional move.
But the prove pass often has enough information to detect that the
input is non-zero, allowing a more efficient lowering that omits
the guard.
Introduce new CtzNonZero ops to capture and use this information.
Benchmark code:
func BenchmarkVisitBits(b *testing.B) {
b.Run("8", func(b *testing.B) {
for i := 0; i < b.N; i++ {
x := uint8(0xff)
for x != 0 {
sink = bits.TrailingZeros8(x)
x &= x - 1
}
}
})
// and similarly for 16, 32, and 64
}
name old time/op new time/op delta
VisitBits/8-8 7.27ns ± 4% 5.58ns ± 4% -23.35% (p=0.000 n=28+26)
VisitBits/16-8 14.7ns ± 7% 10.5ns ± 4% -28.43% (p=0.000 n=30+28)
VisitBits/32-8 27.6ns ± 8% 19.3ns ± 3% -30.14% (p=0.000 n=30+26)
VisitBits/64-8 44.0ns ±11% 38.0ns ± 5% -13.48% (p=0.000 n=30+30)
Fixes #25077
Change-Id: Ie6e5bd86baf39ee8a4ca7cadcf56d934e047f957
Reviewed-on: https://go-review.googlesource.com/109358
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
(Ctz16 x) -> (Select0 (BSFL (BTSLconst <typ.UInt32> [16] x)))
(Ctz8 x) -> (Select0 (BSFL (BTSLconst <typ.UInt32> [ 8] x)))
+(Ctz64NonZero x) -> (Select0 (BSFQ x))
+(Ctz32NonZero x) -> (Select0 (BSFL x))
+(Ctz16NonZero x) -> (Select0 (BSFL x))
+(Ctz8NonZero x) -> (Select0 (BSFL x))
+
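The Ctz8/Ctz16 rules above set a guard bit with BTSL so that BSF never
sees a zero input; the NonZero variants can drop that guard because the
argument is known to be non-zero. A standalone sketch of the equivalence
(an illustration, not code from this CL):

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	for x := 1; x <= 0xff; x++ { // non-zero 8-bit inputs
		guarded := bits.TrailingZeros32(uint32(x) | 1<<8) // models (Ctz8 x): BSFL after BTSL $8
		bare := bits.TrailingZeros32(uint32(x))           // models (Ctz8NonZero x): bare BSFL
		if guarded != bare {
			fmt.Println("mismatch at", x)
		}
	}
	fmt.Println("ok")
}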
// BitLen64 of a 64 bit value x requires checking whether x == 0, since BSRQ is undefined when x == 0.
// However, for zero-extended values, we can cheat a bit, and calculate
// BSR(x<<1 + 1), which is guaranteed to be non-zero, and which conveniently
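As a quick illustration of that trick (a standalone sketch, not code
from this CL), with BSR modeled as 31 - LeadingZeros32:

package main

import (
	"fmt"
	"math/bits"
)

// bitLen16 computes BitLen16(x) as BSR(x<<1 + 1): for zero-extended x,
// x<<1 + 1 is never zero, and its highest set bit is at index BitLen16(x).
func bitLen16(x uint16) int {
	return 31 - bits.LeadingZeros32(uint32(x)<<1+1)
}

func main() {
	fmt.Println(bitLen16(0), bits.Len16(0)) // 0 0
	fmt.Println(bitLen16(5), bits.Len16(5)) // 3 3
}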
(Sqrt x) -> (SQRTD x)
+// TODO: optimize this for ARMv5 and ARMv6
+(Ctz32NonZero x) -> (Ctz32 x)
+
// count trailing zeros for ARMv5 and ARMv6
// 32 - CLZ(x&-x - 1)
(Ctz32 <t> x) && objabi.GOARM<=6 -> (RSBconst [32] (CLZ <t> (SUBconst <t> (AND <t> x (RSBconst <t> [0] x)) [1])))
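To see why 32 - CLZ(x&-x - 1) counts trailing zeros: x&-x isolates the
lowest set bit, and subtracting 1 turns it into a mask of exactly Ctz(x)
one-bits (all ones for x == 0). A standalone sketch of the identity,
which the MIPS rules below reuse (an illustration, not code from this CL):

package main

import (
	"fmt"
	"math/bits"
)

func ctz32(x uint32) int {
	return 32 - bits.LeadingZeros32(x&-x-1)
}

func main() {
	fmt.Println(ctz32(12), bits.TrailingZeros32(12)) // 2 2
	fmt.Println(ctz32(0))                            // 32
}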
(Round x) -> (FRINTAD x)
(Trunc x) -> (FRINTZD x)
+(Ctz64NonZero x) -> (Ctz64 x)
+(Ctz32NonZero x) -> (Ctz32 x)
+
(Ctz64 <t> x) -> (CLZ (RBIT <t> x))
(Ctz32 <t> x) -> (CLZW (RBITW <t> x))
(Sqrt x) -> (SQRTD x)
+// TODO: optimize this case?
+(Ctz32NonZero x) -> (Ctz32 x)
+
// count trailing zeros
// 32 - CLZ(x&-x - 1)
(Ctz32 <t> x) -> (SUB (MOVWconst [32]) (CLZ <t> (SUBconst <t> [1] (AND <t> x (NEG <t> x)))))
(Addr {sym} base) -> (MOVDaddr {sym} base)
(OffPtr [off] ptr) -> (ADD (MOVDconst <typ.Int64> [off]) ptr)
+// TODO: optimize these cases?
+(Ctz32NonZero x) -> (Ctz32 x)
+(Ctz64NonZero x) -> (Ctz64 x)
+
(Ctz64 x) -> (POPCNTD (ANDN <typ.Int64> (ADDconst <typ.Int64> [-1] x) x))
(Ctz32 x) -> (POPCNTW (MOVWZreg (ANDN <typ.Int> (ADDconst <typ.Int> [-1] x) x)))
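The PPC64 lowering relies on the bits of (x-1) &^ x being exactly the
trailing zeros of x (all 64 of them when x == 0). A standalone check
(an illustration, not code from this CL):

package main

import (
	"fmt"
	"math/bits"
)

func ctz64(x uint64) int {
	return bits.OnesCount64((x - 1) &^ x)
}

func main() {
	fmt.Println(ctz64(12), bits.TrailingZeros64(12)) // 2 2
	fmt.Println(ctz64(0))                            // 64
}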
(OffPtr [off] ptr) && is32Bit(off) -> (ADDconst [off] ptr)
(OffPtr [off] ptr) -> (ADD (MOVDconst [off]) ptr)
+// TODO: optimize these cases?
+(Ctz64NonZero x) -> (Ctz64 x)
+(Ctz32NonZero x) -> (Ctz32 x)
+
// Ctz(x) = 64 - findLeftmostOne((x-1)&^x)
(Ctz64 <t> x) -> (SUB (MOVDconst [64]) (FLOGR (AND <t> (SUBconst <t> [1] x) (NOT <t> x))))
(Ctz32 <t> x) -> (SUB (MOVDconst [64]) (FLOGR (MOVWZreg (ANDW <t> (SUBWconst <t> [1] x) (NOTW <t> x)))))
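Here (x-1) &^ x is again the mask of trailing zeros; its leftmost one
sits at bit Ctz(x)-1, so subtracting FLOGR's result from 64 recovers
the count. A standalone check, modeling FLOGR as LeadingZeros64, which
also returns 64 on a zero input (an illustration, not code from this CL):

package main

import (
	"fmt"
	"math/bits"
)

func ctz64(x uint64) int {
	return 64 - bits.LeadingZeros64((x-1)&^x)
}

func main() {
	fmt.Println(ctz64(12), bits.TrailingZeros64(12)) // 2 2
	fmt.Println(ctz64(1), ctz64(0))                  // 0 64
}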
(Com32 <typ.UInt32> (Int64Hi x))
(Com32 <typ.UInt32> (Int64Lo x)))
+// Sadly, just because we know that x is non-zero,
+// we don't know whether either component is,
+// so just treat Ctz64NonZero the same as Ctz64.
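+// For example, x = 1<<40 is non-zero even though its low 32 bits are all zero.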
+(Ctz64NonZero x) -> (Ctz64 x)
+
(Ctz64 x) ->
(Add32 <typ.UInt32>
(Ctz32 <typ.UInt32> (Int64Lo x))
{name: "Com32", argLength: 1},
{name: "Com64", argLength: 1},
- {name: "Ctz8", argLength: 1}, // Count trailing (low order) zeroes (returns 0-8)
- {name: "Ctz16", argLength: 1}, // Count trailing (low order) zeroes (returns 0-16)
- {name: "Ctz32", argLength: 1}, // Count trailing (low order) zeroes (returns 0-32)
- {name: "Ctz64", argLength: 1}, // Count trailing (low order) zeroes (returns 0-64)
- {name: "BitLen8", argLength: 1}, // Number of bits in arg[0] (returns 0-8)
- {name: "BitLen16", argLength: 1}, // Number of bits in arg[0] (returns 0-16)
- {name: "BitLen32", argLength: 1}, // Number of bits in arg[0] (returns 0-32)
- {name: "BitLen64", argLength: 1}, // Number of bits in arg[0] (returns 0-64)
+ {name: "Ctz8", argLength: 1}, // Count trailing (low order) zeroes (returns 0-8)
+ {name: "Ctz16", argLength: 1}, // Count trailing (low order) zeroes (returns 0-16)
+ {name: "Ctz32", argLength: 1}, // Count trailing (low order) zeroes (returns 0-32)
+ {name: "Ctz64", argLength: 1}, // Count trailing (low order) zeroes (returns 0-64)
+ {name: "Ctz8NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-7
+ {name: "Ctz16NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-15
+ {name: "Ctz32NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-31
+ {name: "Ctz64NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-63
+ {name: "BitLen8", argLength: 1}, // Number of bits in arg[0] (returns 0-8)
+ {name: "BitLen16", argLength: 1}, // Number of bits in arg[0] (returns 0-16)
+ {name: "BitLen32", argLength: 1}, // Number of bits in arg[0] (returns 0-32)
+ {name: "BitLen64", argLength: 1}, // Number of bits in arg[0] (returns 0-64)
{name: "Bswap32", argLength: 1}, // Swap bytes
{name: "Bswap64", argLength: 1}, // Swap bytes
OpCtz16
OpCtz32
OpCtz64
+ OpCtz8NonZero
+ OpCtz16NonZero
+ OpCtz32NonZero
+ OpCtz64NonZero
OpBitLen8
OpBitLen16
OpBitLen32
argLen: 1,
generic: true,
},
+ {
+ name: "Ctz8NonZero",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "Ctz16NonZero",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "Ctz32NonZero",
+ argLen: 1,
+ generic: true,
+ },
+ {
+ name: "Ctz64NonZero",
+ argLen: 1,
+ generic: true,
+ },
{
name: "BitLen8",
argLen: 1,
OpAdd32: math.MaxInt32, OpSub32: math.MaxInt32,
}
-// isNonNegative returns true if v is known to be non-negative.
+// isNonNegative reports whether v is known to be non-negative.
func (ft *factsTable) isNonNegative(v *Value) bool {
if isNonNegative(v) {
return true
}
}
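+// ctzNonZeroOp maps each CtzNN op to the variant to use when its
+// argument is known to be non-zero.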
+var ctzNonZeroOp = map[Op]Op{OpCtz8: OpCtz8NonZero, OpCtz16: OpCtz16NonZero, OpCtz32: OpCtz32NonZero, OpCtz64: OpCtz64NonZero}
+
// simplifyBlock simplifies some constant values in b and evaluates
// branches to non-uniquely dominated successors of b.
func simplifyBlock(sdom SparseTree, ft *factsTable, b *Block) {
- // Replace OpSlicemask operations in b with constants where possible.
for _, v := range b.Values {
- if v.Op != OpSlicemask {
- continue
- }
- x, delta := isConstDelta(v.Args[0])
- if x == nil {
- continue
- }
- // slicemask(x + y)
- // if x is larger than -y (y is negative), then slicemask is -1.
- lim, ok := ft.limits[x.ID]
- if !ok {
- continue
- }
- if lim.umin > uint64(-delta) {
- if v.Args[0].Op == OpAdd64 {
- v.reset(OpConst64)
- } else {
- v.reset(OpConst32)
+ switch v.Op {
+ case OpSlicemask:
+ // Replace OpSlicemask operations in b with constants where possible.
+ x, delta := isConstDelta(v.Args[0])
+ if x == nil {
+ continue
+ }
+ // slicemask(x + y)
+ // if x is larger than -y (y is negative), then slicemask is -1.
+ lim, ok := ft.limits[x.ID]
+ if !ok {
+ continue
+ }
+ if lim.umin > uint64(-delta) {
+ if v.Args[0].Op == OpAdd64 {
+ v.reset(OpConst64)
+ } else {
+ v.reset(OpConst32)
+ }
+ if b.Func.pass.debug > 0 {
+ b.Func.Warnl(v.Pos, "Proved slicemask not needed")
+ }
+ v.AuxInt = -1
+ }
+ case OpCtz8, OpCtz16, OpCtz32, OpCtz64:
+ // On some architectures, notably amd64, we can generate much better
+ // code for CtzNN if we know that the argument is non-zero.
+ // Capture that information here for use in arch-specific optimizations.
+ x := v.Args[0]
+ lim, ok := ft.limits[x.ID]
+ if !ok {
+ continue
}
- if b.Func.pass.debug > 0 {
- b.Func.Warnl(v.Pos, "Proved slicemask not needed")
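+				// Any of these limits implies x != 0: unsigned minimum
+				// above zero, signed minimum above zero, or signed
+				// maximum below zero.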
+ if lim.umin > 0 || lim.min > 0 || lim.max < 0 {
+ v.Op = ctzNonZeroOp[v.Op]
}
- v.AuxInt = -1
}
}
}
}
-// isNonNegative returns true is v is known to be greater or equal to zero.
+// isNonNegative reports whether v is known to be greater than or equal to zero.
func isNonNegative(v *Value) bool {
switch v.Op {
case OpConst64:
return rewriteValueAMD64_OpConstNil_0(v)
case OpCtz16:
return rewriteValueAMD64_OpCtz16_0(v)
+ case OpCtz16NonZero:
+ return rewriteValueAMD64_OpCtz16NonZero_0(v)
case OpCtz32:
return rewriteValueAMD64_OpCtz32_0(v)
+ case OpCtz32NonZero:
+ return rewriteValueAMD64_OpCtz32NonZero_0(v)
case OpCtz64:
return rewriteValueAMD64_OpCtz64_0(v)
+ case OpCtz64NonZero:
+ return rewriteValueAMD64_OpCtz64NonZero_0(v)
case OpCtz8:
return rewriteValueAMD64_OpCtz8_0(v)
+ case OpCtz8NonZero:
+ return rewriteValueAMD64_OpCtz8NonZero_0(v)
case OpCvt32Fto32:
return rewriteValueAMD64_OpCvt32Fto32_0(v)
case OpCvt32Fto64:
return true
}
}
+func rewriteValueAMD64_OpCtz16NonZero_0(v *Value) bool {
+ b := v.Block
+ _ = b
+ typ := &b.Func.Config.Types
+ _ = typ
+ // match: (Ctz16NonZero x)
+ // cond:
+ // result: (Select0 (BSFL x))
+ for {
+ x := v.Args[0]
+ v.reset(OpSelect0)
+ v0 := b.NewValue0(v.Pos, OpAMD64BSFL, types.NewTuple(typ.UInt32, types.TypeFlags))
+ v0.AddArg(x)
+ v.AddArg(v0)
+ return true
+ }
+}
func rewriteValueAMD64_OpCtz32_0(v *Value) bool {
b := v.Block
_ = b
return true
}
}
+func rewriteValueAMD64_OpCtz32NonZero_0(v *Value) bool {
+ b := v.Block
+ _ = b
+ typ := &b.Func.Config.Types
+ _ = typ
+ // match: (Ctz32NonZero x)
+ // cond:
+ // result: (Select0 (BSFL x))
+ for {
+ x := v.Args[0]
+ v.reset(OpSelect0)
+ v0 := b.NewValue0(v.Pos, OpAMD64BSFL, types.NewTuple(typ.UInt32, types.TypeFlags))
+ v0.AddArg(x)
+ v.AddArg(v0)
+ return true
+ }
+}
func rewriteValueAMD64_OpCtz64_0(v *Value) bool {
b := v.Block
_ = b
return true
}
}
+func rewriteValueAMD64_OpCtz64NonZero_0(v *Value) bool {
+ b := v.Block
+ _ = b
+ typ := &b.Func.Config.Types
+ _ = typ
+ // match: (Ctz64NonZero x)
+ // cond:
+ // result: (Select0 (BSFQ x))
+ for {
+ x := v.Args[0]
+ v.reset(OpSelect0)
+ v0 := b.NewValue0(v.Pos, OpAMD64BSFQ, types.NewTuple(typ.UInt64, types.TypeFlags))
+ v0.AddArg(x)
+ v.AddArg(v0)
+ return true
+ }
+}
func rewriteValueAMD64_OpCtz8_0(v *Value) bool {
b := v.Block
_ = b
return true
}
}
+func rewriteValueAMD64_OpCtz8NonZero_0(v *Value) bool {
+ b := v.Block
+ _ = b
+ typ := &b.Func.Config.Types
+ _ = typ
+ // match: (Ctz8NonZero x)
+ // cond:
+ // result: (Select0 (BSFL x))
+ for {
+ x := v.Args[0]
+ v.reset(OpSelect0)
+ v0 := b.NewValue0(v.Pos, OpAMD64BSFL, types.NewTuple(typ.UInt32, types.TypeFlags))
+ v0.AddArg(x)
+ v.AddArg(v0)
+ return true
+ }
+}
func rewriteValueAMD64_OpCvt32Fto32_0(v *Value) bool {
// match: (Cvt32Fto32 x)
// cond:
return rewriteValueARM_OpConstNil_0(v)
case OpCtz32:
return rewriteValueARM_OpCtz32_0(v)
+ case OpCtz32NonZero:
+ return rewriteValueARM_OpCtz32NonZero_0(v)
case OpCvt32Fto32:
return rewriteValueARM_OpCvt32Fto32_0(v)
case OpCvt32Fto32U:
}
return false
}
+func rewriteValueARM_OpCtz32NonZero_0(v *Value) bool {
+ // match: (Ctz32NonZero x)
+ // cond:
+ // result: (Ctz32 x)
+ for {
+ x := v.Args[0]
+ v.reset(OpCtz32)
+ v.AddArg(x)
+ return true
+ }
+}
func rewriteValueARM_OpCvt32Fto32_0(v *Value) bool {
// match: (Cvt32Fto32 x)
// cond:
return rewriteValueARM64_OpConstNil_0(v)
case OpCtz32:
return rewriteValueARM64_OpCtz32_0(v)
+ case OpCtz32NonZero:
+ return rewriteValueARM64_OpCtz32NonZero_0(v)
case OpCtz64:
return rewriteValueARM64_OpCtz64_0(v)
+ case OpCtz64NonZero:
+ return rewriteValueARM64_OpCtz64NonZero_0(v)
case OpCvt32Fto32:
return rewriteValueARM64_OpCvt32Fto32_0(v)
case OpCvt32Fto32U:
return true
}
}
+func rewriteValueARM64_OpCtz32NonZero_0(v *Value) bool {
+ // match: (Ctz32NonZero x)
+ // cond:
+ // result: (Ctz32 x)
+ for {
+ x := v.Args[0]
+ v.reset(OpCtz32)
+ v.AddArg(x)
+ return true
+ }
+}
func rewriteValueARM64_OpCtz64_0(v *Value) bool {
b := v.Block
_ = b
return true
}
}
+func rewriteValueARM64_OpCtz64NonZero_0(v *Value) bool {
+ // match: (Ctz64NonZero x)
+ // cond:
+ // result: (Ctz64 x)
+ for {
+ x := v.Args[0]
+ v.reset(OpCtz64)
+ v.AddArg(x)
+ return true
+ }
+}
func rewriteValueARM64_OpCvt32Fto32_0(v *Value) bool {
// match: (Cvt32Fto32 x)
// cond:
return rewriteValueMIPS_OpConstNil_0(v)
case OpCtz32:
return rewriteValueMIPS_OpCtz32_0(v)
+ case OpCtz32NonZero:
+ return rewriteValueMIPS_OpCtz32NonZero_0(v)
case OpCvt32Fto32:
return rewriteValueMIPS_OpCvt32Fto32_0(v)
case OpCvt32Fto64F:
return true
}
}
+func rewriteValueMIPS_OpCtz32NonZero_0(v *Value) bool {
+ // match: (Ctz32NonZero x)
+ // cond:
+ // result: (Ctz32 x)
+ for {
+ x := v.Args[0]
+ v.reset(OpCtz32)
+ v.AddArg(x)
+ return true
+ }
+}
func rewriteValueMIPS_OpCvt32Fto32_0(v *Value) bool {
// match: (Cvt32Fto32 x)
// cond:
return rewriteValuePPC64_OpCopysign_0(v)
case OpCtz32:
return rewriteValuePPC64_OpCtz32_0(v)
+ case OpCtz32NonZero:
+ return rewriteValuePPC64_OpCtz32NonZero_0(v)
case OpCtz64:
return rewriteValuePPC64_OpCtz64_0(v)
+ case OpCtz64NonZero:
+ return rewriteValuePPC64_OpCtz64NonZero_0(v)
case OpCvt32Fto32:
return rewriteValuePPC64_OpCvt32Fto32_0(v)
case OpCvt32Fto64:
return true
}
}
+func rewriteValuePPC64_OpCtz32NonZero_0(v *Value) bool {
+ // match: (Ctz32NonZero x)
+ // cond:
+ // result: (Ctz32 x)
+ for {
+ x := v.Args[0]
+ v.reset(OpCtz32)
+ v.AddArg(x)
+ return true
+ }
+}
func rewriteValuePPC64_OpCtz64_0(v *Value) bool {
b := v.Block
_ = b
return true
}
}
+func rewriteValuePPC64_OpCtz64NonZero_0(v *Value) bool {
+ // match: (Ctz64NonZero x)
+ // cond:
+ // result: (Ctz64 x)
+ for {
+ x := v.Args[0]
+ v.reset(OpCtz64)
+ v.AddArg(x)
+ return true
+ }
+}
func rewriteValuePPC64_OpCvt32Fto32_0(v *Value) bool {
b := v.Block
_ = b
return rewriteValueS390X_OpConstNil_0(v)
case OpCtz32:
return rewriteValueS390X_OpCtz32_0(v)
+ case OpCtz32NonZero:
+ return rewriteValueS390X_OpCtz32NonZero_0(v)
case OpCtz64:
return rewriteValueS390X_OpCtz64_0(v)
+ case OpCtz64NonZero:
+ return rewriteValueS390X_OpCtz64NonZero_0(v)
case OpCvt32Fto32:
return rewriteValueS390X_OpCvt32Fto32_0(v)
case OpCvt32Fto64:
return true
}
}
+func rewriteValueS390X_OpCtz32NonZero_0(v *Value) bool {
+ // match: (Ctz32NonZero x)
+ // cond:
+ // result: (Ctz32 x)
+ for {
+ x := v.Args[0]
+ v.reset(OpCtz32)
+ v.AddArg(x)
+ return true
+ }
+}
func rewriteValueS390X_OpCtz64_0(v *Value) bool {
b := v.Block
_ = b
return true
}
}
+func rewriteValueS390X_OpCtz64NonZero_0(v *Value) bool {
+ // match: (Ctz64NonZero x)
+ // cond:
+ // result: (Ctz64 x)
+ for {
+ x := v.Args[0]
+ v.reset(OpCtz64)
+ v.AddArg(x)
+ return true
+ }
+}
func rewriteValueS390X_OpCvt32Fto32_0(v *Value) bool {
// match: (Cvt32Fto32 x)
// cond:
return rewriteValuedec64_OpConst64_0(v)
case OpCtz64:
return rewriteValuedec64_OpCtz64_0(v)
+ case OpCtz64NonZero:
+ return rewriteValuedec64_OpCtz64NonZero_0(v)
case OpEq64:
return rewriteValuedec64_OpEq64_0(v)
case OpGeq64:
return true
}
}
+func rewriteValuedec64_OpCtz64NonZero_0(v *Value) bool {
+ // match: (Ctz64NonZero x)
+ // cond:
+ // result: (Ctz64 x)
+ for {
+ x := v.Args[0]
+ v.reset(OpCtz64)
+ v.AddArg(x)
+ return true
+ }
+}
func rewriteValuedec64_OpEq64_0(v *Value) bool {
b := v.Block
_ = b
// s390x:"FLOGR","OR\t\\$256"
return bits.TrailingZeros8(n)
}
+
+// IterateBitsNN checks the special handling of TrailingZerosNN when the input is known to be non-zero.
+
+func IterateBits(n uint) int {
+ i := 0
+ for n != 0 {
+ // amd64:"BSFQ",-"CMOVEQ"
+ i += bits.TrailingZeros(n)
+ n &= n - 1
+ }
+ return i
+}
+
+func IterateBits64(n uint64) int {
+ i := 0
+ for n != 0 {
+ // amd64:"BSFQ",-"CMOVEQ"
+ i += bits.TrailingZeros64(n)
+ n &= n - 1
+ }
+ return i
+}
+
+func IterateBits32(n uint32) int {
+ i := 0
+ for n != 0 {
+ // amd64:"BSFL",-"BTSQ"
+ i += bits.TrailingZeros32(n)
+ n &= n - 1
+ }
+ return i
+}
+
+func IterateBits16(n uint16) int {
+ i := 0
+ for n != 0 {
+ // amd64:"BSFL",-"BTSL"
+ i += bits.TrailingZeros16(n)
+ n &= n - 1
+ }
+ return i
+}
+
+func IterateBits8(n uint8) int {
+ i := 0
+ for n != 0 {
+ // amd64:"BSFL",-"BTSL"
+ i += bits.TrailingZeros8(n)
+ n &= n - 1
+ }
+ return i
+}
var buf bytes.Buffer
cmd.Stdout, cmd.Stderr = &buf, &buf
if err := cmd.Run(); err != nil {
+ fmt.Println(env, "\n", cmd.Stderr)
t.err = err
return
}