p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL,
- ssa.OpAMD64TZCNTQ, ssa.OpAMD64TZCNTL:
+ ssa.OpAMD64TZCNTQ, ssa.OpAMD64TZCNTL,
+ ssa.OpAMD64LZCNTQ, ssa.OpAMD64LZCNTL:
if v.Args[0].Reg() != v.Reg() {
// POPCNT/TZCNT/LZCNT have a false dependency on the destination register on Intel cpus.
// TZCNT/LZCNT problem affects pre-Skylake models. See discussion at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=62011#c7.
"sse41": {"roundsd"},
"fma": {"vfmadd231sd"},
"movbe": {"movbeqq", "movbeq", "movbell", "movbel", "movbe"},
+ "lzcnt": {"lzcntq", "lzcntl", "lzcnt"},
}
// Test to use POPCNT instruction, if available
// However, for zero-extended values, we can cheat a bit, and calculate
// BSR(x<<1 + 1), which is guaranteed to be non-zero, and which conveniently
// places the index of the highest set bit where we want it.
-(BitLen64 <t> x) => (ADDQconst [1] (CMOVQEQ <t> (Select0 <t> (BSRQ x)) (MOVQconst <t> [-1]) (Select1 <types.TypeFlags> (BSRQ x))))
-(BitLen32 x) => (Select0 (BSRQ (LEAQ1 <typ.UInt64> [1] (MOVLQZX <typ.UInt64> x) (MOVLQZX <typ.UInt64> x))))
-(BitLen16 x) => (BSRL (LEAL1 <typ.UInt32> [1] (MOVWQZX <typ.UInt32> x) (MOVWQZX <typ.UInt32> x)))
-(BitLen8 x) => (BSRL (LEAL1 <typ.UInt32> [1] (MOVBQZX <typ.UInt32> x) (MOVBQZX <typ.UInt32> x)))
+// For GOAMD64>=3, BitLen can be calculated by OperandSize - LZCNT(x).
+(BitLen64 <t> x) && buildcfg.GOAMD64 < 3 => (ADDQconst [1] (CMOVQEQ <t> (Select0 <t> (BSRQ x)) (MOVQconst <t> [-1]) (Select1 <types.TypeFlags> (BSRQ x))))
+(BitLen32 x) && buildcfg.GOAMD64 < 3 => (Select0 (BSRQ (LEAQ1 <typ.UInt64> [1] (MOVLQZX <typ.UInt64> x) (MOVLQZX <typ.UInt64> x))))
+(BitLen16 x) && buildcfg.GOAMD64 < 3 => (BSRL (LEAL1 <typ.UInt32> [1] (MOVWQZX <typ.UInt32> x) (MOVWQZX <typ.UInt32> x)))
+(BitLen8 x) && buildcfg.GOAMD64 < 3 => (BSRL (LEAL1 <typ.UInt32> [1] (MOVBQZX <typ.UInt32> x) (MOVBQZX <typ.UInt32> x)))
+(BitLen64 <t> x) && buildcfg.GOAMD64 >= 3 => (NEGQ (ADDQconst <t> [-64] (LZCNTQ x)))
+// Use the 64-bit version so that constant folding can remove the unnecessary arithmetic.
+(BitLen(32|16|8) <t> x) && buildcfg.GOAMD64 >= 3 => (NEGQ (ADDQconst <t> [-32] (LZCNTL x)))
(Bswap(64|32) ...) => (BSWAP(Q|L) ...)
{name: "TZCNTQ", argLength: 1, reg: gp11, asm: "TZCNTQ", clobberFlags: true},
{name: "TZCNTL", argLength: 1, reg: gp11, asm: "TZCNTL", clobberFlags: true},
+ // CPUID feature: LZCNT.
+ // count the number of leading zero bits.
+ {name: "LZCNTQ", argLength: 1, reg: gp11, asm: "LZCNTQ", typ: "UInt64", clobberFlags: true},
+ {name: "LZCNTL", argLength: 1, reg: gp11, asm: "LZCNTL", typ: "UInt32", clobberFlags: true},
+
// CPUID feature: MOVBE
// MOVBEWload does not satisfy zero extended, so only use MOVBEWstore
{name: "MOVBEWstore", argLength: 3, reg: gpstore, asm: "MOVBEW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // swap and store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem
OpAMD64BLSRL
OpAMD64TZCNTQ
OpAMD64TZCNTL
+ OpAMD64LZCNTQ
+ OpAMD64LZCNTL
OpAMD64MOVBEWstore
OpAMD64MOVBELload
OpAMD64MOVBELstore
},
},
},
+ {
+ name: "LZCNTQ",
+ argLen: 1,
+ clobberFlags: true,
+ asm: x86.ALZCNTQ,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ outputs: []outputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ },
+ },
+ {
+ name: "LZCNTL",
+ argLen: 1,
+ clobberFlags: true,
+ asm: x86.ALZCNTL,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ outputs: []outputInfo{
+ {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+ },
+ },
+ },
{
name: "MOVBEWstore",
auxType: auxSymOff,
b := v.Block
typ := &b.Func.Config.Types
// match: (BitLen16 x)
+ // cond: buildcfg.GOAMD64 < 3
// result: (BSRL (LEAL1 <typ.UInt32> [1] (MOVWQZX <typ.UInt32> x) (MOVWQZX <typ.UInt32> x)))
for {
x := v_0
+ if !(buildcfg.GOAMD64 < 3) {
+ break
+ }
v.reset(OpAMD64BSRL)
v0 := b.NewValue0(v.Pos, OpAMD64LEAL1, typ.UInt32)
v0.AuxInt = int32ToAuxInt(1)
v.AddArg(v0)
return true
}
+ // match: (BitLen16 <t> x)
+ // cond: buildcfg.GOAMD64 >= 3
+ // result: (NEGQ (ADDQconst <t> [-32] (LZCNTL x)))
+ for {
+ t := v.Type
+ x := v_0
+ if !(buildcfg.GOAMD64 >= 3) {
+ break
+ }
+ v.reset(OpAMD64NEGQ)
+ v0 := b.NewValue0(v.Pos, OpAMD64ADDQconst, t)
+ v0.AuxInt = int32ToAuxInt(-32)
+ v1 := b.NewValue0(v.Pos, OpAMD64LZCNTL, typ.UInt32)
+ v1.AddArg(x)
+ v0.AddArg(v1)
+ v.AddArg(v0)
+ return true
+ }
+ return false
}
// rewriteValueAMD64_OpBitLen32 lowers the generic BitLen32 op to AMD64 ops.
// Two mutually exclusive arms, keyed on the GOAMD64 microarchitecture level:
//   - GOAMD64 < 3: BSRQ over the zero-extended input; the LEAQ1 [1] builds
//     2*x+1, which is guaranteed non-zero so BSRQ's undefined-on-zero case
//     is avoided (see the comment alongside the BitLen rules).
//   - GOAMD64 >= 3: BitLen32(x) = 32 - LZCNT(x), emitted as
//     NEGQ(LZCNTL(x) + (-32)) so the constant can participate in folding.
// NOTE(review): the match/cond/result comments mirror AMD64.rules —
// presumably rulegen-generated code; regenerate rather than hand-edit.
func rewriteValueAMD64_OpBitLen32(v *Value) bool {
v_0 := v.Args[0]
b := v.Block
typ := &b.Func.Config.Types
// match: (BitLen32 x)
+ // cond: buildcfg.GOAMD64 < 3
// result: (Select0 (BSRQ (LEAQ1 <typ.UInt64> [1] (MOVLQZX <typ.UInt64> x) (MOVLQZX <typ.UInt64> x))))
for {
x := v_0
+ if !(buildcfg.GOAMD64 < 3) {
+ break
+ }
v.reset(OpSelect0)
v0 := b.NewValue0(v.Pos, OpAMD64BSRQ, types.NewTuple(typ.UInt64, types.TypeFlags))
v1 := b.NewValue0(v.Pos, OpAMD64LEAQ1, typ.UInt64)
// NOTE(review): this excerpt appears abridged — v1's AuxInt/args and the
// v0.AddArg(v1) wiring are not visible in this view.
v.AddArg(v0)
return true
}
+ // match: (BitLen32 <t> x)
+ // cond: buildcfg.GOAMD64 >= 3
+ // result: (NEGQ (ADDQconst <t> [-32] (LZCNTL x)))
+ for {
+ t := v.Type
+ x := v_0
+ if !(buildcfg.GOAMD64 >= 3) {
+ break
+ }
+ v.reset(OpAMD64NEGQ)
+ v0 := b.NewValue0(v.Pos, OpAMD64ADDQconst, t)
+ v0.AuxInt = int32ToAuxInt(-32)
+ v1 := b.NewValue0(v.Pos, OpAMD64LZCNTL, typ.UInt32)
+ v1.AddArg(x)
+ v0.AddArg(v1)
+ v.AddArg(v0)
+ return true
+ }
+ return false
}
// rewriteValueAMD64_OpBitLen64 lowers the generic BitLen64 op to AMD64 ops.
// Two mutually exclusive arms, keyed on the GOAMD64 microarchitecture level:
//   - GOAMD64 < 3: BSRQ plus a CMOVQEQ that substitutes -1 when BSRQ's
//     flags (Select1) report a zero input, so the outer ADDQconst [1]
//     yields BitLen64(0) == 0.
//   - GOAMD64 >= 3: BitLen64(x) = 64 - LZCNT(x), emitted as
//     NEGQ(LZCNTQ(x) + (-64)) so the constant can participate in folding.
// NOTE(review): the match/cond/result comments mirror AMD64.rules —
// presumably rulegen-generated code; regenerate rather than hand-edit.
func rewriteValueAMD64_OpBitLen64(v *Value) bool {
v_0 := v.Args[0]
b := v.Block
typ := &b.Func.Config.Types
// match: (BitLen64 <t> x)
+ // cond: buildcfg.GOAMD64 < 3
// result: (ADDQconst [1] (CMOVQEQ <t> (Select0 <t> (BSRQ x)) (MOVQconst <t> [-1]) (Select1 <types.TypeFlags> (BSRQ x))))
for {
t := v.Type
x := v_0
+ if !(buildcfg.GOAMD64 < 3) {
+ break
+ }
v.reset(OpAMD64ADDQconst)
v.AuxInt = int32ToAuxInt(1)
v0 := b.NewValue0(v.Pos, OpAMD64CMOVQEQ, t)
// NOTE(review): this excerpt appears abridged — the Select0/MOVQconst/
// Select1 operand wiring for v0 is not visible in this view.
v.AddArg(v0)
return true
}
+ // match: (BitLen64 <t> x)
+ // cond: buildcfg.GOAMD64 >= 3
+ // result: (NEGQ (ADDQconst <t> [-64] (LZCNTQ x)))
+ for {
+ t := v.Type
+ x := v_0
+ if !(buildcfg.GOAMD64 >= 3) {
+ break
+ }
+ v.reset(OpAMD64NEGQ)
+ v0 := b.NewValue0(v.Pos, OpAMD64ADDQconst, t)
+ v0.AuxInt = int32ToAuxInt(-64)
+ v1 := b.NewValue0(v.Pos, OpAMD64LZCNTQ, typ.UInt64)
+ v1.AddArg(x)
+ v0.AddArg(v1)
+ v.AddArg(v0)
+ return true
+ }
+ return false
}
// rewriteValueAMD64_OpBitLen8 lowers the generic BitLen8 op to AMD64 ops.
// Two mutually exclusive arms, keyed on the GOAMD64 microarchitecture level:
//   - GOAMD64 < 3: BSRL over the zero-extended input; the LEAL1 [1] builds
//     2*x+1, which is guaranteed non-zero so BSR's undefined-on-zero case
//     is avoided (see the comment alongside the BitLen rules).
//   - GOAMD64 >= 3: uses the 32-bit LZCNTL with constant -32; the 8-bit
//     input is presumably zero-extended to 32 bits, so
//     BitLen8(x) = 32 - LZCNTL(x) — TODO confirm the extension happens
//     upstream of this rule.
// NOTE(review): the match/cond/result comments mirror AMD64.rules —
// presumably rulegen-generated code; regenerate rather than hand-edit.
func rewriteValueAMD64_OpBitLen8(v *Value) bool {
v_0 := v.Args[0]
b := v.Block
typ := &b.Func.Config.Types
// match: (BitLen8 x)
+ // cond: buildcfg.GOAMD64 < 3
// result: (BSRL (LEAL1 <typ.UInt32> [1] (MOVBQZX <typ.UInt32> x) (MOVBQZX <typ.UInt32> x)))
for {
x := v_0
+ if !(buildcfg.GOAMD64 < 3) {
+ break
+ }
v.reset(OpAMD64BSRL)
v0 := b.NewValue0(v.Pos, OpAMD64LEAL1, typ.UInt32)
v0.AuxInt = int32ToAuxInt(1)
// NOTE(review): this excerpt appears abridged — the MOVBQZX operand
// wiring for v0 is not visible in this view.
v.AddArg(v0)
return true
}
+ // match: (BitLen8 <t> x)
+ // cond: buildcfg.GOAMD64 >= 3
+ // result: (NEGQ (ADDQconst <t> [-32] (LZCNTL x)))
+ for {
+ t := v.Type
+ x := v_0
+ if !(buildcfg.GOAMD64 >= 3) {
+ break
+ }
+ v.reset(OpAMD64NEGQ)
+ v0 := b.NewValue0(v.Pos, OpAMD64ADDQconst, t)
+ v0.AuxInt = int32ToAuxInt(-32)
+ v1 := b.NewValue0(v.Pos, OpAMD64LZCNTL, typ.UInt32)
+ v1.AddArg(x)
+ v0.AddArg(v1)
+ v.AddArg(v0)
+ return true
+ }
+ return false
}
func rewriteValueAMD64_OpCeil(v *Value) bool {
v_0 := v.Args[0]
// ----------------------- //
func LeadingZeros(n uint) int {
- // amd64:"BSRQ"
+ // amd64/v1,amd64/v2:"BSRQ"
+ // amd64/v3:"LZCNTQ", -"BSRQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// mips:"CLZ"
}
func LeadingZeros64(n uint64) int {
- // amd64:"BSRQ"
+ // amd64/v1,amd64/v2:"BSRQ"
+ // amd64/v3:"LZCNTQ", -"BSRQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// mips:"CLZ"
}
func LeadingZeros32(n uint32) int {
- // amd64:"BSRQ","LEAQ",-"CMOVQEQ"
+ // amd64/v1,amd64/v2:"BSRQ","LEAQ",-"CMOVQEQ"
+ // amd64/v3: "LZCNTL", -"BSRL"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZW"
// mips:"CLZ"
}
func LeadingZeros16(n uint16) int {
- // amd64:"BSRL","LEAL",-"CMOVQEQ"
+ // amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
+ // amd64/v3: "LZCNTL", -"BSRL"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// mips:"CLZ"
}
func LeadingZeros8(n uint8) int {
- // amd64:"BSRL","LEAL",-"CMOVQEQ"
+ // amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
+ // amd64/v3: "LZCNTL", -"BSRL"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// mips:"CLZ"
// --------------- //
func Len(n uint) int {
- // amd64:"BSRQ"
+ // amd64/v1,amd64/v2:"BSRQ"
+ // amd64/v3: "LZCNTQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// mips:"CLZ"
}
func Len64(n uint64) int {
- // amd64:"BSRQ"
+ // amd64/v1,amd64/v2:"BSRQ"
+ // amd64/v3: "LZCNTQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// mips:"CLZ"
}
func Len32(n uint32) int {
- // amd64:"BSRQ","LEAQ",-"CMOVQEQ"
+ // amd64/v1,amd64/v2:"BSRQ","LEAQ",-"CMOVQEQ"
+ // amd64/v3: "LZCNTL"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// mips:"CLZ"
}
func Len16(n uint16) int {
- // amd64:"BSRL","LEAL",-"CMOVQEQ"
+ // amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
+ // amd64/v3: "LZCNTL"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// mips:"CLZ"
}
func Len8(n uint8) int {
- // amd64:"BSRL","LEAL",-"CMOVQEQ"
+ // amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
+ // amd64/v3: "LZCNTL"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// mips:"CLZ"