Cypherpunks.ru repositories - gostls13.git/commitdiff
cmd/compile: use TZCNT instruction for GOAMD64>=v3
author: wdvxdr <wdvxdr1123@gmail.com>
Thu, 30 Sep 2021 01:57:04 +0000 (09:57 +0800)
committer: Keith Randall <khr@golang.org>
Tue, 5 Oct 2021 16:06:49 +0000 (16:06 +0000)
Benchmark results on my Intel Coffee Lake CPU:
name               old time/op  new time/op  delta
TrailingZeros-8    0.68ns ± 1%  0.64ns ± 1%  -6.26%  (p=0.000 n=10+10)
TrailingZeros8-8   0.70ns ± 1%  0.70ns ± 1%    ~     (p=0.697 n=10+10)
TrailingZeros16-8  0.70ns ± 1%  0.70ns ± 1%  +0.57%  (p=0.043 n=10+10)
TrailingZeros32-8  0.66ns ± 1%  0.64ns ± 1%  -3.35%  (p=0.000 n=10+10)
TrailingZeros64-8  0.68ns ± 1%  0.64ns ± 1%  -5.84%  (p=0.000 n=9+10)

Updates #45453

Change-Id: I228ff2d51df24b1306136f061432f8a12bb1d6fd
Reviewed-on: https://go-review.googlesource.com/c/go/+/353249
Trust: Michael Knyszek <mknyszek@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
src/cmd/compile/internal/amd64/ssa.go
src/cmd/compile/internal/ssa/gen/AMD64.rules
src/cmd/compile/internal/ssa/gen/AMD64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteAMD64.go
test/codegen/mathbits.go

index 68266d35d67a95f12e93edbc55fb7c4e7c1d0122..33cd5985e0ed7d7ea15974d20a28f355fd5f8ae1 100644 (file)
@@ -265,7 +265,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 
        case ssa.OpAMD64BLSIQ, ssa.OpAMD64BLSIL,
                ssa.OpAMD64BLSMSKQ, ssa.OpAMD64BLSMSKL,
-               ssa.OpAMD64BLSRQ, ssa.OpAMD64BLSRL:
+               ssa.OpAMD64BLSRQ, ssa.OpAMD64BLSRL,
+               ssa.OpAMD64TZCNTQ, ssa.OpAMD64TZCNTL:
                p := s.Prog(v.Op.Asm())
                p.From.Type = obj.TYPE_REG
                p.From.Reg = v.Args[0].Reg()
index edb1a4869a34c55c9ce61d1941d831da9487f1e4..1c63a3f70c867d9b1d77ba1dc5f4136fbadc09d3 100644 (file)
 (OffPtr [off] ptr) => (ADDQ (MOVQconst [off]) ptr)
 
 // Lowering other arithmetic
-(Ctz64 <t> x) => (CMOVQEQ (Select0 <t> (BSFQ x)) (MOVQconst <t> [64]) (Select1 <types.TypeFlags> (BSFQ x)))
-(Ctz32 x) => (Select0 (BSFQ (BTSQconst <typ.UInt64> [32] x)))
+(Ctz64 x)     && buildcfg.GOAMD64 >= 3 => (TZCNTQ x)
+(Ctz32 x)     && buildcfg.GOAMD64 >= 3 => (TZCNTL x)
+(Ctz64 <t> x) && buildcfg.GOAMD64 <  3 => (CMOVQEQ (Select0 <t> (BSFQ x)) (MOVQconst <t> [64]) (Select1 <types.TypeFlags> (BSFQ x)))
+(Ctz32 x)     && buildcfg.GOAMD64 <  3 => (Select0 (BSFQ (BTSQconst <typ.UInt64> [32] x)))
 (Ctz16 x) => (BSFL (BTSLconst <typ.UInt32> [16] x))
 (Ctz8  x) => (BSFL (BTSLconst <typ.UInt32> [ 8] x))
 
-(Ctz64NonZero x) => (Select0 (BSFQ x))
-(Ctz32NonZero ...) => (BSFL ...)
-(Ctz16NonZero ...) => (BSFL ...)
-(Ctz8NonZero  ...) => (BSFL ...)
+(Ctz64NonZero x) && buildcfg.GOAMD64 >= 3 => (TZCNTQ x)
+(Ctz32NonZero x) && buildcfg.GOAMD64 >= 3 => (TZCNTL x)
+(Ctz16NonZero x) && buildcfg.GOAMD64 >= 3 => (TZCNTL x)
+(Ctz8NonZero  x) && buildcfg.GOAMD64 >= 3 => (TZCNTL x)
+(Ctz64NonZero x) && buildcfg.GOAMD64 <  3 => (Select0 (BSFQ x))
+(Ctz32NonZero x) && buildcfg.GOAMD64 <  3 => (BSFL x)
+(Ctz16NonZero x) && buildcfg.GOAMD64 <  3 => (BSFL x)
+(Ctz8NonZero  x) && buildcfg.GOAMD64 <  3 => (BSFL x)
 
 // BitLen64 of a 64 bit value x requires checking whether x == 0, since BSRQ is undefined when x == 0.
 // However, for zero-extended values, we can cheat a bit, and calculate
index 6e4c514bd02b845c572c97c7784e27d0e89eb4bc..188777273624a8b17d4bab207214a743f3ec94f2 100644 (file)
@@ -918,6 +918,10 @@ func init() {
                {name: "BLSMSKL", argLength: 1, reg: gp11, asm: "BLSMSKL", clobberFlags: true}, // arg0 ^ (arg0 - 1)
                {name: "BLSRQ", argLength: 1, reg: gp11, asm: "BLSRQ", clobberFlags: true},     // arg0 & (arg0 - 1)
                {name: "BLSRL", argLength: 1, reg: gp11, asm: "BLSRL", clobberFlags: true},     // arg0 & (arg0 - 1)
+               // count the number of trailing zero bits, prefer TZCNTQ over BSFQ, as TZCNTQ(0)==64
+               // and BSFQ(0) is undefined. Same for TZCNTL(0)==32
+               {name: "TZCNTQ", argLength: 1, reg: gp11, asm: "TZCNTQ", clobberFlags: true},
+               {name: "TZCNTL", argLength: 1, reg: gp11, asm: "TZCNTL", clobberFlags: true},
        }
 
        var AMD64blocks = []blockData{
index 128ec1f049d4ed95c65ccde71cf2eea51828f539..6266092f6f16ef662e3f589c6b7e45bccbef78df 100644 (file)
@@ -1041,6 +1041,8 @@ const (
        OpAMD64BLSMSKL
        OpAMD64BLSRQ
        OpAMD64BLSRL
+       OpAMD64TZCNTQ
+       OpAMD64TZCNTL
 
        OpARMADD
        OpARMADDconst
@@ -13752,6 +13754,34 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:         "TZCNTQ",
+               argLen:       1,
+               clobberFlags: true,
+               asm:          x86.ATZCNTQ,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+                       },
+                       outputs: []outputInfo{
+                               {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+                       },
+               },
+       },
+       {
+               name:         "TZCNTL",
+               argLen:       1,
+               clobberFlags: true,
+               asm:          x86.ATZCNTL,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+                       },
+                       outputs: []outputInfo{
+                               {0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
+                       },
+               },
+       },
 
        {
                name:        "ADD",
index 906260fb141291fca41b7389ea16791d278c0718..10d3afbc7dd1c546c287dd8769a7132be1badf78 100644 (file)
@@ -647,13 +647,11 @@ func rewriteValueAMD64(v *Value) bool {
        case OpCtz16:
                return rewriteValueAMD64_OpCtz16(v)
        case OpCtz16NonZero:
-               v.Op = OpAMD64BSFL
-               return true
+               return rewriteValueAMD64_OpCtz16NonZero(v)
        case OpCtz32:
                return rewriteValueAMD64_OpCtz32(v)
        case OpCtz32NonZero:
-               v.Op = OpAMD64BSFL
-               return true
+               return rewriteValueAMD64_OpCtz32NonZero(v)
        case OpCtz64:
                return rewriteValueAMD64_OpCtz64(v)
        case OpCtz64NonZero:
@@ -661,8 +659,7 @@ func rewriteValueAMD64(v *Value) bool {
        case OpCtz8:
                return rewriteValueAMD64_OpCtz8(v)
        case OpCtz8NonZero:
-               v.Op = OpAMD64BSFL
-               return true
+               return rewriteValueAMD64_OpCtz8NonZero(v)
        case OpCvt32Fto32:
                v.Op = OpAMD64CVTTSS2SL
                return true
@@ -28694,14 +28691,58 @@ func rewriteValueAMD64_OpCtz16(v *Value) bool {
                return true
        }
 }
+func rewriteValueAMD64_OpCtz16NonZero(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (Ctz16NonZero x)
+       // cond: buildcfg.GOAMD64 >= 3
+       // result: (TZCNTL x)
+       for {
+               x := v_0
+               if !(buildcfg.GOAMD64 >= 3) {
+                       break
+               }
+               v.reset(OpAMD64TZCNTL)
+               v.AddArg(x)
+               return true
+       }
+       // match: (Ctz16NonZero x)
+       // cond: buildcfg.GOAMD64 < 3
+       // result: (BSFL x)
+       for {
+               x := v_0
+               if !(buildcfg.GOAMD64 < 3) {
+                       break
+               }
+               v.reset(OpAMD64BSFL)
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
 func rewriteValueAMD64_OpCtz32(v *Value) bool {
        v_0 := v.Args[0]
        b := v.Block
        typ := &b.Func.Config.Types
        // match: (Ctz32 x)
+       // cond: buildcfg.GOAMD64 >= 3
+       // result: (TZCNTL x)
+       for {
+               x := v_0
+               if !(buildcfg.GOAMD64 >= 3) {
+                       break
+               }
+               v.reset(OpAMD64TZCNTL)
+               v.AddArg(x)
+               return true
+       }
+       // match: (Ctz32 x)
+       // cond: buildcfg.GOAMD64 < 3
        // result: (Select0 (BSFQ (BTSQconst <typ.UInt64> [32] x)))
        for {
                x := v_0
+               if !(buildcfg.GOAMD64 < 3) {
+                       break
+               }
                v.reset(OpSelect0)
                v0 := b.NewValue0(v.Pos, OpAMD64BSFQ, types.NewTuple(typ.UInt64, types.TypeFlags))
                v1 := b.NewValue0(v.Pos, OpAMD64BTSQconst, typ.UInt64)
@@ -28711,16 +28752,61 @@ func rewriteValueAMD64_OpCtz32(v *Value) bool {
                v.AddArg(v0)
                return true
        }
+       return false
+}
+func rewriteValueAMD64_OpCtz32NonZero(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (Ctz32NonZero x)
+       // cond: buildcfg.GOAMD64 >= 3
+       // result: (TZCNTL x)
+       for {
+               x := v_0
+               if !(buildcfg.GOAMD64 >= 3) {
+                       break
+               }
+               v.reset(OpAMD64TZCNTL)
+               v.AddArg(x)
+               return true
+       }
+       // match: (Ctz32NonZero x)
+       // cond: buildcfg.GOAMD64 < 3
+       // result: (BSFL x)
+       for {
+               x := v_0
+               if !(buildcfg.GOAMD64 < 3) {
+                       break
+               }
+               v.reset(OpAMD64BSFL)
+               v.AddArg(x)
+               return true
+       }
+       return false
 }
 func rewriteValueAMD64_OpCtz64(v *Value) bool {
        v_0 := v.Args[0]
        b := v.Block
        typ := &b.Func.Config.Types
+       // match: (Ctz64 x)
+       // cond: buildcfg.GOAMD64 >= 3
+       // result: (TZCNTQ x)
+       for {
+               x := v_0
+               if !(buildcfg.GOAMD64 >= 3) {
+                       break
+               }
+               v.reset(OpAMD64TZCNTQ)
+               v.AddArg(x)
+               return true
+       }
        // match: (Ctz64 <t> x)
+       // cond: buildcfg.GOAMD64 < 3
        // result: (CMOVQEQ (Select0 <t> (BSFQ x)) (MOVQconst <t> [64]) (Select1 <types.TypeFlags> (BSFQ x)))
        for {
                t := v.Type
                x := v_0
+               if !(buildcfg.GOAMD64 < 3) {
+                       break
+               }
                v.reset(OpAMD64CMOVQEQ)
                v0 := b.NewValue0(v.Pos, OpSelect0, t)
                v1 := b.NewValue0(v.Pos, OpAMD64BSFQ, types.NewTuple(typ.UInt64, types.TypeFlags))
@@ -28733,21 +28819,39 @@ func rewriteValueAMD64_OpCtz64(v *Value) bool {
                v.AddArg3(v0, v2, v3)
                return true
        }
+       return false
 }
 func rewriteValueAMD64_OpCtz64NonZero(v *Value) bool {
        v_0 := v.Args[0]
        b := v.Block
        typ := &b.Func.Config.Types
        // match: (Ctz64NonZero x)
+       // cond: buildcfg.GOAMD64 >= 3
+       // result: (TZCNTQ x)
+       for {
+               x := v_0
+               if !(buildcfg.GOAMD64 >= 3) {
+                       break
+               }
+               v.reset(OpAMD64TZCNTQ)
+               v.AddArg(x)
+               return true
+       }
+       // match: (Ctz64NonZero x)
+       // cond: buildcfg.GOAMD64 < 3
        // result: (Select0 (BSFQ x))
        for {
                x := v_0
+               if !(buildcfg.GOAMD64 < 3) {
+                       break
+               }
                v.reset(OpSelect0)
                v0 := b.NewValue0(v.Pos, OpAMD64BSFQ, types.NewTuple(typ.UInt64, types.TypeFlags))
                v0.AddArg(x)
                v.AddArg(v0)
                return true
        }
+       return false
 }
 func rewriteValueAMD64_OpCtz8(v *Value) bool {
        v_0 := v.Args[0]
@@ -28765,6 +28869,34 @@ func rewriteValueAMD64_OpCtz8(v *Value) bool {
                return true
        }
 }
+func rewriteValueAMD64_OpCtz8NonZero(v *Value) bool {
+       v_0 := v.Args[0]
+       // match: (Ctz8NonZero x)
+       // cond: buildcfg.GOAMD64 >= 3
+       // result: (TZCNTL x)
+       for {
+               x := v_0
+               if !(buildcfg.GOAMD64 >= 3) {
+                       break
+               }
+               v.reset(OpAMD64TZCNTL)
+               v.AddArg(x)
+               return true
+       }
+       // match: (Ctz8NonZero x)
+       // cond: buildcfg.GOAMD64 < 3
+       // result: (BSFL x)
+       for {
+               x := v_0
+               if !(buildcfg.GOAMD64 < 3) {
+                       break
+               }
+               v.reset(OpAMD64BSFL)
+               v.AddArg(x)
+               return true
+       }
+       return false
+}
 func rewriteValueAMD64_OpDiv16(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
index aecd84a78bb1c215d0fd4d22c5ed6a52984fb952..50527fea04374f289b78c51caaa1e08b4f7a7748 100644 (file)
@@ -272,7 +272,8 @@ func RotateLeftVariable32(n uint32, m int) uint32 {
 // ------------------------ //
 
 func TrailingZeros(n uint) int {
-       // amd64:"BSFQ","MOVL\t\\$64","CMOVQEQ"
+       // amd64/v1,amd64/v2:"BSFQ","MOVL\t\\$64","CMOVQEQ"
+       // amd64/v3:"TZCNTQ"
        // arm:"CLZ"
        // arm64:"RBIT","CLZ"
        // s390x:"FLOGR"
@@ -285,7 +286,8 @@ func TrailingZeros(n uint) int {
 }
 
 func TrailingZeros64(n uint64) int {
-       // amd64:"BSFQ","MOVL\t\\$64","CMOVQEQ"
+       // amd64/v1,amd64/v2:"BSFQ","MOVL\t\\$64","CMOVQEQ"
+       // amd64/v3:"TZCNTQ"
        // arm64:"RBIT","CLZ"
        // s390x:"FLOGR"
        // ppc64/power8:"ANDN","POPCNTD"
@@ -303,7 +305,8 @@ func TrailingZeros64Subtract(n uint64) int {
 }
 
 func TrailingZeros32(n uint32) int {
-       // amd64:"BTSQ\\t\\$32","BSFQ"
+       // amd64/v1,amd64/v2:"BTSQ\\t\\$32","BSFQ"
+       // amd64/v3:"TZCNTL"
        // arm:"CLZ"
        // arm64:"RBITW","CLZW"
        // s390x:"FLOGR","MOVWZ"
@@ -343,7 +346,8 @@ func TrailingZeros8(n uint8) int {
 func IterateBits(n uint) int {
        i := 0
        for n != 0 {
-               // amd64:"BSFQ",-"CMOVEQ"
+               // amd64/v1,amd64/v2:"BSFQ",-"CMOVEQ"
+               // amd64/v3:"TZCNTQ"
                i += bits.TrailingZeros(n)
                n &= n - 1
        }
@@ -353,7 +357,8 @@ func IterateBits(n uint) int {
 func IterateBits64(n uint64) int {
        i := 0
        for n != 0 {
-               // amd64:"BSFQ",-"CMOVEQ"
+               // amd64/v1,amd64/v2:"BSFQ",-"CMOVEQ"
+               // amd64/v3:"TZCNTQ"
                i += bits.TrailingZeros64(n)
                n &= n - 1
        }
@@ -363,7 +368,8 @@ func IterateBits64(n uint64) int {
 func IterateBits32(n uint32) int {
        i := 0
        for n != 0 {
-               // amd64:"BSFL",-"BTSQ"
+               // amd64/v1,amd64/v2:"BSFL",-"BTSQ"
+               // amd64/v3:"TZCNTL"
                i += bits.TrailingZeros32(n)
                n &= n - 1
        }
@@ -373,7 +379,8 @@ func IterateBits32(n uint32) int {
 func IterateBits16(n uint16) int {
        i := 0
        for n != 0 {
-               // amd64:"BSFL",-"BTSL"
+               // amd64/v1,amd64/v2:"BSFL",-"BTSL"
+               // amd64/v3:"TZCNTL"
                // arm64:"RBITW","CLZW",-"ORR"
                i += bits.TrailingZeros16(n)
                n &= n - 1
@@ -384,7 +391,8 @@ func IterateBits16(n uint16) int {
 func IterateBits8(n uint8) int {
        i := 0
        for n != 0 {
-               // amd64:"BSFL",-"BTSL"
+               // amd64/v1,amd64/v2:"BSFL",-"BTSL"
+               // amd64/v3:"TZCNTL"
                // arm64:"RBITW","CLZW",-"ORR"
                i += bits.TrailingZeros8(n)
                n &= n - 1