From: Paul E. Murphy
Date: Tue, 27 Jun 2023 22:17:33 +0000 (-0500)
Subject: cmd/compile/internal/ssa: improve masking codegen on PPC64
X-Git-Tag: go1.22rc1~985
X-Git-Url: http://www.git.cypherpunks.ru/?a=commitdiff_plain;h=5cdb132228b90732d57215893a9910ded694c585;p=gostls13.git

cmd/compile/internal/ssa: improve masking codegen on PPC64

Generate RLDIC[LR] instead of MOVD mask, Rx; AND Rx, Ry, Rz. This helps
reduce code size and reduces the latency caused by the constant load.

Similarly, for smaller-than-register values, truncate constants which
exceed the range of the value's type to avoid needing to load a
constant.

Change-Id: I6019684795eb8962d4fd6d9585d08b17c15e7d64
Reviewed-on: https://go-review.googlesource.com/c/go/+/515576
Reviewed-by: Lynn Boger
Reviewed-by: Dmitri Shuralyov
Run-TryBot: Paul Murphy
TryBot-Result: Gopher Robot
Reviewed-by: Cherry Mui
---

diff --git a/src/cmd/compile/internal/ssa/_gen/PPC64latelower.rules b/src/cmd/compile/internal/ssa/_gen/PPC64latelower.rules
index 00d898f783..c4af55c328 100644
--- a/src/cmd/compile/internal/ssa/_gen/PPC64latelower.rules
+++ b/src/cmd/compile/internal/ssa/_gen/PPC64latelower.rules
@@ -17,3 +17,16 @@
 (SETBCR [0] cmp) && buildcfg.GOPPC64 <= 9 => (ISELZ [4] (MOVDconst [1]) cmp)
 (SETBC [1] cmp) && buildcfg.GOPPC64 <= 9 => (ISELZ [1] (MOVDconst [1]) cmp)
 (SETBCR [1] cmp) && buildcfg.GOPPC64 <= 9 => (ISELZ [5] (MOVDconst [1]) cmp)
+
+// The upper bits of smaller-than-register values are undefined. Take advantage of that.
+(AND x:(MOVDconst [m]) n) && t.Size() <= 2 => (Select0 (ANDCCconst [int64(int16(m))] n))
+
+// Convert simple bit masks to an equivalent rldic[lr] if possible.
+(AND x:(MOVDconst [m]) n) && isPPC64ValidShiftMask(m) => (RLDICL [encodePPC64RotateMask(0,m,64)] n)
+(AND x:(MOVDconst [m]) n) && m != 0 && isPPC64ValidShiftMask(^m) => (RLDICR [encodePPC64RotateMask(0,m,64)] n)
+
+// If the RLDICL does not rotate its value, a shifted value can be merged.
+(RLDICL [em] x:(SRDconst [s] a)) && (em&0xFF0000) == 0 => (RLDICL [mergePPC64RLDICLandSRDconst(em, s)] a)
+
+// Convert rotated 32 bit masks on 32 bit values into rlwinm. In general, this leaves the upper 32 bits in an undefined state.
+(AND x:(MOVDconst [m]) n) && t.Size() == 4 && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(0,m,32)] n)
diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go
index 6ee661dbbd..efbaae4d46 100644
--- a/src/cmd/compile/internal/ssa/rewrite.go
+++ b/src/cmd/compile/internal/ssa/rewrite.go
@@ -1499,6 +1499,25 @@ func encodePPC64RotateMask(rotate, mask, nbits int64) int64 {
 	return int64(me) | int64(mb<<8) | int64(rotate<<16) | int64(nbits<<24)
 }
 
+// Merge (RLDICL [encoded] (SRDconst [s] x)) into (RLDICL [new_encoded] x).
+// SRDconst on PPC64 is an extended mnemonic of RLDICL. If the input to an
+// RLDICL is an SRDconst, and the RLDICL does not rotate its value, the two
+// operations can be combined. This function assumes the two opcodes can
+// be merged, and returns an encoded rotate+mask value of the combined RLDICL.
+func mergePPC64RLDICLandSRDconst(encoded, s int64) int64 {
+	mb := s
+	r := 64 - s
+	// A larger mb is a smaller mask.
+	if (encoded>>8)&0xFF < mb {
+		encoded = (encoded &^ 0xFF00) | mb<<8
+	}
+	// The rotate is expected to be 0.
+	if (encoded & 0xFF0000) != 0 {
+		panic("non-zero rotate")
+	}
+	return encoded | r<<16
+}
+
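The auxint manipulated here packs me into bits 0:7, mb into bits 8:15, the rotate count into bits 16:23, and nbits into bits 24:31, per the return statement of encodePPC64RotateMask shown in context above. The following standalone sketch works through the merge arithmetic for one of the cases exercised by the new codegen test; encodeRotMask and mergeShift are hypothetical names, and unlike the real encoder, mb/me are supplied directly rather than derived from the mask.

package main

import "fmt"

// encodeRotMask packs a rotate+mask auxint the same way as the return
// statement of encodePPC64RotateMask above (illustrative signature only).
func encodeRotMask(rotate, mb, me, nbits int64) int64 {
	return me | mb<<8 | rotate<<16 | nbits<<24
}

// mergeShift mirrors the core of mergePPC64RLDICLandSRDconst: fold
// SRDconst [s] into a non-rotating RLDICL by turning the shift into a
// rotate of 64-s, and widening mb if the shift clears more high bits
// than the existing mask does.
func mergeShift(encoded, s int64) int64 {
	mb := s
	r := 64 - s
	if (encoded>>8)&0xFF < mb {
		encoded = (encoded &^ 0xFF00) | mb<<8 // a larger mb is a smaller mask
	}
	return encoded | r<<16
}

func main() {
	// (x >> 28) & 0x0000FFFFFFFFFFFF: the mask alone is mb=16, me=63
	// (bits 16..63 set; POWER numbers bit 0 as the MSB).
	e := encodeRotMask(0, 16, 63, 64)
	m := mergeShift(e, 28)
	// Folding the shift yields rotate=36, mb=28, i.e. RLDICL $36, Rx, $28,
	// matching the expectation in test/codegen/bits.go below.
	fmt.Printf("rotate=%d mb=%d\n", (m>>16)&0xFF, (m>>8)&0xFF)
}
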
 // DecodePPC64RotateMask is the inverse operation of encodePPC64RotateMask. The values returned as
 // mb and me satisfy the POWER ISA definition of MASK(x,y) where MASK(mb,me) = mask.
 func DecodePPC64RotateMask(sauxint int64) (rotate, mb, me int64, mask uint64) {
diff --git a/src/cmd/compile/internal/ssa/rewritePPC64latelower.go b/src/cmd/compile/internal/ssa/rewritePPC64latelower.go
index 56acbe403b..28e124d9e1 100644
--- a/src/cmd/compile/internal/ssa/rewritePPC64latelower.go
+++ b/src/cmd/compile/internal/ssa/rewritePPC64latelower.go
@@ -3,11 +3,16 @@
 package ssa
 
 import "internal/buildcfg"
+import "cmd/compile/internal/types"
 
 func rewriteValuePPC64latelower(v *Value) bool {
 	switch v.Op {
+	case OpPPC64AND:
+		return rewriteValuePPC64latelower_OpPPC64AND(v)
 	case OpPPC64ISEL:
 		return rewriteValuePPC64latelower_OpPPC64ISEL(v)
+	case OpPPC64RLDICL:
+		return rewriteValuePPC64latelower_OpPPC64RLDICL(v)
 	case OpPPC64SETBC:
 		return rewriteValuePPC64latelower_OpPPC64SETBC(v)
 	case OpPPC64SETBCR:
@@ -15,6 +20,101 @@ func rewriteValuePPC64latelower(v *Value) bool {
 	}
 	return false
 }
+func rewriteValuePPC64latelower_OpPPC64AND(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	typ := &b.Func.Config.Types
+	// match: (AND x:(MOVDconst [m]) n)
+	// cond: t.Size() <= 2
+	// result: (Select0 (ANDCCconst [int64(int16(m))] n))
+	for {
+		t := v.Type
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			x := v_0
+			if x.Op != OpPPC64MOVDconst {
+				continue
+			}
+			m := auxIntToInt64(x.AuxInt)
+			n := v_1
+			if !(t.Size() <= 2) {
+				continue
+			}
+			v.reset(OpSelect0)
+			v0 := b.NewValue0(v.Pos, OpPPC64ANDCCconst, types.NewTuple(typ.Int, types.TypeFlags))
+			v0.AuxInt = int64ToAuxInt(int64(int16(m)))
+			v0.AddArg(n)
+			v.AddArg(v0)
+			return true
+		}
+		break
+	}
+	// match: (AND x:(MOVDconst [m]) n)
+	// cond: isPPC64ValidShiftMask(m)
+	// result: (RLDICL [encodePPC64RotateMask(0,m,64)] n)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			x := v_0
+			if x.Op != OpPPC64MOVDconst {
+				continue
+			}
+			m := auxIntToInt64(x.AuxInt)
+			n := v_1
+			if !(isPPC64ValidShiftMask(m)) {
+				continue
+			}
+			v.reset(OpPPC64RLDICL)
+			v.AuxInt = int64ToAuxInt(encodePPC64RotateMask(0, m, 64))
+			v.AddArg(n)
+			return true
+		}
+		break
+	}
+	// match: (AND x:(MOVDconst [m]) n)
+	// cond: m != 0 && isPPC64ValidShiftMask(^m)
+	// result: (RLDICR [encodePPC64RotateMask(0,m,64)] n)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			x := v_0
+			if x.Op != OpPPC64MOVDconst {
+				continue
+			}
+			m := auxIntToInt64(x.AuxInt)
+			n := v_1
+			if !(m != 0 && isPPC64ValidShiftMask(^m)) {
+				continue
+			}
+			v.reset(OpPPC64RLDICR)
+			v.AuxInt = int64ToAuxInt(encodePPC64RotateMask(0, m, 64))
+			v.AddArg(n)
+			return true
+		}
+		break
+	}
+	// match: (AND x:(MOVDconst [m]) n)
+	// cond: t.Size() == 4 && isPPC64WordRotateMask(m)
+	// result: (RLWINM [encodePPC64RotateMask(0,m,32)] n)
+	for {
+		t := v.Type
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			x := v_0
+			if x.Op != OpPPC64MOVDconst {
+				continue
+			}
+			m := auxIntToInt64(x.AuxInt)
+			n := v_1
+			if !(t.Size() == 4 && isPPC64WordRotateMask(m)) {
+				continue
+			}
+			v.reset(OpPPC64RLWINM)
+			v.AuxInt = int64ToAuxInt(encodePPC64RotateMask(0, m, 32))
+			v.AddArg(n)
+			return true
+		}
+		break
+	}
+	return false
+}
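The predicates that guard these generated matches are defined elsewhere in rewrite.go and are not part of this patch. As a rough model of how they partition constant masks, the hypothetical classifyMask below assumes isPPC64ValidShiftMask accepts a single contiguous run of ones starting at bit 0 (consistent with the rules and the test expectations, but an inference, not the compiler source):

// classifyMask sketches which rewrite fires for a given 64-bit AND mask.
func classifyMask(m int64) string {
	// A run of ones up from bit 0 satisfies v != 0 && (v+1)&v == 0
	// (assumed semantics of isPPC64ValidShiftMask).
	isLowRun := func(v int64) bool { return v != 0 && (v+1)&v == 0 }
	switch {
	case isLowRun(m): // e.g. 0x0000FFFFFFFFFFFF: clear the high bits
		return "RLDICL"
	case m != 0 && isLowRun(^m): // e.g. 0xFFFFFFFFFFFF0000: clear the low bits
		return "RLDICR"
	default: // rotated word masks on 32-bit values may still become RLWINM
		return "MOVD+AND"
	}
}
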
 func rewriteValuePPC64latelower_OpPPC64ISEL(v *Value) bool {
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
@@ -49,6 +149,29 @@ func rewriteValuePPC64latelower_OpPPC64ISEL(v *Value) bool {
 	}
 	return false
 }
+func rewriteValuePPC64latelower_OpPPC64RLDICL(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (RLDICL [em] x:(SRDconst [s] a))
+	// cond: (em&0xFF0000) == 0
+	// result: (RLDICL [mergePPC64RLDICLandSRDconst(em, s)] a)
+	for {
+		em := auxIntToInt64(v.AuxInt)
+		x := v_0
+		if x.Op != OpPPC64SRDconst {
+			break
+		}
+		s := auxIntToInt64(x.AuxInt)
+		a := x.Args[0]
+		if !((em & 0xFF0000) == 0) {
+			break
+		}
+		v.reset(OpPPC64RLDICL)
+		v.AuxInt = int64ToAuxInt(mergePPC64RLDICLandSRDconst(em, s))
+		v.AddArg(a)
+		return true
+	}
+	return false
+}
 func rewriteValuePPC64latelower_OpPPC64SETBC(v *Value) bool {
 	v_0 := v.Args[0]
 	b := v.Block
diff --git a/test/codegen/bits.go b/test/codegen/bits.go
index 88d5ebe9cf..67daf12d62 100644
--- a/test/codegen/bits.go
+++ b/test/codegen/bits.go
@@ -394,3 +394,29 @@ func zeroextendAndMask8to64(a int8, b int16) (x, y uint64) {
 	return
 }
+
+// Verify rotate and mask instructions, and further simplified instructions for small types
+func bitRotateAndMask(io64 [4]uint64, io32 [4]uint32, io16 [4]uint16, io8 [4]uint8) {
+	// ppc64x: "RLDICR\t[$]0, R[0-9]*, [$]47, R"
+	io64[0] = io64[0] & 0xFFFFFFFFFFFF0000
+	// ppc64x: "RLDICL\t[$]0, R[0-9]*, [$]16, R"
+	io64[1] = io64[1] & 0x0000FFFFFFFFFFFF
+	// ppc64x: -"SRD", -"AND", "RLDICL\t[$]60, R[0-9]*, [$]16, R"
+	io64[2] = (io64[2] >> 4) & 0x0000FFFFFFFFFFFF
+	// ppc64x: -"SRD", -"AND", "RLDICL\t[$]36, R[0-9]*, [$]28, R"
+	io64[3] = (io64[3] >> 28) & 0x0000FFFFFFFFFFFF
+
+	// ppc64x: "RLWNM\t[$]0, R[0-9]*, [$]4, [$]19, R"
+	io32[0] = io32[0] & 0x0FFFF000
+	// ppc64x: "RLWNM\t[$]0, R[0-9]*, [$]20, [$]3, R"
+	io32[1] = io32[1] & 0xF0000FFF
+	// ppc64x: -"RLWNM", MOVD, AND
+	io32[2] = io32[2] & 0xFFFF0002
+
+	var bigc uint32 = 0x12345678
+	// ppc64x: "ANDCC\t[$]22136"
+	io16[0] = io16[0] & uint16(bigc)
+
+	// ppc64x: "ANDCC\t[$]120"
+	io8[0] = io8[0] & uint8(bigc)
+}
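As a usage note, the effect of the new rules can also be observed outside the asmcheck harness by cross-compiling a small file and reading the assembly dump; mask.go below is a hypothetical example, and -gcflags=-S is the standard flag for printing the compiler's generated code:

// mask.go: build with GOARCH=ppc64le go build -gcflags=-S mask.go
// With this change, the AND below should compile to a single
// RLDICL $0, Rx, $16, Ry instead of MOVD $mask, Rt; AND Rt, Rx, Ry.
package main

func clearHigh16(x uint64) uint64 {
	return x & 0x0000FFFFFFFFFFFF // run of ones from bit 0: RLDICL-able
}

func main() {
	println(clearHigh16(0x123456789ABCDEF0))
}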