From: Paul E. Murphy
Date: Tue, 27 Jun 2023 22:17:33 +0000 (-0500)
Subject: cmd/compile/internal/ssa: improve masking codegen on PPC64
X-Git-Tag: go1.22rc1~985
X-Git-Url: http://www.git.cypherpunks.ru/?a=commitdiff_plain;h=5cdb132228b90732d57215893a9910ded694c585;p=gostls13.git

cmd/compile/internal/ssa: improve masking codegen on PPC64

Generate RLDIC[LR] instead of MOVD mask, Rx; AND Rx, Ry, Rz. This helps
reduce code size and reduces the latency caused by the constant load.

Similarly, for smaller-than-register values, truncate constants which
exceed the range of the value's type to avoid needing to load a
constant.

Change-Id: I6019684795eb8962d4fd6d9585d08b17c15e7d64
Reviewed-on: https://go-review.googlesource.com/c/go/+/515576
Reviewed-by: Lynn Boger
Reviewed-by: Dmitri Shuralyov
Run-TryBot: Paul Murphy
TryBot-Result: Gopher Robot
Reviewed-by: Cherry Mui
---

diff --git a/src/cmd/compile/internal/ssa/_gen/PPC64latelower.rules b/src/cmd/compile/internal/ssa/_gen/PPC64latelower.rules
index 00d898f783..c4af55c328 100644
--- a/src/cmd/compile/internal/ssa/_gen/PPC64latelower.rules
+++ b/src/cmd/compile/internal/ssa/_gen/PPC64latelower.rules
@@ -17,3 +17,16 @@
 (SETBCR [0] cmp) && buildcfg.GOPPC64 <= 9 => (ISELZ [4] (MOVDconst [1]) cmp)
 (SETBC [1] cmp) && buildcfg.GOPPC64 <= 9 => (ISELZ [1] (MOVDconst [1]) cmp)
 (SETBCR [1] cmp) && buildcfg.GOPPC64 <= 9 => (ISELZ [5] (MOVDconst [1]) cmp)
+
+// The upper bits of smaller-than-register values are undefined. Take advantage of that.
+(AND x:(MOVDconst [m]) n) && t.Size() <= 2 => (Select0 (ANDCCconst [int64(int16(m))] n))
+
+// Convert simple bit masks to an equivalent rldic[lr] if possible.
+(AND x:(MOVDconst [m]) n) && isPPC64ValidShiftMask(m) => (RLDICL [encodePPC64RotateMask(0,m,64)] n)
+(AND x:(MOVDconst [m]) n) && m != 0 && isPPC64ValidShiftMask(^m) => (RLDICR [encodePPC64RotateMask(0,m,64)] n)
+
+// If the RLDICL does not rotate its value, a shifted value can be merged.
+(RLDICL [em] x:(SRDconst [s] a)) && (em&0xFF0000) == 0 => (RLDICL [mergePPC64RLDICLandSRDconst(em, s)] a)
+
+// Convert rotated 32 bit masks on 32 bit values into rlwinm. In general, this leaves the upper 32 bits in an undefined state.
+(AND x:(MOVDconst [m]) n) && t.Size() == 4 && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(0,m,32)] n)
diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go
index 6ee661dbbd..efbaae4d46 100644
--- a/src/cmd/compile/internal/ssa/rewrite.go
+++ b/src/cmd/compile/internal/ssa/rewrite.go
@@ -1499,6 +1499,25 @@ func encodePPC64RotateMask(rotate, mask, nbits int64) int64 {
 	return int64(me) | int64(mb<<8) | int64(rotate<<16) | int64(nbits<<24)
 }
 
+// Merge (RLDICL [encoded] (SRDconst [s] x)) into (RLDICL [new_encoded] x).
+// SRDconst on PPC64 is an extended mnemonic of RLDICL. If the input to an
+// RLDICL is an SRDconst, and the RLDICL does not rotate its value, the two
+// operations can be combined. This function assumes the two opcodes can
+// be merged, and returns an encoded rotate+mask value of the combined RLDICL.
+func mergePPC64RLDICLandSRDconst(encoded, s int64) int64 {
+	mb := s
+	r := 64 - s
+	// A larger mb is a smaller mask.
+	if (encoded>>8)&0xFF < mb {
+		encoded = (encoded &^ 0xFF00) | mb<<8
+	}
+	// The rotate is expected to be 0.
+	if (encoded & 0xFF0000) != 0 {
+		panic("non-zero rotate")
+	}
+	return encoded | r<<16
+}
+
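The auxint manipulated here packs me into bits 0:7, mb into bits 8:15, the rotate count into bits 16:23, and nbits into bits 24:31, per the return statement of encodePPC64RotateMask shown in context above. The following standalone sketch works through the merge arithmetic for one of the cases exercised by the new codegen test; encodeRotMask and mergeShift are hypothetical names, and unlike the real encoder, mb/me are supplied directly rather than derived from the mask.

package main

import "fmt"

// encodeRotMask packs a rotate+mask auxint the same way as the return
// statement of encodePPC64RotateMask above (illustrative signature only).
func encodeRotMask(rotate, mb, me, nbits int64) int64 {
	return me | mb<<8 | rotate<<16 | nbits<<24
}

// mergeShift mirrors the core of mergePPC64RLDICLandSRDconst: fold
// SRDconst [s] into a non-rotating RLDICL by turning the shift into a
// rotate of 64-s, and widening mb if the shift clears more high bits
// than the existing mask does.
func mergeShift(encoded, s int64) int64 {
	mb := s
	r := 64 - s
	if (encoded>>8)&0xFF < mb {
		encoded = (encoded &^ 0xFF00) | mb<<8 // a larger mb is a smaller mask
	}
	return encoded | r<<16
}

func main() {
	// (x >> 28) & 0x0000FFFFFFFFFFFF: the mask alone is mb=16, me=63
	// (bits 16..63 set; POWER numbers bit 0 as the MSB).
	e := encodeRotMask(0, 16, 63, 64)
	m := mergeShift(e, 28)
	// Folding the shift yields rotate=36, mb=28, i.e. RLDICL $36, Rx, $28,
	// matching the expectation in test/codegen/bits.go below.
	fmt.Printf("rotate=%d mb=%d\n", (m>>16)&0xFF, (m>>8)&0xFF)
}
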
 // DecodePPC64RotateMask is the inverse operation of encodePPC64RotateMask. The values returned as
 // mb and me satisfy the POWER ISA definition of MASK(x,y) where MASK(mb,me) = mask.
 func DecodePPC64RotateMask(sauxint int64) (rotate, mb, me int64, mask uint64) {
diff --git a/src/cmd/compile/internal/ssa/rewritePPC64latelower.go b/src/cmd/compile/internal/ssa/rewritePPC64latelower.go
index 56acbe403b..28e124d9e1 100644
--- a/src/cmd/compile/internal/ssa/rewritePPC64latelower.go
+++ b/src/cmd/compile/internal/ssa/rewritePPC64latelower.go
@@ -3,11 +3,16 @@
 package ssa
 
 import "internal/buildcfg"
+import "cmd/compile/internal/types"
 
 func rewriteValuePPC64latelower(v *Value) bool {
 	switch v.Op {
+	case OpPPC64AND:
+		return rewriteValuePPC64latelower_OpPPC64AND(v)
 	case OpPPC64ISEL:
 		return rewriteValuePPC64latelower_OpPPC64ISEL(v)
+	case OpPPC64RLDICL:
+		return rewriteValuePPC64latelower_OpPPC64RLDICL(v)
 	case OpPPC64SETBC:
 		return rewriteValuePPC64latelower_OpPPC64SETBC(v)
 	case OpPPC64SETBCR:
@@ -15,6 +20,101 @@ func rewriteValuePPC64latelower(v *Value) bool {
 	}
 	return false
 }
+func rewriteValuePPC64latelower_OpPPC64AND(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	b := v.Block
+	typ := &b.Func.Config.Types
+	// match: (AND x:(MOVDconst [m]) n)
+	// cond: t.Size() <= 2
+	// result: (Select0 (ANDCCconst [int64(int16(m))] n))
+	for {
+		t := v.Type
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			x := v_0
+			if x.Op != OpPPC64MOVDconst {
+				continue
+			}
+			m := auxIntToInt64(x.AuxInt)
+			n := v_1
+			if !(t.Size() <= 2) {
+				continue
+			}
+			v.reset(OpSelect0)
+			v0 := b.NewValue0(v.Pos, OpPPC64ANDCCconst, types.NewTuple(typ.Int, types.TypeFlags))
+			v0.AuxInt = int64ToAuxInt(int64(int16(m)))
+			v0.AddArg(n)
+			v.AddArg(v0)
+			return true
+		}
+		break
+	}
+	// match: (AND x:(MOVDconst [m]) n)
+	// cond: isPPC64ValidShiftMask(m)
+	// result: (RLDICL [encodePPC64RotateMask(0,m,64)] n)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			x := v_0
+			if x.Op != OpPPC64MOVDconst {
+				continue
+			}
+			m := auxIntToInt64(x.AuxInt)
+			n := v_1
+			if !(isPPC64ValidShiftMask(m)) {
+				continue
+			}
+			v.reset(OpPPC64RLDICL)
+			v.AuxInt = int64ToAuxInt(encodePPC64RotateMask(0, m, 64))
+			v.AddArg(n)
+			return true
+		}
+		break
+	}
+	// match: (AND x:(MOVDconst [m]) n)
+	// cond: m != 0 && isPPC64ValidShiftMask(^m)
+	// result: (RLDICR [encodePPC64RotateMask(0,m,64)] n)
+	for {
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			x := v_0
+			if x.Op != OpPPC64MOVDconst {
+				continue
+			}
+			m := auxIntToInt64(x.AuxInt)
+			n := v_1
+			if !(m != 0 && isPPC64ValidShiftMask(^m)) {
+				continue
+			}
+			v.reset(OpPPC64RLDICR)
+			v.AuxInt = int64ToAuxInt(encodePPC64RotateMask(0, m, 64))
+			v.AddArg(n)
+			return true
+		}
+		break
+	}
+	// match: (AND x:(MOVDconst [m]) n)
+	// cond: t.Size() == 4 && isPPC64WordRotateMask(m)
+	// result: (RLWINM [encodePPC64RotateMask(0,m,32)] n)
+	for {
+		t := v.Type
+		for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+			x := v_0
+			if x.Op != OpPPC64MOVDconst {
+				continue
+			}
+			m := auxIntToInt64(x.AuxInt)
+			n := v_1
+			if !(t.Size() == 4 && isPPC64WordRotateMask(m)) {
+				continue
+			}
+			v.reset(OpPPC64RLWINM)
+			v.AuxInt = int64ToAuxInt(encodePPC64RotateMask(0, m, 32))
+			v.AddArg(n)
+			return true
+		}
+		break
+	}
+	return false
+}
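The predicates that guard these generated matches are defined elsewhere in rewrite.go and are not part of this patch. As a rough model of how they partition constant masks, the hypothetical classifyMask below assumes isPPC64ValidShiftMask accepts a single contiguous run of ones starting at bit 0 (consistent with the rules and the test expectations, but an inference, not the compiler source):

// classifyMask sketches which rewrite fires for a given 64-bit AND mask.
func classifyMask(m int64) string {
	// A run of ones up from bit 0 satisfies v != 0 && (v+1)&v == 0
	// (assumed semantics of isPPC64ValidShiftMask).
	isLowRun := func(v int64) bool { return v != 0 && (v+1)&v == 0 }
	switch {
	case isLowRun(m): // e.g. 0x0000FFFFFFFFFFFF: clear the high bits
		return "RLDICL"
	case m != 0 && isLowRun(^m): // e.g. 0xFFFFFFFFFFFF0000: clear the low bits
		return "RLDICR"
	default: // rotated word masks on 32-bit values may still become RLWINM
		return "MOVD+AND"
	}
}
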
 func rewriteValuePPC64latelower_OpPPC64ISEL(v *Value) bool {
 	v_2 := v.Args[2]
 	v_1 := v.Args[1]
@@ -49,6 +149,29 @@ func rewriteValuePPC64latelower_OpPPC64ISEL(v *Value) bool {
 	}
 	return false
 }
+func rewriteValuePPC64latelower_OpPPC64RLDICL(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (RLDICL [em] x:(SRDconst [s] a))
+	// cond: (em&0xFF0000) == 0
+	// result: (RLDICL [mergePPC64RLDICLandSRDconst(em, s)] a)
+	for {
+		em := auxIntToInt64(v.AuxInt)
+		x := v_0
+		if x.Op != OpPPC64SRDconst {
+			break
+		}
+		s := auxIntToInt64(x.AuxInt)
+		a := x.Args[0]
+		if !((em & 0xFF0000) == 0) {
+			break
+		}
+		v.reset(OpPPC64RLDICL)
+		v.AuxInt = int64ToAuxInt(mergePPC64RLDICLandSRDconst(em, s))
+		v.AddArg(a)
+		return true
+	}
+	return false
+}
 func rewriteValuePPC64latelower_OpPPC64SETBC(v *Value) bool {
 	v_0 := v.Args[0]
 	b := v.Block
diff --git a/test/codegen/bits.go b/test/codegen/bits.go
index 88d5ebe9cf..67daf12d62 100644
--- a/test/codegen/bits.go
+++ b/test/codegen/bits.go
@@ -394,3 +394,29 @@ func zeroextendAndMask8to64(a int8, b int16) (x, y uint64) {
 	return
 }
+
+// Verify rotate and mask instructions, and further simplified instructions for small types
+func bitRotateAndMask(io64 [4]uint64, io32 [4]uint32, io16 [4]uint16, io8 [4]uint8) {
+	// ppc64x: "RLDICR\t[$]0, R[0-9]*, [$]47, R"
+	io64[0] = io64[0] & 0xFFFFFFFFFFFF0000
+	// ppc64x: "RLDICL\t[$]0, R[0-9]*, [$]16, R"
+	io64[1] = io64[1] & 0x0000FFFFFFFFFFFF
+	// ppc64x: -"SRD", -"AND", "RLDICL\t[$]60, R[0-9]*, [$]16, R"
+	io64[2] = (io64[2] >> 4) & 0x0000FFFFFFFFFFFF
+	// ppc64x: -"SRD", -"AND", "RLDICL\t[$]36, R[0-9]*, [$]28, R"
+	io64[3] = (io64[3] >> 28) & 0x0000FFFFFFFFFFFF
+
+	// ppc64x: "RLWNM\t[$]0, R[0-9]*, [$]4, [$]19, R"
+	io32[0] = io32[0] & 0x0FFFF000
+	// ppc64x: "RLWNM\t[$]0, R[0-9]*, [$]20, [$]3, R"
+	io32[1] = io32[1] & 0xF0000FFF
+	// ppc64x: -"RLWNM", MOVD, AND
+	io32[2] = io32[2] & 0xFFFF0002
+
+	var bigc uint32 = 0x12345678
+	// ppc64x: "ANDCC\t[$]22136"
+	io16[0] = io16[0] & uint16(bigc)
+
+	// ppc64x: "ANDCC\t[$]120"
+	io8[0] = io8[0] & uint8(bigc)
+}
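As a usage note, the effect of the new rules can also be observed outside the asmcheck harness by cross-compiling a small file and reading the assembly dump; mask.go below is a hypothetical example, and -gcflags=-S is the standard flag for printing the compiler's generated code:

// mask.go: build with GOARCH=ppc64le go build -gcflags=-S mask.go
// With this change, the AND below should compile to a single
// RLDICL $0, Rx, $16, Ry instead of MOVD $mask, Rt; AND Rt, Rx, Ry.
package main

func clearHigh16(x uint64) uint64 {
	return x & 0x0000FFFFFFFFFFFF // run of ones from bit 0: RLDICL-able
}

func main() {
	println(clearHigh16(0x123456789ABCDEF0))
}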