From: Paul E. Murphy Date: Thu, 26 Oct 2023 14:23:12 +0000 (-0500) Subject: cmd/internal/asm/ppc64: avoid generating exser nops X-Git-Tag: go1.22rc1~425 X-Git-Url: http://www.git.cypherpunks.ru/?a=commitdiff_plain;h=7e5ed466bb63af4fcf0cae95f168ed1b4d4aa90c;p=gostls13.git cmd/internal/asm/ppc64: avoid generating exser nops "OR $0, R31, R31" is the execution serializing nop called "exser" on ISA 3.1 processors such as Power10. In general, the "OR $0, Rx, Rx" where Rx != 0 form should be avoided unless used explicitly for the uarch side-effects. Change-Id: Id76e3a703c902676ba4a3ffb64dd90dad9a320bf Reviewed-on: https://go-review.googlesource.com/c/go/+/537855 TryBot-Result: Gopher Robot Reviewed-by: Lynn Boger Reviewed-by: Cherry Mui LUCI-TryBot-Result: Go LUCI Run-TryBot: Paul Murphy Reviewed-by: Heschi Kreinick --- diff --git a/src/cmd/asm/internal/asm/testdata/ppc64.s b/src/cmd/asm/internal/asm/testdata/ppc64.s index cc8d6c84d3..983a368a99 100644 --- a/src/cmd/asm/internal/asm/testdata/ppc64.s +++ b/src/cmd/asm/internal/asm/testdata/ppc64.s @@ -179,7 +179,9 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0 ADD $-32768, R6 // 38c68000 ADD $-32768, R6, R5 // 38a68000 // Hex constant 0xFFFFFFFE00000000 - ADD $-8589934592, R5 // 3fe0fffe63ff00007bff83e463ff00007cbf2a14 or 0602000038a50000 + ADD $-8589934592, R5 // 3fe0fffe600000007bff83e4600000007cbf2a14 or 0602000038a50000 + // Hex constant 0xFFFFFFFE00010001 + ADD $-8589869055, R5 // 3fe0fffe63ff00017bff83e463ff00017cbf2a14 or 0602000138a50001 //TODO: this compiles to add r5,r6,r0. It should be addi r5,r6,0. // this is OK since r0 == $0, but the latter is preferred. @@ -223,6 +225,8 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0 OR $-32768, R6, R7 // 3be080007fe73378 OR $1234567, R5 // 641f001263ffd6877fe52b78 OR $1234567, R5, R3 // 641f001263ffd6877fe32b78 + OR $2147483648, R5, R3 // 641f8000600000007fe32b78 + OR $2147483649, R5, R3 // 641f800063ff00017fe32b78 ORIS $255, R3, R4 XOR $1, R3 // 68630001 @@ -249,7 +253,6 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0 CMPB R3,R4,R4 // 7c6423f8 CMPEQB R3,R4,CR6 // 7f0321c0 - // TODO: constants for ADDC? ADD R3, R4 // 7c841a14 ADD R3, R4, R5 // 7ca41a14 ADDC R3, R4 // 7c841814 @@ -262,6 +265,8 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0 ADDV R3, R4 // 7c841e14 ADDVCC R3, R4 // 7c841e15 ADDCCC R3, R4, R5 // 7ca41815 + ADDCCC $65536, R4, R5 // 641f0001600000007cbf2015 + ADDCCC $65537, R4, R5 // 641f000163ff00017cbf2015 ADDME R3, R4 // 7c8301d4 ADDMECC R3, R4 // 7c8301d5 ADDMEV R3, R4 // 7c8305d4 @@ -315,6 +320,8 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0 SUBECC R3, R4, R5 // 7ca32111 SUBEV R3, R4, R5 // 7ca32510 SUBEVCC R3, R4, R5 // 7ca32511 + SUBC R3, $65536, R4 // 3fe00001600000007c83f810 + SUBC R3, $65537, R4 // 3fe0000163ff00017c83f810 MULLW R3, R4 // 7c8419d6 MULLW R3, R4, R5 // 7ca419d6 diff --git a/src/cmd/internal/obj/ppc64/asm9.go b/src/cmd/internal/obj/ppc64/asm9.go index 73642bd209..dcecb26d00 100644 --- a/src/cmd/internal/obj/ppc64/asm9.go +++ b/src/cmd/internal/obj/ppc64/asm9.go @@ -65,6 +65,11 @@ const ( PFX_R_PCREL = 1 // Offset is relative to PC, RA should be 0 ) +const ( + // The preferred hardware nop instruction. + NOP = 0x60000000 +) + type Optab struct { as obj.As // Opcode a1 uint8 // p.From argument (obj.Addr). p is of type obj.Prog. @@ -831,7 +836,6 @@ func span9(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { // lay out the code, emitting code and data relocations. bp := c.cursym.P - nop := LOP_IRR(OP_ORI, REGZERO, REGZERO, 0) var i int32 for p := c.cursym.Func().Text.Link; p != nil; p = p.Link { c.pc = p.Pc @@ -846,13 +850,13 @@ func span9(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { if v > 0 { // Same padding instruction for all for i = 0; i < int32(v/4); i++ { - c.ctxt.Arch.ByteOrder.PutUint32(bp, nop) + c.ctxt.Arch.ByteOrder.PutUint32(bp, NOP) bp = bp[4:] } } } else { if p.Mark&PFX_X64B != 0 { - c.ctxt.Arch.ByteOrder.PutUint32(bp, nop) + c.ctxt.Arch.ByteOrder.PutUint32(bp, NOP) bp = bp[4:] } o.asmout(&c, p, o, &out) @@ -2531,6 +2535,18 @@ func decodeMask64(mask int64) (mb, me uint32, valid bool) { return mb, (me - 1) & 63, valid } +// Load the lower 16 bits of a constant into register r. +func loadl16(r int, d int64) uint32 { + v := uint16(d) + if v == 0 { + // Avoid generating "ori r,r,0", r != 0. Instead, generate the architectually preferred nop. + // For example, "ori r31,r31,0" is a special execution serializing nop on Power10 called "exser". + return NOP + } + return LOP_IRR(OP_ORI, uint32(r), uint32(r), uint32(v)) +} + +// Load the upper 16 bits of a 32b constant into register r. func loadu32(r int, d int64) uint32 { v := int32(d >> 16) if isuint32(uint64(d)) { @@ -2734,7 +2750,7 @@ func asmout(c *ctxt9, p *obj.Prog, o *Optab, out *[5]uint32) { rel.Add = int64(v) rel.Type = objabi.R_CALLPOWER } - o2 = 0x60000000 // nop, sometimes overwritten by ld r2, 24(r1) when dynamic linking + o2 = NOP // nop, sometimes overwritten by ld r2, 24(r1) when dynamic linking case 13: /* mov[bhwd]{z,} r,r */ // This needs to handle "MOV* $0, Rx". This shows up because $0 also @@ -2957,14 +2973,14 @@ func asmout(c *ctxt9, p *obj.Prog, o *Optab, out *[5]uint32) { } else if o.size == 12 { // Note, o1 is ADDIS if d is negative, ORIS otherwise. o1 = loadu32(REGTMP, d) // tmp = d & 0xFFFF0000 - o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(int32(d))) // tmp |= d & 0xFFFF + o2 = loadl16(REGTMP, d) // tmp |= d & 0xFFFF o3 = AOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r)) // to = from + tmp } else { // For backwards compatibility with GOPPC64 < 10, generate 34b constants in register. - o1 = LOP_IRR(OP_ADDIS, REGZERO, REGTMP, uint32(d>>32)) // tmp = sign_extend((d>>32)&0xFFFF0000) - o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(d>>16)) // tmp |= (d>>16)&0xFFFF - o3 = AOP_MD(OP_RLDICR, REGTMP, REGTMP, 16, 63-16) // tmp <<= 16 - o4 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(uint16(d))) // tmp |= d&0xFFFF + o1 = LOP_IRR(OP_ADDIS, REGZERO, REGTMP, uint32(d>>32)) // tmp = sign_extend((d>>32)&0xFFFF0000) + o2 = loadl16(REGTMP, int64(d>>16)) // tmp |= (d>>16)&0xFFFF + o3 = AOP_MD(OP_RLDICR, REGTMP, REGTMP, 16, 63-16) // tmp <<= 16 + o4 = loadl16(REGTMP, int64(uint16(d))) // tmp |= d&0xFFFF o5 = AOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r)) } @@ -2985,7 +3001,7 @@ func asmout(c *ctxt9, p *obj.Prog, o *Optab, out *[5]uint32) { o2 = LOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r)) } else { o1 = loadu32(REGTMP, d) - o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(int32(d))) + o2 = loadl16(REGTMP, d) o3 = LOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), REGTMP, uint32(r)) } if p.From.Sym != nil { @@ -3081,9 +3097,9 @@ func asmout(c *ctxt9, p *obj.Prog, o *Optab, out *[5]uint32) { if p.To.Reg == REGTMP || p.From.Reg == REGTMP { c.ctxt.Diag("can't synthesize large constant\n%v", p) } - v := c.regoff(p.GetFrom3()) + v := c.vregoff(p.GetFrom3()) o1 = AOP_IRR(OP_ADDIS, REGTMP, REGZERO, uint32(v)>>16) - o2 = LOP_IRR(OP_ORI, REGTMP, REGTMP, uint32(v)) + o2 = loadl16(REGTMP, v) o3 = AOP_RRR(c.oprrr(p.As), uint32(p.To.Reg), uint32(p.From.Reg), REGTMP) if p.From.Sym != nil { c.ctxt.Diag("%v is not supported", p)