This CL enables intrinsic support to emit the following prefetch
instructions for PPC64 platform that are already emitted on other
platforms
1. Prefetch - prefetches data from memory address to cache;
2. PrefetchStreamed - prefetches data from memory address, with a
hint that this data is being streamed.
Benchmarks picked from go/test/bench/garbage
Parameters tested with:
GOMAXPROCS=8
tree2 -heapsize=
1000000000 -cpus=8
tree -n=18
parser
peano
Performance results with this change on POWER9
name old time/op new time/op delta
Tree2-8 75.3ms ± 2% 65.0ms ± 6% -13.61% (p=0.003 n=5+7)
Tree-8 576ms ± 2% 576ms ± 1% ~ (p=0.756 n=11+10)
Parser-8 3.60s ± 2% 3.59s ± 1% ~ (p=0.818 n=6+6)
Peano-8 84.8ms ± 1% 84.6ms ± 1% ~ (p=0.180 n=6+6)
Results on POWER8 and POWER10 are similar
Change-Id: If4ac95a85aaa7b2266014e1f8fb7cd7440cbf906
Reviewed-on: https://go-review.googlesource.com/c/go/+/353730
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Trust: Michael Knyszek <mknyszek@google.com>
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
+ case ssa.OpPPC64DCBT:
+ p := s.Prog(v.Op.Asm())
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = v.Args[0].Reg()
+ p.To.Type = obj.TYPE_CONST
+ p.To.Offset = v.AuxInt
+
case ssa.OpPPC64MOVWstorezero, ssa.OpPPC64MOVHstorezero, ssa.OpPPC64MOVBstorezero:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
&& clobber(call)
=> (Move [sz] dst src mem)
+// Prefetch instructions (aux is option: 0 - DCBT ; 8 - DCBT stream)
+(PrefetchCache ptr mem) => (DCBT ptr mem [0])
+(PrefetchCacheStreamed ptr mem) => (DCBT ptr mem [8])
+
crgp21 = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}}
gpload = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp}}
gploadidx = regInfo{inputs: []regMask{gp | sp | sb, gp}, outputs: []regMask{gp}}
+ prefreg = regInfo{inputs: []regMask{gp | sp | sb}}
gpstore = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}}
gpstoreidx = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, gp | sp | sb}}
gpstorezero = regInfo{inputs: []regMask{gp | sp | sb}} // ppc64.REGZERO is reserved zero value
{name: "FMOVDloadidx", argLength: 3, reg: fploadidx, asm: "FMOVD", typ: "Float64"},
{name: "FMOVSloadidx", argLength: 3, reg: fploadidx, asm: "FMOVS", typ: "Float32"},
+ // Prefetch instruction
+ // Do prefetch of address generated with arg0 and arg1 with option aux. arg0=addr,arg1=memory, aux=option.
+ {name: "DCBT", argLength: 2, aux: "Int64", reg: prefreg, asm: "DCBT", hasSideEffects: true},
+
// Store bytes in the reverse endian order of the arch into arg0.
// These are indexed stores with no offset field in the instruction so the auxint fields are not used.
{name: "MOVDBRstore", argLength: 3, reg: gpstore, asm: "MOVDBR", aux: "Sym", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes reverse order
OpPPC64MOVDBRloadidx
OpPPC64FMOVDloadidx
OpPPC64FMOVSloadidx
+ OpPPC64DCBT
OpPPC64MOVDBRstore
OpPPC64MOVWBRstore
OpPPC64MOVHBRstore
},
},
},
+ {
+ name: "DCBT",
+ auxType: auxInt64,
+ argLen: 2,
+ hasSideEffects: true,
+ asm: ppc64.ADCBT,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+ },
+ },
+ },
{
name: "MOVDBRstore",
auxType: auxSym,
return true
case OpPopCount8:
return rewriteValuePPC64_OpPopCount8(v)
+ case OpPrefetchCache:
+ return rewriteValuePPC64_OpPrefetchCache(v)
+ case OpPrefetchCacheStreamed:
+ return rewriteValuePPC64_OpPrefetchCacheStreamed(v)
case OpRotateLeft16:
return rewriteValuePPC64_OpRotateLeft16(v)
case OpRotateLeft32:
return true
}
}
+func rewriteValuePPC64_OpPrefetchCache(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ // match: (PrefetchCache ptr mem)
+ // result: (DCBT ptr mem [0])
+ for {
+ ptr := v_0
+ mem := v_1
+ v.reset(OpPPC64DCBT)
+ v.AuxInt = int64ToAuxInt(0)
+ v.AddArg2(ptr, mem)
+ return true
+ }
+}
+func rewriteValuePPC64_OpPrefetchCacheStreamed(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ // match: (PrefetchCacheStreamed ptr mem)
+ // result: (DCBT ptr mem [8])
+ for {
+ ptr := v_0
+ mem := v_1
+ v.reset(OpPPC64DCBT)
+ v.AuxInt = int64ToAuxInt(8)
+ v.AddArg2(ptr, mem)
+ return true
+ }
+}
func rewriteValuePPC64_OpRotateLeft16(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
// Make Prefetch intrinsics for supported platforms
// On the unsupported platforms stub function will be eliminated
addF("runtime/internal/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache),
- sys.AMD64, sys.ARM64)
+ sys.AMD64, sys.ARM64, sys.PPC64)
addF("runtime/internal/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed),
- sys.AMD64, sys.ARM64)
+ sys.AMD64, sys.ARM64, sys.PPC64)
/******** runtime/internal/atomic ********/
addF("runtime/internal/atomic", "Load",