1 // Copyright 2023 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
8 "cmd/compile/internal/base"
9 "cmd/compile/internal/types"
14 // memcombine combines smaller loads and stores into larger ones.
15 // We ensure this generates good code for encoding/binary operations.
16 // It may help other cases also.
17 func memcombine(f *Func) {
18 // This optimization requires that the architecture has
19 // unaligned loads and unaligned stores.
20 if !f.Config.unalignedOK {
// NOTE(review): the remainder of this function is elided in this excerpt.
// Presumably it returns early here and then calls memcombineLoads(f) and
// memcombineStores(f) below — confirm against the full file.
// memcombineLoads rewrites trees of ORed-together partial loads into a
// single wider load (see combineLoads for the per-tree rewrite).
// NOTE(review): several interior lines are elided in this excerpt.
28 func memcombineLoads(f *Func) {
29 // Find "OR trees" to start with.
30 mark := f.newSparseSet(f.NumValues())
31 defer f.retSparseSet(mark)
34 // Mark all values that are the argument of an OR.
35 for _, b := range f.Blocks {
36 for _, v := range b.Values {
37 if v.Op == OpOr16 || v.Op == OpOr32 || v.Op == OpOr64 {
38 mark.add(v.Args[0].ID)
39 mark.add(v.Args[1].ID)
// Second pass: collect only the roots of OR trees (OR values that are not
// themselves an argument of another OR).
43 for _, b := range f.Blocks {
44 for _, v := range b.Values {
45 if v.Op != OpOr16 && v.Op != OpOr32 && v.Op != OpOr64 {
48 if mark.contains(v.ID) {
49 // marked - means it is not the root of an OR tree
52 // Add the OR tree rooted at v to the order.
53 // We use BFS here, but any walk that puts roots before leaves would work.
55 order = append(order, v)
56 for ; i < len(order); i++ {
58 for j := 0; j < 2; j++ {
60 if a.Op == OpOr16 || a.Op == OpOr32 || a.Op == OpOr64 {
61 order = append(order, a)
// Try combining at each root, attempting the widest combine first and
// halving the element count until one succeeds.
66 for _, v := range order {
67 max := f.Config.RegSize
77 for n := max; n > 1; n /= 2 {
78 if combineLoads(v, n) {
86 // A BaseAddress represents the address ptr+idx, where
87 // ptr is a pointer type and idx is an integer type.
88 // idx may be nil, in which case it is treated as 0.
89 type BaseAddress struct {
// NOTE(review): the struct fields (presumably ptr, idx *Value — as used by
// the BaseAddress{ptr: ..., idx: ...} literals below) are elided here.
94 // splitPtr returns the base address of ptr and any
95 // constant offset from that base.
96 // BaseAddress{ptr,nil},0 is always a valid result, but splitPtr
97 // tries to peel away as many constants into off as possible.
// NOTE(review): interior lines (loop headers, variable declarations, the
// AddPtr index extraction) are elided in this excerpt.
98 func splitPtr(ptr *Value) (BaseAddress, int64) {
// Peel OffPtr (constant offset) and AddPtr (variable index) wrappers.
102 if ptr.Op == OpOffPtr {
105 } else if ptr.Op == OpAddPtr {
107 // We have two or more indexing values.
108 // Pick the first one we found.
109 return BaseAddress{ptr: ptr, idx: idx}, off
// Fold a constant addend of the index value into off as well.
112 if idx.Op == OpAdd32 || idx.Op == OpAdd64 {
113 if idx.Args[0].Op == OpConst32 || idx.Args[0].Op == OpConst64 {
114 off += idx.Args[0].AuxInt
116 } else if idx.Args[1].Op == OpConst32 || idx.Args[1].Op == OpConst64 {
117 off += idx.Args[1].AuxInt
123 return BaseAddress{ptr: ptr, idx: idx}, off
// combineLoads tries to rewrite the OR tree rooted at root, built from n
// shifted/extended loads of adjacent memory, into one n*size-byte load
// (plus byte swap / shift / extension as needed). Reports whether it
// succeeded.
// NOTE(review): many interior lines are elided in this excerpt; comments
// on elided logic are inferred from the surrounding code — confirm in the
// full file.
128 func combineLoads(root *Value, n int64) bool {
142 // Find n values that are ORed together with the above op.
143 a := make([]*Value, 0, 8)
// BFS over the OR tree: expand ORs into their two arguments, keeping
// leaves, until we have n leaf values (or fail).
145 for i := 0; i < len(a) && int64(len(a)) < n; i++ {
147 if v.Uses != 1 && v != root {
148 // Something in this subtree is used somewhere else.
153 a = append(a, v.Args[1])
157 if int64(len(a)) != n {
161 // Check that the first entry to see what ops we're looking for.
162 // All the entries should be of the form shift(extend(load)), maybe with no shift.
// Validate the extension op against the OR width (e.g. only 8/16/32-bit
// zero-extensions feed a 64-bit OR tree).
168 if orOp == OpOr64 && (v.Op == OpZeroExt8to64 || v.Op == OpZeroExt16to64 || v.Op == OpZeroExt32to64) ||
169 orOp == OpOr32 && (v.Op == OpZeroExt8to32 || v.Op == OpZeroExt16to32) ||
170 orOp == OpOr16 && v.Op == OpZeroExt8to16 {
179 base, _ := splitPtr(v.Args[0])
181 size := v.Type.Size()
183 if root.Block.Func.Config.arch == "S390X" {
184 // s390x can't handle unaligned accesses to global variables.
185 if base.ptr.Op == OpAddr {
190 // Check all the entries, extract useful info.
191 type LoadRecord struct {
193 offset int64 // offset of load address from base
196 r := make([]LoadRecord, n, 8)
197 for i := int64(0); i < n; i++ {
// Shift amount, if any, must be a constant.
204 if v.Args[1].Op != OpConst64 {
207 shift = v.Args[1].AuxInt
217 if load.Op != OpLoad {
// All loads must use the same memory state and the same base address.
223 if load.Args[1] != mem {
226 p, off := splitPtr(load.Args[0])
230 r[i] = LoadRecord{load: load, offset: off, shift: shift}
233 // Sort in memory address order.
234 sort.Slice(r, func(i, j int) bool {
235 return r[i].offset < r[j].offset
238 // Check that we have contiguous offsets.
239 for i := int64(0); i < n; i++ {
240 if r[i].offset != r[0].offset+i*size {
245 // Check for reads in little-endian or big-endian order.
247 isLittleEndian := true
248 for i := int64(0); i < n; i++ {
249 if r[i].shift != shift0+i*size*8 {
250 isLittleEndian = false
// (elided) the symmetric big-endian check follows.
255 for i := int64(0); i < n; i++ {
256 if r[i].shift != shift0-i*size*8 {
261 if !isLittleEndian && !isBigEndian {
265 // Find a place to put the new load.
266 // This is tricky, because it has to be at a point where
267 // its memory argument is live. We can't just put it in root.Block.
268 // We use the block of the latest load.
269 loads := make([]*Value, n, 8)
270 for i := int64(0); i < n; i++ {
273 loadBlock := mergePoint(root.Block, loads...)
274 if loadBlock == nil {
277 // Find a source position to use.
279 for _, load := range loads {
280 if load.Block == loadBlock {
285 if pos == src.NoXPos {
289 // Check to see if we need byte swap before storing.
290 needSwap := isLittleEndian && root.Block.Func.Config.BigEndian ||
291 isBigEndian && !root.Block.Func.Config.BigEndian
292 if needSwap && (size != 1 || !root.Block.Func.Config.haveByteSwap(n)) {
296 // This is the commit point.
298 // First, issue load at lowest address.
299 v = loadBlock.NewValue2(pos, OpLoad, sizeType(n*size), r[0].load.Args[0], mem)
301 // Byte swap if needed,
303 v = byteSwap(loadBlock, pos, v)
// Zero-extend to the root's width if the combined load is narrower.
307 if n*size < root.Type.Size() {
308 v = zeroExtend(loadBlock, pos, v, n*size, root.Type.Size())
// Re-apply the shift that positions the combined value correctly.
312 if isLittleEndian && shift0 != 0 {
313 v = leftShift(loadBlock, pos, v, shift0)
315 if isBigEndian && shift0-(n-1)*8 != 0 {
316 v = leftShift(loadBlock, pos, v, shift0-(n-1)*8)
319 // Install with (Copy v).
323 // Clobber the loads, just to prevent additional work being done on
324 // subtrees (which are now unreachable).
325 for i := int64(0); i < n; i++ {
// memcombineStores finds chains of narrow adjacent stores and tries to
// merge each chain into a single wider store (see combineStores).
// NOTE(review): interior lines are elided in this excerpt.
331 func memcombineStores(f *Func) {
332 mark := f.newSparseSet(f.NumValues())
333 defer f.retSparseSet(mark)
336 for _, b := range f.Blocks {
337 // Mark all stores which are not last in a store sequence.
339 for _, v := range b.Values {
341 mark.add(v.MemoryArg().ID)
345 // pick an order for visiting stores such that
346 // later stores come earlier in the ordering.
348 for _, v := range b.Values {
352 if mark.contains(v.ID) {
353 continue // not last in a chain of stores
356 order = append(order, v)
// (elided) walk backward through the store chain, appending predecessors,
// stopping when we leave this block or the chain of OpStores.
358 if v.Block != b || v.Op != OpStore {
364 // Look for combining opportunities at each store in queue order.
365 for _, v := range order {
366 if v.Op != OpStore { // already rewritten
370 size := v.Aux.(*types.Type).Size()
371 if size >= f.Config.RegSize || size == 0 {
// Try the widest combine first, halving the count until one succeeds.
375 for n := f.Config.RegSize / size; n > 1; n /= 2 {
376 if combineStores(v, n) {
384 // Try to combine the n stores ending in root.
385 // Returns true if successful.
// NOTE(review): many interior lines are elided in this excerpt; comments on
// elided logic are inferred from surrounding code — confirm in the full file.
386 func combineStores(root *Value, n int64) bool {
388 type StoreRecord struct {
// getShiftBase returns the common value that all the stored values are
// shifts/truncations of, or nil if there isn't one. It inspects the first
// two stored values to find the candidate base.
392 getShiftBase := func(a []StoreRecord) *Value {
393 x := a[0].store.Args[1]
394 y := a[1].store.Args[1]
// Strip a truncation from each candidate, if present.
396 case OpTrunc64to8, OpTrunc64to16, OpTrunc64to32, OpTrunc32to8, OpTrunc32to16, OpTrunc16to8:
402 case OpTrunc64to8, OpTrunc64to16, OpTrunc64to32, OpTrunc32to8, OpTrunc32to16, OpTrunc16to8:
// Then strip an unsigned right shift, if present.
409 case OpRsh64Ux64, OpRsh32Ux64, OpRsh16Ux64:
415 case OpRsh64Ux64, OpRsh32Ux64, OpRsh16Ux64:
420 // a shift of x and x itself.
424 // a shift of y and y itself.
428 // 2 shifts both of the same argument.
// isShiftBase reports whether v stores (a truncation/shift of) base.
433 isShiftBase := func(v, base *Value) bool {
436 case OpTrunc64to8, OpTrunc64to16, OpTrunc64to32, OpTrunc32to8, OpTrunc32to16, OpTrunc16to8:
445 case OpRsh64Ux64, OpRsh32Ux64, OpRsh16Ux64:
// shift returns the constant shift amount (in bits) applied to base in
// the value stored by v.
452 shift := func(v, base *Value) int64 {
455 case OpTrunc64to8, OpTrunc64to16, OpTrunc64to32, OpTrunc32to8, OpTrunc32to16, OpTrunc16to8:
464 case OpRsh64Ux64, OpRsh32Ux64, OpRsh16Ux64:
469 if val.Op != OpConst64 {
475 // Element size of the individual stores.
476 size := root.Aux.(*types.Type).Size()
477 if size*n > root.Block.Func.Config.RegSize {
481 // Gather n stores to look at. Check easy conditions we require.
482 a := make([]StoreRecord, 0, 8)
483 rbase, roff := splitPtr(root.Args[0])
484 if root.Block.Func.Config.arch == "S390X" {
485 // s390x can't handle unaligned accesses to global variables.
486 if rbase.ptr.Op == OpAddr {
490 a = append(a, StoreRecord{root, roff})
// Walk the memory chain backwards from root, collecting n stores.
491 for i, x := int64(1), root.Args[2]; i < n; i, x = i+1, x.Args[2] {
495 if x.Block != root.Block {
498 if x.Uses != 1 { // Note: root can have more than one use.
501 if x.Aux.(*types.Type).Size() != size {
504 base, off := splitPtr(x.Args[0])
508 a = append(a, StoreRecord{x, off})
510 // Before we sort, grab the memory arg the result should have.
511 mem := a[n-1].store.Args[2]
513 // Sort stores in increasing address order.
514 sort.Slice(a, func(i, j int) bool {
515 return a[i].offset < a[j].offset
518 // Check that everything is written to sequential locations.
519 for i := int64(0); i < n; i++ {
520 if a[i].offset != a[0].offset+i*size {
525 // Memory location we're going to write at (the lowest one).
526 ptr := a[0].store.Args[0]
528 // Check for constant stores
530 for i := int64(0); i < n; i++ {
531 switch a[i].store.Args[1].Op {
532 case OpConst32, OpConst16, OpConst8:
539 // Modify root to do all the stores.
// Assemble the combined constant, placing each byte/lane according to the
// target's endianness.
541 mask := int64(1)<<(8*size) - 1
542 for i := int64(0); i < n; i++ {
543 s := 8 * size * int64(i)
544 if root.Block.Func.Config.BigEndian {
547 c |= (a[i].store.Args[1].AuxInt & mask) << s
// Materialize the combined constant at the appropriate width.
552 cv = root.Block.Func.ConstInt16(types.Types[types.TUINT16], int16(c))
554 cv = root.Block.Func.ConstInt32(types.Types[types.TUINT32], int32(c))
556 cv = root.Block.Func.ConstInt64(types.Types[types.TUINT64], c)
559 // Move all the stores to the root.
560 for i := int64(0); i < n; i++ {
563 v.Aux = cv.Type // widen store type
569 v.Type = types.Types[types.TBOOL] // erase memory type
575 // Check that all the shift/trunc are of the same base value.
576 shiftBase := getShiftBase(a)
577 if shiftBase == nil {
580 for i := int64(0); i < n; i++ {
581 if !isShiftBase(a[i].store, shiftBase) {
586 // Check for writes in little-endian or big-endian order.
587 isLittleEndian := true
588 shift0 := shift(a[0].store, shiftBase)
589 for i := int64(1); i < n; i++ {
590 if shift(a[i].store, shiftBase) != shift0+i*8 {
591 isLittleEndian = false
// (elided) the symmetric big-endian check follows.
596 for i := int64(1); i < n; i++ {
597 if shift(a[i].store, shiftBase) != shift0-i*8 {
602 if !isLittleEndian && !isBigEndian {
606 // Check to see if we need byte swap before storing.
607 needSwap := isLittleEndian && root.Block.Func.Config.BigEndian ||
608 isBigEndian && !root.Block.Func.Config.BigEndian
609 if needSwap && (size != 1 || !root.Block.Func.Config.haveByteSwap(n)) {
613 // This is the commit point.
615 // Modify root to do all the stores.
// Shift the base value down so its low bits are the bits being stored,
// then narrow and optionally byte-swap it.
617 if isLittleEndian && shift0 != 0 {
618 sv = rightShift(root.Block, root.Pos, sv, shift0)
620 if isBigEndian && shift0-(n-1)*8 != 0 {
621 sv = rightShift(root.Block, root.Pos, sv, shift0-(n-1)*8)
623 if sv.Type.Size() > size*n {
624 sv = truncate(root.Block, root.Pos, sv, sv.Type.Size(), size*n)
627 sv = byteSwap(root.Block, root.Pos, sv)
630 // Move all the stores to the root.
631 for i := int64(0); i < n; i++ {
634 v.Aux = sv.Type // widen store type
640 v.Type = types.Types[types.TBOOL] // erase memory type
// sizeType returns the unsigned integer type of the given byte width
// (8/4/2 bytes visible here). Fatals on any other size.
// NOTE(review): the case labels of the switch are elided in this excerpt.
646 func sizeType(size int64) *types.Type {
649 return types.Types[types.TUINT64]
651 return types.Types[types.TUINT32]
653 return types.Types[types.TUINT16]
655 base.Fatalf("bad size %d\n", size)
// truncate returns a value equal to v with its width reduced from `from`
// bytes to `to` bytes. Only the 8→2, 8→4, and 4→2 conversions are
// supported; other size pairs are fatal.
660 func truncate(b *Block, pos src.XPos, v *Value, from, to int64) *Value {
// Encode the (from, to) pair as a single switch key, e.g. 8→4 is 84.
661 switch from*10 + to {
663 return b.NewValue1(pos, OpTrunc64to16, types.Types[types.TUINT16], v)
665 return b.NewValue1(pos, OpTrunc64to32, types.Types[types.TUINT32], v)
667 return b.NewValue1(pos, OpTrunc32to16, types.Types[types.TUINT16], v)
669 base.Fatalf("bad sizes %d %d\n", from, to)
// zeroExtend returns a value equal to v zero-extended from `from` bytes
// to `to` bytes. Only the 2→4, 2→8, and 4→8 conversions are visible here;
// other size pairs are fatal.
// NOTE(review): some case labels appear elided in this excerpt (e.g. 1→N).
673 func zeroExtend(b *Block, pos src.XPos, v *Value, from, to int64) *Value {
// Encode the (from, to) pair as a single switch key, e.g. 2→8 is 28.
674 switch from*10 + to {
676 return b.NewValue1(pos, OpZeroExt16to32, types.Types[types.TUINT32], v)
678 return b.NewValue1(pos, OpZeroExt16to64, types.Types[types.TUINT64], v)
680 return b.NewValue1(pos, OpZeroExt32to64, types.Types[types.TUINT64], v)
682 base.Fatalf("bad sizes %d %d\n", from, to)
// leftShift returns v shifted left by `shift` bits, choosing the shift op
// matching v's size (8/4/2 bytes). Fatals on other sizes.
687 func leftShift(b *Block, pos src.XPos, v *Value, shift int64) *Value {
688 s := b.Func.ConstInt64(types.Types[types.TUINT64], shift)
689 size := v.Type.Size()
// NOTE(review): the switch's case labels are elided in this excerpt.
692 return b.NewValue2(pos, OpLsh64x64, v.Type, v, s)
694 return b.NewValue2(pos, OpLsh32x64, v.Type, v, s)
696 return b.NewValue2(pos, OpLsh16x64, v.Type, v, s)
698 base.Fatalf("bad size %d\n", size)
// rightShift returns v shifted right (unsigned) by `shift` bits, choosing
// the shift op matching v's size (8/4/2 bytes). Fatals on other sizes.
702 func rightShift(b *Block, pos src.XPos, v *Value, shift int64) *Value {
703 s := b.Func.ConstInt64(types.Types[types.TUINT64], shift)
704 size := v.Type.Size()
// NOTE(review): the switch's case labels are elided in this excerpt.
707 return b.NewValue2(pos, OpRsh64Ux64, v.Type, v, s)
709 return b.NewValue2(pos, OpRsh32Ux64, v.Type, v, s)
711 return b.NewValue2(pos, OpRsh16Ux64, v.Type, v, s)
713 base.Fatalf("bad size %d\n", size)
// byteSwap returns v with its bytes reversed, choosing the Bswap op
// matching v's size (8/4/2 bytes). Fatals on other sizes.
717 func byteSwap(b *Block, pos src.XPos, v *Value) *Value {
718 switch v.Type.Size() {
// NOTE(review): the switch's case labels are elided in this excerpt.
720 return b.NewValue1(pos, OpBswap64, v.Type, v)
722 return b.NewValue1(pos, OpBswap32, v.Type, v)
724 return b.NewValue1(pos, OpBswap16, v.Type, v)
727 v.Fatalf("bad size %d\n", v.Type.Size())