// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
8 "cmd/compile/internal/base"
9 "cmd/compile/internal/types"
// memcombine combines smaller loads and stores into larger ones.
// We ensure this generates good code for encoding/binary operations.
// It may help other cases also.
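//
// For example (an illustrative sketch, not code from this package), a
// little-endian 32-bit read assembled by hand from byte loads, such as
//
//	x := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
//
// can be rewritten into a single 4-byte load (plus a byte swap on
// big-endian targets), and the symmetric store pattern into a single
// 4-byte store.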
func memcombine(f *Func) {
	// This optimization requires that the architecture has
	// unaligned loads and unaligned stores.
	if !f.Config.unalignedOK {

func memcombineLoads(f *Func) {
	// Find "OR trees" to start with.
	mark := f.newSparseSet(f.NumValues())
	defer f.retSparseSet(mark)

	// Mark all values that are the argument of an OR.
	for _, b := range f.Blocks {
		for _, v := range b.Values {
			if v.Op == OpOr16 || v.Op == OpOr32 || v.Op == OpOr64 {
				mark.add(v.Args[0].ID)
				mark.add(v.Args[1].ID)
	for _, b := range f.Blocks {
		for _, v := range b.Values {
			if v.Op != OpOr16 && v.Op != OpOr32 && v.Op != OpOr64 {
			if mark.contains(v.ID) {
				// marked - means it is not the root of an OR tree
			// Add the OR tree rooted at v to the order.
			// We use BFS here, but any walk that puts roots before leaves would work.
			order = append(order, v)
			for ; i < len(order); i++ {
				for j := 0; j < 2; j++ {
					if a.Op == OpOr16 || a.Op == OpOr32 || a.Op == OpOr64 {
						order = append(order, a)
		for _, v := range order {
			max := f.Config.RegSize
			for n := max; n > 1; n /= 2 {
				if combineLoads(v, n) {
// A BaseAddress represents the address ptr+idx, where
// ptr is a pointer type and idx is an integer type.
// idx may be nil, in which case it is treated as 0.
type BaseAddress struct {

// splitPtr returns the base address of ptr and any
// constant offset from that base.
// BaseAddress{ptr,nil},0 is always a valid result, but splitPtr
// tries to peel away as many constants into off as possible.
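//
// For example (an illustrative SSA shape, not output from a real
// compilation), given the address
//
//	(OffPtr [8] (AddPtr p (Add64 (Const64 [4]) i)))
//
// splitPtr would return BaseAddress{ptr: p, idx: i} and an offset of 12.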
func splitPtr(ptr *Value) (BaseAddress, int64) {
		if ptr.Op == OpOffPtr {
		} else if ptr.Op == OpAddPtr {
				// We have two or more indexing values.
				// Pick the first one we found.
				return BaseAddress{ptr: ptr, idx: idx}, off
			if idx.Op == OpAdd32 || idx.Op == OpAdd64 {
				if idx.Args[0].Op == OpConst32 || idx.Args[0].Op == OpConst64 {
					off += idx.Args[0].AuxInt
				} else if idx.Args[1].Op == OpConst32 || idx.Args[1].Op == OpConst64 {
					off += idx.Args[1].AuxInt
			return BaseAddress{ptr: ptr, idx: idx}, off
func combineLoads(root *Value, n int64) bool {
	// Find n values that are ORed together with the above op.
	a := make([]*Value, 0, 8)
	for i := 0; i < len(a) && int64(len(a)) < n; i++ {
		if v.Uses != 1 && v != root {
			// Something in this subtree is used somewhere else.
			a = append(a, v.Args[1])
	if int64(len(a)) != n {
	// Check the first entry to see what ops we're looking for.
	// All the entries should be of the form shift(extend(load)), maybe with no shift.
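	// For example, with orOp == OpOr32 a typical entry has the shape
	// (an illustrative sketch, not literal SSA output):
	//
	//	(Lsh32x64 (ZeroExt8to32 (Load <uint8> ptr mem)) (Const64 [8]))
	//
	// or just (ZeroExt8to32 (Load <uint8> ptr mem)) for the unshifted byte.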
	if orOp == OpOr64 && (v.Op == OpZeroExt8to64 || v.Op == OpZeroExt16to64 || v.Op == OpZeroExt32to64) ||
		orOp == OpOr32 && (v.Op == OpZeroExt8to32 || v.Op == OpZeroExt16to32) ||
		orOp == OpOr16 && v.Op == OpZeroExt8to16 {
	base, _ := splitPtr(v.Args[0])
	size := v.Type.Size()

	if root.Block.Func.Config.arch == "S390X" {
		// s390x can't handle unaligned accesses to global variables.
		if base.ptr.Op == OpAddr {

	// Check all the entries, extract useful info.
	type LoadRecord struct {
		offset int64 // offset of load address from base
	r := make([]LoadRecord, n, 8)
	for i := int64(0); i < n; i++ {
			if v.Args[1].Op != OpConst64 {
			shift = v.Args[1].AuxInt
		if load.Op != OpLoad {
		if load.Args[1] != mem {
		p, off := splitPtr(load.Args[0])
		r[i] = LoadRecord{load: load, offset: off, shift: shift}
	// Sort in memory address order.
	sort.Slice(r, func(i, j int) bool {
		return r[i].offset < r[j].offset

	// Check that we have contiguous offsets.
	for i := int64(0); i < n; i++ {
		if r[i].offset != r[0].offset+i*size {

	// Check for reads in little-endian or big-endian order.
	isLittleEndian := true
	for i := int64(0); i < n; i++ {
		if r[i].shift != shift0+i*size*8 {
			isLittleEndian = false
	for i := int64(0); i < n; i++ {
		if r[i].shift != shift0-i*size*8 {
	if !isLittleEndian && !isBigEndian {
	// Find a place to put the new load.
	// This is tricky, because it has to be at a point where
	// its memory argument is live. We can't just put it in root.Block.
	// We use the block of the latest load.
	loads := make([]*Value, n, 8)
	for i := int64(0); i < n; i++ {
	loadBlock := mergePoint(root.Block, loads...)
	if loadBlock == nil {

	// Find a source position to use.
	for _, load := range loads {
		if load.Block == loadBlock {
	if pos == src.NoXPos {
	// Check to see if we need a byte swap after loading.
	needSwap := isLittleEndian && root.Block.Func.Config.BigEndian ||
		isBigEndian && !root.Block.Func.Config.BigEndian
	if needSwap && (size != 1 || !root.Block.Func.Config.haveByteSwap(n)) {
	// This is the commit point.

	// First, issue load at lowest address.
	v = loadBlock.NewValue2(pos, OpLoad, sizeType(n*size), r[0].load.Args[0], mem)
	// Byte swap if needed.
		v = byteSwap(loadBlock, pos, v)
	if n*size < root.Type.Size() {
		v = zeroExtend(loadBlock, pos, v, n*size, root.Type.Size())

	if isLittleEndian && shift0 != 0 {
		v = leftShift(loadBlock, pos, v, shift0)
	if isBigEndian && shift0-(n-1)*size*8 != 0 {
		v = leftShift(loadBlock, pos, v, shift0-(n-1)*size*8)
	// Install with (Copy v).

	// Clobber the loads, just to prevent additional work being done on
	// subtrees (which are now unreachable).
	for i := int64(0); i < n; i++ {
func memcombineStores(f *Func) {
	mark := f.newSparseSet(f.NumValues())
	defer f.retSparseSet(mark)

	for _, b := range f.Blocks {
		// Mark all stores which are not last in a store sequence.
		for _, v := range b.Values {
				mark.add(v.MemoryArg().ID)

		// pick an order for visiting stores such that
		// later stores come earlier in the ordering.
		for _, v := range b.Values {
			if mark.contains(v.ID) {
				continue // not last in a chain of stores
				order = append(order, v)
				if v.Block != b || v.Op != OpStore {

		// Look for combining opportunities at each store in queue order.
		for _, v := range order {
			if v.Op != OpStore { // already rewritten

			size := v.Aux.(*types.Type).Size()
			if size >= f.Config.RegSize || size == 0 {

			for n := f.Config.RegSize / size; n > 1; n /= 2 {
				if combineStores(v, n) {
// Try to combine the n stores ending in root.
// Returns true if successful.
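//
// For example (illustrative Go source, assuming a little-endian target),
// four byte stores that spread a 32-bit value x across p[0..3],
//
//	p[0] = byte(x)
//	p[1] = byte(x >> 8)
//	p[2] = byte(x >> 16)
//	p[3] = byte(x >> 24)
//
// can be replaced by a single 4-byte store of x (with a byte swap first
// on big-endian targets that have one available).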
func combineStores(root *Value, n int64) bool {
	type StoreRecord struct {
	getShiftBase := func(a []StoreRecord) *Value {
		x := a[0].store.Args[1]
		y := a[1].store.Args[1]
		case OpTrunc64to8, OpTrunc64to16, OpTrunc64to32, OpTrunc32to8, OpTrunc32to16, OpTrunc16to8:
		case OpTrunc64to8, OpTrunc64to16, OpTrunc64to32, OpTrunc32to8, OpTrunc32to16, OpTrunc16to8:
		case OpRsh64Ux64, OpRsh32Ux64, OpRsh16Ux64:
		case OpRsh64Ux64, OpRsh32Ux64, OpRsh16Ux64:
			// a shift of x and x itself.
			// a shift of y and y itself.
			// 2 shifts both of the same argument.
	isShiftBase := func(v, base *Value) bool {
		case OpTrunc64to8, OpTrunc64to16, OpTrunc64to32, OpTrunc32to8, OpTrunc32to16, OpTrunc16to8:
		case OpRsh64Ux64, OpRsh32Ux64, OpRsh16Ux64:
	shift := func(v, base *Value) int64 {
		case OpTrunc64to8, OpTrunc64to16, OpTrunc64to32, OpTrunc32to8, OpTrunc32to16, OpTrunc16to8:
		case OpRsh64Ux64, OpRsh32Ux64, OpRsh16Ux64:
		if val.Op != OpConst64 {
	// Element size of the individual stores.
	size := root.Aux.(*types.Type).Size()
	if size*n > root.Block.Func.Config.RegSize {

	// Gather n stores to look at. Check easy conditions we require.
	a := make([]StoreRecord, 0, 8)
	rbase, roff := splitPtr(root.Args[0])
	if root.Block.Func.Config.arch == "S390X" {
		// s390x can't handle unaligned accesses to global variables.
		if rbase.ptr.Op == OpAddr {
	a = append(a, StoreRecord{root, roff})
	for i, x := int64(1), root.Args[2]; i < n; i, x = i+1, x.Args[2] {
		if x.Block != root.Block {
		if x.Uses != 1 { // Note: root can have more than one use.
		if x.Aux.(*types.Type).Size() != size {
			// TODO: the constant source and consecutive load source cases
			// do not need all the stores to be the same size.
		base, off := splitPtr(x.Args[0])
		a = append(a, StoreRecord{x, off})
	// Before we sort, grab the memory arg the result should have.
	mem := a[n-1].store.Args[2]

	// Sort stores in increasing address order.
	sort.Slice(a, func(i, j int) bool {
		return a[i].offset < a[j].offset

	// Check that everything is written to sequential locations.
	for i := int64(0); i < n; i++ {
		if a[i].offset != a[0].offset+i*size {

	// Memory location we're going to write at (the lowest one).
	ptr := a[0].store.Args[0]

	// Check for constant stores
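	// For example (an illustrative case): four 1-byte stores of the
	// constants 0x01, 0x02, 0x03, 0x04 to p, p+1, p+2, p+3 can become a
	// single 4-byte store of the constant 0x04030201 on a little-endian
	// target (0x01020304 on a big-endian target).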
	for i := int64(0); i < n; i++ {
		switch a[i].store.Args[1].Op {
		case OpConst32, OpConst16, OpConst8:

		// Modify root to do all the stores.
		mask := int64(1)<<(8*size) - 1
		for i := int64(0); i < n; i++ {
			s := 8 * size * int64(i)
			if root.Block.Func.Config.BigEndian {
			c |= (a[i].store.Args[1].AuxInt & mask) << s
			cv = root.Block.Func.ConstInt16(types.Types[types.TUINT16], int16(c))
			cv = root.Block.Func.ConstInt32(types.Types[types.TUINT32], int32(c))
			cv = root.Block.Func.ConstInt64(types.Types[types.TUINT64], c)

		// Move all the stores to the root.
		for i := int64(0); i < n; i++ {
				v.Aux = cv.Type // widen store type
				v.Type = types.Types[types.TBOOL] // erase memory type
	// Check for consecutive loads as the source of the stores.
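	// For example (illustrative): a byte-by-byte copy such as
	//
	//	dst[0] = src[0]
	//	dst[1] = src[1]
	//	dst[2] = src[2]
	//	dst[3] = src[3]
	//
	// can become one 4-byte load from src feeding one 4-byte store to dst;
	// no byte swap is needed since a plain copy is endian-neutral.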
	var loadBase BaseAddress
	for i := int64(0); i < n; i++ {
		load := a[i].store.Args[1]
		if load.Op != OpLoad {
		if load.Type.IsPtr() {
			// Don't combine stores containing a pointer, as we need
			// a write barrier for those. This can't currently happen,
			// but might in the future if we ever have another
			// 8-byte-reg/4-byte-ptr architecture like amd64p32.
		base, idx := splitPtr(load.Args[0])
			// First one we found
		if base != loadBase || mem != loadMem {
		if idx != loadIdx+(a[i].offset-a[0].offset) {

		// Modify the first load to do a larger load instead.
		load := a[0].store.Args[1]
			load.Type = types.Types[types.TUINT16]
			load.Type = types.Types[types.TUINT32]
			load.Type = types.Types[types.TUINT64]

		// Modify root to do the store.
		for i := int64(0); i < n; i++ {
				v.Aux = load.Type // widen store type
				v.Type = types.Types[types.TBOOL] // erase memory type
	// Check that all the shift/trunc are of the same base value.
	shiftBase := getShiftBase(a)
	if shiftBase == nil {
	for i := int64(0); i < n; i++ {
		if !isShiftBase(a[i].store, shiftBase) {

	// Check for writes in little-endian or big-endian order.
	isLittleEndian := true
	shift0 := shift(a[0].store, shiftBase)
	for i := int64(1); i < n; i++ {
		if shift(a[i].store, shiftBase) != shift0+i*8 {
			isLittleEndian = false
	for i := int64(1); i < n; i++ {
		if shift(a[i].store, shiftBase) != shift0-i*8 {
	if !isLittleEndian && !isBigEndian {

	// Check to see if we need byte swap before storing.
	needSwap := isLittleEndian && root.Block.Func.Config.BigEndian ||
		isBigEndian && !root.Block.Func.Config.BigEndian
	if needSwap && (size != 1 || !root.Block.Func.Config.haveByteSwap(n)) {
	// This is the commit point.

	// Modify root to do all the stores.
	if isLittleEndian && shift0 != 0 {
		sv = rightShift(root.Block, root.Pos, sv, shift0)
	if isBigEndian && shift0-(n-1)*8 != 0 {
		sv = rightShift(root.Block, root.Pos, sv, shift0-(n-1)*8)
	if sv.Type.Size() > size*n {
		sv = truncate(root.Block, root.Pos, sv, sv.Type.Size(), size*n)
		sv = byteSwap(root.Block, root.Pos, sv)

	// Move all the stores to the root.
	for i := int64(0); i < n; i++ {
			v.Aux = sv.Type // widen store type
			v.Type = types.Types[types.TBOOL] // erase memory type
func sizeType(size int64) *types.Type {
		return types.Types[types.TUINT64]
		return types.Types[types.TUINT32]
		return types.Types[types.TUINT16]
		base.Fatalf("bad size %d\n", size)

func truncate(b *Block, pos src.XPos, v *Value, from, to int64) *Value {
	switch from*10 + to {
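	// Cases are keyed by the from/to sizes in bytes packed as a two-digit
	// number: for example, truncating 8 bytes down to 2 is case 82.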
		return b.NewValue1(pos, OpTrunc64to16, types.Types[types.TUINT16], v)
		return b.NewValue1(pos, OpTrunc64to32, types.Types[types.TUINT32], v)
		return b.NewValue1(pos, OpTrunc32to16, types.Types[types.TUINT16], v)
		base.Fatalf("bad sizes %d %d\n", from, to)

func zeroExtend(b *Block, pos src.XPos, v *Value, from, to int64) *Value {
	switch from*10 + to {
		return b.NewValue1(pos, OpZeroExt16to32, types.Types[types.TUINT32], v)
		return b.NewValue1(pos, OpZeroExt16to64, types.Types[types.TUINT64], v)
		return b.NewValue1(pos, OpZeroExt32to64, types.Types[types.TUINT64], v)
		base.Fatalf("bad sizes %d %d\n", from, to)

func leftShift(b *Block, pos src.XPos, v *Value, shift int64) *Value {
	s := b.Func.ConstInt64(types.Types[types.TUINT64], shift)
	size := v.Type.Size()
		return b.NewValue2(pos, OpLsh64x64, v.Type, v, s)
		return b.NewValue2(pos, OpLsh32x64, v.Type, v, s)
		return b.NewValue2(pos, OpLsh16x64, v.Type, v, s)
		base.Fatalf("bad size %d\n", size)
func rightShift(b *Block, pos src.XPos, v *Value, shift int64) *Value {
	s := b.Func.ConstInt64(types.Types[types.TUINT64], shift)
	size := v.Type.Size()
		return b.NewValue2(pos, OpRsh64Ux64, v.Type, v, s)
		return b.NewValue2(pos, OpRsh32Ux64, v.Type, v, s)
		return b.NewValue2(pos, OpRsh16Ux64, v.Type, v, s)
		base.Fatalf("bad size %d\n", size)

func byteSwap(b *Block, pos src.XPos, v *Value) *Value {
	switch v.Type.Size() {
		return b.NewValue1(pos, OpBswap64, v.Type, v)
		return b.NewValue1(pos, OpBswap32, v.Type, v)
		return b.NewValue1(pos, OpBswap16, v.Type, v)
	v.Fatalf("bad size %d\n", v.Type.Size())