// padding bytes to add to align code as requested.
func addpad(pc, a int64, ctxt *obj.Link, cursym *obj.LSym) int {
- // For 16 and 32 byte alignment, there is a tradeoff
- // between aligning the code and adding too many NOPs.
switch a {
- case 8:
- if pc&7 != 0 {
- return 4
- }
- case 16:
- // Align to 16 bytes if possible but add at
- // most 2 NOPs.
- switch pc & 15 {
- case 4, 12:
- return 4
- case 8:
- return 8
- }
- case 32:
- // Align to 32 bytes if possible but add at
- // most 3 NOPs.
- switch pc & 31 {
- case 4, 20:
- return 12
- case 8, 24:
- return 8
- case 12, 28:
- return 4
- }
- // When 32 byte alignment is requested on Linux,
- // promote the function's alignment to 32. On AIX
- // the function alignment is not changed which might
- // result in 16 byte alignment but that is still fine.
- // TODO: alignment on AIX
- if ctxt.Headtype != objabi.Haix && cursym.Func().Align < 32 {
- cursym.Func().Align = 32
+ case 8, 16, 32, 64:
+ // By default function alignment is 16. If an alignment > 16 is
+ // requested then the function alignment must also be promoted.
+ // The function alignment is not promoted on AIX at this time.
+ // TODO: Investigate AIX function alignment.
+ if ctxt.Headtype != objabi.Haix && cursym.Func().Align < int32(a) {
+ cursym.Func().Align = int32(a)
+ }
+ if pc&(a-1) != 0 {
+ return int(a - (pc & (a - 1)))
}
default:
ctxt.Diag("Unexpected alignment: %d for PCALIGN directive\n", a)
const invalidPCAlignSrc = `
TEXT test(SB),0,$0-0
ADD $2, R3
-PCALIGN $64
+PCALIGN $128
RET
`
PCALIGN $16
PCALIGN $8
-Functions in Go are aligned to 16 bytes, as is the case in all other compilers
-for PPC64.
+By default, functions in Go are aligned to 16 bytes, as is the case in all
+other compilers for PPC64. If there is a PCALIGN directive requesting alignment
+greater than 16, then the alignment of the containing function must be
+promoted to that same alignment or greater.
+
+In Go 1.21 the behavior of PCALIGN was changed to be more straightforward, so
+that it guarantees the alignment required by some Power10 instructions. The
+acceptable values are 8, 16, 32 and 64, and using any of those values always
+provides the specified alignment.
6. Shift instructions
BEQ Lcbc_dec
- PCALIGN $32
+ PCALIGN $16
Lcbc_enc:
P8_LXVB16X(INP, R0, INOUT)
ADD $16, INP
CLEAR_KEYS()
RET
- PCALIGN $32
+ PCALIGN $16
Lcbc_dec:
P8_LXVB16X(INP, R0, TMP)
ADD $16, INP
MOVD $32,R11 // set offsets to load into vector
MOVD $48,R12 // set offsets to load into vector
- PCALIGN $32
+ PCALIGN $16
cmp64_loop:
LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
MOVD $48, R16
ANDCC $0x3F, R5, R5 // len%64==0?
- PCALIGN $32
+ PCALIGN $16
loop64:
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
#else
MOVD $3, R17 // Number of bytes beyond 16
#endif
- PCALIGN $32
+ PCALIGN $16
index2to16loop:
MTVSRD R10, V8 // Set up shift
VSLDOI $8, V8, V8, V8
VSLO V1, V8, V1 // Shift by start byte
- PCALIGN $32
+ PCALIGN $16
index2to16next:
VAND V1, SEPMASK, V2 // Just compare size of sep
VCMPEQUBCC V0, V2, V3 // Compare sep and partial string
// gain significant performance as z_len increases (up to
// 1.45x).
- PCALIGN $32
+ PCALIGN $16
loop:
MOVD 8(R8), R11 // R11 = x[i]
MOVD 16(R8), R12 // R12 = x[i+1]
// gain significant performance as z_len increases (up to
// 1.45x).
- PCALIGN $32
+ PCALIGN $16
loop:
MOVD 8(R8), R11 // R11 = x[i]
MOVD 16(R8), R12 // R12 = x[i+1]
CMP R0, R9
MOVD R9, CTR // Set up the loop counter
BEQ tail // If R9 = 0, we can't use the loop
- PCALIGN $32
+ PCALIGN $16
loop:
MOVD 8(R8), R20 // R20 = x[i]
// we don't need to capture CA every iteration because we've already
// done that above.
- PCALIGN $32
+ PCALIGN $16
loop:
MOVD 8(R8), R20
MOVD 16(R8), R21
CMP R5, R0 // iterate from i=len(z)-1 to 0
BEQ loopexit // Already at end?
MOVD 0(R15),R10 // x[i]
- PCALIGN $32
+ PCALIGN $16
shloop:
SLD R9, R10, R10 // x[i]<<s
MOVDU -8(R15), R14
CMP R0, R14
MOVD R14, CTR // Set up the loop counter
BEQ tail // If R9 = 0, we can't use the loop
- PCALIGN $32
+ PCALIGN $16
loop:
MOVD 8(R8), R20 // R20 = x[i]
MOVD R0, R4 // R4 = c = 0
MOVD R22, CTR // Initialize loop counter
BEQ done
- PCALIGN $32
+ PCALIGN $16
loop:
MOVD (R8)(R3), R20 // Load x[i]
MOVD $128, R9 // index regs for 128 bytes
MOVD $256, R10
MOVD $384, R11
- PCALIGN $32
+ PCALIGN $16
zero512:
DCBZ (R3+R0) // clear first chunk
DCBZ (R3+R9) // clear second chunk
MOVD OCTWORDS, CTR // Number of 64 byte chunks
MOVD $32, IDX32
MOVD $48, IDX48
- PCALIGN $32
+ PCALIGN $16
forward64:
LXVD2X (R0)(SRC), VS32 // load 64 bytes
ANDCC $3,DWORDS // Compute remaining DWORDS and compare to 0
MOVD QWORDS, CTR // set up loop ctr
MOVD $16, IDX16 // 32 bytes at a time
- PCALIGN $32
+ PCALIGN $16
backward32loop:
SUB $32, TGT