1 // Copyright 2019 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Based on CRYPTOGAMS code with the following comment:
6 // # ====================================================================
7 // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
8 // # project. The module is, however, dual licensed under OpenSSL and
9 // # CRYPTOGAMS licenses depending on where you obtain it. For further
10 // # details see http://www.openssl.org/~appro/cryptogams/.
11 // # ====================================================================
13 // Original code can be found at the link below:
14 // https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91e5c39ca79126a4a876d5d8ff
16 // There are some differences between the CRYPTOGAMS code and this one. The round
17 // loop for "_int" isn't the same as the original. Some adjustments were
18 // necessary because there are fewer vector registers available. For example, some
19 // X variables (r12, r13, r14, and r15) share the same register used by the
20 // counter. The original code uses ctr to name the counter. Here we use CNT
21 // because the Go assembler uses CTR as the counter register name.
23 // +build ppc64le,!gccgo,!appengine
// ·consts is an 80-byte read-only table made of ten 8-byte slots. Its address
// is placed in CONSTS by chaCha20_ctr32_vmx, which then loads 16-byte vectors
// from it with LVX.
119 DATA ·consts+0x00(SB)/8, $0x3320646e61707865 // bytes in little-endian memory order: "expand 3" (words 0x61707865, 0x3320646e)
120 DATA ·consts+0x08(SB)/8, $0x6b20657479622d32 // bytes in little-endian memory order: "2-byte k" (words 0x79622d32, 0x6b206574)
121 DATA ·consts+0x10(SB)/8, $0x0000000000000001 // 0x10-0x1f: the value 1 followed by a zero doubleword
122 DATA ·consts+0x18(SB)/8, $0x0000000000000000
123 DATA ·consts+0x20(SB)/8, $0x0000000000000004 // 0x20-0x2f: the value 4 followed by a zero doubleword; presumably the vector loaded into FOUR - TODO confirm
124 DATA ·consts+0x28(SB)/8, $0x0000000000000000
125 DATA ·consts+0x30(SB)/8, $0x0a0b08090e0f0c0d // 0x30-0x3f: byte-index pattern; presumably the VPERM control loaded into SIXTEEN - TODO confirm
126 DATA ·consts+0x38(SB)/8, $0x0203000106070405
127 DATA ·consts+0x40(SB)/8, $0x090a0b080d0e0f0c // 0x40-0x4f: byte-index pattern; presumably the VPERM control loaded into TWENTY4 - TODO confirm
128 DATA ·consts+0x48(SB)/8, $0x0102030005060704
129 GLOBL ·consts(SB), RODATA, $80 // read-only; 80 bytes = the ten 8-byte slots above (0x00-0x4f)
131 // func chaCha20_ctr32_vmx(out, inp *byte, len int, key *[32]byte, counter *[16]byte)
132 TEXT ·chaCha20_ctr32_vmx(SB),NOSPLIT|NOFRAME,$0
133 // Load the arguments inside the registers
138 MOVD counter+32(FP), CNT
140 MOVD $·consts(SB), CONSTS // point to consts addr
163 LVX (CONSTS)(X1), FOUR
164 LVX (CONSTS)(X2), SIXTEEN
165 LVX (CONSTS)(X3), TWENTY4
167 // Align key and counter
169 VPERM DD0, K2, T0, K2
170 VPERM DD1, K3, T1, K3
172 // Load counter to GPR
178 // Adjust vectors for the initial state
183 // Synthesized constants
184 VSPLTISW $-12, TWENTY
186 VSPLTISW $-7, TWENTY5
189 VSPLTISW $-1, OUTMASK
190 LVSR (INP)(R0), INPPERM
191 LVSL (OUT)(R0), OUTPERM
192 VPERM OUTMASK, T0, OUTPERM, OUTMASK
196 MOVD $0x61707865, CON0
197 MOVD $0x3320646e, CON1
198 MOVD $0x79622d32, CON2
199 MOVD $0x6b206574, CON3
239 // CRYPTOGAMS generates this loop with a Perl macro, which cannot be
240 // expressed with assembler macros. The expanded output of that macro is
241 // therefore written out here directly to preserve the algorithm's efficiency.
242 // This loop generates three keystream blocks using VMX instructions and,
243 // in parallel, one keystream block using scalar instructions.
258 VPERM D0, D0, SIXTEEN, D0
259 VPERM D1, D1, SIXTEEN, D1
262 VPERM D2, D2, SIXTEEN, D2
294 VPERM D0, D0, TWENTY4, D0
295 VPERM D1, D1, TWENTY4, D1
298 VPERM D2, D2, TWENTY4, D2
318 VSLDOI $8, C0, C0, C0
319 VSLDOI $8, C1, C1, C1
322 VSLDOI $8, C2, C2, C2
323 VSLDOI $12, B0, B0, B0
326 VSLDOI $12, B1, B1, B1
327 VSLDOI $12, B2, B2, B2
330 VSLDOI $4, D0, D0, D0
331 VSLDOI $4, D1, D1, D1
334 VSLDOI $4, D2, D2, D2
347 VPERM D0, D0, SIXTEEN, D0
350 VPERM D1, D1, SIXTEEN, D1
351 VPERM D2, D2, SIXTEEN, D2
383 VPERM D0, D0, TWENTY4, D0
386 VPERM D1, D1, TWENTY4, D1
387 VPERM D2, D2, TWENTY4, D2
407 VSLDOI $8, C0, C0, C0
410 VSLDOI $8, C1, C1, C1
411 VSLDOI $8, C2, C2, C2
414 VSLDOI $4, B0, B0, B0
415 VSLDOI $4, B1, B1, B1
418 VSLDOI $4, B2, B2, B2
419 VSLDOI $12, D0, D0, D0
422 VSLDOI $12, D1, D1, D1
423 VSLDOI $12, D2, D2, D2
434 // Accumulate key block
435 ADD $0x61707865, X0, X0
436 ADD $0x3320646e, X1, X1
437 ADD $0x79622d32, X2, X2
438 ADD $0x6b206574, X3, X3
461 // Accumulate key block
483 // XOR the input slice (INP) with the keystream, which is stored in GPRs (X0-X3).
485 // Load input (aligned or not)
521 // Store output (aligned or not)
527 ADD $64, INP, INP // INP points to the end of the slice for the alignment code below
555 VPERM DD1, DD0, INPPERM, DD0 // Align input
556 VPERM DD2, DD1, INPPERM, DD1
557 VPERM DD3, DD2, INPPERM, DD2
558 VPERM DD4, DD3, INPPERM, DD3
559 VXOR A0, DD0, A0 // XOR with input
561 LVX (INP)(TMP0), DD1 // Keep loading input
568 MOVD $63, TMP3 // 63 is not a typo
569 VPERM A0, A0, OUTPERM, A0
570 VPERM B0, B0, OUTPERM, B0
571 VPERM C0, C0, OUTPERM, C0
572 VPERM D0, D0, OUTPERM, D0
574 VPERM DD1, DD4, INPPERM, DD4 // Align input
575 VPERM DD2, DD1, INPPERM, DD1
576 VPERM DD3, DD2, INPPERM, DD2
577 VPERM DD0, DD3, INPPERM, DD3
580 LVX (INP)(TMP0), DD1 // Keep loading
586 // Note that the LVX address is always rounded down to the nearest 16-byte
587 // boundary, and that it always points to at most 15 bytes beyond the end of
588 // the slice, so we cannot cross a page boundary.
589 LVX (INP)(TMP3), DD4 // Redundant in aligned case.
591 VPERM A1, A1, OUTPERM, A1 // Pre-misalign output
592 VPERM B1, B1, OUTPERM, B1
593 VPERM C1, C1, OUTPERM, C1
594 VPERM D1, D1, OUTPERM, D1
596 VPERM DD1, DD0, INPPERM, DD0 // Align Input
597 VPERM DD2, DD1, INPPERM, DD1
598 VPERM DD3, DD2, INPPERM, DD2
599 VPERM DD4, DD3, INPPERM, DD3
604 VPERM A2, A2, OUTPERM, A2
605 VPERM B2, B2, OUTPERM, B2
606 VPERM C2, C2, OUTPERM, C2
607 VPERM D2, D2, OUTPERM, D2
609 ANDCC $15, OUT, X1 // Is out aligned?
612 VSEL A0, B0, OUTMASK, DD0 // Collect pre-misaligned output
613 VSEL B0, C0, OUTMASK, DD1
614 VSEL C0, D0, OUTMASK, DD2
615 VSEL D0, A1, OUTMASK, DD3
616 VSEL A1, B1, OUTMASK, B0
617 VSEL B1, C1, OUTMASK, C0
618 VSEL C1, D1, OUTMASK, D0
619 VSEL D1, A2, OUTMASK, A1
620 VSEL A2, B2, OUTMASK, B1
621 VSEL B2, C2, OUTMASK, C1
622 VSEL C2, D2, OUTMASK, D1
641 SUB X1, OUT, X2 // in misaligned case edges
642 MOVD $0, X3 // are written byte-by-byte
648 BNE unaligned_tail_vmx
655 BNE unaligned_head_vmx
657 CMPU LEN, $255 // done with 256-byte block yet?
664 CMPU LEN, $255 // done with 256-byte block yet?