1 // Copyright 2018 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
6 // +build !gccgo,!appengine
// xorKeyStreamVX XORs input bytes with a generated keystream, handling four
// 64-byte blocks (256 bytes) per pass through the body below. The
// add / xor / rotate pattern with rotation amounts 16, 12, 8 and 7 has the
// shape of the ChaCha quarter round, applied to four blocks at once (one
// block per 32-bit vector lane).
//
// NOTE(review): the listing numbers embedded at the start of each line are
// not contiguous, so this view omits lines of the real file. Not visible
// here: the loads of dst/src/key/nonce into registers, any labels and loop
// control, the raw encodings behind the commented VLD*R loads, and the
// return. The comments below describe only what is visible; everything else
// is marked as an assumption to confirm.
//
// Register roles that can be read off the visible code:
//   R3  = len(src)                  R7  = counter pointer
//   R10 = address of ·constants     R11 = address of ·incRotMatrix
//   R12 = R2 + R13 ("block end")    R2  = load cursor, R1 = store cursor
//                                         (presumably src and dst — confirm)
//   V0..V15  = working state, one state word per register
//   V16..V19 = scratch for the shift-based rotates
//   V30      = lane offsets {0,1,2,3}; reused as data further down
//   V31      = byte-shuffle table for rotate-left-by-8; reused as data further down
12 // func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
13 TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
16 MOVD src_len+32(FP), R3 // R3 = len(src)
19 MOVD counter+64(FP), R7 // R7 = counter pointer
21 MOVD $·constants(SB), R10 // R10 = address of the constants table
22 MOVD $·incRotMatrix(SB), R11 // R11 = address of the increment / rotate table
// NOTE(review): R2 and R13 are not assigned anywhere in this excerpt —
// confirm their setup in the full file.
27 ADD R2, R13, R12 // R12 for block end
// V30 = first 16 bytes of ·incRotMatrix (words 0,1,2,3),
// V31 = last 16 bytes (byte indices used by VTBL below).
31 VLD1 (R11), [V30.S4, V31.S4]
// The VLD*R lines below exist only as comments in this excerpt; the
// instructions that actually perform these replicating loads are not
// visible. As written they would fill V0..V3 from (R10), V4..V11 from the
// area addressed by R4, V12 from (R7) and V13..V15 from (R6).
34 // VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]
38 // VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4]
40 // VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4]
44 // load counter + nonce
45 // VLD1R (R7), [V12.S4]
48 // VLD3R (R6), [V13.S4, V14.S4, V15.S4]
// Add 0,1,2,3 to lanes 0..3 of V12 so that each lane works on its own
// counter value.
52 VADD V30.S4, V12.S4, V12.S4
// ---- Column round ----
// V0..V3 += V4..V7
56 // V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
57 VADD V0.S4, V4.S4, V0.S4
58 VADD V1.S4, V5.S4, V1.S4
59 VADD V2.S4, V6.S4, V2.S4
60 VADD V3.S4, V7.S4, V3.S4
61 VEOR V12.B16, V0.B16, V12.B16
62 VEOR V13.B16, V1.B16, V13.B16
63 VEOR V14.B16, V2.B16, V14.B16
64 VEOR V15.B16, V3.B16, V15.B16
// NOTE(review): the rotate by 16 announced above is not visible here; the
// listing numbers jump from 64 to 69, so it is probably among the omitted
// lines — confirm against the full file.
69 // V8..V11 += V12..V15
70 // V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
71 VADD V8.S4, V12.S4, V8.S4
72 VADD V9.S4, V13.S4, V9.S4
73 VADD V10.S4, V14.S4, V10.S4
74 VADD V11.S4, V15.S4, V11.S4
// The XOR result goes to scratch V16..V19 because the rotate below needs the
// unshifted value twice.
75 VEOR V8.B16, V4.B16, V16.B16
76 VEOR V9.B16, V5.B16, V17.B16
77 VEOR V10.B16, V6.B16, V18.B16
78 VEOR V11.B16, V7.B16, V19.B16
// Rotate left by 12: VSHL writes x<<12, then VSRI inserts x>>20 into the low
// 12 bits while keeping the upper 20 bits already in the destination.
79 VSHL $12, V16.S4, V4.S4
80 VSHL $12, V17.S4, V5.S4
81 VSHL $12, V18.S4, V6.S4
82 VSHL $12, V19.S4, V7.S4
83 VSRI $20, V16.S4, V4.S4
84 VSRI $20, V17.S4, V5.S4
85 VSRI $20, V18.S4, V6.S4
86 VSRI $20, V19.S4, V7.S4
// V0..V3 += V4..V7
89 // V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
90 VADD V0.S4, V4.S4, V0.S4
91 VADD V1.S4, V5.S4, V1.S4
92 VADD V2.S4, V6.S4, V2.S4
93 VADD V3.S4, V7.S4, V3.S4
94 VEOR V12.B16, V0.B16, V12.B16
95 VEOR V13.B16, V1.B16, V13.B16
96 VEOR V14.B16, V2.B16, V14.B16
97 VEOR V15.B16, V3.B16, V15.B16
// Rotate left by 8 as a byte shuffle: V31 holds the indices 3,0,1,2 / 7,4,5,6
// / ... so every 32-bit lane has its bytes moved up by one position.
98 VTBL V31.B16, [V12.B16], V12.B16
99 VTBL V31.B16, [V13.B16], V13.B16
100 VTBL V31.B16, [V14.B16], V14.B16
101 VTBL V31.B16, [V15.B16], V15.B16
103 // V8..V11 += V12..V15
104 // V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
105 VADD V12.S4, V8.S4, V8.S4
106 VADD V13.S4, V9.S4, V9.S4
107 VADD V14.S4, V10.S4, V10.S4
108 VADD V15.S4, V11.S4, V11.S4
109 VEOR V8.B16, V4.B16, V16.B16
110 VEOR V9.B16, V5.B16, V17.B16
111 VEOR V10.B16, V6.B16, V18.B16
112 VEOR V11.B16, V7.B16, V19.B16
// Rotate left by 7: x<<7 combined with x>>25 inserted into the low 7 bits.
113 VSHL $7, V16.S4, V4.S4
114 VSHL $7, V17.S4, V5.S4
115 VSHL $7, V18.S4, V6.S4
116 VSHL $7, V19.S4, V7.S4
117 VSRI $25, V16.S4, V4.S4
118 VSRI $25, V17.S4, V5.S4
119 VSRI $25, V18.S4, V6.S4
120 VSRI $25, V19.S4, V7.S4
// ---- Diagonal round ----
// Same four steps as above, but the register pairing is rotated instead of
// shuffling data: (V0,V5,V10,V15), (V1,V6,V11,V12), (V2,V7,V8,V13),
// (V3,V4,V9,V14).
122 // V0..V3 += V5..V7, V4
123 // V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
124 VADD V0.S4, V5.S4, V0.S4
125 VADD V1.S4, V6.S4, V1.S4
126 VADD V2.S4, V7.S4, V2.S4
127 VADD V3.S4, V4.S4, V3.S4
128 VEOR V15.B16, V0.B16, V15.B16
129 VEOR V12.B16, V1.B16, V12.B16
130 VEOR V13.B16, V2.B16, V13.B16
131 VEOR V14.B16, V3.B16, V14.B16
// Rotate by 16: swapping the two 16-bit halves of every 32-bit word.
132 VREV32 V12.H8, V12.H8
133 VREV32 V13.H8, V13.H8
134 VREV32 V14.H8, V14.H8
135 VREV32 V15.H8, V15.H8
137 // V10 += V15; V5 <<<= ((V10 XOR V5), 12)
// ...and likewise V11 += V12 / V6, V8 += V13 / V7, V9 += V14 / V4.
139 VADD V15.S4, V10.S4, V10.S4
140 VADD V12.S4, V11.S4, V11.S4
141 VADD V13.S4, V8.S4, V8.S4
142 VADD V14.S4, V9.S4, V9.S4
143 VEOR V10.B16, V5.B16, V16.B16
144 VEOR V11.B16, V6.B16, V17.B16
145 VEOR V8.B16, V7.B16, V18.B16
146 VEOR V9.B16, V4.B16, V19.B16
147 VSHL $12, V16.S4, V5.S4
148 VSHL $12, V17.S4, V6.S4
149 VSHL $12, V18.S4, V7.S4
150 VSHL $12, V19.S4, V4.S4
151 VSRI $20, V16.S4, V5.S4
152 VSRI $20, V17.S4, V6.S4
153 VSRI $20, V18.S4, V7.S4
154 VSRI $20, V19.S4, V4.S4
156 // V0 += V5; V15 <<<= ((V0 XOR V15), 8)
// ...and likewise V1 += V6 / V12, V2 += V7 / V13, V3 += V4 / V14.
158 VADD V5.S4, V0.S4, V0.S4
159 VADD V6.S4, V1.S4, V1.S4
160 VADD V7.S4, V2.S4, V2.S4
161 VADD V4.S4, V3.S4, V3.S4
162 VEOR V0.B16, V15.B16, V15.B16
163 VEOR V1.B16, V12.B16, V12.B16
164 VEOR V2.B16, V13.B16, V13.B16
165 VEOR V3.B16, V14.B16, V14.B16
166 VTBL V31.B16, [V12.B16], V12.B16
167 VTBL V31.B16, [V13.B16], V13.B16
168 VTBL V31.B16, [V14.B16], V14.B16
169 VTBL V31.B16, [V15.B16], V15.B16
171 // V10 += V15; V5 <<<= ((V10 XOR V5), 7)
// ...and likewise V11 += V12 / V6, V8 += V13 / V7, V9 += V14 / V4.
173 VADD V15.S4, V10.S4, V10.S4
174 VADD V12.S4, V11.S4, V11.S4
175 VADD V13.S4, V8.S4, V8.S4
176 VADD V14.S4, V9.S4, V9.S4
177 VEOR V10.B16, V5.B16, V16.B16
178 VEOR V11.B16, V6.B16, V17.B16
179 VEOR V8.B16, V7.B16, V18.B16
180 VEOR V9.B16, V4.B16, V19.B16
181 VSHL $7, V16.S4, V5.S4
182 VSHL $7, V17.S4, V6.S4
183 VSHL $7, V18.S4, V7.S4
184 VSHL $7, V19.S4, V4.S4
185 VSRI $25, V16.S4, V5.S4
186 VSRI $25, V17.S4, V6.S4
187 VSRI $25, V18.S4, V7.S4
188 VSRI $25, V19.S4, V4.S4
// NOTE(review): no round counter or backward branch is visible in this
// excerpt — presumably the double round above is repeated; confirm.
//
// ---- Feed-forward: add the input words back into the state ----
// As before, the VLD*R lines are comments only here; per those comments
// V16..V19 are reloaded from (R10), V20..V27 from the area addressed by R4,
// V28 from (R7) and V29..V31 from (R6).
193 // VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4]
196 // VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4]
// Re-apply the per-lane offsets {0,1,2,3} to V12 now, while V30 still holds
// them; V30 and V31 are reused as data registers further down.
198 VADD V30.S4, V12.S4, V12.S4
199 VADD V16.S4, V0.S4, V0.S4
200 VADD V17.S4, V1.S4, V1.S4
201 VADD V18.S4, V2.S4, V2.S4
202 VADD V19.S4, V3.S4, V3.S4
203 // VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4]
208 // load counter + nonce
209 // VLD1R (R7), [V28.S4]
211 // VLD3R (R6), [V29.S4, V30.S4, V31.S4]
214 VADD V20.S4, V4.S4, V4.S4
215 VADD V21.S4, V5.S4, V5.S4
216 VADD V22.S4, V6.S4, V6.S4
217 VADD V23.S4, V7.S4, V7.S4
218 VADD V24.S4, V8.S4, V8.S4
219 VADD V25.S4, V9.S4, V9.S4
220 VADD V26.S4, V10.S4, V10.S4
221 VADD V27.S4, V11.S4, V11.S4
222 VADD V28.S4, V12.S4, V12.S4
223 VADD V29.S4, V13.S4, V13.S4
224 VADD V30.S4, V14.S4, V14.S4
225 VADD V31.S4, V15.S4, V15.S4
// ---- Transpose ----
// Up to here register Vi holds state word i for all four lanes. The two
// zip stages (32-bit, then 64-bit) form a 4x4 word transpose per register
// group, so that afterwards V0..V3 hold the 16 words of lane 0, V4..V7 lane
// 1, V8..V11 lane 2 and V12..V15 lane 3. V16..V31 are temporaries.
227 VZIP1 V1.S4, V0.S4, V16.S4
228 VZIP2 V1.S4, V0.S4, V17.S4
229 VZIP1 V3.S4, V2.S4, V18.S4
230 VZIP2 V3.S4, V2.S4, V19.S4
231 VZIP1 V5.S4, V4.S4, V20.S4
232 VZIP2 V5.S4, V4.S4, V21.S4
233 VZIP1 V7.S4, V6.S4, V22.S4
234 VZIP2 V7.S4, V6.S4, V23.S4
235 VZIP1 V9.S4, V8.S4, V24.S4
236 VZIP2 V9.S4, V8.S4, V25.S4
237 VZIP1 V11.S4, V10.S4, V26.S4
238 VZIP2 V11.S4, V10.S4, V27.S4
239 VZIP1 V13.S4, V12.S4, V28.S4
240 VZIP2 V13.S4, V12.S4, V29.S4
241 VZIP1 V15.S4, V14.S4, V30.S4
242 VZIP2 V15.S4, V14.S4, V31.S4
// Second zip stage. Each group of temporaries is refilled with 64 input
// bytes from (R2) as soon as it has been consumed; every VLD1.P advances R2
// by 64.
243 VZIP1 V18.D2, V16.D2, V0.D2
244 VZIP2 V18.D2, V16.D2, V4.D2
245 VZIP1 V19.D2, V17.D2, V8.D2
246 VZIP2 V19.D2, V17.D2, V12.D2
247 VLD1.P 64(R2), [V16.B16, V17.B16, V18.B16, V19.B16]
249 VZIP1 V22.D2, V20.D2, V1.D2
250 VZIP2 V22.D2, V20.D2, V5.D2
251 VZIP1 V23.D2, V21.D2, V9.D2
252 VZIP2 V23.D2, V21.D2, V13.D2
253 VLD1.P 64(R2), [V20.B16, V21.B16, V22.B16, V23.B16]
254 VZIP1 V26.D2, V24.D2, V2.D2
255 VZIP2 V26.D2, V24.D2, V6.D2
256 VZIP1 V27.D2, V25.D2, V10.D2
257 VZIP2 V27.D2, V25.D2, V14.D2
258 VLD1.P 64(R2), [V24.B16, V25.B16, V26.B16, V27.B16]
259 VZIP1 V30.D2, V28.D2, V3.D2
260 VZIP2 V30.D2, V28.D2, V7.D2
261 VZIP1 V31.D2, V29.D2, V11.D2
262 VZIP2 V31.D2, V29.D2, V15.D2
263 VLD1.P 64(R2), [V28.B16, V29.B16, V30.B16, V31.B16]
// ---- XOR and store ----
// input (V16..V31) ^= keystream (V0..V15), written out 64 bytes at a time
// through R1, which is advanced by 64 after each store.
264 VEOR V0.B16, V16.B16, V16.B16
265 VEOR V1.B16, V17.B16, V17.B16
266 VEOR V2.B16, V18.B16, V18.B16
267 VEOR V3.B16, V19.B16, V19.B16
268 VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R1)
269 VEOR V4.B16, V20.B16, V20.B16
270 VEOR V5.B16, V21.B16, V21.B16
271 VEOR V6.B16, V22.B16, V22.B16
272 VEOR V7.B16, V23.B16, V23.B16
273 VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R1)
274 VEOR V8.B16, V24.B16, V24.B16
275 VEOR V9.B16, V25.B16, V25.B16
276 VEOR V10.B16, V26.B16, V26.B16
277 VEOR V11.B16, V27.B16, V27.B16
278 VST1.P [V24.B16, V25.B16, V26.B16, V27.B16], 64(R1)
279 VEOR V12.B16, V28.B16, V28.B16
280 VEOR V13.B16, V29.B16, V29.B16
281 VEOR V14.B16, V30.B16, V30.B16
282 VEOR V15.B16, V31.B16, V31.B16
283 VST1.P [V28.B16, V29.B16, V30.B16, V31.B16], 64(R1)
// Write the 32-bit value in R20 back through the counter pointer.
// NOTE(review): R20 is not assigned anywhere in this excerpt, and no loop
// branch or RET is visible after this store — confirm in the full file.
286 MOVW R20, (R7) // update counter
// ·constants holds four 32-bit words whose little-endian byte sequence is
// the ASCII string "expand 32-byte k" ("expa", "nd 3", "2-by", "te k").
// xorKeyStreamVX takes this symbol's address in R10.
// NOTE(review): GLOBL reserves 32 bytes, but the four DATA directives
// initialize only the first 16 — confirm whether the larger size is
// intentional.
294 DATA ·constants+0x00(SB)/4, $0x61707865
295 DATA ·constants+0x04(SB)/4, $0x3320646e
296 DATA ·constants+0x08(SB)/4, $0x79622d32
297 DATA ·constants+0x0c(SB)/4, $0x6b206574
298 GLOBL ·constants(SB), NOPTR|RODATA, $32
// ·incRotMatrix is a 32-byte read-only table that xorKeyStreamVX loads into
// V30 and V31 with a single VLD1.
//
// Bytes 0x00..0x0f (-> V30): the words 0, 1, 2, 3. These are added to the
// four lanes of the counter register so that each lane gets its own offset.
300 DATA ·incRotMatrix+0x00(SB)/4, $0x00000000
301 DATA ·incRotMatrix+0x04(SB)/4, $0x00000001
302 DATA ·incRotMatrix+0x08(SB)/4, $0x00000002
303 DATA ·incRotMatrix+0x0c(SB)/4, $0x00000003
// Bytes 0x10..0x1f (-> V31): byte indices for VTBL. Stored little-endian the
// index sequence is 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14: within every
// 32-bit word each byte moves up one position and the top byte wraps to the
// bottom, which is a rotate-left by 8 bits.
304 DATA ·incRotMatrix+0x10(SB)/4, $0x02010003
305 DATA ·incRotMatrix+0x14(SB)/4, $0x06050407
306 DATA ·incRotMatrix+0x18(SB)/4, $0x0A09080B
307 DATA ·incRotMatrix+0x1c(SB)/4, $0x0E0D0C0F
308 GLOBL ·incRotMatrix(SB), NOPTR|RODATA, $32