xref: /linux/arch/arm/crypto/chacha-scalar-core.S (revision e5a52fd2b8cdb700b3c07b030e050a49ef3156b9)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 Google, Inc.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Design notes:
 *
 * 16 registers would be needed to hold the state matrix, but only 14 are
 * available because 'sp' and 'pc' cannot be used.  So we spill the elements
 * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
 * 'ldrd' and one 'strd' instruction per round.
 *
 * All rotates are performed using the implicit rotate operand accepted by the
 * 'add' and 'eor' instructions.  This is faster than using explicit rotate
 * instructions.  To make this work, we allow the values in the second and last
 * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
 * wrong rotation amount.  The rotation amount is then fixed up just in time
 * when the values are used.  'brot' is the number of bits the values in row 'b'
 * need to be rotated right to arrive at the correct values, and 'drot'
 * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
 * that they end up as (25, 24) after every round.
 */
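
/*
 * For orientation, the state matrix referred to above uses the usual ChaCha
 * layout (row 'a' = x0-x3, 'b' = x4-x7, 'c' = x8-x11, 'd' = x12-x15):
 *
 *	 x0  x1  x2  x3
 *	 x4  x5  x6  x7
 *	 x8  x9 x10 x11
 *	x12 x13 x14 x15
 *
 * A minimal sketch of the deferred-rotation trick: instead of computing the
 * reference step "d ^= a; d = rol(d, 16);" with an explicit rotate, the 'eor'
 * folds the outstanding rotation of the old 'd' into its shifted operand,
 *
 *	eor	d, a, d, ror #drot
 *
 * and the result is left under-rotated by 16; that deficit is recorded in
 * 'drot' and applied by the 'ror' operand of the next instruction that reads
 * 'd' (see _halfround below).
 */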

	// ChaCha state registers
	X0	.req	r0
	X1	.req	r1
	X2	.req	r2
	X3	.req	r3
	X4	.req	r4
	X5	.req	r5
	X6	.req	r6
	X7	.req	r7
	X8_X10	.req	r8	// shared by x8 and x10
	X9_X11	.req	r9	// shared by x9 and x11
	X12	.req	r10
	X13	.req	r11
	X14	.req	r12
	X15	.req	r14

.macro __rev		out, in,  t0, t1, t2
.if __LINUX_ARM_ARCH__ >= 6
	rev		\out, \in
.else
	lsl		\t0, \in, #24
	and		\t1, \in, #0xff00
	and		\t2, \in, #0xff0000
	orr		\out, \t0, \in, lsr #24
	orr		\out, \out, \t1, lsl #8
	orr		\out, \out, \t2, lsr #8
.endif
.endm
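
/*
 * Rough C equivalent of the pre-ARMv6 fallback above (plain shifts and
 * masks; nothing beyond what the instructions already do):
 *
 *	out = (in << 24) | (in >> 24) |
 *	      ((in & 0xff00) << 8) | ((in & 0xff0000) >> 8);
 */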

.macro _le32_bswap	x,  t0, t1, t2
#ifdef __ARMEB__
	__rev		\x, \x,  \t0, \t1, \t2
#endif
.endm

.macro _le32_bswap_4x	a, b, c, d,  t0, t1, t2
	_le32_bswap	\a,  \t0, \t1, \t2
	_le32_bswap	\b,  \t0, \t1, \t2
	_le32_bswap	\c,  \t0, \t1, \t2
	_le32_bswap	\d,  \t0, \t1, \t2
.endm

.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd		\a, \b, [\src, #\offset]
#else
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#endif
.endm

.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd		\a, \b, [\dst, #\offset]
#else
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#endif
.endm

.macro _halfround	a1, b1, c1, d1,  a2, b2, c2, d2

	// a += b; d ^= a; d = rol(d, 16);
	add		\a1, \a1, \b1, ror #brot
	add		\a2, \a2, \b2, ror #brot
	eor		\d1, \a1, \d1, ror #drot
	eor		\d2, \a2, \d2, ror #drot
	// drot == 32 - 16 == 16

	// c += d; b ^= c; b = rol(b, 12);
	add		\c1, \c1, \d1, ror #16
	add		\c2, \c2, \d2, ror #16
	eor		\b1, \c1, \b1, ror #brot
	eor		\b2, \c2, \b2, ror #brot
	// brot == 32 - 12 == 20

	// a += b; d ^= a; d = rol(d, 8);
	add		\a1, \a1, \b1, ror #20
	add		\a2, \a2, \b2, ror #20
	eor		\d1, \a1, \d1, ror #16
	eor		\d2, \a2, \d2, ror #16
	// drot == 32 - 8 == 24

	// c += d; b ^= c; b = rol(b, 7);
	add		\c1, \c1, \d1, ror #24
	add		\c2, \c2, \d2, ror #24
	eor		\b1, \c1, \b1, ror #20
	eor		\b2, \c2, \b2, ror #20
	// brot == 32 - 7 == 25
.endm
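
/*
 * For comparison, one ChaCha quarterround written out in plain C, in the same
 * pseudocode as the comments above (rol() being a 32-bit rotate left);
 * _halfround interleaves two of these and folds each rol() into the 'ror'
 * operand of a later 'add'/'eor' instead of rotating explicitly:
 *
 *	a += b;  d ^= a;  d = rol(d, 16);
 *	c += d;  b ^= c;  b = rol(b, 12);
 *	a += b;  d ^= a;  d = rol(d,  8);
 *	c += d;  b ^= c;  b = rol(b,  7);
 */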

.macro _doubleround

	// column round

	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
	_halfround	X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13

	// save (x8, x9); restore (x10, x11)
	__strd		X8_X10, X9_X11, sp, 0
	__ldrd		X8_X10, X9_X11, sp, 8

	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
	_halfround	X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15

	.set brot, 25
	.set drot, 24

	// diagonal round

	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
	_halfround	X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12

	// save (x10, x11); restore (x8, x9)
	__strd		X8_X10, X9_X11, sp, 8
	__ldrd		X8_X10, X9_X11, sp, 0

	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
	_halfround	X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
.endm
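
/*
 * Sketch of the same double round in C pseudocode, QR() being the
 * quarterround shown after _halfround above:
 *
 *	QR(x0, x4,  x8, x12);  QR(x1, x5,  x9, x13);	// column round
 *	QR(x2, x6, x10, x14);  QR(x3, x7, x11, x15);
 *	QR(x0, x5, x10, x15);  QR(x1, x6, x11, x12);	// diagonal round
 *	QR(x2, x7,  x8, x13);  QR(x3, x4,  x9, x14);
 */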

.macro _chacha_permute	nrounds
	.set brot, 0
	.set drot, 0
	.rept \nrounds / 2
	 _doubleround
	.endr
.endm

.macro _chacha		nrounds

.Lnext_block\@:
	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.

	// Do the core ChaCha permutation to update x0-x15.
	_chacha_permute	\nrounds

	add		sp, #8
	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
	push		{X8_X10, X9_X11, X12, X13, X14, X15}

	// Load (OUT, IN, LEN).
	ldr		r14, [sp, #96]
	ldr		r12, [sp, #100]
	ldr		r11, [sp, #104]

	orr		r10, r14, r12

	// Use slow path if fewer than 64 bytes remain.
	cmp		r11, #64
	blt		.Lxor_slowpath\@

	// Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
	// ARMv6+, since ldmia and stmia (used below) still require alignment.
	tst		r10, #3
	bne		.Lxor_slowpath\@

	// Fast path: XOR 64 bytes of aligned data.

	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// x0-x3
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8, r9, r10
	ldmia		r12!, {r8-r11}
	eor		X0, X0, r8
	eor		X1, X1, r9
	eor		X2, X2, r10
	eor		X3, X3, r11
	stmia		r14!, {X0-X3}

	// x4-x7
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	ldmia		r12!, {X0-X3}
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8, r9, r10
	eor		X4, X4, X0
	eor		X5, X5, X1
	eor		X6, X6, X2
	eor		X7, X7, X3
	stmia		r14!, {X4-X7}

	// x8-x15
	pop		{r0-r7}			// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8, r9, r10
	ldmia		r12!, {r8-r11}
	eor		r0, r0, r8		// x8
	eor		r1, r1, r9		// x9
	eor		r6, r6, r10		// x10
	eor		r7, r7, r11		// x11
	stmia		r14!, {r0,r1,r6,r7}
	ldmia		r12!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9, r10, r11
	  ldr		r9, [sp, #72]		// load LEN
	eor		r2, r2, r0		// x12
	eor		r3, r3, r1		// x13
	eor		r4, r4, r6		// x14
	eor		r5, r5, r7		// x15
	  subs		r9, #64			// decrement and check LEN
	stmia		r14!, {r2-r5}

	beq		.Ldone\@

.Lprepare_for_next_block\@:

	// Stack: x0-x15 OUT IN LEN

	// Increment block counter (x12)
	add		r8, #1

	// Store updated (OUT, IN, LEN)
	str		r14, [sp, #64]
	str		r12, [sp, #68]
	str		r9, [sp, #72]

	  mov		r14, sp

	// Store updated block counter (x12)
	str		r8, [sp, #48]

	  sub		sp, #16

	// Reload state and do next block
	ldmia		r14!, {r0-r11}		// load x0-x11
	__strd		r10, r11, sp, 8		// store x10-x11 before state
	ldmia		r14, {r10-r12,r14}	// load x12-x15
	b		.Lnext_block\@

.Lxor_slowpath\@:
	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
	// We handle it by storing the 64 bytes of keystream to the stack, then
	// XOR-ing the needed portion with the data.

	// Allocate keystream buffer
	sub		sp, #64
	mov		r14, sp

	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Save keystream for x0-x3
	__ldrd		r8, r9, sp, 96
	__ldrd		r10, r11, sp, 104
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8, r9, r10
	stmia		r14!, {X0-X3}

	// Save keystream for x4-x7
	__ldrd		r8, r9, sp, 112
	__ldrd		r10, r11, sp, 120
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8, r9, r10
	  add		r8, sp, #64
	stmia		r14!, {X4-X7}

	// Save keystream for x8-x15
	ldm		r8, {r0-r7}		// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 128
	__ldrd		r10, r11, sp, 136
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8, r9, r10
	stmia		r14!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 144
	__ldrd		r10, r11, sp, 152
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9, r10, r11
	stmia		r14, {r2-r5}

	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
	// Registers: r8 is block counter, r12 is IN.

	ldr		r9, [sp, #168]		// LEN
	ldr		r14, [sp, #160]		// OUT
	cmp		r9, #64
	  mov		r0, sp
	movle		r1, r9
	movgt		r1, #64
	// r1 is number of bytes to XOR, in range [1, 64]

.if __LINUX_ARM_ARCH__ < 6
	orr		r2, r12, r14
	tst		r2, #3			// IN or OUT misaligned?
	bne		.Lxor_next_byte\@
.endif

	// XOR a word at a time
.rept 16
	subs		r1, #4
	blt		.Lxor_words_done\@
	ldr		r2, [r12], #4
	ldr		r3, [r0], #4
	eor		r2, r2, r3
	str		r2, [r14], #4
.endr
	b		.Lxor_slowpath_done\@
.Lxor_words_done\@:
	ands		r1, r1, #3
	beq		.Lxor_slowpath_done\@

	// XOR a byte at a time
.Lxor_next_byte\@:
	ldrb		r2, [r12], #1
	ldrb		r3, [r0], #1
	eor		r2, r2, r3
	strb		r2, [r14], #1
	subs		r1, #1
	bne		.Lxor_next_byte\@

.Lxor_slowpath_done\@:
	subs		r9, #64
	add		sp, #96
	bgt		.Lprepare_for_next_block\@

.Ldone\@:
.endm	// _chacha

/*
 * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
 *		     const u32 *state, int nrounds);
 */
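/*
 * Rough caller-side sketch (C), assuming 'state' already holds the usual
 * ChaCha layout of constants, key, block counter and nonce; the buffer names
 * and sizes below are illustrative only:
 *
 *	u32 state[16];				// set up by the caller
 *	u8 dst[256], src[256];
 *
 *	chacha_doarm(dst, src, sizeof(src), state, 20);	// or nrounds == 12
 */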
ENTRY(chacha_doarm)
	cmp		r2, #0			// len == 0?
	reteq		lr

	ldr		ip, [sp]
	cmp		ip, #12

	push		{r0-r2,r4-r11,lr}

	// Push state x0-x15 onto stack.
	// Also store an extra copy of x10-x11 just before the state.

	add		X12, r3, #48
	ldm		X12, {X12,X13,X14,X15}
	push		{X12,X13,X14,X15}
	sub		sp, sp, #64

	__ldrd		X8_X10, X9_X11, r3, 40
	__strd		X8_X10, X9_X11, sp, 8
	__strd		X8_X10, X9_X11, sp, 56
	ldm		r3, {X0-X9_X11}
	__strd		X0, X1, sp, 16
	__strd		X2, X3, sp, 24
	__strd		X4, X5, sp, 32
	__strd		X6, X7, sp, 40
	__strd		X8_X10, X9_X11, sp, 48

	beq		1f
	_chacha		20

0:	add		sp, #76
	pop		{r4-r11, pc}

1:	_chacha		12
	b		0b
ENDPROC(chacha_doarm)

/*
 * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
 */
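/*
 * Usage sketch: the nrounds-round permutation is applied to the full 16-word
 * state, and words 0-3 and 12-15 of the permuted state are written to 'out'
 * without the feed-forward addition of the original state (which is what
 * makes this HChaCha rather than a normal block), e.g.:
 *
 *	u32 state[16], out[8];			// illustrative names
 *	hchacha_block_arm(state, out, 20);
 */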
ENTRY(hchacha_block_arm)
	push		{r1,r4-r11,lr}

	cmp		r2, #12			// ChaCha12 ?

	mov		r14, r0
	ldmia		r14!, {r0-r11}		// load x0-x11
	push		{r10-r11}		// store x10-x11 to stack
	ldm		r14, {r10-r12,r14}	// load x12-x15
	sub		sp, #8

	beq		1f
	_chacha_permute	20

	// Skip over (unused0-unused1, x10-x11)
0:	add		sp, #16

	// Fix up rotations of x12-x15
	ror		X12, X12, #drot
	ror		X13, X13, #drot
	  pop		{r4}			// load 'out'
	ror		X14, X14, #drot
	ror		X15, X15, #drot

	// Store (x0-x3,x12-x15) to 'out'
	stm		r4, {X0,X1,X2,X3,X12,X13,X14,X15}

	pop		{r4-r11,pc}

1:	_chacha_permute	12
	b		0b
ENDPROC(hchacha_block_arm)