xref: /linux/arch/powerpc/lib/checksum_32.S (revision 8dd765a5d769c521d73931850d1c8708fbc490cb)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * This file contains assembly-language implementations
4 * of IP-style 1's complement checksum routines.
5 *
6 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
7 *
8 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
9 */
10
11#include <linux/export.h>
12#include <linux/sys.h>
13#include <asm/processor.h>
14#include <asm/cache.h>
15#include <asm/errno.h>
16#include <asm/ppc_asm.h>
17
18	.text
19
20/*
21 * computes the checksum of a memory block at buff, length len,
22 * and adds in "sum" (32-bit)
23 *
24 * __csum_partial(buff, len, sum)
25 */
26_GLOBAL(__csum_partial)
/*
 * In:   r3 = buff, r4 = len in bytes, r5 = sum
 * Out:  r3 = sum accumulated over the block, final carry folded in
 * Uses: r0, r6, r7, r8 (scratch), CTR (loop counts), CR0, and the carry
 *       bit CA, which links every adde below into one running sum.
 *
 * r3 is biased by -4 up front so that every load uses a +4 displacement
 * and lwzu can step the pointer as it loads.
 */
27	subi	r3,r3,4
28	srawi.	r6,r4,2		/* Divide len by 4 and also clear carry */
29	beq	3f		/* if we're doing < 4 bytes */
30	andi.	r0,r3,2		/* Align buffer to longword boundary */
31	beq+	1f
32	lhz	r0,4(r3)	/* do 2 bytes to get aligned */
33	subi	r4,r4,2
34	addi	r3,r3,2
35	srwi.	r6,r4,2		/* # words to do */
36	adde	r5,r5,r0
37	beq	3f		/* CR0 is from the srwi. above: no whole words left */
381:	andi.	r6,r6,3		/* Prepare to handle words 4 by 4 */
39	beq	21f		/* word count is already a multiple of 4 */
40	mtctr	r6
412:	lwzu	r0,4(r3)	/* sum the leading (word count mod 4) words singly */
42	adde	r5,r5,r0
43	bdnz	2b
4421:	srwi.	r6,r4,4		/* # blocks of 4 words to do */
45	beq	3f
/*
 * Four words per iteration.  The first group's loads are issued here,
 * ahead of the loop; the add of r8 (last word of a group) is deferred to
 * the top of the next iteration at 22:, or to 23: after the final group.
 */
46	lwz	r0,4(r3)
47	mtctr	r6
48	lwz	r6,8(r3)
49	adde	r5,r5,r0
50	lwz	r7,12(r3)
51	adde	r5,r5,r6
52	lwzu	r8,16(r3)	/* update form: r3 advances by 16 */
53	adde	r5,r5,r7
54	bdz	23f		/* only one group: just add the pending r8 */
5522:	lwz	r0,4(r3)
56	adde	r5,r5,r8	/* r8 left pending by the previous group */
57	lwz	r6,8(r3)
58	adde	r5,r5,r0
59	lwz	r7,12(r3)
60	adde	r5,r5,r6
61	lwzu	r8,16(r3)
62	adde	r5,r5,r7
63	bdnz	22b
6423:	adde	r5,r5,r8	/* last word of the final group */
653:	andi.	r0,r4,2		/* trailing halfword? */
66	beq+	4f
67	lhz	r0,4(r3)
68	addi	r3,r3,2
69	adde	r5,r5,r0
704:	andi.	r0,r4,1		/* trailing single byte? */
71	beq+	5f
72	lbz	r0,4(r3)
73	slwi	r0,r0,8		/* Upper byte of word */
74	adde	r5,r5,r0
755:	addze	r3,r5		/* add in final carry */
76	blr
77EXPORT_SYMBOL(__csum_partial)
78
79/*
80 * Computes the checksum of a memory block at src, length len,
81 * and adds in 0xffffffff, while copying the block to dst.
82 * If an access exception occurs it returns zero.
83 *
84 * csum_partial_copy_generic(src, dst, len)
85 */
/*
 * CSUM_COPY_16_BYTES_WITHEX(n): copy one 16-byte group and add it to the
 * running sum.
 *
 * Loads four words from 4..16(r4) into r7-r10 and stores them to
 * 4..16(r6); the last load and the last store use the update forms, so
 * r4 and r6 each advance by 16.  Every word is added into r12 with adde,
 * so the carry chain of the code using the macro runs straight through it.
 *
 * Each load and store carries a numeric label 8<n>0 .. 8<n>7; these are
 * the labels that CSUM_COPY_16_BYTES_EXCODE(n) lists in the exception
 * table.
 */
86#define CSUM_COPY_16_BYTES_WITHEX(n)	\
878 ## n ## 0:			\
88	lwz	r7,4(r4);	\
898 ## n ## 1:			\
90	lwz	r8,8(r4);	\
918 ## n ## 2:			\
92	lwz	r9,12(r4);	\
938 ## n ## 3:			\
94	lwzu	r10,16(r4);	\
958 ## n ## 4:			\
96	stw	r7,4(r6);	\
97	adde	r12,r12,r7;	\
988 ## n ## 5:			\
99	stw	r8,8(r6);	\
100	adde	r12,r12,r8;	\
1018 ## n ## 6:			\
102	stw	r9,12(r6);	\
103	adde	r12,r12,r9;	\
1048 ## n ## 7:			\
105	stwu	r10,16(r6);	\
106	adde	r12,r12,r10
107
/*
 * CSUM_COPY_16_BYTES_EXCODE(n): exception-table entries for group n.
 *
 * Emits one EX_TABLE entry for each of the eight labels 8<n>0 .. 8<n>7
 * placed by CSUM_COPY_16_BYTES_WITHEX(n) (four loads, then four stores),
 * all naming the same handler, `fault`.  EX_TABLE itself is not defined
 * in this file.
 */
108#define CSUM_COPY_16_BYTES_EXCODE(n)		\
109	EX_TABLE(8 ## n ## 0b, fault);	\
110	EX_TABLE(8 ## n ## 1b, fault);	\
111	EX_TABLE(8 ## n ## 2b, fault);	\
112	EX_TABLE(8 ## n ## 3b, fault);	\
113	EX_TABLE(8 ## n ## 4b, fault);	\
114	EX_TABLE(8 ## n ## 5b, fault);	\
115	EX_TABLE(8 ## n ## 6b, fault);	\
116	EX_TABLE(8 ## n ## 7b, fault);
117
118	.text
119
/*
 * Cacheline geometry used by csum_partial_copy_generic.  L1_CACHE_BYTES
 * and L1_CACHE_SHIFT are not defined in this file.
 */
120CACHELINE_BYTES = L1_CACHE_BYTES		/* bytes per cacheline */
121LG_CACHELINE_BYTES = L1_CACHE_SHIFT		/* shift count used to divide by the line size */
122CACHELINE_MASK = (L1_CACHE_BYTES-1)		/* mask for the offset within a line */
123
124_GLOBAL(csum_partial_copy_generic)
/*
 * In:   r3 = src, r4 = dst, r5 = len in bytes
 * Out:  r3 = checksum of the copied data, seeded with 0xffffffff;
 *       the fault: path returns 0 instead.
 *
 * Working registers:
 *   r4   src - 4 (loads use a +4 displacement)
 *   r6   dst - 4 (stores use a +4 displacement)
 *   r5   bytes still to copy
 *   r12  running sum; carries propagate through CA from adde to adde
 *   r0, r3, r7-r11  scratch; CTR holds loop counts
 *   cr7  eq set = return the sum without the final one-byte rotation
 *
 * Phases: bytes, then words, up to a destination cacheline boundary;
 * whole cachelines (dcbz plus unrolled 16-byte groups); then the leftover
 * words, halfword and byte.
 */
125	li	r12,-1			/* initial sum = 0xffffffff */
126	addic	r0,r0,0			/* clear carry */
127	addi	r6,r4,-4		/* r6 = dst - 4 */
128	neg	r0,r4
129	addi	r4,r3,-4		/* r4 = src - 4 */
130	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
131	crset	4*cr7+eq		/* default: no rotation at exit */
132	beq	58f			/* dst already on a cacheline boundary */
133
134	cmplw	0,r5,r0			/* is this more than total to do? */
135	blt	63f			/* if not much to do */
136	rlwinm	r7,r6,3,0x8		/* r7 = 8 if bit 0 of r6 is set, else 0 */
137	rlwnm	r12,r12,r7,0,31	/* odd destination address: rotate one byte */
138	cmplwi	cr7,r7,0	/* is destination address even ? */
139	andi.	r8,r0,3			/* get it word-aligned first */
140	mtctr	r8
141	beq+	61f
142	li	r3,0			/* r3 collects the bytes copied below */
14370:	lbz	r9,4(r4)		/* do some bytes */
144	addi	r4,r4,1
145	slwi	r3,r3,8			/* make room: newest byte goes in the low 8 bits */
146	rlwimi	r3,r9,0,24,31
14771:	stb	r9,4(r6)
148	addi	r6,r6,1
149	bdnz	70b
150	adde	r12,r12,r3		/* add the collected bytes as one value */
15161:	subf	r5,r0,r5		/* len -= bytes up to the cacheline boundary */
152	srwi.	r0,r0,2			/* # whole words up to the boundary */
153	mtctr	r0
154	beq	58f
15572:	lwzu	r9,4(r4)		/* do some words */
156	adde	r12,r12,r9
15773:	stwu	r9,4(r6)
158	bdnz	72b
159
16058:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
161	clrlwi	r5,r5,32-LG_CACHELINE_BYTES	/* r5 = bytes left after the full lines */
162	li	r11,4			/* dcbz offset: cancels the -4 bias in r6 */
163	beq	63f			/* CR0 from the srwi.: no complete cacheline */
164
165	/* Here we decide how far ahead to prefetch the source */
166	li	r3,4			/* dcbt offset: cancels the -4 bias in r4 */
167	cmpwi	r0,1
168	li	r7,0			/* r7 = # cachelines prefetched ahead */
169	ble	114f			/* at most one line: no prefetch ahead */
170	li	r7,1
171#if MAX_COPY_PREFETCH > 1
172	/* Heuristically, for large transfers we prefetch
173	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
174	   we prefetch 1 cacheline ahead. */
175	cmpwi	r0,MAX_COPY_PREFETCH
176	ble	112f
177	li	r7,MAX_COPY_PREFETCH
178112:	mtctr	r7
179111:	dcbt	r3,r4
180	addi	r3,r3,CACHELINE_BYTES
181	bdnz	111b
182#else
183	dcbt	r3,r4
184	addi	r3,r3,CACHELINE_BYTES
185#endif /* MAX_COPY_PREFETCH > 1 */
186
/*
 * First pass: copy (lines - r7) cachelines, each iteration touching the
 * source r7 lines ahead.  r0 keeps r7, the lines still to do afterwards;
 * when it is non-zero the code below comes back here with r7 = 0 and
 * r3 = 4 for a second pass over those remaining lines.
 */
187114:	subf	r8,r7,r0
188	mr	r0,r7
189	mtctr	r8
190
19153:	dcbt	r3,r4			/* touch the source line at r4 + r3 */
19254:	dcbz	r11,r6			/* zero the destination line at r6 + 4 */
193/* the main body of the cacheline loop */
194	CSUM_COPY_16_BYTES_WITHEX(0)
195#if L1_CACHE_BYTES >= 32
196	CSUM_COPY_16_BYTES_WITHEX(1)
197#if L1_CACHE_BYTES >= 64
198	CSUM_COPY_16_BYTES_WITHEX(2)
199	CSUM_COPY_16_BYTES_WITHEX(3)
200#if L1_CACHE_BYTES >= 128
201	CSUM_COPY_16_BYTES_WITHEX(4)
202	CSUM_COPY_16_BYTES_WITHEX(5)
203	CSUM_COPY_16_BYTES_WITHEX(6)
204	CSUM_COPY_16_BYTES_WITHEX(7)
205#endif
206#endif
207#endif
208	bdnz	53b
209	cmpwi	r0,0			/* any lines held back for the second pass? */
210	li	r3,4
211	li	r7,0
212	bne	114b
213
21463:	srwi.	r0,r5,2			/* # whole words remaining */
215	mtctr	r0
216	beq	64f
21730:	lwzu	r0,4(r4)
218	adde	r12,r12,r0
21931:	stwu	r0,4(r6)
220	bdnz	30b
221
22264:	andi.	r0,r5,2			/* trailing halfword? */
223	beq+	65f
22440:	lhz	r0,4(r4)
225	addi	r4,r4,2
22641:	sth	r0,4(r6)
227	adde	r12,r12,r0
228	addi	r6,r6,2
22965:	andi.	r0,r5,1			/* trailing single byte? */
230	beq+	66f
23150:	lbz	r0,4(r4)
23251:	stb	r0,4(r6)
233	slwi	r0,r0,8			/* count it as the upper byte of a halfword */
234	adde	r12,r12,r0
23566:	addze	r3,r12			/* fold in the last carry; result in r3 */
236	beqlr+	cr7			/* cr7.eq set: return the sum as is */
237	rlwinm	r3,r3,8,0,31	/* odd destination address: rotate one byte */
238	blr
239
/* Target of every EX_TABLE entry below: give up and return 0. */
240fault:
241	li	r3,0
242	blr
243
/* Alignment byte/word loops (70-73) and the dcbz at 54. */
244	EX_TABLE(70b, fault);
245	EX_TABLE(71b, fault);
246	EX_TABLE(72b, fault);
247	EX_TABLE(73b, fault);
248	EX_TABLE(54b, fault);
249
250/*
251 * Exception-table entries for the cacheline loop: the loads and the
252 * stores of every 16-byte group all branch to the same fault handler.
253 */
254	CSUM_COPY_16_BYTES_EXCODE(0)
255#if L1_CACHE_BYTES >= 32
256	CSUM_COPY_16_BYTES_EXCODE(1)
257#if L1_CACHE_BYTES >= 64
258	CSUM_COPY_16_BYTES_EXCODE(2)
259	CSUM_COPY_16_BYTES_EXCODE(3)
260#if L1_CACHE_BYTES >= 128
261	CSUM_COPY_16_BYTES_EXCODE(4)
262	CSUM_COPY_16_BYTES_EXCODE(5)
263	CSUM_COPY_16_BYTES_EXCODE(6)
264	CSUM_COPY_16_BYTES_EXCODE(7)
265#endif
266#endif
267#endif
268
/* Trailing word loop (30/31), halfword (40/41) and byte (50/51). */
269	EX_TABLE(30b, fault);
270	EX_TABLE(31b, fault);
271	EX_TABLE(40b, fault);
272	EX_TABLE(41b, fault);
273	EX_TABLE(50b, fault);
274	EX_TABLE(51b, fault);
275
276EXPORT_SYMBOL(csum_partial_copy_generic)
277
278/*
279 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
280 *			   const struct in6_addr *daddr,
281 *			   __u32 len, __u8 proto, __wsum sum)
282 */
283
284_GLOBAL(csum_ipv6_magic)
/*
 * In:   r3 = saddr, r4 = daddr (each read as four 32-bit words),
 *       r5 = len, r6 = proto, r7 = sum
 * Out:  r3 = complemented 16-bit fold of the total, in the low halfword
 * Uses: r0 (accumulator), r8-r11 (loaded words), CA
 *
 * Loads are interleaved with the adds.  addc starts the carry chain and
 * every later adde continues it; addze folds the last carry back in.
 */
285	lwz	r8, 0(r3)
286	lwz	r9, 4(r3)
287	addc	r0, r7, r8	/* r0 = sum + saddr word 0; sets CA */
288	lwz	r10, 8(r3)
289	adde	r0, r0, r9
290	lwz	r11, 12(r3)
291	adde	r0, r0, r10
292	lwz	r8, 0(r4)	/* r8-r11 are reused for the daddr words */
293	adde	r0, r0, r11
294	lwz	r9, 4(r4)
295	adde	r0, r0, r8
296	lwz	r10, 8(r4)
297	adde	r0, r0, r9
298	lwz	r11, 12(r4)
299	adde	r0, r0, r10
300	add	r5, r5, r6	/* assumption: len + proto doesn't carry */
301	adde	r0, r0, r11
302	adde	r0, r0, r5	/* add (len + proto) as a single word */
303	addze	r0, r0		/* fold in the last carry */
304	rotlwi	r3, r0, 16	/* r3 = r0 with its halfwords swapped */
305	add	r3, r0, r3	/* upper halfword = hi + lo + carry out of the lower halfword */
306	not	r3, r3		/* one's complement */
307	rlwinm	r3, r3, 16, 16, 31	/* move the upper halfword down, clear the rest */
308	blr
309EXPORT_SYMBOL(csum_ipv6_magic)
310