xref: /illumos-gate/usr/src/lib/libc/sparcv9/crt/__align_cpy_4.S (revision a1d41cf940fc4cda50098ad61e6a78b19c7483cd)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27	.file	"__align_cpy_4.s"
28
29/* __align_cpy_4(s1, s2, n)
30 *
31 * Copy 4-byte aligned source to 4-byte aligned target in multiples of 4 bytes.
32 *
33 * Input:
34 *	o0	address of target
35 *	o1	address of source
36 *	o2	number of bytes to copy (must be a multiple of 4)
37 * Output:
38 *	o0	address of target
39 * Caller's registers that have been changed by this function:
40 *	o1-o5, g1, g5
41 *
42 * Note:
43 *	This helper routine will not be used by any 32-bit compilations.
44 *	To do so would break binary compatibility with previous versions of
45 *	Solaris.
46 *
47 * Assumptions:
48 *	Source and target addresses are 4-byte aligned.
49 *	Bytes to be copied are non-overlapping or _exactly_ overlapping.
50 *	The number of bytes to be copied is a multiple of 4.
51 *	Call will usually be made with a byte count of more than 4*4 and
52 *	less than a few hundred bytes.  Legal values are 0 to MAX_SIZE_T.
53 *
54 * Optimization attempt:
55 *	Reasonable speed for a generic v9.
56 */
57
58#include <sys/asm_linkage.h>
59
60	ENTRY(__align_cpy_4)
! Word-aligned copy of %o2 bytes from [%o1] to [%o0].
!
! Register roles inside the routine:
!	%o0	target address; never written, so it is still the return value
!	%g1	working target pointer (copy of %o0)
!	%o1	working source pointer
!	%o2	bytes remaining (8-aligned path) / data scratch (.noton8 path)
!	%o3-%o5	data scratch
!	%g5	end address of source (.noton8 path only)
!
! Three cases are distinguished by (address & 7) of source and target:
!	both 0		-> .both8, copy with ldx/stx
!	both 4		-> copy one word first, then .both8
!	different	-> .noton8, copy with lduw/st only
!
! Several branches below are not annulled, so the instruction in their delay
! slot executes whether or not the branch is taken.  Branches with ",a"
! execute their delay slot only when the branch is taken.
61	brz,pn %o2, .done		! Skip out if no bytes to copy.
62	cmp	%o0, %o1		! (delay slot) Compare target with source.
63	be,pn	%xcc, .done		! Addresses are identical--done.
64	and	%o0, 7, %o3		! (delay slot) o3 = target offset within 8.
65	and	%o1, 7, %o4		! o4 = source offset within 8.
66	cmp	%o3, %o4
67	bne,pt	%icc, .noton8		! Offsets differ: exactly one of source
68	mov	%o0, %g1		!     and target is 8-byte aligned.
					! The mov is in the delay slot, so g1
					! is set up on every path from here.
69	brz,pt	%o3, .both8		! Both are 8-byte aligned.
70	nop
71
72	ld	[%o1], %o3		! Neither is aligned, so do 4 bytes;
73	subcc	%o2, 4, %o2		! then both will be aligned.
74	st	%o3, [%g1]
75	bz,pn	%xcc, .done		! That word was the whole copy.
76	add	%g1, 4, %g1		! (delay slot) Advance target.
77	b	.both8
78	add	%o1, 4, %o1		! (delay slot) Advance source.
79
80! Section of code dealing with the case where source and target are both
81! 8-byte aligned.  Get and store 16 bytes at a time using ldx and stx.
! On entry: %o1 = source, %g1 = target, %o2 = bytes remaining (non-zero).
! NOTE(review): the bl/bg tests on %o2 are signed comparisons, so a count
! with the top bit set would not be handled here even though the header
! comment gives the legal range as 0 to MAX_SIZE_T -- confirm this is
! acceptable for all callers.
82
83	.align	32
84.both8:					! Both source and target are aligned.
85	cmp	%o2, 16
86	bl,a,pn %xcc, .chkwd		! Fewer than 16 bytes: skip the loop.
87	cmp	%o2, 8			! (annulled slot) Set cc for .chkwd.
88
89	sub	%o2, 12, %o2		! Bias count by -12 so that, after the
					! subcc below, "greater than zero"
					! means at least 16 bytes remain.
90.loop16a:				! Load and store 16 bytes at a time.
91	ldx	[%o1], %o3
92	ldx	[%o1+8], %o4
93	subcc	%o2, 16, %o2		! o2 = (bytes left after this pass) - 12.
94	stx	%o3, [%g1]
95	stx	%o4, [%g1+8]
96	add	%o1, 16, %o1
97	bg,pt	%xcc, .loop16a		! Have at least 16 bytes left.
98	add	%g1, 16, %g1		! (delay slot) Advance target.
99
100	addcc	%o2, 12, %o2		! Remove the bias: o2 = 0, 4, 8 or 12.
101	bg,a,pt	%xcc, .chkwd		! Have some remaining bytes.
102	cmp	%o2, 8			! (annulled slot) Set cc for .chkwd.
103	retl				! Nothing left; o0 still holds target.
104	nop
105
! .chkwd is entered with the condition codes set by "cmp %o2, 8" and with
! %o2 = 4, 8 or 12 bytes still to copy.
106.chkwd:
107	bl,a,pn	%xcc, .wrword		! Only 4 bytes left.
108	ld	[%o1], %o3		! (annulled slot) Fetch the final word.
109
110	ldx	[%o1], %o3		! Have 8 or 12, so do 8.
111	stx	%o3, [%g1]
112	add	%o1, 8, %o1
113	add	%g1, 8, %g1
114	subcc	%o2, 8, %o2
115	bg,a,pn %xcc, .wrword		! Still have four to do.
116	ld	[%o1], %o3		! (annulled slot) Fetch the final word.
117
118	retl
119	nop
120
121.wrword:				! Copy final word.
122	st	%o3, [%g1]		! o3 was loaded in the branch delay slot.
					! Falls through into .done.
123
124.done:
125	retl
126	nop
127
128! Section of code where either source or target, but not both, is 8-byte
129! aligned.  So, use ld and st instructions rather than trying to copy stuff
130! around in registers.
! The leading (count mod 16) bytes are copied first, then whole 16-byte
! groups until the source pointer reaches the end address kept in %g5.
! Once %g5 has been computed, %o2 is reused as a data register.
! NOTE(review): the "cmp %o1, %g5 / bl" loop tests compare addresses as
! signed values; this assumes the source range does not straddle the
! signed boundary of the address space -- confirm.
131
132	.align	32			! Ultra cache line boundary.
133.noton8:
134	add	%o1, %o2, %g5	! Ending address of source.
135	andcc	%o2, 15, %o3	! Mod 16 of number of bytes to copy.
136	bz,pn	%xcc, .loop16	! Copy odd amounts first, then multiples of 16.
137	cmp	%o3, 4		! (delay slot) Sets cc tested by the next bz.
138	bz,pn	%xcc, .mod4
139	cmp	%o3, 8		! (delay slot) Sets cc tested by the next bz.
140	bz,pn	%xcc, .mod8
141	cmp	%o3, 12		! (delay slot) Sets cc tested by the next bz.
142	bz,pt	%xcc, .mod12
143	nop
144	illtrap	0		! Size not valid (not a multiple of 4).
145
146.mod4:				! Do first 4 bytes, then do multiples of 16.
147	lduw	[%o1], %o2
148	add	%o1, 4, %o1
149	st	%o2, [%g1]
150	cmp	%o1, %g5		! Reached the end of the source?
151	bl,a,pt %xcc, .loop16		! No: continue with 16-byte groups.
152	add	%g1, 4, %g1		! (annulled slot) Advance target.
153	retl
154	nop
155.mod8:				! Do first 8 bytes, then do multiples of 16.
156	lduw	[%o1], %o2
157	lduw	[%o1+4], %o3
158	add	%o1, 8, %o1
159	st	%o2, [%g1]
160	st	%o3, [%g1+4]
161	cmp	%o1, %g5		! Reached the end of the source?
162	bl,a,pt	%xcc, .loop16		! No: continue with 16-byte groups.
163	add	%g1, 8, %g1		! (annulled slot) Advance target.
164	retl
165	nop
166.mod12:				! Do first 12 bytes, then do multiples of 16.
167	lduw	[%o1], %o2
168	lduw	[%o1+4], %o3
169	lduw	[%o1+8], %o4
170	add	%o1, 12, %o1
171	st	%o2, [%g1]
172	st	%o3, [%g1+4]
173	st	%o4, [%g1+8]
174	cmp	%o1, %g5		! Reached the end of the source?
175	bl,a,pt	%xcc, .loop16		! No: continue with 16-byte groups.
176	add	%g1, 12, %g1		! (annulled slot) Advance target.
177	retl
178	nop
179	.align	32			! Ultra cache line boundary.
180.loop16:				! Do multiples of 16 bytes.
181	lduw	[%o1], %o2
182	lduw	[%o1+4], %o3
183	lduw	[%o1+8], %o4
184	lduw	[%o1+12], %o5
185	add	%o1, 16, %o1
186	st	%o2, [%g1]
187	st	%o3, [%g1+4]
188	cmp	%o1, %g5		! Sets cc for the bl below; the two
					! stores that follow do not alter cc.
189	st	%o4, [%g1+8]
190	st	%o5, [%g1+12]
191	bl,a,pt	%xcc, .loop16		! Source pointer still below the end.
192	add	%g1, 16,%g1		! (annulled slot) Advance target.
193	retl			! Target address is already in o0.
194	nop
195
196	SET_SIZE(__align_cpy_4)
197