xref: /illumos-gate/usr/src/uts/common/os/mmapobj.c (revision cf45009884e299356c21eb3d343d4b99bfd1fd5f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2014 Joyent, Inc.  All rights reserved.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/sysmacros.h>
29 #include <sys/kmem.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/errno.h>
33 #include <sys/mman.h>
34 #include <sys/cmn_err.h>
35 #include <sys/cred.h>
36 #include <sys/vmsystm.h>
37 #include <sys/machsystm.h>
38 #include <sys/debug.h>
39 #include <vm/as.h>
40 #include <vm/seg.h>
41 #include <sys/vmparam.h>
42 #include <sys/vfs.h>
43 #include <sys/elf.h>
44 #include <sys/machelf.h>
45 #include <sys/corectl.h>
46 #include <sys/exec.h>
47 #include <sys/exechdr.h>
48 #include <sys/autoconf.h>
49 #include <sys/mem.h>
50 #include <vm/seg_dev.h>
51 #include <sys/vmparam.h>
52 #include <sys/mmapobj.h>
53 #include <sys/atomic.h>
54 
55 /*
56  * Theory statement:
57  *
58  * The main driving force behind mmapobj is to interpret and map ELF files
59  * inside of the kernel instead of having the linker be responsible for this.
60  *
61  * mmapobj also supports the AOUT 4.x binary format as well as flat files in
62  * a read-only manner.
63  *
64  * When interpreting and mapping an ELF file, mmapobj will map each PT_LOAD
65  * or PT_SUNWBSS segment according to the ELF standard.  Refer to the "Linker
66  * and Libraries Guide" for more information about the standard and mapping
67  * rules.
68  *
69  * Having mmapobj interpret and map objects will allow the kernel to make the
70  * best decision for where to place the mappings for said objects.  Thus, we
71  * can make optimizations inside of the kernel for specific platforms or cache
72  * mapping information to make mapping objects faster.  The cache is ignored
73  * if ASLR is enabled.
74  *
75  * The lib_va_hash will be one such optimization.  For each ELF object that
76  * mmapobj is asked to interpret, we will attempt to cache the information
77  * about the PT_LOAD and PT_SUNWBSS sections to speed up future mappings of
78  * the same objects.  We will cache up to LIBVA_CACHED_SEGS (see below) program
79  * headers which should cover a majority of the libraries out there without
80  * wasting space.  In order to make sure that the cached information is valid,
81  * we check the passed in vnode's mtime and ctime to make sure the vnode
82  * has not been modified since the last time we used it.
83  *
84  * In addition, the lib_va_hash may contain a preferred starting VA for the
85  * object which can be useful for platforms which support a shared context.
86  * This will increase the likelihood that library text can be shared among
87  * many different processes.  We limit the reserved VA space for 32 bit objects
88  * in order to minimize fragmenting the process's address space.
89  *
90  * In addition to the above, the mmapobj interface allows for padding to be
91  * requested before the first mapping and after the last mapping created.
92  * When padding is requested, no additional optimizations will be made for
93  * that request.
94  */
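/*
 * For context, the user-level entry point to this code is the mmapobj(2)
 * system call; see the mmapobj(2) manual page for the authoritative
 * interface.  An illustrative sketch of a consumer such as the run-time
 * linker (the identifiers and array size below are only for illustration):
 *
 *	mmapobj_result_t mrp[64];
 *	uint_t elements = 64;
 *
 *	if (mmapobj(fd, MMOBJ_INTERPRET, mrp, &elements, NULL) == 0) {
 *		use mrp[0 .. elements - 1];
 *	} else if (errno == E2BIG) {
 *		retry with at least "elements" result entries;
 *	}
 *
 * On success, each mmapobj_result_t describes one mapping that was created:
 * its address, memory and file sizes, offset, protections and flags.
 */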
95 
96 /*
97  * Threshold to prevent allocating too much kernel memory to read in the
98  * program headers for an object.  If reading in the program headers would
99  * require more memory than this threshold, we use a KM_NOSLEEP allocation
100  * to hold them, and that allocation may fail.  If less memory than this
101  * threshold is needed, then we use a KM_SLEEP allocation and are willing
102  * to wait for the memory if we need to.
103  */
104 size_t mmapobj_alloc_threshold = 65536;
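/*
 * For scale: an Elf64_Phdr is 56 bytes and an Elf32_Phdr is 32 bytes, so the
 * default threshold of 64K covers roughly 1170 64-bit or 2048 32-bit program
 * headers before the KM_NOSLEEP path described above is taken; real objects
 * have far fewer.
 */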
105 
106 /* Debug stats for test coverage */
107 #ifdef DEBUG
108 struct mobj_stats {
109 	uint_t	mobjs_unmap_called;
110 	uint_t	mobjs_remap_devnull;
111 	uint_t	mobjs_lookup_start;
112 	uint_t	mobjs_alloc_start;
113 	uint_t	mobjs_alloc_vmem;
114 	uint_t	mobjs_add_collision;
115 	uint_t	mobjs_get_addr;
116 	uint_t	mobjs_map_flat_no_padding;
117 	uint_t	mobjs_map_flat_padding;
118 	uint_t	mobjs_map_ptload_text;
119 	uint_t	mobjs_map_ptload_initdata;
120 	uint_t	mobjs_map_ptload_preread;
121 	uint_t	mobjs_map_ptload_unaligned_text;
122 	uint_t	mobjs_map_ptload_unaligned_map_fail;
123 	uint_t	mobjs_map_ptload_unaligned_read_fail;
124 	uint_t	mobjs_zfoddiff;
125 	uint_t	mobjs_zfoddiff_nowrite;
126 	uint_t	mobjs_zfodextra;
127 	uint_t	mobjs_ptload_failed;
128 	uint_t	mobjs_map_elf_no_holes;
129 	uint_t	mobjs_unmap_hole;
130 	uint_t	mobjs_nomem_header;
131 	uint_t	mobjs_inval_header;
132 	uint_t	mobjs_overlap_header;
133 	uint_t	mobjs_np2_align;
134 	uint_t	mobjs_np2_align_overflow;
135 	uint_t	mobjs_exec_padding;
136 	uint_t	mobjs_exec_addr_mapped;
137 	uint_t	mobjs_exec_addr_devnull;
138 	uint_t	mobjs_exec_addr_in_use;
139 	uint_t	mobjs_lvp_found;
140 	uint_t	mobjs_no_loadable_yet;
141 	uint_t	mobjs_nothing_to_map;
142 	uint_t	mobjs_e2big;
143 	uint_t	mobjs_dyn_pad_align;
144 	uint_t	mobjs_dyn_pad_noalign;
145 	uint_t	mobjs_alloc_start_fail;
146 	uint_t	mobjs_lvp_nocache;
147 	uint_t	mobjs_extra_padding;
148 	uint_t	mobjs_lvp_not_needed;
149 	uint_t	mobjs_no_mem_map_sz;
150 	uint_t	mobjs_check_exec_failed;
151 	uint_t	mobjs_lvp_used;
152 	uint_t	mobjs_wrong_model;
153 	uint_t	mobjs_noexec_fs;
154 	uint_t	mobjs_e2big_et_rel;
155 	uint_t	mobjs_et_rel_mapped;
156 	uint_t	mobjs_unknown_elf_type;
157 	uint_t	mobjs_phent32_too_small;
158 	uint_t	mobjs_phent64_too_small;
159 	uint_t	mobjs_inval_elf_class;
160 	uint_t	mobjs_too_many_phdrs;
161 	uint_t	mobjs_no_phsize;
162 	uint_t	mobjs_phsize_large;
163 	uint_t	mobjs_phsize_xtralarge;
164 	uint_t	mobjs_fast_wrong_model;
165 	uint_t	mobjs_fast_e2big;
166 	uint_t	mobjs_fast;
167 	uint_t	mobjs_fast_success;
168 	uint_t	mobjs_fast_not_now;
169 	uint_t	mobjs_small_file;
170 	uint_t	mobjs_read_error;
171 	uint_t	mobjs_unsupported;
172 	uint_t	mobjs_flat_e2big;
173 	uint_t	mobjs_phent_align32;
174 	uint_t	mobjs_phent_align64;
175 	uint_t	mobjs_lib_va_find_hit;
176 	uint_t	mobjs_lib_va_find_delay_delete;
177 	uint_t	mobjs_lib_va_find_delete;
178 	uint_t	mobjs_lib_va_add_delay_delete;
179 	uint_t	mobjs_lib_va_add_delete;
180 	uint_t	mobjs_lib_va_create_failure;
181 	uint_t	mobjs_min_align;
182 #if defined(__sparc)
183 	uint_t	mobjs_aout_uzero_fault;
184 	uint_t	mobjs_aout_64bit_try;
185 	uint_t	mobjs_aout_noexec;
186 	uint_t	mobjs_aout_e2big;
187 	uint_t	mobjs_aout_lib;
188 	uint_t	mobjs_aout_fixed;
189 	uint_t	mobjs_aout_zfoddiff;
190 	uint_t	mobjs_aout_map_bss;
191 	uint_t	mobjs_aout_bss_fail;
192 	uint_t	mobjs_aout_nlist;
193 	uint_t	mobjs_aout_addr_in_use;
194 #endif
195 } mobj_stats;
196 
197 #define	MOBJ_STAT_ADD(stat)		((mobj_stats.mobjs_##stat)++)
198 #else
199 #define	MOBJ_STAT_ADD(stat)
200 #endif
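/*
 * On a DEBUG kernel the counters above can be examined with mdb(1); an
 * invocation along the lines of "mobj_stats::print struct mobj_stats" is one
 * illustrative way to dump them (not a documented interface).
 */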
201 
202 /*
203  * Check if addr is at or above the address space reserved for the stack.
204  * The stack is at the top of the address space for all sparc processes
205  * and 64 bit x86 processes.  For 32 bit x86, the stack is not at the top
206  * of the address space and thus this check will always return false for
207  * 32 bit x86 processes.
208  */
209 #if defined(__sparc)
210 #define	OVERLAPS_STACK(addr, p)						\
211 	(addr >= (p->p_usrstack - ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK)))
212 #elif defined(__amd64)
213 #define	OVERLAPS_STACK(addr, p)						\
214 	((p->p_model == DATAMODEL_LP64) &&				\
215 	(addr >= (p->p_usrstack - ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK))))
216 #endif
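/*
 * In the macros above, p_stk_ctl is the process's stack size limit, so
 * OVERLAPS_STACK() treats everything from p_usrstack minus the page-rounded
 * stack limit up to the top of the address space as potential stack, and
 * callers avoid placing mappings there.
 */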
217 
218 /* lv_flags values - bitmap */
219 #define	LV_ELF32	0x1		/* 32 bit ELF file */
220 #define	LV_ELF64	0x2		/* 64 bit ELF file */
221 #define	LV_DEL		0x4		/* delete when lv_refcnt hits zero */
222 
223 /*
224  * Note: lv_num_segs will denote how many segments this file has and will
225  * only be set after the lv_mps array has been filled out.
226  * lv_mps can only be valid if lv_num_segs is non-zero.
227  */
228 struct lib_va {
229 	struct lib_va		*lv_next;
230 	caddr_t			lv_base_va;	/* start va for library */
231 	ssize_t			lv_len;		/* total va span of library */
232 	size_t			lv_align;	/* minimum alignment */
233 	uint64_t		lv_nodeid;	/* filesystem node id */
234 	uint64_t		lv_fsid;	/* filesystem id */
235 	timestruc_t		lv_ctime;	/* last time file was changed */
236 	timestruc_t		lv_mtime;	/* or modified */
237 	mmapobj_result_t	lv_mps[LIBVA_CACHED_SEGS]; /* cached pheaders */
238 	int			lv_num_segs;	/* # segs for this file */
239 	int			lv_flags;
240 	uint_t			lv_refcnt;	/* number of holds on struct */
241 };
242 
243 #define	LIB_VA_SIZE	1024
244 #define	LIB_VA_MASK	(LIB_VA_SIZE - 1)
245 #define	LIB_VA_MUTEX_SHIFT	3
246 
247 #if (LIB_VA_SIZE & (LIB_VA_SIZE - 1))
248 #error	"LIB_VA_SIZE is not a power of 2"
249 #endif
250 
251 static struct lib_va *lib_va_hash[LIB_VA_SIZE];
252 static kmutex_t lib_va_hash_mutex[LIB_VA_SIZE >> LIB_VA_MUTEX_SHIFT];
253 
254 #define	LIB_VA_HASH_MUTEX(index)					\
255 	(&lib_va_hash_mutex[index >> LIB_VA_MUTEX_SHIFT])
256 
257 #define	LIB_VA_HASH(nodeid)						\
258 	(((nodeid) ^ ((nodeid) << 7) ^ ((nodeid) << 13)) & LIB_VA_MASK)
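/*
 * LIB_VA_HASH() folds shifted copies of the node id into the bucket index
 * (only bits that survive LIB_VA_MASK matter), and LIB_VA_HASH_MUTEX() maps
 * every 2^LIB_VA_MUTEX_SHIFT (8) consecutive buckets onto one mutex, so a
 * given bucket's chain is always walked under its covering lock.
 */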
259 
260 #define	LIB_VA_MATCH_ID(arg1, arg2)					\
261 	((arg1)->lv_nodeid == (arg2)->va_nodeid &&			\
262 	(arg1)->lv_fsid == (arg2)->va_fsid)
263 
264 #define	LIB_VA_MATCH_TIME(arg1, arg2)					\
265 	((arg1)->lv_ctime.tv_sec == (arg2)->va_ctime.tv_sec &&		\
266 	(arg1)->lv_mtime.tv_sec == (arg2)->va_mtime.tv_sec &&		\
267 	(arg1)->lv_ctime.tv_nsec == (arg2)->va_ctime.tv_nsec &&		\
268 	(arg1)->lv_mtime.tv_nsec == (arg2)->va_mtime.tv_nsec)
269 
270 #define	LIB_VA_MATCH(arg1, arg2)					\
271 	(LIB_VA_MATCH_ID(arg1, arg2) && LIB_VA_MATCH_TIME(arg1, arg2))
272 
273 /*
274  * lib_va will be used for optimized allocation of address ranges for
275  * libraries, such that subsequent mappings of the same library will attempt
276  * to use the same VA as previous mappings of that library.
277  * In order to map libraries at the same VA in many processes, we need to carve
278  * out our own address space for them which is unique across many processes.
279  * We use different arenas for 32 bit and 64 bit libraries.
280  *
281  * Since the 32 bit address space is relatively small, we limit the number of
282  * libraries which try to use consistent virtual addresses to lib_threshold.
283  * For 64 bit libraries there is no such limit since the address space is large.
284  */
285 static vmem_t *lib_va_32_arena;
286 static vmem_t *lib_va_64_arena;
287 uint_t lib_threshold = 20;	/* modifiable via /etc/system */
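/*
 * For example, a line along the lines of "set lib_threshold=0x28" in
 * /etc/system would raise the limit above to 40 at boot (illustrative only;
 * consult the /etc/system documentation for exact syntax).
 */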
288 
289 static kmutex_t lib_va_init_mutex;	/* no need to initialize */
290 
291 /*
292  * Number of 32 bit and 64 bit libraries in lib_va hash.
293  */
294 static uint_t libs_mapped_32 = 0;
295 static uint_t libs_mapped_64 = 0;
296 
297 /*
298  * Free up the resources associated with lvp as well as lvp itself.
299  * We also decrement the number of libraries mapped via a lib_va
300  * cached virtual address.
301  */
302 void
303 lib_va_free(struct lib_va *lvp)
304 {
305 	int is_64bit = lvp->lv_flags & LV_ELF64;
306 	ASSERT(lvp->lv_refcnt == 0);
307 
308 	if (lvp->lv_base_va != NULL) {
309 		vmem_xfree(is_64bit ? lib_va_64_arena : lib_va_32_arena,
310 		    lvp->lv_base_va, lvp->lv_len);
311 		if (is_64bit) {
312 			atomic_dec_32(&libs_mapped_64);
313 		} else {
314 			atomic_dec_32(&libs_mapped_32);
315 		}
316 	}
317 	kmem_free(lvp, sizeof (struct lib_va));
318 }
319 
320 /*
321  * See if the file associated with the vap passed in is in the lib_va hash.
322  * If it is and the file has not been modified since last use, then
323  * return a pointer to that data.  Otherwise, return NULL if the file has
324  * changed or the file was not found in the hash.
325  */
326 static struct lib_va *
327 lib_va_find(vattr_t *vap)
328 {
329 	struct lib_va *lvp;
330 	struct lib_va *del = NULL;
331 	struct lib_va **tmp;
332 	uint_t index;
333 	index = LIB_VA_HASH(vap->va_nodeid);
334 
335 	mutex_enter(LIB_VA_HASH_MUTEX(index));
336 	tmp = &lib_va_hash[index];
337 	while (*tmp != NULL) {
338 		lvp = *tmp;
339 		if (LIB_VA_MATCH_ID(lvp, vap)) {
340 			if (LIB_VA_MATCH_TIME(lvp, vap)) {
341 				ASSERT((lvp->lv_flags & LV_DEL) == 0);
342 				lvp->lv_refcnt++;
343 				MOBJ_STAT_ADD(lib_va_find_hit);
344 			} else {
345 				/*
346 				 * file was updated since last use.
347 				 * need to remove it from list.
348 				 */
349 				del = lvp;
350 				*tmp = del->lv_next;
351 				del->lv_next = NULL;
352 				/*
353 				 * If we can't delete it now, mark it for later
354 				 */
355 				if (del->lv_refcnt) {
356 					MOBJ_STAT_ADD(lib_va_find_delay_delete);
357 					del->lv_flags |= LV_DEL;
358 					del = NULL;
359 				}
360 				lvp = NULL;
361 			}
362 			mutex_exit(LIB_VA_HASH_MUTEX(index));
363 			if (del) {
364 				ASSERT(del->lv_refcnt == 0);
365 				MOBJ_STAT_ADD(lib_va_find_delete);
366 				lib_va_free(del);
367 			}
368 			return (lvp);
369 		}
370 		tmp = &lvp->lv_next;
371 	}
372 	mutex_exit(LIB_VA_HASH_MUTEX(index));
373 	return (NULL);
374 }
375 
376 /*
377  * Add a new entry to the lib_va hash.
378  * Search the hash while holding the appropriate mutex to make sure that the
379  * data is not already in the cache.  If we find data that is in the cache
380  * already and has not been modified since last use, we return NULL.  If it
381  * has been modified since last use, we will remove that entry from
382  * the hash and it will be deleted once its reference count reaches zero.
383  * If there is no current entry in the hash we will add the new entry and
384  * return it to the caller who is responsible for calling lib_va_release to
385  * drop their reference count on it.
386  *
387  * lv_num_segs will be set to zero since the caller needs to add that
388  * information to the data structure.
389  */
390 static struct lib_va *
391 lib_va_add_hash(caddr_t base_va, ssize_t len, size_t align, vattr_t *vap)
392 {
393 	struct lib_va *lvp;
394 	uint_t index;
395 	model_t model;
396 	struct lib_va **tmp;
397 	struct lib_va *del = NULL;
398 
399 	model = get_udatamodel();
400 	index = LIB_VA_HASH(vap->va_nodeid);
401 
402 	lvp = kmem_alloc(sizeof (struct lib_va), KM_SLEEP);
403 
404 	mutex_enter(LIB_VA_HASH_MUTEX(index));
405 
406 	/*
407 	 * Make sure we are not adding the same data a second time.
408 	 * The hash chains should be relatively short and adding
409 	 * is a relatively rare event, so it's worth the check.
410 	 */
411 	tmp = &lib_va_hash[index];
412 	while (*tmp != NULL) {
413 		if (LIB_VA_MATCH_ID(*tmp, vap)) {
414 			if (LIB_VA_MATCH_TIME(*tmp, vap)) {
415 				mutex_exit(LIB_VA_HASH_MUTEX(index));
416 				kmem_free(lvp, sizeof (struct lib_va));
417 				return (NULL);
418 			}
419 
420 			/*
421 			 * We have the same nodeid and fsid but the file has
422 			 * been modified since we last saw it.
423 			 * Need to remove the old node and add this new
424 			 * one.
425 			 * Could probably use a callback mechanism to make
426 			 * this cleaner.
427 			 */
428 			ASSERT(del == NULL);
429 			del = *tmp;
430 			*tmp = del->lv_next;
431 			del->lv_next = NULL;
432 
433 			/*
434 			 * Check to see if we can free it.  If lv_refcnt
435 			 * is greater than zero, then some other thread
436 			 * has a reference to the one we want to delete
437 			 * and we cannot delete it.  All of this is done
438 			 * under the lib_va_hash_mutex lock so it is atomic.
439 			 */
440 			if (del->lv_refcnt) {
441 				MOBJ_STAT_ADD(lib_va_add_delay_delete);
442 				del->lv_flags |= LV_DEL;
443 				del = NULL;
444 			}
445 			/* tmp is already advanced */
446 			continue;
447 		}
448 		tmp = &((*tmp)->lv_next);
449 	}
450 
451 	lvp->lv_base_va = base_va;
452 	lvp->lv_len = len;
453 	lvp->lv_align = align;
454 	lvp->lv_nodeid = vap->va_nodeid;
455 	lvp->lv_fsid = vap->va_fsid;
456 	lvp->lv_ctime.tv_sec = vap->va_ctime.tv_sec;
457 	lvp->lv_ctime.tv_nsec = vap->va_ctime.tv_nsec;
458 	lvp->lv_mtime.tv_sec = vap->va_mtime.tv_sec;
459 	lvp->lv_mtime.tv_nsec = vap->va_mtime.tv_nsec;
460 	lvp->lv_next = NULL;
461 	lvp->lv_refcnt = 1;
462 
463 	/* Caller responsible for filling this and lv_mps out */
464 	lvp->lv_num_segs = 0;
465 
466 	if (model == DATAMODEL_LP64) {
467 		lvp->lv_flags = LV_ELF64;
468 	} else {
469 		ASSERT(model == DATAMODEL_ILP32);
470 		lvp->lv_flags = LV_ELF32;
471 	}
472 
473 	if (base_va != NULL) {
474 		if (model == DATAMODEL_LP64) {
475 			atomic_inc_32(&libs_mapped_64);
476 		} else {
477 			ASSERT(model == DATAMODEL_ILP32);
478 			atomic_inc_32(&libs_mapped_32);
479 		}
480 	}
481 	ASSERT(*tmp == NULL);
482 	*tmp = lvp;
483 	mutex_exit(LIB_VA_HASH_MUTEX(index));
484 	if (del) {
485 		ASSERT(del->lv_refcnt == 0);
486 		MOBJ_STAT_ADD(lib_va_add_delete);
487 		lib_va_free(del);
488 	}
489 	return (lvp);
490 }
491 
492 /*
493  * Release the hold on lvp which was acquired by lib_va_find or lib_va_add_hash.
494  * In addition, if this is the last hold and lvp is marked for deletion,
495  * free up its reserved address space and free the structure.
496  */
497 static void
498 lib_va_release(struct lib_va *lvp)
499 {
500 	uint_t index;
501 	int to_del = 0;
502 
503 	ASSERT(lvp->lv_refcnt > 0);
504 
505 	index = LIB_VA_HASH(lvp->lv_nodeid);
506 	mutex_enter(LIB_VA_HASH_MUTEX(index));
507 	if (--lvp->lv_refcnt == 0 && (lvp->lv_flags & LV_DEL)) {
508 		to_del = 1;
509 	}
510 	mutex_exit(LIB_VA_HASH_MUTEX(index));
511 	if (to_del) {
512 		ASSERT(lvp->lv_next == 0);
513 		lib_va_free(lvp);
514 	}
515 }
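/*
 * Typical life cycle of a lib_va entry, as used by process_phdrs() below:
 * lib_va_find() or lib_va_add_hash() returns the entry with a hold, the
 * caller consumes (or fills in) lv_len, lv_align, lv_mps and lv_num_segs,
 * and then drops its hold with lib_va_release().  An entry is freed either
 * when it is evicted from the hash with no holds outstanding or, once it
 * has been marked LV_DEL, when the last hold is dropped.
 */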
516 
517 /*
518  * Dummy function for mapping through /dev/null
519  * Normally I would have used mmmmap in common/io/mem.c
520  * but that is a static function, and for /dev/null, it
521  * just returns -1.
522  */
523 /* ARGSUSED */
524 static int
525 mmapobj_dummy(dev_t dev, off_t off, int prot)
526 {
527 	return (-1);
528 }
529 
530 /*
531  * Called when an error has occurred that requires mmapobj to return failure.
532  * All mapped objects will be unmapped and /dev/null mappings will be
533  * reclaimed if necessary.
534  * num_mapped is the number of elements of mrp which have been mapped, and
535  * num_segs is the total number of elements in mrp.
536  * For e_type ET_EXEC, we need to unmap all of the elements in mrp since
537  * we had already made reservations for them.
538  * If num_mapped equals num_segs, then we know that we had fully mapped
539  * the file and only need to clean up the segments described.
540  * If they are not equal, then for ET_DYN we will unmap the range from the
541  * end of the last mapped segment to the end of the last segment in mrp
542  * since we would have made a reservation for that memory earlier.
543  * If e_type is passed in as zero, num_mapped must equal num_segs.
544  */
545 void
546 mmapobj_unmap(mmapobj_result_t *mrp, int num_mapped, int num_segs,
547     ushort_t e_type)
548 {
549 	int i;
550 	struct as *as = curproc->p_as;
551 	caddr_t addr;
552 	size_t size;
553 
554 	if (e_type == ET_EXEC) {
555 		num_mapped = num_segs;
556 	}
557 #ifdef DEBUG
558 	if (e_type == 0) {
559 		ASSERT(num_mapped == num_segs);
560 	}
561 #endif
562 
563 	MOBJ_STAT_ADD(unmap_called);
564 	for (i = 0; i < num_mapped; i++) {
565 
566 		/*
567 		 * If we are going to have to create a mapping we need to
568 		 * make sure that no one else will use the address we
569 		 * need to remap between the time it is unmapped and
570 		 * mapped below.
571 		 */
572 		if (mrp[i].mr_flags & MR_RESV) {
573 			as_rangelock(as);
574 		}
575 		/* Always need to unmap what we mapped */
576 		(void) as_unmap(as, mrp[i].mr_addr, mrp[i].mr_msize);
577 
578 		/* Need to reclaim /dev/null reservation from earlier */
579 		if (mrp[i].mr_flags & MR_RESV) {
580 			struct segdev_crargs dev_a;
581 
582 			ASSERT(e_type != ET_DYN);
583 			/*
584 			 * Use seg_dev segment driver for /dev/null mapping.
585 			 */
586 			dev_a.mapfunc = mmapobj_dummy;
587 			dev_a.dev = makedevice(mm_major, M_NULL);
588 			dev_a.offset = 0;
589 			dev_a.type = 0;		/* neither PRIVATE nor SHARED */
590 			dev_a.prot = dev_a.maxprot = (uchar_t)PROT_NONE;
591 			dev_a.hat_attr = 0;
592 			dev_a.hat_flags = 0;
593 
594 			(void) as_map(as, mrp[i].mr_addr, mrp[i].mr_msize,
595 			    segdev_create, &dev_a);
596 			MOBJ_STAT_ADD(remap_devnull);
597 			as_rangeunlock(as);
598 		}
599 	}
600 
601 	if (num_mapped != num_segs) {
602 		ASSERT(e_type == ET_DYN);
603 		/* Need to unmap any reservation made after last mapped seg */
604 		if (num_mapped == 0) {
605 			addr = mrp[0].mr_addr;
606 		} else {
607 			addr = mrp[num_mapped - 1].mr_addr +
608 			    mrp[num_mapped - 1].mr_msize;
609 		}
610 		size = (size_t)mrp[num_segs - 1].mr_addr +
611 		    mrp[num_segs - 1].mr_msize - (size_t)addr;
612 		(void) as_unmap(as, addr, size);
613 
614 		/*
615 		 * Now we need to unmap the holes between mapped segs.
616 		 * Note that we have not mapped all of the segments and thus
617 		 * the holes between segments would not have been unmapped
618 		 * yet.  If num_mapped == num_segs, then all of the holes
619 		 * between segments would have already been unmapped.
620 		 */
621 
622 		for (i = 1; i < num_mapped; i++) {
623 			addr = mrp[i - 1].mr_addr + mrp[i - 1].mr_msize;
624 			size = mrp[i].mr_addr - addr;
625 			(void) as_unmap(as, addr, size);
626 		}
627 	}
628 }
629 
630 /*
631  * We need to add the start address into mrp so that the unmap function
632  * has absolute addresses to use.
633  */
634 static void
635 mmapobj_unmap_exec(mmapobj_result_t *mrp, int num_mapped, caddr_t start_addr)
636 {
637 	int i;
638 
639 	for (i = 0; i < num_mapped; i++) {
640 		mrp[i].mr_addr += (size_t)start_addr;
641 	}
642 	mmapobj_unmap(mrp, num_mapped, num_mapped, ET_EXEC);
643 }
644 
645 static caddr_t
646 mmapobj_lookup_start_addr(struct lib_va *lvp)
647 {
648 	proc_t *p = curproc;
649 	struct as *as = p->p_as;
650 	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_USER, PROT_ALL);
651 	int error;
652 	uint_t ma_flags = _MAP_LOW32;
653 	caddr_t base = NULL;
654 	size_t len;
655 	size_t align;
656 
657 	ASSERT(lvp != NULL);
658 	MOBJ_STAT_ADD(lookup_start);
659 
660 	as_rangelock(as);
661 
662 	base = lvp->lv_base_va;
663 	len = lvp->lv_len;
664 
665 	/*
666 	 * If we don't have an expected base address, or the one that we want
667 	 * to use is not available or acceptable, go get an acceptable
668 	 * address range.
669 	 */
670 	if (base == NULL || as_gap(as, len, &base, &len, 0, NULL) ||
671 	    valid_usr_range(base, len, PROT_ALL, as, as->a_userlimit) !=
672 	    RANGE_OKAY || OVERLAPS_STACK(base + len, p)) {
673 		if (lvp->lv_flags & LV_ELF64) {
674 			ma_flags = 0;
675 		}
676 
677 		align = lvp->lv_align;
678 		if (align > 1) {
679 			ma_flags |= MAP_ALIGN;
680 		}
681 
682 		base = (caddr_t)align;
683 		map_addr(&base, len, 0, 1, ma_flags);
684 	}
685 
686 	/*
687 	 * Need to reserve the address space we're going to use.
688 	 * Don't reserve swap space since we'll be mapping over this.
689 	 */
690 	if (base != NULL) {
691 		crargs.flags |= MAP_NORESERVE;
692 		error = as_map(as, base, len, segvn_create, &crargs);
693 		if (error) {
694 			base = NULL;
695 		}
696 	}
697 
698 	as_rangeunlock(as);
699 	return (base);
700 }
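/*
 * Note that on success the range returned above has already been covered by
 * a MAP_NORESERVE zfod segment; the per-segment mappings created later are
 * placed over pieces of that reservation, and mmapobj_unmap() tears down
 * whatever remains if the overall operation fails.
 */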
701 
702 /*
703  * Get the starting address for a given file to be mapped and return it
704  * to the caller.  If we're using lib_va and we need to allocate an address,
705  * we will attempt to allocate it from the global reserved pool such that the
706  * same address can be used in the future for this file.  If we can't use the
707  * reserved address then we just get one that will fit in our address space.
708  *
709  * Returns the starting virtual address for the range to be mapped or NULL
710  * if an error is encountered. If we successfully insert the requested info
711  * into the lib_va hash, then *lvpp will be set to point to this lib_va
712  * structure.  The structure will have a hold on it and thus lib_va_release
713  * needs to be called on it by the caller.  This function will not fill out
714  * lv_mps or lv_num_segs since it does not have enough information to do so.
715  * The caller is responsible for doing this making sure that any modifications
716  * to lv_mps are visible before setting lv_num_segs.
717  */
718 static caddr_t
719 mmapobj_alloc_start_addr(struct lib_va **lvpp, size_t len, int use_lib_va,
720     int randomize, size_t align, vattr_t *vap)
721 {
722 	proc_t *p = curproc;
723 	struct as *as = p->p_as;
724 	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_USER, PROT_ALL);
725 	int error;
726 	model_t model;
727 	uint_t ma_flags = _MAP_LOW32;
728 	caddr_t base = NULL;
729 	vmem_t *model_vmem;
730 	size_t lib_va_start;
731 	size_t lib_va_end;
732 	size_t lib_va_len;
733 
734 	ASSERT(lvpp != NULL);
735 	ASSERT((randomize & use_lib_va) != 1);
736 
737 	MOBJ_STAT_ADD(alloc_start);
738 	model = get_udatamodel();
739 
740 	if (model == DATAMODEL_LP64) {
741 		ma_flags = 0;
742 		model_vmem = lib_va_64_arena;
743 	} else {
744 		ASSERT(model == DATAMODEL_ILP32);
745 		model_vmem = lib_va_32_arena;
746 	}
747 
748 	if (align > 1) {
749 		ma_flags |= MAP_ALIGN;
750 	}
751 
752 	if (randomize != 0)
753 		ma_flags |= _MAP_RANDOMIZE;
754 
755 	if (use_lib_va) {
756 		/*
757 		 * The first time through, we need to setup the lib_va arenas.
758 		 * We call map_addr to find a suitable range of memory to map
759 		 * the given library, and we will set the highest address
760 		 * in our vmem arena to the end of this address range.
761 		 * We allow up to half of the address space to be used
762 		 * for lib_va addresses but we do not prevent any allocations
763 		 * in this range from other allocation paths.
764 		 */
765 		if (lib_va_64_arena == NULL && model == DATAMODEL_LP64) {
766 			mutex_enter(&lib_va_init_mutex);
767 			if (lib_va_64_arena == NULL) {
768 				base = (caddr_t)align;
769 				as_rangelock(as);
770 				map_addr(&base, len, 0, 1, ma_flags);
771 				as_rangeunlock(as);
772 				if (base == NULL) {
773 					mutex_exit(&lib_va_init_mutex);
774 					MOBJ_STAT_ADD(lib_va_create_failure);
775 					goto nolibva;
776 				}
777 				lib_va_end = (size_t)base + len;
778 				lib_va_len = lib_va_end >> 1;
779 				lib_va_len = P2ROUNDUP(lib_va_len, PAGESIZE);
780 				lib_va_start = lib_va_end - lib_va_len;
781 
782 				/*
783 				 * Need to make sure we avoid the address hole.
784 				 * We know lib_va_end is valid but we need to
785 				 * make sure lib_va_start is as well.
786 				 */
787 				if ((lib_va_end > (size_t)hole_end) &&
788 				    (lib_va_start < (size_t)hole_end)) {
789 					lib_va_start = P2ROUNDUP(
790 					    (size_t)hole_end, PAGESIZE);
791 					lib_va_len = lib_va_end - lib_va_start;
792 				}
793 				lib_va_64_arena = vmem_create("lib_va_64",
794 				    (void *)lib_va_start, lib_va_len, PAGESIZE,
795 				    NULL, NULL, NULL, 0,
796 				    VM_NOSLEEP | VMC_IDENTIFIER);
797 				if (lib_va_64_arena == NULL) {
798 					mutex_exit(&lib_va_init_mutex);
799 					goto nolibva;
800 				}
801 			}
802 			model_vmem = lib_va_64_arena;
803 			mutex_exit(&lib_va_init_mutex);
804 		} else if (lib_va_32_arena == NULL &&
805 		    model == DATAMODEL_ILP32) {
806 			mutex_enter(&lib_va_init_mutex);
807 			if (lib_va_32_arena == NULL) {
808 				base = (caddr_t)align;
809 				as_rangelock(as);
810 				map_addr(&base, len, 0, 1, ma_flags);
811 				as_rangeunlock(as);
812 				if (base == NULL) {
813 					mutex_exit(&lib_va_init_mutex);
814 					MOBJ_STAT_ADD(lib_va_create_failure);
815 					goto nolibva;
816 				}
817 				lib_va_end = (size_t)base + len;
818 				lib_va_len = lib_va_end >> 1;
819 				lib_va_len = P2ROUNDUP(lib_va_len, PAGESIZE);
820 				lib_va_start = lib_va_end - lib_va_len;
821 				lib_va_32_arena = vmem_create("lib_va_32",
822 				    (void *)lib_va_start, lib_va_len, PAGESIZE,
823 				    NULL, NULL, NULL, 0,
824 				    VM_NOSLEEP | VMC_IDENTIFIER);
825 				if (lib_va_32_arena == NULL) {
826 					mutex_exit(&lib_va_init_mutex);
827 					goto nolibva;
828 				}
829 			}
830 			model_vmem = lib_va_32_arena;
831 			mutex_exit(&lib_va_init_mutex);
832 		}
833 
834 		if (model == DATAMODEL_LP64 || libs_mapped_32 < lib_threshold) {
835 			base = vmem_xalloc(model_vmem, len, align, 0, 0, NULL,
836 			    NULL, VM_NOSLEEP | VM_ENDALLOC);
837 			MOBJ_STAT_ADD(alloc_vmem);
838 		}
839 
840 		/*
841 		 * Even if the address fails to fit in our address space,
842 		 * or we can't use a reserved address,
843 		 * we should still save it off in lib_va_hash.
844 		 */
845 		*lvpp = lib_va_add_hash(base, len, align, vap);
846 
847 		/*
848 		 * Check for collision on insertion and free up our VA space.
849 		 * This is expected to be rare, so we'll just reset base to
850 		 * NULL instead of looking it up in the lib_va hash.
851 		 */
852 		if (*lvpp == NULL) {
853 			if (base != NULL) {
854 				vmem_xfree(model_vmem, base, len);
855 				base = NULL;
856 				MOBJ_STAT_ADD(add_collision);
857 			}
858 		}
859 	}
860 
861 nolibva:
862 	as_rangelock(as);
863 
864 	/*
865 	 * If we don't have an expected base address, or the one that we want
866 	 * to use is not available or acceptable, go get an acceptable
867 	 * address range.
868 	 *
869 	 * If ASLR is enabled, we should never have used the cache, and should
870 	 * also start our real work here, in the consequent of the next
871 	 * condition.
872 	 */
873 	if (randomize != 0)
874 		ASSERT(base == NULL);
875 
876 	if (base == NULL || as_gap(as, len, &base, &len, 0, NULL) ||
877 	    valid_usr_range(base, len, PROT_ALL, as, as->a_userlimit) !=
878 	    RANGE_OKAY || OVERLAPS_STACK(base + len, p)) {
879 		MOBJ_STAT_ADD(get_addr);
880 		base = (caddr_t)align;
881 		map_addr(&base, len, 0, 1, ma_flags);
882 	}
883 
884 	/*
885 	 * Need to reserve the address space we're going to use.
886 	 * Don't reserve swap space since we'll be mapping over this.
887 	 */
888 	if (base != NULL) {
889 		/* Don't reserve swap space since we'll be mapping over this */
890 		crargs.flags |= MAP_NORESERVE;
891 		error = as_map(as, base, len, segvn_create, &crargs);
892 		if (error) {
893 			base = NULL;
894 		}
895 	}
896 
897 	as_rangeunlock(as);
898 	return (base);
899 }
900 
901 /*
902  * Map the file associated with vp into the address space as a single
903  * read only private mapping.
904  * Returns 0 for success, and non-zero for failure to map the file.
905  */
906 static int
907 mmapobj_map_flat(vnode_t *vp, mmapobj_result_t *mrp, size_t padding,
908     cred_t *fcred)
909 {
910 	int error = 0;
911 	struct as *as = curproc->p_as;
912 	caddr_t addr = NULL;
913 	caddr_t start_addr;
914 	size_t len;
915 	size_t pad_len;
916 	int prot = PROT_USER | PROT_READ;
917 	uint_t ma_flags = _MAP_LOW32;
918 	vattr_t vattr;
919 	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_USER, PROT_ALL);
920 
921 	if (get_udatamodel() == DATAMODEL_LP64) {
922 		ma_flags = 0;
923 	}
924 
925 	vattr.va_mask = AT_SIZE;
926 	error = VOP_GETATTR(vp, &vattr, 0, fcred, NULL);
927 	if (error) {
928 		return (error);
929 	}
930 
931 	len = vattr.va_size;
932 
933 	ma_flags |= MAP_PRIVATE;
934 	if (padding == 0) {
935 		MOBJ_STAT_ADD(map_flat_no_padding);
936 		error = VOP_MAP(vp, 0, as, &addr, len, prot, PROT_ALL,
937 		    ma_flags, fcred, NULL);
938 		if (error == 0) {
939 			mrp[0].mr_addr = addr;
940 			mrp[0].mr_msize = len;
941 			mrp[0].mr_fsize = len;
942 			mrp[0].mr_offset = 0;
943 			mrp[0].mr_prot = prot;
944 			mrp[0].mr_flags = 0;
945 		}
946 		return (error);
947 	}
948 
949 	/* padding was requested so there's more work to be done */
950 	MOBJ_STAT_ADD(map_flat_padding);
951 
952 	/* No need to reserve swap space now since it will be reserved later */
953 	crargs.flags |= MAP_NORESERVE;
954 
955 	/* Need to setup padding which can only be in PAGESIZE increments. */
956 	ASSERT((padding & PAGEOFFSET) == 0);
957 	pad_len = len + (2 * padding);
958 
959 	as_rangelock(as);
960 	map_addr(&addr, pad_len, 0, 1, ma_flags);
961 	error = as_map(as, addr, pad_len, segvn_create, &crargs);
962 	as_rangeunlock(as);
963 	if (error) {
964 		return (error);
965 	}
966 	start_addr = addr;
967 	addr += padding;
968 	ma_flags |= MAP_FIXED;
969 	error = VOP_MAP(vp, 0, as, &addr, len, prot, PROT_ALL, ma_flags,
970 	    fcred, NULL);
971 	if (error == 0) {
972 		mrp[0].mr_addr = start_addr;
973 		mrp[0].mr_msize = padding;
974 		mrp[0].mr_fsize = 0;
975 		mrp[0].mr_offset = 0;
976 		mrp[0].mr_prot = 0;
977 		mrp[0].mr_flags = MR_PADDING;
978 
979 		mrp[1].mr_addr = addr;
980 		mrp[1].mr_msize = len;
981 		mrp[1].mr_fsize = len;
982 		mrp[1].mr_offset = 0;
983 		mrp[1].mr_prot = prot;
984 		mrp[1].mr_flags = 0;
985 
986 		mrp[2].mr_addr = addr + P2ROUNDUP(len, PAGESIZE);
987 		mrp[2].mr_msize = padding;
988 		mrp[2].mr_fsize = 0;
989 		mrp[2].mr_offset = 0;
990 		mrp[2].mr_prot = 0;
991 		mrp[2].mr_flags = MR_PADDING;
992 	} else {
993 		/* Need to cleanup the as_map from earlier */
994 		(void) as_unmap(as, start_addr, pad_len);
995 	}
996 	return (error);
997 }
998 
999 /*
1000  * Map a PT_LOAD or PT_SUNWBSS section of an executable file into the user's
1001  * address space.
1002  * vp - vnode to be mapped in
1003  * addr - start address
1004  * len - length of vp to be mapped
1005  * zfodlen - length of zero filled memory after len above
1006  * offset - offset into file where mapping should start
1007  * prot - protections for this mapping
1008  * fcred - credentials for the file associated with vp at open time.
1009  */
1010 static int
1011 mmapobj_map_ptload(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
1012     off_t offset, int prot, cred_t *fcred)
1013 {
1014 	int error = 0;
1015 	caddr_t zfodbase, oldaddr;
1016 	size_t oldlen;
1017 	size_t end;
1018 	size_t zfoddiff;
1019 	label_t ljb;
1020 	struct as *as = curproc->p_as;
1021 	model_t model;
1022 	int full_page;
1023 
1024 	/*
1025 	 * See if addr and offset are aligned such that we can map in
1026 	 * full pages instead of partial pages.
1027 	 */
1028 	full_page = (((uintptr_t)addr & PAGEOFFSET) ==
1029 	    ((uintptr_t)offset & PAGEOFFSET));
1030 
1031 	model = get_udatamodel();
1032 
1033 	oldaddr = addr;
1034 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1035 	if (len) {
1036 		spgcnt_t availm, npages;
1037 		int preread;
1038 		uint_t mflag = MAP_PRIVATE | MAP_FIXED;
1039 
1040 		if (model == DATAMODEL_ILP32) {
1041 			mflag |= _MAP_LOW32;
1042 		}
1043 		/* We may need to map in extra bytes */
1044 		oldlen = len;
1045 		len += ((size_t)oldaddr & PAGEOFFSET);
1046 
1047 		if (full_page) {
1048 			offset = (off_t)((uintptr_t)offset & PAGEMASK);
1049 			if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) {
1050 				mflag |= MAP_TEXT;
1051 				MOBJ_STAT_ADD(map_ptload_text);
1052 			} else {
1053 				mflag |= MAP_INITDATA;
1054 				MOBJ_STAT_ADD(map_ptload_initdata);
1055 			}
1056 
1057 			/*
1058 			 * maxprot is passed as PROT_ALL so that mdb can
1059 			 * write to this segment.
1060 			 */
1061 			if (error = VOP_MAP(vp, (offset_t)offset, as, &addr,
1062 			    len, prot, PROT_ALL, mflag, fcred, NULL)) {
1063 				return (error);
1064 			}
1065 
1066 			/*
1067 			 * If the segment can fit and is relatively small, then
1068 			 * we prefault the entire segment in.  This is based
1069 			 * on the model that says the best working set of a
1070 			 * small program is all of its pages.
1071 			 * We only do this if freemem will not drop below
1072 			 * lotsfree since we don't want to induce paging.
1073 			 */
1074 			npages = (spgcnt_t)btopr(len);
1075 			availm = freemem - lotsfree;
1076 			preread = (npages < availm && len < PGTHRESH) ? 1 : 0;
1077 
1078 			/*
1079 			 * If we aren't prefaulting the segment,
1080 			 * increment "deficit", if necessary to ensure
1081 			 * that pages will become available when this
1082 			 * process starts executing.
1083 			 */
1084 			if (preread == 0 && npages > availm &&
1085 			    deficit < lotsfree) {
1086 				deficit += MIN((pgcnt_t)(npages - availm),
1087 				    lotsfree - deficit);
1088 			}
1089 
1090 			if (preread) {
1091 				(void) as_faulta(as, addr, len);
1092 				MOBJ_STAT_ADD(map_ptload_preread);
1093 			}
1094 		} else {
1095 			/*
1096 			 * addr and offset were not aligned such that we could
1097 			 * use VOP_MAP, thus we need to as_map the memory we
1098 			 * need and then read the data in from disk.
1099 			 * This code path is a corner case which should never
1100 			 * be taken, but hand-crafted binaries could trigger
1101 			 * this logic and it needs to work correctly.
1102 			 */
1103 			MOBJ_STAT_ADD(map_ptload_unaligned_text);
1104 			as_rangelock(as);
1105 			(void) as_unmap(as, addr, len);
1106 
1107 			/*
1108 			 * We use zfod_argsp because we need to be able to
1109 			 * write to the mapping and then we'll change the
1110 			 * protections later if they are incorrect.
1111 			 */
1112 			error = as_map(as, addr, len, segvn_create, zfod_argsp);
1113 			as_rangeunlock(as);
1114 			if (error) {
1115 				MOBJ_STAT_ADD(map_ptload_unaligned_map_fail);
1116 				return (error);
1117 			}
1118 
1119 			/* Now read in the data from disk */
1120 			error = vn_rdwr(UIO_READ, vp, oldaddr, oldlen, offset,
1121 			    UIO_USERSPACE, 0, (rlim64_t)0, fcred, NULL);
1122 			if (error) {
1123 				MOBJ_STAT_ADD(map_ptload_unaligned_read_fail);
1124 				return (error);
1125 			}
1126 
1127 			/*
1128 			 * Now set protections.
1129 			 */
1130 			if (prot != PROT_ZFOD) {
1131 				(void) as_setprot(as, addr, len, prot);
1132 			}
1133 		}
1134 	}
1135 
1136 	if (zfodlen) {
1137 		end = (size_t)addr + len;
1138 		zfodbase = (caddr_t)P2ROUNDUP(end, PAGESIZE);
1139 		zfoddiff = (uintptr_t)zfodbase - end;
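		/*
		 * zfoddiff is the tail of the last page containing file data
		 * and must be zeroed in place; any zfod bytes beyond that
		 * page boundary get their own anonymous segment below.  For
		 * example, with an 8K page size, end == 0x20500 gives
		 * zfodbase == 0x22000 and zfoddiff == 0x1b00.
		 */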
1140 		if (zfoddiff) {
1141 			/*
1142 			 * Before we go to zero the remaining space on the last
1143 			 * page, make sure we have write permission.
1144 			 *
1145 			 * We need to be careful how we zero-fill the last page
1146 			 * if the protection does not include PROT_WRITE. Using
1147 			 * as_setprot() can cause the VM segment code to call
1148 			 * segvn_vpage(), which must allocate a page struct for
1149 			 * each page in the segment. If we have a very large
1150 			 * segment, this may fail, so we check for that, even
1151 			 * though we ignore other return values from as_setprot.
1152 			 */
1153 			MOBJ_STAT_ADD(zfoddiff);
1154 			if ((prot & PROT_WRITE) == 0) {
1155 				if (as_setprot(as, (caddr_t)end, zfoddiff,
1156 				    prot | PROT_WRITE) == ENOMEM)
1157 					return (ENOMEM);
1158 				MOBJ_STAT_ADD(zfoddiff_nowrite);
1159 			}
1160 			if (on_fault(&ljb)) {
1161 				no_fault();
1162 				if ((prot & PROT_WRITE) == 0) {
1163 					(void) as_setprot(as, (caddr_t)end,
1164 					    zfoddiff, prot);
1165 				}
1166 				return (EFAULT);
1167 			}
1168 			uzero((void *)end, zfoddiff);
1169 			no_fault();
1170 
1171 			/*
1172 			 * Remove write protection to return to original state
1173 			 */
1174 			if ((prot & PROT_WRITE) == 0) {
1175 				(void) as_setprot(as, (caddr_t)end,
1176 				    zfoddiff, prot);
1177 			}
1178 		}
1179 		if (zfodlen > zfoddiff) {
1180 			struct segvn_crargs crargs =
1181 			    SEGVN_ZFOD_ARGS(prot, PROT_ALL);
1182 
1183 			MOBJ_STAT_ADD(zfodextra);
1184 			zfodlen -= zfoddiff;
1185 			crargs.szc = AS_MAP_NO_LPOOB;
1186 
1187 
1188 			as_rangelock(as);
1189 			(void) as_unmap(as, (caddr_t)zfodbase, zfodlen);
1190 			error = as_map(as, (caddr_t)zfodbase,
1191 			    zfodlen, segvn_create, &crargs);
1192 			as_rangeunlock(as);
1193 			if (error) {
1194 				return (error);
1195 			}
1196 		}
1197 	}
1198 	return (0);
1199 }
1200 
1201 /*
1202  * Map the ELF file represented by vp into the user's address space.  The
1203  * first mapping will start at start_addr and there will be num_elements
1204  * mappings.  The mappings are described by the data in mrp which may be
1205  * modified upon returning from this function.
1206  * Returns 0 for success or errno for failure.
1207  */
1208 static int
1209 mmapobj_map_elf(struct vnode *vp, caddr_t start_addr, mmapobj_result_t *mrp,
1210     int num_elements, cred_t *fcred, ushort_t e_type)
1211 {
1212 	int i;
1213 	int ret;
1214 	caddr_t lo;
1215 	caddr_t hi;
1216 	struct as *as = curproc->p_as;
1217 
1218 	for (i = 0; i < num_elements; i++) {
1219 		caddr_t addr;
1220 		size_t p_memsz;
1221 		size_t p_filesz;
1222 		size_t zfodlen;
1223 		offset_t p_offset;
1224 		size_t dif;
1225 		int prot;
1226 
1227 		/* Always need to adjust mr_addr */
1228 		addr = start_addr + (size_t)(mrp[i].mr_addr);
1229 		mrp[i].mr_addr =
1230 		    (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1231 
1232 		/* Padding has already been mapped */
1233 		if (MR_GET_TYPE(mrp[i].mr_flags) == MR_PADDING) {
1234 			continue;
1235 		}
1236 
1237 		/* Can't execute code from "noexec" mounted filesystem. */
1238 		if (((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0) &&
1239 		    ((mrp[i].mr_prot & PROT_EXEC) != 0)) {
1240 			MOBJ_STAT_ADD(noexec_fs);
1241 			return (EACCES);
1242 		}
1243 
1244 		p_memsz = mrp[i].mr_msize;
1245 		p_filesz = mrp[i].mr_fsize;
1246 		zfodlen = p_memsz - p_filesz;
1247 		p_offset = mrp[i].mr_offset;
1248 		dif = (uintptr_t)(addr) & PAGEOFFSET;
1249 		prot = mrp[i].mr_prot | PROT_USER;
1250 		ret = mmapobj_map_ptload(vp, addr, p_filesz, zfodlen,
1251 		    p_offset, prot, fcred);
1252 		if (ret != 0) {
1253 			MOBJ_STAT_ADD(ptload_failed);
1254 			mmapobj_unmap(mrp, i, num_elements, e_type);
1255 			return (ret);
1256 		}
1257 
1258 		/* Need to cleanup mrp to reflect the actual values used */
1259 		mrp[i].mr_msize += dif;
1260 		mrp[i].mr_offset = (size_t)addr & PAGEOFFSET;
1261 	}
1262 
1263 	/* Also need to unmap any holes created above */
1264 	if (num_elements == 1) {
1265 		MOBJ_STAT_ADD(map_elf_no_holes);
1266 		return (0);
1267 	}
1268 	if (e_type == ET_EXEC) {
1269 		return (0);
1270 	}
1271 
1272 	as_rangelock(as);
1273 	lo = start_addr;
1274 	hi = mrp[0].mr_addr;
1275 
1276 	/* Remove holes made by the rest of the segments */
1277 	for (i = 0; i < num_elements - 1; i++) {
1278 		lo = (caddr_t)P2ROUNDUP((size_t)(mrp[i].mr_addr) +
1279 		    mrp[i].mr_msize, PAGESIZE);
1280 		hi = mrp[i + 1].mr_addr;
1281 		if (lo < hi) {
1282 			/*
1283 			 * If as_unmap fails we just use up a bit of extra
1284 			 * space
1285 			 */
1286 			(void) as_unmap(as, (caddr_t)lo,
1287 			    (size_t)hi - (size_t)lo);
1288 			MOBJ_STAT_ADD(unmap_hole);
1289 		}
1290 	}
1291 	as_rangeunlock(as);
1292 
1293 	return (0);
1294 }
1295 
1296 /* Ugly hack to get STRUCT_* macros to work below */
1297 struct myphdr {
1298 	Phdr		x;	/* native version */
1299 };
1300 
1301 struct myphdr32 {
1302 	Elf32_Phdr	x;
1303 };
1304 
1305 /*
1306  * Calculate and return the number of loadable segments in the ELF Phdr
1307  * represented by phdrbase as well as the len of the total mapping and
1308  * the max alignment that is needed for a given segment.  On success,
1309  * 0 is returned, and *len, *loadable and *align have been filled out.
1310  * On failure, errno will be returned, which in this case is ENOTSUP
1311  * if we were passed an ELF file with overlapping segments.
1312  */
1313 static int
1314 calc_loadable(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, size_t *len,
1315     int *loadable, size_t *align)
1316 {
1317 	int i;
1318 	int hsize;
1319 	model_t model;
1320 	ushort_t e_type = ehdrp->e_type;	/* same offset 32 and 64 bit */
1321 	uint_t p_type;
1322 	offset_t p_offset;
1323 	size_t p_memsz;
1324 	size_t p_align;
1325 	caddr_t vaddr;
1326 	int num_segs = 0;
1327 	caddr_t start_addr = NULL;
1328 	caddr_t p_end = NULL;
1329 	size_t max_align = 0;
1330 	size_t min_align = PAGESIZE;	/* needed for vmem_xalloc */
1331 	STRUCT_HANDLE(myphdr, mph);
1332 #if defined(__sparc)
1333 	extern int vac_size;
1334 
1335 	/*
1336 	 * Want to prevent aliasing by aligning the start address to at least
1337 	 * vac_size.
1338 	 */
1339 	min_align = MAX(PAGESIZE, vac_size);
1340 #endif
1341 
1342 	model = get_udatamodel();
1343 	STRUCT_SET_HANDLE(mph, model, (struct myphdr *)phdrbase);
1344 
1345 	/* hsize alignment should have been checked before calling this func */
1346 	if (model == DATAMODEL_LP64) {
1347 		hsize = ehdrp->e_phentsize;
1348 		if (hsize & 7) {
1349 			return (ENOTSUP);
1350 		}
1351 	} else {
1352 		ASSERT(model == DATAMODEL_ILP32);
1353 		hsize = ((Elf32_Ehdr *)ehdrp)->e_phentsize;
1354 		if (hsize & 3) {
1355 			return (ENOTSUP);
1356 		}
1357 	}
1358 
1359 	/*
1360 	 * Determine the span of all loadable segments and calculate the
1361 	 * number of loadable segments.
1362 	 */
1363 	for (i = 0; i < nphdrs; i++) {
1364 		p_type = STRUCT_FGET(mph, x.p_type);
1365 		if (p_type == PT_LOAD || p_type == PT_SUNWBSS) {
1366 			vaddr = (caddr_t)(uintptr_t)STRUCT_FGET(mph, x.p_vaddr);
1367 			p_memsz = STRUCT_FGET(mph, x.p_memsz);
1368 
1369 			/*
1370 			 * Skip this header if it requests no memory to be
1371 			 * mapped.
1372 			 */
1373 			if (p_memsz == 0) {
1374 				STRUCT_SET_HANDLE(mph, model,
1375 				    (struct myphdr *)((size_t)STRUCT_BUF(mph) +
1376 				    hsize));
1377 				MOBJ_STAT_ADD(nomem_header);
1378 				continue;
1379 			}
1380 			if (num_segs++ == 0) {
1381 				/*
1382 				 * The p_vaddr of the first PT_LOAD segment
1383 				 * must either be NULL or within the first
1384 				 * page in order to be interpreted.
1385 				 * Otherwise, it's an invalid file.
1386 				 */
1387 				if (e_type == ET_DYN &&
1388 				    ((caddr_t)((uintptr_t)vaddr &
1389 				    (uintptr_t)PAGEMASK) != NULL)) {
1390 					MOBJ_STAT_ADD(inval_header);
1391 					return (ENOTSUP);
1392 				}
1393 				start_addr = vaddr;
1394 				/*
1395 				 * For the first segment, we need to map from
1396 				 * the beginning of the file, so we will
1397 				 * adjust the size of the mapping to include
1398 				 * this memory.
1399 				 */
1400 				p_offset = STRUCT_FGET(mph, x.p_offset);
1401 			} else {
1402 				p_offset = 0;
1403 			}
1404 			/*
1405 			 * Check to make sure that this mapping wouldn't
1406 			 * overlap a previous mapping.
1407 			 */
1408 			if (vaddr < p_end) {
1409 				MOBJ_STAT_ADD(overlap_header);
1410 				return (ENOTSUP);
1411 			}
1412 
1413 			p_end = vaddr + p_memsz + p_offset;
1414 			p_end = (caddr_t)P2ROUNDUP((size_t)p_end, PAGESIZE);
1415 
1416 			p_align = STRUCT_FGET(mph, x.p_align);
1417 			if (p_align > 1 && p_align > max_align) {
1418 				max_align = p_align;
1419 				if (max_align < min_align) {
1420 					max_align = min_align;
1421 					MOBJ_STAT_ADD(min_align);
1422 				}
1423 			}
1424 		}
1425 		STRUCT_SET_HANDLE(mph, model,
1426 		    (struct myphdr *)((size_t)STRUCT_BUF(mph) + hsize));
1427 	}
1428 
1429 	/*
1430 	 * The alignment should be a power of 2; if it isn't, we forgive it
1431 	 * and round up.  On overflow, we'll set the alignment to max_align
1432 	 * rounded down to the nearest power of 2.
1433 	 */
1434 	if (max_align > 0 && !ISP2(max_align)) {
1435 		MOBJ_STAT_ADD(np2_align);
1436 		*align = 2 * (1L << (highbit(max_align) - 1));
1437 		if (*align < max_align ||
1438 		    (*align > UINT_MAX && model == DATAMODEL_ILP32)) {
1439 			MOBJ_STAT_ADD(np2_align_overflow);
1440 			*align = 1L << (highbit(max_align) - 1);
1441 		}
1442 	} else {
1443 		*align = max_align;
1444 	}
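	/*
	 * For example, a max_align of 0x3000 is not a power of 2; highbit()
	 * returns 14, so the expression above rounds it up to 0x4000.  Only
	 * if that doubling had overflowed would we fall back to rounding
	 * down (0x2000 in this example).
	 */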
1445 
1446 	ASSERT(*align >= PAGESIZE || *align == 0);
1447 
1448 	*loadable = num_segs;
1449 	*len = p_end - start_addr;
1450 	return (0);
1451 }
1452 
1453 /*
1454  * Check the address space to see if the virtual addresses to be used are
1455  * available.  If they are not, return errno for failure.  On success, 0
1456  * will be returned, and the virtual addresses for each mmapobj_result_t
1457  * will be reserved.  Note that a reservation could have earlier been made
1458  * for a given segment via a /dev/null mapping.  If that is the case, then
1459  * we can use that VA space for our mappings.
1460  * Note: this function will only be used for ET_EXEC binaries.
1461  */
1462 int
1463 check_exec_addrs(int loadable, mmapobj_result_t *mrp, caddr_t start_addr)
1464 {
1465 	int i;
1466 	struct as *as = curproc->p_as;
1467 	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
1468 	int ret;
1469 	caddr_t myaddr;
1470 	size_t mylen;
1471 	struct seg *seg;
1472 
1473 	/* No need to reserve swap space now since it will be reserved later */
1474 	crargs.flags |= MAP_NORESERVE;
1475 	as_rangelock(as);
1476 	for (i = 0; i < loadable; i++) {
1477 
1478 		myaddr = start_addr + (size_t)mrp[i].mr_addr;
1479 		mylen = mrp[i].mr_msize;
1480 
1481 		/* See if there is a hole in the as for this range */
1482 		if (as_gap(as, mylen, &myaddr, &mylen, 0, NULL) == 0) {
1483 			ASSERT(myaddr == start_addr + (size_t)mrp[i].mr_addr);
1484 			ASSERT(mylen == mrp[i].mr_msize);
1485 
1486 #ifdef DEBUG
1487 			if (MR_GET_TYPE(mrp[i].mr_flags) == MR_PADDING) {
1488 				MOBJ_STAT_ADD(exec_padding);
1489 			}
1490 #endif
1491 			ret = as_map(as, myaddr, mylen, segvn_create, &crargs);
1492 			if (ret) {
1493 				as_rangeunlock(as);
1494 				mmapobj_unmap_exec(mrp, i, start_addr);
1495 				return (ret);
1496 			}
1497 		} else {
1498 			/*
1499 			 * There is a mapping that exists in the range
1500 			 * so check to see if it was a "reservation"
1501 			 * from /dev/null.  The mapping is from
1502 			 * /dev/null if the mapping comes from
1503 			 * segdev and the type is neither MAP_SHARED
1504 			 * nor MAP_PRIVATE.
1505 			 */
1506 			AS_LOCK_ENTER(as, RW_READER);
1507 			seg = as_findseg(as, myaddr, 0);
1508 			MOBJ_STAT_ADD(exec_addr_mapped);
1509 			if (seg && seg->s_ops == &segdev_ops &&
1510 			    ((SEGOP_GETTYPE(seg, myaddr) &
1511 			    (MAP_SHARED | MAP_PRIVATE)) == 0) &&
1512 			    myaddr >= seg->s_base &&
1513 			    myaddr + mylen <=
1514 			    seg->s_base + seg->s_size) {
1515 				MOBJ_STAT_ADD(exec_addr_devnull);
1516 				AS_LOCK_EXIT(as);
1517 				(void) as_unmap(as, myaddr, mylen);
1518 				ret = as_map(as, myaddr, mylen, segvn_create,
1519 				    &crargs);
1520 				mrp[i].mr_flags |= MR_RESV;
1521 				if (ret) {
1522 					as_rangeunlock(as);
1523 					/* Need to remap what we unmapped */
1524 					mmapobj_unmap_exec(mrp, i + 1,
1525 					    start_addr);
1526 					return (ret);
1527 				}
1528 			} else {
1529 				AS_LOCK_EXIT(as);
1530 				as_rangeunlock(as);
1531 				mmapobj_unmap_exec(mrp, i, start_addr);
1532 				MOBJ_STAT_ADD(exec_addr_in_use);
1533 				return (EADDRINUSE);
1534 			}
1535 		}
1536 	}
1537 	as_rangeunlock(as);
1538 	return (0);
1539 }
1540 
1541 /*
1542  * Walk through the ELF program headers and extract all useful information
1543  * for PT_LOAD and PT_SUNWBSS segments into mrp.
1544  * Return 0 on success or error on failure.
1545  */
1546 static int
1547 process_phdrs(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, mmapobj_result_t *mrp,
1548     vnode_t *vp, uint_t *num_mapped, size_t padding, cred_t *fcred)
1549 {
1550 	int i;
1551 	caddr_t start_addr = NULL;
1552 	caddr_t vaddr;
1553 	size_t len = 0;
1554 	size_t lib_len = 0;
1555 	int ret;
1556 	int prot;
1557 	struct lib_va *lvp = NULL;
1558 	vattr_t vattr;
1559 	struct as *as = curproc->p_as;
1560 	int error;
1561 	int loadable = 0;
1562 	int current = 0;
1563 	int use_lib_va = 1;
1564 	size_t align = 0;
1565 	size_t add_pad = 0;
1566 	int hdr_seen = 0;
1567 	ushort_t e_type = ehdrp->e_type;	/* same offset 32 and 64 bit */
1568 	uint_t p_type;
1569 	offset_t p_offset;
1570 	size_t p_memsz;
1571 	size_t p_filesz;
1572 	uint_t p_flags;
1573 	int hsize;
1574 	model_t model;
1575 	STRUCT_HANDLE(myphdr, mph);
1576 
1577 	model = get_udatamodel();
1578 	STRUCT_SET_HANDLE(mph, model, (struct myphdr *)phdrbase);
1579 
1580 	/*
1581 	 * Need to make sure that hsize is aligned properly.
1582 	 * For 32bit processes, 4 byte alignment is required.
1583 	 * For 64bit processes, 8 byte alignment is required.
1584 	 * If the alignment isn't correct, we need to return failure
1585 	 * since it could cause an alignment error panic while walking
1586 	 * the phdr array.
1587 	 */
1588 	if (model == DATAMODEL_LP64) {
1589 		hsize = ehdrp->e_phentsize;
1590 		if (hsize & 7) {
1591 			MOBJ_STAT_ADD(phent_align64);
1592 			return (ENOTSUP);
1593 		}
1594 	} else {
1595 		ASSERT(model == DATAMODEL_ILP32);
1596 		hsize = ((Elf32_Ehdr *)ehdrp)->e_phentsize;
1597 		if (hsize & 3) {
1598 			MOBJ_STAT_ADD(phent_align32);
1599 			return (ENOTSUP);
1600 		}
1601 	}
1602 
1603 	if ((padding != 0) || secflag_enabled(curproc, PROC_SEC_ASLR)) {
1604 		use_lib_va = 0;
1605 	}
1606 	if (e_type == ET_DYN) {
1607 		vattr.va_mask = AT_FSID | AT_NODEID | AT_CTIME | AT_MTIME;
1608 		error = VOP_GETATTR(vp, &vattr, 0, fcred, NULL);
1609 		if (error) {
1610 			return (error);
1611 		}
1612 		/* Check to see if we already have a description for this lib */
1613 		if (!secflag_enabled(curproc, PROC_SEC_ASLR))
1614 			lvp = lib_va_find(&vattr);
1615 
1616 		if (lvp != NULL) {
1617 			MOBJ_STAT_ADD(lvp_found);
1618 			if (use_lib_va) {
1619 				start_addr = mmapobj_lookup_start_addr(lvp);
1620 				if (start_addr == NULL) {
1621 					lib_va_release(lvp);
1622 					return (ENOMEM);
1623 				}
1624 			}
1625 
1626 			/*
1627 			 * loadable may be zero if the original allocator
1628 			 * of lvp hasn't finished setting it up but the rest
1629 			 * of the fields will be accurate.
1630 			 */
1631 			loadable = lvp->lv_num_segs;
1632 			len = lvp->lv_len;
1633 			align = lvp->lv_align;
1634 		}
1635 	}
1636 
1637 	/*
1638 	 * Determine the span of all loadable segments and calculate the
1639 	 * number of loadable segments, the total len spanned by the mappings
1640 	 * and the max alignment, if we didn't get them above.
1641 	 */
1642 	if (loadable == 0) {
1643 		MOBJ_STAT_ADD(no_loadable_yet);
1644 		ret = calc_loadable(ehdrp, phdrbase, nphdrs, &len,
1645 		    &loadable, &align);
1646 		if (ret != 0) {
1647 			/*
1648 			 * Since it'd be an invalid file, we shouldn't have
1649 			 * cached it previously.
1650 			 */
1651 			ASSERT(lvp == NULL);
1652 			return (ret);
1653 		}
1654 #ifdef DEBUG
1655 		if (lvp) {
1656 			ASSERT(len == lvp->lv_len);
1657 			ASSERT(align == lvp->lv_align);
1658 		}
1659 #endif
1660 	}
1661 
1662 	/* Make sure there's something to map. */
1663 	if (len == 0 || loadable == 0) {
1664 		/*
1665 		 * Since it'd be an invalid file, we shouldn't have
1666 		 * cached it previously.
1667 		 */
1668 		ASSERT(lvp == NULL);
1669 		MOBJ_STAT_ADD(nothing_to_map);
1670 		return (ENOTSUP);
1671 	}
1672 
1673 	lib_len = len;
1674 	if (padding != 0) {
1675 		loadable += 2;
1676 	}
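	/*
	 * If the caller's result array is too small, report the number of
	 * entries actually needed through *num_mapped and fail with E2BIG;
	 * an mmapobj(2) caller is expected to retry with a larger array.
	 */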
1677 	if (loadable > *num_mapped) {
1678 		*num_mapped = loadable;
1679 		/* cleanup previous reservation */
1680 		if (start_addr) {
1681 			(void) as_unmap(as, start_addr, lib_len);
1682 		}
1683 		MOBJ_STAT_ADD(e2big);
1684 		if (lvp) {
1685 			lib_va_release(lvp);
1686 		}
1687 		return (E2BIG);
1688 	}
1689 
1690 	/*
1691 	 * We now know the size of the object to map and now we need to
1692 	 * get the start address to map it at.  It's possible we already
1693 	 * have it if we found all the info we need in the lib_va cache.
1694 	 */
1695 	if (e_type == ET_DYN && start_addr == NULL) {
1696 		/*
1697 		 * Need to make sure padding does not throw off
1698 		 * required alignment.  We can only specify an
1699 		 * alignment for the starting address to be mapped,
1700 		 * so we round padding up to the alignment and map
1701 		 * from there and then throw out the extra later.
1702 		 */
1703 		if (padding != 0) {
1704 			if (align > 1) {
1705 				add_pad = P2ROUNDUP(padding, align);
1706 				len += add_pad;
1707 				MOBJ_STAT_ADD(dyn_pad_align);
1708 			} else {
1709 				MOBJ_STAT_ADD(dyn_pad_noalign);
1710 				len += padding;	/* at beginning */
1711 			}
1712 			len += padding;	/* at end of mapping */
1713 		}
1714 		/*
1715 		 * At this point, if lvp is non-NULL, then above we
1716 		 * already found it in the cache but did not get
1717 		 * the start address since we were not going to use lib_va.
1718 		 * Since we know that lib_va will not be used, it's safe
1719 		 * to call mmapobj_alloc_start_addr and know that lvp
1720 		 * will not be modified.
1721 		 */
1722 		ASSERT(lvp ? use_lib_va == 0 : 1);
1723 		start_addr = mmapobj_alloc_start_addr(&lvp, len,
1724 		    use_lib_va,
1725 		    secflag_enabled(curproc, PROC_SEC_ASLR),
1726 		    align, &vattr);
1727 		if (start_addr == NULL) {
1728 			if (lvp) {
1729 				lib_va_release(lvp);
1730 			}
1731 			MOBJ_STAT_ADD(alloc_start_fail);
1732 			return (ENOMEM);
1733 		}
1734 		/*
1735 		 * If we can't cache it, there is no need to hang on to it.
1736 		 * Setting lv_num_segs to a non-zero value makes that
1737 		 * field active, and since there are too many segments
1738 		 * to cache, future users will not try to use lv_mps.
1739 		 */
1740 		if (lvp != NULL && loadable > LIBVA_CACHED_SEGS && use_lib_va) {
1741 			lvp->lv_num_segs = loadable;
1742 			lib_va_release(lvp);
1743 			lvp = NULL;
1744 			MOBJ_STAT_ADD(lvp_nocache);
1745 		}
1746 		/*
1747 		 * Free the beginning of the mapping if the padding
1748 		 * was not aligned correctly.
1749 		 */
1750 		if (padding != 0 && add_pad != padding) {
1751 			(void) as_unmap(as, start_addr,
1752 			    add_pad - padding);
1753 			start_addr += (add_pad - padding);
1754 			MOBJ_STAT_ADD(extra_padding);
1755 		}
1756 	}
1757 
1758 	/*
1759 	 * At this point, we have reserved the virtual address space
1760 	 * for our mappings.  Now we need to start filling out the mrp
1761 	 * array to describe all of the individual mappings we are going
1762 	 * to return.
1763 	 * For ET_EXEC there has been no memory reservation since we are
1764 	 * using fixed addresses.  While filling in the mrp array below,
1765 	 * we will have the first segment biased to start at addr 0
1766 	 * and the rest will be biased by this same amount.  Thus if there
1767 	 * is padding, the first padding will start at addr 0, and the next
1768 	 * segment will start at the value of padding.
1769 	 */
1770 
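	/*
	 * Illustrative summary (editorial note, no new behavior): when
	 * padding is requested, the mrp array built below is laid out as
	 *	mrp[0]			leading padding (MR_PADDING)
	 *	mrp[1 .. loadable - 2]	PT_LOAD/PT_SUNWBSS segments, in
	 *				program header order
	 *	mrp[loadable - 1]	trailing padding (MR_PADDING)
	 * which is why loadable was bumped by 2 earlier.
	 */
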
1771 	/* We'll fill out padding later, so start filling in mrp at index 1 */
1772 	if (padding != 0) {
1773 		current = 1;
1774 	}
1775 
1776 	/* If we have no more need for lvp let it go now */
1777 	if (lvp != NULL && use_lib_va == 0) {
1778 		lib_va_release(lvp);
1779 		MOBJ_STAT_ADD(lvp_not_needed);
1780 		lvp = NULL;
1781 	}
1782 
1783 	/* Now fill out the mrp structs from the program headers */
1784 	STRUCT_SET_HANDLE(mph, model, (struct myphdr *)phdrbase);
1785 	for (i = 0; i < nphdrs; i++) {
1786 		p_type = STRUCT_FGET(mph, x.p_type);
1787 		if (p_type == PT_LOAD || p_type == PT_SUNWBSS) {
1788 			vaddr = (caddr_t)(uintptr_t)STRUCT_FGET(mph, x.p_vaddr);
1789 			p_memsz = STRUCT_FGET(mph, x.p_memsz);
1790 			p_filesz = STRUCT_FGET(mph, x.p_filesz);
1791 			p_offset = STRUCT_FGET(mph, x.p_offset);
1792 			p_flags = STRUCT_FGET(mph, x.p_flags);
1793 
1794 			/*
1795 			 * Skip this header if it requests no memory to be
1796 			 * mapped.
1797 			 */
1798 			if (p_memsz == 0) {
1799 				STRUCT_SET_HANDLE(mph, model,
1800 				    (struct myphdr *)((size_t)STRUCT_BUF(mph) +
1801 				    hsize));
1802 				MOBJ_STAT_ADD(no_mem_map_sz);
1803 				continue;
1804 			}
1805 
1806 			prot = 0;
1807 			if (p_flags & PF_R)
1808 				prot |= PROT_READ;
1809 			if (p_flags & PF_W)
1810 				prot |= PROT_WRITE;
1811 			if (p_flags & PF_X)
1812 				prot |= PROT_EXEC;
1813 
1814 			ASSERT(current < loadable);
1815 			mrp[current].mr_msize = p_memsz;
1816 			mrp[current].mr_fsize = p_filesz;
1817 			mrp[current].mr_offset = p_offset;
1818 			mrp[current].mr_prot = prot;
1819 
1820 			if (hdr_seen == 0 && p_filesz != 0) {
1821 				mrp[current].mr_flags = MR_HDR_ELF;
1822 				/*
1823 				 * We modify mr_offset because we
1824 				 * need to map the ELF header as well; if
1825 				 * we didn't, the header could be left out
1826 				 * of the mapping that we will create later.
1827 				 * Since we're zeroing the offset, we need to
1828 				 * account for it in mr_msize and mr_fsize as
1829 				 * well, since we will now be mapping the file
1830 				 * from offset 0 rather than from p_offset.
1831 				 */
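				/*
				 * Illustrative numbers (editorial note): if
				 * this first PT_LOAD of an ET_DYN object has
				 * p_offset 0x74, we instead map from file
				 * offset 0 and grow mr_msize and mr_fsize by
				 * 0x74, so the Ehdr and program headers are
				 * included in the mapping.
				 */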
1832 				if (e_type == ET_DYN) {
1833 					mrp[current].mr_offset = 0;
1834 					mrp[current].mr_msize += p_offset;
1835 					mrp[current].mr_fsize += p_offset;
1836 				} else {
1837 					ASSERT(e_type == ET_EXEC);
1838 					/*
1839 					 * Save off the start addr which will be
1840 					 * our bias for the rest of the
1841 					 * ET_EXEC mappings.
1842 					 */
1843 					start_addr = vaddr - padding;
1844 				}
1845 				mrp[current].mr_addr = (caddr_t)padding;
1846 				hdr_seen = 1;
1847 			} else {
1848 				if (e_type == ET_EXEC) {
1849 					/* bias mr_addr */
1850 					mrp[current].mr_addr =
1851 					    vaddr - (size_t)start_addr;
1852 				} else {
1853 					mrp[current].mr_addr = vaddr + padding;
1854 				}
1855 				mrp[current].mr_flags = 0;
1856 			}
1857 			current++;
1858 		}
1859 
1860 		/* Move to next phdr */
1861 		STRUCT_SET_HANDLE(mph, model,
1862 		    (struct myphdr *)((size_t)STRUCT_BUF(mph) +
1863 		    hsize));
1864 	}
1865 
1866 	/* Now fill out the padding segments */
1867 	if (padding != 0) {
1868 		mrp[0].mr_addr = NULL;
1869 		mrp[0].mr_msize = padding;
1870 		mrp[0].mr_fsize = 0;
1871 		mrp[0].mr_offset = 0;
1872 		mrp[0].mr_prot = 0;
1873 		mrp[0].mr_flags = MR_PADDING;
1874 
1875 		/* Setup padding for the last segment */
1876 		ASSERT(current == loadable - 1);
1877 		mrp[current].mr_addr = (caddr_t)lib_len + padding;
1878 		mrp[current].mr_msize = padding;
1879 		mrp[current].mr_fsize = 0;
1880 		mrp[current].mr_offset = 0;
1881 		mrp[current].mr_prot = 0;
1882 		mrp[current].mr_flags = MR_PADDING;
1883 	}
1884 
1885 	/*
1886 	 * Need to make sure address ranges desired are not in use or
1887 	 * are previously allocated reservations from /dev/null.  For
1888 	 * ET_DYN, we already made sure our address range was free.
1889 	 */
1890 	if (e_type == ET_EXEC) {
1891 		ret = check_exec_addrs(loadable, mrp, start_addr);
1892 		if (ret != 0) {
1893 			ASSERT(lvp == NULL);
1894 			MOBJ_STAT_ADD(check_exec_failed);
1895 			return (ret);
1896 		}
1897 	}
1898 
1899 	/* Finish up our business with lvp. */
1900 	if (lvp) {
1901 		ASSERT(e_type == ET_DYN);
1902 		if (lvp->lv_num_segs == 0 && loadable <= LIBVA_CACHED_SEGS) {
1903 			bcopy(mrp, lvp->lv_mps,
1904 			    loadable * sizeof (mmapobj_result_t));
1905 			membar_producer();
1906 		}
1907 		/*
1908 		 * Setting lv_num_segs to a non-zero value indicates that
1909 		 * lv_mps is now valid and can be used by other threads.
1910 		 * So, the above stores need to finish before lv_num_segs
1911 		 * is updated. lv_mps is only valid if lv_num_segs is
1912 		 * non-zero and no greater than LIBVA_CACHED_SEGS.
1913 		 */
1914 		lvp->lv_num_segs = loadable;
1915 		lib_va_release(lvp);
1916 		MOBJ_STAT_ADD(lvp_used);
1917 	}
1918 
1919 	/* Now that we have mrp completely filled out go map it */
1920 	ret = mmapobj_map_elf(vp, start_addr, mrp, loadable, fcred, e_type);
1921 	if (ret == 0) {
1922 		*num_mapped = loadable;
1923 	}
1924 
1925 	return (ret);
1926 }
1927 
1928 /*
1929  * Take the ELF file passed in, and do the work of mapping it.
1930  * num_mapped in - # elements in user buffer
1931  * num_mapped out - # sections mapped and length of mrp array if
1932  *			no errors.
1933  */
1934 static int
1935 doelfwork(Ehdr *ehdrp, vnode_t *vp, mmapobj_result_t *mrp,
1936     uint_t *num_mapped, size_t padding, cred_t *fcred)
1937 {
1938 	int error;
1939 	offset_t phoff;
1940 	int nphdrs;
1941 	unsigned char ei_class;
1942 	unsigned short phentsize;
1943 	ssize_t phsizep;
1944 	caddr_t phbasep;
1945 	int to_map;
1946 	model_t model;
1947 
1948 	ei_class = ehdrp->e_ident[EI_CLASS];
1949 	model = get_udatamodel();
1950 	if ((model == DATAMODEL_ILP32 && ei_class == ELFCLASS64) ||
1951 	    (model == DATAMODEL_LP64 && ei_class == ELFCLASS32)) {
1952 		MOBJ_STAT_ADD(wrong_model);
1953 		return (ENOTSUP);
1954 	}
1955 
1956 	/* Can't execute code from "noexec" mounted filesystem. */
1957 	if (ehdrp->e_type == ET_EXEC &&
1958 	    (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0) {
1959 		MOBJ_STAT_ADD(noexec_fs);
1960 		return (EACCES);
1961 	}
1962 
1963 	/*
1964 	 * Relocatable and core files are mapped as a single flat file
1965 	 * since no interpretation is done on them by mmapobj.
1966 	 */
1967 	if (ehdrp->e_type == ET_REL || ehdrp->e_type == ET_CORE) {
1968 		to_map = padding ? 3 : 1;
1969 		if (*num_mapped < to_map) {
1970 			*num_mapped = to_map;
1971 			MOBJ_STAT_ADD(e2big_et_rel);
1972 			return (E2BIG);
1973 		}
1974 		error = mmapobj_map_flat(vp, mrp, padding, fcred);
1975 		if (error == 0) {
1976 			*num_mapped = to_map;
1977 			mrp[padding ? 1 : 0].mr_flags = MR_HDR_ELF;
1978 			MOBJ_STAT_ADD(et_rel_mapped);
1979 		}
1980 		return (error);
1981 	}
1982 
1983 	/* Check for an unknown ELF type */
1984 	if (ehdrp->e_type != ET_EXEC && ehdrp->e_type != ET_DYN) {
1985 		MOBJ_STAT_ADD(unknown_elf_type);
1986 		return (ENOTSUP);
1987 	}
1988 
1989 	if (ei_class == ELFCLASS32) {
1990 		Elf32_Ehdr *e32hdr = (Elf32_Ehdr *)ehdrp;
1991 		ASSERT(model == DATAMODEL_ILP32);
1992 		nphdrs = e32hdr->e_phnum;
1993 		phentsize = e32hdr->e_phentsize;
1994 		if (phentsize < sizeof (Elf32_Phdr)) {
1995 			MOBJ_STAT_ADD(phent32_too_small);
1996 			return (ENOTSUP);
1997 		}
1998 		phoff = e32hdr->e_phoff;
1999 	} else if (ei_class == ELFCLASS64) {
2000 		Elf64_Ehdr *e64hdr = (Elf64_Ehdr *)ehdrp;
2001 		ASSERT(model == DATAMODEL_LP64);
2002 		nphdrs = e64hdr->e_phnum;
2003 		phentsize = e64hdr->e_phentsize;
2004 		if (phentsize < sizeof (Elf64_Phdr)) {
2005 			MOBJ_STAT_ADD(phent64_too_small);
2006 			return (ENOTSUP);
2007 		}
2008 		phoff = e64hdr->e_phoff;
2009 	} else {
2010 		/* fallthrough case for an invalid ELF class */
2011 		MOBJ_STAT_ADD(inval_elf_class);
2012 		return (ENOTSUP);
2013 	}
2014 
2015 	/*
2016 	 * nphdrs should only be PN_XNUM for core files, which are handled
2017 	 * above as a single mapping.  If other file types ever use this
2018 	 * sentinel, then we'll add the support needed to handle them here.
2019 	 */
2020 	if (nphdrs == PN_XNUM) {
2021 		MOBJ_STAT_ADD(too_many_phdrs);
2022 		return (ENOTSUP);
2023 	}
2024 
2025 	phsizep = nphdrs * phentsize;
2026 
2027 	if (phsizep == 0) {
2028 		MOBJ_STAT_ADD(no_phsize);
2029 		return (ENOTSUP);
2030 	}
2031 
2032 	/* Make sure we only wait for memory if it's a reasonable request */
2033 	if (phsizep > mmapobj_alloc_threshold) {
2034 		MOBJ_STAT_ADD(phsize_large);
2035 		if ((phbasep = kmem_alloc(phsizep, KM_NOSLEEP)) == NULL) {
2036 			MOBJ_STAT_ADD(phsize_xtralarge);
2037 			return (ENOMEM);
2038 		}
2039 	} else {
2040 		phbasep = kmem_alloc(phsizep, KM_SLEEP);
2041 	}
2042 
2043 	if ((error = vn_rdwr(UIO_READ, vp, phbasep, phsizep,
2044 	    (offset_t)phoff, UIO_SYSSPACE, 0, (rlim64_t)0,
2045 	    fcred, NULL)) != 0) {
2046 		kmem_free(phbasep, phsizep);
2047 		return (error);
2048 	}
2049 
2050 	/* Now process the phdr's */
2051 	error = process_phdrs(ehdrp, phbasep, nphdrs, mrp, vp, num_mapped,
2052 	    padding, fcred);
2053 	kmem_free(phbasep, phsizep);
2054 	return (error);
2055 }
2056 
2057 #if defined(__sparc)
2058 /*
2059  * Hack to support 64 bit kernels running AOUT 4.x programs.
2060  * This is the sizeof (struct nlist) for a 32 bit kernel.
2061  * Since AOUT programs are 32 bit only, they will never use the 64 bit
2062  * sizeof (struct nlist), so a #define is the simplest way around this,
2063  * given that this is a format which is no longer being updated.
2064  * This will be used in place of sizeof (struct nlist) below.
2065  */
2066 #define	NLIST_SIZE	(0xC)
2067 
2068 static int
2069 doaoutwork(vnode_t *vp, mmapobj_result_t *mrp,
2070     uint_t *num_mapped, struct exec *hdr, cred_t *fcred)
2071 {
2072 	int error;
2073 	size_t size;
2074 	size_t osize;
2075 	size_t nsize;	/* nlist size */
2076 	size_t msize;
2077 	size_t zfoddiff;
2078 	caddr_t addr;
2079 	caddr_t start_addr;
2080 	struct as *as = curproc->p_as;
2081 	int prot = PROT_USER | PROT_READ | PROT_EXEC;
2082 	uint_t mflag = MAP_PRIVATE | _MAP_LOW32;
2083 	offset_t off = 0;
2084 	int segnum = 0;
2085 	uint_t to_map;
2086 	int is_library = 0;
2087 	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
2088 
2089 	/* Only 32bit apps supported by this file format */
2090 	if (get_udatamodel() != DATAMODEL_ILP32) {
2091 		MOBJ_STAT_ADD(aout_64bit_try);
2092 		return (ENOTSUP);
2093 	}
2094 
2095 	/* Check to see if this is a library */
2096 	if (hdr->a_magic == ZMAGIC && hdr->a_entry < PAGESIZE) {
2097 		is_library = 1;
2098 	}
2099 
2100 	/*
2101 	 * Can't execute code from a "noexec" mounted filesystem.  Unlike ELF,
2102 	 * AOUT libraries are always mapped with PROT_EXEC, so this doesn't
2103 	 * need to be checked for specific parts of the mapping.
2104 	 */
2105 	if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0) {
2106 		MOBJ_STAT_ADD(aout_noexec);
2107 		return (EACCES);
2108 	}
2109 
2110 	/*
2111 	 * There are 2 ways to calculate the mapped size of the executable:
2112 	 * 1) rounded text size + data size + bss size.
2113 	 * 2) starting offset for text + text size + data size + text relocation
2114 	 *    size + data relocation size + room for nlist data structure.
2115 	 *
2116 	 * The larger of the two sizes will be used to map this binary.
2117 	 */
2118 	osize = P2ROUNDUP(hdr->a_text, PAGESIZE) + hdr->a_data + hdr->a_bss;
2119 
2120 	off = hdr->a_magic == ZMAGIC ? 0 : sizeof (struct exec);
2121 
2122 	nsize = off + hdr->a_text + hdr->a_data + hdr->a_trsize +
2123 	    hdr->a_drsize + NLIST_SIZE;
2124 
2125 	size = MAX(osize, nsize);
2126 	if (size != nsize) {
2127 		nsize = 0;
2128 	}
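
	/*
	 * Editorial note: nsize is cleared above when the rounded
	 * text + data + bss size already covers the nlist area, so no
	 * separate nlist segment is needed; the nsize checks below rely
	 * on this.
	 */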
2129 
2130 	/*
2131 	 * 1 seg for text and 1 seg for initialized data.
2132 	 * 1 seg for bss (if it can't fit in the leftover space of init data)
2133 	 * 1 seg for nlist if needed.
2134 	 */
2135 	to_map = 2 + (nsize ? 1 : 0) +
2136 	    (hdr->a_bss > PAGESIZE - P2PHASE(hdr->a_data, PAGESIZE) ? 1 : 0);
2137 	if (*num_mapped < to_map) {
2138 		*num_mapped = to_map;
2139 		MOBJ_STAT_ADD(aout_e2big);
2140 		return (E2BIG);
2141 	}
2142 
2143 	/* Reserve address space for the whole mapping */
2144 	if (is_library) {
2145 		/* We'll let VOP_MAP below pick our address for us */
2146 		addr = NULL;
2147 		MOBJ_STAT_ADD(aout_lib);
2148 	} else {
2149 		/*
2150 		 * Default start address for fixed binaries, from the AOUT
2151 		 * 4.x standard.
2152 		 */
2153 		MOBJ_STAT_ADD(aout_fixed);
2154 		mflag |= MAP_FIXED;
2155 		addr = (caddr_t)0x2000;
2156 		as_rangelock(as);
2157 		if (as_gap(as, size, &addr, &size, 0, NULL) != 0) {
2158 			as_rangeunlock(as);
2159 			MOBJ_STAT_ADD(aout_addr_in_use);
2160 			return (EADDRINUSE);
2161 		}
2162 		crargs.flags |= MAP_NORESERVE;
2163 		error = as_map(as, addr, size, segvn_create, &crargs);
2164 		ASSERT(addr == (caddr_t)0x2000);
2165 		as_rangeunlock(as);
2166 	}
2167 
2168 	start_addr = addr;
2169 	osize = size;
2170 
2171 	/*
2172 	 * Map as much as we need, backed by the file; this will be the text,
2173 	 * and possibly the nlist segment.  We map over this mapping for the
2174 	 * bss and initialized data segments.
2175 	 */
2176 	error = VOP_MAP(vp, off, as, &addr, size, prot, PROT_ALL,
2177 	    mflag, fcred, NULL);
2178 	if (error) {
2179 		if (!is_library) {
2180 			(void) as_unmap(as, start_addr, osize);
2181 		}
2182 		return (error);
2183 	}
2184 
2185 	/* pick up the values of start_addr and osize for libraries */
2186 	start_addr = addr;
2187 	osize = size;
2188 
2189 	/*
2190 	 * We have our initial reservation/allocation so we need to use fixed
2191 	 * addresses from now on.
2192 	 */
2193 	mflag |= MAP_FIXED;
2194 
2195 	mrp[0].mr_addr = addr;
2196 	mrp[0].mr_msize = hdr->a_text;
2197 	mrp[0].mr_fsize = hdr->a_text;
2198 	mrp[0].mr_offset = 0;
2199 	mrp[0].mr_prot = PROT_READ | PROT_EXEC;
2200 	mrp[0].mr_flags = MR_HDR_AOUT;
2201 
2202 
2203 	/*
2204 	 * Map initialized data. We are mapping over a portion of the
2205 	 * previous mapping which will be unmapped in VOP_MAP below.
2206 	 */
2207 	off = P2ROUNDUP((offset_t)(hdr->a_text), PAGESIZE);
2208 	msize = off;
2209 	addr += off;
2210 	size = hdr->a_data;
2211 	error = VOP_MAP(vp, off, as, &addr, size, PROT_ALL, PROT_ALL,
2212 	    mflag, fcred, NULL);
2213 	if (error) {
2214 		(void) as_unmap(as, start_addr, osize);
2215 		return (error);
2216 	}
2217 	msize += size;
2218 	mrp[1].mr_addr = addr;
2219 	mrp[1].mr_msize = size;
2220 	mrp[1].mr_fsize = size;
2221 	mrp[1].mr_offset = 0;
2222 	mrp[1].mr_prot = PROT_READ | PROT_WRITE | PROT_EXEC;
2223 	mrp[1].mr_flags = 0;
2224 
2225 	/* Need to zero out remainder of page */
2226 	addr += hdr->a_data;
2227 	zfoddiff = P2PHASE((size_t)addr, PAGESIZE);
2228 	if (zfoddiff) {
2229 		label_t ljb;
2230 
2231 		MOBJ_STAT_ADD(aout_zfoddiff);
2232 		zfoddiff = PAGESIZE - zfoddiff;
2233 		if (on_fault(&ljb)) {
2234 			no_fault();
2235 			MOBJ_STAT_ADD(aout_uzero_fault);
2236 			(void) as_unmap(as, start_addr, osize);
2237 			return (EFAULT);
2238 		}
2239 		uzero(addr, zfoddiff);
2240 		no_fault();
2241 	}
2242 	msize += zfoddiff;
2243 	segnum = 2;
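
	/*
	 * Illustrative example (editorial note, assumed values): if a_data
	 * ends 0x234 bytes into its last page, zfoddiff above is
	 * PAGESIZE - 0x234 and uzero() clears those trailing bytes, so any
	 * bss that shares a page with initialized data starts out zeroed.
	 */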
2244 
2245 	/* Map bss */
2246 	if (hdr->a_bss > zfoddiff) {
2247 		struct segvn_crargs crargs =
2248 		    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
2249 		MOBJ_STAT_ADD(aout_map_bss);
2250 		addr += zfoddiff;
2251 		size = hdr->a_bss - zfoddiff;
2252 		as_rangelock(as);
2253 		(void) as_unmap(as, addr, size);
2254 		error = as_map(as, addr, size, segvn_create, &crargs);
2255 		as_rangeunlock(as);
2256 		msize += size;
2257 
2258 		if (error) {
2259 			MOBJ_STAT_ADD(aout_bss_fail);
2260 			(void) as_unmap(as, start_addr, osize);
2261 			return (error);
2262 		}
2263 		mrp[2].mr_addr = addr;
2264 		mrp[2].mr_msize = size;
2265 		mrp[2].mr_fsize = 0;
2266 		mrp[2].mr_offset = 0;
2267 		mrp[2].mr_prot = PROT_READ | PROT_WRITE | PROT_EXEC;
2268 		mrp[2].mr_flags = 0;
2269 
2270 		addr += size;
2271 		segnum = 3;
2272 	}
2273 
2274 	/*
2275 	 * If we have extra bytes left over in the last page, include them in
2276 	 * how much we mapped to make sure the nlist logic below is correct.
2277 	 */
2278 	msize = P2ROUNDUP(msize, PAGESIZE);
2279 
2280 	if (nsize && msize < nsize) {
2281 		MOBJ_STAT_ADD(aout_nlist);
2282 		mrp[segnum].mr_addr = addr;
2283 		mrp[segnum].mr_msize = nsize - msize;
2284 		mrp[segnum].mr_fsize = 0;
2285 		mrp[segnum].mr_offset = 0;
2286 		mrp[segnum].mr_prot = PROT_READ | PROT_EXEC;
2287 		mrp[segnum].mr_flags = 0;
2288 	}
2289 
2290 	*num_mapped = to_map;
2291 	return (0);
2292 }
2293 #endif
2294 
2295 /*
2296  * These are the two types of files that we can interpret; we want to read
2297  * in enough data to cover both types when looking at the initial header.
2298  */
2299 #define	MAX_HEADER_SIZE	(MAX(sizeof (Ehdr), sizeof (struct exec)))
2300 
2301 /*
2302  * Interpret and map the vp passed in.  ELF and AOUT files will be
2303  * interpreted and mapped appropriately for execution.
2304  * num_mapped in - # elements in mrp
2305  * num_mapped out - # sections mapped and length of mrp array if
2306  *		    no errors or E2BIG returned.
2307  *
2308  * Returns 0 on success, errno value on failure.
2309  */
2310 static int
2311 mmapobj_map_interpret(vnode_t *vp, mmapobj_result_t *mrp,
2312     uint_t *num_mapped, size_t padding, cred_t *fcred)
2313 {
2314 	int error = 0;
2315 	vattr_t vattr;
2316 	struct lib_va *lvp;
2317 	caddr_t start_addr;
2318 	model_t model;
2319 
2320 	/*
2321 	 * The header has to be aligned to the native size of ulong_t in
2322 	 * order to avoid an unaligned access when dereferencing it as
2323 	 * a ulong_t.  Thus we allocate our array on the stack as type
2324 	 * ulong_t and then have header, which we later dereference as a
2325 	 * char array, point at lheader.
2326 	 */
2327 	ulong_t lheader[(MAX_HEADER_SIZE / (sizeof (ulong_t))) + 1];
2328 	caddr_t header = (caddr_t)&lheader;
2329 
2330 	vattr.va_mask = AT_FSID | AT_NODEID | AT_CTIME | AT_MTIME | AT_SIZE;
2331 	error = VOP_GETATTR(vp, &vattr, 0, fcred, NULL);
2332 	if (error) {
2333 		return (error);
2334 	}
2335 
2336 	/*
2337 	 * Check lib_va to see if we already have a full description
2338 	 * for this library.  This is the fast path and only used for
2339 	 * ET_DYN ELF files (dynamic libraries).
2340 	 */
2341 	if (padding == 0 && !secflag_enabled(curproc, PROC_SEC_ASLR) &&
2342 	    ((lvp = lib_va_find(&vattr)) != NULL)) {
2343 		int num_segs;
2344 
2345 		model = get_udatamodel();
2346 		if ((model == DATAMODEL_ILP32 &&
2347 		    lvp->lv_flags & LV_ELF64) ||
2348 		    (model == DATAMODEL_LP64 &&
2349 		    lvp->lv_flags & LV_ELF32)) {
2350 			lib_va_release(lvp);
2351 			MOBJ_STAT_ADD(fast_wrong_model);
2352 			return (ENOTSUP);
2353 		}
2354 		num_segs = lvp->lv_num_segs;
2355 		if (*num_mapped < num_segs) {
2356 			*num_mapped = num_segs;
2357 			lib_va_release(lvp);
2358 			MOBJ_STAT_ADD(fast_e2big);
2359 			return (E2BIG);
2360 		}
2361 
2362 		/*
2363 		 * Check to see if we have all the mappable program headers
2364 		 * cached.
2365 		 */
2366 		if (num_segs <= LIBVA_CACHED_SEGS && num_segs != 0) {
2367 			MOBJ_STAT_ADD(fast);
2368 			start_addr = mmapobj_lookup_start_addr(lvp);
2369 			if (start_addr == NULL) {
2370 				lib_va_release(lvp);
2371 				return (ENOMEM);
2372 			}
2373 
2374 			bcopy(lvp->lv_mps, mrp,
2375 			    num_segs * sizeof (mmapobj_result_t));
2376 
2377 			error = mmapobj_map_elf(vp, start_addr, mrp,
2378 			    num_segs, fcred, ET_DYN);
2379 
2380 			lib_va_release(lvp);
2381 			if (error == 0) {
2382 				*num_mapped = num_segs;
2383 				MOBJ_STAT_ADD(fast_success);
2384 			}
2385 			return (error);
2386 		}
2387 		MOBJ_STAT_ADD(fast_not_now);
2388 
2389 		/* Release it for now since we'll look it up below */
2390 		lib_va_release(lvp);
2391 	}
2392 
2393 	/*
2394 	 * Time to see if this is a file we can interpret.  If the file is
2395 	 * smaller than MAX_HEADER_SIZE, then we can't interpret it.
2396 	 */
2397 	if (vattr.va_size < MAX_HEADER_SIZE) {
2398 		MOBJ_STAT_ADD(small_file);
2399 		return (ENOTSUP);
2400 	}
2401 
2402 	if ((error = vn_rdwr(UIO_READ, vp, header, MAX_HEADER_SIZE, 0,
2403 	    UIO_SYSSPACE, 0, (rlim64_t)0, fcred, NULL)) != 0) {
2404 		MOBJ_STAT_ADD(read_error);
2405 		return (error);
2406 	}
2407 
2408 	/* Verify file type */
2409 	if (header[EI_MAG0] == ELFMAG0 && header[EI_MAG1] == ELFMAG1 &&
2410 	    header[EI_MAG2] == ELFMAG2 && header[EI_MAG3] == ELFMAG3) {
2411 		return (doelfwork((Ehdr *)lheader, vp, mrp, num_mapped,
2412 		    padding, fcred));
2413 	}
2414 
2415 #if defined(__sparc)
2416 	/* On sparc, check for 4.X AOUT format */
2417 	switch (((struct exec *)header)->a_magic) {
2418 	case OMAGIC:
2419 	case ZMAGIC:
2420 	case NMAGIC:
2421 		return (doaoutwork(vp, mrp, num_mapped,
2422 		    (struct exec *)lheader, fcred));
2423 	}
2424 #endif
2425 
2426 	/* Unsupported type */
2427 	MOBJ_STAT_ADD(unsupported);
2428 	return (ENOTSUP);
2429 }
2430 
2431 /*
2432  * Given a vnode, map it as either a flat file or interpret it and map
2433  * it according to the rules of the file type.
2434  * *num_mapped will contain the size of the mmapobj_result_t array passed in.
2435  * If padding is non-zero, the mappings will be padded by that amount
2436  * rounded up to the nearest pagesize.
2437  * If the mapping is successful, *num_mapped will contain the number of
2438  * distinct mappings created, and mrp will point to the array of
2439  * mmapobj_result_t's which describe these mappings.
2440  *
2441  * Returns 0 on success and an errno value on failure.
2442  * A special error case is E2BIG, returned when there are more than
2443  * *num_mapped mappings to be created; *num_mapped will then be set to the
2444  * number of mappings needed.
2445  */
2446 int
2447 mmapobj(vnode_t *vp, uint_t flags, mmapobj_result_t *mrp,
2448     uint_t *num_mapped, size_t padding, cred_t *fcred)
2449 {
2450 	int to_map;
2451 	int error = 0;
2452 
2453 	ASSERT((padding & PAGEOFFSET) == 0);
2454 	ASSERT((flags & ~MMOBJ_ALL_FLAGS) == 0);
2455 	ASSERT(num_mapped != NULL);
2456 	ASSERT((flags & MMOBJ_PADDING) ? padding != 0 : padding == 0);
2457 
2458 	if ((flags & MMOBJ_INTERPRET) == 0) {
2459 		to_map = padding ? 3 : 1;
2460 		if (*num_mapped < to_map) {
2461 			*num_mapped = to_map;
2462 			MOBJ_STAT_ADD(flat_e2big);
2463 			return (E2BIG);
2464 		}
2465 		error = mmapobj_map_flat(vp, mrp, padding, fcred);
2466 
2467 		if (error) {
2468 			return (error);
2469 		}
2470 		*num_mapped = to_map;
2471 		return (0);
2472 	}
2473 
2474 	error = mmapobj_map_interpret(vp, mrp, num_mapped, padding, fcred);
2475 	return (error);
2476 }
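
/*
 * Illustrative caller sketch (editorial; not part of this file).  It shows
 * the E2BIG retry protocol described above: when mmapobj() returns E2BIG,
 * *num_mapped has been updated to the number of mappings required, so the
 * caller can reallocate and retry.  The function and variable names below
 * are hypothetical.
 *
 *	static int
 *	map_object_sketch(vnode_t *vp, cred_t *cr, mmapobj_result_t **mrpp,
 *	    uint_t *nmappedp)
 *	{
 *		uint_t nelts = 16;
 *		uint_t alloced;
 *		mmapobj_result_t *mrp;
 *		int err;
 *
 *		do {
 *			alloced = nelts;
 *			mrp = kmem_alloc(alloced * sizeof (*mrp), KM_SLEEP);
 *			err = mmapobj(vp, MMOBJ_INTERPRET, mrp, &nelts,
 *			    0, cr);
 *			if (err != 0)
 *				kmem_free(mrp, alloced * sizeof (*mrp));
 *		} while (err == E2BIG);
 *
 *		if (err == 0) {
 *			*mrpp = mrp;
 *			*nmappedp = nelts;
 *		}
 *		return (err);
 *	}
 */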
2477