xref: /illumos-gate/usr/src/uts/intel/io/vmm/vmm_vm.c (revision 518062b351ad2770c7529db1397091d695284665)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */
12 
13 /*
14  * Copyright 2019 Joyent, Inc.
15  * Copyright 2023 Oxide Computer Company
16  * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
17  */
18 
19 #include <sys/param.h>
20 #include <sys/kmem.h>
21 #include <sys/thread.h>
22 #include <sys/list.h>
23 #include <sys/mman.h>
24 #include <sys/types.h>
25 #include <sys/ddi.h>
26 #include <sys/sysmacros.h>
27 #include <sys/machsystm.h>
28 #include <sys/vmsystm.h>
29 #include <sys/x86_archext.h>
30 #include <vm/as.h>
31 #include <vm/hat_i86.h>
32 #include <vm/seg_vn.h>
33 #include <vm/seg_kmem.h>
34 
35 #include <sys/vmm_vm.h>
36 #include <sys/seg_vmm.h>
37 #include <sys/vmm_kernel.h>
38 #include <sys/vmm_reservoir.h>
39 #include <sys/vmm_gpt.h>
40 
41 
42 /*
43  * VMM Virtual Memory
44  *
45  * History
46  *
47  * When bhyve was ported to illumos, one significant hole was handling guest
48  * memory and memory accesses.  In the original Pluribus port, bhyve itself
49  * manually handled the EPT structures for guest memory.  The updated sources
50  * (from FreeBSD 11) took a different approach, using the native FreeBSD VM
51  * system for memory allocations and management of the EPT structures.  Keeping
52  * source differences to a minimum was a priority, so illumos-bhyve implemented
53  * a makeshift "VM shim" which exposed the bare minimum of those interfaces to
54  * boot and run guests.
55  *
56  * While the VM shim was successful in getting illumos-bhyve to a functional
57  * state on Intel (and later AMD) gear, the FreeBSD-specific nature of the
58  * compatibility interfaces made it awkward to use.  As source differences with
59  * the upstream kernel code became less of a concern, and upcoming features
60  * (such as live migration) would demand more of those VM interfaces, it became
61  * clear that an overhaul was prudent.
62  *
63  * Design
64  *
65  * The new VM system for bhyve retains a number of the same concepts as what it
66  * replaces:
67  *
68  * - `vmspace_t` is the top-level entity for a guest memory space
69  * - `vm_object_t` represents a memory object which can be mapped into a vmspace
70  * - `vm_page_t` represents a page hold within a given vmspace, providing access
71  *   to the underlying memory page
72  *
73  * Unlike the old code, where most of the involved structures were exposed via
74  * public definitions, this replacement VM interface keeps all involved
75  * structures opaque to consumers.  Furthermore, there is a clear delineation
76  * between infrequent administrative operations (such as mapping/unmapping
77  * regions) and common data-path operations (attempting a page hold at a given
78  * guest-physical address).  Those administrative operations are performed
79  * directly against the vmspace, whereas the data-path operations are performed
80  * through a `vm_client_t` handle.  That VM client abstraction is meant to
81  * reduce contention and overhead for frequent access operations and provide
82  * debugging insight into how different subcomponents are accessing the vmspace.
83  * A VM client is allocated for each vCPU, each viona ring (via the vmm_drv
84  * interface) and each VMM userspace segment mapping.
85  *
86  * Exclusion
87  *
88  * Making changes to the vmspace (such as mapping or unmapping regions) requires
89  * other accessors be excluded while the change is underway to prevent them from
90  * observing invalid intermediate states.  A simple approach could use a mutex
91  * or rwlock to achieve this, but that risks contention when the rate of access
92  * to the vmspace is high.
93  *
94  * Since vmspace changes (map/unmap) are rare, we can instead do the exclusion
95  * at a per-vm_client_t basis.  While this raises the cost for vmspace changes,
96  * it means that the much more common page accesses through the vm_client can
97  * normally proceed unimpeded and independently.
98  *
99  * When a change to the vmspace is required, the caller will put the vmspace in
100  * a 'hold' state, iterating over all associated vm_client instances, waiting
101  * for them to complete any in-flight lookup (indicated by VCS_ACTIVE) before
102  * setting VCS_HOLD in their state flag fields.  With VCS_HOLD set, any call on
103  * the vm_client which would access the vmspace state (vmc_hold or vmc_fault)
104  * will block until the hold condition is cleared.  Once the hold is asserted
105  * for all clients, the vmspace change can proceed with confidence.  Upon
106  * completion of that operation, VCS_HOLD is cleared from the clients, and they
107  * are released to resume vmspace accesses.
108  *
109  * vCPU Consumers
110  *
111  * Access to the vmspace for vCPUs running in guest context is different from
112  * emulation-related vm_client activity: they solely rely on the contents of the
113  * page tables.  Furthermore, the existing VCS_HOLD mechanism used to exclude
114  * client access is not feasible when entering guest context, since interrupts
115  * are disabled, making it impossible to block entry.  This is not a concern as
116  * long as vmspace modifications never place the page tables in invalid states
117  * (either intermediate, or final).  The vm_client hold mechanism does provide
118  * the means to IPI vCPU consumers which will trigger a notification once they
119  * report their exit from guest context.  This can be used to ensure that page
120  * table modifications are made visible to those vCPUs within a certain
121  * time frame.
122  */
123 
/*
 * Describes one span of a vm_object which has been mapped into a vmspace.
 * Instances are created by vmspace_map() and linked on vmspace_t`vms_maplist.
 */
typedef struct vmspace_mapping {
	list_node_t	vmsm_node;	/* linkage on vmspace_t`vms_maplist */
	vm_object_t	*vmsm_object;	/* object backing this mapping */
	uintptr_t	vmsm_addr;	/* start addr in vmspace for mapping */
	size_t		vmsm_len;	/* length (in bytes) of mapping */
	off_t		vmsm_offset;	/* byte offset into object */
	uint_t		vmsm_prot;	/* PROT_* access permitted via mapping */
} vmspace_mapping_t;
132 
/*
 * Translate a vmspace address (`addr`) which falls inside the mapping `vmsm`
 * into the corresponding byte offset within the backing vm_object.
 */
#define	VMSM_OFFSET(vmsm, addr)	(			\
	    (vmsm)->vmsm_offset +			\
	    ((addr) - (uintptr_t)(vmsm)->vmsm_addr))
136 
/*
 * State flags for a vm_client_t.  Other than VCS_IDLE (the absence of any
 * flag), these are individual bits which are combined in
 * vm_client_t`vmc_state.
 */
typedef enum vm_client_state {
	VCS_IDLE	= 0,
	/* currently accessing vmspace for client operation (hold or fault) */
	VCS_ACTIVE	= (1 << 0),
	/* client hold requested/asserted */
	VCS_HOLD	= (1 << 1),
	/* vCPU is accessing page tables in guest context */
	VCS_ON_CPU	= (1 << 2),
	/* client has been orphaned (no more access to vmspace) */
	VCS_ORPHANED	= (1 << 3),
	/* client undergoing destroy operation */
	VCS_DESTROY	= (1 << 4),
} vm_client_state_t;
150 
/*
 * Top-level state for a guest memory space: the nested page tables, the list
 * of object mappings, and the list of clients accessing the space.
 */
struct vmspace {
	kmutex_t	vms_lock;	/* held for map/client list changes */
	kcondvar_t	vms_cv;		/* waited on for clients to depart */
	bool		vms_held;	/* set between hold_enter/hold_exit */
	uintptr_t	vms_size;	/* immutable after creation */

	/* (nested) page table state */
	vmm_gpt_t	*vms_gpt;
	uint64_t	vms_pt_gen;	/* bumped when PTEs are invalidated */
	uint64_t	vms_pages_mapped; /* count of resident pages */
	bool		vms_track_dirty; /* fixed at vmspace_alloc() time */

	list_t		vms_maplist;	/* list of vmspace_mapping_t */
	list_t		vms_clients;	/* list of vm_client_t */
};
166 
/*
 * Handle through which a consumer performs data-path accesses to a vmspace.
 */
struct vm_client {
	vmspace_t	*vmc_space;	/* vmspace this client belongs to */
	list_node_t	vmc_node;	/* linkage on vmspace_t`vms_clients */

	kmutex_t	vmc_lock;
	kcondvar_t	vmc_cv;		/* signals vmc_state transitions */
	vm_client_state_t vmc_state;	/* combination of VCS_* flags */
	int		vmc_cpu_active;	/* CPU id while VCS_ON_CPU, else -1 */
	uint64_t	vmc_cpu_gen;	/* table gen seen at vmc_table_enter */
	bool		vmc_track_dirty; /* copied from vmspace at alloc */
	vmc_inval_cb_t	vmc_inval_func;	/* optional invalidation callback */
	void		*vmc_inval_data; /* argument for vmc_inval_func */

	list_t		vmc_held_pages;	/* list of vm_page_t */
};
182 
/*
 * Kind of backing for a vm_object, which determines how vmo_data is
 * interpreted and how offsets are translated to PFNs.
 */
typedef enum vm_object_type {
	VMOT_NONE,	/* not a valid type for a live object */
	VMOT_MEM,	/* backed by a VMM reservoir region */
	VMOT_MMIO,	/* backed by a fixed host-physical range */
} vm_object_type_t;
188 
/*
 * A reference-counted memory object which can be mapped into a vmspace.
 */
struct vm_object {
	uint_t		vmo_refcnt;	/* manipulated with atomic ops */

	/* Fields below are fixed at creation time */
	vm_object_type_t vmo_type;
	size_t		vmo_size;	/* size (in bytes) of the object */
	/* VMOT_MEM: vmmr_region_t pointer; VMOT_MMIO: host-physical address */
	void		*vmo_data;
	uint8_t		vmo_attr;	/* memory type (MTRR_TYPE_*) */
};
198 
/*
 * Convenience consolidation of all flag(s) for validity checking.  Presently
 * VPF_DEFER_DIRTY is the only flag included.
 */
#define	VPF_ALL		(VPF_DEFER_DIRTY)
201 
/*
 * A page hold established through a vm_client.
 */
struct vm_page {
	vm_client_t	*vmp_client;	/* client which owns this hold */
	list_node_t	vmp_node;	/* linkage on vmc_held_pages */
	/* NOTE(review): presumably links holds for the consumer - confirm */
	vm_page_t	*vmp_chain;
	uintptr_t	vmp_gpa;	/* guest-physical address of page */
	pfn_t		vmp_pfn;
	/* PTE for the page; set to NULL if the region is unmapped while held */
	uint64_t	*vmp_ptep;
	/* object reference taken if the region is unmapped while held */
	vm_object_t	*vmp_obj_ref;
	uint8_t		vmp_prot;
	uint8_t		vmp_flags;	/* presumably VPF_* flags - confirm */
};
213 
/*
 * Forward declarations for static helpers which are referenced prior to their
 * definitions.
 */
static vmspace_mapping_t *vm_mapping_find(vmspace_t *, uintptr_t, size_t);
static void vmspace_hold_enter(vmspace_t *);
static void vmspace_hold_exit(vmspace_t *, bool);
static void vmc_space_hold(vm_client_t *);
static void vmc_space_release(vm_client_t *, bool);
static void vmc_space_invalidate(vm_client_t *, uintptr_t, size_t, uint64_t);
static void vmc_space_unmap(vm_client_t *, uintptr_t, size_t, vm_object_t *);
static vm_client_t *vmc_space_orphan(vm_client_t *, vmspace_t *);
222 
223 
224 /*
225  * Create a new vmspace with a maximum address of `end`.
226  */
227 vmspace_t *
228 vmspace_alloc(size_t end, vmm_pte_ops_t *pte_ops, bool track_dirty)
229 {
230 	vmspace_t *vms;
231 	const uintptr_t size = end + 1;
232 
233 	/*
234 	 * This whole mess is built on the assumption that a 64-bit address
235 	 * space is available to work with for the various pagetable tricks.
236 	 */
237 	VERIFY(size > 0 && (size & PAGEOFFSET) == 0 &&
238 	    size <= (uintptr_t)USERLIMIT);
239 
240 	vms = kmem_zalloc(sizeof (*vms), KM_SLEEP);
241 	vms->vms_size = size;
242 	list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t),
243 	    offsetof(vmspace_mapping_t, vmsm_node));
244 	list_create(&vms->vms_clients, sizeof (vm_client_t),
245 	    offsetof(vm_client_t, vmc_node));
246 
247 	vms->vms_gpt = vmm_gpt_alloc(pte_ops);
248 	vms->vms_pt_gen = 1;
249 	vms->vms_track_dirty = track_dirty;
250 
251 	return (vms);
252 }
253 
254 /*
255  * Destroy a vmspace.  All regions in the space must be unmapped.  Any remaining
256  * clients will be orphaned.
257  */
258 void
259 vmspace_destroy(vmspace_t *vms)
260 {
261 	mutex_enter(&vms->vms_lock);
262 	VERIFY(list_is_empty(&vms->vms_maplist));
263 
264 	if (!list_is_empty(&vms->vms_clients)) {
265 		vm_client_t *vmc = list_head(&vms->vms_clients);
266 		while (vmc != NULL) {
267 			vmc = vmc_space_orphan(vmc, vms);
268 		}
269 		/*
270 		 * Wait for any clients which were in the process of destroying
271 		 * themselves to disappear.
272 		 */
273 		while (!list_is_empty(&vms->vms_clients)) {
274 			cv_wait(&vms->vms_cv, &vms->vms_lock);
275 		}
276 	}
277 	VERIFY(list_is_empty(&vms->vms_clients));
278 
279 	vmm_gpt_free(vms->vms_gpt);
280 	mutex_exit(&vms->vms_lock);
281 
282 	mutex_destroy(&vms->vms_lock);
283 	cv_destroy(&vms->vms_cv);
284 	list_destroy(&vms->vms_maplist);
285 	list_destroy(&vms->vms_clients);
286 
287 	kmem_free(vms, sizeof (*vms));
288 }
289 
290 /*
291  * Retrieve the count of resident (mapped into the page tables) pages.
292  */
293 uint64_t
294 vmspace_resident_count(vmspace_t *vms)
295 {
296 	return (vms->vms_pages_mapped);
297 }
298 
299 int
300 vmspace_track_dirty(vmspace_t *vms, uint64_t gpa, size_t len, uint8_t *bitmap)
301 {
302 	if (!vms->vms_track_dirty)
303 		return (EPERM);
304 
305 	/*
306 	 * Accumulate dirty bits into the given bit vector.  Note that this
307 	 * races both against hardware writes from running vCPUs and
308 	 * reflections from userspace.
309 	 *
310 	 * Called from a userspace-visible ioctl, this depends on the VM
311 	 * instance being read-locked to prevent vmspace_map/vmspace_unmap
312 	 * operations from changing the page tables during the walk.
313 	 */
314 	for (size_t offset = 0; offset < len; offset += PAGESIZE) {
315 		bool bit = false;
316 		uint64_t *entry = vmm_gpt_lookup(vms->vms_gpt, gpa + offset);
317 		if (entry != NULL)
318 			bit = vmm_gpt_reset_dirty(vms->vms_gpt, entry, false);
319 		uint64_t pfn_offset = offset >> PAGESHIFT;
320 		size_t bit_offset = pfn_offset / 8;
321 		size_t bit_index = pfn_offset % 8;
322 		bitmap[bit_offset] |= (bit << bit_index);
323 	}
324 
325 	/*
326 	 * Now invalidate those bits and shoot down address spaces that
327 	 * may have them cached.
328 	 */
329 	vmspace_hold_enter(vms);
330 	vms->vms_pt_gen++;
331 	for (vm_client_t *vmc = list_head(&vms->vms_clients);
332 	    vmc != NULL;
333 	    vmc = list_next(&vms->vms_clients, vmc)) {
334 		vmc_space_invalidate(vmc, gpa, len, vms->vms_pt_gen);
335 	}
336 	vmspace_hold_exit(vms, true);
337 
338 	return (0);
339 }
340 
341 static pfn_t
342 vm_object_pager_reservoir(vm_object_t *vmo, uintptr_t off)
343 {
344 	vmmr_region_t *region;
345 	pfn_t pfn;
346 
347 	ASSERT3U(vmo->vmo_type, ==, VMOT_MEM);
348 
349 	region = vmo->vmo_data;
350 	pfn = vmmr_region_pfn_at(region, off);
351 
352 	return (pfn);
353 }
354 
355 static pfn_t
356 vm_object_pager_mmio(vm_object_t *vmo, uintptr_t off)
357 {
358 	pfn_t pfn;
359 
360 	ASSERT3U(vmo->vmo_type, ==, VMOT_MMIO);
361 	ASSERT3P(vmo->vmo_data, !=, NULL);
362 	ASSERT3U(off, <, vmo->vmo_size);
363 
364 	pfn = ((uintptr_t)vmo->vmo_data + off) >> PAGESHIFT;
365 
366 	return (pfn);
367 }
368 
369 /*
370  * Allocate a VM object backed by VMM reservoir memory.
371  */
372 vm_object_t *
373 vm_object_mem_allocate(size_t size, bool transient)
374 {
375 	int err;
376 	vmmr_region_t *region = NULL;
377 	vm_object_t *vmo;
378 
379 	ASSERT3U(size, !=, 0);
380 	ASSERT3U(size & PAGEOFFSET, ==, 0);
381 
382 	err = vmmr_alloc(size, transient, &region);
383 	if (err != 0) {
384 		return (NULL);
385 	}
386 
387 	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
388 
389 	/* For now, these are to stay fixed after allocation */
390 	vmo->vmo_type = VMOT_MEM;
391 	vmo->vmo_size = size;
392 	vmo->vmo_attr = MTRR_TYPE_WB;
393 	vmo->vmo_data = region;
394 	vmo->vmo_refcnt = 1;
395 
396 	return (vmo);
397 }
398 
399 static vm_object_t *
400 vm_object_mmio_allocate(size_t size, uintptr_t hpa)
401 {
402 	vm_object_t *vmo;
403 
404 	ASSERT3U(size, !=, 0);
405 	ASSERT3U(size & PAGEOFFSET, ==, 0);
406 	ASSERT3U(hpa & PAGEOFFSET, ==, 0);
407 
408 	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
409 
410 	/* For now, these are to stay fixed after allocation */
411 	vmo->vmo_type = VMOT_MMIO;
412 	vmo->vmo_size = size;
413 	vmo->vmo_attr = MTRR_TYPE_UC;
414 	vmo->vmo_data = (void *)hpa;
415 	vmo->vmo_refcnt = 1;
416 
417 	return (vmo);
418 }
419 
420 /*
421  * Allocate a VM object backed by an existing range of physical memory.
422  */
423 vm_object_t *
424 vmm_mmio_alloc(vmspace_t *vmspace, uintptr_t gpa, size_t len, uintptr_t hpa)
425 {
426 	int error;
427 	vm_object_t *obj;
428 
429 	obj = vm_object_mmio_allocate(len, hpa);
430 	if (obj != NULL) {
431 		error = vmspace_map(vmspace, obj, 0, gpa, len,
432 		    PROT_READ | PROT_WRITE);
433 		if (error != 0) {
434 			vm_object_release(obj);
435 			obj = NULL;
436 		}
437 	}
438 
439 	return (obj);
440 }
441 
442 /*
443  * Release a vm_object reference
444  */
445 void
446 vm_object_release(vm_object_t *vmo)
447 {
448 	ASSERT(vmo != NULL);
449 
450 	uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt);
451 	/* underflow would be a deadly serious mistake */
452 	VERIFY3U(ref, !=, UINT_MAX);
453 	if (ref != 0) {
454 		return;
455 	}
456 
457 	switch (vmo->vmo_type) {
458 	case VMOT_MEM:
459 		vmmr_free((vmmr_region_t *)vmo->vmo_data);
460 		break;
461 	case VMOT_MMIO:
462 		break;
463 	default:
464 		panic("unexpected object type %u", vmo->vmo_type);
465 		break;
466 	}
467 
468 	vmo->vmo_data = NULL;
469 	vmo->vmo_size = 0;
470 	kmem_free(vmo, sizeof (*vmo));
471 }
472 
473 /*
474  * Increase refcount for vm_object reference
475  */
476 void
477 vm_object_reference(vm_object_t *vmo)
478 {
479 	ASSERT(vmo != NULL);
480 
481 	uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt);
482 	/* overflow would be a deadly serious mistake */
483 	VERIFY3U(ref, !=, 0);
484 }
485 
486 /*
487  * Get the host-physical PFN for a given offset into a vm_object.
488  *
489  * The provided `off` must be within the allocated size of the vm_object.
490  */
491 pfn_t
492 vm_object_pfn(vm_object_t *vmo, uintptr_t off)
493 {
494 	const uintptr_t aligned_off = off & PAGEMASK;
495 
496 	switch (vmo->vmo_type) {
497 	case VMOT_MEM:
498 		return (vm_object_pager_reservoir(vmo, aligned_off));
499 	case VMOT_MMIO:
500 		return (vm_object_pager_mmio(vmo, aligned_off));
501 	case VMOT_NONE:
502 		break;
503 	}
504 	panic("unexpected object type %u", vmo->vmo_type);
505 }
506 
507 static vmspace_mapping_t *
508 vm_mapping_find(vmspace_t *vms, uintptr_t addr, size_t size)
509 {
510 	vmspace_mapping_t *vmsm;
511 	list_t *ml = &vms->vms_maplist;
512 	const uintptr_t range_end = addr + size;
513 
514 	ASSERT3U(addr, <=, range_end);
515 
516 	if (addr >= vms->vms_size) {
517 		return (NULL);
518 	}
519 	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
520 		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len;
521 
522 		if (addr >= vmsm->vmsm_addr && addr < seg_end) {
523 			if (range_end <= seg_end) {
524 				return (vmsm);
525 			} else {
526 				return (NULL);
527 			}
528 		}
529 	}
530 	return (NULL);
531 }
532 
533 /*
534  * Check to see if any mappings reside within [addr, addr + size) span in the
535  * vmspace, returning true if that span is indeed empty.
536  */
537 static bool
538 vm_mapping_gap(vmspace_t *vms, uintptr_t addr, size_t size)
539 {
540 	vmspace_mapping_t *vmsm;
541 	list_t *ml = &vms->vms_maplist;
542 	const uintptr_t range_end = addr + size - 1;
543 
544 	ASSERT(MUTEX_HELD(&vms->vms_lock));
545 	ASSERT(size > 0);
546 
547 	for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) {
548 		const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1;
549 
550 		/*
551 		 * The two ranges do not overlap if the start of either of
552 		 * them is after the end of the other.
553 		 */
554 		if (vmsm->vmsm_addr > range_end || addr > seg_end)
555 			continue;
556 		return (false);
557 	}
558 	return (true);
559 }
560 
561 static void
562 vm_mapping_remove(vmspace_t *vms, vmspace_mapping_t *vmsm)
563 {
564 	list_t *ml = &vms->vms_maplist;
565 
566 	ASSERT(MUTEX_HELD(&vms->vms_lock));
567 	ASSERT(vms->vms_held);
568 
569 	list_remove(ml, vmsm);
570 	vm_object_release(vmsm->vmsm_object);
571 	kmem_free(vmsm, sizeof (*vmsm));
572 }
573 
574 /*
575  * Enter a hold state on the vmspace.  This ensures that all VM clients
576  * associated with the vmspace are excluded from establishing new page holds,
577  * or any other actions which would require accessing vmspace state subject to
578  * potential change.
579  *
580  * Returns with vmspace_t`vms_lock held.
581  */
582 static void
583 vmspace_hold_enter(vmspace_t *vms)
584 {
585 	mutex_enter(&vms->vms_lock);
586 	VERIFY(!vms->vms_held);
587 
588 	vm_client_t *vmc = list_head(&vms->vms_clients);
589 	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
590 		vmc_space_hold(vmc);
591 	}
592 	vms->vms_held = true;
593 }
594 
595 /*
596  * Exit a hold state on the vmspace.  This releases all VM clients associated
597  * with the vmspace to be able to establish new page holds, and partake in other
598  * actions which require accessing changed vmspace state.  If `kick_on_cpu` is
599  * true, then any CPUs actively using the page tables will be IPIed, and the
600  * call will block until they have acknowledged being ready to use the latest
601  * state of the tables.
602  *
603  * Requires vmspace_t`vms_lock be held, which is released as part of the call.
604  */
605 static void
606 vmspace_hold_exit(vmspace_t *vms, bool kick_on_cpu)
607 {
608 	ASSERT(MUTEX_HELD(&vms->vms_lock));
609 	VERIFY(vms->vms_held);
610 
611 	vm_client_t *vmc = list_head(&vms->vms_clients);
612 	for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) {
613 		vmc_space_release(vmc, kick_on_cpu);
614 	}
615 	vms->vms_held = false;
616 	mutex_exit(&vms->vms_lock);
617 }
618 
619 /*
620  * Attempt to map a vm_object span into the vmspace.
621  *
622  * Requirements:
623  * - `obj_off`, `addr`, and `len` must be page-aligned
624  * - `obj_off` cannot be greater than the allocated size of the object
625  * - [`obj_off`, `obj_off` + `len`) span cannot extend beyond the allocated
626  *   size of the object
627  * - [`addr`, `addr` + `len`) span cannot reside beyond the maximum address
628  *   of the vmspace
629  */
630 int
631 vmspace_map(vmspace_t *vms, vm_object_t *vmo, uintptr_t obj_off, uintptr_t addr,
632     size_t len, uint8_t prot)
633 {
634 	vmspace_mapping_t *vmsm;
635 	int res = 0;
636 
637 	if (len == 0 || (addr + len) < addr ||
638 	    obj_off >= (obj_off + len) || vmo->vmo_size < (obj_off + len)) {
639 		return (EINVAL);
640 	}
641 	if ((addr + len) >= vms->vms_size) {
642 		return (ENOMEM);
643 	}
644 
645 	vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP);
646 
647 	vmspace_hold_enter(vms);
648 	if (!vm_mapping_gap(vms, addr, len)) {
649 		kmem_free(vmsm, sizeof (*vmsm));
650 		res = ENOMEM;
651 	} else {
652 		vmsm->vmsm_object = vmo;
653 		vmsm->vmsm_addr = addr;
654 		vmsm->vmsm_len = len;
655 		vmsm->vmsm_offset = (off_t)obj_off;
656 		vmsm->vmsm_prot = prot;
657 		list_insert_tail(&vms->vms_maplist, vmsm);
658 
659 		/*
660 		 * Make sure the GPT has tables ready for leaf entries across
661 		 * the entire new mapping.
662 		 */
663 		vmm_gpt_populate_region(vms->vms_gpt, addr, len);
664 	}
665 	vmspace_hold_exit(vms, false);
666 	return (res);
667 }
668 
669 /*
670  * Unmap a region of the vmspace.
671  *
672  * Presently the [start, end) span must equal a region previously mapped by a
673  * call to vmspace_map().
674  */
675 int
676 vmspace_unmap(vmspace_t *vms, uintptr_t addr, uintptr_t len)
677 {
678 	const uintptr_t end = addr + len;
679 	vmspace_mapping_t *vmsm;
680 	vm_client_t *vmc;
681 	uint64_t gen = 0;
682 
683 	ASSERT3U(addr, <, end);
684 
685 	vmspace_hold_enter(vms);
686 	/* expect to match existing mapping exactly */
687 	if ((vmsm = vm_mapping_find(vms, addr, len)) == NULL ||
688 	    vmsm->vmsm_addr != addr || vmsm->vmsm_len != len) {
689 		vmspace_hold_exit(vms, false);
690 		return (ENOENT);
691 	}
692 
693 	/* Prepare clients (and their held pages) for the unmap. */
694 	for (vmc = list_head(&vms->vms_clients); vmc != NULL;
695 	    vmc = list_next(&vms->vms_clients, vmc)) {
696 		vmc_space_unmap(vmc, addr, len, vmsm->vmsm_object);
697 	}
698 
699 	/* Clear all PTEs for region */
700 	if (vmm_gpt_unmap_region(vms->vms_gpt, addr, len) != 0) {
701 		vms->vms_pt_gen++;
702 		gen = vms->vms_pt_gen;
703 	}
704 	/* ... and the intermediate (directory) PTEs as well */
705 	vmm_gpt_vacate_region(vms->vms_gpt, addr, len);
706 
707 	/*
708 	 * If pages were actually unmapped from the GPT, provide clients with
709 	 * an invalidation notice.
710 	 */
711 	if (gen != 0) {
712 		for (vmc = list_head(&vms->vms_clients); vmc != NULL;
713 		    vmc = list_next(&vms->vms_clients, vmc)) {
714 			vmc_space_invalidate(vmc, addr, len, vms->vms_pt_gen);
715 		}
716 	}
717 
718 	vm_mapping_remove(vms, vmsm);
719 	vmspace_hold_exit(vms, true);
720 	return (0);
721 }
722 
723 static int
724 vmspace_lookup_map(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp,
725     uint64_t **ptepp)
726 {
727 	vmm_gpt_t *gpt = vms->vms_gpt;
728 	uint64_t *entries[MAX_GPT_LEVEL], *leaf;
729 	pfn_t pfn = PFN_INVALID;
730 	uint_t prot;
731 
732 	ASSERT0(gpa & PAGEOFFSET);
733 	ASSERT((req_prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) != PROT_NONE);
734 
735 	vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL);
736 	leaf = entries[LEVEL1];
737 	if (leaf == NULL) {
738 		/*
739 		 * Since we populated the intermediate tables for any regions
740 		 * mapped in the GPT, an empty leaf entry indicates there is no
741 		 * mapping, populated or not, at this GPT.
742 		 */
743 		return (FC_NOMAP);
744 	}
745 
746 	if (vmm_gpt_is_mapped(gpt, leaf, &pfn, &prot)) {
747 		if ((req_prot & prot) != req_prot) {
748 			return (FC_PROT);
749 		}
750 	} else {
751 		vmspace_mapping_t *vmsm;
752 		vm_object_t *vmo;
753 
754 		vmsm = vm_mapping_find(vms, gpa, PAGESIZE);
755 		if (vmsm == NULL) {
756 			return (FC_NOMAP);
757 		}
758 
759 		if ((req_prot & vmsm->vmsm_prot) != req_prot) {
760 			return (FC_PROT);
761 		}
762 		vmo = vmsm->vmsm_object;
763 		pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
764 		VERIFY(pfn != PFN_INVALID);
765 
766 		if (vmm_gpt_map_at(gpt, leaf, pfn, vmsm->vmsm_prot,
767 		    vmo->vmo_attr)) {
768 			atomic_inc_64(&vms->vms_pages_mapped);
769 		}
770 	}
771 
772 	ASSERT(pfn != PFN_INVALID && leaf != NULL);
773 	if (pfnp != NULL) {
774 		*pfnp = pfn;
775 	}
776 	if (ptepp != NULL) {
777 		*ptepp = leaf;
778 	}
779 	return (0);
780 }
781 
782 /*
783  * Populate (make resident in the page tables) a region of the vmspace.
784  *
785  * Presently the [start, end) span must equal a region previously mapped by a
786  * call to vmspace_map().
787  */
788 int
789 vmspace_populate(vmspace_t *vms, uintptr_t addr, uintptr_t len)
790 {
791 	vmspace_mapping_t *vmsm;
792 	mutex_enter(&vms->vms_lock);
793 
794 	/* For the time being, only exact-match mappings are expected */
795 	if ((vmsm = vm_mapping_find(vms, addr, len)) == NULL) {
796 		mutex_exit(&vms->vms_lock);
797 		return (FC_NOMAP);
798 	}
799 
800 	vm_object_t *vmo = vmsm->vmsm_object;
801 	const int prot = vmsm->vmsm_prot;
802 	const uint8_t attr = vmo->vmo_attr;
803 	size_t populated = 0;
804 	const size_t end = addr + len;
805 	for (uintptr_t gpa = addr & PAGEMASK; gpa < end; gpa += PAGESIZE) {
806 		const pfn_t pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa));
807 		VERIFY(pfn != PFN_INVALID);
808 
809 		if (vmm_gpt_map(vms->vms_gpt, gpa, pfn, prot, attr)) {
810 			populated++;
811 		}
812 	}
813 	atomic_add_64(&vms->vms_pages_mapped, populated);
814 
815 	mutex_exit(&vms->vms_lock);
816 	return (0);
817 }
818 
819 /*
820  * Allocate a client from a given vmspace.
821  */
822 vm_client_t *
823 vmspace_client_alloc(vmspace_t *vms)
824 {
825 	vm_client_t *vmc;
826 
827 	vmc = kmem_zalloc(sizeof (vm_client_t), KM_SLEEP);
828 	vmc->vmc_space = vms;
829 	mutex_init(&vmc->vmc_lock, NULL, MUTEX_DRIVER, NULL);
830 	cv_init(&vmc->vmc_cv, NULL, CV_DRIVER, NULL);
831 	vmc->vmc_state = VCS_IDLE;
832 	vmc->vmc_cpu_active = -1;
833 	list_create(&vmc->vmc_held_pages, sizeof (vm_page_t),
834 	    offsetof(vm_page_t, vmp_node));
835 	vmc->vmc_track_dirty = vms->vms_track_dirty;
836 
837 	mutex_enter(&vms->vms_lock);
838 	list_insert_tail(&vms->vms_clients, vmc);
839 	mutex_exit(&vms->vms_lock);
840 
841 	return (vmc);
842 }
843 
844 /*
845  * Get the nested page table root pointer (EPTP/NCR3) value.
846  */
847 uint64_t
848 vmspace_table_root(vmspace_t *vms)
849 {
850 	return (vmm_gpt_get_pmtp(vms->vms_gpt, vms->vms_track_dirty));
851 }
852 
853 /*
854  * Get the current generation number of the nested page table.
855  */
856 uint64_t
857 vmspace_table_gen(vmspace_t *vms)
858 {
859 	return (vms->vms_pt_gen);
860 }
861 
862 /*
863  * Mark a vm_client as active.  This will block if/while the client is held by
864  * the vmspace.  On success, it returns with vm_client_t`vmc_lock held.  It will
865  * fail if the vm_client has been orphaned.
866  */
867 static int
868 vmc_activate(vm_client_t *vmc)
869 {
870 	mutex_enter(&vmc->vmc_lock);
871 	VERIFY0(vmc->vmc_state & VCS_ACTIVE);
872 	if ((vmc->vmc_state & VCS_ORPHANED) != 0) {
873 		mutex_exit(&vmc->vmc_lock);
874 		return (ENXIO);
875 	}
876 	while ((vmc->vmc_state & VCS_HOLD) != 0) {
877 		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
878 	}
879 	vmc->vmc_state |= VCS_ACTIVE;
880 	return (0);
881 }
882 
883 /*
884  * Mark a vm_client as no longer active.  It must be called with
885  * vm_client_t`vmc_lock already held, and will return with it released.
886  */
887 static void
888 vmc_deactivate(vm_client_t *vmc)
889 {
890 	ASSERT(MUTEX_HELD(&vmc->vmc_lock));
891 	VERIFY(vmc->vmc_state & VCS_ACTIVE);
892 
893 	vmc->vmc_state ^= VCS_ACTIVE;
894 	if ((vmc->vmc_state & VCS_HOLD) != 0) {
895 		cv_broadcast(&vmc->vmc_cv);
896 	}
897 	mutex_exit(&vmc->vmc_lock);
898 }
899 
/*
 * Indicate that a CPU will be utilizing the nested page tables through this VM
 * client.  Interrupts (and/or the GIF) are expected to be disabled when calling
 * this function.  Returns the generation number of the nested page table (to be
 * used for TLB invalidations).
 *
 * The client must be neither VCS_ACTIVE nor VCS_ON_CPU when this is called.
 * The ID of the current CPU and the observed table generation are recorded in
 * the client before VCS_ON_CPU is set.
 */
uint64_t
vmc_table_enter(vm_client_t *vmc)
{
	vmspace_t *vms = vmc->vmc_space;
	uint64_t gen;

	ASSERT0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
	ASSERT3S(vmc->vmc_cpu_active, ==, -1);

	/*
	 * Since the NPT activation occurs with interrupts disabled, this must
	 * be done without taking vmc_lock like normal.
	 */
	gen = vms->vms_pt_gen;
	vmc->vmc_cpu_active = CPU->cpu_id;
	vmc->vmc_cpu_gen = gen;
	/*
	 * With vmc_lock not held, the flag is set atomically so as not to
	 * clobber a concurrent change to VCS_HOLD.
	 */
	atomic_or_uint(&vmc->vmc_state, VCS_ON_CPU);

	return (gen);
}
926 
/*
 * Indicate that this VM client is no longer (directly) using the underlying
 * page tables.  Interrupts (and/or the GIF) must be enabled prior to calling
 * this function.
 *
 * Clears VCS_ON_CPU and the record of the active CPU.  If a hold is asserted
 * on the client, waiters on vmc_cv are woken so they can observe the exit.
 */
void
vmc_table_exit(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);

	ASSERT(vmc->vmc_state & VCS_ON_CPU);
	/* The flag is expected to be set (see above), so this clears it */
	vmc->vmc_state ^= VCS_ON_CPU;
	vmc->vmc_cpu_active = -1;
	if ((vmc->vmc_state & VCS_HOLD) != 0) {
		cv_broadcast(&vmc->vmc_cv);
	}

	mutex_exit(&vmc->vmc_lock);
}
946 
/*
 * Assert a hold on a client: set VCS_HOLD, and then wait for any in-progress
 * operation (indicated by VCS_ACTIVE) to finish.  The client must not already
 * be held.  Note that this does not wait for VCS_ON_CPU to clear.
 */
static void
vmc_space_hold(vm_client_t *vmc)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY0(vmc->vmc_state & VCS_HOLD);

	/*
	 * Because vmc_table_enter() alters vmc_state from a context where
	 * interrupts are disabled, it cannot pay heed to vmc_lock, so setting
	 * VCS_HOLD must be done atomically here.
	 */
	atomic_or_uint(&vmc->vmc_state, VCS_HOLD);

	/* Wait for client to go inactive */
	while ((vmc->vmc_state & VCS_ACTIVE) != 0) {
		cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
	}
	mutex_exit(&vmc->vmc_lock);
}
966 
/*
 * Release a hold previously asserted on a client via vmc_space_hold().
 *
 * If `kick_on_cpu` is true and the client is presently VCS_ON_CPU, the CPU it
 * recorded as active is poked, and this waits for the client to clear
 * VCS_ON_CPU before the hold is lifted.  Threads waiting on vmc_cv are woken
 * once VCS_HOLD has been cleared.
 */
static void
vmc_space_release(vm_client_t *vmc, bool kick_on_cpu)
{
	mutex_enter(&vmc->vmc_lock);
	VERIFY(vmc->vmc_state & VCS_HOLD);

	if (kick_on_cpu && (vmc->vmc_state & VCS_ON_CPU) != 0) {
		poke_cpu(vmc->vmc_cpu_active);

		/* vmc_table_exit() broadcasts on vmc_cv while a hold is set */
		while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
			cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
		}
	}

	/*
	 * Because vmc_table_enter() alters vmc_state from a context where
	 * interrupts are disabled, it cannot pay heed to vmc_lock, so clearing
	 * VCS_HOLD must be done atomically here.
	 */
	atomic_and_uint(&vmc->vmc_state, ~VCS_HOLD);
	cv_broadcast(&vmc->vmc_cv);
	mutex_exit(&vmc->vmc_lock);
}
990 
991 static void
992 vmc_space_invalidate(vm_client_t *vmc, uintptr_t addr, size_t size,
993     uint64_t gen)
994 {
995 	mutex_enter(&vmc->vmc_lock);
996 	VERIFY(vmc->vmc_state & VCS_HOLD);
997 	if ((vmc->vmc_state & VCS_ON_CPU) != 0) {
998 		/*
999 		 * Wait for clients using an old generation of the page tables
1000 		 * to exit guest context, where they subsequently flush the TLB
1001 		 * for the new generation.
1002 		 */
1003 		if (vmc->vmc_cpu_gen < gen) {
1004 			poke_cpu(vmc->vmc_cpu_active);
1005 
1006 			while ((vmc->vmc_state & VCS_ON_CPU) != 0) {
1007 				cv_wait(&vmc->vmc_cv, &vmc->vmc_lock);
1008 			}
1009 		}
1010 	}
1011 	if (vmc->vmc_inval_func != NULL) {
1012 		vmc_inval_cb_t func = vmc->vmc_inval_func;
1013 		void *data = vmc->vmc_inval_data;
1014 
1015 		/*
1016 		 * Perform the actual invalidation call outside vmc_lock to
1017 		 * avoid lock ordering issues in the consumer.  Since the client
1018 		 * is under VCS_HOLD, this is safe.
1019 		 */
1020 		mutex_exit(&vmc->vmc_lock);
1021 		func(data, addr, size);
1022 		mutex_enter(&vmc->vmc_lock);
1023 	}
1024 	mutex_exit(&vmc->vmc_lock);
1025 }
1026 
1027 static void
1028 vmc_space_unmap(vm_client_t *vmc, uintptr_t addr, size_t size,
1029     vm_object_t *vmo)
1030 {
1031 	mutex_enter(&vmc->vmc_lock);
1032 	VERIFY(vmc->vmc_state & VCS_HOLD);
1033 
1034 	/*
1035 	 * With the current vCPU exclusion invariants in place, we do not expect
1036 	 * a vCPU to be in guest context during an unmap.
1037 	 */
1038 	VERIFY0(vmc->vmc_state & VCS_ON_CPU);
1039 
1040 	/*
1041 	 * Any holds against the unmapped region need to establish their own
1042 	 * reference to the underlying object to avoid a potential
1043 	 * use-after-free.
1044 	 */
1045 	for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
1046 	    vmp != NULL;
1047 	    vmp = list_next(&vmc->vmc_held_pages, vmc)) {
1048 		if (vmp->vmp_gpa < addr ||
1049 		    vmp->vmp_gpa >= (addr + size)) {
1050 			/* Hold outside region in question */
1051 			continue;
1052 		}
1053 		if (vmp->vmp_obj_ref == NULL) {
1054 			vm_object_reference(vmo);
1055 			vmp->vmp_obj_ref = vmo;
1056 			/* For an unmapped region, PTE is now meaningless */
1057 			vmp->vmp_ptep = NULL;
1058 		} else {
1059 			/*
1060 			 * Object could have gone through cycle of
1061 			 * unmap-map-unmap before the hold was released.
1062 			 */
1063 			VERIFY3P(vmp->vmp_ptep, ==, NULL);
1064 		}
1065 	}
1066 	mutex_exit(&vmc->vmc_lock);
1067 }
1068 
1069 static vm_client_t *
1070 vmc_space_orphan(vm_client_t *vmc, vmspace_t *vms)
1071 {
1072 	vm_client_t *next;
1073 
1074 	ASSERT(MUTEX_HELD(&vms->vms_lock));
1075 
1076 	mutex_enter(&vmc->vmc_lock);
1077 	VERIFY3P(vmc->vmc_space, ==, vms);
1078 	VERIFY0(vmc->vmc_state & VCS_ORPHANED);
1079 	if (vmc->vmc_state & VCS_DESTROY) {
1080 		/*
1081 		 * This vm_client is currently undergoing destruction, so it
1082 		 * does not need to be orphaned.  Let it proceed with its own
1083 		 * clean-up task.
1084 		 */
1085 		next = list_next(&vms->vms_clients, vmc);
1086 	} else {
1087 		/*
1088 		 * Clients are only orphaned when the containing vmspace is
1089 		 * being torn down.  All mappings from the vmspace should
1090 		 * already be gone, meaning any remaining held pages should have
1091 		 * direct references to the object.
1092 		 */
1093 		for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages);
1094 		    vmp != NULL;
1095 		    vmp = list_next(&vmc->vmc_held_pages, vmp)) {
1096 			ASSERT3P(vmp->vmp_ptep, ==, NULL);
1097 			ASSERT3P(vmp->vmp_obj_ref, !=, NULL);
1098 		}
1099 
1100 		/*
1101 		 * After this point, the client will be orphaned, unable to
1102 		 * establish new page holds (or access any vmspace-related
1103 		 * resources) and is in charge of cleaning up after itself.
1104 		 */
1105 		vmc->vmc_state |= VCS_ORPHANED;
1106 		next = list_next(&vms->vms_clients, vmc);
1107 		list_remove(&vms->vms_clients, vmc);
1108 		vmc->vmc_space = NULL;
1109 	}
1110 	mutex_exit(&vmc->vmc_lock);
1111 	return (next);
1112 }
1113 
1114 /*
1115  * Attempt to hold a page at `gpa` inside the referenced vmspace.
1116  */
1117 vm_page_t *
1118 vmc_hold_ext(vm_client_t *vmc, uintptr_t gpa, int prot, int flags)
1119 {
1120 	vmspace_t *vms = vmc->vmc_space;
1121 	vm_page_t *vmp;
1122 	pfn_t pfn = PFN_INVALID;
1123 	uint64_t *ptep = NULL;
1124 
1125 	ASSERT0(gpa & PAGEOFFSET);
1126 	ASSERT((prot & (PROT_READ | PROT_WRITE)) != PROT_NONE);
1127 	ASSERT0(prot & ~PROT_ALL);
1128 	ASSERT0(flags & ~VPF_ALL);
1129 
1130 	vmp = kmem_alloc(sizeof (*vmp), KM_SLEEP);
1131 	if (vmc_activate(vmc) != 0) {
1132 		kmem_free(vmp, sizeof (*vmp));
1133 		return (NULL);
1134 	}
1135 
1136 	if (vmspace_lookup_map(vms, gpa, prot, &pfn, &ptep) != 0) {
1137 		vmc_deactivate(vmc);
1138 		kmem_free(vmp, sizeof (*vmp));
1139 		return (NULL);
1140 	}
1141 	ASSERT(pfn != PFN_INVALID && ptep != NULL);
1142 
1143 	vmp->vmp_client = vmc;
1144 	vmp->vmp_chain = NULL;
1145 	vmp->vmp_gpa = gpa;
1146 	vmp->vmp_pfn = pfn;
1147 	vmp->vmp_ptep = ptep;
1148 	vmp->vmp_obj_ref = NULL;
1149 	vmp->vmp_prot = (uint8_t)prot;
1150 	vmp->vmp_flags = (uint8_t)flags;
1151 	list_insert_tail(&vmc->vmc_held_pages, vmp);
1152 	vmc_deactivate(vmc);
1153 
1154 	return (vmp);
1155 }
1156 
1157 /*
1158  * Attempt to hold a page at `gpa` inside the referenced vmspace.
1159  */
1160 vm_page_t *
1161 vmc_hold(vm_client_t *vmc, uintptr_t gpa, int prot)
1162 {
1163 	return (vmc_hold_ext(vmc, gpa, prot, VPF_DEFAULT));
1164 }
1165 
1166 int
1167 vmc_fault(vm_client_t *vmc, uintptr_t gpa, int prot)
1168 {
1169 	vmspace_t *vms = vmc->vmc_space;
1170 	int err;
1171 
1172 	err = vmc_activate(vmc);
1173 	if (err == 0) {
1174 		err = vmspace_lookup_map(vms, gpa & PAGEMASK, prot, NULL, NULL);
1175 		vmc_deactivate(vmc);
1176 	}
1177 
1178 	return (err);
1179 }
1180 
1181 /*
1182  * Allocate an additional vm_client_t, based on an existing one.  Only the
1183  * associatation with the vmspace is cloned, not existing holds or any
1184  * configured invalidation function.
1185  */
1186 vm_client_t *
1187 vmc_clone(vm_client_t *vmc)
1188 {
1189 	vmspace_t *vms = vmc->vmc_space;
1190 
1191 	return (vmspace_client_alloc(vms));
1192 }
1193 
1194 /*
1195  * Register a function (and associated data pointer) to be called when an
1196  * address range in the vmspace is invalidated.
1197  */
1198 int
1199 vmc_set_inval_cb(vm_client_t *vmc, vmc_inval_cb_t func, void *data)
1200 {
1201 	int err;
1202 
1203 	err = vmc_activate(vmc);
1204 	if (err == 0) {
1205 		vmc->vmc_inval_func = func;
1206 		vmc->vmc_inval_data = data;
1207 		vmc_deactivate(vmc);
1208 	}
1209 
1210 	return (err);
1211 }
1212 
1213 /*
1214  * Destroy a vm_client_t instance.
1215  *
1216  * No pages held through this vm_client_t may be outstanding when performing a
1217  * vmc_destroy().  For vCPU clients, the client cannot be on-CPU (a call to
1218  * vmc_table_exit() has been made).
1219  */
1220 void
1221 vmc_destroy(vm_client_t *vmc)
1222 {
1223 	mutex_enter(&vmc->vmc_lock);
1224 
1225 	VERIFY(list_is_empty(&vmc->vmc_held_pages));
1226 	VERIFY0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU));
1227 
1228 	if ((vmc->vmc_state & VCS_ORPHANED) == 0) {
1229 		vmspace_t *vms;
1230 
1231 		/*
1232 		 * Deassociation with the parent vmspace must be done carefully:
1233 		 * The vmspace could attempt to orphan this vm_client while we
1234 		 * release vmc_lock in order to take vms_lock (the required
1235 		 * order).  The client is marked to indicate that destruction is
1236 		 * under way.  Doing so prevents any racing orphan operation
1237 		 * from applying to this client, allowing us to deassociate from
1238 		 * the vmspace safely.
1239 		 */
1240 		vmc->vmc_state |= VCS_DESTROY;
1241 		vms = vmc->vmc_space;
1242 		mutex_exit(&vmc->vmc_lock);
1243 
1244 		mutex_enter(&vms->vms_lock);
1245 		mutex_enter(&vmc->vmc_lock);
1246 		list_remove(&vms->vms_clients, vmc);
1247 		/*
1248 		 * If the vmspace began its own destruction operation while we
1249 		 * were navigating the locks, be sure to notify it about this
1250 		 * vm_client being deassociated.
1251 		 */
1252 		cv_signal(&vms->vms_cv);
1253 		mutex_exit(&vmc->vmc_lock);
1254 		mutex_exit(&vms->vms_lock);
1255 	} else {
1256 		VERIFY3P(vmc->vmc_space, ==, NULL);
1257 		mutex_exit(&vmc->vmc_lock);
1258 	}
1259 
1260 	mutex_destroy(&vmc->vmc_lock);
1261 	cv_destroy(&vmc->vmc_cv);
1262 	list_destroy(&vmc->vmc_held_pages);
1263 
1264 	kmem_free(vmc, sizeof (*vmc));
1265 }
1266 
1267 static __inline void *
1268 vmp_ptr(const vm_page_t *vmp)
1269 {
1270 	ASSERT3U(vmp->vmp_pfn, !=, PFN_INVALID);
1271 
1272 	const uintptr_t paddr = (vmp->vmp_pfn << PAGESHIFT);
1273 	return ((void *)((uintptr_t)kpm_vbase + paddr));
1274 }
1275 
1276 /*
1277  * Get a readable kernel-virtual pointer for a held page.
1278  *
1279  * Only legal to call if PROT_READ was specified in `prot` for the vmc_hold()
1280  * call to acquire this page reference.
1281  */
1282 const void *
1283 vmp_get_readable(const vm_page_t *vmp)
1284 {
1285 	ASSERT(vmp->vmp_prot & PROT_READ);
1286 
1287 	return (vmp_ptr(vmp));
1288 }
1289 
1290 /*
1291  * Get a writable kernel-virtual pointer for a held page.
1292  *
1293  * Only legal to call if PROT_WRITE was specified in `prot` for the vmc_hold()
1294  * call to acquire this page reference.
1295  */
1296 void *
1297 vmp_get_writable(const vm_page_t *vmp)
1298 {
1299 	ASSERT(vmp->vmp_prot & PROT_WRITE);
1300 
1301 	return (vmp_ptr(vmp));
1302 }
1303 
1304 /*
1305  * Get the host-physical PFN for a held page.
1306  */
1307 pfn_t
1308 vmp_get_pfn(const vm_page_t *vmp)
1309 {
1310 	return (vmp->vmp_pfn);
1311 }
1312 
1313 /*
1314  * If this page was deferring dirty-marking in the corresponding vmspace page
1315  * tables, clear such a state so it is considered dirty from now on.
1316  */
1317 void
1318 vmp_mark_dirty(vm_page_t *vmp)
1319 {
1320 	ASSERT((vmp->vmp_prot & PROT_WRITE) != 0);
1321 
1322 	atomic_and_8(&vmp->vmp_flags, ~VPF_DEFER_DIRTY);
1323 }
1324 
1325 /*
1326  * Store a pointer to `to_chain` in the page-chaining slot of `vmp`.
1327  */
1328 void
1329 vmp_chain(vm_page_t *vmp, vm_page_t *to_chain)
1330 {
1331 	ASSERT3P(vmp->vmp_chain, ==, NULL);
1332 
1333 	vmp->vmp_chain = to_chain;
1334 }
1335 
1336 /*
1337  * Retrieve the pointer from the page-chaining in `vmp`.
1338  */
1339 vm_page_t *
1340 vmp_next(const vm_page_t *vmp)
1341 {
1342 	return (vmp->vmp_chain);
1343 }
1344 
1345 static __inline bool
1346 vmp_release_inner(vm_page_t *vmp, vm_client_t *vmc)
1347 {
1348 	ASSERT(MUTEX_HELD(&vmc->vmc_lock));
1349 
1350 	bool was_unmapped = false;
1351 
1352 	list_remove(&vmc->vmc_held_pages, vmp);
1353 	if (vmp->vmp_obj_ref != NULL) {
1354 		ASSERT3P(vmp->vmp_ptep, ==, NULL);
1355 
1356 		vm_object_release(vmp->vmp_obj_ref);
1357 		was_unmapped = true;
1358 	} else {
1359 		ASSERT3P(vmp->vmp_ptep, !=, NULL);
1360 
1361 		/*
1362 		 * Track appropriate (accessed/dirty) bits for the guest-virtual
1363 		 * address corresponding to this page, if it is from the vmspace
1364 		 * rather than a direct reference to an underlying object.
1365 		 *
1366 		 * The protection and/or configured flags may obviate the need
1367 		 * for such an update.
1368 		 */
1369 		if ((vmp->vmp_prot & PROT_WRITE) != 0 &&
1370 		    (vmp->vmp_flags & VPF_DEFER_DIRTY) == 0 &&
1371 		    vmc->vmc_track_dirty) {
1372 			vmm_gpt_t *gpt = vmc->vmc_space->vms_gpt;
1373 			(void) vmm_gpt_reset_dirty(gpt, vmp->vmp_ptep, true);
1374 		}
1375 	}
1376 	kmem_free(vmp, sizeof (*vmp));
1377 	return (was_unmapped);
1378 }
1379 
1380 /*
1381  * Release held page.  Returns true if page resided on region which was
1382  * subsequently unmapped.
1383  */
1384 bool
1385 vmp_release(vm_page_t *vmp)
1386 {
1387 	vm_client_t *vmc = vmp->vmp_client;
1388 
1389 	VERIFY(vmc != NULL);
1390 
1391 	mutex_enter(&vmc->vmc_lock);
1392 	const bool was_unmapped = vmp_release_inner(vmp, vmc);
1393 	mutex_exit(&vmc->vmc_lock);
1394 	return (was_unmapped);
1395 }
1396 
1397 /*
1398  * Release a chain of pages which were associated via vmp_chain() (setting
1399  * page-chaining pointer).  Returns true if any pages resided upon a region
1400  * which was subsequently unmapped.
1401  *
1402  * All of those pages must have been held through the same vm_client_t.
1403  */
1404 bool
1405 vmp_release_chain(vm_page_t *vmp)
1406 {
1407 	vm_client_t *vmc = vmp->vmp_client;
1408 	bool any_unmapped = false;
1409 
1410 	ASSERT(vmp != NULL);
1411 
1412 	mutex_enter(&vmc->vmc_lock);
1413 	while (vmp != NULL) {
1414 		vm_page_t *next = vmp->vmp_chain;
1415 
1416 		/* We expect all pages in chain to be from same client */
1417 		ASSERT3P(vmp->vmp_client, ==, vmc);
1418 
1419 		if (vmp_release_inner(vmp, vmc)) {
1420 			any_unmapped = true;
1421 		}
1422 		vmp = next;
1423 	}
1424 	mutex_exit(&vmc->vmc_lock);
1425 	return (any_unmapped);
1426 }
1427 
1428 
1429 int
1430 vm_segmap_obj(struct vm *vm, int segid, off_t segoff, off_t len,
1431     struct as *as, caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags)
1432 {
1433 	vm_object_t *vmo;
1434 	int err;
1435 
1436 	if (segoff < 0 || len <= 0 ||
1437 	    (segoff & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0) {
1438 		return (EINVAL);
1439 	}
1440 	if ((prot & PROT_USER) == 0) {
1441 		return (ENOTSUP);
1442 	}
1443 	err = vm_get_memseg(vm, segid, NULL, NULL, &vmo);
1444 	if (err != 0) {
1445 		return (err);
1446 	}
1447 
1448 	VERIFY(segoff >= 0);
1449 	VERIFY(len <= vmo->vmo_size);
1450 	VERIFY((len + segoff) <= vmo->vmo_size);
1451 
1452 	if (vmo->vmo_type != VMOT_MEM) {
1453 		/* Only support memory objects for now */
1454 		return (ENOTSUP);
1455 	}
1456 
1457 	as_rangelock(as);
1458 
1459 	err = choose_addr(as, addrp, (size_t)len, 0, ADDR_VACALIGN, flags);
1460 	if (err == 0) {
1461 		segvmm_crargs_t svma;
1462 
1463 		svma.prot = prot;
1464 		svma.offset = segoff;
1465 		svma.vmo = vmo;
1466 		svma.vmc = NULL;
1467 
1468 		err = as_map(as, *addrp, (size_t)len, segvmm_create, &svma);
1469 	}
1470 
1471 	as_rangeunlock(as);
1472 	return (err);
1473 }
1474 
1475 int
1476 vm_segmap_space(struct vm *vm, off_t off, struct as *as, caddr_t *addrp,
1477     off_t len, uint_t prot, uint_t maxprot, uint_t flags)
1478 {
1479 
1480 	const uintptr_t gpa = (uintptr_t)off;
1481 	const size_t size = (uintptr_t)len;
1482 	int err;
1483 
1484 	if (off < 0 || len <= 0 ||
1485 	    (gpa & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) {
1486 		return (EINVAL);
1487 	}
1488 	if ((prot & PROT_USER) == 0) {
1489 		return (ENOTSUP);
1490 	}
1491 
1492 	as_rangelock(as);
1493 
1494 	err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags);
1495 	if (err == 0) {
1496 		segvmm_crargs_t svma;
1497 
1498 		svma.prot = prot;
1499 		svma.offset = gpa;
1500 		svma.vmo = NULL;
1501 		svma.vmc = vmspace_client_alloc(vm_get_vmspace(vm));
1502 
1503 		err = as_map(as, *addrp, len, segvmm_create, &svma);
1504 	}
1505 
1506 	as_rangeunlock(as);
1507 	return (err);
1508 }
1509