1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 /* This file is dual-licensed; see usr/src/contrib/bhyve/LICENSE */ 12 13 /* 14 * Copyright 2019 Joyent, Inc. 15 * Copyright 2023 Oxide Computer Company 16 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. 17 */ 18 19 #include <sys/param.h> 20 #include <sys/kmem.h> 21 #include <sys/thread.h> 22 #include <sys/list.h> 23 #include <sys/mman.h> 24 #include <sys/types.h> 25 #include <sys/ddi.h> 26 #include <sys/sysmacros.h> 27 #include <sys/machsystm.h> 28 #include <sys/vmsystm.h> 29 #include <sys/x86_archext.h> 30 #include <vm/as.h> 31 #include <vm/hat_i86.h> 32 #include <vm/seg_vn.h> 33 #include <vm/seg_kmem.h> 34 35 #include <sys/vmm_vm.h> 36 #include <sys/seg_vmm.h> 37 #include <sys/vmm_kernel.h> 38 #include <sys/vmm_reservoir.h> 39 #include <sys/vmm_gpt.h> 40 41 42 /* 43 * VMM Virtual Memory 44 * 45 * History 46 * 47 * When bhyve was ported to illumos, one significant hole was handling guest 48 * memory and memory accesses. In the original Pluribus port, bhyve itself 49 * manually handled the EPT structures for guest memory. The updated sources 50 * (from FreeBSD 11) took a different approach, using the native FreeBSD VM 51 * system for memory allocations and management of the EPT structures. Keeping 52 * source differences to a minimum was a priority, so illumos-bhyve implemented 53 * a makeshift "VM shim" which exposed the bare minimum of those interfaces to 54 * boot and run guests. 55 * 56 * While the VM shim was successful in getting illumos-bhyve to a functional 57 * state on Intel (and later AMD) gear, the FreeBSD-specific nature of the 58 * compatibility interfaces made it awkward to use. As source differences with 59 * the upstream kernel code became less of a concern, and upcoming features 60 * (such as live migration) would demand more of those VM interfaces, it became 61 * clear that an overhaul was prudent. 62 * 63 * Design 64 * 65 * The new VM system for bhyve retains a number of the same concepts as what it 66 * replaces: 67 * 68 * - `vmspace_t` is the top-level entity for a guest memory space 69 * - `vm_object_t` represents a memory object which can be mapped into a vmspace 70 * - `vm_page_t` represents a page hold within a given vmspace, providing access 71 * to the underlying memory page 72 * 73 * Unlike the old code, where most of the involved structures were exposed via 74 * public definitions, this replacement VM interface keeps all involved 75 * structures opaque to consumers. Furthermore, there is a clear delineation 76 * between infrequent administrative operations (such as mapping/unmapping 77 * regions) and common data-path operations (attempting a page hold at a given 78 * guest-physical address). Those administrative operations are performed 79 * directly against the vmspace, whereas the data-path operations are performed 80 * through a `vm_client_t` handle. That VM client abstraction is meant to 81 * reduce contention and overhead for frequent access operations and provide 82 * debugging insight into how different subcomponents are accessing the vmspace. 83 * A VM client is allocated for each vCPU, each viona ring (via the vmm_drv 84 * interface) and each VMM userspace segment mapping. 85 * 86 * Exclusion 87 * 88 * Making changes to the vmspace (such as mapping or unmapping regions) requires 89 * other accessors be excluded while the change is underway to prevent them from 90 * observing invalid intermediate states. A simple approach could use a mutex 91 * or rwlock to achieve this, but that risks contention when the rate of access 92 * to the vmspace is high. 93 * 94 * Since vmspace changes (map/unmap) are rare, we can instead do the exclusion 95 * at a per-vm_client_t basis. While this raises the cost for vmspace changes, 96 * it means that the much more common page accesses through the vm_client can 97 * normally proceed unimpeded and independently. 98 * 99 * When a change to the vmspace is required, the caller will put the vmspace in 100 * a 'hold' state, iterating over all associated vm_client instances, waiting 101 * for them to complete any in-flight lookup (indicated by VCS_ACTIVE) before 102 * setting VCS_HOLD in their state flag fields. With VCS_HOLD set, any call on 103 * the vm_client which would access the vmspace state (vmc_hold or vmc_fault) 104 * will block until the hold condition is cleared. Once the hold is asserted 105 * for all clients, the vmspace change can proceed with confidence. Upon 106 * completion of that operation, VCS_HOLD is cleared from the clients, and they 107 * are released to resume vmspace accesses. 108 * 109 * vCPU Consumers 110 * 111 * Access to the vmspace for vCPUs running in guest context is different from 112 * emulation-related vm_client activity: they solely rely on the contents of the 113 * page tables. Furthermore, the existing VCS_HOLD mechanism used to exclude 114 * client access is not feasible when entering guest context, since interrupts 115 * are disabled, making it impossible to block entry. This is not a concern as 116 * long as vmspace modifications never place the page tables in invalid states 117 * (either intermediate, or final). The vm_client hold mechanism does provide 118 * the means to IPI vCPU consumers which will trigger a notification once they 119 * report their exit from guest context. This can be used to ensure that page 120 * table modifications are made visible to those vCPUs within a certain 121 * time frame. 122 */ 123 124 typedef struct vmspace_mapping { 125 list_node_t vmsm_node; 126 vm_object_t *vmsm_object; /* object backing this mapping */ 127 uintptr_t vmsm_addr; /* start addr in vmspace for mapping */ 128 size_t vmsm_len; /* length (in bytes) of mapping */ 129 off_t vmsm_offset; /* byte offset into object */ 130 uint_t vmsm_prot; 131 } vmspace_mapping_t; 132 133 #define VMSM_OFFSET(vmsm, addr) ( \ 134 (vmsm)->vmsm_offset + \ 135 ((addr) - (uintptr_t)(vmsm)->vmsm_addr)) 136 137 typedef enum vm_client_state { 138 VCS_IDLE = 0, 139 /* currently accessing vmspace for client operation (hold or fault) */ 140 VCS_ACTIVE = (1 << 0), 141 /* client hold requested/asserted */ 142 VCS_HOLD = (1 << 1), 143 /* vCPU is accessing page tables in guest context */ 144 VCS_ON_CPU = (1 << 2), 145 /* client has been orphaned (no more access to vmspace) */ 146 VCS_ORPHANED = (1 << 3), 147 /* client undergoing destroy operation */ 148 VCS_DESTROY = (1 << 4), 149 } vm_client_state_t; 150 151 struct vmspace { 152 kmutex_t vms_lock; 153 kcondvar_t vms_cv; 154 bool vms_held; 155 uintptr_t vms_size; /* immutable after creation */ 156 157 /* (nested) page table state */ 158 vmm_gpt_t *vms_gpt; 159 uint64_t vms_pt_gen; 160 uint64_t vms_pages_mapped; 161 bool vms_track_dirty; 162 163 list_t vms_maplist; 164 list_t vms_clients; 165 }; 166 167 struct vm_client { 168 vmspace_t *vmc_space; 169 list_node_t vmc_node; 170 171 kmutex_t vmc_lock; 172 kcondvar_t vmc_cv; 173 vm_client_state_t vmc_state; 174 int vmc_cpu_active; 175 uint64_t vmc_cpu_gen; 176 bool vmc_track_dirty; 177 vmc_inval_cb_t vmc_inval_func; 178 void *vmc_inval_data; 179 180 list_t vmc_held_pages; 181 }; 182 183 typedef enum vm_object_type { 184 VMOT_NONE, 185 VMOT_MEM, 186 VMOT_MMIO, 187 } vm_object_type_t; 188 189 struct vm_object { 190 uint_t vmo_refcnt; /* manipulated with atomic ops */ 191 192 /* Fields below are fixed at creation time */ 193 vm_object_type_t vmo_type; 194 size_t vmo_size; 195 void *vmo_data; 196 uint8_t vmo_attr; 197 }; 198 199 /* Convenience consolidation of all flag(s) for validity checking */ 200 #define VPF_ALL (VPF_DEFER_DIRTY) 201 202 struct vm_page { 203 vm_client_t *vmp_client; 204 list_node_t vmp_node; 205 vm_page_t *vmp_chain; 206 uintptr_t vmp_gpa; 207 pfn_t vmp_pfn; 208 uint64_t *vmp_ptep; 209 vm_object_t *vmp_obj_ref; 210 uint8_t vmp_prot; 211 uint8_t vmp_flags; 212 }; 213 214 static vmspace_mapping_t *vm_mapping_find(vmspace_t *, uintptr_t, size_t); 215 static void vmspace_hold_enter(vmspace_t *); 216 static void vmspace_hold_exit(vmspace_t *, bool); 217 static void vmc_space_hold(vm_client_t *); 218 static void vmc_space_release(vm_client_t *, bool); 219 static void vmc_space_invalidate(vm_client_t *, uintptr_t, size_t, uint64_t); 220 static void vmc_space_unmap(vm_client_t *, uintptr_t, size_t, vm_object_t *); 221 static vm_client_t *vmc_space_orphan(vm_client_t *, vmspace_t *); 222 223 224 /* 225 * Create a new vmspace with a maximum address of `end`. 226 */ 227 vmspace_t * 228 vmspace_alloc(size_t end, vmm_pte_ops_t *pte_ops, bool track_dirty) 229 { 230 vmspace_t *vms; 231 const uintptr_t size = end + 1; 232 233 /* 234 * This whole mess is built on the assumption that a 64-bit address 235 * space is available to work with for the various pagetable tricks. 236 */ 237 VERIFY(size > 0 && (size & PAGEOFFSET) == 0 && 238 size <= (uintptr_t)USERLIMIT); 239 240 vms = kmem_zalloc(sizeof (*vms), KM_SLEEP); 241 vms->vms_size = size; 242 list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t), 243 offsetof(vmspace_mapping_t, vmsm_node)); 244 list_create(&vms->vms_clients, sizeof (vm_client_t), 245 offsetof(vm_client_t, vmc_node)); 246 247 vms->vms_gpt = vmm_gpt_alloc(pte_ops); 248 vms->vms_pt_gen = 1; 249 vms->vms_track_dirty = track_dirty; 250 251 return (vms); 252 } 253 254 /* 255 * Destroy a vmspace. All regions in the space must be unmapped. Any remaining 256 * clients will be orphaned. 257 */ 258 void 259 vmspace_destroy(vmspace_t *vms) 260 { 261 mutex_enter(&vms->vms_lock); 262 VERIFY(list_is_empty(&vms->vms_maplist)); 263 264 if (!list_is_empty(&vms->vms_clients)) { 265 vm_client_t *vmc = list_head(&vms->vms_clients); 266 while (vmc != NULL) { 267 vmc = vmc_space_orphan(vmc, vms); 268 } 269 /* 270 * Wait for any clients which were in the process of destroying 271 * themselves to disappear. 272 */ 273 while (!list_is_empty(&vms->vms_clients)) { 274 cv_wait(&vms->vms_cv, &vms->vms_lock); 275 } 276 } 277 VERIFY(list_is_empty(&vms->vms_clients)); 278 279 vmm_gpt_free(vms->vms_gpt); 280 mutex_exit(&vms->vms_lock); 281 282 mutex_destroy(&vms->vms_lock); 283 cv_destroy(&vms->vms_cv); 284 list_destroy(&vms->vms_maplist); 285 list_destroy(&vms->vms_clients); 286 287 kmem_free(vms, sizeof (*vms)); 288 } 289 290 /* 291 * Retrieve the count of resident (mapped into the page tables) pages. 292 */ 293 uint64_t 294 vmspace_resident_count(vmspace_t *vms) 295 { 296 return (vms->vms_pages_mapped); 297 } 298 299 int 300 vmspace_track_dirty(vmspace_t *vms, uint64_t gpa, size_t len, uint8_t *bitmap) 301 { 302 if (!vms->vms_track_dirty) 303 return (EPERM); 304 305 /* 306 * Accumulate dirty bits into the given bit vector. Note that this 307 * races both against hardware writes from running vCPUs and 308 * reflections from userspace. 309 * 310 * Called from a userspace-visible ioctl, this depends on the VM 311 * instance being read-locked to prevent vmspace_map/vmspace_unmap 312 * operations from changing the page tables during the walk. 313 */ 314 for (size_t offset = 0; offset < len; offset += PAGESIZE) { 315 bool bit = false; 316 uint64_t *entry = vmm_gpt_lookup(vms->vms_gpt, gpa + offset); 317 if (entry != NULL) 318 bit = vmm_gpt_reset_dirty(vms->vms_gpt, entry, false); 319 uint64_t pfn_offset = offset >> PAGESHIFT; 320 size_t bit_offset = pfn_offset / 8; 321 size_t bit_index = pfn_offset % 8; 322 bitmap[bit_offset] |= (bit << bit_index); 323 } 324 325 /* 326 * Now invalidate those bits and shoot down address spaces that 327 * may have them cached. 328 */ 329 vmspace_hold_enter(vms); 330 vms->vms_pt_gen++; 331 for (vm_client_t *vmc = list_head(&vms->vms_clients); 332 vmc != NULL; 333 vmc = list_next(&vms->vms_clients, vmc)) { 334 vmc_space_invalidate(vmc, gpa, len, vms->vms_pt_gen); 335 } 336 vmspace_hold_exit(vms, true); 337 338 return (0); 339 } 340 341 static pfn_t 342 vm_object_pager_reservoir(vm_object_t *vmo, uintptr_t off) 343 { 344 vmmr_region_t *region; 345 pfn_t pfn; 346 347 ASSERT3U(vmo->vmo_type, ==, VMOT_MEM); 348 349 region = vmo->vmo_data; 350 pfn = vmmr_region_pfn_at(region, off); 351 352 return (pfn); 353 } 354 355 static pfn_t 356 vm_object_pager_mmio(vm_object_t *vmo, uintptr_t off) 357 { 358 pfn_t pfn; 359 360 ASSERT3U(vmo->vmo_type, ==, VMOT_MMIO); 361 ASSERT3P(vmo->vmo_data, !=, NULL); 362 ASSERT3U(off, <, vmo->vmo_size); 363 364 pfn = ((uintptr_t)vmo->vmo_data + off) >> PAGESHIFT; 365 366 return (pfn); 367 } 368 369 /* 370 * Allocate a VM object backed by VMM reservoir memory. 371 */ 372 vm_object_t * 373 vm_object_mem_allocate(size_t size, bool transient) 374 { 375 int err; 376 vmmr_region_t *region = NULL; 377 vm_object_t *vmo; 378 379 ASSERT3U(size, !=, 0); 380 ASSERT3U(size & PAGEOFFSET, ==, 0); 381 382 err = vmmr_alloc(size, transient, ®ion); 383 if (err != 0) { 384 return (NULL); 385 } 386 387 vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP); 388 389 /* For now, these are to stay fixed after allocation */ 390 vmo->vmo_type = VMOT_MEM; 391 vmo->vmo_size = size; 392 vmo->vmo_attr = MTRR_TYPE_WB; 393 vmo->vmo_data = region; 394 vmo->vmo_refcnt = 1; 395 396 return (vmo); 397 } 398 399 static vm_object_t * 400 vm_object_mmio_allocate(size_t size, uintptr_t hpa) 401 { 402 vm_object_t *vmo; 403 404 ASSERT3U(size, !=, 0); 405 ASSERT3U(size & PAGEOFFSET, ==, 0); 406 ASSERT3U(hpa & PAGEOFFSET, ==, 0); 407 408 vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP); 409 410 /* For now, these are to stay fixed after allocation */ 411 vmo->vmo_type = VMOT_MMIO; 412 vmo->vmo_size = size; 413 vmo->vmo_attr = MTRR_TYPE_UC; 414 vmo->vmo_data = (void *)hpa; 415 vmo->vmo_refcnt = 1; 416 417 return (vmo); 418 } 419 420 /* 421 * Allocate a VM object backed by an existing range of physical memory. 422 */ 423 vm_object_t * 424 vmm_mmio_alloc(vmspace_t *vmspace, uintptr_t gpa, size_t len, uintptr_t hpa) 425 { 426 int error; 427 vm_object_t *obj; 428 429 obj = vm_object_mmio_allocate(len, hpa); 430 if (obj != NULL) { 431 error = vmspace_map(vmspace, obj, 0, gpa, len, 432 PROT_READ | PROT_WRITE); 433 if (error != 0) { 434 vm_object_release(obj); 435 obj = NULL; 436 } 437 } 438 439 return (obj); 440 } 441 442 /* 443 * Release a vm_object reference 444 */ 445 void 446 vm_object_release(vm_object_t *vmo) 447 { 448 ASSERT(vmo != NULL); 449 450 uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt); 451 /* underflow would be a deadly serious mistake */ 452 VERIFY3U(ref, !=, UINT_MAX); 453 if (ref != 0) { 454 return; 455 } 456 457 switch (vmo->vmo_type) { 458 case VMOT_MEM: 459 vmmr_free((vmmr_region_t *)vmo->vmo_data); 460 break; 461 case VMOT_MMIO: 462 break; 463 default: 464 panic("unexpected object type %u", vmo->vmo_type); 465 break; 466 } 467 468 vmo->vmo_data = NULL; 469 vmo->vmo_size = 0; 470 kmem_free(vmo, sizeof (*vmo)); 471 } 472 473 /* 474 * Increase refcount for vm_object reference 475 */ 476 void 477 vm_object_reference(vm_object_t *vmo) 478 { 479 ASSERT(vmo != NULL); 480 481 uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt); 482 /* overflow would be a deadly serious mistake */ 483 VERIFY3U(ref, !=, 0); 484 } 485 486 /* 487 * Get the host-physical PFN for a given offset into a vm_object. 488 * 489 * The provided `off` must be within the allocated size of the vm_object. 490 */ 491 pfn_t 492 vm_object_pfn(vm_object_t *vmo, uintptr_t off) 493 { 494 const uintptr_t aligned_off = off & PAGEMASK; 495 496 switch (vmo->vmo_type) { 497 case VMOT_MEM: 498 return (vm_object_pager_reservoir(vmo, aligned_off)); 499 case VMOT_MMIO: 500 return (vm_object_pager_mmio(vmo, aligned_off)); 501 case VMOT_NONE: 502 break; 503 } 504 panic("unexpected object type %u", vmo->vmo_type); 505 } 506 507 static vmspace_mapping_t * 508 vm_mapping_find(vmspace_t *vms, uintptr_t addr, size_t size) 509 { 510 vmspace_mapping_t *vmsm; 511 list_t *ml = &vms->vms_maplist; 512 const uintptr_t range_end = addr + size; 513 514 ASSERT3U(addr, <=, range_end); 515 516 if (addr >= vms->vms_size) { 517 return (NULL); 518 } 519 for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) { 520 const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len; 521 522 if (addr >= vmsm->vmsm_addr && addr < seg_end) { 523 if (range_end <= seg_end) { 524 return (vmsm); 525 } else { 526 return (NULL); 527 } 528 } 529 } 530 return (NULL); 531 } 532 533 /* 534 * Check to see if any mappings reside within [addr, addr + size) span in the 535 * vmspace, returning true if that span is indeed empty. 536 */ 537 static bool 538 vm_mapping_gap(vmspace_t *vms, uintptr_t addr, size_t size) 539 { 540 vmspace_mapping_t *vmsm; 541 list_t *ml = &vms->vms_maplist; 542 const uintptr_t range_end = addr + size - 1; 543 544 ASSERT(MUTEX_HELD(&vms->vms_lock)); 545 ASSERT(size > 0); 546 547 for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) { 548 const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len - 1; 549 550 /* 551 * The two ranges do not overlap if the start of either of 552 * them is after the end of the other. 553 */ 554 if (vmsm->vmsm_addr > range_end || addr > seg_end) 555 continue; 556 return (false); 557 } 558 return (true); 559 } 560 561 static void 562 vm_mapping_remove(vmspace_t *vms, vmspace_mapping_t *vmsm) 563 { 564 list_t *ml = &vms->vms_maplist; 565 566 ASSERT(MUTEX_HELD(&vms->vms_lock)); 567 ASSERT(vms->vms_held); 568 569 list_remove(ml, vmsm); 570 vm_object_release(vmsm->vmsm_object); 571 kmem_free(vmsm, sizeof (*vmsm)); 572 } 573 574 /* 575 * Enter a hold state on the vmspace. This ensures that all VM clients 576 * associated with the vmspace are excluded from establishing new page holds, 577 * or any other actions which would require accessing vmspace state subject to 578 * potential change. 579 * 580 * Returns with vmspace_t`vms_lock held. 581 */ 582 static void 583 vmspace_hold_enter(vmspace_t *vms) 584 { 585 mutex_enter(&vms->vms_lock); 586 VERIFY(!vms->vms_held); 587 588 vm_client_t *vmc = list_head(&vms->vms_clients); 589 for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) { 590 vmc_space_hold(vmc); 591 } 592 vms->vms_held = true; 593 } 594 595 /* 596 * Exit a hold state on the vmspace. This releases all VM clients associated 597 * with the vmspace to be able to establish new page holds, and partake in other 598 * actions which require accessing changed vmspace state. If `kick_on_cpu` is 599 * true, then any CPUs actively using the page tables will be IPIed, and the 600 * call will block until they have acknowledged being ready to use the latest 601 * state of the tables. 602 * 603 * Requires vmspace_t`vms_lock be held, which is released as part of the call. 604 */ 605 static void 606 vmspace_hold_exit(vmspace_t *vms, bool kick_on_cpu) 607 { 608 ASSERT(MUTEX_HELD(&vms->vms_lock)); 609 VERIFY(vms->vms_held); 610 611 vm_client_t *vmc = list_head(&vms->vms_clients); 612 for (; vmc != NULL; vmc = list_next(&vms->vms_clients, vmc)) { 613 vmc_space_release(vmc, kick_on_cpu); 614 } 615 vms->vms_held = false; 616 mutex_exit(&vms->vms_lock); 617 } 618 619 /* 620 * Attempt to map a vm_object span into the vmspace. 621 * 622 * Requirements: 623 * - `obj_off`, `addr`, and `len` must be page-aligned 624 * - `obj_off` cannot be greater than the allocated size of the object 625 * - [`obj_off`, `obj_off` + `len`) span cannot extend beyond the allocated 626 * size of the object 627 * - [`addr`, `addr` + `len`) span cannot reside beyond the maximum address 628 * of the vmspace 629 */ 630 int 631 vmspace_map(vmspace_t *vms, vm_object_t *vmo, uintptr_t obj_off, uintptr_t addr, 632 size_t len, uint8_t prot) 633 { 634 vmspace_mapping_t *vmsm; 635 int res = 0; 636 637 if (len == 0 || (addr + len) < addr || 638 obj_off >= (obj_off + len) || vmo->vmo_size < (obj_off + len)) { 639 return (EINVAL); 640 } 641 if ((addr + len) >= vms->vms_size) { 642 return (ENOMEM); 643 } 644 645 vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP); 646 647 vmspace_hold_enter(vms); 648 if (!vm_mapping_gap(vms, addr, len)) { 649 kmem_free(vmsm, sizeof (*vmsm)); 650 res = ENOMEM; 651 } else { 652 vmsm->vmsm_object = vmo; 653 vmsm->vmsm_addr = addr; 654 vmsm->vmsm_len = len; 655 vmsm->vmsm_offset = (off_t)obj_off; 656 vmsm->vmsm_prot = prot; 657 list_insert_tail(&vms->vms_maplist, vmsm); 658 659 /* 660 * Make sure the GPT has tables ready for leaf entries across 661 * the entire new mapping. 662 */ 663 vmm_gpt_populate_region(vms->vms_gpt, addr, len); 664 } 665 vmspace_hold_exit(vms, false); 666 return (res); 667 } 668 669 /* 670 * Unmap a region of the vmspace. 671 * 672 * Presently the [start, end) span must equal a region previously mapped by a 673 * call to vmspace_map(). 674 */ 675 int 676 vmspace_unmap(vmspace_t *vms, uintptr_t addr, uintptr_t len) 677 { 678 const uintptr_t end = addr + len; 679 vmspace_mapping_t *vmsm; 680 vm_client_t *vmc; 681 uint64_t gen = 0; 682 683 ASSERT3U(addr, <, end); 684 685 vmspace_hold_enter(vms); 686 /* expect to match existing mapping exactly */ 687 if ((vmsm = vm_mapping_find(vms, addr, len)) == NULL || 688 vmsm->vmsm_addr != addr || vmsm->vmsm_len != len) { 689 vmspace_hold_exit(vms, false); 690 return (ENOENT); 691 } 692 693 /* Prepare clients (and their held pages) for the unmap. */ 694 for (vmc = list_head(&vms->vms_clients); vmc != NULL; 695 vmc = list_next(&vms->vms_clients, vmc)) { 696 vmc_space_unmap(vmc, addr, len, vmsm->vmsm_object); 697 } 698 699 /* Clear all PTEs for region */ 700 if (vmm_gpt_unmap_region(vms->vms_gpt, addr, len) != 0) { 701 vms->vms_pt_gen++; 702 gen = vms->vms_pt_gen; 703 } 704 /* ... and the intermediate (directory) PTEs as well */ 705 vmm_gpt_vacate_region(vms->vms_gpt, addr, len); 706 707 /* 708 * If pages were actually unmapped from the GPT, provide clients with 709 * an invalidation notice. 710 */ 711 if (gen != 0) { 712 for (vmc = list_head(&vms->vms_clients); vmc != NULL; 713 vmc = list_next(&vms->vms_clients, vmc)) { 714 vmc_space_invalidate(vmc, addr, len, vms->vms_pt_gen); 715 } 716 } 717 718 vm_mapping_remove(vms, vmsm); 719 vmspace_hold_exit(vms, true); 720 return (0); 721 } 722 723 static int 724 vmspace_lookup_map(vmspace_t *vms, uintptr_t gpa, int req_prot, pfn_t *pfnp, 725 uint64_t **ptepp) 726 { 727 vmm_gpt_t *gpt = vms->vms_gpt; 728 uint64_t *entries[MAX_GPT_LEVEL], *leaf; 729 pfn_t pfn = PFN_INVALID; 730 uint_t prot; 731 732 ASSERT0(gpa & PAGEOFFSET); 733 ASSERT((req_prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) != PROT_NONE); 734 735 vmm_gpt_walk(gpt, gpa, entries, MAX_GPT_LEVEL); 736 leaf = entries[LEVEL1]; 737 if (leaf == NULL) { 738 /* 739 * Since we populated the intermediate tables for any regions 740 * mapped in the GPT, an empty leaf entry indicates there is no 741 * mapping, populated or not, at this GPT. 742 */ 743 return (FC_NOMAP); 744 } 745 746 if (vmm_gpt_is_mapped(gpt, leaf, &pfn, &prot)) { 747 if ((req_prot & prot) != req_prot) { 748 return (FC_PROT); 749 } 750 } else { 751 vmspace_mapping_t *vmsm; 752 vm_object_t *vmo; 753 754 vmsm = vm_mapping_find(vms, gpa, PAGESIZE); 755 if (vmsm == NULL) { 756 return (FC_NOMAP); 757 } 758 759 if ((req_prot & vmsm->vmsm_prot) != req_prot) { 760 return (FC_PROT); 761 } 762 vmo = vmsm->vmsm_object; 763 pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa)); 764 VERIFY(pfn != PFN_INVALID); 765 766 if (vmm_gpt_map_at(gpt, leaf, pfn, vmsm->vmsm_prot, 767 vmo->vmo_attr)) { 768 atomic_inc_64(&vms->vms_pages_mapped); 769 } 770 } 771 772 ASSERT(pfn != PFN_INVALID && leaf != NULL); 773 if (pfnp != NULL) { 774 *pfnp = pfn; 775 } 776 if (ptepp != NULL) { 777 *ptepp = leaf; 778 } 779 return (0); 780 } 781 782 /* 783 * Populate (make resident in the page tables) a region of the vmspace. 784 * 785 * Presently the [start, end) span must equal a region previously mapped by a 786 * call to vmspace_map(). 787 */ 788 int 789 vmspace_populate(vmspace_t *vms, uintptr_t addr, uintptr_t len) 790 { 791 vmspace_mapping_t *vmsm; 792 mutex_enter(&vms->vms_lock); 793 794 /* For the time being, only exact-match mappings are expected */ 795 if ((vmsm = vm_mapping_find(vms, addr, len)) == NULL) { 796 mutex_exit(&vms->vms_lock); 797 return (FC_NOMAP); 798 } 799 800 vm_object_t *vmo = vmsm->vmsm_object; 801 const int prot = vmsm->vmsm_prot; 802 const uint8_t attr = vmo->vmo_attr; 803 size_t populated = 0; 804 const size_t end = addr + len; 805 for (uintptr_t gpa = addr & PAGEMASK; gpa < end; gpa += PAGESIZE) { 806 const pfn_t pfn = vm_object_pfn(vmo, VMSM_OFFSET(vmsm, gpa)); 807 VERIFY(pfn != PFN_INVALID); 808 809 if (vmm_gpt_map(vms->vms_gpt, gpa, pfn, prot, attr)) { 810 populated++; 811 } 812 } 813 atomic_add_64(&vms->vms_pages_mapped, populated); 814 815 mutex_exit(&vms->vms_lock); 816 return (0); 817 } 818 819 /* 820 * Allocate a client from a given vmspace. 821 */ 822 vm_client_t * 823 vmspace_client_alloc(vmspace_t *vms) 824 { 825 vm_client_t *vmc; 826 827 vmc = kmem_zalloc(sizeof (vm_client_t), KM_SLEEP); 828 vmc->vmc_space = vms; 829 mutex_init(&vmc->vmc_lock, NULL, MUTEX_DRIVER, NULL); 830 cv_init(&vmc->vmc_cv, NULL, CV_DRIVER, NULL); 831 vmc->vmc_state = VCS_IDLE; 832 vmc->vmc_cpu_active = -1; 833 list_create(&vmc->vmc_held_pages, sizeof (vm_page_t), 834 offsetof(vm_page_t, vmp_node)); 835 vmc->vmc_track_dirty = vms->vms_track_dirty; 836 837 mutex_enter(&vms->vms_lock); 838 list_insert_tail(&vms->vms_clients, vmc); 839 mutex_exit(&vms->vms_lock); 840 841 return (vmc); 842 } 843 844 /* 845 * Get the nested page table root pointer (EPTP/NCR3) value. 846 */ 847 uint64_t 848 vmspace_table_root(vmspace_t *vms) 849 { 850 return (vmm_gpt_get_pmtp(vms->vms_gpt, vms->vms_track_dirty)); 851 } 852 853 /* 854 * Get the current generation number of the nested page table. 855 */ 856 uint64_t 857 vmspace_table_gen(vmspace_t *vms) 858 { 859 return (vms->vms_pt_gen); 860 } 861 862 /* 863 * Mark a vm_client as active. This will block if/while the client is held by 864 * the vmspace. On success, it returns with vm_client_t`vmc_lock held. It will 865 * fail if the vm_client has been orphaned. 866 */ 867 static int 868 vmc_activate(vm_client_t *vmc) 869 { 870 mutex_enter(&vmc->vmc_lock); 871 VERIFY0(vmc->vmc_state & VCS_ACTIVE); 872 if ((vmc->vmc_state & VCS_ORPHANED) != 0) { 873 mutex_exit(&vmc->vmc_lock); 874 return (ENXIO); 875 } 876 while ((vmc->vmc_state & VCS_HOLD) != 0) { 877 cv_wait(&vmc->vmc_cv, &vmc->vmc_lock); 878 } 879 vmc->vmc_state |= VCS_ACTIVE; 880 return (0); 881 } 882 883 /* 884 * Mark a vm_client as no longer active. It must be called with 885 * vm_client_t`vmc_lock already held, and will return with it released. 886 */ 887 static void 888 vmc_deactivate(vm_client_t *vmc) 889 { 890 ASSERT(MUTEX_HELD(&vmc->vmc_lock)); 891 VERIFY(vmc->vmc_state & VCS_ACTIVE); 892 893 vmc->vmc_state ^= VCS_ACTIVE; 894 if ((vmc->vmc_state & VCS_HOLD) != 0) { 895 cv_broadcast(&vmc->vmc_cv); 896 } 897 mutex_exit(&vmc->vmc_lock); 898 } 899 900 /* 901 * Indicate that a CPU will be utilizing the nested page tables through this VM 902 * client. Interrupts (and/or the GIF) are expected to be disabled when calling 903 * this function. Returns the generation number of the nested page table (to be 904 * used for TLB invalidations). 905 */ 906 uint64_t 907 vmc_table_enter(vm_client_t *vmc) 908 { 909 vmspace_t *vms = vmc->vmc_space; 910 uint64_t gen; 911 912 ASSERT0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU)); 913 ASSERT3S(vmc->vmc_cpu_active, ==, -1); 914 915 /* 916 * Since the NPT activation occurs with interrupts disabled, this must 917 * be done without taking vmc_lock like normal. 918 */ 919 gen = vms->vms_pt_gen; 920 vmc->vmc_cpu_active = CPU->cpu_id; 921 vmc->vmc_cpu_gen = gen; 922 atomic_or_uint(&vmc->vmc_state, VCS_ON_CPU); 923 924 return (gen); 925 } 926 927 /* 928 * Indicate that this VM client is not longer (directly) using the underlying 929 * page tables. Interrupts (and/or the GIF) must be enabled prior to calling 930 * this function. 931 */ 932 void 933 vmc_table_exit(vm_client_t *vmc) 934 { 935 mutex_enter(&vmc->vmc_lock); 936 937 ASSERT(vmc->vmc_state & VCS_ON_CPU); 938 vmc->vmc_state ^= VCS_ON_CPU; 939 vmc->vmc_cpu_active = -1; 940 if ((vmc->vmc_state & VCS_HOLD) != 0) { 941 cv_broadcast(&vmc->vmc_cv); 942 } 943 944 mutex_exit(&vmc->vmc_lock); 945 } 946 947 static void 948 vmc_space_hold(vm_client_t *vmc) 949 { 950 mutex_enter(&vmc->vmc_lock); 951 VERIFY0(vmc->vmc_state & VCS_HOLD); 952 953 /* 954 * Because vmc_table_enter() alters vmc_state from a context where 955 * interrupts are disabled, it cannot pay heed to vmc_lock, so setting 956 * VMC_HOLD must be done atomically here. 957 */ 958 atomic_or_uint(&vmc->vmc_state, VCS_HOLD); 959 960 /* Wait for client to go inactive */ 961 while ((vmc->vmc_state & VCS_ACTIVE) != 0) { 962 cv_wait(&vmc->vmc_cv, &vmc->vmc_lock); 963 } 964 mutex_exit(&vmc->vmc_lock); 965 } 966 967 static void 968 vmc_space_release(vm_client_t *vmc, bool kick_on_cpu) 969 { 970 mutex_enter(&vmc->vmc_lock); 971 VERIFY(vmc->vmc_state & VCS_HOLD); 972 973 if (kick_on_cpu && (vmc->vmc_state & VCS_ON_CPU) != 0) { 974 poke_cpu(vmc->vmc_cpu_active); 975 976 while ((vmc->vmc_state & VCS_ON_CPU) != 0) { 977 cv_wait(&vmc->vmc_cv, &vmc->vmc_lock); 978 } 979 } 980 981 /* 982 * Because vmc_table_enter() alters vmc_state from a context where 983 * interrupts are disabled, it cannot pay heed to vmc_lock, so clearing 984 * VMC_HOLD must be done atomically here. 985 */ 986 atomic_and_uint(&vmc->vmc_state, ~VCS_HOLD); 987 cv_broadcast(&vmc->vmc_cv); 988 mutex_exit(&vmc->vmc_lock); 989 } 990 991 static void 992 vmc_space_invalidate(vm_client_t *vmc, uintptr_t addr, size_t size, 993 uint64_t gen) 994 { 995 mutex_enter(&vmc->vmc_lock); 996 VERIFY(vmc->vmc_state & VCS_HOLD); 997 if ((vmc->vmc_state & VCS_ON_CPU) != 0) { 998 /* 999 * Wait for clients using an old generation of the page tables 1000 * to exit guest context, where they subsequently flush the TLB 1001 * for the new generation. 1002 */ 1003 if (vmc->vmc_cpu_gen < gen) { 1004 poke_cpu(vmc->vmc_cpu_active); 1005 1006 while ((vmc->vmc_state & VCS_ON_CPU) != 0) { 1007 cv_wait(&vmc->vmc_cv, &vmc->vmc_lock); 1008 } 1009 } 1010 } 1011 if (vmc->vmc_inval_func != NULL) { 1012 vmc_inval_cb_t func = vmc->vmc_inval_func; 1013 void *data = vmc->vmc_inval_data; 1014 1015 /* 1016 * Perform the actual invalidation call outside vmc_lock to 1017 * avoid lock ordering issues in the consumer. Since the client 1018 * is under VCS_HOLD, this is safe. 1019 */ 1020 mutex_exit(&vmc->vmc_lock); 1021 func(data, addr, size); 1022 mutex_enter(&vmc->vmc_lock); 1023 } 1024 mutex_exit(&vmc->vmc_lock); 1025 } 1026 1027 static void 1028 vmc_space_unmap(vm_client_t *vmc, uintptr_t addr, size_t size, 1029 vm_object_t *vmo) 1030 { 1031 mutex_enter(&vmc->vmc_lock); 1032 VERIFY(vmc->vmc_state & VCS_HOLD); 1033 1034 /* 1035 * With the current vCPU exclusion invariants in place, we do not expect 1036 * a vCPU to be in guest context during an unmap. 1037 */ 1038 VERIFY0(vmc->vmc_state & VCS_ON_CPU); 1039 1040 /* 1041 * Any holds against the unmapped region need to establish their own 1042 * reference to the underlying object to avoid a potential 1043 * use-after-free. 1044 */ 1045 for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages); 1046 vmp != NULL; 1047 vmp = list_next(&vmc->vmc_held_pages, vmc)) { 1048 if (vmp->vmp_gpa < addr || 1049 vmp->vmp_gpa >= (addr + size)) { 1050 /* Hold outside region in question */ 1051 continue; 1052 } 1053 if (vmp->vmp_obj_ref == NULL) { 1054 vm_object_reference(vmo); 1055 vmp->vmp_obj_ref = vmo; 1056 /* For an unmapped region, PTE is now meaningless */ 1057 vmp->vmp_ptep = NULL; 1058 } else { 1059 /* 1060 * Object could have gone through cycle of 1061 * unmap-map-unmap before the hold was released. 1062 */ 1063 VERIFY3P(vmp->vmp_ptep, ==, NULL); 1064 } 1065 } 1066 mutex_exit(&vmc->vmc_lock); 1067 } 1068 1069 static vm_client_t * 1070 vmc_space_orphan(vm_client_t *vmc, vmspace_t *vms) 1071 { 1072 vm_client_t *next; 1073 1074 ASSERT(MUTEX_HELD(&vms->vms_lock)); 1075 1076 mutex_enter(&vmc->vmc_lock); 1077 VERIFY3P(vmc->vmc_space, ==, vms); 1078 VERIFY0(vmc->vmc_state & VCS_ORPHANED); 1079 if (vmc->vmc_state & VCS_DESTROY) { 1080 /* 1081 * This vm_client is currently undergoing destruction, so it 1082 * does not need to be orphaned. Let it proceed with its own 1083 * clean-up task. 1084 */ 1085 next = list_next(&vms->vms_clients, vmc); 1086 } else { 1087 /* 1088 * Clients are only orphaned when the containing vmspace is 1089 * being torn down. All mappings from the vmspace should 1090 * already be gone, meaning any remaining held pages should have 1091 * direct references to the object. 1092 */ 1093 for (vm_page_t *vmp = list_head(&vmc->vmc_held_pages); 1094 vmp != NULL; 1095 vmp = list_next(&vmc->vmc_held_pages, vmp)) { 1096 ASSERT3P(vmp->vmp_ptep, ==, NULL); 1097 ASSERT3P(vmp->vmp_obj_ref, !=, NULL); 1098 } 1099 1100 /* 1101 * After this point, the client will be orphaned, unable to 1102 * establish new page holds (or access any vmspace-related 1103 * resources) and is in charge of cleaning up after itself. 1104 */ 1105 vmc->vmc_state |= VCS_ORPHANED; 1106 next = list_next(&vms->vms_clients, vmc); 1107 list_remove(&vms->vms_clients, vmc); 1108 vmc->vmc_space = NULL; 1109 } 1110 mutex_exit(&vmc->vmc_lock); 1111 return (next); 1112 } 1113 1114 /* 1115 * Attempt to hold a page at `gpa` inside the referenced vmspace. 1116 */ 1117 vm_page_t * 1118 vmc_hold_ext(vm_client_t *vmc, uintptr_t gpa, int prot, int flags) 1119 { 1120 vmspace_t *vms = vmc->vmc_space; 1121 vm_page_t *vmp; 1122 pfn_t pfn = PFN_INVALID; 1123 uint64_t *ptep = NULL; 1124 1125 ASSERT0(gpa & PAGEOFFSET); 1126 ASSERT((prot & (PROT_READ | PROT_WRITE)) != PROT_NONE); 1127 ASSERT0(prot & ~PROT_ALL); 1128 ASSERT0(flags & ~VPF_ALL); 1129 1130 vmp = kmem_alloc(sizeof (*vmp), KM_SLEEP); 1131 if (vmc_activate(vmc) != 0) { 1132 kmem_free(vmp, sizeof (*vmp)); 1133 return (NULL); 1134 } 1135 1136 if (vmspace_lookup_map(vms, gpa, prot, &pfn, &ptep) != 0) { 1137 vmc_deactivate(vmc); 1138 kmem_free(vmp, sizeof (*vmp)); 1139 return (NULL); 1140 } 1141 ASSERT(pfn != PFN_INVALID && ptep != NULL); 1142 1143 vmp->vmp_client = vmc; 1144 vmp->vmp_chain = NULL; 1145 vmp->vmp_gpa = gpa; 1146 vmp->vmp_pfn = pfn; 1147 vmp->vmp_ptep = ptep; 1148 vmp->vmp_obj_ref = NULL; 1149 vmp->vmp_prot = (uint8_t)prot; 1150 vmp->vmp_flags = (uint8_t)flags; 1151 list_insert_tail(&vmc->vmc_held_pages, vmp); 1152 vmc_deactivate(vmc); 1153 1154 return (vmp); 1155 } 1156 1157 /* 1158 * Attempt to hold a page at `gpa` inside the referenced vmspace. 1159 */ 1160 vm_page_t * 1161 vmc_hold(vm_client_t *vmc, uintptr_t gpa, int prot) 1162 { 1163 return (vmc_hold_ext(vmc, gpa, prot, VPF_DEFAULT)); 1164 } 1165 1166 int 1167 vmc_fault(vm_client_t *vmc, uintptr_t gpa, int prot) 1168 { 1169 vmspace_t *vms = vmc->vmc_space; 1170 int err; 1171 1172 err = vmc_activate(vmc); 1173 if (err == 0) { 1174 err = vmspace_lookup_map(vms, gpa & PAGEMASK, prot, NULL, NULL); 1175 vmc_deactivate(vmc); 1176 } 1177 1178 return (err); 1179 } 1180 1181 /* 1182 * Allocate an additional vm_client_t, based on an existing one. Only the 1183 * associatation with the vmspace is cloned, not existing holds or any 1184 * configured invalidation function. 1185 */ 1186 vm_client_t * 1187 vmc_clone(vm_client_t *vmc) 1188 { 1189 vmspace_t *vms = vmc->vmc_space; 1190 1191 return (vmspace_client_alloc(vms)); 1192 } 1193 1194 /* 1195 * Register a function (and associated data pointer) to be called when an 1196 * address range in the vmspace is invalidated. 1197 */ 1198 int 1199 vmc_set_inval_cb(vm_client_t *vmc, vmc_inval_cb_t func, void *data) 1200 { 1201 int err; 1202 1203 err = vmc_activate(vmc); 1204 if (err == 0) { 1205 vmc->vmc_inval_func = func; 1206 vmc->vmc_inval_data = data; 1207 vmc_deactivate(vmc); 1208 } 1209 1210 return (err); 1211 } 1212 1213 /* 1214 * Destroy a vm_client_t instance. 1215 * 1216 * No pages held through this vm_client_t may be outstanding when performing a 1217 * vmc_destroy(). For vCPU clients, the client cannot be on-CPU (a call to 1218 * vmc_table_exit() has been made). 1219 */ 1220 void 1221 vmc_destroy(vm_client_t *vmc) 1222 { 1223 mutex_enter(&vmc->vmc_lock); 1224 1225 VERIFY(list_is_empty(&vmc->vmc_held_pages)); 1226 VERIFY0(vmc->vmc_state & (VCS_ACTIVE | VCS_ON_CPU)); 1227 1228 if ((vmc->vmc_state & VCS_ORPHANED) == 0) { 1229 vmspace_t *vms; 1230 1231 /* 1232 * Deassociation with the parent vmspace must be done carefully: 1233 * The vmspace could attempt to orphan this vm_client while we 1234 * release vmc_lock in order to take vms_lock (the required 1235 * order). The client is marked to indicate that destruction is 1236 * under way. Doing so prevents any racing orphan operation 1237 * from applying to this client, allowing us to deassociate from 1238 * the vmspace safely. 1239 */ 1240 vmc->vmc_state |= VCS_DESTROY; 1241 vms = vmc->vmc_space; 1242 mutex_exit(&vmc->vmc_lock); 1243 1244 mutex_enter(&vms->vms_lock); 1245 mutex_enter(&vmc->vmc_lock); 1246 list_remove(&vms->vms_clients, vmc); 1247 /* 1248 * If the vmspace began its own destruction operation while we 1249 * were navigating the locks, be sure to notify it about this 1250 * vm_client being deassociated. 1251 */ 1252 cv_signal(&vms->vms_cv); 1253 mutex_exit(&vmc->vmc_lock); 1254 mutex_exit(&vms->vms_lock); 1255 } else { 1256 VERIFY3P(vmc->vmc_space, ==, NULL); 1257 mutex_exit(&vmc->vmc_lock); 1258 } 1259 1260 mutex_destroy(&vmc->vmc_lock); 1261 cv_destroy(&vmc->vmc_cv); 1262 list_destroy(&vmc->vmc_held_pages); 1263 1264 kmem_free(vmc, sizeof (*vmc)); 1265 } 1266 1267 static __inline void * 1268 vmp_ptr(const vm_page_t *vmp) 1269 { 1270 ASSERT3U(vmp->vmp_pfn, !=, PFN_INVALID); 1271 1272 const uintptr_t paddr = (vmp->vmp_pfn << PAGESHIFT); 1273 return ((void *)((uintptr_t)kpm_vbase + paddr)); 1274 } 1275 1276 /* 1277 * Get a readable kernel-virtual pointer for a held page. 1278 * 1279 * Only legal to call if PROT_READ was specified in `prot` for the vmc_hold() 1280 * call to acquire this page reference. 1281 */ 1282 const void * 1283 vmp_get_readable(const vm_page_t *vmp) 1284 { 1285 ASSERT(vmp->vmp_prot & PROT_READ); 1286 1287 return (vmp_ptr(vmp)); 1288 } 1289 1290 /* 1291 * Get a writable kernel-virtual pointer for a held page. 1292 * 1293 * Only legal to call if PROT_WRITE was specified in `prot` for the vmc_hold() 1294 * call to acquire this page reference. 1295 */ 1296 void * 1297 vmp_get_writable(const vm_page_t *vmp) 1298 { 1299 ASSERT(vmp->vmp_prot & PROT_WRITE); 1300 1301 return (vmp_ptr(vmp)); 1302 } 1303 1304 /* 1305 * Get the host-physical PFN for a held page. 1306 */ 1307 pfn_t 1308 vmp_get_pfn(const vm_page_t *vmp) 1309 { 1310 return (vmp->vmp_pfn); 1311 } 1312 1313 /* 1314 * If this page was deferring dirty-marking in the corresponding vmspace page 1315 * tables, clear such a state so it is considered dirty from now on. 1316 */ 1317 void 1318 vmp_mark_dirty(vm_page_t *vmp) 1319 { 1320 ASSERT((vmp->vmp_prot & PROT_WRITE) != 0); 1321 1322 atomic_and_8(&vmp->vmp_flags, ~VPF_DEFER_DIRTY); 1323 } 1324 1325 /* 1326 * Store a pointer to `to_chain` in the page-chaining slot of `vmp`. 1327 */ 1328 void 1329 vmp_chain(vm_page_t *vmp, vm_page_t *to_chain) 1330 { 1331 ASSERT3P(vmp->vmp_chain, ==, NULL); 1332 1333 vmp->vmp_chain = to_chain; 1334 } 1335 1336 /* 1337 * Retrieve the pointer from the page-chaining in `vmp`. 1338 */ 1339 vm_page_t * 1340 vmp_next(const vm_page_t *vmp) 1341 { 1342 return (vmp->vmp_chain); 1343 } 1344 1345 static __inline bool 1346 vmp_release_inner(vm_page_t *vmp, vm_client_t *vmc) 1347 { 1348 ASSERT(MUTEX_HELD(&vmc->vmc_lock)); 1349 1350 bool was_unmapped = false; 1351 1352 list_remove(&vmc->vmc_held_pages, vmp); 1353 if (vmp->vmp_obj_ref != NULL) { 1354 ASSERT3P(vmp->vmp_ptep, ==, NULL); 1355 1356 vm_object_release(vmp->vmp_obj_ref); 1357 was_unmapped = true; 1358 } else { 1359 ASSERT3P(vmp->vmp_ptep, !=, NULL); 1360 1361 /* 1362 * Track appropriate (accessed/dirty) bits for the guest-virtual 1363 * address corresponding to this page, if it is from the vmspace 1364 * rather than a direct reference to an underlying object. 1365 * 1366 * The protection and/or configured flags may obviate the need 1367 * for such an update. 1368 */ 1369 if ((vmp->vmp_prot & PROT_WRITE) != 0 && 1370 (vmp->vmp_flags & VPF_DEFER_DIRTY) == 0 && 1371 vmc->vmc_track_dirty) { 1372 vmm_gpt_t *gpt = vmc->vmc_space->vms_gpt; 1373 (void) vmm_gpt_reset_dirty(gpt, vmp->vmp_ptep, true); 1374 } 1375 } 1376 kmem_free(vmp, sizeof (*vmp)); 1377 return (was_unmapped); 1378 } 1379 1380 /* 1381 * Release held page. Returns true if page resided on region which was 1382 * subsequently unmapped. 1383 */ 1384 bool 1385 vmp_release(vm_page_t *vmp) 1386 { 1387 vm_client_t *vmc = vmp->vmp_client; 1388 1389 VERIFY(vmc != NULL); 1390 1391 mutex_enter(&vmc->vmc_lock); 1392 const bool was_unmapped = vmp_release_inner(vmp, vmc); 1393 mutex_exit(&vmc->vmc_lock); 1394 return (was_unmapped); 1395 } 1396 1397 /* 1398 * Release a chain of pages which were associated via vmp_chain() (setting 1399 * page-chaining pointer). Returns true if any pages resided upon a region 1400 * which was subsequently unmapped. 1401 * 1402 * All of those pages must have been held through the same vm_client_t. 1403 */ 1404 bool 1405 vmp_release_chain(vm_page_t *vmp) 1406 { 1407 vm_client_t *vmc = vmp->vmp_client; 1408 bool any_unmapped = false; 1409 1410 ASSERT(vmp != NULL); 1411 1412 mutex_enter(&vmc->vmc_lock); 1413 while (vmp != NULL) { 1414 vm_page_t *next = vmp->vmp_chain; 1415 1416 /* We expect all pages in chain to be from same client */ 1417 ASSERT3P(vmp->vmp_client, ==, vmc); 1418 1419 if (vmp_release_inner(vmp, vmc)) { 1420 any_unmapped = true; 1421 } 1422 vmp = next; 1423 } 1424 mutex_exit(&vmc->vmc_lock); 1425 return (any_unmapped); 1426 } 1427 1428 1429 int 1430 vm_segmap_obj(struct vm *vm, int segid, off_t segoff, off_t len, 1431 struct as *as, caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags) 1432 { 1433 vm_object_t *vmo; 1434 int err; 1435 1436 if (segoff < 0 || len <= 0 || 1437 (segoff & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0) { 1438 return (EINVAL); 1439 } 1440 if ((prot & PROT_USER) == 0) { 1441 return (ENOTSUP); 1442 } 1443 err = vm_get_memseg(vm, segid, NULL, NULL, &vmo); 1444 if (err != 0) { 1445 return (err); 1446 } 1447 1448 VERIFY(segoff >= 0); 1449 VERIFY(len <= vmo->vmo_size); 1450 VERIFY((len + segoff) <= vmo->vmo_size); 1451 1452 if (vmo->vmo_type != VMOT_MEM) { 1453 /* Only support memory objects for now */ 1454 return (ENOTSUP); 1455 } 1456 1457 as_rangelock(as); 1458 1459 err = choose_addr(as, addrp, (size_t)len, 0, ADDR_VACALIGN, flags); 1460 if (err == 0) { 1461 segvmm_crargs_t svma; 1462 1463 svma.prot = prot; 1464 svma.offset = segoff; 1465 svma.vmo = vmo; 1466 svma.vmc = NULL; 1467 1468 err = as_map(as, *addrp, (size_t)len, segvmm_create, &svma); 1469 } 1470 1471 as_rangeunlock(as); 1472 return (err); 1473 } 1474 1475 int 1476 vm_segmap_space(struct vm *vm, off_t off, struct as *as, caddr_t *addrp, 1477 off_t len, uint_t prot, uint_t maxprot, uint_t flags) 1478 { 1479 1480 const uintptr_t gpa = (uintptr_t)off; 1481 const size_t size = (uintptr_t)len; 1482 int err; 1483 1484 if (off < 0 || len <= 0 || 1485 (gpa & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) { 1486 return (EINVAL); 1487 } 1488 if ((prot & PROT_USER) == 0) { 1489 return (ENOTSUP); 1490 } 1491 1492 as_rangelock(as); 1493 1494 err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags); 1495 if (err == 0) { 1496 segvmm_crargs_t svma; 1497 1498 svma.prot = prot; 1499 svma.offset = gpa; 1500 svma.vmo = NULL; 1501 svma.vmc = vmspace_client_alloc(vm_get_vmspace(vm)); 1502 1503 err = as_map(as, *addrp, len, segvmm_create, &svma); 1504 } 1505 1506 as_rangeunlock(as); 1507 return (err); 1508 } 1509