1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/modctl.h> 27 #include <sys/types.h> 28 #include <sys/archsystm.h> 29 #include <sys/machsystm.h> 30 #include <sys/sunndi.h> 31 #include <sys/sunddi.h> 32 #include <sys/ddi_subrdefs.h> 33 #include <sys/xpv_support.h> 34 #include <sys/xen_errno.h> 35 #include <sys/hypervisor.h> 36 #include <sys/gnttab.h> 37 #include <sys/xenbus_comms.h> 38 #include <sys/xenbus_impl.h> 39 #include <xen/sys/xendev.h> 40 #include <sys/sysmacros.h> 41 #include <sys/x86_archext.h> 42 #include <sys/mman.h> 43 #include <sys/stat.h> 44 #include <sys/conf.h> 45 #include <sys/devops.h> 46 #include <sys/pc_mmu.h> 47 #include <sys/cmn_err.h> 48 #include <sys/cpr.h> 49 #include <sys/ddi.h> 50 #include <vm/seg_kmem.h> 51 #include <vm/as.h> 52 #include <vm/hat_pte.h> 53 #include <vm/hat_i86.h> 54 55 #define XPV_MINOR 0 56 #define XPV_BUFSIZE 128 57 58 /* 59 * This structure is ordinarily constructed by Xen. In the HVM world, we 60 * manually fill in the few fields the PV drivers need. 
61 */ 62 start_info_t *xen_info = NULL; 63 64 /* Xen version number. */ 65 int xen_major, xen_minor; 66 67 /* Metadata page shared between domain and Xen */ 68 shared_info_t *HYPERVISOR_shared_info = NULL; 69 70 /* Page containing code to issue hypercalls. */ 71 extern caddr_t hypercall_page; 72 73 /* Is the hypervisor 64-bit? */ 74 int xen_is_64bit = -1; 75 76 /* virtual addr for the store_mfn page */ 77 caddr_t xb_addr; 78 79 dev_info_t *xpv_dip; 80 static dev_info_t *xpvd_dip; 81 82 /* saved pfn of the shared info page */ 83 static pfn_t shared_info_frame; 84 85 #ifdef DEBUG 86 int xen_suspend_debug; 87 88 #define SUSPEND_DEBUG if (xen_suspend_debug) xen_printf 89 #else 90 #define SUSPEND_DEBUG(...) 91 #endif 92 93 /* 94 * Forward declarations 95 */ 96 static int xpv_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); 97 static int xpv_attach(dev_info_t *, ddi_attach_cmd_t); 98 static int xpv_detach(dev_info_t *, ddi_detach_cmd_t); 99 static int xpv_open(dev_t *, int, int, cred_t *); 100 static int xpv_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); 101 102 static struct cb_ops xpv_cb_ops = { 103 xpv_open, 104 nulldev, /* close */ 105 nodev, /* strategy */ 106 nodev, /* print */ 107 nodev, /* dump */ 108 nodev, /* read */ 109 nodev, /* write */ 110 xpv_ioctl, /* ioctl */ 111 nodev, /* devmap */ 112 nodev, /* mmap */ 113 nodev, /* segmap */ 114 nochpoll, /* poll */ 115 ddi_prop_op, 116 NULL, 117 D_MP, 118 CB_REV, 119 NULL, 120 NULL 121 }; 122 123 static struct dev_ops xpv_dv_ops = { 124 DEVO_REV, 125 0, 126 xpv_getinfo, 127 nulldev, /* identify */ 128 nulldev, /* probe */ 129 xpv_attach, 130 xpv_detach, 131 nodev, /* reset */ 132 &xpv_cb_ops, 133 NULL, /* struct bus_ops */ 134 NULL, /* power */ 135 ddi_quiesce_not_supported, /* devo_quiesce */ 136 }; 137 138 static struct modldrv modldrv = { 139 &mod_driverops, 140 "xpv driver", 141 &xpv_dv_ops 142 }; 143 144 static struct modlinkage modl = { 145 MODREV_1, 146 { 147 (void *)&modldrv, 148 NULL /* null 
termination */ 149 } 150 }; 151 152 static ddi_dma_attr_t xpv_dma_attr = { 153 DMA_ATTR_V0, /* version of this structure */ 154 0, /* lowest usable address */ 155 0xffffffffffffffffULL, /* highest usable address */ 156 0x7fffffff, /* maximum DMAable byte count */ 157 MMU_PAGESIZE, /* alignment in bytes */ 158 0x7ff, /* bitmap of burst sizes */ 159 1, /* minimum transfer */ 160 0xffffffffU, /* maximum transfer */ 161 0x7fffffffULL, /* maximum segment length */ 162 1, /* maximum number of segments */ 163 1, /* granularity */ 164 0, /* flags (reserved) */ 165 }; 166 167 static ddi_device_acc_attr_t xpv_accattr = { 168 DDI_DEVICE_ATTR_V0, 169 DDI_NEVERSWAP_ACC, 170 DDI_STRICTORDER_ACC 171 }; 172 173 #define MAX_ALLOCATIONS 10 174 static ddi_dma_handle_t xpv_dma_handle[MAX_ALLOCATIONS]; 175 static ddi_acc_handle_t xpv_dma_acchandle[MAX_ALLOCATIONS]; 176 static int xen_alloc_cnt = 0; 177 178 void * 179 xen_alloc_pages(pgcnt_t cnt) 180 { 181 size_t len; 182 int a = xen_alloc_cnt++; 183 caddr_t addr; 184 185 ASSERT(xen_alloc_cnt < MAX_ALLOCATIONS); 186 if (ddi_dma_alloc_handle(xpv_dip, &xpv_dma_attr, DDI_DMA_SLEEP, 0, 187 &xpv_dma_handle[a]) != DDI_SUCCESS) 188 return (NULL); 189 190 if (ddi_dma_mem_alloc(xpv_dma_handle[a], MMU_PAGESIZE * cnt, 191 &xpv_accattr, DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, 0, 192 &addr, &len, &xpv_dma_acchandle[a]) != DDI_SUCCESS) { 193 ddi_dma_free_handle(&xpv_dma_handle[a]); 194 cmn_err(CE_WARN, "Couldn't allocate memory for xpv devices"); 195 return (NULL); 196 } 197 return (addr); 198 } 199 200 /* 201 * This function is invoked twice, first time with reprogram=0 to set up 202 * the xpvd portion of the device tree. The second time it is ignored. 
203 */ 204 static void 205 xpv_enumerate(int reprogram) 206 { 207 dev_info_t *dip; 208 209 if (reprogram != 0) 210 return; 211 212 ndi_devi_alloc_sleep(ddi_root_node(), "xpvd", 213 (pnode_t)DEVI_SID_NODEID, &dip); 214 215 (void) ndi_devi_bind_driver(dip, 0); 216 217 /* 218 * Too early to enumerate split device drivers in domU 219 * since we need to create taskq thread during enumeration. 220 * So, we only enumerate softdevs and console here. 221 */ 222 xendev_enum_all(dip, B_TRUE); 223 } 224 225 /* 226 * Translate a hypervisor errcode to a Solaris error code. 227 */ 228 int 229 xen_xlate_errcode(int error) 230 { 231 #define CASE(num) case X_##num: error = num; break 232 233 switch (-error) { 234 CASE(EPERM); CASE(ENOENT); CASE(ESRCH); 235 CASE(EINTR); CASE(EIO); CASE(ENXIO); 236 CASE(E2BIG); CASE(ENOMEM); CASE(EACCES); 237 CASE(EFAULT); CASE(EBUSY); CASE(EEXIST); 238 CASE(ENODEV); CASE(EISDIR); CASE(EINVAL); 239 CASE(ENOSPC); CASE(ESPIPE); CASE(EROFS); 240 CASE(ENOSYS); CASE(ENOTEMPTY); CASE(EISCONN); 241 CASE(ENODATA); 242 default: 243 panic("xen_xlate_errcode: unknown error %d", error); 244 } 245 return (error); 246 #undef CASE 247 } 248 249 /*PRINTFLIKE1*/ 250 void 251 xen_printf(const char *fmt, ...) 252 { 253 va_list adx; 254 255 va_start(adx, fmt); 256 printf(fmt, adx); 257 va_end(adx); 258 } 259 260 /* 261 * Stub functions to get the FE drivers to build, and to catch drivers that 262 * misbehave in HVM domains. 
 */
/*ARGSUSED*/
void
xen_release_pfn(pfn_t pfn, caddr_t va)
{
	panic("xen_release_pfn() is not supported in HVM domains");
}

/*ARGSUSED*/
void
reassign_pfn(pfn_t pfn, mfn_t mfn)
{
	panic("reassign_pfn() is not supported in HVM domains");
}

/*ARGSUSED*/
long
balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns)
{
	panic("balloon_free_pages() is not supported in HVM domains");
	return (0);
}

/*ARGSUSED*/
void
balloon_drv_added(int64_t delta)
{
	panic("balloon_drv_added() is not supported in HVM domains");
}

/*
 * Add a mapping for the machine page at the given virtual address.
 * Only level-0 (base page size) mappings are supported here.
 */
void
kbm_map_ma(maddr_t ma, uintptr_t va, uint_t level)
{
	ASSERT(level == 0);

	hat_devload(kas.a_hat, (caddr_t)va, MMU_PAGESIZE,
	    mmu_btop(ma), PROT_READ | PROT_WRITE, HAT_LOAD);
}

/*
 * Fetch an HVM parameter (HVM_PARAM_*) for this domain via the
 * HVMOP_get_param hypercall.  Returns the parameter value, or
 * (uint64_t)-1 on hypercall failure; note callers cannot distinguish
 * a failure from a legitimate value of all-ones.
 */
static uint64_t
hvm_get_param(int param_id)
{
	struct xen_hvm_param xhp;

	xhp.domid = DOMID_SELF;
	xhp.index = param_id;
	if ((HYPERVISOR_hvm_op(HVMOP_get_param, &xhp) < 0))
		return (-1);
	return (xhp.value);
}

/*
 * Map granted pages by passing the request straight through to the
 * hypervisor's grant-table operation.  Only GNTTABOP_map_grant_ref is
 * supported; the uvaddr flag is unused in HVM domains.
 */
/*ARGSUSED*/
int
xen_map_gref(uint_t cmd, gnttab_map_grant_ref_t *mapop, uint_t count,
    boolean_t uvaddr)
{
	long rc;

	ASSERT(cmd == GNTTABOP_map_grant_ref);
	rc = HYPERVISOR_grant_table_op(cmd, mapop, count);

	return (rc);
}

/* Watch on the xenstore "control/shutdown" node, and the taskq that runs
 * the resulting shutdown/suspend work. */
static struct xenbus_watch shutdown_watch;
taskq_t *xen_shutdown_tq;

#define	SHUTDOWN_INVALID	-1
#define	SHUTDOWN_POWEROFF	0
#define	SHUTDOWN_REBOOT		1
#define	SHUTDOWN_SUSPEND	2
#define	SHUTDOWN_HALT		3
#define	SHUTDOWN_MAX		4

/* How long to wait for a graceful (init-driven) shutdown before forcing it */
#define	SHUTDOWN_TIMEOUT_SECS	(60 * 5)

/*
 * Recursively suspend (DDI_SUSPEND-detach) the device subtree rooted at
 * dip, depth-first: children are suspended before their parent.  Marks
 * each successfully suspended node with DCF_CPR_SUSPENDED so that
 * xen_resume_devices() knows what to resume.  Returns 0 on success or
 * ENXIO as soon as any device fails to suspend.
 */
int
xen_suspend_devices(dev_info_t *dip)
{
	int error;
	char buf[XPV_BUFSIZE];

	SUSPEND_DEBUG("xen_suspend_devices\n");

	for (; dip != NULL; dip = ddi_get_next_sibling(dip)) {
		if (xen_suspend_devices(ddi_get_child(dip)))
			return (ENXIO);
		if (ddi_get_driver(dip) == NULL)
			continue;
		SUSPEND_DEBUG("Suspending device %s\n", ddi_deviname(dip, buf));
		ASSERT((DEVI(dip)->devi_cpr_flags & DCF_CPR_SUSPENDED) == 0);


		if (!i_ddi_devi_attached(dip)) {
			error = DDI_FAILURE;
		} else {
			error = devi_detach(dip, DDI_SUSPEND);
		}

		if (error == DDI_SUCCESS) {
			DEVI(dip)->devi_cpr_flags |= DCF_CPR_SUSPENDED;
		} else {
			SUSPEND_DEBUG("WARNING: Unable to suspend device %s\n",
			    ddi_deviname(dip, buf));
			cmn_err(CE_WARN, "Unable to suspend device %s.",
			    ddi_deviname(dip, buf));
			cmn_err(CE_WARN, "Device is busy or does not "
			    "support suspend/resume.");
			return (ENXIO);
		}
	}
	return (0);
}

/*
 * Resume (DDI_RESUME-attach) the device subtree rooted at start, walking
 * siblings in reverse order (the inner while finds the last unvisited
 * sibling) so parents are resumed before children were suspended.  Only
 * nodes marked DCF_CPR_SUSPENDED are resumed; the flag is always cleared.
 * resume_failed carries an error from a previous level; once any resume
 * fails the remainder of the tree is traversed only to clear flags.
 * Returns 0 or ENXIO.
 */
int
xen_resume_devices(dev_info_t *start, int resume_failed)
{
	dev_info_t *dip, *next, *last = NULL;
	int did_suspend;
	int error = resume_failed;
	char buf[XPV_BUFSIZE];

	SUSPEND_DEBUG("xen_resume_devices\n");

	while (last != start) {
		dip = start;
		next = ddi_get_next_sibling(dip);
		while (next != last) {
			dip = next;
			next = ddi_get_next_sibling(dip);
		}

		/*
		 * cpr is the only one that uses this field and the device
		 * itself hasn't resumed yet, there is no need to use a
		 * lock, even though kernel threads are active by now.
		 */
		did_suspend = DEVI(dip)->devi_cpr_flags & DCF_CPR_SUSPENDED;
		if (did_suspend)
			DEVI(dip)->devi_cpr_flags &= ~DCF_CPR_SUSPENDED;

		/*
		 * There may be background attaches happening on devices
		 * that were not originally suspended by cpr, so resume
		 * only devices that were suspended by cpr. Also, stop
		 * resuming after the first resume failure, but traverse
		 * the entire tree to clear the suspend flag.
		 */
		if (did_suspend && !error) {
			SUSPEND_DEBUG("Resuming device %s\n",
			    ddi_deviname(dip, buf));
			/*
			 * If a device suspended by cpr gets detached during
			 * the resume process (for example, due to hotplugging)
			 * before cpr gets around to issuing it a DDI_RESUME,
			 * we'll have problems.
			 */
			if (!i_ddi_devi_attached(dip)) {
				cmn_err(CE_WARN, "Skipping %s, device "
				    "not ready for resume",
				    ddi_deviname(dip, buf));
			} else {
				if (devi_attach(dip, DDI_RESUME) !=
				    DDI_SUCCESS) {
					error = ENXIO;
				}
			}
		}

		if (error == ENXIO) {
			cmn_err(CE_WARN, "Unable to resume device %s",
			    ddi_deviname(dip, buf));
		}

		error = xen_resume_devices(ddi_get_child(dip), error);
		last = dip;
	}

	return (error);
}

/*
 * ddi_walk_devs() callback: remember the "xpvd" nexus node in xpvd_dip
 * and stop the walk once it is found.
 */
/*ARGSUSED*/
static int
check_xpvd(dev_info_t *dip, void *arg)
{
	char *name;

	name = ddi_node_name(dip);
	if (name == NULL || strcmp(name, "xpvd")) {
		return (DDI_WALK_CONTINUE);
	} else {
		xpvd_dip = dip;
		return (DDI_WALK_TERMINATE);
	}
}

/*
 * Top level routine to direct suspend/resume of a domain.
 */
void
xen_suspend_domain(void)
{
	extern void rtcsync(void);
	extern void ec_resume(void);
	extern kmutex_t ec_lock;
	struct xen_add_to_physmap xatp;
	ulong_t flags;
	int err;

	cmn_err(CE_NOTE, "Domain suspending for save/migrate");

	SUSPEND_DEBUG("xen_suspend_domain\n");

	/*
	 * We only want to suspend the PV devices, since the emulated devices
	 * are suspended by saving the emulated device state. The PV devices
	 * are all children of the xpvd nexus device. So we search the
	 * device tree for the xpvd node to use as the root of the tree to
	 * be suspended.
	 */
	if (xpvd_dip == NULL)
		ddi_walk_devs(ddi_root_node(), check_xpvd, NULL);

	/*
	 * suspend interrupts and devices
	 */
	if (xpvd_dip != NULL)
		(void) xen_suspend_devices(ddi_get_child(xpvd_dip));
	else
		cmn_err(CE_WARN, "No PV devices found to suspend");
	SUSPEND_DEBUG("xenbus_suspend\n");
	xenbus_suspend();

	mutex_enter(&cpu_lock);

	/*
	 * Suspend on vcpu 0
	 */
	thread_affinity_set(curthread, 0);
	kpreempt_disable();

	if (ncpus > 1)
		pause_cpus(NULL);
	/*
	 * We can grab the ec_lock as it's a spinlock with a high SPL. Hence
	 * any holder would have dropped it to get through pause_cpus().
	 */
	mutex_enter(&ec_lock);

	/*
	 * From here on in, we can't take locks.
	 */

	flags = intr_clear();

	SUSPEND_DEBUG("HYPERVISOR_suspend\n");
	/*
	 * At this point we suspend and sometime later resume.
	 * Note that this call may return with an indication of a cancelled
	 * suspend; for now, no matter what the return value is, we do a
	 * full resume of all the suspended drivers, etc.
	 */
	(void) HYPERVISOR_shutdown(SHUTDOWN_suspend);

	/*
	 * Point HYPERVISOR_shared_info to the proper place.
	 */
	xatp.domid = DOMID_SELF;
	xatp.idx = 0;
	xatp.space = XENMAPSPACE_shared_info;
	xatp.gpfn = shared_info_frame;
	if ((err = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) != 0)
		panic("Could not set shared_info page. error: %d", err);

	SUSPEND_DEBUG("gnttab_resume\n");
	gnttab_resume();

	SUSPEND_DEBUG("ec_resume\n");
	ec_resume();

	intr_restore(flags);

	if (ncpus > 1)
		start_cpus();

	mutex_exit(&ec_lock);
	mutex_exit(&cpu_lock);

	/*
	 * Now we can take locks again.
	 */

	rtcsync();

	SUSPEND_DEBUG("xenbus_resume\n");
	xenbus_resume();
	SUSPEND_DEBUG("xen_resume_devices\n");
	if (xpvd_dip != NULL)
		(void) xen_resume_devices(ddi_get_child(xpvd_dip), 0);

	thread_affinity_clear(curthread);
	kpreempt_enable();

	SUSPEND_DEBUG("finished xen_suspend_domain\n");

	cmn_err(CE_NOTE, "domain restore/migrate completed");
}

/*
 * timeout() handler armed by xen_shutdown(): if the graceful, init-driven
 * shutdown has not completed within SHUTDOWN_TIMEOUT_SECS, force the
 * requested action via kadmin().
 */
static void
xen_dirty_shutdown(void *arg)
{
	int cmd = (uintptr_t)arg;

	cmn_err(CE_WARN, "Externally requested shutdown failed or "
	    "timed out.\nShutting down.\n");

	switch (cmd) {
	case SHUTDOWN_HALT:
	case SHUTDOWN_POWEROFF:
		(void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred);
		break;
	case SHUTDOWN_REBOOT:
		(void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred);
		break;
	}
}

/*
 * Taskq callback that carries out an externally requested shutdown,
 * reboot or suspend (cmd is one of the SHUTDOWN_* codes, passed as the
 * arg).  Suspend is handled synchronously; the other actions are
 * delegated to init(1) via SIGPWR, with xen_dirty_shutdown() as a
 * timed-out fallback.
 */
static void
xen_shutdown(void *arg)
{
	int cmd = (uintptr_t)arg;
	proc_t *initpp;

	ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX);

	if (cmd == SHUTDOWN_SUSPEND) {
		xen_suspend_domain();
		return;
	}

	switch (cmd) {
	case SHUTDOWN_POWEROFF:
		force_shutdown_method = AD_POWEROFF;
		break;
	case SHUTDOWN_HALT:
		force_shutdown_method = AD_HALT;
		break;
	case SHUTDOWN_REBOOT:
		force_shutdown_method = AD_BOOT;
		break;
	}


	/*
	 * If we're still booting and init(1) isn't set up yet, simply halt.
	 */
	mutex_enter(&pidlock);
	initpp = prfind(P_INITPID);
	mutex_exit(&pidlock);
	if (initpp == NULL) {
		extern void halt(char *);
		halt("Power off the System");	/* just in case */
	}

	/*
	 * else, graceful shutdown with inittab and all getting involved
	 */
	psignal(initpp, SIGPWR);

	(void) timeout(xen_dirty_shutdown, arg,
	    SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC));
}

/*
 * xenbus watch callback on "control/shutdown".  Reads the request string
 * inside a xenbus transaction, maps it to a SHUTDOWN_* code, clears the
 * node (so dom0 sees the request was accepted), and dispatches
 * xen_shutdown() on the shutdown taskq.  An empty string is the watch
 * firing for our own clearing write and is ignored.  The transaction is
 * retried from scratch on EAGAIN.
 */
/*ARGSUSED*/
static void
xen_shutdown_handler(struct xenbus_watch *watch, const char **vec,
	unsigned int len)
{
	char *str;
	xenbus_transaction_t xbt;
	int err, shutdown_code = SHUTDOWN_INVALID;
	unsigned int slen;

again:
	err = xenbus_transaction_start(&xbt);
	if (err)
		return;
	if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) {
		(void) xenbus_transaction_end(xbt, 1);
		return;
	}

	SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU->cpu_id, str);

	/*
	 * If this is a watch fired from our write below, check out early to
	 * avoid an infinite loop.
	 */
	if (strcmp(str, "") == 0) {
		(void) xenbus_transaction_end(xbt, 0);
		kmem_free(str, slen);
		return;
	} else if (strcmp(str, "poweroff") == 0) {
		shutdown_code = SHUTDOWN_POWEROFF;
	} else if (strcmp(str, "reboot") == 0) {
		shutdown_code = SHUTDOWN_REBOOT;
	} else if (strcmp(str, "suspend") == 0) {
		shutdown_code = SHUTDOWN_SUSPEND;
	} else if (strcmp(str, "halt") == 0) {
		shutdown_code = SHUTDOWN_HALT;
	} else {
		printf("Ignoring shutdown request: %s\n", str);
	}

	(void) xenbus_write(xbt, "control", "shutdown", "");
	err = xenbus_transaction_end(xbt, 0);
	if (err == EAGAIN) {
		SUSPEND_DEBUG("%d: trying again\n", CPU->cpu_id);
		kmem_free(str, slen);
		goto again;
	}

	kmem_free(str, slen);
	if (shutdown_code != SHUTDOWN_INVALID) {
		(void) taskq_dispatch(xen_shutdown_tq, xen_shutdown,
		    (void *)(intptr_t)shutdown_code, 0);
	}
}

/*
 * One-time PV-on-HVM initialization, called from xpv_attach():
 * verify we are running on Xen (via the pseudo-cpuid leaves), check the
 * hypervisor version, install the hypercall page, fill in xen_info,
 * register the shared_info page, and bring up grant tables, event
 * channels, xenbus and the shutdown watch.  Returns 0 on success, -1 on
 * any failure.
 */
static int
xen_pv_init(dev_info_t *xpv_dip)
{
	struct cpuid_regs cp;
	uint32_t xen_signature[4];
	char *xen_str;
	struct xen_add_to_physmap xatp;
	xen_capabilities_info_t caps;
	pfn_t pfn;
	uint64_t msrval;
	int err;

	/*
	 * Xen's pseudo-cpuid function 0x40000000 returns a string
	 * representing the Xen signature in %ebx, %ecx, and %edx.
	 * %eax contains the maximum supported cpuid function.
	 */
	cp.cp_eax = 0x40000000;
	(void) __cpuid_insn(&cp);
	xen_signature[0] = cp.cp_ebx;
	xen_signature[1] = cp.cp_ecx;
	xen_signature[2] = cp.cp_edx;
	xen_signature[3] = 0;
	xen_str = (char *)xen_signature;
	if (strcmp("XenVMMXenVMM", xen_str) != 0 ||
	    cp.cp_eax < 0x40000002) {
		cmn_err(CE_WARN,
		    "Attempting to load Xen drivers on non-Xen system");
		return (-1);
	}

	/*
	 * cpuid function 0x40000001 returns the Xen version in %eax. The
	 * top 16 bits are the major version, the bottom 16 are the minor
	 * version.
	 */
	cp.cp_eax = 0x40000001;
	(void) __cpuid_insn(&cp);
	xen_major = cp.cp_eax >> 16;
	xen_minor = cp.cp_eax & 0xffff;

	/*
	 * The xpv driver is incompatible with xen versions older than 3.1. This
	 * is due to the changes in the vcpu_info and shared_info structs used
	 * to communicate with the hypervisor (the event channels in particular)
	 * that were introduced with 3.1.
	 */
	if (xen_major < 3 || (xen_major == 3 && xen_minor < 1)) {
		cmn_err(CE_WARN, "Xen version %d.%d is not supported",
		    xen_major, xen_minor);
		return (-1);
	}

	/*
	 * cpuid function 0x40000002 returns information about the
	 * hypercall page. %eax nominally contains the number of pages
	 * with hypercall code, but according to the Xen guys, "I'll
	 * guarantee that remains one forever more, so you can just
	 * allocate a single page and get quite upset if you ever see CPUID
	 * return more than one page." %ebx contains an MSR we use to ask
	 * Xen to remap each page at a specific pfn.
	 */
	cp.cp_eax = 0x40000002;
	(void) __cpuid_insn(&cp);

	/*
	 * Let Xen know where we want the hypercall page mapped. We
	 * already have a page allocated in the .text section to simplify
	 * the wrapper code.
	 */
	pfn = hat_getpfnum(kas.a_hat, (caddr_t)&hypercall_page);
	msrval = mmu_ptob(pfn);
	wrmsr(cp.cp_ebx, msrval);

	/* Fill in the xen_info data */
	xen_info = kmem_zalloc(sizeof (start_info_t), KM_SLEEP);
	(void) sprintf(xen_info->magic, "xen-%d.%d", xen_major, xen_minor);
	xen_info->store_mfn = (mfn_t)hvm_get_param(HVM_PARAM_STORE_PFN);
	xen_info->store_evtchn = (int)hvm_get_param(HVM_PARAM_STORE_EVTCHN);

	/* Figure out whether the hypervisor is 32-bit or 64-bit. */
	if ((HYPERVISOR_xen_version(XENVER_capabilities, &caps) == 0)) {
		/* Force NUL termination before scanning the string */
		((char *)(caps))[sizeof (caps) - 1] = '\0';
		if (strstr(caps, "x86_64") != NULL)
			xen_is_64bit = 1;
		else if (strstr(caps, "x86_32") != NULL)
			xen_is_64bit = 0;
	}
	if (xen_is_64bit < 0) {
		cmn_err(CE_WARN, "Couldn't get capability info from Xen.");
		return (-1);
	}
#ifdef __amd64
	ASSERT(xen_is_64bit == 1);
#endif

	/*
	 * Allocate space for the shared_info page and tell Xen where it
	 * is.
	 */
	HYPERVISOR_shared_info = xen_alloc_pages(1);
	shared_info_frame = hat_getpfnum(kas.a_hat,
	    (caddr_t)HYPERVISOR_shared_info);
	xatp.domid = DOMID_SELF;
	xatp.idx = 0;
	xatp.space = XENMAPSPACE_shared_info;
	xatp.gpfn = shared_info_frame;
	if ((err = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) != 0) {
		cmn_err(CE_WARN, "Could not get shared_info page from Xen."
		    " error: %d", err);
		return (-1);
	}

	/* Set up the grant tables.  */
	gnttab_init();

	/* Set up event channel support */
	if (ec_init(xpv_dip) != 0)
		return (-1);

	/* Set up xenbus */
	xb_addr = vmem_alloc(heap_arena, MMU_PAGESIZE, VM_SLEEP);
	xs_early_init();
	xs_domu_init();

	/* Set up for suspend/resume/migrate */
	xen_shutdown_tq = taskq_create("shutdown_taskq", 1,
	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
	shutdown_watch.node = "control/shutdown";
	shutdown_watch.callback = xen_shutdown_handler;
	if (register_xenbus_watch(&shutdown_watch))
		cmn_err(CE_WARN, "Failed to set shutdown watcher");

	return (0);
}

/*
 * Partial teardown for xpv_detach(): free xen_info and shut down event
 * channel support.  (Other resources set up by xen_pv_init() are not
 * released here.)
 */
static void
xen_pv_fini()
{
	if (xen_info != NULL)
		kmem_free(xen_info, sizeof (start_info_t));
	ec_fini();
}

/*
 * getinfo(9E) entry point: map dev_t to our dev_info / instance (the
 * driver has a single minor, XPV_MINOR, instance 0).
 */
/*ARGSUSED*/
static int
xpv_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	if (getminor((dev_t)arg) != XPV_MINOR)
		return (DDI_FAILURE);

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = xpv_dip;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = 0;
		break;
	default:
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

/*
 * attach(9E) entry point: create the minor node, run the one-time PV
 * initialization, disable the memory scrubber, and report our PV
 * protocol version to dom0.  DDI_RESUME is not supported.
 */
static int
xpv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	if (ddi_create_minor_node(dip, ddi_get_name(dip), S_IFCHR,
	    ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS)
		return (DDI_FAILURE);

	xpv_dip = dip;

	if (xen_pv_init(dip) != 0)
		return (DDI_FAILURE);

	ddi_report_dev(dip);

	/*
	 * If the memscrubber attempts to scrub the pages we hand to Xen,
	 * the domain will panic.
	 */
	memscrub_disable();

	/*
	 * Report our version to dom0.
	 */
	if (xenbus_printf(XBT_NULL, "hvmpv/xpv", "version", "%d",
	    HVMPV_XPV_VERS))
		cmn_err(CE_WARN, "xpv: couldn't write version\n");

	return (DDI_SUCCESS);
}

/*
 * Attempts to reload the PV driver plumbing hang on Intel platforms, so
 * we don't want to unload the framework by accident.
 */
int xpv_allow_detach = 0;

/*
 * detach(9E) entry point: refused unless the xpv_allow_detach tunable is
 * set (see comment above).  DDI_SUSPEND is not supported.
 */
static int
xpv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	if (cmd != DDI_DETACH || xpv_allow_detach == 0)
		return (DDI_FAILURE);

	if (xpv_dip != NULL) {
		xen_pv_fini();
		ddi_remove_minor_node(dip, NULL);
		xpv_dip = NULL;
	}

	return (DDI_SUCCESS);
}

/* open(9E): only the single XPV_MINOR minor is valid. */
/*ARGSUSED1*/
static int
xpv_open(dev_t *dev, int flag, int otyp, cred_t *cr)
{
	return (getminor(*dev) == XPV_MINOR ? 0 : ENXIO);
}

/* ioctl(9E): no ioctls are implemented. */
/*ARGSUSED*/
static int
xpv_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cr,
	int *rval_p)
{
	return (EINVAL);
}

int
_init(void)
{
	int err;

	if ((err = mod_install(&modl)) != 0)
		return (err);

	/* Hook device tree (re)configuration so xpv_enumerate() runs. */
	impl_bus_add_probe(xpv_enumerate);
	return (0);
}

int
_fini(void)
{
	int err;

	if ((err = mod_remove(&modl)) != 0)
		return (err);

	impl_bus_delete_probe(xpv_enumerate);
	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modl, modinfop));
}