xref: /illumos-gate/usr/src/uts/i86pc/i86hvm/io/xpv/xpv_support.c (revision 257873cfc1dd3337766407f80397db60a56f2f5a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/modctl.h>
27 #include <sys/types.h>
28 #include <sys/archsystm.h>
29 #include <sys/machsystm.h>
30 #include <sys/sunndi.h>
31 #include <sys/sunddi.h>
32 #include <sys/ddi_subrdefs.h>
33 #include <sys/xpv_support.h>
34 #include <sys/xen_errno.h>
35 #include <sys/hypervisor.h>
36 #include <sys/gnttab.h>
37 #include <sys/xenbus_comms.h>
38 #include <sys/xenbus_impl.h>
39 #include <xen/sys/xendev.h>
40 #include <sys/sysmacros.h>
41 #include <sys/x86_archext.h>
42 #include <sys/mman.h>
43 #include <sys/stat.h>
44 #include <sys/conf.h>
45 #include <sys/devops.h>
46 #include <sys/pc_mmu.h>
47 #include <sys/cmn_err.h>
48 #include <sys/cpr.h>
49 #include <sys/ddi.h>
50 #include <vm/seg_kmem.h>
51 #include <vm/as.h>
52 #include <vm/hat_pte.h>
53 #include <vm/hat_i86.h>
54 
55 #define	XPV_MINOR 0
56 #define	XPV_BUFSIZE 128
57 
58 /*
59  * This structure is ordinarily constructed by Xen. In the HVM world, we
60  * manually fill in the few fields the PV drivers need.
61  */
62 start_info_t *xen_info = NULL;
63 
64 /* Xen version number. */
65 int xen_major, xen_minor;
66 
67 /* Metadata page shared between domain and Xen */
68 shared_info_t *HYPERVISOR_shared_info = NULL;
69 
70 /* Page containing code to issue hypercalls.  */
71 extern caddr_t hypercall_page;
72 
73 /* Is the hypervisor 64-bit? */
74 int xen_is_64bit = -1;
75 
76 /* virtual addr for the store_mfn page */
77 caddr_t xb_addr;
78 
79 dev_info_t *xpv_dip;
80 static dev_info_t *xpvd_dip;
81 
82 /* saved pfn of the shared info page */
83 static pfn_t shared_info_frame;
84 
85 #ifdef DEBUG
86 int xen_suspend_debug;
87 
88 #define	SUSPEND_DEBUG if (xen_suspend_debug) xen_printf
89 #else
90 #define	SUSPEND_DEBUG(...)
91 #endif
92 
93 /*
94  * Forward declarations
95  */
96 static int xpv_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
97 static int xpv_attach(dev_info_t *, ddi_attach_cmd_t);
98 static int xpv_detach(dev_info_t *, ddi_detach_cmd_t);
99 static int xpv_open(dev_t *, int, int, cred_t *);
100 static int xpv_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
101 
102 static struct cb_ops xpv_cb_ops = {
103 	xpv_open,
104 	nulldev,	/* close */
105 	nodev,		/* strategy */
106 	nodev,		/* print */
107 	nodev,		/* dump */
108 	nodev,		/* read */
109 	nodev,		/* write */
110 	xpv_ioctl,	/* ioctl */
111 	nodev,		/* devmap */
112 	nodev,		/* mmap */
113 	nodev,		/* segmap */
114 	nochpoll,	/* poll */
115 	ddi_prop_op,
116 	NULL,
117 	D_MP,
118 	CB_REV,
119 	NULL,
120 	NULL
121 };
122 
123 static struct dev_ops xpv_dv_ops = {
124 	DEVO_REV,
125 	0,
126 	xpv_getinfo,
127 	nulldev,	/* identify */
128 	nulldev,	/* probe */
129 	xpv_attach,
130 	xpv_detach,
131 	nodev,		/* reset */
132 	&xpv_cb_ops,
133 	NULL,		/* struct bus_ops */
134 	NULL,		/* power */
135 	ddi_quiesce_not_supported,	/* devo_quiesce */
136 };
137 
138 static struct modldrv modldrv = {
139 	&mod_driverops,
140 	"xpv driver",
141 	&xpv_dv_ops
142 };
143 
144 static struct modlinkage modl = {
145 	MODREV_1,
146 	{
147 		(void *)&modldrv,
148 		NULL		/* null termination */
149 	}
150 };
151 
152 static ddi_dma_attr_t xpv_dma_attr = {
153 	DMA_ATTR_V0,		/* version of this structure */
154 	0,			/* lowest usable address */
155 	0xffffffffffffffffULL,	/* highest usable address */
156 	0x7fffffff,		/* maximum DMAable byte count */
157 	MMU_PAGESIZE,		/* alignment in bytes */
158 	0x7ff,			/* bitmap of burst sizes */
159 	1,			/* minimum transfer */
160 	0xffffffffU,		/* maximum transfer */
161 	0x7fffffffULL,		/* maximum segment length */
162 	1,			/* maximum number of segments */
163 	1,			/* granularity */
164 	0,			/* flags (reserved) */
165 };
166 
167 static ddi_device_acc_attr_t xpv_accattr = {
168 	DDI_DEVICE_ATTR_V0,
169 	DDI_NEVERSWAP_ACC,
170 	DDI_STRICTORDER_ACC
171 };
172 
173 #define	MAX_ALLOCATIONS 10
174 static ddi_dma_handle_t xpv_dma_handle[MAX_ALLOCATIONS];
175 static ddi_acc_handle_t xpv_dma_acchandle[MAX_ALLOCATIONS];
176 static int xen_alloc_cnt = 0;
177 
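/*
 * Allocate cnt pages of page-aligned, DMA-able memory to share with the
 * hypervisor (for example, the shared_info page set up in xen_pv_init()).
 * The DMA and access handles are kept in xpv_dma_handle[] and
 * xpv_dma_acchandle[] for the life of the driver; there is no corresponding
 * free routine.
 */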
178 void *
179 xen_alloc_pages(pgcnt_t cnt)
180 {
181 	size_t len;
182 	int a = xen_alloc_cnt++;
183 	caddr_t addr;
184 
185 	ASSERT(a < MAX_ALLOCATIONS);
186 	if (ddi_dma_alloc_handle(xpv_dip, &xpv_dma_attr, DDI_DMA_SLEEP, 0,
187 	    &xpv_dma_handle[a]) != DDI_SUCCESS)
188 		return (NULL);
189 
190 	if (ddi_dma_mem_alloc(xpv_dma_handle[a], MMU_PAGESIZE * cnt,
191 	    &xpv_accattr, DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, 0,
192 	    &addr, &len, &xpv_dma_acchandle[a]) != DDI_SUCCESS) {
193 		ddi_dma_free_handle(&xpv_dma_handle[a]);
194 		cmn_err(CE_WARN, "Couldn't allocate memory for xpv devices");
195 		return (NULL);
196 	}
197 	return (addr);
198 }
199 
200 /*
201  * This function is invoked twice: the first time, with reprogram=0, it sets
202  * up the xpvd portion of the device tree; the second invocation is ignored.
203  */
204 static void
205 xpv_enumerate(int reprogram)
206 {
207 	dev_info_t *dip;
208 
209 	if (reprogram != 0)
210 		return;
211 
212 	ndi_devi_alloc_sleep(ddi_root_node(), "xpvd",
213 	    (pnode_t)DEVI_SID_NODEID, &dip);
214 
215 	(void) ndi_devi_bind_driver(dip, 0);
216 
217 	/*
218 	 * It is too early to enumerate split device drivers in the domU,
219 	 * since we would need to create a taskq thread during enumeration.
220 	 * So we only enumerate softdevs and the console here.
221 	 */
222 	xendev_enum_all(dip, B_TRUE);
223 }
224 
225 /*
226  * Translate a hypervisor errcode to a Solaris error code.
227  */
228 int
229 xen_xlate_errcode(int error)
230 {
231 #define	CASE(num)	case X_##num: error = num; break
232 
233 	switch (-error) {
234 		CASE(EPERM);    CASE(ENOENT);   CASE(ESRCH);
235 		CASE(EINTR);	CASE(EIO);	CASE(ENXIO);
236 		CASE(E2BIG);    CASE(ENOMEM);   CASE(EACCES);
237 		CASE(EFAULT);   CASE(EBUSY);    CASE(EEXIST);
238 		CASE(ENODEV);   CASE(EISDIR);   CASE(EINVAL);
239 		CASE(ENOSPC);   CASE(ESPIPE);   CASE(EROFS);
240 		CASE(ENOSYS);   CASE(ENOTEMPTY); CASE(EISCONN);
241 		CASE(ENODATA);
242 		default:
243 		panic("xen_xlate_errcode: unknown error %d", error);
244 	}
245 	return (error);
246 #undef CASE
247 }
248 
249 /*PRINTFLIKE1*/
250 void
251 xen_printf(const char *fmt, ...)
252 {
253 	va_list adx;
254 
255 	va_start(adx, fmt);
256 	vprintf(fmt, adx);
257 	va_end(adx);
258 }
259 
260 /*
261  * Stub functions to get the FE drivers to build, and to catch drivers that
262  * misbehave in HVM domains.
263  */
264 /*ARGSUSED*/
265 void
266 xen_release_pfn(pfn_t pfn, caddr_t va)
267 {
268 	panic("xen_release_pfn() is not supported in HVM domains");
269 }
270 
271 /*ARGSUSED*/
272 void
273 reassign_pfn(pfn_t pfn, mfn_t mfn)
274 {
275 	panic("reassign_pfn() is not supported in HVM domains");
276 }
277 
278 /*ARGSUSED*/
279 long
280 balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns)
281 {
282 	panic("balloon_free_pages() is not supported in HVM domains");
283 	return (0);
284 }
285 
286 /*ARGSUSED*/
287 void
288 balloon_drv_added(int64_t delta)
289 {
290 	panic("balloon_drv_added() is not supported in HVM domains");
291 }
292 
293 /*
294  * Add a mapping for the machine page at the given virtual address.
295  */
296 void
297 kbm_map_ma(maddr_t ma, uintptr_t va, uint_t level)
298 {
299 	ASSERT(level == 0);
300 
301 	hat_devload(kas.a_hat, (caddr_t)va, MMU_PAGESIZE,
302 	    mmu_btop(ma), PROT_READ | PROT_WRITE, HAT_LOAD);
303 }
304 
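/*
 * Fetch a single HVM parameter for this domain via the HVMOP_get_param
 * hypercall (e.g. hvm_get_param(HVM_PARAM_STORE_PFN) below).  Returns
 * (uint64_t)-1 on failure.
 */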
305 static uint64_t
306 hvm_get_param(int param_id)
307 {
308 	struct xen_hvm_param xhp;
309 
310 	xhp.domid = DOMID_SELF;
311 	xhp.index = param_id;
312 	if (HYPERVISOR_hvm_op(HVMOP_get_param, &xhp) < 0)
313 		return (-1);
314 	return (xhp.value);
315 }
316 
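/*
 * In an HVM domain the grant table map operation is passed straight through
 * to the hypervisor.  The uvaddr argument is unused in this implementation.
 */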
317 /*ARGSUSED*/
318 int
319 xen_map_gref(uint_t cmd, gnttab_map_grant_ref_t *mapop, uint_t count,
320     boolean_t uvaddr)
321 {
322 	long rc;
323 
324 	ASSERT(cmd == GNTTABOP_map_grant_ref);
325 	rc = HYPERVISOR_grant_table_op(cmd, mapop, count);
326 
327 	return (rc);
328 }
329 
330 static struct xenbus_watch shutdown_watch;
331 taskq_t *xen_shutdown_tq;
332 
333 #define	SHUTDOWN_INVALID	-1
334 #define	SHUTDOWN_POWEROFF	0
335 #define	SHUTDOWN_REBOOT		1
336 #define	SHUTDOWN_SUSPEND	2
337 #define	SHUTDOWN_HALT		3
338 #define	SHUTDOWN_MAX		4
339 
340 #define	SHUTDOWN_TIMEOUT_SECS (60 * 5)
341 
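/*
 * Recursively suspend (DDI_SUSPEND) every device in the subtree rooted at
 * dip, children before parents.  Devices that suspend successfully are
 * marked DCF_CPR_SUSPENDED so that xen_resume_devices() resumes only what
 * we suspended.  Returns ENXIO on the first failure.
 */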
342 int
343 xen_suspend_devices(dev_info_t *dip)
344 {
345 	int error;
346 	char buf[XPV_BUFSIZE];
347 
348 	SUSPEND_DEBUG("xen_suspend_devices\n");
349 
350 	for (; dip != NULL; dip = ddi_get_next_sibling(dip)) {
351 		if (xen_suspend_devices(ddi_get_child(dip)))
352 			return (ENXIO);
353 		if (ddi_get_driver(dip) == NULL)
354 			continue;
355 		SUSPEND_DEBUG("Suspending device %s\n", ddi_deviname(dip, buf));
356 		ASSERT((DEVI(dip)->devi_cpr_flags & DCF_CPR_SUSPENDED) == 0);
357 
358 
359 		if (!i_ddi_devi_attached(dip)) {
360 			error = DDI_FAILURE;
361 		} else {
362 			error = devi_detach(dip, DDI_SUSPEND);
363 		}
364 
365 		if (error == DDI_SUCCESS) {
366 			DEVI(dip)->devi_cpr_flags |= DCF_CPR_SUSPENDED;
367 		} else {
368 			SUSPEND_DEBUG("WARNING: Unable to suspend device %s\n",
369 			    ddi_deviname(dip, buf));
370 			cmn_err(CE_WARN, "Unable to suspend device %s.",
371 			    ddi_deviname(dip, buf));
372 			cmn_err(CE_WARN, "Device is busy or does not "
373 			    "support suspend/resume.");
374 			return (ENXIO);
375 		}
376 	}
377 	return (0);
378 }
379 
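/*
 * Resume (DDI_RESUME), in reverse of the suspend order, every device that
 * xen_suspend_devices() marked DCF_CPR_SUSPENDED.  A failure is remembered
 * in the return value, but the whole tree is still walked so that every
 * suspend flag is cleared.
 */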
380 int
381 xen_resume_devices(dev_info_t *start, int resume_failed)
382 {
383 	dev_info_t *dip, *next, *last = NULL;
384 	int did_suspend;
385 	int error = resume_failed;
386 	char buf[XPV_BUFSIZE];
387 
388 	SUSPEND_DEBUG("xen_resume_devices\n");
389 
390 	while (last != start) {
391 		dip = start;
392 		next = ddi_get_next_sibling(dip);
393 		while (next != last) {
394 			dip = next;
395 			next = ddi_get_next_sibling(dip);
396 		}
397 
398 		/*
399 		 * Since cpr is the only one that uses this field and the
400 		 * device itself hasn't resumed yet, there is no need to use
401 		 * a lock, even though kernel threads are active by now.
402 		 */
403 		did_suspend = DEVI(dip)->devi_cpr_flags & DCF_CPR_SUSPENDED;
404 		if (did_suspend)
405 			DEVI(dip)->devi_cpr_flags &= ~DCF_CPR_SUSPENDED;
406 
407 		/*
408 		 * There may be background attaches happening on devices
409 		 * that were not originally suspended by cpr, so resume
410 		 * only devices that were suspended by cpr. Also, stop
411 		 * resuming after the first resume failure, but traverse
412 		 * the entire tree to clear the suspend flag.
413 		 */
414 		if (did_suspend && !error) {
415 			SUSPEND_DEBUG("Resuming device %s\n",
416 			    ddi_deviname(dip, buf));
417 			/*
418 			 * If a device suspended by cpr gets detached during
419 			 * the resume process (for example, due to hotplugging)
420 			 * before cpr gets around to issuing it a DDI_RESUME,
421 			 * we'll have problems.
422 			 */
423 			if (!i_ddi_devi_attached(dip)) {
424 				cmn_err(CE_WARN, "Skipping %s, device "
425 				    "not ready for resume",
426 				    ddi_deviname(dip, buf));
427 			} else {
428 				if (devi_attach(dip, DDI_RESUME) !=
429 				    DDI_SUCCESS) {
430 					error = ENXIO;
431 				}
432 			}
433 		}
434 
435 		if (error == ENXIO) {
436 			cmn_err(CE_WARN, "Unable to resume device %s",
437 			    ddi_deviname(dip, buf));
438 		}
439 
440 		error = xen_resume_devices(ddi_get_child(dip), error);
441 		last = dip;
442 	}
443 
444 	return (error);
445 }
446 
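/*
 * ddi_walk_devs() callback used by xen_suspend_domain() to locate the xpvd
 * nexus node.
 */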
447 /*ARGSUSED*/
448 static int
449 check_xpvd(dev_info_t *dip, void *arg)
450 {
451 	char *name;
452 
453 	name = ddi_node_name(dip);
454 	if (name == NULL || strcmp(name, "xpvd")) {
455 		return (DDI_WALK_CONTINUE);
456 	} else {
457 		xpvd_dip = dip;
458 		return (DDI_WALK_TERMINATE);
459 	}
460 }
461 
462 /*
463  * Top level routine to direct suspend/resume of a domain.
464  */
465 void
466 xen_suspend_domain(void)
467 {
468 	extern void rtcsync(void);
469 	extern void ec_resume(void);
470 	extern kmutex_t ec_lock;
471 	struct xen_add_to_physmap xatp;
472 	ulong_t flags;
473 	int err;
474 
475 	cmn_err(CE_NOTE, "Domain suspending for save/migrate");
476 
477 	SUSPEND_DEBUG("xen_suspend_domain\n");
478 
479 	/*
480 	 * We only want to suspend the PV devices, since the emulated devices
481 	 * are suspended by saving the emulated device state.  The PV devices
482 	 * are all children of the xpvd nexus device.  So we search the
483 	 * device tree for the xpvd node to use as the root of the tree to
484 	 * be suspended.
485 	 */
486 	if (xpvd_dip == NULL)
487 		ddi_walk_devs(ddi_root_node(), check_xpvd, NULL);
488 
489 	/*
490 	 * suspend interrupts and devices
491 	 */
492 	if (xpvd_dip != NULL)
493 		(void) xen_suspend_devices(ddi_get_child(xpvd_dip));
494 	else
495 		cmn_err(CE_WARN, "No PV devices found to suspend");
496 	SUSPEND_DEBUG("xenbus_suspend\n");
497 	xenbus_suspend();
498 
499 	mutex_enter(&cpu_lock);
500 
501 	/*
502 	 * Suspend on vcpu 0
503 	 */
504 	thread_affinity_set(curthread, 0);
505 	kpreempt_disable();
506 
507 	if (ncpus > 1)
508 		pause_cpus(NULL);
509 	/*
510 	 * We can grab the ec_lock as it's a spinlock with a high SPL. Hence
511 	 * any holder would have dropped it to get through pause_cpus().
512 	 */
513 	mutex_enter(&ec_lock);
514 
515 	/*
516 	 * From here on in, we can't take locks.
517 	 */
518 
519 	flags = intr_clear();
520 
521 	SUSPEND_DEBUG("HYPERVISOR_suspend\n");
522 	/*
523 	 * At this point we suspend and sometime later resume.
524 	 * Note that this call may return with an indication of a cancelled
525 	 * suspend; for now, no matter what the return value is, we do a full
526 	 * resume of all suspended drivers, etc.
527 	 */
528 	(void) HYPERVISOR_shutdown(SHUTDOWN_suspend);
529 
530 	/*
531 	 * Point HYPERVISOR_shared_info to the proper place.
532 	 */
533 	xatp.domid = DOMID_SELF;
534 	xatp.idx = 0;
535 	xatp.space = XENMAPSPACE_shared_info;
536 	xatp.gpfn = shared_info_frame;
537 	if ((err = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) != 0)
538 		panic("Could not set shared_info page. error: %d", err);
539 
540 	SUSPEND_DEBUG("gnttab_resume\n");
541 	gnttab_resume();
542 
543 	SUSPEND_DEBUG("ec_resume\n");
544 	ec_resume();
545 
546 	intr_restore(flags);
547 
548 	if (ncpus > 1)
549 		start_cpus();
550 
551 	mutex_exit(&ec_lock);
552 	mutex_exit(&cpu_lock);
553 
554 	/*
555 	 * Now we can take locks again.
556 	 */
557 
558 	rtcsync();
559 
560 	SUSPEND_DEBUG("xenbus_resume\n");
561 	xenbus_resume();
562 	SUSPEND_DEBUG("xen_resume_devices\n");
563 	if (xpvd_dip != NULL)
564 		(void) xen_resume_devices(ddi_get_child(xpvd_dip), 0);
565 
566 	thread_affinity_clear(curthread);
567 	kpreempt_enable();
568 
569 	SUSPEND_DEBUG("finished xen_suspend_domain\n");
570 
571 	cmn_err(CE_NOTE, "domain restore/migrate completed");
572 }
573 
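/*
 * timeout() handler armed by xen_shutdown(): if the graceful shutdown via
 * init(1M) has not completed within SHUTDOWN_TIMEOUT_SECS, force the
 * requested power-off or reboot directly with kadmin().
 */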
574 static void
575 xen_dirty_shutdown(void *arg)
576 {
577 	int cmd = (uintptr_t)arg;
578 
579 	cmn_err(CE_WARN, "Externally requested shutdown failed or "
580 	    "timed out.\nShutting down.\n");
581 
582 	switch (cmd) {
583 	case SHUTDOWN_HALT:
584 	case SHUTDOWN_POWEROFF:
585 		(void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred);
586 		break;
587 	case SHUTDOWN_REBOOT:
588 		(void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred);
589 		break;
590 	}
591 }
592 
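/*
 * Taskq callback that carries out a shutdown request read from xenstore.
 * A suspend request is handled synchronously here; poweroff, halt and
 * reboot are forwarded to init(1M) via SIGPWR, with xen_dirty_shutdown()
 * scheduled as a fallback in case that never completes.
 */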
593 static void
594 xen_shutdown(void *arg)
595 {
596 	int cmd = (uintptr_t)arg;
597 	proc_t *initpp;
598 
599 	ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX);
600 
601 	if (cmd == SHUTDOWN_SUSPEND) {
602 		xen_suspend_domain();
603 		return;
604 	}
605 
606 	switch (cmd) {
607 	case SHUTDOWN_POWEROFF:
608 		force_shutdown_method = AD_POWEROFF;
609 		break;
610 	case SHUTDOWN_HALT:
611 		force_shutdown_method = AD_HALT;
612 		break;
613 	case SHUTDOWN_REBOOT:
614 		force_shutdown_method = AD_BOOT;
615 		break;
616 	}
617 
618 
619 	/*
620 	 * If we're still booting and init(1) isn't set up yet, simply halt.
621 	 */
622 	mutex_enter(&pidlock);
623 	initpp = prfind(P_INITPID);
624 	mutex_exit(&pidlock);
625 	if (initpp == NULL) {
626 		extern void halt(char *);
627 		halt("Power off the System");   /* just in case */
628 	}
629 
630 	/*
631 	 * else, graceful shutdown with inittab and all getting involved
632 	 */
633 	psignal(initpp, SIGPWR);
634 
635 	(void) timeout(xen_dirty_shutdown, arg,
636 	    SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC));
637 }
638 
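/*
 * xenbus watch callback for the control/shutdown node.  Read the request,
 * acknowledge it by writing an empty string back (which fires this watch
 * again, hence the empty-string check below), and dispatch the actual work
 * to xen_shutdown_tq.
 */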
639 /*ARGSUSED*/
640 static void
641 xen_shutdown_handler(struct xenbus_watch *watch, const char **vec,
642 	unsigned int len)
643 {
644 	char *str;
645 	xenbus_transaction_t xbt;
646 	int err, shutdown_code = SHUTDOWN_INVALID;
647 	unsigned int slen;
648 
649 again:
650 	err = xenbus_transaction_start(&xbt);
651 	if (err)
652 		return;
653 	if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) {
654 		(void) xenbus_transaction_end(xbt, 1);
655 		return;
656 	}
657 
658 	SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU->cpu_id, str);
659 
660 	/*
661 	 * If this is a watch fired by our own write below, return early to
662 	 * avoid an infinite loop.
663 	 */
664 	if (strcmp(str, "") == 0) {
665 		(void) xenbus_transaction_end(xbt, 0);
666 		kmem_free(str, slen);
667 		return;
668 	} else if (strcmp(str, "poweroff") == 0) {
669 		shutdown_code = SHUTDOWN_POWEROFF;
670 	} else if (strcmp(str, "reboot") == 0) {
671 		shutdown_code = SHUTDOWN_REBOOT;
672 	} else if (strcmp(str, "suspend") == 0) {
673 		shutdown_code = SHUTDOWN_SUSPEND;
674 	} else if (strcmp(str, "halt") == 0) {
675 		shutdown_code = SHUTDOWN_HALT;
676 	} else {
677 		printf("Ignoring shutdown request: %s\n", str);
678 	}
679 
680 	(void) xenbus_write(xbt, "control", "shutdown", "");
681 	err = xenbus_transaction_end(xbt, 0);
682 	if (err == EAGAIN) {
683 		SUSPEND_DEBUG("%d: trying again\n", CPU->cpu_id);
684 		kmem_free(str, slen);
685 		goto again;
686 	}
687 
688 	kmem_free(str, slen);
689 	if (shutdown_code != SHUTDOWN_INVALID) {
690 		(void) taskq_dispatch(xen_shutdown_tq, xen_shutdown,
691 		    (void *)(intptr_t)shutdown_code, 0);
692 	}
693 }
694 
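/*
 * One-time PV framework setup, called from xpv_attach(): verify that we are
 * running as a Xen HVM guest via the 0x4000000x cpuid leaves, install the
 * hypercall page, register the shared_info page, and bring up grant tables,
 * event channels, xenbus and the control/shutdown watch.
 */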
695 static int
696 xen_pv_init(dev_info_t *xpv_dip)
697 {
698 	struct cpuid_regs cp;
699 	uint32_t xen_signature[4];
700 	char *xen_str;
701 	struct xen_add_to_physmap xatp;
702 	xen_capabilities_info_t caps;
703 	pfn_t pfn;
704 	uint64_t msrval;
705 	int err;
706 
707 	/*
708 	 * Xen's pseudo-cpuid function 0x40000000 returns a string
709 	 * representing the Xen signature in %ebx, %ecx, and %edx.
710 	 * %eax contains the maximum supported cpuid function.
711 	 */
712 	cp.cp_eax = 0x40000000;
713 	(void) __cpuid_insn(&cp);
714 	xen_signature[0] = cp.cp_ebx;
715 	xen_signature[1] = cp.cp_ecx;
716 	xen_signature[2] = cp.cp_edx;
717 	xen_signature[3] = 0;
718 	xen_str = (char *)xen_signature;
719 	if (strcmp("XenVMMXenVMM", xen_str) != 0 ||
720 	    cp.cp_eax < 0x40000002) {
721 		cmn_err(CE_WARN,
722 		    "Attempting to load Xen drivers on non-Xen system");
723 		return (-1);
724 	}
725 
726 	/*
727 	 * cpuid function 0x40000001 returns the Xen version in %eax.  The
728 	 * top 16 bits are the major version, the bottom 16 are the minor
729 	 * version.
730 	 */
731 	cp.cp_eax = 0x40000001;
732 	(void) __cpuid_insn(&cp);
733 	xen_major = cp.cp_eax >> 16;
734 	xen_minor = cp.cp_eax & 0xffff;
735 
736 	/*
737 	 * The xpv driver is incompatible with xen versions older than 3.1. This
738 	 * is due to the changes in the vcpu_info and shared_info structs used
739 	 * to communicate with the hypervisor (the event channels in particular)
740 	 * that were introduced with 3.1.
741 	 */
742 	if (xen_major < 3 || (xen_major == 3 && xen_minor < 1)) {
743 		cmn_err(CE_WARN, "Xen version %d.%d is not supported",
744 		    xen_major, xen_minor);
745 		return (-1);
746 	}
747 
748 	/*
749 	 * cpuid function 0x40000002 returns information about the
750 	 * hypercall page.  %eax nominally contains the number of pages
751 	 * with hypercall code, but according to the Xen guys, "I'll
752 	 * guarantee that remains one forever more, so you can just
753 	 * allocate a single page and get quite upset if you ever see CPUID
754 	 * return more than one page."  %ebx contains an MSR we use to ask
755 	 * Xen to remap each page at a specific pfn.
756 	 */
757 	cp.cp_eax = 0x40000002;
758 	(void) __cpuid_insn(&cp);
759 
760 	/*
761 	 * Let Xen know where we want the hypercall page mapped.  We
762 	 * already have a page allocated in the .text section to simplify
763 	 * the wrapper code.
764 	 */
765 	pfn = hat_getpfnum(kas.a_hat, (caddr_t)&hypercall_page);
766 	msrval = mmu_ptob(pfn);
767 	wrmsr(cp.cp_ebx, msrval);
768 
769 	/* Fill in the xen_info data */
770 	xen_info = kmem_zalloc(sizeof (start_info_t), KM_SLEEP);
771 	(void) sprintf(xen_info->magic, "xen-%d.%d", xen_major, xen_minor);
772 	xen_info->store_mfn = (mfn_t)hvm_get_param(HVM_PARAM_STORE_PFN);
773 	xen_info->store_evtchn = (int)hvm_get_param(HVM_PARAM_STORE_EVTCHN);
774 
775 	/* Figure out whether the hypervisor is 32-bit or 64-bit.  */
776 	if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) == 0) {
777 		((char *)(caps))[sizeof (caps) - 1] = '\0';
778 		if (strstr(caps, "x86_64") != NULL)
779 			xen_is_64bit = 1;
780 		else if (strstr(caps, "x86_32") != NULL)
781 			xen_is_64bit = 0;
782 	}
783 	if (xen_is_64bit < 0) {
784 		cmn_err(CE_WARN, "Couldn't get capability info from Xen.");
785 		return (-1);
786 	}
787 #ifdef __amd64
788 	ASSERT(xen_is_64bit == 1);
789 #endif
790 
791 	/*
792 	 * Allocate space for the shared_info page and tell Xen where it
793 	 * is.
794 	 */
795 	HYPERVISOR_shared_info = xen_alloc_pages(1);
796 	shared_info_frame = hat_getpfnum(kas.a_hat,
797 	    (caddr_t)HYPERVISOR_shared_info);
798 	xatp.domid = DOMID_SELF;
799 	xatp.idx = 0;
800 	xatp.space = XENMAPSPACE_shared_info;
801 	xatp.gpfn = shared_info_frame;
802 	if ((err = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) != 0) {
803 		cmn_err(CE_WARN, "Could not get shared_info page from Xen."
804 		    "  error: %d", err);
805 		return (-1);
806 	}
807 
808 	/* Set up the grant tables.  */
809 	gnttab_init();
810 
811 	/* Set up event channel support */
812 	if (ec_init(xpv_dip) != 0)
813 		return (-1);
814 
815 	/* Set up xenbus */
816 	xb_addr = vmem_alloc(heap_arena, MMU_PAGESIZE, VM_SLEEP);
817 	xs_early_init();
818 	xs_domu_init();
819 
820 	/* Set up for suspend/resume/migrate */
821 	xen_shutdown_tq = taskq_create("shutdown_taskq", 1,
822 	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
823 	shutdown_watch.node = "control/shutdown";
824 	shutdown_watch.callback = xen_shutdown_handler;
825 	if (register_xenbus_watch(&shutdown_watch))
826 		cmn_err(CE_WARN, "Failed to set shutdown watcher");
827 
828 	return (0);
829 }
830 
831 static void
832 xen_pv_fini()
833 {
834 	if (xen_info != NULL)
835 		kmem_free(xen_info, sizeof (start_info_t));
836 	ec_fini();
837 }
838 
839 /*ARGSUSED*/
840 static int
841 xpv_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
842 {
843 	if (getminor((dev_t)arg) != XPV_MINOR)
844 		return (DDI_FAILURE);
845 
846 	switch (cmd) {
847 	case DDI_INFO_DEVT2DEVINFO:
848 		*result = xpv_dip;
849 		break;
850 	case DDI_INFO_DEVT2INSTANCE:
851 		*result = 0;
852 		break;
853 	default:
854 		return (DDI_FAILURE);
855 	}
856 
857 	return (DDI_SUCCESS);
858 }
859 
860 static int
861 xpv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
862 {
863 	if (cmd != DDI_ATTACH)
864 		return (DDI_FAILURE);
865 
866 	if (ddi_create_minor_node(dip, ddi_get_name(dip), S_IFCHR,
867 	    ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS)
868 		return (DDI_FAILURE);
869 
870 	xpv_dip = dip;
871 
872 	if (xen_pv_init(dip) != 0)
873 		return (DDI_FAILURE);
874 
875 	ddi_report_dev(dip);
876 
877 	/*
878 	 * If the memscrubber attempts to scrub the pages we hand to Xen,
879 	 * the domain will panic.
880 	 */
881 	memscrub_disable();
882 
883 	/*
884 	 * Report our version to dom0.
885 	 */
886 	if (xenbus_printf(XBT_NULL, "hvmpv/xpv", "version", "%d",
887 	    HVMPV_XPV_VERS))
888 		cmn_err(CE_WARN, "xpv: couldn't write version\n");
889 
890 	return (DDI_SUCCESS);
891 }
892 
893 /*
894  * Attempting to reload the PV driver plumbing hangs on Intel platforms,
895  * so we don't want to unload the framework by accident.
896  */
897 int xpv_allow_detach = 0;
898 
899 static int
900 xpv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
901 {
902 	if (cmd != DDI_DETACH || xpv_allow_detach == 0)
903 		return (DDI_FAILURE);
904 
905 	if (xpv_dip != NULL) {
906 		xen_pv_fini();
907 		ddi_remove_minor_node(dip, NULL);
908 		xpv_dip = NULL;
909 	}
910 
911 	return (DDI_SUCCESS);
912 }
913 
914 /*ARGSUSED1*/
915 static int
916 xpv_open(dev_t *dev, int flag, int otyp, cred_t *cr)
917 {
918 	return (getminor(*dev) == XPV_MINOR ? 0 : ENXIO);
919 }
920 
921 /*ARGSUSED*/
922 static int
923 xpv_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cr,
924     int *rval_p)
925 {
926 	return (EINVAL);
927 }
928 
929 int
930 _init(void)
931 {
932 	int err;
933 
934 	if ((err = mod_install(&modl)) != 0)
935 		return (err);
936 
937 	impl_bus_add_probe(xpv_enumerate);
938 	return (0);
939 }
940 
941 int
942 _fini(void)
943 {
944 	int err;
945 
946 	if ((err = mod_remove(&modl)) != 0)
947 		return (err);
948 
949 	impl_bus_delete_probe(xpv_enumerate);
950 	return (0);
951 }
952 
953 int
954 _info(struct modinfo *modinfop)
955 {
956 	return (mod_info(&modl, modinfop));
957 }
958