xref: /illumos-gate/usr/src/uts/i86xpv/os/mp_xen.c (revision d3b5f56344d8bfcdd6cfb82446af0e5e55ad9ebe)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright 2019 Joyent, Inc.
29  */
30 
31 /*
32  * Virtual CPU management.
33  *
34  * VCPUs can be controlled in one of two ways; through the domain itself
35  * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()).
36  * Unfortunately, the terminology is used in different ways; they work out as
37  * follows:
38  *
39  * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
40  *
41  * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
42  * hypervisor on the idle thread).  It must be up since a downed VCPU cannot
43  * receive interrupts, and we require this for offline CPUs in Solaris.
44  *
45  * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
46  * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
47  * if it has run previously, its software state (cpu_t, machcpu structures, IPI
48  * event channels, etc.) will still exist.
49  *
50  * The hypervisor has two notions of CPU states as represented in the store:
51  *
52  * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
53  *
54  * "online": the VCPU is running.  Corresponds to a CPU state other than
55  * P_POWEROFF.
56  *
57  * Currently, only a notification via xenstore can bring a CPU into a
58  * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
59  * P_OFFLINE, etc.  We need to be careful to treat xenstore notifications
60  * idempotently, as we'll get 'duplicate' entries when we resume a domain.
61  *
62  * Note that the xenstore configuration is strictly advisory, in that a domain
63  * can choose to ignore it and still power up a VCPU in the offline state. To
64  * play nice, we don't allow it. Thus, any attempt to power on/off a CPU is
65  * ENOTSUP from within Solaris.
66  *
67  * Powering off a VCPU and suspending the domain use similar code. The
68  * difficulty here is that we must ensure that each VCPU is in a stable
69  * state: it must have a saved PCB, and not be responding to interrupts
70  * (since we are just about to remove its ability to run on a real CPU,
71  * possibly forever).  However, an offline CPU in Solaris can take
72  * cross-call interrupts, as mentioned, so we must go through a
73  * two-stage process.  First, we use the standard Solaris pause_cpus().
74  * This ensures that all CPUs are either in mach_cpu_pause() or
75  * mach_cpu_idle(), and nothing will cross-call them.
76  *
77  * Powered-off-CPUs are already safe, as we own the cpu_lock needed to
78  * bring them back up, and in state CPU_PHASE_POWERED_OFF.
79  *
80  * Running CPUs are spinning in mach_cpu_pause() waiting for either
81  * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
82  *
83  * Offline CPUs are either running the idle thread and periodically
84  * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
85  *
86  * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as
87  * poking them to make sure they're not blocked[1]. When every CPU has
88  * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
89  * know we can suspend, or power-off a CPU, without problems.
90  *
91  * [1] note that we have to repeatedly poke offline CPUs: it's the only
92  * way to ensure that the CPU doesn't miss the state change before
93  * dropping into HYPERVISOR_block().
94  */
95 
96 #include <sys/types.h>
97 #include <sys/systm.h>
98 #include <sys/param.h>
99 #include <sys/taskq.h>
100 #include <sys/cmn_err.h>
101 #include <sys/archsystm.h>
102 #include <sys/machsystm.h>
103 #include <sys/segments.h>
104 #include <sys/cpuvar.h>
105 #include <sys/x86_archext.h>
106 #include <sys/controlregs.h>
107 #include <sys/hypervisor.h>
108 #include <sys/xpv_panic.h>
109 #include <sys/mman.h>
110 #include <sys/psw.h>
111 #include <sys/cpu.h>
112 #include <sys/sunddi.h>
113 #include <util/sscanf.h>
114 #include <vm/hat_i86.h>
115 #include <vm/hat.h>
116 #include <vm/as.h>
117 
118 #include <xen/public/io/xs_wire.h>
119 #include <xen/sys/xenbus_impl.h>
120 #include <xen/public/vcpu.h>
121 
122 extern cpuset_t cpu_ready_set;
123 
124 #define	CPU_PHASE_NONE 0
125 #define	CPU_PHASE_WAIT_SAFE 1
126 #define	CPU_PHASE_SAFE 2
127 #define	CPU_PHASE_POWERED_OFF 3
128 
129 /*
130  * We can only poke CPUs during barrier enter 256 times a second at
131  * most.
132  */
133 #define	POKE_TIMEOUT (NANOSEC / 256)
134 
135 static taskq_t *cpu_config_tq;
136 static int cpu_phase[NCPU];
137 
138 static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
139 static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);
140 
141 /*
142  * Return whether or not the vcpu is actually running on a pcpu
143  */
144 int
145 vcpu_on_pcpu(processorid_t cpu)
146 {
147 	struct vcpu_runstate_info runstate;
148 	int	ret = VCPU_STATE_UNKNOWN;
149 
150 	ASSERT(cpu < NCPU);
151 	/*
152 	 * Don't bother with hypercall if we are asking about ourself
153 	 */
154 	if (cpu == CPU->cpu_id)
155 		return (VCPU_ON_PCPU);
156 	if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate) != 0)
157 		goto out;
158 
159 	switch (runstate.state) {
160 	case RUNSTATE_running:
161 		ret = VCPU_ON_PCPU;
162 		break;
163 
164 	case RUNSTATE_runnable:
165 	case RUNSTATE_offline:
166 	case RUNSTATE_blocked:
167 		ret = VCPU_NOT_ON_PCPU;
168 		break;
169 
170 	default:
171 		break;
172 	}
173 
174 out:
175 	return (ret);
176 }
177 
/*
 * Hook for allocating any global state needed while starting cpus.
 * Virtual cpus need none, so there is nothing to do and this always
 * reports success (0).
 */
int
mach_cpucontext_init(void)
{
	const int success = 0;

	return (success);
}
187 
188 void
189 do_cpu_config_watch(int state)
190 {
191 	static struct xenbus_watch cpu_config_watch;
192 
193 	if (state != XENSTORE_UP)
194 		return;
195 	cpu_config_watch.node = "cpu";
196 	cpu_config_watch.callback = vcpu_config_event;
197 	if (register_xenbus_watch(&cpu_config_watch)) {
198 		taskq_destroy(cpu_config_tq);
199 		cmn_err(CE_WARN, "do_cpu_config_watch: "
200 		    "failed to set vcpu config watch");
201 	}
202 
203 }
204 
205 /*
206  * This routine is called after all the "normal" MP startup has
207  * been done; a good place to start watching xen store for virtual
208  * cpu hot plug events.
209  */
210 void
211 mach_cpucontext_fini(void)
212 {
213 
214 	cpu_config_tq = taskq_create("vcpu config taskq", 1,
215 	    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
216 
217 	(void) xs_register_xenbus_callback(do_cpu_config_watch);
218 }
219 
220 /*
221  * Fill in the remaining CPU context and initialize it.
222  */
223 static int
224 mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
225 {
226 	uint_t vec, iopl;
227 
228 	vgc->flags = VGCF_IN_KERNEL;
229 
230 	/*
231 	 * fpu_ctx we leave as zero; on first fault we'll store
232 	 * sse_initial into it anyway.
233 	 */
234 
235 #if defined(__amd64)
236 	vgc->user_regs.cs = KCS_SEL | SEL_KPL;	/* force to ring 3 */
237 #else
238 	vgc->user_regs.cs = KCS_SEL;
239 #endif
240 	vgc->user_regs.ds = KDS_SEL;
241 	vgc->user_regs.es = KDS_SEL;
242 	vgc->user_regs.ss = KDS_SEL;
243 	vgc->kernel_ss = KDS_SEL;
244 
245 	/*
246 	 * Allow I/O privilege level for Dom0 kernel.
247 	 */
248 	if (DOMAIN_IS_INITDOMAIN(xen_info))
249 		iopl = (PS_IOPL & 0x1000); /* ring 1 */
250 	else
251 		iopl = 0;
252 
253 #if defined(__amd64)
254 	vgc->user_regs.fs = 0;
255 	vgc->user_regs.gs = 0;
256 	vgc->user_regs.rflags = F_OFF | iopl;
257 #elif defined(__i386)
258 	vgc->user_regs.fs = KFS_SEL;
259 	vgc->user_regs.gs = KGS_SEL;
260 	vgc->user_regs.eflags = F_OFF | iopl;
261 	vgc->event_callback_cs = vgc->user_regs.cs;
262 	vgc->failsafe_callback_cs = vgc->user_regs.cs;
263 #endif
264 
265 	/*
266 	 * Initialize the trap_info_t from the IDT
267 	 */
268 #if !defined(__lint)
269 	ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
270 #endif
271 	for (vec = 0; vec < NIDT; vec++) {
272 		trap_info_t *ti = &vgc->trap_ctxt[vec];
273 
274 		if (xen_idt_to_trap_info(vec,
275 		    &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
276 			ti->cs = KCS_SEL;
277 			ti->vector = vec;
278 		}
279 	}
280 
281 	/*
282 	 * No LDT
283 	 */
284 
285 	/*
286 	 * (We assert in various places that the GDT is (a) aligned on a
287 	 * page boundary and (b) one page long, so this really should fit..)
288 	 */
289 #ifdef CRASH_XEN
290 	vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
291 #else
292 	vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
293 #endif
294 	vgc->gdt_ents = NGDT;
295 
296 	vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());
297 
298 #if defined(__i386)
299 	if (mmu.pae_hat)
300 		vgc->ctrlreg[3] =
301 		    xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
302 	else
303 #endif
304 		vgc->ctrlreg[3] =
305 		    pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));
306 
307 	vgc->ctrlreg[4] = getcr4();
308 
309 	vgc->event_callback_eip = (uintptr_t)xen_callback;
310 	vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
311 	vgc->flags |= VGCF_failsafe_disables_events;
312 
313 #if defined(__amd64)
314 	/*
315 	 * XXPV should this be moved to init_cpu_syscall?
316 	 */
317 	vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
318 	vgc->flags |= VGCF_syscall_disables_events;
319 
320 	ASSERT(vgc->user_regs.gs == 0);
321 	vgc->gs_base_kernel = (uintptr_t)cp;
322 #endif
323 
324 	return (xen_vcpu_initialize(cp->cpu_id, vgc));
325 }
326 
327 /*
328  * Create a guest virtual cpu context so that the virtual cpu
329  * springs into life in the domain just about to call mp_startup()
330  *
331  * Virtual CPUs must be initialized once in the lifetime of the domain;
332  * after that subsequent attempts to start them will fail with X_EEXIST.
333  *
334  * Thus 'alloc' -really- creates and initializes the virtual
335  * CPU context just once. Once the initialisation succeeds, we never
336  * free it, nor the regular cpu_t to which it refers.
337  */
338 void *
339 mach_cpucontext_alloc(struct cpu *cp)
340 {
341 	kthread_t *tp = cp->cpu_thread;
342 	vcpu_guest_context_t vgc;
343 
344 	int err = 1;
345 
346 	/*
347 	 * First, augment the incoming cpu structure
348 	 * - vcpu pointer reference
349 	 * - pending event storage area
350 	 * - physical address of GDT
351 	 */
352 	cp->cpu_m.mcpu_vcpu_info =
353 	    &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
354 	cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
355 	    sizeof (struct xen_evt_data), KM_SLEEP);
356 	cp->cpu_m.mcpu_gdtpa =
357 	    mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));
358 
359 	if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
360 		goto done;
361 
362 	/*
363 	 * Now set up the vcpu context so that we can start this vcpu
364 	 * in the kernel at tp->t_pc (mp_startup).  Note that the
365 	 * thread will thread_exit() shortly after performing the
366 	 * initialization; in particular, we will *never* take a
367 	 * privilege transition on this thread.
368 	 */
369 
370 	bzero(&vgc, sizeof (vgc));
371 
372 #ifdef __amd64
373 	vgc.user_regs.rip = tp->t_pc;
374 	vgc.user_regs.rsp = tp->t_sp;
375 	vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
376 #else
377 	vgc.user_regs.eip = tp->t_pc;
378 	vgc.user_regs.esp = tp->t_sp;
379 	vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
380 #endif
381 	/*
382 	 * XXPV	Fix resume, if Russ didn't already fix it.
383 	 *
384 	 * Note that resume unconditionally puts t->t_stk + sizeof (regs)
385 	 * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
386 	 * that only lwps take traps that switch to the kernel stack;
387 	 * part of creating an lwp adjusts the stack by subtracting
388 	 * sizeof (struct regs) off t_stk.
389 	 *
390 	 * The more interesting question is, why do we do all the work
391 	 * of a fully fledged lwp for a plain thread?  In particular
392 	 * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
393 	 * or futz with the LDT.  This should probably all be done with
394 	 * an lwp context operator to keep pure thread context switch fast.
395 	 */
396 	vgc.kernel_sp = (ulong_t)tp->t_stk;
397 
398 	err = mp_set_cpu_context(&vgc, cp);
399 
400 done:
401 	if (err) {
402 		mach_cpucontext_free(cp, NULL, err);
403 		return (NULL);
404 	}
405 	return (cp);
406 }
407 
408 /*
409  * By the time we are called either we have successfully started
410  * the cpu, or our attempt to start it has failed.
411  */
412 
413 /*ARGSUSED*/
414 void
415 mach_cpucontext_free(struct cpu *cp, void *arg, int err)
416 {
417 	switch (err) {
418 	case 0:
419 		break;
420 	case ETIMEDOUT:
421 		/*
422 		 * The vcpu context is loaded into the hypervisor, and
423 		 * we've tried to start it, but the vcpu has not been set
424 		 * running yet, for whatever reason.  We arrange to -not-
425 		 * free any data structures it may be referencing.  In
426 		 * particular, we've already told the hypervisor about
427 		 * the GDT, and so we can't map it read-write again.
428 		 */
429 		break;
430 	default:
431 		(void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
432 		kmem_free(cp->cpu_m.mcpu_evt_pend,
433 		    sizeof (struct xen_evt_data));
434 		break;
435 	}
436 }
437 
438 /*
439  * Reset this CPU's context.  Clear out any pending evtchn data, since event
440  * channel numbers will all change when we resume.
441  */
442 void
443 mach_cpucontext_reset(cpu_t *cp)
444 {
445 	bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
446 	/* mcpu_intr_pending ? */
447 }
448 
449 static void
450 pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
451 {
452 #ifdef __amd64
453 	vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
454 	vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
455 	vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
456 	vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
457 	vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
458 	vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
459 	vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
460 	vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
461 #else /* __amd64 */
462 	vgc->user_regs.eip = pcb->val[REG_LABEL_PC];
463 	vgc->user_regs.esp = pcb->val[REG_LABEL_SP];
464 	vgc->user_regs.ebp = pcb->val[REG_LABEL_BP];
465 	vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX];
466 	vgc->user_regs.esi = pcb->val[REG_LABEL_ESI];
467 	vgc->user_regs.edi = pcb->val[REG_LABEL_EDI];
468 #endif /* __amd64 */
469 }
470 
471 /*
472  * Restore the context of a CPU during resume.  This context is always
473  * inside enter_safe_phase(), below.
474  */
475 void
476 mach_cpucontext_restore(cpu_t *cp)
477 {
478 	vcpu_guest_context_t vgc;
479 	int err;
480 
481 	ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
482 	    cp->cpu_thread == cp->cpu_idle_thread);
483 
484 	bzero(&vgc, sizeof (vgc));
485 
486 	pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);
487 
488 	/*
489 	 * We're emulating a longjmp() here: in particular, we need to bump the
490 	 * stack pointer to account for the pop of xIP that returning from
491 	 * longjmp() normally would do, and set the return value in xAX to 1.
492 	 */
493 #ifdef __amd64
494 	vgc.user_regs.rax = 1;
495 	vgc.user_regs.rsp += sizeof (ulong_t);
496 #else
497 	vgc.user_regs.eax = 1;
498 	vgc.user_regs.esp += sizeof (ulong_t);
499 #endif
500 
501 	vgc.kernel_sp = cp->cpu_thread->t_sp;
502 
503 	err = mp_set_cpu_context(&vgc, cp);
504 
505 	ASSERT(err == 0);
506 }
507 
508 /*
509  * Reach a point at which the CPU can be safely powered-off or
510  * suspended.  Nothing can wake this CPU out of the loop.
511  */
512 static void
513 enter_safe_phase(void)
514 {
515 	ulong_t flags = intr_clear();
516 
517 	if (setjmp(&curthread->t_pcb) == 0) {
518 		cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
519 		while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
520 			SMT_PAUSE();
521 	}
522 
523 	ASSERT(!interrupts_enabled());
524 
525 	intr_restore(flags);
526 }
527 
528 /*
529  * Offline CPUs run this code even under a pause_cpus(), so we must
530  * check if we need to enter the safe phase.
531  */
532 void
533 mach_cpu_idle(void)
534 {
535 	if (IN_XPV_PANIC()) {
536 		xpv_panic_halt();
537 	} else  {
538 		(void) HYPERVISOR_block();
539 		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
540 			enter_safe_phase();
541 	}
542 }
543 
544 /*
545  * Spin until either start_cpus() wakes us up, or we get a request to
546  * enter the safe phase (followed by a later start_cpus()).
547  */
548 void
549 mach_cpu_pause(volatile char *safe)
550 {
551 	*safe = PAUSE_WAIT;
552 	membar_enter();
553 
554 	while (*safe != PAUSE_IDLE) {
555 		if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
556 			enter_safe_phase();
557 		SMT_PAUSE();
558 	}
559 }
560 
561 void
562 mach_cpu_halt(char *msg)
563 {
564 	if (msg)
565 		prom_printf("%s\n", msg);
566 	(void) xen_vcpu_down(CPU->cpu_id);
567 }
568 
/*
 * Powering a vcpu on from within the domain is not supported; that is
 * only done in response to xenstore.  Always returns ENOTSUP.
 */
/*ARGSUSED*/
int
mp_cpu_poweron(struct cpu *cp)
{
	const int rv = ENOTSUP;

	return (rv);
}
575 
/*
 * Powering a vcpu off from within the domain is not supported; that is
 * only done in response to xenstore.  Always returns ENOTSUP.
 */
/*ARGSUSED*/
int
mp_cpu_poweroff(struct cpu *cp)
{
	const int rv = ENOTSUP;

	return (rv);
}
582 
583 void
584 mp_enter_barrier(void)
585 {
586 	hrtime_t last_poke_time = 0;
587 	int poke_allowed = 0;
588 	int done = 0;
589 	int i;
590 
591 	ASSERT(MUTEX_HELD(&cpu_lock));
592 
593 	pause_cpus(NULL, NULL);
594 
595 	while (!done) {
596 		done = 1;
597 		poke_allowed = 0;
598 
599 		if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
600 			last_poke_time = xpv_gethrtime();
601 			poke_allowed = 1;
602 		}
603 
604 		for (i = 0; i < NCPU; i++) {
605 			cpu_t *cp = cpu_get(i);
606 
607 			if (cp == NULL || cp == CPU)
608 				continue;
609 
610 			switch (cpu_phase[i]) {
611 			case CPU_PHASE_NONE:
612 				cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
613 				poke_cpu(i);
614 				done = 0;
615 				break;
616 
617 			case CPU_PHASE_WAIT_SAFE:
618 				if (poke_allowed)
619 					poke_cpu(i);
620 				done = 0;
621 				break;
622 
623 			case CPU_PHASE_SAFE:
624 			case CPU_PHASE_POWERED_OFF:
625 				break;
626 			}
627 		}
628 
629 		SMT_PAUSE();
630 	}
631 }
632 
633 void
634 mp_leave_barrier(void)
635 {
636 	int i;
637 
638 	ASSERT(MUTEX_HELD(&cpu_lock));
639 
640 	for (i = 0; i < NCPU; i++) {
641 		cpu_t *cp = cpu_get(i);
642 
643 		if (cp == NULL || cp == CPU)
644 			continue;
645 
646 		switch (cpu_phase[i]) {
647 		/*
648 		 * If we see a CPU in one of these phases, something has
649 		 * gone badly wrong with the guarantees
650 		 * mp_enter_barrier() is supposed to provide.  Rather
651 		 * than attempt to stumble along (and since we can't
652 		 * panic properly in this context), we tell the
653 		 * hypervisor we've crashed.
654 		 */
655 		case CPU_PHASE_NONE:
656 		case CPU_PHASE_WAIT_SAFE:
657 			(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
658 			break;
659 
660 		case CPU_PHASE_POWERED_OFF:
661 			break;
662 
663 		case CPU_PHASE_SAFE:
664 			cpu_phase[i] = CPU_PHASE_NONE;
665 		}
666 	}
667 
668 	start_cpus();
669 }
670 
671 static int
672 poweroff_vcpu(struct cpu *cp)
673 {
674 	int error;
675 
676 	ASSERT(MUTEX_HELD(&cpu_lock));
677 
678 	ASSERT(CPU->cpu_id != cp->cpu_id);
679 	ASSERT(cp->cpu_flags & CPU_QUIESCED);
680 
681 	mp_enter_barrier();
682 
683 	if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
684 		ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);
685 
686 		CPUSET_DEL(cpu_ready_set, cp->cpu_id);
687 
688 		if (cp->cpu_flags & CPU_ENABLE)
689 			ncpus_intr_enabled--;
690 
691 		cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
692 		cp->cpu_flags &=
693 		    ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);
694 
695 		cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;
696 
697 		cpu_set_state(cp);
698 	}
699 
700 	mp_leave_barrier();
701 
702 	return (error);
703 }
704 
705 static int
706 vcpu_config_poweroff(processorid_t id)
707 {
708 	int oldstate;
709 	int error;
710 	cpu_t *cp;
711 
712 	mutex_enter(&cpu_lock);
713 
714 	if ((cp = cpu_get(id)) == NULL) {
715 		mutex_exit(&cpu_lock);
716 		return (ESRCH);
717 	}
718 
719 	if (cpu_get_state(cp) == P_POWEROFF) {
720 		mutex_exit(&cpu_lock);
721 		return (0);
722 	}
723 
724 	mutex_exit(&cpu_lock);
725 
726 	do {
727 		error = p_online_internal(id, P_OFFLINE,
728 		    &oldstate);
729 
730 		if (error != 0)
731 			break;
732 
733 		/*
734 		 * So we just changed it to P_OFFLINE.  But then we dropped
735 		 * cpu_lock, so now it is possible for another thread to change
736 		 * the cpu back to a different, non-quiesced state e.g.
737 		 * P_ONLINE.
738 		 */
739 		mutex_enter(&cpu_lock);
740 		if ((cp = cpu_get(id)) == NULL)
741 			error = ESRCH;
742 		else {
743 			if (cp->cpu_flags & CPU_QUIESCED)
744 				error = poweroff_vcpu(cp);
745 			else
746 				error = EBUSY;
747 		}
748 		mutex_exit(&cpu_lock);
749 	} while (error == EBUSY);
750 
751 	return (error);
752 }
753 
754 /*
755  * Add a new virtual cpu to the domain.
756  */
757 static int
758 vcpu_config_new(processorid_t id)
759 {
760 	extern int start_cpu(processorid_t);
761 	int error;
762 
763 	if (ncpus == 1) {
764 		printf("cannot (yet) add cpus to a single-cpu domain\n");
765 		return (ENOTSUP);
766 	}
767 
768 	affinity_set(CPU_CURRENT);
769 	error = start_cpu(id);
770 	affinity_clear();
771 	return (error);
772 }
773 
774 static int
775 poweron_vcpu(struct cpu *cp)
776 {
777 	int error;
778 
779 	ASSERT(MUTEX_HELD(&cpu_lock));
780 
781 	if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
782 		printf("poweron_vcpu: vcpu%d is not available!\n",
783 		    cp->cpu_id);
784 		return (ENXIO);
785 	}
786 
787 	if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
788 		CPUSET_ADD(cpu_ready_set, cp->cpu_id);
789 		cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
790 		cp->cpu_flags &= ~CPU_POWEROFF;
791 		/*
792 		 * There are some nasty races possible here.
793 		 * Tell the vcpu it's up one more time.
794 		 * XXPV	Is this enough?  Is this safe?
795 		 */
796 		(void) xen_vcpu_up(cp->cpu_id);
797 
798 		cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;
799 
800 		cpu_set_state(cp);
801 	}
802 	return (error);
803 }
804 
805 static int
806 vcpu_config_poweron(processorid_t id)
807 {
808 	cpu_t *cp;
809 	int oldstate;
810 	int error;
811 
812 	if (id >= ncpus)
813 		return (vcpu_config_new(id));
814 
815 	mutex_enter(&cpu_lock);
816 
817 	if ((cp = cpu_get(id)) == NULL) {
818 		mutex_exit(&cpu_lock);
819 		return (ESRCH);
820 	}
821 
822 	if (cpu_get_state(cp) != P_POWEROFF) {
823 		mutex_exit(&cpu_lock);
824 		return (0);
825 	}
826 
827 	if ((error = poweron_vcpu(cp)) != 0) {
828 		mutex_exit(&cpu_lock);
829 		return (error);
830 	}
831 
832 	mutex_exit(&cpu_lock);
833 
834 	return (p_online_internal(id, P_ONLINE, &oldstate));
835 }
836 
837 #define	REPORT_LEN	128
838 
839 static void
840 vcpu_config_report(processorid_t id, uint_t newstate, int error)
841 {
842 	char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
843 	size_t len;
844 	char *ps;
845 
846 	switch (newstate) {
847 	case P_ONLINE:
848 		ps = PS_ONLINE;
849 		break;
850 	case P_POWEROFF:
851 		ps = PS_POWEROFF;
852 		break;
853 	default:
854 		cmn_err(CE_PANIC, "unknown state %u\n", newstate);
855 		break;
856 	}
857 
858 	len = snprintf(report, REPORT_LEN,
859 	    "cpu%d: externally initiated %s", id, ps);
860 
861 	if (!error) {
862 		cmn_err(CE_CONT, "!%s\n", report);
863 		kmem_free(report, REPORT_LEN);
864 		return;
865 	}
866 
867 	len += snprintf(report + len, REPORT_LEN - len,
868 	    " failed, error %d: ", error);
869 	switch (error) {
870 	case EEXIST:
871 		len += snprintf(report + len, REPORT_LEN - len,
872 		    "cpu already %s", ps ? ps : "?");
873 		break;
874 	case ESRCH:
875 		len += snprintf(report + len, REPORT_LEN - len,
876 		    "cpu not found");
877 		break;
878 	case EINVAL:
879 	case EALREADY:
880 		break;
881 	case EPERM:
882 		len += snprintf(report + len, REPORT_LEN - len,
883 		    "insufficient privilege (0x%x)", id);
884 		break;
885 	case EBUSY:
886 		switch (newstate) {
887 		case P_ONLINE:
888 			/*
889 			 * This return comes from mp_cpu_start -
890 			 * we cannot 'start' the boot CPU.
891 			 */
892 			len += snprintf(report + len, REPORT_LEN - len,
893 			    "already running");
894 			break;
895 		case P_POWEROFF:
896 			len += snprintf(report + len, REPORT_LEN - len,
897 			    "bound lwps?");
898 			break;
899 		default:
900 			break;
901 		}
902 	default:
903 		break;
904 	}
905 
906 	cmn_err(CE_CONT, "%s\n", report);
907 	kmem_free(report, REPORT_LEN);
908 }
909 
910 static void
911 vcpu_config(void *arg)
912 {
913 	int id = (int)(uintptr_t)arg;
914 	int error;
915 	char dir[16];
916 	char *state;
917 
918 	if ((uint_t)id >= max_ncpus) {
919 		cmn_err(CE_WARN,
920 		    "vcpu_config: cpu%d does not fit in this domain", id);
921 		return;
922 	}
923 
924 	(void) snprintf(dir, sizeof (dir), "cpu/%d", id);
925 	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
926 	if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
927 		if (strcmp(state, "online") == 0) {
928 			error = vcpu_config_poweron(id);
929 			vcpu_config_report(id, P_ONLINE, error);
930 		} else if (strcmp(state, "offline") == 0) {
931 			error = vcpu_config_poweroff(id);
932 			vcpu_config_report(id, P_POWEROFF, error);
933 		} else {
934 			cmn_err(CE_WARN,
935 			    "cpu%d: unknown target state '%s'", id, state);
936 		}
937 	} else
938 		cmn_err(CE_WARN,
939 		    "cpu%d: unable to read target state from xenstore", id);
940 
941 	kmem_free(state, MAXPATHLEN);
942 }
943 
944 /*ARGSUSED*/
945 static void
946 vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
947 {
948 	const char *path = vec[XS_WATCH_PATH];
949 	processorid_t id;
950 	char *s;
951 
952 	if ((s = strstr(path, "cpu/")) != NULL &&
953 	    sscanf(s, "cpu/%d", &id) == 1) {
954 		/*
955 		 * Run the virtual CPU configuration on a separate thread to
956 		 * avoid blocking on this event for too long (and for now,
957 		 * to ensure configuration requests are serialized.)
958 		 */
959 		(void) taskq_dispatch(cpu_config_tq,
960 		    vcpu_config, (void *)(uintptr_t)id, 0);
961 	}
962 }
963 
964 static int
965 xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
966 {
967 	int err;
968 
969 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
970 		char *str;
971 		int level = CE_WARN;
972 
973 		switch (err) {
974 		case -X_EINVAL:
975 			/*
976 			 * This interface squashes multiple error sources
977 			 * to one error code.  In particular, an X_EINVAL
978 			 * code can mean:
979 			 *
980 			 * -	the vcpu id is out of range
981 			 * -	cs or ss are in ring 0
982 			 * -	cr3 is wrong
983 			 * -	an entry in the new gdt is above the
984 			 *	reserved entry
985 			 * -	a frame underneath the new gdt is bad
986 			 */
987 			str = "something is wrong :(";
988 			break;
989 		case -X_ENOENT:
990 			str = "no such cpu";
991 			break;
992 		case -X_ENOMEM:
993 			str = "no mem to copy ctxt";
994 			break;
995 		case -X_EFAULT:
996 			str = "bad address";
997 			break;
998 		case -X_EEXIST:
999 			/*
1000 			 * Hmm.  This error is returned if the vcpu has already
1001 			 * been initialized once before in the lifetime of this
1002 			 * domain.  This is a logic error in the kernel.
1003 			 */
1004 			level = CE_PANIC;
1005 			str = "already initialized";
1006 			break;
1007 		default:
1008 			level = CE_PANIC;
1009 			str = "<unexpected>";
1010 			break;
1011 		}
1012 
1013 		cmn_err(level, "vcpu%d: failed to init: error %d: %s",
1014 		    id, -err, str);
1015 	}
1016 	return (err);
1017 }
1018 
1019 long
1020 xen_vcpu_up(processorid_t id)
1021 {
1022 	long err;
1023 
1024 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
1025 		char *str;
1026 
1027 		switch (err) {
1028 		case -X_ENOENT:
1029 			str = "no such cpu";
1030 			break;
1031 		case -X_EINVAL:
1032 			/*
1033 			 * Perhaps this is diagnostic overkill.
1034 			 */
1035 			if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
1036 				str = "bad cpuid";
1037 			else
1038 				str = "not initialized";
1039 			break;
1040 		default:
1041 			str = "<unexpected>";
1042 			break;
1043 		}
1044 
1045 		printf("vcpu%d: failed to start: error %d: %s\n",
1046 		    id, -(int)err, str);
1047 		return (EBFONT);	/* deliberately silly */
1048 	}
1049 	return (err);
1050 }
1051 
1052 long
1053 xen_vcpu_down(processorid_t id)
1054 {
1055 	long err;
1056 
1057 	if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
1058 		/*
1059 		 * X_ENOENT:	no such cpu
1060 		 * X_EINVAL:	bad cpuid
1061 		 */
1062 		panic("vcpu%d: failed to stop: error %d", id, -(int)err);
1063 	}
1064 
1065 	return (err);
1066 }
1067