xref: /illumos-gate/usr/src/uts/intel/os/sysi86.c (revision a2cd9e1884647e1e412c282879881873b71c84df)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2021 Joyent, Inc.
 */

/*	Copyright (c) 1990, 1991 UNIX System Laboratories, Inc.	*/
/*	Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T	*/
/*	  All Rights Reserved	*/

/*	Copyright (c) 1987, 1988 Microsoft Corporation	*/
/*	  All Rights Reserved	*/

#include <sys/param.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/errno.h>
#include <sys/fault.h>
#include <sys/syscall.h>
#include <sys/cpuvar.h>
#include <sys/sysi86.h>
#include <sys/psw.h>
#include <sys/cred.h>
#include <sys/policy.h>
#include <sys/thread.h>
#include <sys/debug.h>
#include <sys/ontrap.h>
#include <sys/privregs.h>
#include <sys/x86_archext.h>
#include <sys/vmem.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/archsystm.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/faultcode.h>
#include <sys/fp.h>
#include <sys/cmn_err.h>
#include <sys/segments.h>
#include <sys/clock.h>
#include <vm/hat_i86.h>
#if defined(__xpv)
#include <sys/hypervisor.h>
#include <sys/note.h>
#endif

static void ldt_alloc(proc_t *, uint_t);
static void ldt_free(proc_t *);
static void ldt_dup(proc_t *, proc_t *);
static void ldt_grow(proc_t *, uint_t);

/*
 * sysi86 System Call
 */
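/*
 * Userland reaches this handler through the sysi86(2) system call.  A
 * rough, illustrative-only sketch of a 32-bit caller installing an LDT
 * descriptor:
 *
 *	struct ssd ssd;
 *	... fill in ssd.sel, ssd.bo, ssd.ls, ssd.acc1, ssd.acc2 ...
 *	if (sysi86(SI86DSCR, &ssd) == -1)
 *		perror("sysi86");
 */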

/* ARGSUSED */
int
sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
{
	struct ssd ssd;
	int error = 0;
	int c;
	proc_t *pp = curproc;

	switch (cmd) {

	/*
	 * The SI86V86 subsystem call of the SYSI86 system call
	 * supports only one subcode -- V86SC_IOPL.
	 */
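	/*
	 * For example (illustrative only), a sufficiently privileged
	 * process can raise its I/O privilege level to 3 with
	 * sysi86(SI86V86, V86SC_IOPL, PS_IOPL), after which it may issue
	 * in/out instructions directly from user mode.
	 */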
	case SI86V86:
		if (arg1 == V86SC_IOPL) {
#if defined(__xpv)
			struct ctxop *ctx;
#endif
			struct regs *rp = lwptoregs(ttolwp(curthread));
			greg_t oldpl = rp->r_ps & PS_IOPL;
			greg_t newpl = arg2 & PS_IOPL;

			/*
			 * Must be privileged to run this system call
			 * if raising the I/O privilege level.
			 */
			if (newpl > oldpl && (error =
			    secpolicy_sys_config(CRED(), B_FALSE)) != 0)
				return (set_errno(error));
#if defined(__xpv)
			ctx = installctx_preallocate();
			kpreempt_disable();
			installctx(curthread, NULL, xen_disable_user_iopl,
			    xen_enable_user_iopl, NULL, NULL,
			    xen_disable_user_iopl, NULL, ctx);
			xen_enable_user_iopl();
			kpreempt_enable();
#else
			rp->r_ps ^= oldpl ^ newpl;
#endif
		} else
			error = EINVAL;
		break;

	/*
	 * Set a segment descriptor
	 */
	case SI86DSCR:
		/*
		 * There are considerable problems here manipulating
		 * resources shared by many running lwps.  Get everyone
		 * into a safe state before changing the LDT.
		 */
		if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) {
			error = EINTR;
			break;
		}

		/*
		 * Don't break out early here: the lwps we just held must
		 * be continued below even on the error paths.
		 */
		if (get_udatamodel() == DATAMODEL_LP64)
			error = EINVAL;
		else if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0)
			error = EFAULT;
		else
			error = setdscr(&ssd);

		mutex_enter(&pp->p_lock);
		if (curthread != pp->p_agenttp)
			continuelwps(pp);
		mutex_exit(&pp->p_lock);
		break;

	case SI86FPHW:
		c = fp_kind & 0xff;
		if (suword32((void *)arg1, c) == -1)
			error = EFAULT;
		break;

	case SI86FPSTART:
		/*
		 * arg1 is the address of _fp_hw
		 * arg2 is the desired x87 FCW value
		 * arg3 is the desired SSE MXCSR value
		 * a return value of one means SSE hardware, else none.
		 */
		c = fp_kind & 0xff;
		if (suword32((void *)arg1, c) == -1) {
			error = EFAULT;
			break;
		}
		fpsetcw((uint16_t)arg2, (uint32_t)arg3);
		return ((fp_kind & __FP_SSE) ? 1 : 0);

	/* real time clock management commands */

	case WTODC:
		if ((error = secpolicy_settime(CRED())) == 0) {
			timestruc_t ts;
			mutex_enter(&tod_lock);
			gethrestime(&ts);
			tod_set(ts);
			mutex_exit(&tod_lock);
		}
		break;

/* Give some timezone playing room */
#define	ONEWEEK	(7 * 24 * 60 * 60)

	case SGMTL:
		/*
		 * When called from 32-bit land, negative values are not
		 * sign extended, so we do that here by casting to an int
		 * and back.  We also clamp the value to within reason and
		 * detect when a 64-bit call overflows an int.
		 */
		if ((error = secpolicy_settime(CRED())) == 0) {
			int newlag = (int)arg1;

#ifdef _SYSCALL32_IMPL
			if (get_udatamodel() == DATAMODEL_NATIVE &&
			    (long)newlag != (long)arg1) {
				error = EOVERFLOW;
			} else
#endif
			if (newlag >= -ONEWEEK && newlag <= ONEWEEK)
				sgmtl(newlag);
			else
				error = EOVERFLOW;
		}
		break;

	case GGMTL:
		if (get_udatamodel() == DATAMODEL_NATIVE) {
			if (sulword((void *)arg1, ggmtl()) == -1)
				error = EFAULT;
#ifdef _SYSCALL32_IMPL
		} else {
			time_t gmtl;

			if ((gmtl = ggmtl()) > INT32_MAX) {
				/*
				 * Since gmt_lag can at most be
				 * +/- 12 hours, something is
				 * *seriously* messed up here.
				 */
				error = EOVERFLOW;
			} else if (suword32((void *)arg1, (int32_t)gmtl) == -1)
				error = EFAULT;
#endif
		}
		break;

	case RTCSYNC:
		if ((error = secpolicy_settime(CRED())) == 0)
			rtcsync();
		break;

	/* END OF real time clock management commands */

	default:
		error = EINVAL;
		break;
	}
	return (error == 0 ? 0 : set_errno(error));
}

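/*
 * Convert a hardware descriptor (user_desc_t) into the struct ssd form
 * used by the sysi86(SI86DSCR) interface, recording the selector it
 * corresponds to.
 */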
void
usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel)
{
	ssd->bo = USEGD_GETBASE(usd);
	ssd->ls = USEGD_GETLIMIT(usd);
	ssd->sel = sel;

	/*
	 * set type, dpl and present bits.
	 */
	ssd->acc1 = usd->usd_type;
	ssd->acc1 |= usd->usd_dpl << 5;
	ssd->acc1 |= usd->usd_p << (5 + 2);

	/*
	 * set avl, DB and granularity bits.
	 */
	ssd->acc2 = usd->usd_avl;

	ssd->acc2 |= usd->usd_long << 1;

	ssd->acc2 |= usd->usd_def32 << (1 + 1);
	ssd->acc2 |= usd->usd_gran << (1 + 1 + 1);
}

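/*
 * Convert a userland struct ssd back into a hardware user_desc_t, forcing
 * the fields userland may not control: the "accessed" bit is always set
 * and the long-mode bit is always clear.
 */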
static void
ssd_to_usd(struct ssd *ssd, user_desc_t *usd)
{

	ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0);

	USEGD_SETBASE(usd, ssd->bo);
	USEGD_SETLIMIT(usd, ssd->ls);

	/*
	 * Set type, dpl and present bits.
	 *
	 * Force the "accessed" bit to on so that we don't run afoul of
	 * KPTI.
	 */
	usd->usd_type = ssd->acc1 | SDT_A;
	usd->usd_dpl = ssd->acc1 >> 5;
	usd->usd_p = ssd->acc1 >> (5 + 2);

	ASSERT(usd->usd_type >= SDT_MEMRO);
	ASSERT(usd->usd_dpl == SEL_UPL);

	/*
	 * 64-bit code selectors are never allowed in the LDT.
	 * Reserved bit is always 0 on 32-bit systems.
	 */
	usd->usd_long = 0;

	/*
	 * set avl, DB and granularity bits.
	 */
	usd->usd_avl = ssd->acc2;
	usd->usd_def32 = ssd->acc2 >> (1 + 1);
	usd->usd_gran = ssd->acc2 >> (1 + 1 + 1);
}


/*
 * Load LDT register with the current process's LDT.
 */
static void
ldt_load(void)
{
#if defined(__xpv)
	xen_set_ldt(curproc->p_ldt, curproc->p_ldtlimit + 1);
#else
	size_t len;
	system_desc_t desc;

	/*
	 * Before we can use the LDT on this CPU, we must install the LDT in
	 * the user mapping table.
	 */
	len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t);
	bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len);
	CPU->cpu_m.mcpu_ldt_len = len;
	set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL);
	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc;

	wr_ldtr(ULDT_SEL);
#endif
}

/*
 * Store a NULL selector in the LDTR. All subsequent illegal references to
 * the LDT will result in a #gp.
 */
void
ldt_unload(void)
{
#if defined(__xpv)
	xen_set_ldt(NULL, 0);
#else
	*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc;
	wr_ldtr(0);

	bzero(CPU->cpu_m.mcpu_ldt, CPU->cpu_m.mcpu_ldt_len);
	CPU->cpu_m.mcpu_ldt_len = 0;
#endif
}

/*ARGSUSED*/
static void
ldt_savectx(proc_t *p)
{
	ASSERT(p->p_ldt != NULL);
	ASSERT(p == curproc);

	/*
	 * The 64-bit kernel must be sure to clear any stale ldt
	 * selectors when context switching away from a process that
	 * has a private ldt. Consider the following example:
	 *
	 *	Wine creates an ldt descriptor and points a segment
	 *	register to it.
	 *
	 *	We then context switch away from the wine lwp to a kernel
	 *	thread and hit a breakpoint in the kernel with kmdb.
	 *
	 *	When we continue and resume from kmdb we will #gp
	 *	fault, since kmdb will have saved the stale ldt selector
	 *	from wine and will try to restore it, but we are no longer
	 *	in the context of the wine process and do not have our
	 *	ldtr register pointing to the private ldt.
	 */
	reset_sregs();

	ldt_unload();
	cpu_fast_syscall_enable();
}

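/*
 * Restore the private LDT when this process comes back on CPU, and keep
 * the fast syscall instructions disabled while the private LDT is in use.
 */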
static void
ldt_restorectx(proc_t *p)
{
	ASSERT(p->p_ldt != NULL);
	ASSERT(p == curproc);

	ldt_load();
	cpu_fast_syscall_disable();
}

/*
 * At exec time, we need to clear up our LDT context and re-enable fast
 * syscalls for the new process image.
 *
 * The same is true for the other case, where we have:
 *
 * proc_exit()
 *  ->exitpctx()->ldt_savectx()
 *  ->freepctx()->ldt_freectx()
 *
 * Because pre-emption is not prevented between the two callbacks, we could
 * have come off CPU, and brought back LDT context when coming back on CPU
 * via ldt_restorectx().
 */
/* ARGSUSED */
static void
ldt_freectx(proc_t *p, int isexec)
{
	ASSERT(p->p_ldt != NULL);
	ASSERT(p == curproc);

	kpreempt_disable();
	ldt_free(p);
	cpu_fast_syscall_enable();
	kpreempt_enable();
}

/*
 * Install ctx op that ensures syscall/sysenter are disabled.
 * See comments below.
 *
 * When a thread with a private LDT forks, the new process
 * must have the LDT context ops installed.
 */
/* ARGSUSED */
static void
ldt_installctx(proc_t *p, proc_t *cp)
{
	proc_t		*targ = p;
	kthread_t	*t;

	/*
	 * If this is a fork, operate on the child process.
	 */
	if (cp != NULL) {
		targ = cp;
		ldt_dup(p, cp);
	}

	/*
	 * The process context ops expect the target process as their argument.
	 */
	ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx,
	    ldt_installctx, ldt_savectx, ldt_freectx) == 0);

	installpctx(targ, targ, ldt_savectx, ldt_restorectx,
	    ldt_installctx, ldt_savectx, ldt_freectx);

	/*
	 * We've just disabled fast system call and return instructions; take
	 * the slow path out to make sure we don't try to use one to return
	 * back to user. We must set t_post_sys for every thread in the
	 * process to make sure none of them escape out via fast return.
	 */

	mutex_enter(&targ->p_lock);
	t = targ->p_tlist;
	do {
		t->t_post_sys = 1;
	} while ((t = t->t_forw) != targ->p_tlist);
	mutex_exit(&targ->p_lock);
}

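/*
 * Install or update one descriptor in the calling process's private LDT,
 * allocating or growing the LDT as needed.  Invoked from the SI86DSCR
 * handler above with all other lwps in the process held.  Returns 0 or an
 * errno value.
 */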
int
setdscr(struct ssd *ssd)
{
	ushort_t seli;		/* selector index */
	user_desc_t *ldp;	/* descriptor pointer */
	user_desc_t ndesc;	/* new descriptor */
	proc_t	*pp = curproc;
	int	rc = 0;

	/*
	 * LDT segments: executable and data at DPL 3 only.
	 */
	if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel))
		return (EINVAL);

	/*
	 * check the selector index.
	 */
	seli = SELTOIDX(ssd->sel);
	if (seli >= MAXNLDT || seli < LDT_UDBASE)
		return (EINVAL);

	ndesc = null_udesc;
	mutex_enter(&pp->p_ldtlock);

	/*
	 * If this is the first time for this process then setup a
	 * private LDT for it.
	 */
	if (pp->p_ldt == NULL) {
		ldt_alloc(pp, seli);

		/*
		 * Now that this process has a private LDT, the use of
		 * the syscall/sysret and sysenter/sysexit instructions
		 * is forbidden for this process because they destroy
		 * the contents of the %cs and %ss segment registers.
		 *
		 * Explicitly disable them here and add a context handler
		 * to the process. Note that disabling
		 * them here means we can't use sysret or sysexit on
		 * the way out of this system call - so we force this
		 * thread to take the slow path (which doesn't make use
		 * of sysenter or sysexit) back out.
		 */
		kpreempt_disable();
		ldt_installctx(pp, NULL);
		cpu_fast_syscall_disable();
		ASSERT(curthread->t_post_sys != 0);
		kpreempt_enable();

	} else if (seli > pp->p_ldtlimit) {
		ASSERT(pp->p_pctx != NULL);

		/*
		 * Increase size of ldt to include seli.
		 */
		ldt_grow(pp, seli);
	}

	ASSERT(seli <= pp->p_ldtlimit);
	ldp = &pp->p_ldt[seli];

	/*
	 * On the 64-bit kernel, this is where things get more subtle.
	 * Recall that in the 64-bit kernel, when we enter the kernel we
	 * deliberately -don't- reload the segment selectors we came in on
	 * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
	 * and the underlying descriptors are essentially ignored by the
	 * hardware in long mode - except for the base that we override with
	 * the gsbase MSRs.
	 *
	 * However, there's one unfortunate issue with this rosy picture --
	 * a descriptor that's not marked as 'present' will still generate
	 * an #np when loading a segment register.
	 *
	 * Consider this case.  An lwp creates a harmless LDT entry, points
	 * one of its segment registers at it, then tells the kernel (here)
	 * to delete it.  In the 32-bit kernel, the #np will happen on the
	 * way back to userland where we reload the segment registers, and be
	 * handled in kern_gpfault().  In the 64-bit kernel, the same thing
	 * will happen in the normal case too.  However, if we're trying to
	 * use a debugger that wants to save and restore the segment registers,
	 * and the debugger thinks that we have valid segment registers, we
	 * have the problem that the debugger will try to restore the
	 * segment register that points at the now 'not present' descriptor
	 * and will take a #np right there.
	 *
	 * We should obviously fix the debugger to be paranoid about
	 * -not- restoring segment registers that point to bad descriptors;
	 * however we can prevent the problem here if we check to see if any
	 * of the segment registers are still pointing at the thing we're
	 * destroying; if they are, return an error instead. (That also seems
	 * a much better failure mode than SIGKILL and a core file from
	 * kern_gpfault().)
	 */
	if (SI86SSD_PRES(ssd) == 0) {
		kthread_t *t;
		int bad = 0;

		/*
		 * Look carefully at the segment registers of every lwp
		 * in the process (they're all stopped by our caller).
		 * If we're about to invalidate a descriptor that's still
		 * being referenced by *any* of them, return an error,
		 * rather than having them #gp on their way out of the kernel.
		 */
		ASSERT(pp->p_lwprcnt == 1);

		mutex_enter(&pp->p_lock);
		t = pp->p_tlist;
		do {
			klwp_t *lwp = ttolwp(t);
			struct regs *rp = lwp->lwp_regs;
			pcb_t *pcb = &lwp->lwp_pcb;

			if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) {
				bad = 1;
				break;
			}

			if (PCB_NEED_UPDATE_SEGS(pcb)) {
				if (ssd->sel == pcb->pcb_ds ||
				    ssd->sel == pcb->pcb_es ||
				    ssd->sel == pcb->pcb_fs ||
				    ssd->sel == pcb->pcb_gs) {
					bad = 1;
					break;
				}
			} else {
				if (ssd->sel == rp->r_ds ||
				    ssd->sel == rp->r_es ||
				    ssd->sel == rp->r_fs ||
				    ssd->sel == rp->r_gs) {
					bad = 1;
					break;
				}
			}

		} while ((t = t->t_forw) != pp->p_tlist);
		mutex_exit(&pp->p_lock);

		if (bad) {
			mutex_exit(&pp->p_ldtlock);
			return (EBUSY);
		}
	}

	/*
	 * If acc1 is zero, clear the descriptor (including the 'present' bit).
	 * Make sure we update the CPU-private copy of the LDT.
	 */
	if (ssd->acc1 == 0) {
		rc = ldt_update_segd(ldp, &null_udesc);
		kpreempt_disable();
		ldt_load();
		kpreempt_enable();
		mutex_exit(&pp->p_ldtlock);
		return (rc);
	}

	/*
	 * Check segment type, allow segment not present and
	 * only user DPL (3).
	 */
	if (SI86SSD_DPL(ssd) != SEL_UPL) {
		mutex_exit(&pp->p_ldtlock);
		return (EINVAL);
	}

	/*
	 * Do not allow 32-bit applications to create 64-bit mode code
	 * segments.
	 */
	if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 &&
	    SI86SSD_ISLONG(ssd)) {
		mutex_exit(&pp->p_ldtlock);
		return (EINVAL);
	}

	/*
	 * Set up a code or data user segment descriptor, making sure to update
	 * the CPU-private copy of the LDT.
	 */
	if (SI86SSD_ISUSEG(ssd)) {
		ssd_to_usd(ssd, &ndesc);
		rc = ldt_update_segd(ldp, &ndesc);
		kpreempt_disable();
		ldt_load();
		kpreempt_enable();
		mutex_exit(&pp->p_ldtlock);
		return (rc);
	}

	mutex_exit(&pp->p_ldtlock);
	return (EINVAL);
}

/*
 * Allocate new LDT for process just large enough to contain seli.  Note we
 * allocate and grow LDT in PAGESIZE chunks. We do this to simplify the
 * implementation and because on the hypervisor it's required, since the LDT
 * must live on pages that have PROT_WRITE removed and which are given to the
 * hypervisor.
 *
 * Note that we don't actually load the LDT into the current CPU here: it's
 * done later by our caller.
 */
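/*
 * For example (assuming the usual 8-byte user_desc_t and 4K pages), a
 * request for selector index 600 needs (600 + 1) * 8 = 4808 bytes of
 * descriptors; P2ROUNDUP() turns that into 8192 bytes, i.e. two pages,
 * 1024 selectors, and a p_ldtlimit of 1023.
 */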
static void
ldt_alloc(proc_t *pp, uint_t seli)
{
	user_desc_t	*ldt;
	size_t		ldtsz;
	uint_t		nsels;

	ASSERT(MUTEX_HELD(&pp->p_ldtlock));
	ASSERT(pp->p_ldt == NULL);
	ASSERT(pp->p_ldtlimit == 0);

	/*
	 * Allocate new LDT just large enough to contain seli. The LDT must
	 * always be allocated in units of pages for KPTI.
	 */
	ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
	nsels = ldtsz / sizeof (user_desc_t);
	ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);

	ldt = kmem_zalloc(ldtsz, KM_SLEEP);
	ASSERT(IS_P2ALIGNED(ldt, PAGESIZE));

#if defined(__xpv)
	if (xen_ldt_setprot(ldt, ldtsz, PROT_READ))
		panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed");
#endif

	pp->p_ldt = ldt;
	pp->p_ldtlimit = nsels - 1;
}

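/*
 * Tear down the process's private LDT: unload it from the current CPU if
 * this is the running process, then free the backing memory.
 */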
static void
ldt_free(proc_t *pp)
{
	user_desc_t	*ldt;
	size_t		ldtsz;

	ASSERT(pp->p_ldt != NULL);

	mutex_enter(&pp->p_ldtlock);
	ldt = pp->p_ldt;
	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE));

	pp->p_ldt = NULL;
	pp->p_ldtlimit = 0;
	mutex_exit(&pp->p_ldtlock);

	if (pp == curproc) {
		kpreempt_disable();
		ldt_unload();
		kpreempt_enable();
	}

#if defined(__xpv)
	/*
	 * We are not allowed to make the ldt writable until after
	 * we tell the hypervisor to unload it.
	 */
	if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
#endif

	kmem_free(ldt, ldtsz);
}

/*
 * On fork, copy the new ldt for the child.
 */
static void
ldt_dup(proc_t *pp, proc_t *cp)
{
	size_t	ldtsz;

	ASSERT(pp->p_ldt != NULL);
	ASSERT(cp != curproc);

	/*
	 * I assume the parent's ldt can't increase since we're in a fork.
	 */
	mutex_enter(&pp->p_ldtlock);
	mutex_enter(&cp->p_ldtlock);

	ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	ldt_alloc(cp, pp->p_ldtlimit);

#if defined(__xpv)
	/*
	 * Make the child's ldt writable so that it can be copied from the
	 * parent's ldt. This works because ldt_alloc above did not load
	 * the ldt, since it's for the child process. If we tried to make
	 * an LDT writable that is loaded in hardware, the setprot
	 * operation would fail.
	 */
	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
#endif

	bcopy(pp->p_ldt, cp->p_ldt, ldtsz);

#if defined(__xpv)
	if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ))
		panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed");
#endif
	mutex_exit(&cp->p_ldtlock);
	mutex_exit(&pp->p_ldtlock);
}


/*
 * Note that we don't actually load the LDT into the current CPU here: it's
 * done later by our caller - unless we take an error.  This works out
 * because ldt_load() does a copy of ->p_ldt instead of directly loading it
 * into the GDT (and therefore can't be using the freed old LDT), and by
 * definition if the new entry didn't pass validation, then the proc
 * shouldn't be referencing an entry in the extended region.
 */
static void
ldt_grow(proc_t *pp, uint_t seli)
{
	user_desc_t	*oldt, *nldt;
	uint_t		nsels;
	size_t		oldtsz, nldtsz;

	ASSERT(MUTEX_HELD(&pp->p_ldtlock));
	ASSERT(pp->p_ldt != NULL);
	ASSERT(pp->p_ldtlimit != 0);

	/*
	 * Allocate larger LDT just large enough to contain seli. The LDT must
	 * always be allocated in units of pages for KPTI.
	 */
	nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
	nsels = nldtsz / sizeof (user_desc_t);
	ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
	ASSERT(nsels > pp->p_ldtlimit);

	oldt = pp->p_ldt;
	oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);

	nldt = kmem_zalloc(nldtsz, KM_SLEEP);
	ASSERT(IS_P2ALIGNED(nldt, PAGESIZE));

	bcopy(oldt, nldt, oldtsz);

	/*
	 * unload old ldt.
	 */
	kpreempt_disable();
	ldt_unload();
	kpreempt_enable();

#if defined(__xpv)

	/*
	 * Make old ldt writable and new ldt read only.
	 */
	if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
		panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");

	if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
		panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
#endif

	pp->p_ldt = nldt;
	pp->p_ldtlimit = nsels - 1;

	kmem_free(oldt, oldtsz);
}
851