xref: /illumos-gate/usr/src/uts/common/os/schedctl.c (revision 581cede61ac9c14d8d4ea452562a567189eead78)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/systm.h>
29 #include <sys/schedctl.h>
30 #include <sys/proc.h>
31 #include <sys/thread.h>
32 #include <sys/class.h>
33 #include <sys/cred.h>
34 #include <sys/kmem.h>
35 #include <sys/cmn_err.h>
36 #include <sys/stack.h>
37 #include <sys/debug.h>
38 #include <sys/cpuvar.h>
39 #include <sys/sobject.h>
40 #include <sys/door.h>
41 #include <sys/modctl.h>
42 #include <sys/syscall.h>
43 #include <sys/sysmacros.h>
44 #include <sys/vmsystm.h>
45 #include <sys/mman.h>
46 #include <sys/vnode.h>
47 #include <sys/swap.h>
48 #include <sys/lwp.h>
49 #include <sys/bitmap.h>
50 #include <sys/atomic.h>
51 #include <sys/fcntl.h>
52 #include <vm/seg_kp.h>
53 #include <vm/seg_vn.h>
54 #include <vm/as.h>
55 #include <fs/fs_subr.h>
56 
57 /*
58  * Page handling structures.  This is set up as a list of per-page
59  * control structures (sc_page_ctl), with p->p_pagep pointing to
60  * the first.  The per-page structures point to the actual pages
61  * and contain pointers to the user address for each mapped page.
62  *
63  * All data is protected by p->p_sc_lock.  Since this lock is
64  * held while waiting for memory, schedctl_shared_alloc() should
65  * not be called while holding p_lock.
66  */
67 
68 typedef struct sc_page_ctl {
69 	struct sc_page_ctl *spc_next;
70 	sc_shared_t	*spc_base;	/* base of kernel page */
71 	sc_shared_t	*spc_end;	/* end of usable space */
72 	ulong_t		*spc_map;	/* bitmap of allocated space on page */
73 	size_t		spc_space;	/* amount of space on page */
74 	caddr_t		spc_uaddr;	/* user-level address of the page */
75 	struct anon_map	*spc_amp;	/* anonymous memory structure */
76 } sc_page_ctl_t;
77 
78 static size_t	sc_pagesize;		/* size of usable space on page */
79 static size_t	sc_bitmap_len;		/* # of bits in allocation bitmap */
80 static size_t	sc_bitmap_words;	/* # of words in allocation bitmap */
81 
82 /* Context ops */
83 static void	schedctl_save(sc_shared_t *);
84 static void	schedctl_restore(sc_shared_t *);
85 static void	schedctl_fork(kthread_t *, kthread_t *);
86 
87 /* Functions for handling shared pages */
88 static int	schedctl_shared_alloc(sc_shared_t **, uintptr_t *);
89 static sc_page_ctl_t *schedctl_page_lookup(sc_shared_t *);
90 static int	schedctl_map(struct anon_map *, caddr_t *, caddr_t);
91 static int	schedctl_getpage(struct anon_map **, caddr_t *);
92 static void	schedctl_freepage(struct anon_map *, caddr_t);
93 
94 /*
95  * System call interface to scheduler activations.
96  * This always operates on the current lwp.
97  */
98 caddr_t
99 schedctl(void)
100 {
101 	kthread_t	*t = curthread;
102 	sc_shared_t	*ssp;
103 	uintptr_t	uaddr;
104 	int		error;
105 
106 	if (t->t_schedctl == NULL) {
107 		/*
108 		 * Allocate and initialize the shared structure.
109 		 */
110 		if ((error = schedctl_shared_alloc(&ssp, &uaddr)) != 0)
111 			return ((caddr_t)(uintptr_t)set_errno(error));
112 		bzero(ssp, sizeof (*ssp));
113 
114 		installctx(t, ssp, schedctl_save, schedctl_restore,
115 		    schedctl_fork, NULL, NULL, NULL);
116 
117 		thread_lock(t);	/* protect against ts_tick and ts_update */
118 		t->t_schedctl = ssp;
119 		t->t_sc_uaddr = uaddr;
120 		ssp->sc_cid = t->t_cid;
121 		ssp->sc_cpri = t->t_cpri;
122 		ssp->sc_priority = DISP_PRIO(t);
123 		thread_unlock(t);
124 	}
125 
126 	return ((caddr_t)t->t_sc_uaddr);
127 }
128 
129 
130 /*
131  * Clean up scheduler activations state associated with an exiting
132  * (or execing) lwp.  t is always the current thread.
133  */
134 void
135 schedctl_lwp_cleanup(kthread_t *t)
136 {
137 	sc_shared_t	*ssp = t->t_schedctl;
138 	proc_t		*p = ttoproc(t);
139 	sc_page_ctl_t	*pagep;
140 	index_t		index;
141 
142 	ASSERT(MUTEX_NOT_HELD(&p->p_lock));
143 
144 	thread_lock(t);		/* protect against ts_tick and ts_update */
145 	t->t_schedctl = NULL;
146 	t->t_sc_uaddr = 0;
147 	thread_unlock(t);
148 
149 	/*
150 	 * Remove the context op to avoid the final call to
151 	 * schedctl_save when switching away from this lwp.
152 	 */
153 	(void) removectx(t, ssp, schedctl_save, schedctl_restore,
154 	    schedctl_fork, NULL, NULL, NULL);
155 
156 	/*
157 	 * Do not unmap the shared page until the process exits.
158 	 * User-level library code relies on this for adaptive mutex locking.
159 	 */
160 	mutex_enter(&p->p_sc_lock);
161 	ssp->sc_state = SC_FREE;
162 	pagep = schedctl_page_lookup(ssp);
163 	index = (index_t)(ssp - pagep->spc_base);
164 	BT_CLEAR(pagep->spc_map, index);
165 	pagep->spc_space += sizeof (sc_shared_t);
166 	mutex_exit(&p->p_sc_lock);
167 }
168 
169 
170 /*
171  * Cleanup the list of schedctl shared pages for the process.
172  * Called from exec() and exit() system calls.
173  */
174 void
175 schedctl_proc_cleanup(void)
176 {
177 	proc_t *p = curproc;
178 	sc_page_ctl_t *pagep;
179 	sc_page_ctl_t *next;
180 
181 	ASSERT(p->p_lwpcnt == 1);	/* we are single-threaded now */
182 	ASSERT(curthread->t_schedctl == NULL);
183 
184 	/*
185 	 * Since we are single-threaded, we don't have to hold p->p_sc_lock.
186 	 */
187 	pagep = p->p_pagep;
188 	p->p_pagep = NULL;
189 	while (pagep != NULL) {
190 		ASSERT(pagep->spc_space == sc_pagesize);
191 		next = pagep->spc_next;
192 		/*
193 		 * Unmap the user space and free the mapping structure.
194 		 */
195 		(void) as_unmap(p->p_as, pagep->spc_uaddr, PAGESIZE);
196 		schedctl_freepage(pagep->spc_amp, (caddr_t)(pagep->spc_base));
197 		kmem_free(pagep->spc_map, sizeof (ulong_t) * sc_bitmap_words);
198 		kmem_free(pagep, sizeof (sc_page_ctl_t));
199 		pagep = next;
200 	}
201 }
202 
203 
204 /*
205  * Called by resume just before switching away from the current thread.
206  * Save new thread state.
207  */
208 static void
209 schedctl_save(sc_shared_t *ssp)
210 {
211 	ssp->sc_state = curthread->t_state;
212 }
213 
214 
215 /*
216  * Called by resume after switching to the current thread.
217  * Save new thread state and CPU.
218  */
219 static void
220 schedctl_restore(sc_shared_t *ssp)
221 {
222 	ssp->sc_state = SC_ONPROC;
223 	ssp->sc_cpu = CPU->cpu_id;
224 }
225 
226 
227 /*
228  * On fork, remove inherited mappings from the child's address space.
229  * The child's threads must call schedctl() to get new shared mappings.
230  */
231 static void
232 schedctl_fork(kthread_t *pt, kthread_t *ct)
233 {
234 	proc_t *pp = ttoproc(pt);
235 	proc_t *cp = ttoproc(ct);
236 	sc_page_ctl_t *pagep;
237 
238 	ASSERT(ct->t_schedctl == NULL);
239 
240 	/*
241 	 * Do this only once, whether we are doing fork1() or forkall().
242 	 * Don't do it at all if the child process is a child of vfork()
243 	 * because a child of vfork() borrows the parent's address space.
244 	 */
245 	if (pt != curthread || (cp->p_flag & SVFORK))
246 		return;
247 
248 	mutex_enter(&pp->p_sc_lock);
249 	for (pagep = pp->p_pagep; pagep != NULL; pagep = pagep->spc_next)
250 		(void) as_unmap(cp->p_as, pagep->spc_uaddr, PAGESIZE);
251 	mutex_exit(&pp->p_sc_lock);
252 }
253 
254 
255 /*
256  * Returns non-zero if the specified thread shouldn't be preempted at this time.
257  * Called by ts_preempt(), ts_tick(), and ts_update().
258  */
259 int
260 schedctl_get_nopreempt(kthread_t *t)
261 {
262 	ASSERT(THREAD_LOCK_HELD(t));
263 	return (t->t_schedctl->sc_preemptctl.sc_nopreempt);
264 }
265 
266 
267 /*
268  * Sets the value of the nopreempt field for the specified thread.
269  * Called by ts_preempt() to clear the field on preemption.
270  */
271 void
272 schedctl_set_nopreempt(kthread_t *t, short val)
273 {
274 	ASSERT(THREAD_LOCK_HELD(t));
275 	t->t_schedctl->sc_preemptctl.sc_nopreempt = val;
276 }
277 
278 
279 /*
280  * Sets the value of the yield field for the specified thread.
281  * Called by ts_preempt() and ts_tick() to set the field, and
282  * ts_yield() to clear it.
283  * The kernel never looks at this field so we don't need a
284  * schedctl_get_yield() function.
285  */
286 void
287 schedctl_set_yield(kthread_t *t, short val)
288 {
289 	ASSERT(THREAD_LOCK_HELD(t));
290 	t->t_schedctl->sc_preemptctl.sc_yield = val;
291 }
292 
293 
294 /*
295  * Sets the values of the cid and priority fields for the specified thread.
296  * Called from thread_change_pri(), thread_change_epri(), THREAD_CHANGE_PRI().
297  * Called following calls to CL_FORKRET() and CL_ENTERCLASS().
298  */
299 void
300 schedctl_set_cidpri(kthread_t *t)
301 {
302 	sc_shared_t *tdp = t->t_schedctl;
303 
304 	if (tdp != NULL) {
305 		tdp->sc_cid = t->t_cid;
306 		tdp->sc_cpri = t->t_cpri;
307 		tdp->sc_priority = DISP_PRIO(t);
308 	}
309 }
310 
311 
312 /*
313  * Returns non-zero if the specified thread has requested that all
314  * signals be blocked.  Called by signal-related code that tests
315  * the signal mask of a thread that may not be the current thread
316  * and where the process's p_lock cannot be acquired.
317  */
318 int
319 schedctl_sigblock(kthread_t *t)
320 {
321 	sc_shared_t *tdp = t->t_schedctl;
322 
323 	if (tdp != NULL)
324 		return (tdp->sc_sigblock);
325 	return (0);
326 }
327 
328 
329 /*
330  * If the sc_sigblock field is set for the specified thread, set
331  * its signal mask to block all maskable signals, then clear the
332  * sc_sigblock field.  This finishes what user-level code requested
333  * to be done when it set tdp->sc_shared->sc_sigblock non-zero.
334  * Called from signal-related code either by the current thread for
335  * itself or by a thread that holds the process's p_lock (/proc code).
336  */
337 void
338 schedctl_finish_sigblock(kthread_t *t)
339 {
340 	sc_shared_t *tdp = t->t_schedctl;
341 
342 	ASSERT(t == curthread || MUTEX_HELD(&ttoproc(t)->p_lock));
343 
344 	if (tdp != NULL && tdp->sc_sigblock) {
345 		t->t_hold.__sigbits[0] = FILLSET0 & ~CANTMASK0;
346 		t->t_hold.__sigbits[1] = FILLSET1 & ~CANTMASK1;
347 		tdp->sc_sigblock = 0;
348 	}
349 }
350 
351 
352 /*
353  * Return non-zero if the current thread has declared that it has
354  * a cancellation pending and that cancellation is not disabled.
355  * If SIGCANCEL is blocked, we must be going over the wire in an
356  * NFS transaction (sigintr() was called); return zero in this case.
357  */
358 int
359 schedctl_cancel_pending(void)
360 {
361 	sc_shared_t *tdp = curthread->t_schedctl;
362 
363 	if (tdp != NULL &&
364 	    (tdp->sc_flgs & SC_CANCEL_FLG) &&
365 	    !tdp->sc_sigblock &&
366 	    !sigismember(&curthread->t_hold, SIGCANCEL))
367 		return (1);
368 	return (0);
369 }
370 
371 
372 /*
373  * Inform libc that the kernel returned EINTR from some system call
374  * due to there being a cancellation pending (SC_CANCEL_FLG set or
375  * we received an SI_LWP SIGCANCEL while in a system call), rather
376  * than because of some other signal.  User-level code can try to
377  * recover from receiving other signals, but it can't recover from
378  * being cancelled.
379  */
380 void
381 schedctl_cancel_eintr(void)
382 {
383 	sc_shared_t *tdp = curthread->t_schedctl;
384 
385 	if (tdp != NULL)
386 		tdp->sc_flgs |= SC_EINTR_FLG;
387 }
388 
389 
390 /*
391  * Return non-zero if the current thread has declared that
392  * it is calling into the kernel to park, else return zero.
393  */
394 int
395 schedctl_is_park(void)
396 {
397 	sc_shared_t *tdp = curthread->t_schedctl;
398 
399 	if (tdp != NULL)
400 		return ((tdp->sc_flgs & SC_PARK_FLG) != 0);
401 	/*
402 	 * If we're here and there is no shared memory (how could
403 	 * that happen?) then just assume we really are here to park.
404 	 */
405 	return (1);
406 }
407 
408 
409 /*
410  * Declare thread is parking.
411  *
412  * libc will set "sc_flgs |= SC_PARK_FLG" before calling lwpsys_park(0, tid)
413  * in order to declare that the thread is calling into the kernel to park.
414  *
415  * This interface exists ONLY to support older versions of libthread which
416  * are not aware of the SC_PARK_FLG flag.
417  *
418  * Older versions of libthread which are not aware of the SC_PARK_FLG flag
419  * need to be modified or emulated to call lwpsys_park(4, ...) instead of
420  * lwpsys_park(0, ...).  This will invoke schedctl_set_park() before
421  * lwp_park() to declare that the thread is parking.
422  */
423 void
424 schedctl_set_park(void)
425 {
426 	sc_shared_t *tdp = curthread->t_schedctl;
427 	if (tdp != NULL)
428 		tdp->sc_flgs |= SC_PARK_FLG;
429 }
430 
431 
432 /*
433  * Clear the parking flag on return from parking in the kernel.
434  */
435 void
436 schedctl_unpark(void)
437 {
438 	sc_shared_t *tdp = curthread->t_schedctl;
439 
440 	if (tdp != NULL)
441 		tdp->sc_flgs &= ~SC_PARK_FLG;
442 }
443 
444 
445 /*
446  * Page handling code.
447  */
448 
449 void
450 schedctl_init(void)
451 {
452 	/*
453 	 * Amount of page that can hold sc_shared_t structures.  If
454 	 * sizeof (sc_shared_t) is a power of 2, this should just be
455 	 * PAGESIZE.
456 	 */
457 	sc_pagesize = PAGESIZE - (PAGESIZE % sizeof (sc_shared_t));
458 
459 	/*
460 	 * Allocation bitmap is one bit per struct on a page.
461 	 */
462 	sc_bitmap_len = sc_pagesize / sizeof (sc_shared_t);
463 	sc_bitmap_words = howmany(sc_bitmap_len, BT_NBIPUL);
464 }
465 
466 
467 static int
468 schedctl_shared_alloc(sc_shared_t **kaddrp, uintptr_t *uaddrp)
469 {
470 	proc_t		*p = curproc;
471 	sc_page_ctl_t	*pagep;
472 	sc_shared_t	*ssp;
473 	caddr_t		base;
474 	index_t		index;
475 	int		error;
476 
477 	ASSERT(MUTEX_NOT_HELD(&p->p_lock));
478 	mutex_enter(&p->p_sc_lock);
479 
480 	/*
481 	 * Try to find space for the new data in existing pages
482 	 * within the process's list of shared pages.
483 	 */
484 	for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next)
485 		if (pagep->spc_space != 0)
486 			break;
487 
488 	if (pagep != NULL)
489 		base = pagep->spc_uaddr;
490 	else {
491 		struct anon_map *amp;
492 		caddr_t kaddr;
493 
494 		/*
495 		 * No room, need to allocate a new page.  Also set up
496 		 * a mapping to the kernel address space for the new
497 		 * page and lock it in memory.
498 		 */
499 		if ((error = schedctl_getpage(&amp, &kaddr)) != 0) {
500 			mutex_exit(&p->p_sc_lock);
501 			return (error);
502 		}
503 		if ((error = schedctl_map(amp, &base, kaddr)) != 0) {
504 			schedctl_freepage(amp, kaddr);
505 			mutex_exit(&p->p_sc_lock);
506 			return (error);
507 		}
508 
509 		/*
510 		 * Allocate and initialize the page control structure.
511 		 */
512 		pagep = kmem_alloc(sizeof (sc_page_ctl_t), KM_SLEEP);
513 		pagep->spc_amp = amp;
514 		pagep->spc_base = (sc_shared_t *)kaddr;
515 		pagep->spc_end = (sc_shared_t *)(kaddr + sc_pagesize);
516 		pagep->spc_uaddr = base;
517 
518 		pagep->spc_map = kmem_zalloc(sizeof (ulong_t) * sc_bitmap_words,
519 		    KM_SLEEP);
520 		pagep->spc_space = sc_pagesize;
521 
522 		pagep->spc_next = p->p_pagep;
523 		p->p_pagep = pagep;
524 	}
525 
526 	/*
527 	 * Got a page, now allocate space for the data.  There should
528 	 * be space unless something's wrong.
529 	 */
530 	ASSERT(pagep != NULL && pagep->spc_space >= sizeof (sc_shared_t));
531 	index = bt_availbit(pagep->spc_map, sc_bitmap_len);
532 	ASSERT(index != -1);
533 
534 	/*
535 	 * Get location with pointer arithmetic.  spc_base is of type
536 	 * sc_shared_t *.  Mark as allocated.
537 	 */
538 	ssp = pagep->spc_base + index;
539 	BT_SET(pagep->spc_map, index);
540 	pagep->spc_space -= sizeof (sc_shared_t);
541 
542 	mutex_exit(&p->p_sc_lock);
543 
544 	/*
545 	 * Return kernel and user addresses.
546 	 */
547 	*kaddrp = ssp;
548 	*uaddrp = (uintptr_t)base + ((uintptr_t)ssp & PAGEOFFSET);
549 	return (0);
550 }
551 
552 
553 /*
554  * Find the page control structure corresponding to a kernel address.
555  */
556 static sc_page_ctl_t *
557 schedctl_page_lookup(sc_shared_t *ssp)
558 {
559 	proc_t *p = curproc;
560 	sc_page_ctl_t *pagep;
561 
562 	ASSERT(MUTEX_HELD(&p->p_sc_lock));
563 	for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next) {
564 		if (ssp >= pagep->spc_base && ssp < pagep->spc_end)
565 			return (pagep);
566 	}
567 	return (NULL);		/* This "can't happen".  Should we panic? */
568 }
569 
570 
571 /*
572  * This function is called when a page needs to be mapped into a
573  * process's address space.  Allocate the user address space and
574  * set up the mapping to the page.  Assumes the page has already
575  * been allocated and locked in memory via schedctl_getpage.
576  */
577 static int
578 schedctl_map(struct anon_map *amp, caddr_t *uaddrp, caddr_t kaddr)
579 {
580 	caddr_t addr = NULL;
581 	struct as *as = curproc->p_as;
582 	struct segvn_crargs vn_a;
583 	int error;
584 
585 	as_rangelock(as);
586 	/* pass address of kernel mapping as offset to avoid VAC conflicts */
587 	map_addr(&addr, PAGESIZE, (offset_t)(uintptr_t)kaddr, 1, 0);
588 	if (addr == NULL) {
589 		as_rangeunlock(as);
590 		return (ENOMEM);
591 	}
592 
593 	/*
594 	 * Use segvn to set up the mapping to the page.
595 	 */
596 	vn_a.vp = NULL;
597 	vn_a.offset = 0;
598 	vn_a.cred = NULL;
599 	vn_a.type = MAP_SHARED;
600 	vn_a.prot = vn_a.maxprot = PROT_ALL;
601 	vn_a.flags = 0;
602 	vn_a.amp = amp;
603 	vn_a.szc = 0;
604 	vn_a.lgrp_mem_policy_flags = 0;
605 	error = as_map(as, addr, PAGESIZE, segvn_create, &vn_a);
606 	as_rangeunlock(as);
607 
608 	if (error)
609 		return (error);
610 
611 	*uaddrp = addr;
612 	return (0);
613 }
614 
615 
616 /*
617  * Allocate a new page from anonymous memory.  Also, create a kernel
618  * mapping to the page and lock the page in memory.
619  */
620 static int
621 schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr)
622 {
623 	struct anon_map *amp;
624 	caddr_t kaddr;
625 
626 	/*
627 	 * Set up anonymous memory struct.  No swap reservation is
628 	 * needed since the page will be locked into memory.
629 	 */
630 	amp = anonmap_alloc(PAGESIZE, 0, ANON_SLEEP);
631 
632 	/*
633 	 * Allocate the page.
634 	 */
635 	kaddr = segkp_get_withanonmap(segkp, PAGESIZE,
636 	    KPD_NO_ANON | KPD_LOCKED | KPD_ZERO, amp);
637 	if (kaddr == NULL) {
638 		amp->refcnt--;
639 		anonmap_free(amp);
640 		return (ENOMEM);
641 	}
642 
643 	/*
644 	 * The page is left SE_SHARED locked so that it won't be
645 	 * paged out or relocated (KPD_LOCKED above).
646 	 */
647 
648 	*newamp = amp;
649 	*newaddr = kaddr;
650 	return (0);
651 }
652 
653 
654 /*
655  * Take the necessary steps to allow a page to be released.
656  * This is called when the process is doing exit() or exec().
657  * There should be no accesses to the page after this.
658  * The kernel mapping of the page is released and the page is unlocked.
659  */
660 static void
661 schedctl_freepage(struct anon_map *amp, caddr_t kaddr)
662 {
663 	/*
664 	 * Release the lock on the page and remove the kernel mapping.
665 	 */
666 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
667 	segkp_release(segkp, kaddr);
668 
669 	/*
670 	 * Decrement the refcnt so the anon_map structure will be freed.
671 	 */
672 	if (--amp->refcnt == 0) {
673 		/*
674 		 * The current process no longer has the page mapped, so
675 		 * we have to free everything rather than letting as_free
676 		 * do the work.
677 		 */
678 		anonmap_purge(amp);
679 		anon_free(amp->ahp, 0, PAGESIZE);
680 		ANON_LOCK_EXIT(&amp->a_rwlock);
681 		anonmap_free(amp);
682 	} else {
683 		ANON_LOCK_EXIT(&amp->a_rwlock);
684 	}
685 }
686