xref: /illumos-gate/usr/src/uts/common/os/task.c (revision 581cede61ac9c14d8d4ea452562a567189eead78)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/atomic.h>
27 #include <sys/cmn_err.h>
28 #include <sys/exacct.h>
29 #include <sys/id_space.h>
30 #include <sys/kmem.h>
31 #include <sys/modhash.h>
32 #include <sys/mutex.h>
33 #include <sys/proc.h>
34 #include <sys/project.h>
35 #include <sys/rctl.h>
36 #include <sys/systm.h>
37 #include <sys/task.h>
38 #include <sys/time.h>
39 #include <sys/types.h>
40 #include <sys/zone.h>
41 #include <sys/cpuvar.h>
42 #include <sys/fss.h>
43 #include <sys/class.h>
44 #include <sys/project.h>
45 
46 /*
47  * Tasks
48  *
49  *   A task is a collection of processes, associated with a common project ID
50  *   and related by a common initial parent.  The task primarily represents a
51  *   natural process sequence with known resource usage, although it can also be
52  *   viewed as a convenient grouping of processes for signal delivery, processor
53  *   binding, and administrative operations.
54  *
55  * Membership and observership
56  *   We can conceive of situations where processes outside of the task may wish
57  *   to examine the resource usage of the task.  Similarly, a number of the
58  *   administrative operations on a task can be performed by processes who are
59  *   not members of the task.  Accordingly, we must design a locking strategy
60  *   where observers of the task, who wish to examine or operate on the task,
61  *   and members of task, who can perform the mentioned operations, as well as
62  *   leave the task, see a consistent and correct representation of the task at
63  *   all times.
64  *
65  * Locking
66  *   Because the task membership is a new relation between processes, its
67  *   locking becomes an additional responsibility of the pidlock/p_lock locking
68  *   sequence; however, tasks closely resemble sessions and the session locking
69  *   model is mostly appropriate for the interaction of tasks, processes, and
70  *   procfs.
71  *
72  *   kmutex_t task_hash_lock
73  *     task_hash_lock is a global lock protecting the contents of the task
74  *     ID-to-task pointer hash.  Holders of task_hash_lock must not attempt to
75  *     acquire pidlock or p_lock.
76  *   uint_t tk_hold_count
77  *     tk_hold_count, the number of members and observers of the current task,
78  *     must be manipulated atomically.
79  *   proc_t *tk_memb_list
80  *   proc_t *p_tasknext
81  *   proc_t *p_taskprev
 *     The task's membership list is protected by pidlock, which is therefore
 *     always acquired before any of the members' p_lock mutexes.  The p_task
 *     member of the proc structure is protected by pidlock or p_lock for
 *     reading, and by both pidlock and p_lock for modification, as is done for
 *     p_sessp.  The key point is that only the process itself can modify its
 *     p_task; no other entity on the system may do so.  (/proc will use
 *     prlock() to prevent the process from leaving, as opposed to pidlock.)
 *   kmutex_t tk_usage_lock
 *     tk_usage_lock is a per-task lock protecting the contents of the task
 *     usage structure.  Note that the tk_nlwps counter for the task.max-lwps
 *     resource control is read and updated under the owning zone's
 *     zone_nlwps_lock in this file.
93  */
94 
95 int task_hash_size = 256;
96 static kmutex_t task_hash_lock;
97 static mod_hash_t *task_hash;
98 
99 static id_space_t *taskid_space;	/* global taskid space */
100 static kmem_cache_t *task_cache;	/* kmem cache for task structures */
101 
102 rctl_hndl_t rc_task_lwps;
103 rctl_hndl_t rc_task_cpu_time;
104 
105 /*
106  * static rctl_qty_t task_usage_lwps(void *taskp)
107  *
108  * Overview
109  *   task_usage_lwps() is the usage operation for the resource control
110  *   associated with the number of LWPs in a task.
111  *
112  * Return values
113  *   The number of LWPs in the given task is returned.
114  *
115  * Caller's context
116  *   The p->p_lock must be held across the call.
117  */
118 /*ARGSUSED*/
119 static rctl_qty_t
120 task_lwps_usage(rctl_t *r, proc_t *p)
121 {
122 	task_t *t;
123 	rctl_qty_t nlwps;
124 
125 	ASSERT(MUTEX_HELD(&p->p_lock));
126 
127 	t = p->p_task;
128 	mutex_enter(&p->p_zone->zone_nlwps_lock);
129 	nlwps = t->tk_nlwps;
130 	mutex_exit(&p->p_zone->zone_nlwps_lock);
131 
132 	return (nlwps);
133 }
134 
135 /*
136  * static int task_test_lwps(void *taskp, rctl_val_t *, int64_t incr,
137  *   int flags)
138  *
139  * Overview
140  *   task_test_lwps() is the test-if-valid-increment for the resource control
141  *   for the number of processes in a task.
142  *
143  * Return values
144  *   0 if the threshold limit was not passed, 1 if the limit was passed.
145  *
146  * Caller's context
147  *   p->p_lock must be held across the call.
148  */
149 /*ARGSUSED*/
150 static int
151 task_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl,
152     rctl_qty_t incr,
153     uint_t flags)
154 {
155 	rctl_qty_t nlwps;
156 
157 	ASSERT(MUTEX_HELD(&p->p_lock));
158 	ASSERT(e->rcep_t == RCENTITY_TASK);
159 	if (e->rcep_p.task == NULL)
160 		return (0);
161 
162 	ASSERT(MUTEX_HELD(&(e->rcep_p.task->tk_zone->zone_nlwps_lock)));
163 	nlwps = e->rcep_p.task->tk_nlwps;
164 
165 	if (nlwps + incr > rcntl->rcv_value)
166 		return (1);
167 
168 	return (0);
169 }
170 /*ARGSUSED*/
171 static int
172 task_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv) {
173 
174 	ASSERT(MUTEX_HELD(&p->p_lock));
175 	ASSERT(e->rcep_t == RCENTITY_TASK);
176 	if (e->rcep_p.task == NULL)
177 		return (0);
178 
179 	e->rcep_p.task->tk_nlwps_ctl = nv;
180 	return (0);
181 }
182 
183 /*
184  * static rctl_qty_t task_usage_cpu_secs(void *taskp)
185  *
186  * Overview
187  *   task_usage_cpu_secs() is the usage operation for the resource control
188  *   associated with the total accrued CPU seconds for a task.
189  *
190  * Return values
191  *   The number of CPU seconds consumed by the task is returned.
192  *
193  * Caller's context
194  *   The given task must be held across the call.
195  */
196 /*ARGSUSED*/
197 static rctl_qty_t
198 task_cpu_time_usage(rctl_t *r, proc_t *p)
199 {
200 	task_t *t = p->p_task;
201 
202 	ASSERT(MUTEX_HELD(&p->p_lock));
203 	return (t->tk_cpu_time);
204 }
205 
206 /*
207  * int task_cpu_time_incr(task_t *t, rctl_qty_t incr)
208  *
209  * Overview
210  *   task_cpu_time_incr() increments the amount of CPU time used
211  *   by this task.
212  *
213  * Return values
214  *   1   if a second or more time is accumulated
215  *   0   otherwise
216  *
217  * Caller's context
218  *   This is called by the clock tick accounting function to charge
219  *   CPU time to a task.
220  */
221 rctl_qty_t
222 task_cpu_time_incr(task_t *t, rctl_qty_t incr)
223 {
224 	rctl_qty_t ret = 0;
225 
226 	mutex_enter(&t->tk_cpu_time_lock);
227 	t->tk_cpu_ticks += incr;
228 	if (t->tk_cpu_ticks >= hz) {
229 		t->tk_cpu_time += t->tk_cpu_ticks / hz;
230 		t->tk_cpu_ticks = t->tk_cpu_ticks % hz;
231 		ret = t->tk_cpu_time;
232 	}
233 	mutex_exit(&t->tk_cpu_time_lock);
234 
235 	return (ret);
236 }
237 
238 /*
239  * static int task_test_cpu_secs(void *taskp, rctl_val_t *, int64_t incr,
240  *   int flags)
241  *
242  * Overview
243  *   task_test_cpu_secs() is the test-if-valid-increment for the resource
244  *   control for the total accrued CPU seconds for a task.
245  *
246  * Return values
247  *   0 if the threshold limit was not passed, 1 if the limit was passed.
248  *
249  * Caller's context
250  *   The given task must be held across the call.
251  */
252 /*ARGSUSED*/
253 static int
254 task_cpu_time_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e,
255     struct rctl_val *rcntl, rctl_qty_t incr, uint_t flags)
256 {
257 	ASSERT(MUTEX_HELD(&p->p_lock));
258 	ASSERT(e->rcep_t == RCENTITY_TASK);
259 	if (e->rcep_p.task == NULL)
260 		return (0);
261 
262 	if (incr >= rcntl->rcv_value)
263 		return (1);
264 
265 	return (0);
266 }
267 
268 static task_t *
269 task_find(taskid_t id, zoneid_t zoneid)
270 {
271 	task_t *tk;
272 
273 	ASSERT(MUTEX_HELD(&task_hash_lock));
274 
275 	if (mod_hash_find(task_hash, (mod_hash_key_t)(uintptr_t)id,
276 	    (mod_hash_val_t *)&tk) == MH_ERR_NOTFOUND ||
277 	    (zoneid != ALL_ZONES && zoneid != tk->tk_zone->zone_id))
278 		return (NULL);
279 
280 	return (tk);
281 }
282 
283 /*
284  * task_hold_by_id(), task_hold_by_id_zone()
285  *
286  * Overview
287  *   task_hold_by_id() is used to take a reference on a task by its task id,
288  *   supporting the various system call interfaces for obtaining resource data,
289  *   delivering signals, and so forth.
290  *
291  * Return values
292  *   Returns a pointer to the task_t with taskid_t id.  The task is returned
293  *   with its hold count incremented by one.  Returns NULL if there
294  *   is no task with the requested id.
295  *
296  * Caller's context
297  *   Caller must not be holding task_hash_lock.  No restrictions on context.
298  */
299 task_t *
300 task_hold_by_id_zone(taskid_t id, zoneid_t zoneid)
301 {
302 	task_t *tk;
303 
304 	mutex_enter(&task_hash_lock);
305 	if ((tk = task_find(id, zoneid)) != NULL)
306 		atomic_add_32(&tk->tk_hold_count, 1);
307 	mutex_exit(&task_hash_lock);
308 
309 	return (tk);
310 }
311 
312 task_t *
313 task_hold_by_id(taskid_t id)
314 {
315 	zoneid_t zoneid;
316 
317 	if (INGLOBALZONE(curproc))
318 		zoneid = ALL_ZONES;
319 	else
320 		zoneid = getzoneid();
321 	return (task_hold_by_id_zone(id, zoneid));
322 }
323 
324 /*
325  * void task_hold(task_t *)
326  *
327  * Overview
328  *   task_hold() is used to take an additional reference to the given task.
329  *
330  * Return values
331  *   None.
332  *
333  * Caller's context
334  *   No restriction on context.
335  */
336 void
337 task_hold(task_t *tk)
338 {
339 	atomic_add_32(&tk->tk_hold_count, 1);
340 }
341 
342 /*
343  * void task_rele(task_t *)
344  *
345  * Overview
346  *   task_rele() relinquishes a reference on the given task, which was acquired
347  *   via task_hold() or task_hold_by_id().  If this is the last member or
348  *   observer of the task, dispatch it for commitment via the accounting
349  *   subsystem.
350  *
351  * Return values
352  *   None.
353  *
354  * Caller's context
355  *   Caller must not be holding the task_hash_lock.
356  *   Caller's context must be acceptable for KM_SLEEP allocations.
357  */
358 void
359 task_rele(task_t *tk)
360 {
361 	mutex_enter(&task_hash_lock);
362 	if (atomic_add_32_nv(&tk->tk_hold_count, -1) > 0) {
363 		mutex_exit(&task_hash_lock);
364 		return;
365 	}
366 
367 	mutex_enter(&tk->tk_zone->zone_nlwps_lock);
368 	tk->tk_proj->kpj_ntasks--;
369 	mutex_exit(&tk->tk_zone->zone_nlwps_lock);
370 
371 	if (mod_hash_destroy(task_hash,
372 	    (mod_hash_key_t)(uintptr_t)tk->tk_tkid) != 0)
373 		panic("unable to delete task %d", tk->tk_tkid);
374 	mutex_exit(&task_hash_lock);
375 
376 	/*
377 	 * At this point, there are no members or observers of the task, so we
378 	 * can safely send it on for commitment to the accounting subsystem.
379 	 * The task will be destroyed in task_end() subsequent to commitment.
380 	 */
381 	(void) taskq_dispatch(exacct_queue, exacct_commit_task, tk, KM_SLEEP);
382 }
383 
384 /*
385  * task_t *task_create(projid_t, zone *)
386  *
387  * Overview
388  *   A process constructing a new task calls task_create() to construct and
389  *   preinitialize the task for the appropriate destination project.  Only one
390  *   task, the primordial task0, is not created with task_create().
391  *
392  * Return values
393  *   None.
394  *
395  * Caller's context
396  *   Caller's context should be safe for KM_SLEEP allocations.
397  *   The caller should appropriately bump the kpj_ntasks counter on the
398  *   project that contains this task.
399  */
400 task_t *
401 task_create(projid_t projid, zone_t *zone)
402 {
403 	task_t *tk = kmem_cache_alloc(task_cache, KM_SLEEP);
404 	task_t *ancestor_tk;
405 	taskid_t tkid;
406 	task_usage_t *tu = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
407 	mod_hash_hndl_t hndl;
408 	rctl_set_t *set = rctl_set_create();
409 	rctl_alloc_gp_t *gp;
410 	rctl_entity_p_t e;
411 
412 	bzero(tk, sizeof (task_t));
413 
414 	tk->tk_tkid = tkid = id_alloc(taskid_space);
415 	tk->tk_nlwps = 0;
416 	tk->tk_nlwps_ctl = INT_MAX;
417 	tk->tk_usage = tu;
418 	tk->tk_inherited = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
419 	tk->tk_proj = project_hold_by_id(projid, zone, PROJECT_HOLD_INSERT);
420 	tk->tk_flags = TASK_NORMAL;
421 
422 	/*
423 	 * Copy ancestor task's resource controls.
424 	 */
425 	zone_task_hold(zone);
426 	mutex_enter(&curproc->p_lock);
427 	ancestor_tk = curproc->p_task;
428 	task_hold(ancestor_tk);
429 	tk->tk_zone = zone;
430 	mutex_exit(&curproc->p_lock);
431 
432 	for (;;) {
433 		gp = rctl_set_dup_prealloc(ancestor_tk->tk_rctls);
434 
435 		mutex_enter(&ancestor_tk->tk_rctls->rcs_lock);
436 		if (rctl_set_dup_ready(ancestor_tk->tk_rctls, gp))
437 			break;
438 
439 		mutex_exit(&ancestor_tk->tk_rctls->rcs_lock);
440 
441 		rctl_prealloc_destroy(gp);
442 	}
443 
444 	/*
445 	 * At this point, curproc does not have the appropriate linkage
446 	 * through the task to the project. So, rctl_set_dup should only
447 	 * copy the rctls, and leave the callbacks for later.
448 	 */
449 	e.rcep_p.task = tk;
450 	e.rcep_t = RCENTITY_TASK;
451 	tk->tk_rctls = rctl_set_dup(ancestor_tk->tk_rctls, curproc, curproc, &e,
452 	    set, gp, RCD_DUP);
453 	mutex_exit(&ancestor_tk->tk_rctls->rcs_lock);
454 
455 	rctl_prealloc_destroy(gp);
456 
457 	/*
458 	 * Record the ancestor task's ID for use by extended accounting.
459 	 */
460 	tu->tu_anctaskid = ancestor_tk->tk_tkid;
461 	task_rele(ancestor_tk);
462 
463 	/*
464 	 * Put new task structure in the hash table.
465 	 */
466 	(void) mod_hash_reserve(task_hash, &hndl);
467 	mutex_enter(&task_hash_lock);
468 	ASSERT(task_find(tkid, zone->zone_id) == NULL);
469 	if (mod_hash_insert_reserve(task_hash, (mod_hash_key_t)(uintptr_t)tkid,
470 	    (mod_hash_val_t *)tk, hndl) != 0) {
471 		mod_hash_cancel(task_hash, &hndl);
472 		panic("unable to insert task %d(%p)", tkid, (void *)tk);
473 	}
474 	mutex_exit(&task_hash_lock);
475 
476 	return (tk);
477 }
478 
479 /*
480  * void task_attach(task_t *, proc_t *)
481  *
482  * Overview
483  *   task_attach() is used to attach a process to a task; this operation is only
484  *   performed as a result of a fork() or settaskid() system call.  The proc_t's
485  *   p_tasknext and p_taskprev fields will be set such that the proc_t is a
486  *   member of the doubly-linked list of proc_t's that make up the task.
487  *
488  * Return values
489  *   None.
490  *
491  * Caller's context
492  *   pidlock and p->p_lock must be held on entry.
493  */
494 void
495 task_attach(task_t *tk, proc_t *p)
496 {
497 	proc_t *first, *prev;
498 	rctl_entity_p_t e;
499 	ASSERT(tk != NULL);
500 	ASSERT(p != NULL);
501 	ASSERT(MUTEX_HELD(&pidlock));
502 	ASSERT(MUTEX_HELD(&p->p_lock));
503 
504 	if (tk->tk_memb_list == NULL) {
505 		p->p_tasknext = p;
506 		p->p_taskprev = p;
507 	} else {
508 		first = tk->tk_memb_list;
509 		prev = first->p_taskprev;
510 		first->p_taskprev = p;
511 		p->p_tasknext = first;
512 		p->p_taskprev = prev;
513 		prev->p_tasknext = p;
514 	}
515 	tk->tk_memb_list = p;
516 	task_hold(tk);
517 	p->p_task = tk;
518 
519 	/*
520 	 * Now that the linkage from process to task and project is
521 	 * complete, do the required callbacks for the task and project
522 	 * rctl sets.
523 	 */
524 	e.rcep_p.proj = tk->tk_proj;
525 	e.rcep_t = RCENTITY_PROJECT;
526 	(void) rctl_set_dup(NULL, NULL, p, &e, tk->tk_proj->kpj_rctls, NULL,
527 	    RCD_CALLBACK);
528 
529 	e.rcep_p.task = tk;
530 	e.rcep_t = RCENTITY_TASK;
531 	(void) rctl_set_dup(NULL, NULL, p, &e, tk->tk_rctls, NULL,
532 	    RCD_CALLBACK);
533 
534 }
535 
536 /*
537  * task_begin()
538  *
539  * Overview
540  *   A process constructing a new task calls task_begin() to initialize the
541  *   task, by attaching itself as a member.
542  *
543  * Return values
544  *   None.
545  *
546  * Caller's context
547  *   pidlock and p_lock must be held across the call to task_begin().
548  */
549 void
550 task_begin(task_t *tk, proc_t *p)
551 {
552 	timestruc_t ts;
553 	task_usage_t *tu;
554 
555 	ASSERT(MUTEX_HELD(&pidlock));
556 	ASSERT(MUTEX_HELD(&p->p_lock));
557 
558 	mutex_enter(&tk->tk_usage_lock);
559 	tu = tk->tk_usage;
560 	gethrestime(&ts);
561 	tu->tu_startsec = (uint64_t)ts.tv_sec;
562 	tu->tu_startnsec = (uint64_t)ts.tv_nsec;
563 	mutex_exit(&tk->tk_usage_lock);
564 
565 	/*
566 	 * Join process to the task as a member.
567 	 */
568 	task_attach(tk, p);
569 }
570 
571 /*
572  * void task_detach(proc_t *)
573  *
574  * Overview
575  *   task_detach() removes the specified process from its task.  task_detach
576  *   sets the process's task membership to NULL, in anticipation of a final exit
577  *   or of joining a new task.  Because task_rele() requires a context safe for
578  *   KM_SLEEP allocations, a task_detach() is followed by a subsequent
579  *   task_rele() once appropriate context is available.
580  *
581  *   Because task_detach() involves relinquishing the process's membership in
582  *   the project, any observational rctls the process may have had on the task
583  *   or project are destroyed.
584  *
585  * Return values
586  *   None.
587  *
588  * Caller's context
589  *   pidlock and p_lock held across task_detach().
590  */
591 void
592 task_detach(proc_t *p)
593 {
594 	task_t *tk = p->p_task;
595 
596 	ASSERT(MUTEX_HELD(&pidlock));
597 	ASSERT(MUTEX_HELD(&p->p_lock));
598 	ASSERT(p->p_task != NULL);
599 	ASSERT(tk->tk_memb_list != NULL);
600 
601 	if (tk->tk_memb_list == p)
602 		tk->tk_memb_list = p->p_tasknext;
603 	if (tk->tk_memb_list == p)
604 		tk->tk_memb_list = NULL;
605 	p->p_taskprev->p_tasknext = p->p_tasknext;
606 	p->p_tasknext->p_taskprev = p->p_taskprev;
607 
608 	rctl_set_tearoff(p->p_task->tk_rctls, p);
609 	rctl_set_tearoff(p->p_task->tk_proj->kpj_rctls, p);
610 
611 	p->p_task = NULL;
612 	p->p_tasknext = p->p_taskprev = NULL;
613 }
614 
615 /*
616  * task_change(task_t *, proc_t *)
617  *
618  * Overview
619  *   task_change() removes the specified process from its current task.  The
620  *   process is then attached to the specified task.  This routine is called
621  *   from settaskid() when process is being moved to a new task.
622  *
623  * Return values
624  *   None.
625  *
626  * Caller's context
627  *   pidlock and p_lock held across task_change()
628  */
629 void
630 task_change(task_t *newtk, proc_t *p)
631 {
632 	task_t *oldtk = p->p_task;
633 
634 	ASSERT(MUTEX_HELD(&pidlock));
635 	ASSERT(MUTEX_HELD(&p->p_lock));
636 	ASSERT(oldtk != NULL);
637 	ASSERT(oldtk->tk_memb_list != NULL);
638 
639 	mutex_enter(&oldtk->tk_zone->zone_nlwps_lock);
640 	oldtk->tk_nlwps -= p->p_lwpcnt;
641 	mutex_exit(&oldtk->tk_zone->zone_nlwps_lock);
642 
643 	mutex_enter(&newtk->tk_zone->zone_nlwps_lock);
644 	newtk->tk_nlwps += p->p_lwpcnt;
645 	mutex_exit(&newtk->tk_zone->zone_nlwps_lock);
646 
647 	task_detach(p);
648 	task_begin(newtk, p);
649 	exacct_move_mstate(p, oldtk, newtk);
650 }
651 
652 /*
653  * task_end()
654  *
655  * Overview
656  *   task_end() contains the actions executed once the final member of
657  *   a task has released the task, and all actions connected with the task, such
658  *   as committing an accounting record to a file, are completed.  It is called
659  *   by the known last consumer of the task information.  Additionally,
660  *   task_end() must never refer to any process in the system.
661  *
662  * Return values
663  *   None.
664  *
665  * Caller's context
666  *   No restrictions on context, beyond that given above.
667  */
668 void
669 task_end(task_t *tk)
670 {
671 	ASSERT(tk->tk_hold_count == 0);
672 
673 	project_rele(tk->tk_proj);
674 	kmem_free(tk->tk_usage, sizeof (task_usage_t));
675 	kmem_free(tk->tk_inherited, sizeof (task_usage_t));
676 	if (tk->tk_prevusage != NULL)
677 		kmem_free(tk->tk_prevusage, sizeof (task_usage_t));
678 	if (tk->tk_zoneusage != NULL)
679 		kmem_free(tk->tk_zoneusage, sizeof (task_usage_t));
680 	rctl_set_free(tk->tk_rctls);
681 	id_free(taskid_space, tk->tk_tkid);
682 	zone_task_rele(tk->tk_zone);
683 	kmem_cache_free(task_cache, tk);
684 }
685 
686 static void
687 changeproj(proc_t *p, kproject_t *kpj, zone_t *zone, void *projbuf,
688     void *zonebuf)
689 {
690 	kproject_t *oldkpj;
691 	kthread_t *t;
692 
693 	ASSERT(MUTEX_HELD(&pidlock));
694 	ASSERT(MUTEX_HELD(&p->p_lock));
695 
696 	if ((t = p->p_tlist) != NULL) {
697 		do {
698 			(void) project_hold(kpj);
699 
700 			thread_lock(t);
701 			oldkpj = ttoproj(t);
702 
703 			/*
704 			 * Kick this thread so that he doesn't sit
705 			 * on a wrong wait queue.
706 			 */
707 			if (ISWAITING(t))
708 				setrun_locked(t);
709 
710 			/*
711 			 * The thread wants to go on the project wait queue, but
712 			 * the waitq is changing.
713 			 */
714 			if (t->t_schedflag & TS_PROJWAITQ)
715 				t->t_schedflag &= ~ TS_PROJWAITQ;
716 
717 			t->t_proj = kpj;
718 			t->t_pre_sys = 1;		/* For cred update */
719 			thread_unlock(t);
720 			fss_changeproj(t, kpj, zone, projbuf, zonebuf);
721 
722 			project_rele(oldkpj);
723 		} while ((t = t->t_forw) != p->p_tlist);
724 	}
725 }
726 
727 /*
728  * task_join()
729  *
730  * Overview
731  *   task_join() contains the actions that must be executed when the first
732  *   member (curproc) of a newly created task joins it.  It may never fail.
733  *
734  *   The caller must make sure holdlwps() is called so that all other lwps are
735  *   stopped prior to calling this function.
736  *
737  *   NB: It returns with curproc->p_lock held.
738  *
739  * Return values
740  *   Pointer to the old task.
741  *
742  * Caller's context
743  *   cpu_lock must be held entering the function.  It will acquire pidlock,
744  *   p_crlock and p_lock during execution.
745  */
746 task_t *
747 task_join(task_t *tk, uint_t flags)
748 {
749 	proc_t *p = ttoproc(curthread);
750 	task_t *prev_tk;
751 	void *projbuf, *zonebuf;
752 	zone_t *zone = tk->tk_zone;
753 	projid_t projid = tk->tk_proj->kpj_id;
754 	cred_t *oldcr;
755 
756 	/*
757 	 * We can't know for sure if holdlwps() was called, but we can check to
758 	 * ensure we're single-threaded.
759 	 */
760 	ASSERT(curthread == p->p_agenttp || p->p_lwprcnt == 1);
761 
762 	/*
763 	 * Changing the credential is always hard because we cannot
764 	 * allocate memory when holding locks but we don't know whether
765 	 * we need to change it.  We first get a reference to the current
766 	 * cred if we need to change it.  Then we create a credential
767 	 * with an updated project id.  Finally we install it, first
768 	 * releasing the reference we had on the p_cred at the time we
769 	 * acquired the lock the first time and later we release the
770 	 * reference to p_cred at the time we acquired the lock the
771 	 * second time.
772 	 */
773 	mutex_enter(&p->p_crlock);
774 	if (crgetprojid(p->p_cred) == projid)
775 		oldcr = NULL;
776 	else
777 		crhold(oldcr = p->p_cred);
778 	mutex_exit(&p->p_crlock);
779 
780 	if (oldcr != NULL) {
781 		cred_t *newcr = crdup(oldcr);
782 		crsetprojid(newcr, projid);
783 		crfree(oldcr);
784 
785 		mutex_enter(&p->p_crlock);
786 		oldcr = p->p_cred;
787 		p->p_cred = newcr;
788 		mutex_exit(&p->p_crlock);
789 		crfree(oldcr);
790 	}
791 
792 	/*
793 	 * Make sure that the number of processor sets is constant
794 	 * across this operation.
795 	 */
796 	ASSERT(MUTEX_HELD(&cpu_lock));
797 
798 	projbuf = fss_allocbuf(FSS_NPSET_BUF, FSS_ALLOC_PROJ);
799 	zonebuf = fss_allocbuf(FSS_NPSET_BUF, FSS_ALLOC_ZONE);
800 
801 	mutex_enter(&pidlock);
802 	mutex_enter(&p->p_lock);
803 
804 	prev_tk = p->p_task;
805 	task_change(tk, p);
806 
807 	/*
808 	 * Now move threads one by one to their new project.
809 	 */
810 	changeproj(p, tk->tk_proj, zone, projbuf, zonebuf);
811 	if (flags & TASK_FINAL)
812 		p->p_task->tk_flags |= TASK_FINAL;
813 
814 	mutex_exit(&pidlock);
815 
816 	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
817 	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
818 	return (prev_tk);
819 }
820 
821 /*
822  * rctl ops vectors
823  */
824 static rctl_ops_t task_lwps_ops = {
825 	rcop_no_action,
826 	task_lwps_usage,
827 	task_lwps_set,
828 	task_lwps_test
829 };
830 
831 static rctl_ops_t task_cpu_time_ops = {
832 	rcop_no_action,
833 	task_cpu_time_usage,
834 	rcop_no_set,
835 	task_cpu_time_test
836 };
837 
838 /*ARGSUSED*/
839 /*
840  * void task_init(void)
841  *
842  * Overview
843  *   task_init() initializes task-related hashes, caches, and the task id
844  *   space.  Additionally, task_init() establishes p0 as a member of task0.
845  *   Called by main().
846  *
847  * Return values
848  *   None.
849  *
850  * Caller's context
851  *   task_init() must be called prior to MP startup.
852  */
853 void
854 task_init(void)
855 {
856 	proc_t *p = &p0;
857 	mod_hash_hndl_t hndl;
858 	rctl_set_t *set;
859 	rctl_alloc_gp_t *gp;
860 	rctl_entity_p_t e;
861 	/*
862 	 * Initialize task_cache and taskid_space.
863 	 */
864 	task_cache = kmem_cache_create("task_cache", sizeof (task_t),
865 	    0, NULL, NULL, NULL, NULL, NULL, 0);
866 	taskid_space = id_space_create("taskid_space", 0, MAX_TASKID);
867 
868 	/*
869 	 * Initialize task hash table.
870 	 */
871 	task_hash = mod_hash_create_idhash("task_hash", task_hash_size,
872 	    mod_hash_null_valdtor);
873 
874 	/*
875 	 * Initialize task-based rctls.
876 	 */
877 	rc_task_lwps = rctl_register("task.max-lwps", RCENTITY_TASK,
878 	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX,
879 	    &task_lwps_ops);
880 	rc_task_cpu_time = rctl_register("task.max-cpu-time", RCENTITY_TASK,
881 	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_DENY_NEVER |
882 	    RCTL_GLOBAL_CPU_TIME | RCTL_GLOBAL_INFINITE |
883 	    RCTL_GLOBAL_UNOBSERVABLE | RCTL_GLOBAL_SECONDS, UINT64_MAX,
884 	    UINT64_MAX, &task_cpu_time_ops);
885 
886 	/*
887 	 * Create task0 and place p0 in it as a member.
888 	 */
889 	task0p = kmem_cache_alloc(task_cache, KM_SLEEP);
890 	bzero(task0p, sizeof (task_t));
891 
892 	task0p->tk_tkid = id_alloc(taskid_space);
893 	task0p->tk_usage = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
894 	task0p->tk_inherited = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
895 	task0p->tk_proj = project_hold_by_id(0, &zone0,
896 	    PROJECT_HOLD_INSERT);
897 	task0p->tk_flags = TASK_NORMAL;
898 	task0p->tk_nlwps = p->p_lwpcnt;
899 	task0p->tk_zone = global_zone;
900 
901 	set = rctl_set_create();
902 	gp = rctl_set_init_prealloc(RCENTITY_TASK);
903 	mutex_enter(&curproc->p_lock);
904 	e.rcep_p.task = task0p;
905 	e.rcep_t = RCENTITY_TASK;
906 	task0p->tk_rctls = rctl_set_init(RCENTITY_TASK, curproc, &e, set, gp);
907 	mutex_exit(&curproc->p_lock);
908 	rctl_prealloc_destroy(gp);
909 
910 	(void) mod_hash_reserve(task_hash, &hndl);
911 	mutex_enter(&task_hash_lock);
912 	ASSERT(task_find(task0p->tk_tkid, GLOBAL_ZONEID) == NULL);
913 	if (mod_hash_insert_reserve(task_hash,
914 	    (mod_hash_key_t)(uintptr_t)task0p->tk_tkid,
915 	    (mod_hash_val_t *)task0p, hndl) != 0) {
916 		mod_hash_cancel(task_hash, &hndl);
917 		panic("unable to insert task %d(%p)", task0p->tk_tkid,
918 		    (void *)task0p);
919 	}
920 	mutex_exit(&task_hash_lock);
921 
922 	task0p->tk_memb_list = p;
923 
924 	/*
925 	 * Initialize task pointers for p0, including doubly linked list of task
926 	 * members.
927 	 */
928 	p->p_task = task0p;
929 	p->p_taskprev = p->p_tasknext = p;
930 	task_hold(task0p);
931 }
932