xref: /illumos-gate/usr/src/uts/common/os/pool.c (revision 257873cfc1dd3337766407f80397db60a56f2f5a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/pool.h>
28 #include <sys/pool_impl.h>
29 #include <sys/pool_pset.h>
30 #include <sys/id_space.h>
31 #include <sys/mutex.h>
32 #include <sys/nvpair.h>
33 #include <sys/cpuvar.h>
34 #include <sys/errno.h>
35 #include <sys/cmn_err.h>
36 #include <sys/systm.h>
37 #include <sys/proc.h>
38 #include <sys/fss.h>
39 #include <sys/class.h>
40 #include <sys/exacct.h>
41 #include <sys/utsname.h>
42 #include <sys/procset.h>
43 #include <sys/atomic.h>
44 #include <sys/zone.h>
45 #include <sys/policy.h>
46 #include <sys/schedctl.h>
47 
48 /*
49  * RESOURCE POOLS
50  *
 * The resource pools facility brings together process-bindable resources into
52  * a common abstraction called a pool. Processor sets and other entities can
53  * be configured, grouped, and labelled such that workload components can be
54  * associated with a subset of a system's total resources.
55  *
56  * When disabled, the pools facility is "invisible".  All processes belong
57  * to the same pool (pool_default), and processor sets can be managed through
58  * the old pset() system call.  When enabled, processor sets can only be
59  * managed via the pools facility.  New pools can be created and associated
60  * with processor sets.  Processes can be bound to pools which have non-empty
61  * resource sets.
62  *
63  * Locking: pool_lock() protects global pools state and must be called
64  * before modifying the configuration, or when taking a snapshot of the
65  * configuration.  If pool_lock_intr() is used, the operation may be
66  * interrupted by a signal or a request.
67  *
 * To prevent processes from being rebound between pools while they are
 * in the middle of an operation which affects resource set bindings, such
70  * operations must be surrounded by calls to pool_barrier_enter() and
71  * pool_barrier_exit().  This mechanism guarantees that such processes will
72  * be stopped either at the beginning or at the end of the barrier so that
73  * the rebind operation can atomically bind the process and its threads
 * to new resource sets, and then let the process run again.
75  *
76  * Lock ordering with respect to other locks is as follows:
77  *
78  * 	pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock
79  *
80  * Most static and global variables defined in this file are protected
81  * by calling pool_lock().
82  *
83  * The operation that binds tasks and projects to pools is atomic.  That is,
84  * either all processes in a given task or a project will be bound to a
85  * new pool, or (in case of an error) they will be all left bound to the
86  * old pool. Processes in a given task or a given project can only be bound to
87  * different pools if they were rebound individually one by one as single
88  * processes.  Threads or LWPs of the same process do not have pool bindings,
89  * and are bound to the same resource sets associated with the resource pool
90  * of that process.
91  *
92  * The following picture shows one possible pool configuration with three
93  * pools and three processor sets.  Note that processor set "foo" is not
94  * associated with any pools and therefore cannot have any processes
95  * bound to it.  Two pools (default and foo) are associated with the
96  * same processor set (default).  Also, note that processes in Task 2
97  * are bound to different pools.
98  *
99  *
100  *							       Processor Sets
101  *								+---------+
102  *		       +--------------+========================>| default |
103  *		      a|	      |				+---------+
104  *		      s|	      |				    ||
105  *		      s|	      |				+---------+
106  *		      o|	      |				|   foo   |
107  *		      c|	      |				+---------+
108  *		      i|	      |				    ||
109  *		      a|	      |				+---------+
110  *		      t|	      |			+------>|   bar   |
111  *		      e|	      |			|	+---------+
112  *                    d|              |                 |
113  *                     |              |                 |
114  *	       +---------+      +---------+      +---------+
115  *     Pools   | default |======|   foo   |======|   bar   |
116  *	       +---------+      +---------+      +---------+
117  *	           @  @            @              @ @   @
118  *                b|  |            |              | |   |
119  *                o|  |            |              | |   |
120  *                u|  +-----+      |      +-------+ |   +---+
121  *                n|        |      |      |         |       |
122  *            ....d|........|......|......|.........|.......|....
123  *            :    |   ::   |      |      |    ::   |       |   :
124  *            :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
125  *  Processes :  | p | :: | p |  | p |  | p |  :: | p |...| p | :
126  *            :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
127  *            :........::......................::...............:
128  *              Task 1            Task 2              Task N
129  *                 |                 |                  |
130  *                 |                 |                  |
131  *                 |  +-----------+  |             +-----------+
132  *                 +--| Project 1 |--+             | Project N |
133  *                    +-----------+                +-----------+
134  *
135  * This is just an illustration of relationships between processes, tasks,
136  * projects, pools, and processor sets. New types of resource sets will be
137  * added in the future.
138  */
139 
140 pool_t		*pool_default;	/* default pool which always exists */
141 int		pool_count;	/* number of pools created on this system */
142 int		pool_state;	/* pools state -- enabled/disabled */
143 void		*pool_buf;	/* pre-commit snapshot of the pools state */
144 size_t		pool_bufsz;	/* size of pool_buf */
145 static hrtime_t	pool_pool_mod;	/* last modification time for pools */
146 static hrtime_t	pool_sys_mod;	/* last modification time for system */
147 static nvlist_t	*pool_sys_prop;	/* system properties */
148 static id_space_t *pool_ids;	/* pool ID space */
149 static list_t	pool_list;	/* doubly-linked list of pools */
150 static kmutex_t		pool_mutex;		/* protects pool_busy_* */
151 static kcondvar_t	pool_busy_cv;		/* waiting for "pool_lock" */
152 static kthread_t	*pool_busy_thread;	/* thread holding "pool_lock" */
153 static kmutex_t		pool_barrier_lock;	/* synch. with pool_barrier_* */
154 static kcondvar_t	pool_barrier_cv;	/* synch. with pool_barrier_* */
155 static int		pool_barrier_count;	/* synch. with pool_barrier_* */
156 
157 /*
158  * Boot-time pool initialization.
159  */
160 void
161 pool_init(void)
162 {
163 	pool_ids = id_space_create("pool_ids", POOL_DEFAULT + 1, POOL_MAXID);
164 
165 	/*
166 	 * Initialize default pool.
167 	 */
168 	pool_default = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
169 	pool_default->pool_id = POOL_DEFAULT;
170 	list_create(&pool_list, sizeof (pool_t), offsetof(pool_t, pool_link));
171 	list_insert_head(&pool_list, pool_default);
172 
173 	/*
174 	 * Initialize plugins for resource sets.
175 	 */
176 	pool_pset_init();
177 	pool_count = 1;
178 	p0.p_pool = pool_default;
179 	global_zone->zone_pool = pool_default;
180 	pool_default->pool_ref = 1;
181 }
182 
183 /*
184  * Synchronization routines.
185  *
186  * pool_lock is only called from syscall-level routines (processor_bind(),
187  * pset_*(), and /dev/pool ioctls).  The pool "lock" may be held for long
188  * periods of time, including across sleeping operations, so we allow its
189  * acquisition to be interruptible.
190  *
191  * The current thread that owns the "lock" is stored in the variable
192  * pool_busy_thread, both to let pool_lock_held() work and to aid debugging.
193  */
194 void
195 pool_lock(void)
196 {
197 	mutex_enter(&pool_mutex);
198 	ASSERT(!pool_lock_held());
199 	while (pool_busy_thread != NULL)
200 		cv_wait(&pool_busy_cv, &pool_mutex);
201 	pool_busy_thread = curthread;
202 	mutex_exit(&pool_mutex);
203 }
204 
205 int
206 pool_lock_intr(void)
207 {
208 	mutex_enter(&pool_mutex);
209 	ASSERT(!pool_lock_held());
210 	while (pool_busy_thread != NULL) {
211 		if (cv_wait_sig(&pool_busy_cv, &pool_mutex) == 0) {
212 			cv_signal(&pool_busy_cv);
213 			mutex_exit(&pool_mutex);
214 			return (1);
215 		}
216 	}
217 	pool_busy_thread = curthread;
218 	mutex_exit(&pool_mutex);
219 	return (0);
220 }
221 
222 int
223 pool_lock_held(void)
224 {
225 	return (pool_busy_thread == curthread);
226 }
227 
228 void
229 pool_unlock(void)
230 {
231 	mutex_enter(&pool_mutex);
232 	ASSERT(pool_lock_held());
233 	pool_busy_thread = NULL;
234 	cv_signal(&pool_busy_cv);
235 	mutex_exit(&pool_mutex);
236 }
237 
238 /*
239  * Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize
240  * with pool_do_bind().
241  *
242  * Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all
243  * operations which modify pool or pset associations.  They can be called
244  * while the process is multi-threaded.  In the common case, when current
245  * process is not being rebound (PBWAIT flag is not set), these functions
246  * will be just incrementing and decrementing reference counts.
247  */
248 void
249 pool_barrier_enter(void)
250 {
251 	proc_t *p = curproc;
252 
253 	ASSERT(MUTEX_HELD(&p->p_lock));
254 	while (p->p_poolflag & PBWAIT)
255 		cv_wait(&p->p_poolcv, &p->p_lock);
256 	p->p_poolcnt++;
257 }
258 
259 void
260 pool_barrier_exit(void)
261 {
262 	proc_t *p = curproc;
263 
264 	ASSERT(MUTEX_HELD(&p->p_lock));
265 	ASSERT(p->p_poolcnt > 0);
266 	p->p_poolcnt--;
267 	if (p->p_poolflag & PBWAIT) {
268 		mutex_enter(&pool_barrier_lock);
269 		ASSERT(pool_barrier_count > 0);
270 		pool_barrier_count--;
271 		if (pool_barrier_count == 0)
272 			cv_signal(&pool_barrier_cv);
273 		mutex_exit(&pool_barrier_lock);
274 		while (p->p_poolflag & PBWAIT)
275 			cv_wait(&p->p_poolcv, &p->p_lock);
276 	}
277 }
278 
279 /*
280  * Enable pools facility.
281  */
282 static int
283 pool_enable(void)
284 {
285 	int ret;
286 
287 	ASSERT(pool_lock_held());
288 	ASSERT(pool_count == 1);
289 
290 	ret = pool_pset_enable();
291 	if (ret != 0)
292 		return (ret);
293 	(void) nvlist_alloc(&pool_sys_prop, NV_UNIQUE_NAME, KM_SLEEP);
294 	(void) nvlist_add_string(pool_sys_prop, "system.name",
295 	    "default");
296 	(void) nvlist_add_string(pool_sys_prop, "system.comment", "");
297 	(void) nvlist_add_int64(pool_sys_prop, "system.version", 1);
298 	(void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1);
299 	(void) nvlist_add_string(pool_sys_prop, "system.poold.objectives",
300 	    "wt-load");
301 
302 	(void) nvlist_alloc(&pool_default->pool_props,
303 	    NV_UNIQUE_NAME, KM_SLEEP);
304 	(void) nvlist_add_string(pool_default->pool_props,
305 	    "pool.name", "pool_default");
306 	(void) nvlist_add_string(pool_default->pool_props, "pool.comment", "");
307 	(void) nvlist_add_byte(pool_default->pool_props, "pool.default", 1);
308 	(void) nvlist_add_byte(pool_default->pool_props, "pool.active", 1);
309 	(void) nvlist_add_int64(pool_default->pool_props,
310 	    "pool.importance", 1);
311 	(void) nvlist_add_int64(pool_default->pool_props, "pool.sys_id",
312 	    pool_default->pool_id);
313 
314 	pool_sys_mod = pool_pool_mod = gethrtime();
315 
316 	return (ret);
317 }
318 
319 /*
320  * Disable pools facility.
321  */
322 static int
323 pool_disable(void)
324 {
325 	int ret;
326 
327 	ASSERT(pool_lock_held());
328 
329 	if (pool_count > 1)	/* must destroy all pools first */
330 		return (EBUSY);
331 
332 	ret = pool_pset_disable();
333 	if (ret != 0)
334 		return (ret);
335 	if (pool_sys_prop != NULL) {
336 		nvlist_free(pool_sys_prop);
337 		pool_sys_prop = NULL;
338 	}
339 	if (pool_default->pool_props != NULL) {
340 		nvlist_free(pool_default->pool_props);
341 		pool_default->pool_props = NULL;
342 	}
343 	return (0);
344 }
345 
346 pool_t *
347 pool_lookup_pool_by_name(char *name)
348 {
349 	pool_t *pool = pool_default;
350 	char *p;
351 
352 	ASSERT(pool_lock_held());
353 	for (pool = list_head(&pool_list); pool;
354 	    pool = list_next(&pool_list, pool)) {
355 		if (nvlist_lookup_string(pool->pool_props,
356 		    "pool.name", &p) == 0 && strcmp(name, p) == 0)
357 			return (pool);
358 	}
359 	return (NULL);
360 }
361 
362 pool_t *
363 pool_lookup_pool_by_id(poolid_t poolid)
364 {
365 	pool_t *pool = pool_default;
366 
367 	ASSERT(pool_lock_held());
368 	for (pool = list_head(&pool_list); pool;
369 	    pool = list_next(&pool_list, pool)) {
370 		if (pool->pool_id == poolid)
371 			return (pool);
372 	}
373 	return (NULL);
374 }
375 
376 /*
377  * Create new pool, associate it with default resource sets, and give
378  * it a temporary name.
379  */
380 static int
381 pool_pool_create(poolid_t *poolid)
382 {
383 	pool_t *pool;
384 	char pool_name[40];
385 
386 	ASSERT(pool_lock_held());
387 
388 	pool = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
389 	pool->pool_id = *poolid = id_alloc(pool_ids);
390 	pool->pool_pset = pool_pset_default;
391 	pool_pset_default->pset_npools++;
392 	list_insert_tail(&pool_list, pool);
393 	(void) nvlist_alloc(&pool->pool_props, NV_UNIQUE_NAME, KM_SLEEP);
394 	(void) nvlist_add_int64(pool->pool_props, "pool.sys_id", pool->pool_id);
395 	(void) nvlist_add_byte(pool->pool_props, "pool.default", 0);
396 	pool_pool_mod = gethrtime();
397 	(void) snprintf(pool_name, sizeof (pool_name), "pool_%lld",
398 	    pool_pool_mod);
399 	(void) nvlist_add_string(pool->pool_props, "pool.name", pool_name);
400 	pool_count++;
401 	return (0);
402 }
403 
404 struct destroy_zone_arg {
405 	pool_t *old;
406 	pool_t *new;
407 };
408 
409 /*
410  * Update pool pointers for zones that are currently bound to pool "old"
411  * to be bound to pool "new".
412  */
413 static int
414 pool_destroy_zone_cb(zone_t *zone, void *arg)
415 {
416 	struct destroy_zone_arg *dza = arg;
417 
418 	ASSERT(pool_lock_held());
419 	ASSERT(MUTEX_HELD(&cpu_lock));
420 
421 	if (zone_pool_get(zone) == dza->old)
422 		zone_pool_set(zone, dza->new);
423 	return (0);
424 }
425 
426 /*
427  * Destroy specified pool, and rebind all processes in it
428  * to the default pool.
429  */
430 static int
431 pool_pool_destroy(poolid_t poolid)
432 {
433 	pool_t *pool;
434 	int ret;
435 
436 	ASSERT(pool_lock_held());
437 
438 	if (poolid == POOL_DEFAULT)
439 		return (EINVAL);
440 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
441 		return (ESRCH);
442 	ret = pool_do_bind(pool_default, P_POOLID, poolid, POOL_BIND_ALL);
443 	if (ret == 0) {
444 		struct destroy_zone_arg dzarg;
445 
446 		dzarg.old = pool;
447 		dzarg.new = pool_default;
448 		mutex_enter(&cpu_lock);
449 		ret = zone_walk(pool_destroy_zone_cb, &dzarg);
450 		mutex_exit(&cpu_lock);
451 		ASSERT(ret == 0);
452 		ASSERT(pool->pool_ref == 0);
453 		(void) nvlist_free(pool->pool_props);
454 		id_free(pool_ids, pool->pool_id);
455 		pool->pool_pset->pset_npools--;
456 		list_remove(&pool_list, pool);
457 		pool_count--;
458 		pool_pool_mod = gethrtime();
459 		kmem_free(pool, sizeof (pool_t));
460 	}
461 	return (ret);
462 }
463 
464 /*
465  * Create new pool or resource set.
466  */
467 int
468 pool_create(int class, int subclass, id_t *id)
469 {
470 	int ret;
471 
472 	ASSERT(pool_lock_held());
473 	if (pool_state == POOL_DISABLED)
474 		return (ENOTACTIVE);
475 	switch (class) {
476 	case PEC_POOL:
477 		ret = pool_pool_create((poolid_t *)id);
478 		break;
479 	case PEC_RES_COMP:
480 		switch (subclass) {
481 		case PREC_PSET:
482 			ret = pool_pset_create((psetid_t *)id);
483 			break;
484 		default:
485 			ret = EINVAL;
486 		}
487 		break;
488 	case PEC_RES_AGG:
489 		ret = ENOTSUP;
490 		break;
491 	default:
492 		ret = EINVAL;
493 	}
494 	return (ret);
495 }
496 
497 /*
498  * Destroy an existing pool or resource set.
499  */
500 int
501 pool_destroy(int class, int subclass, id_t id)
502 {
503 	int ret;
504 
505 	ASSERT(pool_lock_held());
506 	if (pool_state == POOL_DISABLED)
507 		return (ENOTACTIVE);
508 	switch (class) {
509 	case PEC_POOL:
510 		ret = pool_pool_destroy((poolid_t)id);
511 		break;
512 	case PEC_RES_COMP:
513 		switch (subclass) {
514 		case PREC_PSET:
515 			ret = pool_pset_destroy((psetid_t)id);
516 			break;
517 		default:
518 			ret = EINVAL;
519 		}
520 		break;
521 	case PEC_RES_AGG:
522 		ret = ENOTSUP;
523 		break;
524 	default:
525 		ret = EINVAL;
526 	}
527 	return (ret);
528 }
529 
530 /*
531  * Enable or disable pools.
532  */
533 int
534 pool_status(int status)
535 {
536 	int ret = 0;
537 
538 	ASSERT(pool_lock_held());
539 
540 	if (pool_state == status)
541 		return (0);
542 	switch (status) {
543 	case POOL_ENABLED:
544 		ret = pool_enable();
545 		if (ret != 0)
546 			return (ret);
547 		pool_state = POOL_ENABLED;
548 		break;
549 	case POOL_DISABLED:
550 		ret = pool_disable();
551 		if (ret != 0)
552 			return (ret);
553 		pool_state = POOL_DISABLED;
554 		break;
555 	default:
556 		ret = EINVAL;
557 	}
558 	return (ret);
559 }
560 
561 /*
562  * Associate pool with resource set.
563  */
564 int
565 pool_assoc(poolid_t poolid, int idtype, id_t id)
566 {
567 	int ret;
568 
569 	ASSERT(pool_lock_held());
570 	if (pool_state == POOL_DISABLED)
571 		return (ENOTACTIVE);
572 	switch (idtype) {
573 	case PREC_PSET:
574 		ret = pool_pset_assoc(poolid, (psetid_t)id);
575 		break;
576 	default:
577 		ret = EINVAL;
578 	}
579 	if (ret == 0)
580 		pool_pool_mod = gethrtime();
581 	return (ret);
582 }
583 
584 /*
585  * Disassociate resource set from pool.
586  */
587 int
588 pool_dissoc(poolid_t poolid, int idtype)
589 {
590 	int ret;
591 
592 	ASSERT(pool_lock_held());
593 	if (pool_state == POOL_DISABLED)
594 		return (ENOTACTIVE);
595 	switch (idtype) {
596 	case PREC_PSET:
597 		ret = pool_pset_assoc(poolid, PS_NONE);
598 		break;
599 	default:
600 		ret = EINVAL;
601 	}
602 	if (ret == 0)
603 		pool_pool_mod = gethrtime();
604 	return (ret);
605 }
606 
607 /*
608  * Transfer specified quantity of resources between resource sets.
609  */
610 /*ARGSUSED*/
611 int
612 pool_transfer(int type, id_t src, id_t dst, uint64_t qty)
613 {
614 	int ret = EINVAL;
615 	return (ret);
616 }
617 
618 /*
619  * Transfer resources specified by their IDs between resource sets.
620  */
621 int
622 pool_xtransfer(int type, id_t src, id_t dst, uint_t size, id_t *ids)
623 {
624 	int ret;
625 
626 	ASSERT(pool_lock_held());
627 	if (pool_state == POOL_DISABLED)
628 		return (ENOTACTIVE);
629 	switch (type) {
630 	case PREC_PSET:
631 		ret = pool_pset_xtransfer((psetid_t)src, (psetid_t)dst,
632 		    size, ids);
633 		break;
634 	default:
635 		ret = EINVAL;
636 	}
637 	return (ret);
638 }
639 
640 /*
641  * Bind processes to pools.
642  */
643 int
644 pool_bind(poolid_t poolid, idtype_t idtype, id_t id)
645 {
646 	pool_t *pool;
647 
648 	ASSERT(pool_lock_held());
649 
650 	if (pool_state == POOL_DISABLED)
651 		return (ENOTACTIVE);
652 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
653 		return (ESRCH);
654 
655 	switch (idtype) {
656 	case P_PID:
657 	case P_TASKID:
658 	case P_PROJID:
659 	case P_ZONEID:
660 		break;
661 	default:
662 		return (EINVAL);
663 	}
664 	return (pool_do_bind(pool, idtype, id, POOL_BIND_ALL));
665 }
666 
667 /*
 * Query pool binding of the specified process.
669  */
670 int
671 pool_query_binding(idtype_t idtype, id_t id, id_t *poolid)
672 {
673 	proc_t *p;
674 
675 	if (idtype != P_PID)
676 		return (ENOTSUP);
677 	if (id == P_MYID)
678 		id = curproc->p_pid;
679 
680 	ASSERT(pool_lock_held());
681 
682 	mutex_enter(&pidlock);
683 	if ((p = prfind((pid_t)id)) == NULL) {
684 		mutex_exit(&pidlock);
685 		return (ESRCH);
686 	}
687 	mutex_enter(&p->p_lock);
688 	/*
689 	 * In local zones, lie about pool bindings of processes from
690 	 * the global zone.
691 	 */
692 	if (!INGLOBALZONE(curproc) && INGLOBALZONE(p)) {
693 		pool_t *pool;
694 
695 		pool = zone_pool_get(curproc->p_zone);
696 		*poolid = pool->pool_id;
697 	} else {
698 		*poolid = p->p_pool->pool_id;
699 	}
700 	mutex_exit(&p->p_lock);
701 	mutex_exit(&pidlock);
702 	return (0);
703 }
704 
705 static ea_object_t *
706 pool_system_pack(void)
707 {
708 	ea_object_t *eo_system;
709 	size_t bufsz = 0;
710 	char *buf = NULL;
711 
712 	ASSERT(pool_lock_held());
713 
714 	eo_system = ea_alloc_group(EXT_GROUP | EXC_LOCAL | EXD_GROUP_SYSTEM);
715 	(void) ea_attach_item(eo_system, &pool_sys_mod, sizeof (hrtime_t),
716 	    EXC_LOCAL | EXD_SYSTEM_TSTAMP | EXT_UINT64);
717 	if (INGLOBALZONE(curproc))
718 		(void) ea_attach_item(eo_system, &pool_pool_mod,
719 		    sizeof (hrtime_t),
720 		    EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
721 	else
722 		(void) ea_attach_item(eo_system,
723 		    &curproc->p_zone->zone_pool_mod,
724 		    sizeof (hrtime_t),
725 		    EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
726 	(void) ea_attach_item(eo_system, &pool_pset_mod, sizeof (hrtime_t),
727 	    EXC_LOCAL | EXD_PSET_TSTAMP | EXT_UINT64);
728 	(void) ea_attach_item(eo_system, &pool_cpu_mod, sizeof (hrtime_t),
729 	    EXC_LOCAL | EXD_CPU_TSTAMP | EXT_UINT64);
730 	(void) nvlist_pack(pool_sys_prop, &buf, &bufsz, NV_ENCODE_NATIVE, 0);
731 	(void) ea_attach_item(eo_system, buf, bufsz,
732 	    EXC_LOCAL | EXD_SYSTEM_PROP | EXT_RAW);
733 	kmem_free(buf, bufsz);
734 	return (eo_system);
735 }
736 
737 /*
738  * Pack information about pools and attach it to specified exacct group.
739  */
740 static int
741 pool_pool_pack(ea_object_t *eo_system)
742 {
743 	ea_object_t *eo_pool;
744 	pool_t *pool;
745 	size_t bufsz;
746 	char *buf;
747 	pool_t *myzonepool;
748 
749 	ASSERT(pool_lock_held());
750 	myzonepool = zone_pool_get(curproc->p_zone);
751 	for (pool = list_head(&pool_list); pool;
752 	    pool = list_next(&pool_list, pool)) {
753 		if (!INGLOBALZONE(curproc) && myzonepool != pool)
754 			continue;
755 		bufsz = 0;
756 		buf = NULL;
757 		eo_pool = ea_alloc_group(EXT_GROUP |
758 		    EXC_LOCAL | EXD_GROUP_POOL);
759 		(void) ea_attach_item(eo_pool, &pool->pool_id, sizeof (id_t),
760 		    EXC_LOCAL | EXD_POOL_POOLID | EXT_UINT32);
761 		(void) ea_attach_item(eo_pool, &pool->pool_pset->pset_id,
762 		    sizeof (id_t), EXC_LOCAL | EXD_POOL_PSETID | EXT_UINT32);
763 		(void) nvlist_pack(pool->pool_props, &buf, &bufsz,
764 		    NV_ENCODE_NATIVE, 0);
765 		(void) ea_attach_item(eo_pool, buf, bufsz,
766 		    EXC_LOCAL | EXD_POOL_PROP | EXT_RAW);
767 		kmem_free(buf, bufsz);
768 		(void) ea_attach_to_group(eo_system, eo_pool);
769 	}
770 	return (0);
771 }
772 
773 /*
774  * Pack the whole pool configuration in the specified buffer.
775  */
776 int
777 pool_pack_conf(void *kbuf, size_t kbufsz, size_t *asize)
778 {
779 	ea_object_t *eo_system;
780 	size_t ksize;
781 	int ret = 0;
782 
783 	ASSERT(pool_lock_held());
784 
785 	eo_system = pool_system_pack();		/* 1. pack system */
786 	(void) pool_pool_pack(eo_system);	/* 2. pack all pools */
787 	(void) pool_pset_pack(eo_system);	/* 3. pack all psets */
788 	ksize = ea_pack_object(eo_system, NULL, 0);
789 	if (kbuf == NULL || kbufsz == 0)
790 		*asize = ksize;
791 	else if (ksize > kbufsz)
792 		ret = ENOMEM;
793 	else
794 		*asize = ea_pack_object(eo_system, kbuf, kbufsz);
795 	ea_free_object(eo_system, EUP_ALLOC);
796 	return (ret);
797 }
798 
799 /*
800  * Start/end the commit transaction.  If commit transaction is currently
801  * in progress, then all POOL_QUERY ioctls will return pools configuration
802  * at the beginning of transaction.
803  */
804 int
805 pool_commit(int state)
806 {
807 	ea_object_t *eo_system;
808 	int ret = 0;
809 
810 	ASSERT(pool_lock_held());
811 
812 	if (pool_state == POOL_DISABLED)
813 		return (ENOTACTIVE);
814 	switch (state) {
815 	case 1:
816 		/*
817 		 * Beginning commit transation.
818 		 */
819 		if (pool_buf != NULL)		/* transaction in progress */
820 			return (EBUSY);
821 		eo_system = pool_system_pack();		/* 1. pack system */
822 		(void) pool_pool_pack(eo_system);	/* 2. pack all pools */
823 		(void) pool_pset_pack(eo_system);	/* 3. pack all psets */
824 		pool_bufsz = ea_pack_object(eo_system, NULL, 0);
825 		pool_buf = kmem_alloc(pool_bufsz, KM_SLEEP);
826 		pool_bufsz = ea_pack_object(eo_system, pool_buf, pool_bufsz);
827 		ea_free_object(eo_system, EUP_ALLOC);
828 		break;
829 	case 0:
830 		/*
831 		 * Finishing commit transaction.
832 		 */
833 		if (pool_buf != NULL) {
834 			kmem_free(pool_buf, pool_bufsz);
835 			pool_buf = NULL;
836 			pool_bufsz = 0;
837 		}
838 		break;
839 	default:
840 		ret = EINVAL;
841 	}
842 	return (ret);
843 }
844 
845 /*
 * Check if the specified property is special
847  */
848 static pool_property_t *
849 pool_property_find(char *name, pool_property_t *list)
850 {
851 	pool_property_t *prop;
852 
853 	for (prop = list; prop->pp_name != NULL; prop++)
854 		if (strcmp(prop->pp_name, name) == 0)
855 			return (prop);
856 	return (NULL);
857 }
858 
859 static pool_property_t pool_prop_sys[] = {
860 	{ "system.name",		DATA_TYPE_STRING,	PP_RDWR },
861 	{ "system.comment",		DATA_TYPE_STRING,	PP_RDWR },
862 	{ "system.version",		DATA_TYPE_UINT64,	PP_READ },
863 	{ "system.bind-default",	DATA_TYPE_BYTE,		PP_RDWR },
864 	{ "system.allocate-method",	DATA_TYPE_STRING,
865 	    PP_RDWR | PP_OPTIONAL },
866 	{ "system.poold.log-level",	DATA_TYPE_STRING,
867 	    PP_RDWR | PP_OPTIONAL },
868 	{ "system.poold.log-location",	DATA_TYPE_STRING,
869 	    PP_RDWR | PP_OPTIONAL },
870 	{ "system.poold.monitor-interval",	DATA_TYPE_UINT64,
871 	    PP_RDWR | PP_OPTIONAL },
872 	{ "system.poold.history-file",	DATA_TYPE_STRING,
873 	    PP_RDWR | PP_OPTIONAL },
874 	{ "system.poold.objectives",	DATA_TYPE_STRING,
875 	    PP_RDWR | PP_OPTIONAL },
876 	{ NULL,				0,			0 }
877 };
878 
879 static pool_property_t pool_prop_pool[] = {
880 	{ "pool.sys_id",		DATA_TYPE_UINT64,	PP_READ },
881 	{ "pool.name",			DATA_TYPE_STRING,	PP_RDWR },
882 	{ "pool.default",		DATA_TYPE_BYTE,		PP_READ },
883 	{ "pool.active",		DATA_TYPE_BYTE,		PP_RDWR },
884 	{ "pool.importance",		DATA_TYPE_INT64,	PP_RDWR },
885 	{ "pool.comment",		DATA_TYPE_STRING,	PP_RDWR },
886 	{ "pool.scheduler",		DATA_TYPE_STRING,
887 	    PP_RDWR | PP_OPTIONAL },
888 	{ NULL,				0,			0 }
889 };
890 
891 /*
892  * Common routine to put new property on the specified list
893  */
894 int
895 pool_propput_common(nvlist_t *nvlist, nvpair_t *pair, pool_property_t *props)
896 {
897 	pool_property_t *prop;
898 
899 	if ((prop = pool_property_find(nvpair_name(pair), props)) != NULL) {
900 		/*
901 		 * No read-only properties or properties with bad types
902 		 */
903 		if (!(prop->pp_perm & PP_WRITE) ||
904 		    prop->pp_type != nvpair_type(pair))
905 			return (EINVAL);
906 	}
907 	return (nvlist_add_nvpair(nvlist, pair));
908 }
909 
910 /*
911  * Common routine to remove property from the given list
912  */
913 int
914 pool_proprm_common(nvlist_t *nvlist, char *name, pool_property_t *props)
915 {
916 	pool_property_t *prop;
917 
918 	if ((prop = pool_property_find(name, props)) != NULL) {
919 		if (!(prop->pp_perm & PP_OPTIONAL))
920 			return (EINVAL);
921 	}
922 	return (nvlist_remove_all(nvlist, name));
923 }
924 
925 static int
926 pool_system_propput(nvpair_t *pair)
927 {
928 	int ret;
929 
930 	ASSERT(pool_lock_held());
931 	ret = pool_propput_common(pool_sys_prop, pair, pool_prop_sys);
932 	if (ret == 0)
933 		pool_sys_mod = gethrtime();
934 	return (ret);
935 }
936 
937 static int
938 pool_system_proprm(char *name)
939 {
940 	int ret;
941 
942 	ASSERT(pool_lock_held());
943 	ret = pool_proprm_common(pool_sys_prop, name, pool_prop_sys);
944 	if (ret == 0)
945 		pool_sys_mod = gethrtime();
946 	return (ret);
947 }
948 
949 static int
950 pool_pool_propput(poolid_t poolid, nvpair_t *pair)
951 {
952 	pool_t *pool;
953 	int ret;
954 
955 	ASSERT(pool_lock_held());
956 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
957 		return (ESRCH);
958 	ret = pool_propput_common(pool->pool_props, pair, pool_prop_pool);
959 	if (ret == 0)
960 		pool_pool_mod = gethrtime();
961 	return (ret);
962 }
963 
964 static int
965 pool_pool_proprm(poolid_t poolid, char *name)
966 {
967 	int ret;
968 	pool_t *pool;
969 
970 	ASSERT(pool_lock_held());
971 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
972 		return (ESRCH);
973 	ret = pool_proprm_common(pool->pool_props, name, pool_prop_pool);
974 	if (ret == 0)
975 		pool_pool_mod = gethrtime();
976 	return (ret);
977 }
978 
979 int
980 pool_propput(int class, int subclass, id_t id, nvpair_t *pair)
981 {
982 	int ret;
983 
984 	ASSERT(pool_lock_held());
985 	if (pool_state == POOL_DISABLED)
986 		return (ENOTACTIVE);
987 	switch (class) {
988 	case PEC_SYSTEM:
989 		ret = pool_system_propput(pair);
990 		break;
991 	case PEC_POOL:
992 		ret = pool_pool_propput((poolid_t)id, pair);
993 		break;
994 	case PEC_RES_COMP:
995 		switch (subclass) {
996 		case PREC_PSET:
997 			ret = pool_pset_propput((psetid_t)id, pair);
998 			break;
999 		default:
1000 			ret = EINVAL;
1001 		}
1002 		break;
1003 	case PEC_RES_AGG:
1004 		ret = ENOTSUP;
1005 		break;
1006 	case PEC_COMP:
1007 		switch (subclass) {
1008 		case PCEC_CPU:
1009 			ret = pool_cpu_propput((processorid_t)id, pair);
1010 			break;
1011 		default:
1012 			ret = EINVAL;
1013 		}
1014 		break;
1015 	default:
1016 		ret = EINVAL;
1017 	}
1018 	return (ret);
1019 }
1020 
1021 int
1022 pool_proprm(int class, int subclass, id_t id, char *name)
1023 {
1024 	int ret;
1025 
1026 	ASSERT(pool_lock_held());
1027 	if (pool_state == POOL_DISABLED)
1028 		return (ENOTACTIVE);
1029 	switch (class) {
1030 	case PEC_SYSTEM:
1031 		ret = pool_system_proprm(name);
1032 		break;
1033 	case PEC_POOL:
1034 		ret = pool_pool_proprm((poolid_t)id, name);
1035 		break;
1036 	case PEC_RES_COMP:
1037 		switch (subclass) {
1038 		case PREC_PSET:
1039 			ret = pool_pset_proprm((psetid_t)id, name);
1040 			break;
1041 		default:
1042 			ret = EINVAL;
1043 		}
1044 		break;
1045 	case PEC_RES_AGG:
1046 		ret = ENOTSUP;
1047 		break;
1048 	case PEC_COMP:
1049 		switch (subclass) {
1050 		case PCEC_CPU:
1051 			ret = pool_cpu_proprm((processorid_t)id, name);
1052 			break;
1053 		default:
1054 			ret = EINVAL;
1055 		}
1056 		break;
1057 	default:
1058 		ret = EINVAL;
1059 	}
1060 	return (ret);
1061 }
1062 
1063 int
1064 pool_propget(char *name, int class, int subclass, id_t id, nvlist_t **nvlp)
1065 {
1066 	int ret;
1067 	nvlist_t *nvl;
1068 
1069 	ASSERT(pool_lock_held());
1070 	if (pool_state == POOL_DISABLED)
1071 		return (ENOTACTIVE);
1072 
1073 	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);
1074 
1075 	switch (class) {
1076 	case PEC_SYSTEM:
1077 	case PEC_POOL:
1078 		ret = EINVAL;
1079 		break;
1080 	case PEC_RES_COMP:
1081 		switch (subclass) {
1082 		case PREC_PSET:
1083 			ret = pool_pset_propget((psetid_t)id, name, nvl);
1084 			break;
1085 		default:
1086 			ret = EINVAL;
1087 		}
1088 		break;
1089 	case PEC_RES_AGG:
1090 		ret = ENOTSUP;
1091 		break;
1092 	case PEC_COMP:
1093 		switch (subclass) {
1094 		case PCEC_CPU:
1095 			ret = pool_cpu_propget((processorid_t)id, name, nvl);
1096 			break;
1097 		default:
1098 			ret = EINVAL;
1099 		}
1100 		break;
1101 	default:
1102 		ret = EINVAL;
1103 	}
1104 	if (ret == 0)
1105 		*nvlp = nvl;
1106 	else
1107 		nvlist_free(nvl);
1108 	return (ret);
1109 }
1110 
/*
 * pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs
 * in case of failure in pool_do_bind().  pool_do_bind() also calls
 * pool_bind_wake() to release each process once it has been rebound.
 *
 * pool_bind_wake() releases a single process "p".  With p_lock held it
 * removes any p_poolcnt contribution of the process from
 * pool_barrier_count, clears PBWAIT in p_poolflag and signals p_poolcv.
 *
 * The caller must hold the pool lock, and PBWAIT must already be set on
 * the process.
 */
static void
pool_bind_wake(proc_t *p)
{
	ASSERT(pool_lock_held());

	mutex_enter(&p->p_lock);
	ASSERT(p->p_poolflag & PBWAIT);
	if (p->p_poolcnt > 0) {
		/*
		 * Take this process's threads back out of the global
		 * barrier count; the count is updated under
		 * pool_barrier_lock.
		 */
		mutex_enter(&pool_barrier_lock);
		pool_barrier_count -= p->p_poolcnt;
		mutex_exit(&pool_barrier_lock);
	}
	/* Clear the flag before signalling, all under p_lock. */
	p->p_poolflag &= ~PBWAIT;
	cv_signal(&p->p_poolcv);
	mutex_exit(&p->p_lock);
}
1131 
1132 static void
1133 pool_bind_wakeall(proc_t **procs)
1134 {
1135 	proc_t *p, **pp;
1136 
1137 	ASSERT(pool_lock_held());
1138 	for (pp = procs; (p = *pp) != NULL; pp++)
1139 		pool_bind_wake(p);
1140 }
1141 
1142 /*
1143  * Return the scheduling class for this pool, or
1144  * 	POOL_CLASS_UNSET if not set
1145  * 	POOL_CLASS_INVAL if set to an invalid class ID.
1146  */
1147 id_t
1148 pool_get_class(pool_t *pool)
1149 {
1150 	char *name;
1151 	id_t cid;
1152 
1153 	ASSERT(pool_lock_held());
1154 
1155 	if (nvlist_lookup_string(pool->pool_props, "pool.scheduler",
1156 	    &name) == 0) {
1157 		if (getcidbyname(name, &cid) == 0)
1158 			return (cid);
1159 		else
1160 			return (POOL_CLASS_INVAL);
1161 	}
1162 	return (POOL_CLASS_UNSET);
1163 }
1164 
1165 /*
1166  * Move process to the new scheduling class.
1167  */
1168 static void
1169 pool_change_class(proc_t *p, id_t cid)
1170 {
1171 	kthread_t *t;
1172 	void *cldata;
1173 	id_t oldcid;
1174 	void **bufs;
1175 	void **buf;
1176 	int nlwp;
1177 	int ret;
1178 	int i;
1179 
1180 	/*
1181 	 * Do not move kernel processes (such as zsched).
1182 	 */
1183 	if (p->p_flag & SSYS)
1184 		return;
1185 	/*
1186 	 * This process is in the pool barrier, so it can't possibly be
1187 	 * adding new threads and we can use p_lwpcnt + p_zombcnt + 1
1188 	 * (for possible agent LWP which doesn't use pool barrier) as
1189 	 * our upper bound.
1190 	 */
1191 	nlwp = p->p_lwpcnt + p->p_zombcnt + 1;
1192 
1193 	/*
1194 	 * Pre-allocate scheduling class specific buffers before
1195 	 * grabbing p_lock.
1196 	 */
1197 	bufs = kmem_zalloc(nlwp * sizeof (void *), KM_SLEEP);
1198 	for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
1199 		ret = CL_ALLOC(buf, cid, KM_SLEEP);
1200 		ASSERT(ret == 0);
1201 	}
1202 
1203 	/*
1204 	 * Move threads one by one to the new scheduling class.
1205 	 * This never fails because we have all the right
1206 	 * privileges here.
1207 	 */
1208 	mutex_enter(&p->p_lock);
1209 	ASSERT(p->p_poolflag & PBWAIT);
1210 	buf = bufs;
1211 	t = p->p_tlist;
1212 	ASSERT(t != NULL);
1213 	do {
1214 		if (t->t_cid != cid) {
1215 			oldcid = t->t_cid;
1216 			cldata = t->t_cldata;
1217 			ret = CL_ENTERCLASS(t, cid, NULL, NULL, *buf);
1218 			ASSERT(ret == 0);
1219 			CL_EXITCLASS(oldcid, cldata);
1220 			schedctl_set_cidpri(t);
1221 			*buf++ = NULL;
1222 		}
1223 	} while ((t = t->t_forw) != p->p_tlist);
1224 	mutex_exit(&p->p_lock);
1225 	/*
1226 	 * Free unused scheduling class specific buffers.
1227 	 */
1228 	for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
1229 		if (*buf != NULL) {
1230 			CL_FREE(cid, *buf);
1231 			*buf = NULL;
1232 		}
1233 	}
1234 	kmem_free(bufs, nlwp * sizeof (void *));
1235 }
1236 
/*
 * The meat of the bind operation.  The steps in pool_do_bind are:
 *
 * 1) Set PBWAIT in the p_poolflag of any process of interest, and add all
 *    such processes to an array.  For any interesting process that has
 *    threads inside the pool barrier set, increment a counter by the
 *    count of such threads.  Once PBWAIT is set on a process, that process
 *    will not disappear.
 *
 * 2) Wait for the counter from step 1 to drop to zero.  Any process which
 *    calls pool_barrier_exit() and notices that PBWAIT has been set on it
 *    will decrement that counter before going to sleep, and the process
 *    calling pool_barrier_exit() which does the final decrement will wake us.
 *
 * 3) For each interesting process, perform a calculation on it to see if
 *    the bind will actually succeed.  This uses the following three
 *    resource-set-specific functions:
 *
 *    - int set_bind_start(procs, pool)
 *
 *      Determine whether the given array of processes can be bound to the
 *      resource set associated with the given pool.  If it can, take and hold
 *      any locks necessary to ensure that the operation will succeed, and
 *      make any necessary reservations in the target resource set.  If it
 *      can't, return failure with no reservations made and no new locks held.
 *
 *    - void set_bind_abort(procs, pool)
 *
 *      set_bind_start() has completed successfully, but another resource set's
 *      set_bind_start() has failed, and we haven't begun the bind yet.  Undo
 *      any reservations made and drop any locks acquired by our
 *      set_bind_start().
 *
 *    - void set_bind_finish(void)
 *
 *      The bind has completed successfully.  The processes have been released,
 *      and the reservation acquired in set_bind_start() has been depleted as
 *      the processes have finished their bindings.  Drop any locks acquired by
 *      set_bind_start().
 *
 * 4) If we've decided that we can proceed with the bind, iterate through
 *    the list of interesting processes, grab the necessary locks (which
 *    may differ per resource set), perform the bind, and ASSERT that it
 *    succeeds.  Once a process has been rebound, it can be awakened.
 *
 * The operations from step 4 must be kept in sync with anything which might
 * cause the bind operations (e.g., cpupart_bind_thread()) to fail, and
 * are thus located in the same source files as the associated bind operations.
 *
 * Arguments:
 *	pool		the pool to bind the selected processes to
 *	idtype, id	select the processes of interest; they are turned into
 *			a procset with setprocset() and matched with
 *			procinset().  P_MYID is resolved with getmyid().
 *	flags		if POOL_BIND_PSET is set, the processes are also
 *			rebound to the pool's processor set
 *
 * The caller must hold the pool lock.
 *
 * Returns 0 on success, or:
 *	EINVAL	the pool's scheduling class is invalid, or a P_PID/P_TASKID
 *		bind matched a process outside the global zone
 *	ESRCH	the zone or project was not found, or no process matched
 *		(except for P_POOLID and P_ZONEID, where that is success)
 *	EBUSY	the zone's status is past ZONE_IS_RUNNING
 *	EAGAIN	the wait for the barrier count was interrupted
 *	or the error returned by pset_bind_start()
 */
int
pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags)
{
	extern uint_t nproc;
	klwp_t *lwp = ttolwp(curthread);
	proc_t **pp, **procs;
	proc_t *prstart;
	int procs_count = 0;
	kproject_t *kpj;
	procset_t set;
	zone_t *zone;
	int procs_size;
	int rv = 0;
	proc_t *p;
	id_t cid = -1;

	ASSERT(pool_lock_held());

	/*
	 * Fail early if the pool's "pool.scheduler" property names a
	 * scheduling class that cannot be resolved.
	 */
	if ((cid = pool_get_class(pool)) == POOL_CLASS_INVAL)
		return (EINVAL);

	/*
	 * The early returns below happen before procs is allocated, so they
	 * return directly instead of going through "out".
	 *
	 * For a zone bind, the zone found here is released with zone_rele()
	 * either right below (EBUSY) or at "out".
	 */
	if (idtype == P_ZONEID) {
		zone = zone_find_by_id(id);
		if (zone == NULL)
			return (ESRCH);
		if (zone_status_get(zone) > ZONE_IS_RUNNING) {
			zone_rele(zone);
			return (EBUSY);
		}
	}

	/*
	 * For a project bind, kpj_poolbind stays held until "out", where
	 * the project is also released.
	 */
	if (idtype == P_PROJID) {
		kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND);
		if (kpj == NULL)
			return (ESRCH);
		mutex_enter(&kpj->kpj_poolbind);
	}

	/*
	 * Both branches below leave with procs allocated and pidlock held.
	 */
	if (idtype == P_PID) {
		/*
		 * Fast-path for a single process case.
		 */
		procs_size = 2;	/* procs is NULL-terminated */
		procs = kmem_zalloc(procs_size * sizeof (proc_t *), KM_SLEEP);
		mutex_enter(&pidlock);
	} else {
		/*
		 * We will need enough slots for proc_t pointers for as many as
		 * twice the number of currently running processes (assuming
		 * that each one could be in fork() creating a new child).
		 */
		for (;;) {
			procs_size = nproc * 2;
			procs = kmem_zalloc(procs_size * sizeof (proc_t *),
			    KM_SLEEP);
			mutex_enter(&pidlock);

			if (nproc * 2 <= procs_size)
				break;
			/*
			 * If nproc has changed, try again.
			 */
			mutex_exit(&pidlock);
			kmem_free(procs, procs_size * sizeof (proc_t *));
		}
	}

	if (id == P_MYID)
		id = getmyid(idtype);
	setprocset(&set, POP_AND, idtype, id, P_ALL, 0);

	/*
	 * Do a first scan, and select target processes.
	 */
	if (idtype == P_PID)
		prstart = prfind(id);
	else
		prstart = practive;
	for (p = prstart, pp = procs; p != NULL; p = p->p_next) {
		mutex_enter(&p->p_lock);
		/*
		 * Skip processes that don't match our (id, idtype) set or
		 * on the way of becoming zombies.  Skip kernel processes
		 * from the global zone.
		 */
		if (procinset(p, &set) == 0 ||
		    p->p_poolflag & PEXITED ||
		    ((p->p_flag & SSYS) && INGLOBALZONE(p))) {
			mutex_exit(&p->p_lock);
			continue;
		}
		if (!INGLOBALZONE(p)) {
			switch (idtype) {
			case P_PID:
			case P_TASKID:
				/*
				 * Can't bind processes or tasks
				 * in local zones to pools.
				 *
				 * procs came from kmem_zalloc(), so the
				 * entries stored so far are followed by a
				 * NULL slot for pool_bind_wakeall().
				 */
				mutex_exit(&p->p_lock);
				mutex_exit(&pidlock);
				pool_bind_wakeall(procs);
				rv = EINVAL;
				goto out;
			case P_PROJID:
				/*
				 * Only projects in the global
				 * zone can be rebound.
				 */
				mutex_exit(&p->p_lock);
				continue;
			case P_POOLID:
				/*
				 * When rebinding pools, processes can be
				 * in different zones.
				 */
				break;
			}
		}

		p->p_poolflag |= PBWAIT;
		/*
		 * If some threads in this process are inside the pool
		 * barrier, add them to pool_barrier_count, as we have
		 * to wait for all of them to exit the barrier.
		 */
		if (p->p_poolcnt > 0) {
			mutex_enter(&pool_barrier_lock);
			pool_barrier_count += p->p_poolcnt;
			mutex_exit(&pool_barrier_lock);
		}
		ASSERT(pp < &procs[procs_size]);
		*pp++ = p;
		procs_count++;
		mutex_exit(&p->p_lock);

		/*
		 * We just found our process, so if we're only rebinding a
		 * single process then get out of this loop.
		 */
		if (idtype == P_PID)
			break;
	}
	*pp = NULL;	/* cap off the end of the array */
	mutex_exit(&pidlock);

	/*
	 * Wait for relevant processes to stop before they try to enter the
	 * barrier or at the exit from the barrier.  Make sure that we do
	 * not get stopped here while we're holding pool_lock.  If we were
	 * requested to stop, or got a signal then return EAGAIN to let the
	 * library know that it needs to retry.
	 */
	mutex_enter(&pool_barrier_lock);
	lwp->lwp_nostop++;
	while (pool_barrier_count > 0) {
		(void) cv_wait_sig(&pool_barrier_cv, &pool_barrier_lock);
		if (pool_barrier_count > 0) {
			/*
			 * We either got a signal or were requested to
			 * stop by /proc.  Bail out with EAGAIN.  If we were
			 * requested to stop, we'll stop in post_syscall()
			 * on our way back to userland.
			 */
			mutex_exit(&pool_barrier_lock);
			pool_bind_wakeall(procs);
			lwp->lwp_nostop--;
			rv = EAGAIN;
			goto out;
		}
	}
	lwp->lwp_nostop--;
	mutex_exit(&pool_barrier_lock);

	/*
	 * The single-process case does not do the second scan.
	 */
	if (idtype == P_PID)
		goto skip;

	/*
	 * Do another run, and drop processes that were inside the barrier
	 * in exit(), but when they have dropped to pool_barrier_exit
	 * they have become of no interest to us.  Pick up child processes that
	 * were created by fork() but didn't exist during our first scan.
	 * Their parents are now stopped at pool_barrier_exit in cfork().
	 */
	mutex_enter(&pidlock);
	for (pp = procs; (p = *pp) != NULL; pp++) {
		if (p->p_poolflag & PEXITED) {
			ASSERT(p->p_lwpcnt == 0);
			pool_bind_wake(p);
			/* flip w/last non-NULL slot */
			*pp = procs[procs_count - 1];
			procs[procs_count - 1] = NULL;
			procs_count--;
			pp--;			/* try this slot again */
			continue;
		}
		/*
		 * Look at the child and check if it should be rebound also.
		 * We're holding pidlock, so it is safe to reference p_child.
		 */
		if ((p = p->p_child) == NULL)
			continue;

		mutex_enter(&p->p_lock);

		/*
		 * Skip system processes and make sure that the child is in
		 * the same task/project/pool/zone as the parent.
		 */
		if ((!INGLOBALZONE(p) && idtype != P_ZONEID &&
		    idtype != P_POOLID) || p->p_flag & SSYS) {
			mutex_exit(&p->p_lock);
			continue;
		}

		/*
		 * If the child process has been already created by fork(), has
		 * not exited, and has not been added to the list already,
		 * then add it now.  We will hit this process again (since we
		 * stick it at the end of the procs list) but it will be
		 * ignored because it will have the PBWAIT flag set.
		 */
		if (procinset(p, &set) &&
		    !(p->p_poolflag & PEXITED) &&
		    !(p->p_poolflag & PBWAIT)) {
			ASSERT(p->p_child == NULL); /* no child of a child */
			procs[procs_count] = p;
			procs[procs_count + 1] = NULL;
			procs_count++;
			p->p_poolflag |= PBWAIT;
		}
		mutex_exit(&p->p_lock);
	}
	mutex_exit(&pidlock);
skip:
	/*
	 * If there are no processes to rebind then return ESRCH, unless
	 * we're associating a pool with new resource set, destroying it,
	 * or binding a zone to a pool.
	 */
	if (procs_count == 0) {
		if (idtype == P_POOLID || idtype == P_ZONEID)
			rv = 0;
		else
			rv = ESRCH;
		goto out;
	}

#ifdef DEBUG
	/*
	 * All processes in the array should have PBWAIT set, and none
	 * should be in the critical section. Thus, although p_poolflag
	 * and p_poolcnt are protected by p_lock, their ASSERTions below
	 * should be stable without it. procinset(), however, ASSERTs that
	 * the p_lock is held upon entry.
	 */
	for (pp = procs; (p = *pp) != NULL; pp++) {
		int in_set;

		mutex_enter(&p->p_lock);
		in_set = procinset(p, &set);
		mutex_exit(&p->p_lock);

		ASSERT(in_set);
		ASSERT(p->p_poolflag & PBWAIT);
		ASSERT(p->p_poolcnt == 0);
	}
#endif

	/*
	 * Do the check if processor set rebinding is going to succeed or not.
	 * On failure every process is released and the error is returned.
	 */
	if ((flags & POOL_BIND_PSET) &&
	    (rv = pset_bind_start(procs, pool)) != 0) {
		pool_bind_wakeall(procs);
		goto out;
	}

	/*
	 * At this point, all bind operations should succeed.
	 */
	for (pp = procs; (p = *pp) != NULL; pp++) {
		if (flags & POOL_BIND_PSET) {
			psetid_t psetid = pool->pool_pset->pset_id;
			void *zonebuf;
			void *projbuf;

			/*
			 * Pre-allocate one buffer for FSS (per-project
			 * buffer for a new pset) in case this is the
			 * first thread from its current project getting
			 * bound to this processor set.
			 */
			projbuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_PROJ);
			zonebuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_ZONE);

			mutex_enter(&pidlock);
			mutex_enter(&p->p_lock);
			pool_pset_bind(p, psetid, projbuf, zonebuf);
			mutex_exit(&p->p_lock);
			mutex_exit(&pidlock);
			/*
			 * Free the buffers pre-allocated above if they
			 * weren't actually used.
			 */
			fss_freebuf(projbuf, FSS_ALLOC_PROJ);
			fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
		}
		/*
		 * Now let's change the scheduling class of this
		 * process if our target pool has it defined.
		 */
		if (cid != POOL_CLASS_UNSET)
			pool_change_class(p, cid);

		/*
		 * It is safe to reference p_pool here without holding
		 * p_lock because it cannot change underneath of us.
		 * We're holding pool_lock here, so nobody else can be
		 * moving this process between pools.  If process "p"
		 * would be exiting, we're guaranteed that it would be blocked
		 * at pool_barrier_enter() in exit().  Otherwise, it would've
		 * been skipped by one of our scans of the practive list
		 * as a process with PEXITED flag set.
		 */
		if (p->p_pool != pool) {
			ASSERT(p->p_pool->pool_ref > 0);
			atomic_add_32(&p->p_pool->pool_ref, -1);
			p->p_pool = pool;
			atomic_add_32(&p->p_pool->pool_ref, 1);
		}
		/*
		 * Okay, we've tortured this guy enough.
		 * Let this poor process go now.
		 */
		pool_bind_wake(p);
	}
	if (flags & POOL_BIND_PSET)
		pset_bind_finish();

	/*
	 * Common exit: undo the per-idtype setup done at the top of the
	 * function and free the process array.  For a zone bind,
	 * zone_pool_set() is called only on success, while zone_pool_mod is
	 * stamped whether or not the bind succeeded.
	 */
out:	switch (idtype) {
	case P_PROJID:
		ASSERT(kpj != NULL);
		mutex_exit(&kpj->kpj_poolbind);
		project_rele(kpj);
		break;
	case P_ZONEID:
		if (rv == 0) {
			mutex_enter(&cpu_lock);
			zone_pool_set(zone, pool);
			mutex_exit(&cpu_lock);
		}
		zone->zone_pool_mod = gethrtime();
		zone_rele(zone);
		break;
	}

	kmem_free(procs, procs_size * sizeof (proc_t *));
	ASSERT(pool_barrier_count == 0);
	return (rv);
}
1647