xref: /illumos-gate/usr/src/uts/common/os/kcpc.c (revision 581cede61ac9c14d8d4ea452562a567189eead78)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/param.h>
28 #include <sys/thread.h>
29 #include <sys/cpuvar.h>
30 #include <sys/inttypes.h>
31 #include <sys/cmn_err.h>
32 #include <sys/time.h>
33 #include <sys/ksynch.h>
34 #include <sys/systm.h>
35 #include <sys/kcpc.h>
36 #include <sys/cpc_impl.h>
37 #include <sys/cpc_pcbe.h>
38 #include <sys/atomic.h>
39 #include <sys/sunddi.h>
40 #include <sys/modctl.h>
41 #include <sys/sdt.h>
42 #if defined(__x86)
43 #include <asm/clock.h>
44 #endif
45 
46 kmutex_t	kcpc_ctx_llock[CPC_HASH_BUCKETS];	/* protects ctx_list */
47 kcpc_ctx_t	*kcpc_ctx_list[CPC_HASH_BUCKETS];	/* head of list */
48 
49 
50 krwlock_t	kcpc_cpuctx_lock;	/* lock for 'kcpc_cpuctx' below */
51 int		kcpc_cpuctx;		/* number of cpu-specific contexts */
52 
53 int kcpc_counts_include_idle = 1; /* Project Private /etc/system variable */
54 
55 /*
56  * These are set when a PCBE module is loaded.
57  */
58 uint_t		cpc_ncounters = 0;
59 pcbe_ops_t	*pcbe_ops = NULL;
60 
61 /*
62  * Statistics on (mis)behavior
63  */
64 static uint32_t kcpc_intrctx_count;    /* # overflows in an interrupt handler */
65 static uint32_t kcpc_nullctx_count;    /* # overflows in a thread with no ctx */
66 
67 /*
68  * If 'kcpc_nullctx_panic' is set to 1, any overflow interrupt taken in a
69  * thread with no valid context will result in a panic.
70  */
71 static int kcpc_nullctx_panic = 0;
72 
73 static void kcpc_lwp_create(kthread_t *t, kthread_t *ct);
74 static void kcpc_restore(kcpc_ctx_t *ctx);
75 static void kcpc_save(kcpc_ctx_t *ctx);
76 static void kcpc_free(kcpc_ctx_t *ctx, int isexec);
77 static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx);
78 static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch);
79 static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set);
80 
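/*
 * Called when a PCBE (Performance Counter BackEnd) module is loaded: record
 * its ops vector and ask it how many counters the hardware provides.
 */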
81 void
82 kcpc_register_pcbe(pcbe_ops_t *ops)
83 {
84 	pcbe_ops = ops;
85 	cpc_ncounters = pcbe_ops->pcbe_ncounters();
86 }
87 
88 void
89 kcpc_register_dcpc(void (*func)(uint64_t))
90 {
91 	dtrace_cpc_fire = func;
92 }
93 
94 void
95 kcpc_unregister_dcpc(void)
96 {
97 	dtrace_cpc_fire = NULL;
98 }
99 
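/*
 * Bind a request set to a CPU: assign and configure the requests, then
 * program the hardware while holding cpu_lock and the CPU's cpu_cpc_ctxlock.
 * The calling thread must already be bound to the target CPU; if it isn't,
 * or if the CPU has been DR'd out or already has a bound set, the bind fails
 * with EAGAIN.
 */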
100 int
101 kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode)
102 {
103 	cpu_t		*cp;
104 	kcpc_ctx_t	*ctx;
105 	int		error;
106 
107 	ctx = kcpc_ctx_alloc();
108 
109 	if (kcpc_assign_reqs(set, ctx) != 0) {
110 		kcpc_ctx_free(ctx);
111 		*subcode = CPC_RESOURCE_UNAVAIL;
112 		return (EINVAL);
113 	}
114 
115 	ctx->kc_cpuid = cpuid;
116 	ctx->kc_thread = curthread;
117 
118 	set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
119 
120 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
121 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
122 		kcpc_ctx_free(ctx);
123 		return (error);
124 	}
125 
126 	set->ks_ctx = ctx;
127 	ctx->kc_set = set;
128 
129 	/*
130 	 * We must hold cpu_lock to prevent DR, offlining, or unbinding while
131 	 * we are manipulating the cpu_t and programming the hardware, else the
132 	 * cpu_t could go away while we're looking at it.
133 	 */
134 	mutex_enter(&cpu_lock);
135 	cp = cpu_get(cpuid);
136 
137 	if (cp == NULL)
138 		/*
139 		 * The CPU could have been DRd out while we were getting set up.
140 		 */
141 		goto unbound;
142 
143 	mutex_enter(&cp->cpu_cpc_ctxlock);
144 
145 	if (cp->cpu_cpc_ctx != NULL) {
146 		/*
147 		 * If this CPU already has a bound set, return an error.
148 		 */
149 		mutex_exit(&cp->cpu_cpc_ctxlock);
150 		goto unbound;
151 	}
152 
153 	if (curthread->t_bind_cpu != cpuid) {
154 		mutex_exit(&cp->cpu_cpc_ctxlock);
155 		goto unbound;
156 	}
157 	cp->cpu_cpc_ctx = ctx;
158 
159 	/*
160 	 * Kernel preemption must be disabled while fiddling with the hardware
161 	 * registers to prevent partial updates.
162 	 */
163 	kpreempt_disable();
164 	ctx->kc_rawtick = KCPC_GET_TICK();
165 	pcbe_ops->pcbe_program(ctx);
166 	kpreempt_enable();
167 
168 	mutex_exit(&cp->cpu_cpc_ctxlock);
169 	mutex_exit(&cpu_lock);
170 
171 	mutex_enter(&set->ks_lock);
172 	set->ks_state |= KCPC_SET_BOUND;
173 	cv_signal(&set->ks_condv);
174 	mutex_exit(&set->ks_lock);
175 
176 	return (0);
177 
178 unbound:
179 	mutex_exit(&cpu_lock);
180 	set->ks_ctx = NULL;
181 	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
182 	kcpc_ctx_free(ctx);
183 	return (EAGAIN);
184 }
185 
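/*
 * Bind a request set to a thread. The new context begins life frozen and is
 * installed on the thread via installctx(); it is unfrozen (and, if the
 * target is the calling thread, the hardware programmed) only once the
 * requests have been configured successfully.
 */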
186 int
187 kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode)
188 {
189 	kcpc_ctx_t	*ctx;
190 	int		error;
191 
192 	/*
193 	 * Only one set is allowed per context, so ensure there is no
194 	 * existing context.
195 	 */
196 
197 	if (t->t_cpc_ctx != NULL)
198 		return (EEXIST);
199 
200 	ctx = kcpc_ctx_alloc();
201 
202 	/*
203 	 * The context must begin life frozen until it has been properly
204 	 * programmed onto the hardware. This prevents the context ops from
205 	 * worrying about it until we're ready.
206 	 */
207 	ctx->kc_flags |= KCPC_CTX_FREEZE;
208 	ctx->kc_hrtime = gethrtime();
209 
210 	if (kcpc_assign_reqs(set, ctx) != 0) {
211 		kcpc_ctx_free(ctx);
212 		*subcode = CPC_RESOURCE_UNAVAIL;
213 		return (EINVAL);
214 	}
215 
216 	ctx->kc_cpuid = -1;
217 	if (set->ks_flags & CPC_BIND_LWP_INHERIT)
218 		ctx->kc_flags |= KCPC_CTX_LWPINHERIT;
219 	ctx->kc_thread = t;
220 	t->t_cpc_ctx = ctx;
221 	/*
222 	 * Permit threads to look at their own hardware counters from userland.
223 	 */
224 	ctx->kc_flags |= KCPC_CTX_NONPRIV;
225 
226 	/*
227 	 * Create the data store for this set.
228 	 */
229 	set->ks_data = kmem_alloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
230 
231 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
232 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
233 		kcpc_ctx_free(ctx);
234 		t->t_cpc_ctx = NULL;
235 		return (error);
236 	}
237 
238 	set->ks_ctx = ctx;
239 	ctx->kc_set = set;
240 
241 	/*
242 	 * Add a device context to the subject thread.
243 	 */
244 	installctx(t, ctx, kcpc_save, kcpc_restore, NULL,
245 	    kcpc_lwp_create, NULL, kcpc_free);
246 
247 	/*
248 	 * Ask the backend to program the hardware.
249 	 */
250 	if (t == curthread) {
251 		kpreempt_disable();
252 		ctx->kc_rawtick = KCPC_GET_TICK();
253 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
254 		pcbe_ops->pcbe_program(ctx);
255 		kpreempt_enable();
256 	} else
257 		/*
258 		 * Since we are the agent LWP, we know the victim LWP is stopped
259 		 * until we're done here; no need to worry about preemption or
260 		 * migration here. We still use an atomic op to clear the flag
261 		 * to ensure the flags are always self-consistent; they can
262 		 * still be accessed from, for instance, another CPU doing a
263 		 * kcpc_invalidate_all().
264 		 */
265 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
266 
267 	mutex_enter(&set->ks_lock);
268 	set->ks_state |= KCPC_SET_BOUND;
269 	cv_signal(&set->ks_condv);
270 	mutex_exit(&set->ks_lock);
271 
272 	return (0);
273 }
274 
275 /*
276  * Walk through each request in the set and ask the PCBE to configure a
277  * corresponding counter.
278  */
279 int
280 kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode)
281 {
282 	int		i;
283 	int		ret;
284 	kcpc_request_t	*rp;
285 
286 	for (i = 0; i < set->ks_nreqs; i++) {
287 		int n;
288 		rp = &set->ks_req[i];
289 
290 		n = rp->kr_picnum;
291 
292 		ASSERT(n >= 0 && n < cpc_ncounters);
293 
294 		ASSERT(ctx->kc_pics[n].kp_req == NULL);
295 
296 		if (rp->kr_flags & CPC_OVF_NOTIFY_EMT) {
297 			if ((pcbe_ops->pcbe_caps & CPC_CAP_OVERFLOW_INTERRUPT)
298 			    == 0) {
299 				*subcode = -1;
300 				return (ENOTSUP);
301 			}
302 			/*
303 			 * If any of the counters have requested overflow
304 			 * notification, we flag the context as being one that
305 			 * cares about overflow.
306 			 */
307 			ctx->kc_flags |= KCPC_CTX_SIGOVF;
308 		}
309 
310 		rp->kr_config = NULL;
311 		if ((ret = pcbe_ops->pcbe_configure(n, rp->kr_event,
312 		    rp->kr_preset, rp->kr_flags, rp->kr_nattrs, rp->kr_attr,
313 		    &(rp->kr_config), (void *)ctx)) != 0) {
314 			kcpc_free_configs(set);
315 			*subcode = ret;
316 			switch (ret) {
317 			case CPC_ATTR_REQUIRES_PRIVILEGE:
318 			case CPC_HV_NO_ACCESS:
319 				return (EACCES);
320 			default:
321 				return (EINVAL);
322 			}
323 		}
324 
325 		ctx->kc_pics[n].kp_req = rp;
326 		rp->kr_picp = &ctx->kc_pics[n];
327 		rp->kr_data = set->ks_data + rp->kr_index;
328 		*rp->kr_data = rp->kr_preset;
329 	}
330 
331 	return (0);
332 }
333 
334 void
335 kcpc_free_configs(kcpc_set_t *set)
336 {
337 	int i;
338 
339 	for (i = 0; i < set->ks_nreqs; i++)
340 		if (set->ks_req[i].kr_config != NULL)
341 			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
342 }
343 
344 /*
345  * buf points to a user address and the data should be copied out to that
346  * address in the current process.
347  */
348 int
349 kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick)
350 {
351 	kcpc_ctx_t	*ctx = set->ks_ctx;
352 	uint64_t	curtick = KCPC_GET_TICK();
353 
354 	mutex_enter(&set->ks_lock);
355 	if ((set->ks_state & KCPC_SET_BOUND) == 0) {
356 		mutex_exit(&set->ks_lock);
357 		return (EINVAL);
358 	}
359 	mutex_exit(&set->ks_lock);
360 
361 	if (ctx->kc_flags & KCPC_CTX_INVALID)
362 		return (EAGAIN);
363 
364 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) {
365 		/*
366 		 * Kernel preemption must be disabled while reading the
367 		 * hardware regs, and if this is a CPU-bound context, while
368 		 * checking the CPU binding of the current thread.
369 		 */
370 		kpreempt_disable();
371 
372 		if (ctx->kc_cpuid != -1) {
373 			if (curthread->t_bind_cpu != ctx->kc_cpuid) {
374 				kpreempt_enable();
375 				return (EAGAIN);
376 			}
377 		}
378 
379 		if (ctx->kc_thread == curthread) {
380 			ctx->kc_hrtime = gethrtime();
381 			pcbe_ops->pcbe_sample(ctx);
382 			ctx->kc_vtick += curtick - ctx->kc_rawtick;
383 			ctx->kc_rawtick = curtick;
384 		}
385 
386 		kpreempt_enable();
387 
388 		/*
389 		 * The config may have been invalidated by
390 		 * the pcbe_sample op.
391 		 */
392 		if (ctx->kc_flags & KCPC_CTX_INVALID)
393 			return (EAGAIN);
394 	}
395 
396 	if (copyout(set->ks_data, buf,
397 	    set->ks_nreqs * sizeof (uint64_t)) == -1)
398 		return (EFAULT);
399 	if (copyout(&ctx->kc_hrtime, hrtime, sizeof (uint64_t)) == -1)
400 		return (EFAULT);
401 	if (copyout(&ctx->kc_vtick, tick, sizeof (uint64_t)) == -1)
402 		return (EFAULT);
403 
404 	return (0);
405 }
406 
407 /*
408  * Stop the counters on the CPU this context is bound to.
409  */
410 static void
411 kcpc_stop_hw(kcpc_ctx_t *ctx)
412 {
413 	cpu_t *cp;
414 
415 	ASSERT((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED))
416 	    == KCPC_CTX_INVALID);
417 
418 	kpreempt_disable();
419 
420 	cp = cpu_get(ctx->kc_cpuid);
421 	ASSERT(cp != NULL);
422 
423 	if (cp == CPU) {
424 		pcbe_ops->pcbe_allstop();
425 		atomic_or_uint(&ctx->kc_flags,
426 		    KCPC_CTX_INVALID_STOPPED);
427 	} else
428 		kcpc_remote_stop(cp);
429 	kpreempt_enable();
430 }
431 
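/*
 * Tear down a bound set: wait for any in-progress bind to complete,
 * invalidate the context, stop the hardware if necessary, and detach the
 * context from its thread or CPU.
 */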
432 int
433 kcpc_unbind(kcpc_set_t *set)
434 {
435 	kcpc_ctx_t	*ctx;
436 	kthread_t	*t;
437 
438 	/*
439 	 * We could be racing with the process's agent thread as it
440 	 * binds the set; we must wait for the set to finish binding
441 	 * before attempting to tear it down.
442 	 */
443 	mutex_enter(&set->ks_lock);
444 	while ((set->ks_state & KCPC_SET_BOUND) == 0)
445 		cv_wait(&set->ks_condv, &set->ks_lock);
446 	mutex_exit(&set->ks_lock);
447 
448 	ctx = set->ks_ctx;
449 
450 	/*
451 	 * Use kc_lock to synchronize with kcpc_restore().
452 	 */
453 	mutex_enter(&ctx->kc_lock);
454 	ctx->kc_flags |= KCPC_CTX_INVALID;
455 	mutex_exit(&ctx->kc_lock);
456 
457 	if (ctx->kc_cpuid == -1) {
458 		t = ctx->kc_thread;
459 		/*
460 		 * The context is thread-bound and therefore has a device
461 		 * context.  It will be freed via removectx() calling
462 		 * freectx() calling kcpc_free().
463 		 */
464 		if (t == curthread &&
465 		    (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
466 			kpreempt_disable();
467 			pcbe_ops->pcbe_allstop();
468 			atomic_or_uint(&ctx->kc_flags,
469 			    KCPC_CTX_INVALID_STOPPED);
470 			kpreempt_enable();
471 		}
472 #ifdef DEBUG
473 		if (removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
474 		    kcpc_lwp_create, NULL, kcpc_free) == 0)
475 			panic("kcpc_unbind: context %p not present on thread %p",
476 			    (void *)ctx, (void *)t);
477 #else
478 		(void) removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
479 		    kcpc_lwp_create, NULL, kcpc_free);
480 #endif /* DEBUG */
481 		t->t_cpc_set = NULL;
482 		t->t_cpc_ctx = NULL;
483 	} else {
484 		/*
485 		 * If we are unbinding a CPU-bound set from a remote CPU, the
486 		 * native CPU's idle thread could be in the midst of programming
487 		 * this context onto the CPU. We grab the context's lock here to
488 		 * ensure that the idle thread is done with it. When we release
489 		 * the lock, the CPU no longer has a context and the idle thread
490 		 * will move on.
491 		 *
492 		 * cpu_lock must be held to prevent the CPU from being DR'd out
493 		 * while we disassociate the context from the cpu_t.
494 		 */
495 		cpu_t *cp;
496 		mutex_enter(&cpu_lock);
497 		cp = cpu_get(ctx->kc_cpuid);
498 		if (cp != NULL) {
499 			/*
500 			 * The CPU may have been DR'd out of the system.
501 			 */
502 			mutex_enter(&cp->cpu_cpc_ctxlock);
503 			if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0)
504 				kcpc_stop_hw(ctx);
505 			ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
506 			cp->cpu_cpc_ctx = NULL;
507 			mutex_exit(&cp->cpu_cpc_ctxlock);
508 		}
509 		mutex_exit(&cpu_lock);
510 		if (ctx->kc_thread == curthread) {
511 			kcpc_free(ctx, 0);
512 			curthread->t_cpc_set = NULL;
513 		}
514 	}
515 
516 	return (0);
517 }
518 
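/*
 * Set a new preset (starting) value for the request with the given index in
 * a thread-bound set owned by the calling thread.
 */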
519 int
520 kcpc_preset(kcpc_set_t *set, int index, uint64_t preset)
521 {
522 	int i;
523 
524 	ASSERT(set != NULL);
525 	ASSERT(set->ks_state & KCPC_SET_BOUND);
526 	ASSERT(set->ks_ctx->kc_thread == curthread);
527 	ASSERT(set->ks_ctx->kc_cpuid == -1);
528 
529 	if (index < 0 || index >= set->ks_nreqs)
530 		return (EINVAL);
531 
532 	for (i = 0; i < set->ks_nreqs; i++)
533 		if (set->ks_req[i].kr_index == index)
534 			break;
535 	ASSERT(i != set->ks_nreqs);
536 
537 	set->ks_req[i].kr_preset = preset;
538 	return (0);
539 }
540 
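/*
 * Restart a thread-bound set on the calling thread: stop the counters if
 * they are running, reload each request's preset value, and reprogram the
 * hardware.
 */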
541 int
542 kcpc_restart(kcpc_set_t *set)
543 {
544 	kcpc_ctx_t	*ctx = set->ks_ctx;
545 	int		i;
546 
547 	ASSERT(set->ks_state & KCPC_SET_BOUND);
548 	ASSERT(ctx->kc_thread == curthread);
549 	ASSERT(ctx->kc_cpuid == -1);
550 
551 	kpreempt_disable();
552 
553 	/*
554 	 * If the user is doing this on a running set, make sure the counters
555 	 * are stopped first.
556 	 */
557 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
558 		pcbe_ops->pcbe_allstop();
559 
560 	for (i = 0; i < set->ks_nreqs; i++) {
561 		*(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset;
562 		pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset,
563 		    0, 0, NULL, &set->ks_req[i].kr_config, NULL);
564 	}
565 
566 	/*
567 	 * Ask the backend to program the hardware.
568 	 */
569 	ctx->kc_rawtick = KCPC_GET_TICK();
570 	atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
571 	pcbe_ops->pcbe_program(ctx);
572 	kpreempt_enable();
573 
574 	return (0);
575 }
576 
577 /*
578  * Caller must hold kcpc_cpuctx_lock.
579  */
580 int
581 kcpc_enable(kthread_t *t, int cmd, int enable)
582 {
583 	kcpc_ctx_t	*ctx = t->t_cpc_ctx;
584 	kcpc_set_t	*set = t->t_cpc_set;
585 	kcpc_set_t	*newset;
586 	int		i;
587 	int		flag;
588 	int		err;
589 
590 	ASSERT(RW_READ_HELD(&kcpc_cpuctx_lock));
591 
592 	if (ctx == NULL) {
593 		/*
594 		 * This thread has a set but no context; it must be a
595 		 * CPU-bound set.
596 		 */
597 		ASSERT(t->t_cpc_set != NULL);
598 		ASSERT(t->t_cpc_set->ks_ctx->kc_cpuid != -1);
599 		return (EINVAL);
600 	} else if (ctx->kc_flags & KCPC_CTX_INVALID)
601 		return (EAGAIN);
602 
603 	if (cmd == CPC_ENABLE) {
604 		if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
605 			return (EINVAL);
606 		kpreempt_disable();
607 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
608 		kcpc_restore(ctx);
609 		kpreempt_enable();
610 	} else if (cmd == CPC_DISABLE) {
611 		if (ctx->kc_flags & KCPC_CTX_FREEZE)
612 			return (EINVAL);
613 		kpreempt_disable();
614 		kcpc_save(ctx);
615 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE);
616 		kpreempt_enable();
617 	} else if (cmd == CPC_USR_EVENTS || cmd == CPC_SYS_EVENTS) {
618 		/*
619 		 * Strategy for usr/sys: stop counters and update set's presets
620 		 * with current counter values, unbind, update requests with
621 		 * new config, then re-bind.
622 		 */
623 		flag = (cmd == CPC_USR_EVENTS) ?
624 		    CPC_COUNT_USER: CPC_COUNT_SYSTEM;
625 
626 		kpreempt_disable();
627 		atomic_or_uint(&ctx->kc_flags,
628 		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
629 		pcbe_ops->pcbe_allstop();
630 		kpreempt_enable();
631 		for (i = 0; i < set->ks_nreqs; i++) {
632 			set->ks_req[i].kr_preset = *(set->ks_req[i].kr_data);
633 			if (enable)
634 				set->ks_req[i].kr_flags |= flag;
635 			else
636 				set->ks_req[i].kr_flags &= ~flag;
637 		}
638 		newset = kcpc_dup_set(set);
639 		if (kcpc_unbind(set) != 0)
640 			return (EINVAL);
641 		t->t_cpc_set = newset;
642 		if (kcpc_bind_thread(newset, t, &err) != 0) {
643 			t->t_cpc_set = NULL;
644 			kcpc_free_set(newset);
645 			return (EINVAL);
646 		}
647 	} else
648 		return (EINVAL);
649 
650 	return (0);
651 }
652 
653 /*
654  * Provide PCBEs with a way of obtaining the configs of every counter which will
655  * be programmed together.
656  *
657  * If current is NULL, provide the first config.
658  *
659  * If data != NULL, caller wants to know where the data store associated with
660  * the config we return is located.
661  */
662 void *
663 kcpc_next_config(void *token, void *current, uint64_t **data)
664 {
665 	int		i;
666 	kcpc_pic_t	*pic;
667 	kcpc_ctx_t *ctx = (kcpc_ctx_t *)token;
668 
669 	if (current == NULL) {
670 		/*
671 		 * Client would like the first config, which may not be in
672 		 * counter 0; we need to search through the counters for the
673 		 * first config.
674 		 */
675 		for (i = 0; i < cpc_ncounters; i++)
676 			if (ctx->kc_pics[i].kp_req != NULL)
677 				break;
678 		/*
679 		 * There are no counters configured for the given context.
680 		 */
681 		if (i == cpc_ncounters)
682 			return (NULL);
683 	} else {
684 		/*
685 		 * There surely is a faster way to do this.
686 		 */
687 		for (i = 0; i < cpc_ncounters; i++) {
688 			pic = &ctx->kc_pics[i];
689 
690 			if (pic->kp_req != NULL &&
691 			    current == pic->kp_req->kr_config)
692 				break;
693 		}
694 
695 		/*
696 		 * We found the current config at picnum i. Now search for the
697 		 * next configured PIC.
698 		 */
699 		for (i++; i < cpc_ncounters; i++) {
700 			pic = &ctx->kc_pics[i];
701 			if (pic->kp_req != NULL)
702 				break;
703 		}
704 
705 		if (i == cpc_ncounters)
706 			return (NULL);
707 	}
708 
709 	if (data != NULL) {
710 		*data = ctx->kc_pics[i].kp_req->kr_data;
711 	}
712 
713 	return (ctx->kc_pics[i].kp_req->kr_config);
714 }
715 
716 
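/*
 * Allocate a new context, link it into its hash bucket (so that
 * kcpc_invalidate_all() can find it), and allocate its per-counter pic
 * state.
 */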
717 kcpc_ctx_t *
718 kcpc_ctx_alloc(void)
719 {
720 	kcpc_ctx_t	*ctx;
721 	long		hash;
722 
723 	ctx = (kcpc_ctx_t *)kmem_zalloc(sizeof (kcpc_ctx_t), KM_SLEEP);
724 
725 	hash = CPC_HASH_CTX(ctx);
726 	mutex_enter(&kcpc_ctx_llock[hash]);
727 	ctx->kc_next = kcpc_ctx_list[hash];
728 	kcpc_ctx_list[hash] = ctx;
729 	mutex_exit(&kcpc_ctx_llock[hash]);
730 
731 	ctx->kc_pics = (kcpc_pic_t *)kmem_zalloc(sizeof (kcpc_pic_t) *
732 	    cpc_ncounters, KM_SLEEP);
733 
734 	ctx->kc_cpuid = -1;
735 
736 	return (ctx);
737 }
738 
739 /*
740  * Copy set from ctx to the child context, cctx, if it has CPC_BIND_LWP_INHERIT
741  * in the flags.
742  */
743 static void
744 kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx)
745 {
746 	kcpc_set_t	*ks = ctx->kc_set, *cks;
747 	int		i, j;
748 	int		code;
749 
750 	ASSERT(ks != NULL);
751 
752 	if ((ks->ks_flags & CPC_BIND_LWP_INHERIT) == 0)
753 		return;
754 
755 	cks = kmem_zalloc(sizeof (*cks), KM_SLEEP);
756 	cks->ks_state &= ~KCPC_SET_BOUND;
757 	cctx->kc_set = cks;
758 	cks->ks_flags = ks->ks_flags;
759 	cks->ks_nreqs = ks->ks_nreqs;
760 	cks->ks_req = kmem_alloc(cks->ks_nreqs *
761 	    sizeof (kcpc_request_t), KM_SLEEP);
762 	cks->ks_data = kmem_alloc(cks->ks_nreqs * sizeof (uint64_t),
763 	    KM_SLEEP);
764 	cks->ks_ctx = cctx;
765 
766 	for (i = 0; i < cks->ks_nreqs; i++) {
767 		cks->ks_req[i].kr_index = ks->ks_req[i].kr_index;
768 		cks->ks_req[i].kr_picnum = ks->ks_req[i].kr_picnum;
769 		(void) strncpy(cks->ks_req[i].kr_event,
770 		    ks->ks_req[i].kr_event, CPC_MAX_EVENT_LEN);
771 		cks->ks_req[i].kr_preset = ks->ks_req[i].kr_preset;
772 		cks->ks_req[i].kr_flags = ks->ks_req[i].kr_flags;
773 		cks->ks_req[i].kr_nattrs = ks->ks_req[i].kr_nattrs;
774 		if (ks->ks_req[i].kr_nattrs > 0) {
775 			cks->ks_req[i].kr_attr =
776 			    kmem_alloc(ks->ks_req[i].kr_nattrs *
777 			    sizeof (kcpc_attr_t), KM_SLEEP);
778 		}
779 		for (j = 0; j < ks->ks_req[i].kr_nattrs; j++) {
780 			(void) strncpy(cks->ks_req[i].kr_attr[j].ka_name,
781 			    ks->ks_req[i].kr_attr[j].ka_name,
782 			    CPC_MAX_ATTR_LEN);
783 			cks->ks_req[i].kr_attr[j].ka_val =
784 			    ks->ks_req[i].kr_attr[j].ka_val;
785 		}
786 	}
787 	if (kcpc_configure_reqs(cctx, cks, &code) != 0)
788 		kcpc_invalidate_config(cctx);
789 
790 	mutex_enter(&cks->ks_lock);
791 	cks->ks_state |= KCPC_SET_BOUND;
792 	cv_signal(&cks->ks_condv);
793 	mutex_exit(&cks->ks_lock);
794 }
795 
796 
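/*
 * Unlink a context from its hash bucket and free it along with its pic
 * array.
 */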
797 void
798 kcpc_ctx_free(kcpc_ctx_t *ctx)
799 {
800 	kcpc_ctx_t	**loc;
801 	long		hash = CPC_HASH_CTX(ctx);
802 
803 	mutex_enter(&kcpc_ctx_llock[hash]);
804 	loc = &kcpc_ctx_list[hash];
805 	ASSERT(*loc != NULL);
806 	while (*loc != ctx)
807 		loc = &(*loc)->kc_next;
808 	*loc = ctx->kc_next;
809 	mutex_exit(&kcpc_ctx_llock[hash]);
810 
811 	kmem_free(ctx->kc_pics, cpc_ncounters * sizeof (kcpc_pic_t));
812 	cv_destroy(&ctx->kc_condv);
813 	mutex_destroy(&ctx->kc_lock);
814 	kmem_free(ctx, sizeof (*ctx));
815 }
816 
817 /*
818  * Generic interrupt handler used on hardware that generates
819  * overflow interrupts.
820  *
821  * Note: executed at high-level interrupt context!
822  */
823 /*ARGSUSED*/
824 kcpc_ctx_t *
825 kcpc_overflow_intr(caddr_t arg, uint64_t bitmap)
826 {
827 	kcpc_ctx_t	*ctx;
828 	kthread_t	*t = curthread;
829 	int		i;
830 
831 	/*
832 	 * On both x86 and UltraSPARC, we may deliver the high-level
833 	 * interrupt in kernel mode, just after we've started to run an
834 	 * interrupt thread.  (That's because the hardware helpfully
835 	 * delivers the overflow interrupt some random number of cycles
836 	 * after the instruction that caused the overflow by which time
837 	 * we're in some part of the kernel, not necessarily running on
838 	 * the right thread).
839 	 *
840 	 * Check for this case here -- find the pinned thread
841 	 * that was running when the interrupt went off.
842 	 */
843 	if (t->t_flag & T_INTR_THREAD) {
844 		klwp_t *lwp;
845 
846 		atomic_add_32(&kcpc_intrctx_count, 1);
847 
848 		/*
849 		 * Note that t_lwp is always set to point at the underlying
850 		 * thread, thus this will work in the presence of nested
851 		 * interrupts.
852 		 */
853 		ctx = NULL;
854 		if ((lwp = t->t_lwp) != NULL) {
855 			t = lwptot(lwp);
856 			ctx = t->t_cpc_ctx;
857 		}
858 	} else
859 		ctx = t->t_cpc_ctx;
860 
861 	if (ctx == NULL) {
862 		/*
863 		 * This can easily happen if we're using the counters in
864 		 * "shared" mode, for example, and an overflow interrupt
865 		 * occurs while we are running cpustat.  In that case, the
866 		 * bound thread that has the context that belongs to this
867 		 * CPU is almost certainly sleeping (if it was running on
868 		 * the CPU we'd have found it above), and the actual
869 		 * interrupted thread has no knowledge of performance counters!
870 		 */
871 		ctx = curthread->t_cpu->cpu_cpc_ctx;
872 		if (ctx != NULL) {
873 			/*
874 			 * Return the bound context for this CPU to
875 			 * the interrupt handler so that it can synchronously
876 			 * sample the hardware counters and restart them.
877 			 */
878 			return (ctx);
879 		}
880 
881 		/*
882 		 * As long as the overflow interrupt really is delivered early
883 		 * enough after trapping into the kernel to avoid switching
884 		 * threads, we must always be able to find the cpc context,
885 		 * or something went terribly wrong i.e. we ended up
886 		 * running a passivated interrupt thread, a kernel
887 		 * thread or we interrupted idle, all of which are Very Bad.
888 		 *
889 		 * We also could end up here owing to an incredibly unlikely
890 		 * race condition that exists on x86-based architectures when
891 		 * the cpc provider is in use; overflow interrupts are directed
892 		 * to the cpc provider if the 'dtrace_cpc_in_use' variable is
893 		 * set when we enter the handler. This variable is unset after
894 		 * overflow interrupts have been disabled on all CPUs and all
895 		 * contexts have been torn down. To stop interrupts, the cpc
896 		 * provider issues an xcall to the remote CPU before it tears
897 		 * down that CPU's context. Since high priority xcalls on x86
898 		 * execute at a higher PIL than this handler, it is possible
899 		 * (though extremely unlikely) that the xcall could interrupt
900 		 * the overflow handler before the handler has checked the
901 		 * 'dtrace_cpc_in_use' variable, stop the counters, and return
902 		 * to the cpc provider, which could then tear down the contexts
903 		 * and unset 'dtrace_cpc_in_use' *before* the CPU's overflow
904 		 * handler has had a chance to check the variable. In that
905 		 * case, the handler would direct the overflow into this code
906 		 * and no valid context would be found. The default behavior
907 		 * when no valid context is found is now to shout a warning to
908 		 * the console and bump the 'kcpc_nullctx_count' variable.
909 		 */
910 		if (kcpc_nullctx_panic)
911 			panic("null cpc context, thread %p", (void *)t);
912 
913 		cmn_err(CE_WARN,
914 		    "null cpc context found in overflow handler!\n");
915 		atomic_add_32(&kcpc_nullctx_count, 1);
916 	} else if ((ctx->kc_flags & KCPC_CTX_INVALID) == 0) {
917 		/*
918 		 * Schedule an ast to sample the counters, which will
919 		 * propagate any overflow into the virtualized performance
920 		 * counter(s), and may deliver a signal.
921 		 */
922 		ttolwp(t)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
923 		/*
924 		 * If a counter has overflowed which was counting on behalf of
925 		 * a request which specified CPC_OVF_NOTIFY_EMT, send the
926 		 * process a signal.
927 		 */
928 		for (i = 0; i < cpc_ncounters; i++) {
929 			if (ctx->kc_pics[i].kp_req != NULL &&
930 			    bitmap & (1 << i) &&
931 			    ctx->kc_pics[i].kp_req->kr_flags &
932 			    CPC_OVF_NOTIFY_EMT) {
933 				/*
934 				 * A signal has been requested for this PIC, so
935 				 * freeze the context. The interrupt handler
936 				 * has already stopped the counter hardware.
937 				 */
938 				atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE);
939 				atomic_or_uint(&ctx->kc_pics[i].kp_flags,
940 				    KCPC_PIC_OVERFLOWED);
941 			}
942 		}
943 		aston(t);
944 	}
945 	return (NULL);
946 }
947 
948 /*
949  * The current thread context had an overflow interrupt; we're
950  * executing here in high-level interrupt context.
951  */
952 /*ARGSUSED*/
953 uint_t
954 kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2)
955 {
956 	kcpc_ctx_t *ctx;
957 	uint64_t bitmap;
958 	uint8_t *state;
959 
960 	if (pcbe_ops == NULL ||
961 	    (bitmap = pcbe_ops->pcbe_overflow_bitmap()) == 0)
962 		return (DDI_INTR_UNCLAIMED);
963 
964 	/*
965 	 * Prevent any further interrupts.
966 	 */
967 	pcbe_ops->pcbe_allstop();
968 
969 	if (dtrace_cpc_in_use) {
970 		state = &cpu_core[CPU->cpu_id].cpuc_dcpc_intr_state;
971 
972 		/*
973 		 * If the per-CPU state is currently free, set it to indicate
974 		 * that we are processing an interrupt. Drop the
975 		 * interrupt if the state isn't free (i.e. a configuration
976 		 * event is taking place).
977 		 */
978 		if (atomic_cas_8(state, DCPC_INTR_FREE,
979 		    DCPC_INTR_PROCESSING) == DCPC_INTR_FREE) {
980 			int i;
981 			kcpc_request_t req;
982 
983 			ASSERT(dtrace_cpc_fire != NULL);
984 
985 			(*dtrace_cpc_fire)(bitmap);
986 
987 			ctx = curthread->t_cpu->cpu_cpc_ctx;
988 
989 			/* Reset any counters that have overflowed */
990 			for (i = 0; i < ctx->kc_set->ks_nreqs; i++) {
991 				req = ctx->kc_set->ks_req[i];
992 
993 				if (bitmap & (1 << req.kr_picnum)) {
994 					pcbe_ops->pcbe_configure(req.kr_picnum,
995 					    req.kr_event, req.kr_preset,
996 					    req.kr_flags, req.kr_nattrs,
997 					    req.kr_attr, &(req.kr_config),
998 					    (void *)ctx);
999 				}
1000 			}
1001 			pcbe_ops->pcbe_program(ctx);
1002 
1003 			/*
1004 			 * We've finished processing the interrupt so set
1005 			 * the state back to free.
1006 			 */
1007 			cpu_core[CPU->cpu_id].cpuc_dcpc_intr_state =
1008 			    DCPC_INTR_FREE;
1009 			membar_producer();
1010 		}
1011 		return (DDI_INTR_CLAIMED);
1012 	}
1013 
1014 	/*
1015 	 * DTrace isn't involved so pass on accordingly.
1016 	 *
1017 	 * If the interrupt has occurred in the context of an lwp owning
1018 	 * the counters, then the handler posts an AST to the lwp to
1019 	 * trigger the actual sampling, and optionally deliver a signal or
1020 	 * restart the counters, on the way out of the kernel using
1021 	 * kcpc_overflow_ast() (see below).
1022 	 *
1023 	 * On the other hand, if the handler returns the context to us
1024 	 * directly, then it means that there are no other threads in
1025 	 * the middle of updating it, no AST has been posted, and so we
1026 	 * should sample the counters here, and restart them with no
1027 	 * further fuss.
1028 	 */
1029 	if ((ctx = kcpc_overflow_intr(arg1, bitmap)) != NULL) {
1030 		uint64_t curtick = KCPC_GET_TICK();
1031 
1032 		ctx->kc_hrtime = gethrtime_waitfree();
1033 		ctx->kc_vtick += curtick - ctx->kc_rawtick;
1034 		ctx->kc_rawtick = curtick;
1035 		pcbe_ops->pcbe_sample(ctx);
1036 		pcbe_ops->pcbe_program(ctx);
1037 	}
1038 
1039 	return (DDI_INTR_CLAIMED);
1040 }
1041 
1042 /*
1043  * Called from trap() when processing the ast posted by the high-level
1044  * interrupt handler.
1045  */
1046 int
1047 kcpc_overflow_ast()
1048 {
1049 	kcpc_ctx_t	*ctx = curthread->t_cpc_ctx;
1050 	int		i;
1051 	int		found = 0;
1052 	uint64_t	curtick = KCPC_GET_TICK();
1053 
1054 	ASSERT(ctx != NULL);	/* Beware of interrupt skid. */
1055 
1056 	/*
1057 	 * An overflow happened: sample the context to ensure that
1058 	 * the overflow is propagated into the upper bits of the
1059 	 * virtualized 64-bit counter(s).
1060 	 */
1061 	kpreempt_disable();
1062 	ctx->kc_hrtime = gethrtime_waitfree();
1063 	pcbe_ops->pcbe_sample(ctx);
1064 	kpreempt_enable();
1065 
1066 	ctx->kc_vtick += curtick - ctx->kc_rawtick;
1067 
1068 	/*
1069 	 * The interrupt handler has marked any pics with KCPC_PIC_OVERFLOWED
1070 	 * if that pic generated an overflow and if the request it was counting
1071 	 * on behalf of had CPC_OVF_NOTIFY_EMT specified. We go through all
1072 	 * pics in the context and clear the KCPC_PIC_OVERFLOWED flags. If we
1073 	 * found any overflowed pics, keep the context frozen and return true
1074 	 * (thus causing a signal to be sent).
1075 	 */
1076 	for (i = 0; i < cpc_ncounters; i++) {
1077 		if (ctx->kc_pics[i].kp_flags & KCPC_PIC_OVERFLOWED) {
1078 			atomic_and_uint(&ctx->kc_pics[i].kp_flags,
1079 			    ~KCPC_PIC_OVERFLOWED);
1080 			found = 1;
1081 		}
1082 	}
1083 	if (found)
1084 		return (1);
1085 
1086 	/*
1087 	 * Otherwise, re-enable the counters and continue life as before.
1088 	 */
1089 	kpreempt_disable();
1090 	atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
1091 	pcbe_ops->pcbe_program(ctx);
1092 	kpreempt_enable();
1093 	return (0);
1094 }
1095 
1096 /*
1097  * Called when switching away from current thread.
1098  */
1099 static void
1100 kcpc_save(kcpc_ctx_t *ctx)
1101 {
1102 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
1103 		if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)
1104 			return;
1105 		/*
1106 		 * This context has been invalidated but the counters have not
1107 		 * been stopped. Stop them here and mark the context stopped.
1108 		 */
1109 		pcbe_ops->pcbe_allstop();
1110 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID_STOPPED);
1111 		return;
1112 	}
1113 
1114 	pcbe_ops->pcbe_allstop();
1115 	if (ctx->kc_flags & KCPC_CTX_FREEZE)
1116 		return;
1117 
1118 	/*
1119 	 * Need to sample for all reqs into each req's current mpic.
1120 	 */
1121 	ctx->kc_hrtime = gethrtime();
1122 	ctx->kc_vtick += KCPC_GET_TICK() - ctx->kc_rawtick;
1123 	pcbe_ops->pcbe_sample(ctx);
1124 }
1125 
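/*
 * Called when switching to a thread that owns a context: reprogram the
 * hardware, unless the context has been invalidated or frozen.
 */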
1126 static void
1127 kcpc_restore(kcpc_ctx_t *ctx)
1128 {
1129 	mutex_enter(&ctx->kc_lock);
1130 	if ((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) ==
1131 	    KCPC_CTX_INVALID)
1132 		/*
1133 		 * The context is invalidated but has not been marked stopped.
1134 		 * We mark it as such here because we will not start the
1135 		 * counters during this context switch.
1136 		 */
1137 		ctx->kc_flags |= KCPC_CTX_INVALID_STOPPED;
1138 
1139 
1140 	if (ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_FREEZE)) {
1141 		mutex_exit(&ctx->kc_lock);
1142 		return;
1143 	}
1144 
1145 	/*
1146 	 * Set kc_flags to show that a kcpc_restore() is in progress to avoid
1147 	 * ctx & set related memory objects being freed without us knowing.
1148 	 * This can happen if an agent thread is executing a kcpc_unbind(),
1149 	 * with this thread as the target, whilst we're concurrently doing a
1150 	 * restorectx() during, for example, a proc_exit().  Effectively, by
1151 	 * doing this, we're asking kcpc_free() to cv_wait() until
1152 	 * kcpc_restore() has completed.
1153 	 */
1154 	ctx->kc_flags |= KCPC_CTX_RESTORE;
1155 	mutex_exit(&ctx->kc_lock);
1156 
1157 	/*
1158 	 * While programming the hardware, the counters should be stopped. We
1159 	 * don't do an explicit pcbe_allstop() here because they should have
1160 	 * been stopped already by the last consumer.
1161 	 */
1162 	ctx->kc_rawtick = KCPC_GET_TICK();
1163 	pcbe_ops->pcbe_program(ctx);
1164 
1165 	/*
1166 	 * Wake the agent thread if it's waiting in kcpc_free().
1167 	 */
1168 	mutex_enter(&ctx->kc_lock);
1169 	ctx->kc_flags &= ~KCPC_CTX_RESTORE;
1170 	cv_signal(&ctx->kc_condv);
1171 	mutex_exit(&ctx->kc_lock);
1172 }
1173 
1174 /*
1175  * If kcpc_counts_include_idle is set to 0 by the sys admin, we add the
1176  * following context operators to the idle thread on each CPU. They stop the
1177  * counters when the idle thread is switched on, and they start them again when
1178  * it is switched off.
1179  */
1180 
1181 /*ARGSUSED*/
1182 void
1183 kcpc_idle_save(struct cpu *cp)
1184 {
1185 	/*
1186 	 * The idle thread shouldn't be run anywhere else.
1187 	 */
1188 	ASSERT(CPU == cp);
1189 
1190 	/*
1191 	 * We must hold the CPU's context lock to ensure the context isn't freed
1192 	 * while we're looking at it.
1193 	 */
1194 	mutex_enter(&cp->cpu_cpc_ctxlock);
1195 
1196 	if ((cp->cpu_cpc_ctx == NULL) ||
1197 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
1198 		mutex_exit(&cp->cpu_cpc_ctxlock);
1199 		return;
1200 	}
1201 
1202 	pcbe_ops->pcbe_program(cp->cpu_cpc_ctx);
1203 	mutex_exit(&cp->cpu_cpc_ctxlock);
1204 }
1205 
1206 void
1207 kcpc_idle_restore(struct cpu *cp)
1208 {
1209 	/*
1210 	 * The idle thread shouldn't be run anywhere else.
1211 	 */
1212 	ASSERT(CPU == cp);
1213 
1214 	/*
1215 	 * We must hold the CPU's context lock to ensure the context isn't freed
1216 	 * while we're looking at it.
1217 	 */
1218 	mutex_enter(&cp->cpu_cpc_ctxlock);
1219 
1220 	if ((cp->cpu_cpc_ctx == NULL) ||
1221 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
1222 		mutex_exit(&cp->cpu_cpc_ctxlock);
1223 		return;
1224 	}
1225 
1226 	pcbe_ops->pcbe_allstop();
1227 	mutex_exit(&cp->cpu_cpc_ctxlock);
1228 }
1229 
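/*
 * lwp_create() context operator: if the parent's context has
 * KCPC_CTX_LWPINHERIT set, clone its set onto the new LWP and install the
 * same context operators on it.
 */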
1230 /*ARGSUSED*/
1231 static void
1232 kcpc_lwp_create(kthread_t *t, kthread_t *ct)
1233 {
1234 	kcpc_ctx_t	*ctx = t->t_cpc_ctx, *cctx;
1235 	int		i;
1236 
1237 	if (ctx == NULL || (ctx->kc_flags & KCPC_CTX_LWPINHERIT) == 0)
1238 		return;
1239 
1240 	rw_enter(&kcpc_cpuctx_lock, RW_READER);
1241 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
1242 		rw_exit(&kcpc_cpuctx_lock);
1243 		return;
1244 	}
1245 	cctx = kcpc_ctx_alloc();
1246 	kcpc_ctx_clone(ctx, cctx);
1247 	rw_exit(&kcpc_cpuctx_lock);
1248 
1249 	/*
1250 	 * Copy the parent context's kc_flags field, but don't overwrite
1251 	 * the child's in case it was modified during kcpc_ctx_clone.
1252 	 */
1253 	cctx->kc_flags |= ctx->kc_flags;
1254 	cctx->kc_thread = ct;
1255 	cctx->kc_cpuid = -1;
1256 	ct->t_cpc_set = cctx->kc_set;
1257 	ct->t_cpc_ctx = cctx;
1258 
1259 	if (cctx->kc_flags & KCPC_CTX_SIGOVF) {
1260 		kcpc_set_t *ks = cctx->kc_set;
1261 		/*
1262 		 * Our contract with the user requires us to immediately send an
1263 		 * overflow signal to all children if we have the LWPINHERIT
1264 		 * and SIGOVF flags set. In addition, all counters should be
1265 		 * set to UINT64_MAX, and their pic's overflow flag turned on
1266 		 * so that our trap() processing knows to send a signal.
1267 		 */
1268 		atomic_or_uint(&cctx->kc_flags, KCPC_CTX_FREEZE);
1269 		for (i = 0; i < ks->ks_nreqs; i++) {
1270 			kcpc_request_t *kr = &ks->ks_req[i];
1271 
1272 			if (kr->kr_flags & CPC_OVF_NOTIFY_EMT) {
1273 				*(kr->kr_data) = UINT64_MAX;
1274 				kr->kr_picp->kp_flags |= KCPC_PIC_OVERFLOWED;
1275 			}
1276 		}
1277 		ttolwp(ct)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
1278 		aston(ct);
1279 	}
1280 
1281 	installctx(ct, cctx, kcpc_save, kcpc_restore,
1282 	    NULL, kcpc_lwp_create, NULL, kcpc_free);
1283 }
1284 
1285 /*
1286  * Counter Stoppage Theory
1287  *
1288  * The counters may need to be stopped properly at the following occasions:
1289  *
1290  * 1) An LWP exits.
1291  * 2) A thread exits.
1292  * 3) An LWP performs an exec().
1293  * 4) A bound set is unbound.
1294  *
1295  * In addition to stopping the counters, the CPC context (a kcpc_ctx_t) may need
1296  * to be freed as well.
1297  *
1298  * Case 1: kcpc_passivate(), called via lwp_exit(), stops the counters. Later on
1299  * when the thread is freed, kcpc_free(), called by freectx(), frees the
1300  * context.
1301  *
1302  * Case 2: same as case 1 except kcpc_passivate is called from thread_exit().
1303  *
1304  * Case 3: kcpc_free(), called via freectx() via exec(), recognizes that it has
1305  * been called from exec. It stops the counters _and_ frees the context.
1306  *
1307  * Case 4: kcpc_unbind() stops the hardware _and_ frees the context.
1308  *
1309  * CPU-bound counters are always stopped via kcpc_unbind().
1310  */
1311 
1312 /*
1313  * We're being called to delete the context; we ensure that all associated data
1314  * structures are freed, and that the hardware is passivated if this is an exec.
1315  */
1316 
1317 /*ARGSUSED*/
1318 static void
1319 kcpc_free(kcpc_ctx_t *ctx, int isexec)
1320 {
1321 	int		i;
1322 	kcpc_set_t	*set = ctx->kc_set;
1323 
1324 	ASSERT(set != NULL);
1325 
1326 	/*
1327 	 * Wait for kcpc_restore() to finish before we tear things down.
1328 	 */
1329 	mutex_enter(&ctx->kc_lock);
1330 	while (ctx->kc_flags & KCPC_CTX_RESTORE)
1331 		cv_wait(&ctx->kc_condv, &ctx->kc_lock);
1332 	ctx->kc_flags |= KCPC_CTX_INVALID;
1333 	mutex_exit(&ctx->kc_lock);
1334 
1335 	if (isexec) {
1336 		/*
1337 		 * This thread is execing, and after the exec it should not have
1338 		 * any performance counter context. Stop the counters properly
1339 		 * here so the system isn't surprised by an overflow interrupt
1340 		 * later.
1341 		 */
1342 		if (ctx->kc_cpuid != -1) {
1343 			cpu_t *cp;
1344 			/*
1345 			 * CPU-bound context; stop the appropriate CPU's ctrs.
1346 			 * Hold cpu_lock while examining the CPU to ensure it
1347 			 * doesn't go away.
1348 			 */
1349 			mutex_enter(&cpu_lock);
1350 			cp = cpu_get(ctx->kc_cpuid);
1351 			/*
1352 			 * The CPU could have been DR'd out, so only stop the
1353 			 * CPU and clear its context pointer if the CPU still
1354 			 * exists.
1355 			 */
1356 			if (cp != NULL) {
1357 				mutex_enter(&cp->cpu_cpc_ctxlock);
1358 				kcpc_stop_hw(ctx);
1359 				cp->cpu_cpc_ctx = NULL;
1360 				mutex_exit(&cp->cpu_cpc_ctxlock);
1361 			}
1362 			mutex_exit(&cpu_lock);
1363 			ASSERT(curthread->t_cpc_ctx == NULL);
1364 		} else {
1365 			/*
1366 			 * Thread-bound context; stop _this_ CPU's counters.
1367 			 */
1368 			kpreempt_disable();
1369 			pcbe_ops->pcbe_allstop();
1370 			atomic_or_uint(&ctx->kc_flags,
1371 			    KCPC_CTX_INVALID_STOPPED);
1372 			kpreempt_enable();
1373 			curthread->t_cpc_ctx = NULL;
1374 		}
1375 
1376 		/*
1377 		 * Since we are being called from an exec and we know that
1378 		 * exec is not permitted via the agent thread, we should clean
1379 		 * up this thread's CPC state completely, and not leave dangling
1380 		 * CPC pointers behind.
1381 		 */
1382 		ASSERT(ctx->kc_thread == curthread);
1383 		curthread->t_cpc_set = NULL;
1384 	}
1385 
1386 	/*
1387 	 * Walk through each request in this context's set and free the PCBE's
1388 	 * configuration if it exists.
1389 	 */
1390 	for (i = 0; i < set->ks_nreqs; i++) {
1391 		if (set->ks_req[i].kr_config != NULL)
1392 			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
1393 	}
1394 
1395 	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
1396 	kcpc_ctx_free(ctx);
1397 	kcpc_free_set(set);
1398 }
1399 
1400 /*
1401  * Free the memory associated with a request set.
1402  */
1403 void
1404 kcpc_free_set(kcpc_set_t *set)
1405 {
1406 	int		i;
1407 	kcpc_request_t	*req;
1408 
1409 	ASSERT(set->ks_req != NULL);
1410 
1411 	for (i = 0; i < set->ks_nreqs; i++) {
1412 		req = &set->ks_req[i];
1413 
1414 		if (req->kr_nattrs != 0) {
1415 			kmem_free(req->kr_attr,
1416 			    req->kr_nattrs * sizeof (kcpc_attr_t));
1417 		}
1418 	}
1419 
1420 	kmem_free(set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs);
1421 	cv_destroy(&set->ks_condv);
1422 	mutex_destroy(&set->ks_lock);
1423 	kmem_free(set, sizeof (kcpc_set_t));
1424 }
1425 
1426 /*
1427  * Grab every existing context and mark it as invalid.
1428  */
1429 void
1430 kcpc_invalidate_all(void)
1431 {
1432 	kcpc_ctx_t *ctx;
1433 	long hash;
1434 
1435 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) {
1436 		mutex_enter(&kcpc_ctx_llock[hash]);
1437 		for (ctx = kcpc_ctx_list[hash]; ctx; ctx = ctx->kc_next)
1438 			atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
1439 		mutex_exit(&kcpc_ctx_llock[hash]);
1440 	}
1441 }
1442 
1443 /*
1444  * Interface for PCBEs to signal that an existing configuration has suddenly
1445  * become invalid.
1446  */
1447 void
1448 kcpc_invalidate_config(void *token)
1449 {
1450 	kcpc_ctx_t *ctx = token;
1451 
1452 	ASSERT(ctx != NULL);
1453 
1454 	atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
1455 }
1456 
1457 /*
1458  * Called from lwp_exit() and thread_exit()
1459  */
1460 void
1461 kcpc_passivate(void)
1462 {
1463 	kcpc_ctx_t *ctx = curthread->t_cpc_ctx;
1464 	kcpc_set_t *set = curthread->t_cpc_set;
1465 
1466 	if (set == NULL)
1467 		return;
1468 
1469 	/*
1470 	 * We're cleaning up after this thread; ensure there are no dangling
1471 	 * CPC pointers left behind. The context and set will be freed by
1472 	 * freectx() in the case of an LWP-bound set, and by kcpc_unbind() in
1473 	 * the case of a CPU-bound set.
1474 	 */
1475 	curthread->t_cpc_ctx = NULL;
1476 
1477 	if (ctx == NULL) {
1478 		/*
1479 		 * This thread has a set but no context; it must be a CPU-bound
1480 		 * set. The hardware will be stopped via kcpc_unbind() when the
1481 		 * process exits and closes its file descriptors with
1482 		 * kcpc_close(). Our only job here is to clean up this thread's
1483 		 * state; the set will be freed with the unbind().
1484 		 */
1485 		(void) kcpc_unbind(set);
1486 		/*
1487 		 * Unbinding a set belonging to the current thread should clear
1488 		 * its set pointer.
1489 		 */
1490 		ASSERT(curthread->t_cpc_set == NULL);
1491 		return;
1492 	}
1493 
1494 	curthread->t_cpc_set = NULL;
1495 
1496 	/*
1497 	 * This thread/LWP is exiting but context switches will continue to
1498 	 * happen for a bit as the exit proceeds.  Kernel preemption must be
1499 	 * disabled here to prevent a race between checking or setting the
1500 	 * INVALID_STOPPED flag here and kcpc_restore() setting the flag during
1501 	 * a context switch.
1502 	 */
1503 
1504 	kpreempt_disable();
1505 	if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
1506 		pcbe_ops->pcbe_allstop();
1507 		atomic_or_uint(&ctx->kc_flags,
1508 		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
1509 	}
1510 	kpreempt_enable();
1511 }
1512 
1513 /*
1514  * Assign the requests in the given set to the PICs in the context.
1515  * Returns 0 if successful, -1 on failure.
1516  */
1517 /*ARGSUSED*/
1518 int
1519 kcpc_assign_reqs(kcpc_set_t *set, kcpc_ctx_t *ctx)
1520 {
1521 	int i;
1522 	int *picnum_save;
1523 
1524 	ASSERT(set->ks_nreqs <= cpc_ncounters);
1525 
1526 	/*
1527 	 * Provide kcpc_tryassign() with scratch space to avoid doing an
1528 	 * alloc/free with every invocation.
1529 	 */
1530 	picnum_save = kmem_alloc(set->ks_nreqs * sizeof (int), KM_SLEEP);
1531 	/*
1532 	 * kcpc_tryassign() blindly walks through each request in the set,
1533 	 * seeing if a counter can count its event. If yes, it assigns that
1534 	 * counter. However, that counter may have been the only capable counter
1535  * for _another_ request's event. The solution is to retry, starting the
1536  * assignment from each request in turn. Note that this does not cover
1537  * all solutions, as that would require trying all n! orderings of the
1538  * requests, which would be unacceptable for architectures with many counters.
1539 	 */
1540 	for (i = 0; i < set->ks_nreqs; i++)
1541 		if (kcpc_tryassign(set, i, picnum_save) == 0)
1542 			break;
1543 
1544 	kmem_free(picnum_save, set->ks_nreqs * sizeof (int));
1545 	if (i == set->ks_nreqs)
1546 		return (-1);
1547 	return (0);
1548 }
1549 
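/*
 * Attempt one assignment pass over the set, starting with 'starting_req' and
 * wrapping around. Returns 0 on success; on failure the original picnum
 * assignments are restored from 'scratch' and -1 is returned.
 */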
1550 static int
1551 kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch)
1552 {
1553 	int		i;
1554 	int		j;
1555 	uint64_t	bitmap = 0, resmap = 0;
1556 	uint64_t	ctrmap;
1557 
1558 	/*
1559 	 * We are attempting to assign the reqs to pics, but we may fail. If we
1560 	 * fail, we need to restore the state of the requests to what it was
1561 	 * when we found it, as some reqs may have been explicitly assigned to
1562 	 * a specific PIC beforehand. We do this by snapshotting the assignments
1563 	 * now and restoring from it later if we fail.
1564 	 *
1565 	 * Also we note here which counters have already been claimed by
1566 	 * requests with explicit counter assignments.
1567 	 */
1568 	for (i = 0; i < set->ks_nreqs; i++) {
1569 		scratch[i] = set->ks_req[i].kr_picnum;
1570 		if (set->ks_req[i].kr_picnum != -1)
1571 			resmap |= (1 << set->ks_req[i].kr_picnum);
1572 	}
1573 
1574 	/*
1575 	 * Walk through requests assigning them to the first PIC that is
1576 	 * capable.
1577 	 */
1578 	i = starting_req;
1579 	do {
1580 		if (set->ks_req[i].kr_picnum != -1) {
1581 			ASSERT((bitmap & (1 << set->ks_req[i].kr_picnum)) == 0);
1582 			bitmap |= (1 << set->ks_req[i].kr_picnum);
1583 			if (++i == set->ks_nreqs)
1584 				i = 0;
1585 			continue;
1586 		}
1587 
1588 		ctrmap = pcbe_ops->pcbe_event_coverage(set->ks_req[i].kr_event);
1589 		for (j = 0; j < cpc_ncounters; j++) {
1590 			if (ctrmap & (1 << j) && (bitmap & (1 << j)) == 0 &&
1591 			    (resmap & (1 << j)) == 0) {
1592 				/*
1593 				 * We can assign this counter because:
1594 				 *
1595 				 * 1. It can count the event (ctrmap)
1596 				 * 2. It hasn't been assigned yet (bitmap)
1597 				 * 3. It wasn't reserved by a request (resmap)
1598 				 */
1599 				bitmap |= (1 << j);
1600 				break;
1601 			}
1602 		}
1603 		if (j == cpc_ncounters) {
1604 			for (i = 0; i < set->ks_nreqs; i++)
1605 				set->ks_req[i].kr_picnum = scratch[i];
1606 			return (-1);
1607 		}
1608 		set->ks_req[i].kr_picnum = j;
1609 
1610 		if (++i == set->ks_nreqs)
1611 			i = 0;
1612 	} while (i != starting_req);
1613 
1614 	return (0);
1615 }
1616 
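/*
 * Duplicate a request set, copying its requests and their attributes. The
 * copy is left unbound, with no data store and no context.
 */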
1617 kcpc_set_t *
1618 kcpc_dup_set(kcpc_set_t *set)
1619 {
1620 	kcpc_set_t	*new;
1621 	int		i;
1622 	int		j;
1623 
1624 	new = kmem_zalloc(sizeof (*new), KM_SLEEP);
1625 	new->ks_state &= ~KCPC_SET_BOUND;
1626 	new->ks_flags = set->ks_flags;
1627 	new->ks_nreqs = set->ks_nreqs;
1628 	new->ks_req = kmem_alloc(set->ks_nreqs * sizeof (kcpc_request_t),
1629 	    KM_SLEEP);
1630 	new->ks_data = NULL;
1631 	new->ks_ctx = NULL;
1632 
1633 	for (i = 0; i < new->ks_nreqs; i++) {
1634 		new->ks_req[i].kr_config = NULL;
1635 		new->ks_req[i].kr_index = set->ks_req[i].kr_index;
1636 		new->ks_req[i].kr_picnum = set->ks_req[i].kr_picnum;
1637 		new->ks_req[i].kr_picp = NULL;
1638 		new->ks_req[i].kr_data = NULL;
1639 		(void) strncpy(new->ks_req[i].kr_event, set->ks_req[i].kr_event,
1640 		    CPC_MAX_EVENT_LEN);
1641 		new->ks_req[i].kr_preset = set->ks_req[i].kr_preset;
1642 		new->ks_req[i].kr_flags = set->ks_req[i].kr_flags;
1643 		new->ks_req[i].kr_nattrs = set->ks_req[i].kr_nattrs;
1644 		new->ks_req[i].kr_attr = kmem_alloc(new->ks_req[i].kr_nattrs *
1645 		    sizeof (kcpc_attr_t), KM_SLEEP);
1646 		for (j = 0; j < new->ks_req[i].kr_nattrs; j++) {
1647 			new->ks_req[i].kr_attr[j].ka_val =
1648 			    set->ks_req[i].kr_attr[j].ka_val;
1649 			(void) strncpy(new->ks_req[i].kr_attr[j].ka_name,
1650 			    set->ks_req[i].kr_attr[j].ka_name,
1651 			    CPC_MAX_ATTR_LEN);
1652 		}
1653 	}
1654 
1655 	return (new);
1656 }
1657 
1658 int
1659 kcpc_allow_nonpriv(void *token)
1660 {
1661 	return (((kcpc_ctx_t *)token)->kc_flags & KCPC_CTX_NONPRIV);
1662 }
1663 
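/*
 * Mark the given thread's performance counter context, if it has one, as
 * invalid.
 */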
1664 void
1665 kcpc_invalidate(kthread_t *t)
1666 {
1667 	kcpc_ctx_t *ctx = t->t_cpc_ctx;
1668 
1669 	if (ctx != NULL)
1670 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
1671 }
1672 
1673 /*
1674  * Given a PCBE ID, attempt to load a matching PCBE module. The strings given
1675  * are used to construct PCBE names, starting with the most specific,
1676  * "pcbe.first.second.third.fourth" and ending with the least specific,
1677  * "pcbe.first".
1678  *
1679  * Returns 0 if a PCBE was successfully loaded and -1 upon error.
1680  */
1681 int
1682 kcpc_pcbe_tryload(const char *prefix, uint_t first, uint_t second, uint_t third)
1683 {
1684 	uint_t s[3];
1685 
1686 	s[0] = first;
1687 	s[1] = second;
1688 	s[2] = third;
1689 
1690 	return (modload_qualified("pcbe",
1691 	    "pcbe", prefix, ".", s, 3, NULL) < 0 ? -1 : 0);
1692 }
1693 
1694 char *
1695 kcpc_list_attrs(void)
1696 {
1697 	ASSERT(pcbe_ops != NULL);
1698 
1699 	return (pcbe_ops->pcbe_list_attrs());
1700 }
1701 
1702 char *
1703 kcpc_list_events(uint_t pic)
1704 {
1705 	ASSERT(pcbe_ops != NULL);
1706 
1707 	return (pcbe_ops->pcbe_list_events(pic));
1708 }
1709 
1710 uint_t
1711 kcpc_pcbe_capabilities(void)
1712 {
1713 	ASSERT(pcbe_ops != NULL);
1714 
1715 	return (pcbe_ops->pcbe_caps);
1716 }
1717 
1718 int
1719 kcpc_pcbe_loaded(void)
1720 {
1721 	return (pcbe_ops == NULL ? -1 : 0);
1722 }
1723