common/syscall/lgrpsys.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2015 Joyent, Inc.
 */

/*
 * lgroup system calls
 */

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/sunddi.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cpupart.h>
#include <sys/lgrp.h>
#include <sys/lgrp_user.h>
#include <sys/promif.h>		/* for prom_printf() */
#include <sys/sysmacros.h>
#include <sys/policy.h>

#include <vm/as.h>


/*
 * definitions for mi_validity
 *
 * Each input address gets one validity word.  VALID_ADDR is set in it when
 * the address itself was accepted, and VALID_REQ shifted left by the request
 * index is set for every request that produced a result for that address.
 */
#define	VALID_ADDR	1
#define	VALID_REQ	2

/*
 * Run through the given number of addresses and requests and return the
 * corresponding memory information for each address.
 *
 * addr_count	number of input addresses; must be at least 1 and is capped
 *		at MAX_MEMINFO_CNT
 * mip		user address of a struct meminfo (struct meminfo32 when the
 *		caller is not using the native data model) describing the
 *		input and output arrays
 *
 * Returns 0 on success.  Otherwise returns the result of set_errno() with
 * EINVAL (addr_count or mi_info_count out of range), EFAULT (a copyin or
 * copyout failed) or EPERM (a request other than MEMINFO_VLGRP or
 * MEMINFO_VPAGESIZE was made and secpolicy_meminfo() refused it).
 */
static int
meminfo(int addr_count, struct meminfo *mip)
{
	size_t		in_size, out_size, req_size, val_size;
	struct as	*as;
	struct hat	*hat;
	int		i, j, out_idx, info_count;
	lgrp_t		*lgrp;
	pfn_t		pfn;
	ssize_t		pgsz;
	int		*req_array, *val_array;
	uint64_t	*in_array, *out_array;
	uint64_t	addr, paddr;
	uintptr_t	vaddr;
	int		ret = 0;
	struct meminfo minfo;
#if defined(_SYSCALL32_IMPL)
	struct meminfo32 minfo32;
#endif

	/*
	 * Make sure that there is at least one address to translate and
	 * limit how many virtual addresses the kernel can do per call
	 */
	if (addr_count < 1)
		return (set_errno(EINVAL));
	else if (addr_count > MAX_MEMINFO_CNT)
		addr_count = MAX_MEMINFO_CNT;

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin(mip, &minfo, sizeof (struct meminfo)))
			return (set_errno(EFAULT));
	}
#if defined(_SYSCALL32_IMPL)
	else {
		/* widen the 32-bit caller's structure into the native one */
		bzero(&minfo, sizeof (minfo));
		if (copyin(mip, &minfo32, sizeof (struct meminfo32)))
			return (set_errno(EFAULT));
		minfo.mi_inaddr = (const uint64_t *)(uintptr_t)
		    minfo32.mi_inaddr;
		minfo.mi_info_req = (const uint_t *)(uintptr_t)
		    minfo32.mi_info_req;
		minfo.mi_info_count = minfo32.mi_info_count;
		minfo.mi_outdata = (uint64_t *)(uintptr_t)
		    minfo32.mi_outdata;
		minfo.mi_validity = (uint_t *)(uintptr_t)
		    minfo32.mi_validity;
	}
#endif
	/*
	 * all the input parameters have been copied in:-
	 * addr_count - number of input addresses
	 * minfo.mi_inaddr - array of input addresses
	 * minfo.mi_info_req - array of types of information requested
	 * minfo.mi_info_count - no. of pieces of info requested for each addr
	 * minfo.mi_outdata - array into which the results are placed
	 * minfo.mi_validity -  array containing bitwise result codes; 0th bit
	 *			evaluates validity of corresponding input
	 *			address, 1st bit validity of response to first
	 *			member of info_req, etc.
	 */

	/* make sure mi_info_count is within limit */
	info_count = minfo.mi_info_count;
	if (info_count < 1 || info_count > MAX_MEMINFO_REQ)
		return (set_errno(EINVAL));

	/*
	 * allocate buffer in_array for the input addresses and copy them in
	 */
	in_size = sizeof (uint64_t) * addr_count;
	in_array = kmem_alloc(in_size, KM_SLEEP);
	if (copyin(minfo.mi_inaddr, in_array, in_size)) {
		kmem_free(in_array, in_size);
		return (set_errno(EFAULT));
	}

	/*
	 * allocate buffer req_array for the input info_reqs and copy them in
	 */
	req_size = sizeof (uint_t) * info_count;
	req_array = kmem_alloc(req_size, KM_SLEEP);
	if (copyin(minfo.mi_info_req, req_array, req_size)) {
		kmem_free(req_array, req_size);
		kmem_free(in_array, in_size);
		return (set_errno(EFAULT));
	}

	/*
	 * Validate privs for each req.
	 */
	for (i = 0; i < info_count; i++) {
		switch (req_array[i] & MEMINFO_MASK) {
		case MEMINFO_VLGRP:
		case MEMINFO_VPAGESIZE:
			break;
		default:
			if (secpolicy_meminfo(CRED()) != 0) {
				kmem_free(req_array, req_size);
				kmem_free(in_array, in_size);
				return (set_errno(EPERM));
			}
			break;
		}
	}

	/*
	 * allocate buffer out_array which holds the results and will have
	 * to be copied out later
	 *
	 * The whole buffer is copied out, but only the slots of requests
	 * that succeed are written below, so it must start out zeroed or
	 * stale kernel heap contents would be handed to the caller.
	 */
	out_size = sizeof (uint64_t) * addr_count * info_count;
	out_array = kmem_zalloc(out_size, KM_SLEEP);

	/*
	 * allocate buffer val_array which holds the validity bits and will
	 * have to be copied out later (every element is assigned below)
	 */
	val_size = sizeof (uint_t) * addr_count;
	val_array = kmem_alloc(val_size, KM_SLEEP);

	if ((req_array[0] & MEMINFO_MASK) == MEMINFO_PLGRP) {
		/* find the corresponding lgroup for each physical address */
		for (i = 0; i < addr_count; i++) {
			paddr = in_array[i];
			pfn = btop(paddr);
			lgrp = lgrp_pfn_to_lgrp(pfn);
			if (lgrp) {
				out_array[i] = lgrp->lgrp_id;
				val_array[i] = VALID_ADDR | VALID_REQ;
			} else {
				out_array[i] = 0;
				val_array[i] = 0;
			}
		}
	} else {
		/* get the corresponding memory info for each virtual address */
		as = curproc->p_as;

		AS_LOCK_ENTER(as, RW_READER);
		hat = as->a_hat;
		/* out_idx tracks the first result slot of address i */
		for (i = out_idx = 0; i < addr_count; i++, out_idx +=
		    info_count) {
			addr = in_array[i];
			vaddr = (uintptr_t)(addr & ~PAGEOFFSET);
			if (!as_segat(as, (caddr_t)vaddr)) {
				val_array[i] = 0;
				continue;
			}
			val_array[i] = VALID_ADDR;
			pfn = hat_getpfnum(hat, (caddr_t)vaddr);
			if (pfn != PFN_INVALID) {
				/*
				 * Widen pfn before shifting so that the
				 * physical address is not truncated where
				 * pfn_t is narrower than 64 bits.
				 */
				paddr = ((uint64_t)pfn << PAGESHIFT) |
				    (addr & PAGEOFFSET);
				for (j = 0; j < info_count; j++) {
					switch (req_array[j] & MEMINFO_MASK) {
					case MEMINFO_VPHYSICAL:
						/*
						 * return the physical address
						 * corresponding to the input
						 * virtual address
						 */
						out_array[out_idx + j] = paddr;
						val_array[i] |= VALID_REQ << j;
						break;
					case MEMINFO_VLGRP:
						/*
						 * return the lgroup of physical
						 * page corresponding to the
						 * input virtual address
						 */
						lgrp = lgrp_pfn_to_lgrp(pfn);
						if (lgrp) {
							out_array[out_idx + j] =
							    lgrp->lgrp_id;
							val_array[i] |=
							    VALID_REQ << j;
						}
						break;
					case MEMINFO_VPAGESIZE:
						/*
						 * return the size of physical
						 * page corresponding to the
						 * input virtual address
						 */
						pgsz = hat_getpagesize(hat,
						    (caddr_t)vaddr);
						if (pgsz != -1) {
							out_array[out_idx + j] =
							    pgsz;
							val_array[i] |=
							    VALID_REQ << j;
						}
						break;
					case MEMINFO_VREPLCNT:
						/*
						 * for future use:-
						 * return the no. replicated
						 * physical pages corresponding
						 * to the input virtual address,
						 * so it is always 0 at the
						 * moment
						 */
						out_array[out_idx + j] = 0;
						val_array[i] |= VALID_REQ << j;
						break;
					case MEMINFO_VREPL:
						/*
						 * for future use:-
						 * return the nth physical
						 * replica of the specified
						 * virtual address
						 */
						break;
					case MEMINFO_VREPL_LGRP:
						/*
						 * for future use:-
						 * return the lgroup of nth
						 * physical replica of the
						 * specified virtual address
						 */
						break;
					case MEMINFO_PLGRP:
						/*
						 * this is for physical address
						 * only, shouldn't mix with
						 * virtual address
						 */
						break;
					default:
						break;
					}
				}
			}
		}
		AS_LOCK_EXIT(as);
	}

	/* copy out the results and validity bits and free the buffers */
	if ((copyout(out_array, minfo.mi_outdata, out_size) != 0) ||
	    (copyout(val_array, minfo.mi_validity, val_size) != 0))
		ret = set_errno(EFAULT);

	kmem_free(in_array, in_size);
	kmem_free(out_array, out_size);
	kmem_free(req_array, req_size);
	kmem_free(val_array, val_size);

	return (ret);
}


/*
 * Start a thread off with no lgroup affinity buffer by clearing the
 * pointer that bufaddr refers to.  A NULL bufaddr is ignored.
 */
void
lgrp_affinity_init(lgrp_affinity_t **bufaddr)
{
	if (bufaddr == NULL)
		return;

	*bufaddr = NULL;
}


/*
 * Release a thread's lgroup affinity buffer (nlgrpsmax entries) and clear
 * the pointer to it, in case the thread gets recycled.  Nothing is done
 * when bufaddr is NULL or no buffer is attached.
 */
void
lgrp_affinity_free(lgrp_affinity_t **bufaddr)
{
	lgrp_affinity_t	*affs;

	if (bufaddr == NULL)
		return;

	affs = *bufaddr;
	if (affs == NULL)
		return;

	kmem_free(affs, nlgrpsmax * sizeof (lgrp_affinity_t));
	*bufaddr = NULL;
}


/*
 * Cookie specifying any ID.  Parenthesized so that the negative constant
 * stays a single operand wherever the macro is expanded.
 */
#define	P_ANY	(-2)


/*
 * Look through the threads of process p for the LWP with the given ID
 * (or take the first thread when lwpid is P_ANY) and return its affinity
 * for the specified lgroup.
 *
 * The caller must hold p->p_lock.  Returns LGRP_AFF_NONE when the thread
 * has no affinity buffer, or the result of set_errno() with ESRCH (no
 * threads or no matching LWP) or EPERM (thread has t_cid == 0 or
 * hasprocperm() refuses the caller).
 */
lgrp_affinity_t
lgrp_affinity_get_thread(proc_t *p, id_t lwpid, lgrp_id_t lgrp)
{
	lgrp_affinity_t	result;
	kthread_t	*tp;

	ASSERT(MUTEX_HELD(&p->p_lock));

	/*
	 * The process may be executing in proc_exit() and its p->p_tlist
	 * may already be NULL.
	 */
	tp = p->p_tlist;
	if (tp == NULL)
		return (set_errno(ESRCH));

	do {
		/* not the LWP we are after, so move on to the next one */
		if (lwpid != P_ANY && tp->t_tid != lwpid)
			continue;

		thread_lock(tp);

		/*
		 * Check to see whether caller has permission to look at
		 * this LWP
		 */
		if (tp->t_cid == 0 || !hasprocperm(tp->t_cred, CRED())) {
			thread_unlock(tp);
			return (set_errno(EPERM));
		}

		result = (tp->t_lgrp_affinity != NULL) ?
		    tp->t_lgrp_affinity[lgrp] : LGRP_AFF_NONE;
		thread_unlock(tp);

		return (result);
	} while ((tp = tp->t_forw) != p->p_tlist);

	/* went all the way around the thread list without a match */
	return (set_errno(ESRCH));
}


/*
 * Get lgroup affinity for given LWP or process
 *
 * ap is the user address of an lgrp_affinity_args_t naming the target
 * (idtype and id) and the lgroup of interest.  Returns the affinity, or the
 * result of set_errno() with EFAULT (copyin failed), EINVAL (bad lgroup or
 * idtype) or ESRCH (lgroup beyond lgrp_alloc_max or process not found);
 * errors from lgrp_affinity_get_thread() are passed through.
 */
lgrp_affinity_t
lgrp_affinity_get(lgrp_affinity_args_t *ap)
{
	lgrp_affinity_args_t	uargs;
	lgrp_affinity_t		result;
	id_t			who;
	idtype_t		kind;
	lgrp_id_t		lgrp;
	proc_t			*procp;

	/* fetch the caller's arguments */
	if (copyin(ap, &uargs, sizeof (lgrp_affinity_args_t)) != 0)
		return (set_errno(EFAULT));

	who = uargs.id;
	kind = uargs.idtype;
	lgrp = uargs.lgrp;

	/* reject lgroup IDs that can never be valid */
	if (lgrp < 0 || lgrp == LGRP_NONE)
		return (set_errno(EINVAL));

	/* reject lgroup IDs past the highest one allocated */
	if (lgrp > lgrp_alloc_max)
		return (set_errno(ESRCH));

	if (kind == P_LWPID) {
		/*
		 * LWP in current process
		 */
		procp = curproc;
		mutex_enter(&procp->p_lock);
		if (who == P_MYID) {
			/* current thread */
			kthread_t	*self = curthread;

			thread_lock(self);
			result = (self->t_lgrp_affinity != NULL) ?
			    self->t_lgrp_affinity[lgrp] : LGRP_AFF_NONE;
			thread_unlock(self);
		} else {
			/* different thread */
			result = lgrp_affinity_get_thread(procp, who, lgrp);
		}
		mutex_exit(&procp->p_lock);

		return (result);
	}

	if (kind == P_PID) {
		/*
		 * Process: report the affinity of any one of its threads
		 */
		mutex_enter(&pidlock);

		if (who == P_MYID) {
			procp = curproc;
		} else {
			procp = prfind(who);
			if (procp == NULL) {
				mutex_exit(&pidlock);
				return (set_errno(ESRCH));
			}
		}

		mutex_enter(&procp->p_lock);
		result = lgrp_affinity_get_thread(procp, P_ANY, lgrp);
		mutex_exit(&procp->p_lock);

		mutex_exit(&pidlock);

		return (result);
	}

	/* neither an LWP nor a process ID */
	return (set_errno(EINVAL));
}


/*
 * Find the lgroup for which thread t has the most affinity in the given
 * partition.  The search starts from the thread's home lgroup unless the
 * specified starting lgroup is preferred.
 *
 * Returns the lpl of the chosen lgroup, or NULL when the thread has no
 * affinity buffer or (for an unbound thread) the best affinity found is
 * LGRP_AFF_NONE.
 */
lpl_t *
lgrp_affinity_best(kthread_t *t, struct cpupart *cpupart, lgrp_id_t start,
    boolean_t prefer_start)
{
	lgrp_affinity_t	*affs;
	lgrp_affinity_t	top_aff;
	lpl_t		*top_lpl;
	lgrp_id_t	first;
	lgrp_id_t	cur;

	ASSERT(t != NULL);
	ASSERT((MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0) ||
	    (MUTEX_HELD(&ttoproc(t)->p_lock) && THREAD_LOCK_HELD(t)));
	ASSERT(cpupart != NULL);

	affs = t->t_lgrp_affinity;
	if (affs == NULL)
		return (NULL);

	/*
	 * Thread bound to CPU: only the leaf lpl directly containing that
	 * CPU and its ancestor lpls are candidates.
	 */
	if (t->t_bind_cpu != PBIND_NONE) {
		lpl_t	*anc;

		top_lpl = cpu[t->t_bind_cpu]->cpu_lpl;
		top_aff = affs[top_lpl->lpl_lgrpid];
		for (anc = top_lpl->lpl_parent; anc != NULL;
		    anc = anc->lpl_parent) {
			if (affs[anc->lpl_lgrpid] > top_aff) {
				top_lpl = anc;
				top_aff = affs[anc->lpl_lgrpid];
			}
		}
		return (top_lpl);
	}

	/*
	 * Pick where to begin: the home lgroup, unless the given starting
	 * lgroup is preferred or home has no CPUs in this pset; then the
	 * starting lgroup if it has CPUs in this pset; otherwise the root
	 * lgroup.
	 */
	ASSERT(start >= 0 && start <= lgrp_alloc_max);
	cur = t->t_lpl->lpl_lgrpid;
	if (prefer_start || !LGRP_CPUS_IN_PART(cur, cpupart)) {
		if (start != LGRP_NONE && LGRP_CPUS_IN_PART(start, cpupart))
			cur = start;
		else
			cur = LGRP_ROOTID;
	}

	top_lpl = &cpupart->cp_lgrploads[cur];
	top_aff = affs[cur];
	first = cur;
	do {
		/*
		 * Only lgroups with CPU resources in this processor set
		 * can displace the current best.
		 */
		if (LGRP_CPUS_IN_PART(cur, cpupart) && affs[cur] > top_aff) {
			top_aff = affs[cur];
			top_lpl = &cpupart->cp_lgrploads[cur];
		}

		/* step to the next lgroup ID, wrapping the search */
		cur = (cur >= lgrp_alloc_max) ? 0 : cur + 1;
	} while (cur != first);

	/*
	 * No lgroup (in this pset) with any affinity
	 */
	if (top_aff == LGRP_AFF_NONE)
		return (NULL);

	ASSERT(LGRP_CPUS_IN_PART(top_lpl->lpl_lgrpid, cpupart) &&
	    top_lpl->lpl_ncpu > 0);

	return (top_lpl);
}


/*
 * Set thread's affinity for given lgroup
 *
 * The caller must hold the p_lock of the thread's process.  If the thread
 * has no affinity buffer yet and a real affinity is being set, the buffer
 * in *aff_buf is handed to the thread and *aff_buf is set to NULL so the
 * caller knows it was consumed.
 *
 * Returns 0 on success or the result of set_errno(EPERM) when the thread
 * has t_cid == 0 or hasprocperm() refuses the caller.
 */
int
lgrp_affinity_set_thread(kthread_t *t, lgrp_id_t lgrp, lgrp_affinity_t aff,
    lgrp_affinity_t **aff_buf)
{
	lgrp_affinity_t	*tbl;
	lpl_t		*cand;
	lgrp_id_t	cand_id;
	lgrp_id_t	home_id;

	ASSERT(t != NULL);
	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));

	thread_lock(t);

	/*
	 * Check to see whether caller has permission to set affinity for
	 * thread
	 */
	if (t->t_cid == 0 || !hasprocperm(t->t_cred, CRED())) {
		thread_unlock(t);
		return (set_errno(EPERM));
	}

	tbl = t->t_lgrp_affinity;
	if (tbl == NULL) {
		/* clearing an affinity that was never set is a no-op */
		if (aff == LGRP_AFF_NONE) {
			thread_unlock(t);
			return (0);
		}

		/* adopt the caller's preallocated buffer */
		ASSERT(aff_buf != NULL && *aff_buf != NULL);
		tbl = *aff_buf;
		t->t_lgrp_affinity = tbl;
		*aff_buf = NULL;
	}

	tbl[lgrp] = aff;

	/*
	 * Find lgroup for which thread has most affinity,
	 * starting with lgroup for which affinity being set
	 */
	cand = lgrp_affinity_best(t, t->t_cpupart, lgrp, B_TRUE);

	/*
	 * Rehome if found lgroup with more affinity than home or lgroup for
	 * which affinity is being set has same affinity as home
	 */
	home_id = t->t_lpl->lpl_lgrpid;
	if (cand != NULL && cand != t->t_lpl) {
		cand_id = cand->lpl_lgrpid;
		if (tbl[cand_id] > tbl[home_id] ||
		    (tbl[cand_id] == tbl[home_id] && cand_id == lgrp))
			lgrp_move_thread(t, cand, 1);
	}

	thread_unlock(t);

	return (0);
}


/*
 * Set process' affinity for specified lgroup
 *
 * Applies the affinity to every thread on p's thread list.  The caller
 * must hold pidlock and p->p_lock and supplies aff_buf_array, an array of
 * preallocated affinity buffers; each entry a thread consumes is set to
 * NULL.  Returns 0, or the first non-zero value returned by
 * lgrp_affinity_set_thread().
 */
int
lgrp_affinity_set_proc(proc_t *p, lgrp_id_t lgrp, lgrp_affinity_t aff,
    lgrp_affinity_t **aff_buf_array)
{
	kthread_t	*tp;
	int		first_err = 0;
	int		slot = 0;

	ASSERT(MUTEX_HELD(&pidlock) && MUTEX_HELD(&p->p_lock));
	ASSERT(aff_buf_array != NULL);

	tp = p->p_tlist;
	if (tp == NULL)
		return (first_err);

	do {
		lgrp_affinity_t	*spare = aff_buf_array[slot];
		int		rc;

		/*
		 * Set lgroup affinity for thread, offering it the next
		 * unused buffer
		 */
		rc = lgrp_affinity_set_thread(tp, lgrp, aff, &spare);
		if (rc != 0 && first_err == 0)
			first_err = rc;

		/*
		 * The thread took the buffer, so mark the slot as used and
		 * advance to the next one
		 */
		if (spare == NULL) {
			ASSERT(slot < p->p_lwpcnt);
			aff_buf_array[slot++] = NULL;
		}
	} while ((tp = tp->t_forw) != p->p_tlist);

	return (first_err);
}


/*
 * Set LWP's or process' affinity for specified lgroup
 *
 * When setting affinities, pidlock, process p_lock, and thread_lock()
 * need to be held in that order to protect target thread's pset, process,
 * process contents, and thread contents.  thread_lock() does splhigh(),
 * so it ends up having similiar effect as kpreempt_disable(), so it will
 * protect calls to lgrp_move_thread() and lgrp_choose() from pset changes.
 */
int
lgrp_affinity_set(lgrp_affinity_args_t *ap)
{
	lgrp_affinity_t		aff;
	lgrp_affinity_t		*aff_buf;
	lgrp_affinity_args_t	args;
	id_t			id;
	idtype_t		idtype;
	lgrp_id_t		lgrp;
	int			nthreads;
	proc_t			*p;
	int			retval;

	/*
	 * Copyin arguments
	 */
	if (copyin(ap, &args, sizeof (lgrp_affinity_args_t)) != 0)
		return (set_errno(EFAULT));

	idtype = args.idtype;
	id = args.id;
	lgrp = args.lgrp;
	aff = args.aff;

	/*
	 * Check for invalid lgroup
	 */
	if (lgrp < 0 || lgrp == LGRP_NONE)
		return (set_errno(EINVAL));

	/*
	 * Check for existing lgroup
	 */
	if (lgrp > lgrp_alloc_max)
		return (set_errno(ESRCH));

	/*
	 * Check for legal affinity
	 */
	if (aff != LGRP_AFF_NONE && aff != LGRP_AFF_WEAK &&
	    aff != LGRP_AFF_STRONG)
		return (set_errno(EINVAL));

	/*
	 * Must be process or LWP ID
	 */
	if (idtype != P_LWPID && idtype != P_PID)
		return (set_errno(EINVAL));

	/*
	 * Set given LWP's or process' affinity for specified lgroup
	 */
	switch (idtype) {

	case P_LWPID:
		/*
		 * Allocate memory for thread's lgroup affinities
		 * ahead of time w/o holding locks
		 */
		aff_buf = kmem_zalloc(nlgrpsmax * sizeof (lgrp_affinity_t),
		    KM_SLEEP);

		p = curproc;

		/*
		 * Set affinity for thread
		 */
		mutex_enter(&p->p_lock);
		if (id == P_MYID) {		/* current thread */
			retval = lgrp_affinity_set_thread(curthread, lgrp, aff,
			    &aff_buf);
		} else if (p->p_tlist == NULL) {
			retval = set_errno(ESRCH);
		} else {			/* other thread */
			int		found = 0;
			kthread_t	*t;

			t = p->p_tlist;
			do {
				if (t->t_tid == id) {
					retval = lgrp_affinity_set_thread(t,
					    lgrp, aff, &aff_buf);
					found = 1;
					break;
				}
			} while ((t = t->t_forw) != p->p_tlist);
			if (!found)
				retval = set_errno(ESRCH);
		}
		mutex_exit(&p->p_lock);

		/*
		 * Free memory for lgroup affinities,
		 * since thread didn't need it
		 */
		if (aff_buf)
			kmem_free(aff_buf,
			    nlgrpsmax * sizeof (lgrp_affinity_t));

		break;

	case P_PID:

		do {
			lgrp_affinity_t	**aff_buf_array;
			int		i;
			size_t		size;

			/*
			 * Get process
			 */
			mutex_enter(&pidlock);

			if (id == P_MYID)
				p = curproc;
			else
				p = prfind(id);

			if (p == NULL) {
				mutex_exit(&pidlock);
				return (set_errno(ESRCH));
			}

			/*
			 * Get number of threads in process
			 *
			 * NOTE: Only care about user processes,
			 *	 so p_lwpcnt should be number of threads.
			 */
			mutex_enter(&p->p_lock);
			nthreads = p->p_lwpcnt;
			mutex_exit(&p->p_lock);

			mutex_exit(&pidlock);

			if (nthreads < 1)
				return (set_errno(ESRCH));

			/*
			 * Preallocate memory for lgroup affinities for
			 * each thread in process now to avoid holding
			 * any locks.  Allocate an array to hold a buffer
			 * for each thread.
			 */
			aff_buf_array = kmem_zalloc(nthreads *
			    sizeof (lgrp_affinity_t *), KM_SLEEP);

			size = nlgrpsmax * sizeof (lgrp_affinity_t);
			for (i = 0; i < nthreads; i++)
				aff_buf_array[i] = kmem_zalloc(size, KM_SLEEP);

			mutex_enter(&pidlock);

			/*
			 * Get process again since dropped locks to allocate
			 * memory (except current process)
			 */
			if (id != P_MYID)
				p = prfind(id);

			/*
			 * Process went away after we dropped locks and before
			 * reacquiring them, so drop locks, free memory, and
			 * return.
			 */
			if (p == NULL) {
				mutex_exit(&pidlock);
				for (i = 0; i < nthreads; i++)
					kmem_free(aff_buf_array[i], size);
				kmem_free(aff_buf_array,
				    nthreads * sizeof (lgrp_affinity_t *));
				return (set_errno(ESRCH));
			}

			mutex_enter(&p->p_lock);

			/*
			 * See whether number of threads is same
			 * If not, drop locks, free memory, and try again
			 */
			if (nthreads != p->p_lwpcnt) {
				mutex_exit(&p->p_lock);
				mutex_exit(&pidlock);
				for (i = 0; i < nthreads; i++)
					kmem_free(aff_buf_array[i], size);
				kmem_free(aff_buf_array,
				    nthreads * sizeof (lgrp_affinity_t *));
				continue;
			}

			/*
			 * Set lgroup affinity for threads in process
			 */
			retval = lgrp_affinity_set_proc(p, lgrp, aff,
			    aff_buf_array);

			mutex_exit(&p->p_lock);
			mutex_exit(&pidlock);

			/*
			 * Free any leftover memory, since some threads may
			 * have already allocated memory and set lgroup
			 * affinities before
			 */
			for (i = 0; i < nthreads; i++)
				if (aff_buf_array[i] != NULL)
					kmem_free(aff_buf_array[i], size);
			kmem_free(aff_buf_array,
			    nthreads * sizeof (lgrp_affinity_t *));

			break;

		} while (nthreads != p->p_lwpcnt);

		break;

	default:
		retval = set_errno(EINVAL);
		break;
	}

	return (retval);
}


/*
 * Return the latest generation number for the lgroup hierarchy
 * with the given view
 *
 * For LGRP_VIEW_OS this is lgrp_gen; for any other view the generation
 * number of the calling thread's cpu partition is added to it.
 */
lgrp_gen_t
lgrp_generation(lgrp_view_t view)
{
	uint_t	gen;

	kpreempt_disable();

	if (view != LGRP_VIEW_OS) {
		/*
		 * For caller's view, use generation numbers for lgroup
		 * hierarchy and caller's pset
		 * NOTE: Caller needs to check for change in pset ID
		 */
		cpupart_t	*part = curthread->t_cpupart;

		ASSERT(part);
		gen = lgrp_gen + part->cp_gen;
	} else {
		/*
		 * OS view: generation number of lgroup hierarchy alone
		 */
		gen = lgrp_gen;
	}

	kpreempt_enable();

	return (gen);
}


/*
 * Return the home lgroup ID of thread t as reported by lgrp_home_id().
 *
 * The caller must hold the p_lock of the thread's process.  Returns the
 * result of set_errno(EPERM) when the thread has t_cid == 0 or
 * hasprocperm() refuses the caller.
 */
lgrp_id_t
lgrp_home_thread(kthread_t *t)
{
	lgrp_id_t	result;

	ASSERT(t != NULL);
	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));

	thread_lock(t);

	/*
	 * Only look up the home lgroup when the caller has permission to
	 * examine the thread
	 */
	if (t->t_cid != 0 && hasprocperm(t->t_cred, CRED())) {
		result = lgrp_home_id(t);
		thread_unlock(t);
		return (result);
	}

	thread_unlock(t);
	return (set_errno(EPERM));
}


/*
 * Get home lgroup of given process or thread
 *
 * For P_LWPID the LWP is looked up in the current process (P_MYID means
 * the calling thread).  For P_PID the first thread on the process' thread
 * list is used.  Returns the home lgroup ID, or the result of set_errno()
 * with ESRCH (no such LWP or process, or no threads) or EINVAL (any other
 * idtype); errors from lgrp_home_thread() are passed through.
 */
lgrp_id_t
lgrp_home_get(idtype_t idtype, id_t id)
{
	proc_t		*procp;
	lgrp_id_t	result;
	kthread_t	*tp;

	if (idtype == P_LWPID) {
		procp = curproc;

		/*
		 * Find the thread and get its home lgroup
		 */
		mutex_enter(&procp->p_lock);
		if (id == P_MYID) {
			/* current thread */
			result = lgrp_home_thread(curthread);
		} else if (procp->p_tlist == NULL) {
			result = set_errno(ESRCH);
		} else {
			/* other thread */
			int	matched = 0;

			tp = procp->p_tlist;
			do {
				if (tp->t_tid != id)
					continue;
				result = lgrp_home_thread(tp);
				matched = 1;
				break;
			} while ((tp = tp->t_forw) != procp->p_tlist);

			if (!matched)
				result = set_errno(ESRCH);
		}
		mutex_exit(&procp->p_lock);

		return (result);
	}

	if (idtype == P_PID) {
		/*
		 * Get process
		 */
		mutex_enter(&pidlock);

		procp = (id == P_MYID) ? curproc : prfind(id);
		if (procp == NULL) {
			mutex_exit(&pidlock);
			return (set_errno(ESRCH));
		}

		/* use the first thread on the process' thread list */
		mutex_enter(&procp->p_lock);
		tp = procp->p_tlist;
		if (tp != NULL)
			result = lgrp_home_thread(tp);
		else
			result = set_errno(ESRCH);
		mutex_exit(&procp->p_lock);

		mutex_exit(&pidlock);

		return (result);
	}

	/* neither an LWP nor a process ID */
	return (set_errno(EINVAL));
}


/*
 * Return latency between "from" and "to" lgroups
 *
 * This latency number can only be used for relative comparison
 * between lgroups on the running system, cannot be used across platforms,
 * and may not reflect the actual latency.  It is platform and implementation
 * specific, so platform gets to decide its value.  It would be nice if the
 * number was at least proportional to make comparisons more meaningful though.
 *
 * The caller must hold cpu_lock.  Returns the result of set_errno() with
 * EINVAL (negative ID) or ESRCH (ID beyond lgrp_alloc_max or lgroup does
 * not exist).
 */
int
lgrp_latency(lgrp_id_t from, lgrp_id_t to)
{
	lgrp_t	*src;
	lgrp_t	*dst;
	int	worst;
	int	ci;
	int	mi;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (from < 0 || to < 0)
		return (set_errno(EINVAL));

	if (from > lgrp_alloc_max || to > lgrp_alloc_max)
		return (set_errno(ESRCH));

	src = lgrp_table[from];
	dst = lgrp_table[to];

	if (!(LGRP_EXISTS(src) && LGRP_EXISTS(dst)))
		return (set_errno(ESRCH));

	/*
	 * Same lgroup: use the latency recorded in the lgroup itself
	 */
	if (from == to)
		return (src->lgrp_latency);

	/*
	 * Two leaf lgroups: ask the platform directly
	 */
	if (src->lgrp_childcnt == 0 && dst->lgrp_childcnt == 0) {
		return (lgrp_plat_latency(src->lgrp_plathand,
		    dst->lgrp_plathand));
	}

	/*
	 * Otherwise take the max latency between every CPU resource of
	 * "from" and every memory resource of "to"
	 */
	worst = 0;
	for (ci = 0; ci <= lgrp_alloc_max; ci++) {
		lgrp_t	*cpu_rsrc = lgrp_table[ci];

		if (!LGRP_EXISTS(cpu_rsrc) ||
		    !klgrpset_ismember(src->lgrp_set[LGRP_RSRC_CPU], ci))
			continue;

		for (mi = 0; mi <= lgrp_alloc_max; mi++) {
			lgrp_t	*mem_rsrc = lgrp_table[mi];
			int	lat;

			if (!LGRP_EXISTS(mem_rsrc) ||
			    !klgrpset_ismember(dst->lgrp_set[LGRP_RSRC_MEM],
			    mi))
				continue;

			lat = lgrp_plat_latency(cpu_rsrc->lgrp_plathand,
			    mem_rsrc->lgrp_plathand);
			if (lat > worst)
				worst = lat;
		}
	}

	return (worst);
}


/*
 * Return lgroup interface version number
 * 0 - none
 * 1 - original
 * 2 - lgrp_latency_cookie() and lgrp_resources() added
 *
 * A request outside LGRP_VER_NONE..LGRP_VER_CURRENT is unsupported and
 * yields LGRP_VER_NONE; LGRP_VER_NONE itself asks for the current version;
 * any other supported version is returned unchanged.
 */
int
lgrp_version(int version)
{
	int	supported;

	supported = (version >= LGRP_VER_NONE && version <= LGRP_VER_CURRENT);
	if (!supported)
		return (LGRP_VER_NONE);

	return ((version == LGRP_VER_NONE) ? LGRP_VER_CURRENT : version);
}


/*
 * Snapshot of lgroup hierarchy
 *
 * One snapshot is kept and is based on the kernel's native data model, so
 * a 32-bit snapshot is kept for the 32-bit kernel and a 64-bit one for the
 * 64-bit kernel.  If a 32-bit user wants a snapshot from the 64-bit kernel,
 * the kernel generates a 32-bit snapshot from the data in its 64-bit snapshot.
 *
 * The format is defined by lgroup snapshot header and the layout of
 * the snapshot in memory is as follows:
 * 1) lgroup snapshot header
 *    - specifies format of snapshot
 *    - defined by lgrp_snapshot_header_t
 * 2) lgroup info array
 *    - contains information about each lgroup
 *    - one element for each lgroup
 *    - each element is defined by lgrp_info_t
 * 3) lgroup CPU ID array
 *    - contains list (array) of CPU IDs for each lgroup
 *    - lgrp_info_t points into array and specifies how many CPUs belong to
 *      given lgroup
 * 4) lgroup parents array
 *    - contains lgroup bitmask of parents for each lgroup
 *    - bitmask is an array of unsigned longs and its size depends on nlgrpsmax
 * 5) lgroup children array
 *    - contains lgroup bitmask of children for each lgroup
 *    - bitmask is an array of unsigned longs and its size depends on nlgrpsmax
 * 6) lgroup resources array
 *    - contains lgroup bitmask of resources for each lgroup
 *    - bitmask is an array of unsigned longs and its size depends on nlgrpsmax
 * 7) lgroup latency table
 *    - contains latency from each lgroup to each of other lgroups
 *
 * NOTE:  Must use nlgrpsmax for per lgroup data structures because lgroups
 *	  may be sparsely allocated.
 */
/*
 * lgrp_snap points at the one cached snapshot and is NULL until a snapshot
 * has been taken; lgrp_snapshot() frees it and takes a new one when its
 * ss_gen no longer matches lgrp_gen.
 * NOTE(review): presumably lgrp_snap_lock serializes access to lgrp_snap;
 * its users are not visible in this part of the file -- confirm.
 */
lgrp_snapshot_header_t	*lgrp_snap = NULL;	/* lgroup snapshot */
static kmutex_t		lgrp_snap_lock;		/* snapshot lock */


/*
 * Take a snapshot of lgroup hierarchy and return size of buffer
 * needed to hold snapshot
 */
static int
lgrp_snapshot(void)
{
	size_t		bitmask_size;
	size_t		bitmasks_size;
	size_t		bufsize;
	int		cpu_index;
	size_t		cpuids_size;
	int		i;
	int		j;
	size_t		info_size;
	size_t		lats_size;
	ulong_t		*lgrp_children;
	processorid_t	*lgrp_cpuids;
	lgrp_info_t	*lgrp_info;
	int		**lgrp_lats;
	ulong_t		*lgrp_parents;
	ulong_t		*lgrp_rsets;
	ulong_t		*lgrpset;
	int		snap_ncpus;
	int		snap_nlgrps;
	int		snap_nlgrpsmax;
	size_t		snap_hdr_size;
#ifdef	_SYSCALL32_IMPL
	model_t		model = DATAMODEL_NATIVE;

	/*
	 * Have up-to-date snapshot, so check to see whether caller is 32-bit
	 * program and need to return size of 32-bit snapshot now.
	 */
	model = get_udatamodel();
	if (model == DATAMODEL_ILP32 && lgrp_snap &&
	    lgrp_snap->ss_gen == lgrp_gen) {

		snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max;

		/*
		 * Calculate size of buffer needed for 32-bit snapshot,
		 * rounding up size of each object to allow for alignment
		 * of next object in buffer.
		 */
		snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header32_t),
		    sizeof (caddr32_t));
		info_size =
		    P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info32_t),
		    sizeof (processorid_t));
		cpuids_size =
		    P2ROUNDUP(lgrp_snap->ss_ncpus * sizeof (processorid_t),
		    sizeof (ulong_t));

		/*
		 * lgroup bitmasks needed for parents, children, and resources
		 * for each lgroup and pset lgroup set
		 */
		bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax);
		bitmasks_size = (((2 + LGRP_RSRC_COUNT) *
		    snap_nlgrpsmax) + 1) * bitmask_size;

		/*
		 * Size of latency table and buffer
		 */
		lats_size = snap_nlgrpsmax * sizeof (caddr32_t) +
		    snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int);

		bufsize = snap_hdr_size + info_size + cpuids_size +
		    bitmasks_size + lats_size;
		return (bufsize);
	}
#endif	/* _SYSCALL32_IMPL */

	/*
	 * Check whether snapshot is up-to-date
	 * Free it and take another one if not
	 */
	if (lgrp_snap) {
		if (lgrp_snap->ss_gen == lgrp_gen)
			return (lgrp_snap->ss_size);

		kmem_free(lgrp_snap, lgrp_snap->ss_size);
		lgrp_snap = NULL;
	}

	/*
	 * Allocate memory for snapshot
	 * w/o holding cpu_lock while waiting for memory
	 */
	while (lgrp_snap == NULL) {
		int	old_generation;

		/*
		 * Take snapshot of lgroup generation number
		 * and configuration size dependent information
		 * NOTE: Only count number of online CPUs,
		 * since only online CPUs appear in lgroups.
		 */
		mutex_enter(&cpu_lock);
		old_generation = lgrp_gen;
		snap_ncpus = ncpus_online;
		snap_nlgrps = nlgrps;
		snap_nlgrpsmax = nlgrpsmax;
		mutex_exit(&cpu_lock);

		/*
		 * Calculate size of buffer needed for snapshot,
		 * rounding up size of each object to allow for alignment
		 * of next object in buffer.
		 */
		snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header_t),
		    sizeof (void *));
		info_size = P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info_t),
		    sizeof (processorid_t));
		cpuids_size = P2ROUNDUP(snap_ncpus * sizeof (processorid_t),
		    sizeof (ulong_t));
		/*
		 * lgroup bitmasks needed for pset lgroup set and  parents,
		 * children, and resource sets for each lgroup
		 */
		bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax);
		bitmasks_size = (((2 + LGRP_RSRC_COUNT) *
		    snap_nlgrpsmax) + 1) * bitmask_size;

		/*
		 * Size of latency table and buffer
		 */
		lats_size = snap_nlgrpsmax * sizeof (int *) +
		    snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int);

		bufsize = snap_hdr_size + info_size + cpuids_size +
		    bitmasks_size + lats_size;

		/*
		 * Allocate memory for buffer
		 */
		lgrp_snap = kmem_zalloc(bufsize, KM_NOSLEEP);
		if (lgrp_snap == NULL)
			return (set_errno(ENOMEM));

		/*
		 * Check whether generation number has changed
		 */
		mutex_enter(&cpu_lock);
		if (lgrp_gen == old_generation)
			break;		/* hasn't change, so done. */

		/*
		 * Generation number changed, so free memory and try again.
		 */
		mutex_exit(&cpu_lock);
		kmem_free(lgrp_snap, bufsize);
		lgrp_snap = NULL;
	}

	/*
	 * Fill in lgroup snapshot header
	 * (including pointers to tables of lgroup info, CPU IDs, and parents
	 * and children)
	 */
	lgrp_snap->ss_version = LGRP_VER_CURRENT;

	/*
	 * XXX For now, liblgrp only needs to know whether the hierarchy
	 * XXX only has one level or not
	 */
	if (snap_nlgrps == 1)
		lgrp_snap->ss_levels = 1;
	else
		lgrp_snap->ss_levels = 2;

	lgrp_snap->ss_root = LGRP_ROOTID;

	lgrp_snap->ss_nlgrps = lgrp_snap->ss_nlgrps_os = snap_nlgrps;
	lgrp_snap->ss_nlgrps_max = snap_nlgrpsmax;
	lgrp_snap->ss_ncpus = snap_ncpus;
	lgrp_snap->ss_gen = lgrp_gen;
	lgrp_snap->ss_view = LGRP_VIEW_OS;
	lgrp_snap->ss_pset = 0;		/* NOTE: caller should set if needed */
	lgrp_snap->ss_size = bufsize;
	lgrp_snap->ss_magic = (uintptr_t)lgrp_snap;

	lgrp_snap->ss_info = lgrp_info =
	    (lgrp_info_t *)((uintptr_t)lgrp_snap + snap_hdr_size);

	lgrp_snap->ss_cpuids = lgrp_cpuids =
	    (processorid_t *)((uintptr_t)lgrp_info + info_size);

	lgrp_snap->ss_lgrpset = lgrpset =
	    (ulong_t *)((uintptr_t)lgrp_cpuids + cpuids_size);

	lgrp_snap->ss_parents = lgrp_parents =
	    (ulong_t *)((uintptr_t)lgrpset + bitmask_size);

	lgrp_snap->ss_children = lgrp_children =
	    (ulong_t *)((uintptr_t)lgrp_parents + (snap_nlgrpsmax *
	    bitmask_size));

	lgrp_snap->ss_rsets = lgrp_rsets =
	    (ulong_t *)((uintptr_t)lgrp_children + (snap_nlgrpsmax *
	    bitmask_size));

	lgrp_snap->ss_latencies = lgrp_lats =
	    (int **)((uintptr_t)lgrp_rsets + (LGRP_RSRC_COUNT *
	    snap_nlgrpsmax * bitmask_size));

	/*
	 * Fill in lgroup information
	 */
	cpu_index = 0;
	for (i = 0; i < snap_nlgrpsmax; i++) {
		struct cpu	*cp;
		int		cpu_count;
		struct cpu	*head;
		int		k;
		lgrp_t		*lgrp;

		lgrp = lgrp_table[i];
		if (!LGRP_EXISTS(lgrp)) {
			bzero(&lgrp_info[i], sizeof (lgrp_info[i]));
			lgrp_info[i].info_lgrpid = LGRP_NONE;
			continue;
		}

		lgrp_info[i].info_lgrpid = i;
		lgrp_info[i].info_latency = lgrp->lgrp_latency;

		/*
		 * Fill in parents, children, and lgroup resources
		 */
		lgrp_info[i].info_parents =
		    (ulong_t *)((uintptr_t)lgrp_parents + (i * bitmask_size));

		if (lgrp->lgrp_parent)
			BT_SET(lgrp_info[i].info_parents,
			    lgrp->lgrp_parent->lgrp_id);

		lgrp_info[i].info_children =
		    (ulong_t *)((uintptr_t)lgrp_children + (i * bitmask_size));

		for (j = 0; j < snap_nlgrpsmax; j++)
			if (klgrpset_ismember(lgrp->lgrp_children, j))
				BT_SET(lgrp_info[i].info_children, j);

		lgrp_info[i].info_rset =
		    (ulong_t *)((uintptr_t)lgrp_rsets +
		    (i * LGRP_RSRC_COUNT * bitmask_size));

		for (j = 0; j < LGRP_RSRC_COUNT; j++) {
			ulong_t	*rset;

			rset = (ulong_t *)((uintptr_t)lgrp_info[i].info_rset +
			    (j * bitmask_size));
			for (k = 0; k < snap_nlgrpsmax; k++)
				if (klgrpset_ismember(lgrp->lgrp_set[j], k))
					BT_SET(rset, k);
		}

		/*
		 * Fill in CPU IDs
		 */
		cpu_count = 0;
		lgrp_info[i].info_cpuids = NULL;
		cp = head = lgrp->lgrp_cpu;
		if (head != NULL) {
			lgrp_info[i].info_cpuids = &lgrp_cpuids[cpu_index];
			do {
				lgrp_cpuids[cpu_index] = cp->cpu_id;
				cpu_index++;
				cpu_count++;
				cp = cp->cpu_next_lgrp;
			} while (cp != head);
		}
		ASSERT(cpu_count == lgrp->lgrp_cpucnt);
		lgrp_info[i].info_ncpus = cpu_count;

		/*
		 * Fill in memory sizes for lgroups that directly contain
		 * memory
		 */
		if (klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], i)) {
			lgrp_info[i].info_mem_free =
			    lgrp_mem_size(i, LGRP_MEM_SIZE_FREE);
			lgrp_info[i].info_mem_install =
			    lgrp_mem_size(i, LGRP_MEM_SIZE_INSTALL);
		}

		/*
		 * Fill in latency table and buffer
		 */
		lgrp_lats[i] = (int *)((uintptr_t)lgrp_lats + snap_nlgrpsmax *
		    sizeof (int *) + i * snap_nlgrpsmax * sizeof (int));
		for (j = 0; j < snap_nlgrpsmax; j++) {
			lgrp_t	*to;

			to = lgrp_table[j];
			if (!LGRP_EXISTS(to))
				continue;
			lgrp_lats[i][j] = lgrp_latency(lgrp->lgrp_id,
			    to->lgrp_id);
		}
	}
	ASSERT(cpu_index == snap_ncpus);


	mutex_exit(&cpu_lock);

#ifdef	_SYSCALL32_IMPL
	/*
	 * Check to see whether caller is 32-bit program and need to return
	 * size of 32-bit snapshot now that snapshot has been taken/updated.
	 * May not have been able to do this earlier if snapshot was out of
	 * date or didn't exist yet.
	 */
	if (model == DATAMODEL_ILP32) {

		snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max;

		/*
		 * Calculate size of buffer needed for 32-bit snapshot,
		 * rounding up size of each object to allow for alignment
		 * of next object in buffer.
		 */
		snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header32_t),
		    sizeof (caddr32_t));
		info_size =
		    P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info32_t),
		    sizeof (processorid_t));
		cpuids_size =
		    P2ROUNDUP(lgrp_snap->ss_ncpus * sizeof (processorid_t),
		    sizeof (ulong_t));

		bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax);
		bitmasks_size = (((2 + LGRP_RSRC_COUNT) * snap_nlgrpsmax) +
		    1) * bitmask_size;


		/*
		 * Size of latency table and buffer
		 */
		lats_size = (snap_nlgrpsmax * sizeof (caddr32_t)) +
		    (snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int));

		bufsize = snap_hdr_size + info_size + cpuids_size +
		    bitmasks_size + lats_size;
		return (bufsize);
	}
#endif	/* _SYSCALL32_IMPL */

	return (lgrp_snap->ss_size);
}


/*
 * Copy snapshot into given user buffer, fix up any pointers in buffer to point
 * into user instead of kernel address space, and return size of buffer
 * needed to hold snapshot
 *
 * Returns 0 if no snapshot has been taken, and just the snapshot size if no
 * buffer was given.  Otherwise returns the result of set_errno() with:
 *	EAGAIN	given buffer is smaller than the snapshot
 *	EFAULT	a copyout to the user buffer failed
 *	ENOMEM	temporary kernel buffers could not be allocated
 */
static int
lgrp_snapshot_copy(char *buf, size_t bufsize)
{
	size_t			bitmask_size;
	int			cpu_index;
	size_t			cpuids_size;
	int			i;
	size_t			info_size;
	lgrp_info_t		*lgrp_info;
	int			retval;
	size_t			snap_hdr_size;
	int			snap_ncpus;
	int			snap_nlgrpsmax;
	lgrp_snapshot_header_t	*user_snap;
	lgrp_info_t		*user_info;
	lgrp_info_t		*user_info_buffer;
	processorid_t		*user_cpuids;
	ulong_t			*user_lgrpset;
	ulong_t			*user_parents;
	ulong_t			*user_children;
	int			**user_lats;
	int			**user_lats_buffer;
	ulong_t			*user_rsets;

	if (lgrp_snap == NULL)
		return (0);

	if (buf == NULL || bufsize <= 0)
		return (lgrp_snap->ss_size);

	/*
	 * User needs to try getting size of buffer again
	 * because given buffer size is too small.
	 * The lgroup hierarchy may have changed after they asked for the size
	 * but before the snapshot was taken.
	 */
	if (bufsize < lgrp_snap->ss_size)
		return (set_errno(EAGAIN));

	snap_ncpus = lgrp_snap->ss_ncpus;
	snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max;
	bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax);

	/*
	 * Fill in lgrpset now because caller may have changed psets.
	 *
	 * The kernel snapshot is cached and reused across calls, so clear
	 * the lgrpset first; otherwise bits set on behalf of an earlier
	 * caller in a different pset would still be set for this caller.
	 */
	bzero(lgrp_snap->ss_lgrpset, bitmask_size);
	kpreempt_disable();
	for (i = 0; i < snap_nlgrpsmax; i++) {
		if (klgrpset_ismember(curthread->t_cpupart->cp_lgrpset,
		    i)) {
			BT_SET(lgrp_snap->ss_lgrpset, i);
		}
	}
	kpreempt_enable();

	/*
	 * Copy lgroup snapshot (snapshot header, lgroup info, and CPU IDs)
	 * into user buffer all at once
	 */
	if (copyout(lgrp_snap, buf, lgrp_snap->ss_size) != 0)
		return (set_errno(EFAULT));

	/*
	 * Round up sizes of lgroup snapshot header and info for alignment
	 * (must match the layout used when the snapshot was taken)
	 */
	snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header_t),
	    sizeof (void *));
	info_size = P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info_t),
	    sizeof (processorid_t));
	cpuids_size = P2ROUNDUP(snap_ncpus * sizeof (processorid_t),
	    sizeof (ulong_t));

	/*
	 * Calculate pointers into user buffer for lgroup snapshot header,
	 * info, and CPU IDs
	 */
	user_snap = (lgrp_snapshot_header_t *)buf;
	user_info = (lgrp_info_t *)((uintptr_t)user_snap + snap_hdr_size);
	user_cpuids = (processorid_t *)((uintptr_t)user_info + info_size);
	user_lgrpset = (ulong_t *)((uintptr_t)user_cpuids + cpuids_size);
	user_parents = (ulong_t *)((uintptr_t)user_lgrpset + bitmask_size);
	user_children = (ulong_t *)((uintptr_t)user_parents +
	    (snap_nlgrpsmax * bitmask_size));
	user_rsets = (ulong_t *)((uintptr_t)user_children +
	    (snap_nlgrpsmax * bitmask_size));
	user_lats = (int **)((uintptr_t)user_rsets +
	    (LGRP_RSRC_COUNT * snap_nlgrpsmax * bitmask_size));

	/*
	 * Copyout magic number (ie. pointer to beginning of buffer)
	 */
	if (copyout(&buf, &user_snap->ss_magic, sizeof (buf)) != 0)
		return (set_errno(EFAULT));

	/*
	 * Fix up pointers in user buffer to point into user buffer
	 * not kernel snapshot
	 */
	if (copyout(&user_info, &user_snap->ss_info, sizeof (user_info)) != 0)
		return (set_errno(EFAULT));

	if (copyout(&user_cpuids, &user_snap->ss_cpuids,
	    sizeof (user_cpuids)) != 0)
		return (set_errno(EFAULT));

	if (copyout(&user_lgrpset, &user_snap->ss_lgrpset,
	    sizeof (user_lgrpset)) != 0)
		return (set_errno(EFAULT));

	if (copyout(&user_parents, &user_snap->ss_parents,
	    sizeof (user_parents)) != 0)
		return (set_errno(EFAULT));

	if (copyout(&user_children, &user_snap->ss_children,
	    sizeof (user_children)) != 0)
		return (set_errno(EFAULT));

	if (copyout(&user_rsets, &user_snap->ss_rsets,
	    sizeof (user_rsets)) != 0)
		return (set_errno(EFAULT));

	if (copyout(&user_lats, &user_snap->ss_latencies,
	    sizeof (user_lats)) != 0)
		return (set_errno(EFAULT));

	/*
	 * Make copies of lgroup info and latency table, fix up pointers,
	 * and then copy them into user buffer
	 */
	user_info_buffer = kmem_zalloc(info_size, KM_NOSLEEP);
	if (user_info_buffer == NULL)
		return (set_errno(ENOMEM));

	user_lats_buffer = kmem_zalloc(snap_nlgrpsmax * sizeof (int *),
	    KM_NOSLEEP);
	if (user_lats_buffer == NULL) {
		kmem_free(user_info_buffer, info_size);
		return (set_errno(ENOMEM));
	}

	lgrp_info = (lgrp_info_t *)((uintptr_t)lgrp_snap + snap_hdr_size);
	bcopy(lgrp_info, user_info_buffer, info_size);

	cpu_index = 0;
	for (i = 0; i < snap_nlgrpsmax; i++) {
		ulong_t	*snap_rset;

		/*
		 * Skip non-existent lgroups
		 */
		if (user_info_buffer[i].info_lgrpid == LGRP_NONE)
			continue;

		/*
		 * Update free memory size since it changes frequently
		 * Only do so for lgroups directly containing memory
		 *
		 * NOTE: This must be done before changing the pointers to
		 *	 point into user space since we need to dereference
		 *	 lgroup resource set
		 */
		snap_rset = &lgrp_info[i].info_rset[LGRP_RSRC_MEM *
		    BT_BITOUL(snap_nlgrpsmax)];
		if (BT_TEST(snap_rset, i))
			user_info_buffer[i].info_mem_free =
			    lgrp_mem_size(i, LGRP_MEM_SIZE_FREE);

		/*
		 * Fix up pointers to parents, children, resources, and
		 * latencies
		 */
		user_info_buffer[i].info_parents =
		    (ulong_t *)((uintptr_t)user_parents + (i * bitmask_size));
		user_info_buffer[i].info_children =
		    (ulong_t *)((uintptr_t)user_children + (i * bitmask_size));
		user_info_buffer[i].info_rset =
		    (ulong_t *)((uintptr_t)user_rsets +
		    (i * LGRP_RSRC_COUNT * bitmask_size));
		user_lats_buffer[i] = (int *)((uintptr_t)user_lats +
		    (snap_nlgrpsmax * sizeof (int *)) + (i * snap_nlgrpsmax *
		    sizeof (int)));

		/*
		 * Fix up pointer to CPU IDs
		 * (CPU IDs of each lgroup are stored consecutively, in
		 * lgroup order)
		 */
		if (user_info_buffer[i].info_ncpus == 0) {
			user_info_buffer[i].info_cpuids = NULL;
			continue;
		}
		user_info_buffer[i].info_cpuids = &user_cpuids[cpu_index];
		cpu_index += user_info_buffer[i].info_ncpus;
	}
	ASSERT(cpu_index == snap_ncpus);

	/*
	 * Copy lgroup info and latency table with pointers fixed up to point
	 * into user buffer out to user buffer now
	 * (both temporary buffers are freed whether or not copyout succeeds)
	 */
	retval = lgrp_snap->ss_size;
	if (copyout(user_info_buffer, user_info, info_size) != 0)
		retval = set_errno(EFAULT);
	kmem_free(user_info_buffer, info_size);

	if (copyout(user_lats_buffer, user_lats, snap_nlgrpsmax *
	    sizeof (int *)) != 0)
		retval = set_errno(EFAULT);
	kmem_free(user_lats_buffer, snap_nlgrpsmax * sizeof (int *));

	return (retval);
}


#ifdef	_SYSCALL32_IMPL
/*
 * Make 32-bit copy of snapshot, fix up any pointers in buffer to point
 * into user instead of kernel address space, copy 32-bit snapshot into
 * given user buffer, and return size of buffer needed to hold snapshot
 *
 * Returns 0 if no snapshot has been taken, and just the 32-bit snapshot size
 * if no buffer was given.  Otherwise returns the result of set_errno() with:
 *	EAGAIN	given buffer is smaller than the 32-bit snapshot
 *	ENOMEM	temporary kernel buffers could not be allocated
 *	EFAULT	copyout to the user buffer failed
 */
static int
lgrp_snapshot_copy32(caddr32_t buf, size32_t bufsize)
{
	size32_t			bitmask_size;
	size32_t			bitmasks_size;
	size32_t			children_size;
	int				cpu_index;
	size32_t			cpuids_size;
	int				i;
	int				j;
	size32_t			info_size;
	size32_t			lats_size;
	lgrp_info_t			*lgrp_info;
	lgrp_snapshot_header32_t	*lgrp_snap32;
	lgrp_info32_t			*lgrp_info32;
	processorid_t			*lgrp_cpuids32;
	caddr32_t			*lgrp_lats32;
	int				**lgrp_lats32_kernel;
	uint_t				*lgrp_set32;
	uint_t				*lgrp_parents32;
	uint_t				*lgrp_children32;
	uint_t				*lgrp_rsets32;
	size32_t			parents_size;
	int				retval;
	size32_t			rsets_size;
	size32_t			set_size;
	size_t				snap_bitmask_size;
	size32_t			snap_hdr_size;
	int				snap_ncpus;
	int				snap_nlgrpsmax;
	size32_t			snap_size;

	if (lgrp_snap == NULL)
		return (0);

	snap_ncpus = lgrp_snap->ss_ncpus;
	snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max;

	/*
	 * Calculate size of buffer needed for 32-bit snapshot,
	 * rounding up size of each object to allow for alignment
	 * of next object in buffer.
	 */
	snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header32_t),
	    sizeof (caddr32_t));
	info_size = P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info32_t),
	    sizeof (processorid_t));
	cpuids_size = P2ROUNDUP(snap_ncpus * sizeof (processorid_t),
	    sizeof (ulong_t));

	bitmask_size = BT_SIZEOFMAP32(snap_nlgrpsmax);

	set_size = bitmask_size;
	parents_size = snap_nlgrpsmax * bitmask_size;
	children_size = snap_nlgrpsmax * bitmask_size;
	rsets_size = P2ROUNDUP(LGRP_RSRC_COUNT * snap_nlgrpsmax *
	    (int)bitmask_size, sizeof (caddr32_t));

	bitmasks_size = set_size + parents_size + children_size + rsets_size;

	/*
	 * Size of latency table and buffer
	 */
	lats_size = (snap_nlgrpsmax * sizeof (caddr32_t)) +
	    (snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int));

	snap_size = snap_hdr_size + info_size + cpuids_size + bitmasks_size +
	    lats_size;

	/*
	 * buf is a 32-bit user address held in an integer type, so compare
	 * it against 0 rather than NULL
	 */
	if (buf == 0 || bufsize <= 0) {
		return (snap_size);
	}

	/*
	 * User needs to try getting size of buffer again
	 * because given buffer size is too small.
	 * The lgroup hierarchy may have changed after they asked for the size
	 * but before the snapshot was taken.
	 */
	if (bufsize < snap_size)
		return (set_errno(EAGAIN));

	/*
	 * Make 32-bit copy of snapshot, fix up pointers to point into user
	 * buffer not kernel, and then copy whole thing into user buffer
	 */
	lgrp_snap32 = kmem_zalloc(snap_size, KM_NOSLEEP);
	if (lgrp_snap32 == NULL)
		return (set_errno(ENOMEM));

	/*
	 * Calculate pointers into 32-bit copy of snapshot
	 * for lgroup info, CPU IDs, pset lgroup bitmask, parents, children,
	 * resources, and latency table and buffer
	 */
	lgrp_info32 = (lgrp_info32_t *)((uintptr_t)lgrp_snap32 +
	    snap_hdr_size);
	lgrp_cpuids32 = (processorid_t *)((uintptr_t)lgrp_info32 + info_size);
	lgrp_set32 = (uint_t *)((uintptr_t)lgrp_cpuids32 + cpuids_size);
	lgrp_parents32 = (uint_t *)((uintptr_t)lgrp_set32 + set_size);
	lgrp_children32 = (uint_t *)((uintptr_t)lgrp_parents32 + parents_size);
	lgrp_rsets32 = (uint_t *)((uintptr_t)lgrp_children32 + children_size);
	lgrp_lats32 = (caddr32_t *)((uintptr_t)lgrp_rsets32 + rsets_size);

	/*
	 * Make temporary lgroup latency table of pointers for kernel to use
	 * to fill in rows of table with latencies from each lgroup
	 */
	lgrp_lats32_kernel =  kmem_zalloc(snap_nlgrpsmax * sizeof (int *),
	    KM_NOSLEEP);
	if (lgrp_lats32_kernel == NULL) {
		kmem_free(lgrp_snap32, snap_size);
		return (set_errno(ENOMEM));
	}

	/*
	 * Fill in 32-bit lgroup snapshot header
	 * (with pointers into user's buffer for lgroup info, CPU IDs,
	 * bit masks, and latencies)
	 */
	lgrp_snap32->ss_version = lgrp_snap->ss_version;
	lgrp_snap32->ss_levels = lgrp_snap->ss_levels;
	lgrp_snap32->ss_nlgrps = lgrp_snap32->ss_nlgrps_os =
	    lgrp_snap->ss_nlgrps;
	lgrp_snap32->ss_nlgrps_max = snap_nlgrpsmax;
	lgrp_snap32->ss_root = lgrp_snap->ss_root;
	lgrp_snap32->ss_ncpus = lgrp_snap->ss_ncpus;
	lgrp_snap32->ss_gen = lgrp_snap->ss_gen;
	lgrp_snap32->ss_view = LGRP_VIEW_OS;
	lgrp_snap32->ss_size = snap_size;
	lgrp_snap32->ss_magic = buf;
	lgrp_snap32->ss_info = buf + snap_hdr_size;
	lgrp_snap32->ss_cpuids = lgrp_snap32->ss_info + info_size;
	lgrp_snap32->ss_lgrpset = lgrp_snap32->ss_cpuids + cpuids_size;
	lgrp_snap32->ss_parents = lgrp_snap32->ss_lgrpset + bitmask_size;
	lgrp_snap32->ss_children = lgrp_snap32->ss_parents +
	    (snap_nlgrpsmax * bitmask_size);
	lgrp_snap32->ss_rsets = lgrp_snap32->ss_children +
	    (snap_nlgrpsmax * bitmask_size);
	lgrp_snap32->ss_latencies = lgrp_snap32->ss_rsets +
	    (LGRP_RSRC_COUNT * snap_nlgrpsmax * bitmask_size);

	/*
	 * Fill in lgrpset now because caller may have change psets
	 */
	kpreempt_disable();
	for (i = 0; i < snap_nlgrpsmax; i++) {
		if (klgrpset_ismember(curthread->t_cpupart->cp_lgrpset,
		    i)) {
			BT_SET32(lgrp_set32, i);
		}
	}
	kpreempt_enable();

	/*
	 * Fill in 32-bit copy of lgroup info and fix up pointers
	 * to point into user's buffer instead of kernel's
	 *
	 * NOTE: Bitmasks in the kernel snapshot are BT_SIZEOFMAP() bytes
	 *	 each, so step through them with that stride rather than
	 *	 assuming that each one is a single ulong_t.
	 */
	cpu_index = 0;
	lgrp_info = lgrp_snap->ss_info;
	snap_bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax);
	for (i = 0; i < snap_nlgrpsmax; i++) {
		uint_t	*children;
		uint_t	*lgrp_rset;
		uint_t	*parents;
		ulong_t	*snap_children;
		ulong_t	*snap_parents;
		ulong_t	*snap_rset;

		/*
		 * Skip non-existent lgroups
		 */
		if (lgrp_info[i].info_lgrpid == LGRP_NONE) {
			bzero(&lgrp_info32[i], sizeof (lgrp_info32[i]));
			lgrp_info32[i].info_lgrpid = LGRP_NONE;
			continue;
		}

		/*
		 * Fill in parents, children, lgroup resource set, and
		 * latencies from snapshot
		 */
		parents = (uint_t *)((uintptr_t)lgrp_parents32 +
		    i * bitmask_size);
		children = (uint_t *)((uintptr_t)lgrp_children32 +
		    i * bitmask_size);
		snap_parents = (ulong_t *)((uintptr_t)lgrp_snap->ss_parents +
		    (i * snap_bitmask_size));
		snap_children = (ulong_t *)((uintptr_t)lgrp_snap->ss_children +
		    (i * snap_bitmask_size));
		snap_rset = (ulong_t *)((uintptr_t)lgrp_snap->ss_rsets +
		    (i * LGRP_RSRC_COUNT * snap_bitmask_size));
		lgrp_rset = (uint_t *)((uintptr_t)lgrp_rsets32 +
		    (i * LGRP_RSRC_COUNT * bitmask_size));
		lgrp_lats32_kernel[i] = (int *)((uintptr_t)lgrp_lats32 +
		    snap_nlgrpsmax * sizeof (caddr32_t) + i * snap_nlgrpsmax *
		    sizeof (int));
		for (j = 0; j < snap_nlgrpsmax; j++) {
			int	k;

			if (BT_TEST(snap_parents, j))
				BT_SET32(parents, j);

			if (BT_TEST(snap_children, j))
				BT_SET32(children, j);

			for (k = 0; k < LGRP_RSRC_COUNT; k++) {
				uint_t	*rset;
				ulong_t	*snap_set;

				rset = (uint_t *)((uintptr_t)lgrp_rset +
				    k * bitmask_size);
				snap_set = (ulong_t *)((uintptr_t)snap_rset +
				    k * snap_bitmask_size);
				if (BT_TEST(snap_set, j))
					BT_SET32(rset, j);
			}

			lgrp_lats32_kernel[i][j] =
			    lgrp_snap->ss_latencies[i][j];
		}

		/*
		 * Fix up pointer to latency buffer
		 */
		lgrp_lats32[i] = lgrp_snap32->ss_latencies +
		    snap_nlgrpsmax * sizeof (caddr32_t) + i * snap_nlgrpsmax *
		    sizeof (int);

		/*
		 * Fix up pointers for parents, children, and resources
		 */
		lgrp_info32[i].info_parents = lgrp_snap32->ss_parents +
		    (i * bitmask_size);
		lgrp_info32[i].info_children = lgrp_snap32->ss_children +
		    (i * bitmask_size);
		lgrp_info32[i].info_rset = lgrp_snap32->ss_rsets +
		    (i * LGRP_RSRC_COUNT * bitmask_size);

		/*
		 * Fill in memory and CPU info
		 * Only fill in memory for lgroups directly containing memory
		 */
		snap_rset = &lgrp_info[i].info_rset[LGRP_RSRC_MEM *
		    BT_BITOUL(snap_nlgrpsmax)];
		if (BT_TEST(snap_rset, i)) {
			lgrp_info32[i].info_mem_free = lgrp_mem_size(i,
			    LGRP_MEM_SIZE_FREE);
			lgrp_info32[i].info_mem_install =
			    lgrp_info[i].info_mem_install;
		}

		lgrp_info32[i].info_ncpus = lgrp_info[i].info_ncpus;

		lgrp_info32[i].info_lgrpid = lgrp_info[i].info_lgrpid;
		lgrp_info32[i].info_latency = lgrp_info[i].info_latency;

		if (lgrp_info32[i].info_ncpus == 0) {
			lgrp_info32[i].info_cpuids = 0;
			continue;
		}

		/*
		 * Fix up pointer for CPU IDs
		 */
		lgrp_info32[i].info_cpuids = lgrp_snap32->ss_cpuids +
		    (cpu_index * sizeof (processorid_t));
		cpu_index += lgrp_info32[i].info_ncpus;
	}
	ASSERT(cpu_index == snap_ncpus);

	/*
	 * Copy lgroup CPU IDs into 32-bit snapshot
	 * before copying it out into user's buffer
	 */
	bcopy(lgrp_snap->ss_cpuids, lgrp_cpuids32, cpuids_size);

	/*
	 * Copy 32-bit lgroup snapshot into user's buffer all at once
	 * and free temporary buffers whether or not that succeeded
	 */
	retval = snap_size;
	if (copyout(lgrp_snap32, (void *)(uintptr_t)buf, snap_size) != 0)
		retval = set_errno(EFAULT);

	kmem_free(lgrp_snap32, snap_size);
	kmem_free(lgrp_lats32_kernel, snap_nlgrpsmax * sizeof (int *));

	return (retval);
}
#endif	/* _SYSCALL32_IMPL */


/*
 * lgroup system call entry point
 *
 * Dispatches on subcode to the matching lgroup operation.  The meaning of
 * ia and ap depends on the subcode; each case below casts them to whatever
 * the operation expects.  Returns the result of the operation, or the
 * result of set_errno(EINVAL) for an unknown subcode.
 */
int
lgrpsys(int subcode, long ia, void *ap)
{
	int	latency;
	int	snap_size;

	switch (subcode) {

	case LGRP_SYS_AFFINITY_GET:
		return (lgrp_affinity_get((lgrp_affinity_args_t *)ap));

	case LGRP_SYS_AFFINITY_SET:
		return (lgrp_affinity_set((lgrp_affinity_args_t *)ap));

	case LGRP_SYS_GENERATION:
		return (lgrp_generation(ia));

	case LGRP_SYS_HOME:
		return (lgrp_home_get((idtype_t)ia, (id_t)(uintptr_t)ap));

	case LGRP_SYS_LATENCY:
		mutex_enter(&cpu_lock);
		latency = lgrp_latency(ia, (lgrp_id_t)(uintptr_t)ap);
		mutex_exit(&cpu_lock);
		return (latency);

	case LGRP_SYS_MEMINFO:
		return (meminfo(ia, (struct meminfo *)ap));

	case LGRP_SYS_VERSION:
		return (lgrp_version(ia));

	case LGRP_SYS_SNAPSHOT:
		/*
		 * Take (or revalidate) the snapshot and, if the caller gave
		 * a buffer, copy it out in the caller's data model, all
		 * under the snapshot lock.
		 *
		 * NOTE: The snapshot routines return int (including the
		 *	 result of set_errno() on failure), so keep the
		 *	 result in an int rather than an unsigned size.
		 */
		mutex_enter(&lgrp_snap_lock);
		snap_size = lgrp_snapshot();
		if (ap && ia > 0) {
			if (get_udatamodel() == DATAMODEL_NATIVE)
				snap_size = lgrp_snapshot_copy(ap, ia);
#ifdef	_SYSCALL32_IMPL
			else
				snap_size = lgrp_snapshot_copy32(
				    (caddr32_t)(uintptr_t)ap, ia);
#endif	/* _SYSCALL32_IMPL */
		}
		mutex_exit(&lgrp_snap_lock);
		return (snap_size);

	default:
		break;

	}

	return (set_errno(EINVAL));
}