xref: /illumos-gate/usr/src/uts/common/os/netstack.c (revision 45818ee124adeaaf947698996b4f4c722afc6d1f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
26  */
27 
28 #include <sys/param.h>
29 #include <sys/sysmacros.h>
30 #include <sys/vm.h>
31 #include <sys/proc.h>
32 #include <sys/tuneable.h>
33 #include <sys/systm.h>
34 #include <sys/cmn_err.h>
35 #include <sys/debug.h>
36 #include <sys/sdt.h>
37 #include <sys/mutex.h>
38 #include <sys/bitmap.h>
39 #include <sys/atomic.h>
40 #include <sys/kobj.h>
41 #include <sys/disp.h>
42 #include <vm/seg_kmem.h>
43 #include <sys/zone.h>
44 #include <sys/netstack.h>
45 
46 /*
47  * What we use so that the zones framework can tell us about new zones,
48  * which we use to create new stacks.
49  */
50 static zone_key_t netstack_zone_key;
51 
52 static int	netstack_initialized = 0;
53 
54 /*
55  * Track the registered netstacks.
56  * The global lock protects
57  * - ns_reg
58  * - the list starting at netstack_head and following the netstack_next
59  *   pointers.
60  */
61 static kmutex_t netstack_g_lock;
62 
63 /*
64  * Registry of netstacks with their create/shutdown/destroy functions.
65  */
66 static struct netstack_registry	ns_reg[NS_MAX];
67 
68 /*
69  * Global list of existing stacks.  We use this when a new zone with
70  * an exclusive IP instance is created.
71  *
72  * Note that in some cases a netstack_t needs to stay around after the zone
73  * has gone away. This is because there might be outstanding references
74  * (from TCP TIME_WAIT connections, IPsec state, etc). The netstack_t data
75  * structure and all the foo_stack_t's hanging off of it will be cleaned up
76  * when the last reference to it is dropped.
77  * However, the same zone might be rebooted. That is handled using the
78  * assumption that the zones framework picks a new zoneid each time a zone
79  * is (re)booted. We assert for that condition in netstack_zone_create().
80  * Thus the old netstack_t can take its time for things to time out.
81  */
82 static netstack_t *netstack_head;
83 
84 /*
85  * To support kstat_create_netstack() using kstat_zone_add we need
86  * to track both
87  *  - all zoneids that use the global/shared stack
88  *  - all kstats that have been added for the shared stack
89  */
90 struct shared_zone_list {
91 	struct shared_zone_list *sz_next;	/* next entry in the list */
92 	zoneid_t		sz_zoneid;	/* zone using the shared stack */
93 };
94 
95 struct shared_kstat_list {
96 	struct shared_kstat_list *sk_next;	/* next entry in the list */
97 	kstat_t			 *sk_kstat;	/* kstat added for shared stack */
98 };
99 
100 static kmutex_t netstack_shared_lock;	/* protects the following two */
101 static struct shared_zone_list	*netstack_shared_zones;
102 static struct shared_kstat_list	*netstack_shared_kstats;
103 
104 static void	*netstack_zone_create(zoneid_t zoneid);
105 static void	netstack_zone_shutdown(zoneid_t zoneid, void *arg);
106 static void	netstack_zone_destroy(zoneid_t zoneid, void *arg);
107 
108 static void	netstack_shared_zone_add(zoneid_t zoneid);
109 static void	netstack_shared_zone_remove(zoneid_t zoneid);
110 static void	netstack_shared_kstat_add(kstat_t *ks);
111 static void	netstack_shared_kstat_remove(kstat_t *ks);
112 
113 typedef boolean_t applyfn_t(kmutex_t *, netstack_t *, int);
114 
115 static void	apply_all_netstacks(int, applyfn_t *);
116 static void	apply_all_modules(netstack_t *, applyfn_t *);
117 static void	apply_all_modules_reverse(netstack_t *, applyfn_t *);
118 static boolean_t netstack_apply_create(kmutex_t *, netstack_t *, int);
119 static boolean_t netstack_apply_shutdown(kmutex_t *, netstack_t *, int);
120 static boolean_t netstack_apply_destroy(kmutex_t *, netstack_t *, int);
121 static boolean_t wait_for_zone_creator(netstack_t *, kmutex_t *);
122 static boolean_t wait_for_nms_inprogress(netstack_t *, nm_state_t *,
123     kmutex_t *);
124 
125 void
126 netstack_init(void)
127 {
128 	mutex_init(&netstack_g_lock, NULL, MUTEX_DEFAULT, NULL);
129 	mutex_init(&netstack_shared_lock, NULL, MUTEX_DEFAULT, NULL);
130 
131 	netstack_initialized = 1;
132 
133 	/*
134 	 * We want to be informed each time a zone is created or
135 	 * destroyed in the kernel, so we can maintain the
136 	 * stack instance information.
137 	 */
138 	zone_key_create(&netstack_zone_key, netstack_zone_create,
139 	    netstack_zone_shutdown, netstack_zone_destroy);
140 }
141 
142 /*
143  * Register a new module with the framework.
144  * This registers interest in changes to the set of netstacks.
145  * The createfn and destroyfn are required, but the shutdownfn can be
146  * NULL.
147  * Note that due to the current zsd implementation, when the create
148  * function is called the zone isn't fully present, thus functions
149  * like zone_find_by_* will fail, hence the create function can not
150  * use many zones kernel functions including zcmn_err().
151  */
152 void
153 netstack_register(int moduleid,
154     void *(*module_create)(netstackid_t, netstack_t *),
155     void (*module_shutdown)(netstackid_t, void *),
156     void (*module_destroy)(netstackid_t, void *))
157 {
158 	netstack_t *ns;
159 
160 	ASSERT(netstack_initialized);
161 	ASSERT(moduleid >= 0 && moduleid < NS_MAX);
162 	ASSERT(module_create != NULL);
163 
164 	/*
165 	 * Make instances created after this point in time run the create
166 	 * callback.
167 	 */
168 	mutex_enter(&netstack_g_lock);
169 	ASSERT(ns_reg[moduleid].nr_create == NULL);
170 	ASSERT(ns_reg[moduleid].nr_flags == 0);
171 	ns_reg[moduleid].nr_create = module_create;
172 	ns_reg[moduleid].nr_shutdown = module_shutdown;
173 	ns_reg[moduleid].nr_destroy = module_destroy;
174 	ns_reg[moduleid].nr_flags = NRF_REGISTERED;
175 
176 	/*
177 	 * Determine the set of stacks that exist before we drop the lock.
178 	 * Set NSS_CREATE_NEEDED for each of those.
179 	 * netstacks which have been deleted will have NSS_CREATE_COMPLETED
180 	 * set, but check NSF_CLOSING to be sure.
181 	 */
182 	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
183 		nm_state_t *nms = &ns->netstack_m_state[moduleid];
184 
185 		mutex_enter(&ns->netstack_lock);
186 		if (!(ns->netstack_flags & NSF_CLOSING) &&
187 		    (nms->nms_flags & NSS_CREATE_ALL) == 0) {
188 			nms->nms_flags |= NSS_CREATE_NEEDED;
189 			DTRACE_PROBE2(netstack__create__needed,
190 			    netstack_t *, ns, int, moduleid);
191 		}
192 		mutex_exit(&ns->netstack_lock);
193 	}
194 	mutex_exit(&netstack_g_lock);
195 
196 	/*
197 	 * At this point in time a new instance can be created or an instance
198 	 * can be destroyed, or some other module can register or unregister.
199 	 * Make sure we either run all the create functions for this moduleid
200 	 * or we wait for any other creators for this moduleid.
201 	 */
202 	apply_all_netstacks(moduleid, netstack_apply_create);
203 }
204 
205 void
206 netstack_unregister(int moduleid)
207 {
208 	netstack_t *ns;
209 
210 	ASSERT(moduleid >= 0 && moduleid < NS_MAX);
211 
212 	ASSERT(ns_reg[moduleid].nr_create != NULL);
213 	ASSERT(ns_reg[moduleid].nr_flags & NRF_REGISTERED);
214 
215 	mutex_enter(&netstack_g_lock);
216 	/*
217 	 * Determine the set of stacks that exist before we drop the lock.
218 	 * Set NSS_SHUTDOWN_NEEDED and NSS_DESTROY_NEEDED for each of those.
219 	 * That ensures that when we return all the callbacks for existing
220 	 * instances have completed. And since we set NRF_DYING no new
221 	 * instances can use this module.
222 	 */
223 	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
224 		boolean_t created = B_FALSE;
225 		nm_state_t *nms = &ns->netstack_m_state[moduleid];
226 
227 		mutex_enter(&ns->netstack_lock);
228 
229 		/*
230 		 * We need to be careful here. We could actually have a netstack
231 		 * being created as we speak waiting for us to let go of this
232 		 * lock to proceed. It may have set NSS_CREATE_NEEDED, but not
233 		 * have gotten to the point of completing it yet. If
234 		 * NSS_CREATE_NEEDED, we can safely just remove it here and
235 		 * never create the module. However, if NSS_CREATE_INPROGRESS is
236 		 * set, we need to still flag this module for shutdown and
237 		 * deletion, just as though it had reached NSS_CREATE_COMPLETED.
238 		 *
239 		 * It is safe to do that because of two different guarantees
240 		 * that exist in the system. The first is that before we do a
241 		 * create, shutdown, or destroy, we ensure that nothing else is
242 		 * in progress in the system for this netstack and wait for it
243 		 * to complete. Secondly, because the zone is being created, we
244 		 * know that the following call to apply_all_netstack will block
245 		 * on the zone finishing its initialization.
246 		 */
247 		if (nms->nms_flags & NSS_CREATE_NEEDED)
248 			nms->nms_flags &= ~NSS_CREATE_NEEDED;
249 
250 		if (nms->nms_flags & NSS_CREATE_INPROGRESS ||
251 		    nms->nms_flags & NSS_CREATE_COMPLETED)
252 			created = B_TRUE;
253 
254 		if (ns_reg[moduleid].nr_shutdown != NULL && created &&
255 		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
256 		    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
257 			nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
258 			DTRACE_PROBE2(netstack__shutdown__needed,
259 			    netstack_t *, ns, int, moduleid);
260 		}
261 		if ((ns_reg[moduleid].nr_flags & NRF_REGISTERED) &&
262 		    ns_reg[moduleid].nr_destroy != NULL && created &&
263 		    (nms->nms_flags & NSS_DESTROY_ALL) == 0) {
264 			nms->nms_flags |= NSS_DESTROY_NEEDED;
265 			DTRACE_PROBE2(netstack__destroy__needed,
266 			    netstack_t *, ns, int, moduleid);
267 		}
268 		mutex_exit(&ns->netstack_lock);
269 	}
270 	/*
271 	 * Prevent any new netstack from calling the registered create
272 	 * function, while keeping the function pointers in place until the
273 	 * shutdown and destroy callbacks are complete.
274 	 */
275 	ns_reg[moduleid].nr_flags |= NRF_DYING;
276 	mutex_exit(&netstack_g_lock);
277 
278 	apply_all_netstacks(moduleid, netstack_apply_shutdown);
279 	apply_all_netstacks(moduleid, netstack_apply_destroy);
280 
281 	/*
282 	 * Clear the nms_flags so that we can handle this module
283 	 * being loaded again.
284 	 * Also remove the registered functions.
285 	 */
286 	mutex_enter(&netstack_g_lock);
287 	ASSERT(ns_reg[moduleid].nr_flags & NRF_REGISTERED);
288 	ASSERT(ns_reg[moduleid].nr_flags & NRF_DYING);
289 	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
290 		nm_state_t *nms = &ns->netstack_m_state[moduleid];
291 
292 		mutex_enter(&ns->netstack_lock);
293 		if (nms->nms_flags & NSS_DESTROY_COMPLETED) {
294 			nms->nms_flags = 0;
295 			DTRACE_PROBE2(netstack__destroy__done,
296 			    netstack_t *, ns, int, moduleid);
297 		}
298 		mutex_exit(&ns->netstack_lock);
299 	}
300 
301 	ns_reg[moduleid].nr_create = NULL;
302 	ns_reg[moduleid].nr_shutdown = NULL;
303 	ns_reg[moduleid].nr_destroy = NULL;
304 	ns_reg[moduleid].nr_flags = 0;
305 	mutex_exit(&netstack_g_lock);
306 }
307 
308 /*
309  * Lookup and/or allocate a netstack for this zone.
310  */
311 static void *
312 netstack_zone_create(zoneid_t zoneid)
313 {
314 	netstackid_t stackid;
315 	netstack_t *ns;
316 	netstack_t **nsp;
317 	zone_t	*zone;
318 	int i;
319 
320 	ASSERT(netstack_initialized);
321 
322 	zone = zone_find_by_id_nolock(zoneid);
323 	ASSERT(zone != NULL);
324 
325 	if (zone->zone_flags & ZF_NET_EXCL) {
326 		stackid = zoneid;
327 	} else {
328 		/* Look for the stack instance for the global */
329 		stackid = GLOBAL_NETSTACKID;
330 	}
331 
332 	/* Allocate even if it isn't needed; simplifies locking */
333 	ns = (netstack_t *)kmem_zalloc(sizeof (netstack_t), KM_SLEEP);
334 
335 	/* Look if there is a matching stack instance */
336 	mutex_enter(&netstack_g_lock);
337 	for (nsp = &netstack_head; *nsp != NULL;
338 	    nsp = &((*nsp)->netstack_next)) {
339 		if ((*nsp)->netstack_stackid == stackid) {
340 			/*
341 			 * Should never find a pre-existing exclusive stack
342 			 */
343 			ASSERT(stackid == GLOBAL_NETSTACKID);
344 			kmem_free(ns, sizeof (netstack_t));
345 			ns = *nsp;
346 			mutex_enter(&ns->netstack_lock);
347 			ns->netstack_numzones++;
348 			mutex_exit(&ns->netstack_lock);
349 			mutex_exit(&netstack_g_lock);
350 			DTRACE_PROBE1(netstack__inc__numzones,
351 			    netstack_t *, ns);
352 			/* Record that we have a new shared stack zone */
353 			netstack_shared_zone_add(zoneid);
354 			zone->zone_netstack = ns;
355 			return (ns);
356 		}
357 	}
358 	/* Not found */
359 	mutex_init(&ns->netstack_lock, NULL, MUTEX_DEFAULT, NULL);
360 	cv_init(&ns->netstack_cv, NULL, CV_DEFAULT, NULL);
361 	ns->netstack_stackid = zoneid;
362 	ns->netstack_numzones = 1;
363 	ns->netstack_refcnt = 1; /* Decremented by netstack_zone_destroy */
364 	ns->netstack_flags = NSF_UNINIT;
365 	*nsp = ns;
366 	zone->zone_netstack = ns;
367 
368 	mutex_enter(&ns->netstack_lock);
369 	/*
370 	 * Mark this netstack as having a CREATE running so
371 	 * any netstack_register/netstack_unregister waits for
372 	 * the existing create callbacks to complete in moduleid order
373 	 */
374 	ns->netstack_flags |= NSF_ZONE_CREATE;
375 
376 	/*
377 	 * Determine the set of module create functions that need to be
378 	 * called before we drop the lock.
379 	 * Set NSS_CREATE_NEEDED for each of those.
380 	 * Skip any with NRF_DYING set, since those are in the process of
381 	 * going away, by checking for flags being exactly NRF_REGISTERED.
382 	 */
383 	for (i = 0; i < NS_MAX; i++) {
384 		nm_state_t *nms = &ns->netstack_m_state[i];
385 
386 		cv_init(&nms->nms_cv, NULL, CV_DEFAULT, NULL);
387 
388 		if ((ns_reg[i].nr_flags == NRF_REGISTERED) &&
389 		    (nms->nms_flags & NSS_CREATE_ALL) == 0) {
390 			nms->nms_flags |= NSS_CREATE_NEEDED;
391 			DTRACE_PROBE2(netstack__create__needed,
392 			    netstack_t *, ns, int, i);
393 		}
394 	}
395 	mutex_exit(&ns->netstack_lock);
396 	mutex_exit(&netstack_g_lock);
397 
398 	apply_all_modules(ns, netstack_apply_create);
399 
400 	/* Tell any waiting netstack_register/netstack_unregister to proceed */
401 	mutex_enter(&ns->netstack_lock);
402 	ns->netstack_flags &= ~NSF_UNINIT;
403 	ASSERT(ns->netstack_flags & NSF_ZONE_CREATE);
404 	ns->netstack_flags &= ~NSF_ZONE_CREATE;
405 	cv_broadcast(&ns->netstack_cv);
406 	mutex_exit(&ns->netstack_lock);
407 
408 	return (ns);
409 }
410 
411 /* ARGSUSED */
412 static void
413 netstack_zone_shutdown(zoneid_t zoneid, void *arg)
414 {
415 	netstack_t *ns = (netstack_t *)arg;
416 	int i;
417 
418 	ASSERT(arg != NULL);
419 
420 	mutex_enter(&ns->netstack_lock);
421 	ASSERT(ns->netstack_numzones > 0);
422 	if (ns->netstack_numzones != 1) {
423 		/* Stack instance being used by other zone */
424 		mutex_exit(&ns->netstack_lock);
425 		ASSERT(ns->netstack_stackid == GLOBAL_NETSTACKID);
426 		return;
427 	}
428 	mutex_exit(&ns->netstack_lock);
429 
430 	mutex_enter(&netstack_g_lock);
431 	mutex_enter(&ns->netstack_lock);
432 	/*
433 	 * Mark this netstack as having a SHUTDOWN running so
434 	 * any netstack_register/netstack_unregister waits for
435 	 * the existing create callbacks to complete in moduleid order
436 	 */
437 	ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS));
438 	ns->netstack_flags |= NSF_ZONE_SHUTDOWN;
439 
440 	/*
441 	 * Determine the set of stacks that exist before we drop the lock.
442 	 * Set NSS_SHUTDOWN_NEEDED for each of those.
443 	 */
444 	for (i = 0; i < NS_MAX; i++) {
445 		nm_state_t *nms = &ns->netstack_m_state[i];
446 
447 		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
448 		    ns_reg[i].nr_shutdown != NULL &&
449 		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
450 		    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
451 			nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
452 			DTRACE_PROBE2(netstack__shutdown__needed,
453 			    netstack_t *, ns, int, i);
454 		}
455 	}
456 	mutex_exit(&ns->netstack_lock);
457 	mutex_exit(&netstack_g_lock);
458 
459 	/*
460 	 * Call the shutdown function for all registered modules for this
461 	 * netstack.
462 	 */
463 	apply_all_modules_reverse(ns, netstack_apply_shutdown);
464 
465 	/* Tell any waiting netstack_register/netstack_unregister to proceed */
466 	mutex_enter(&ns->netstack_lock);
467 	ASSERT(ns->netstack_flags & NSF_ZONE_SHUTDOWN);
468 	ns->netstack_flags &= ~NSF_ZONE_SHUTDOWN;
469 	cv_broadcast(&ns->netstack_cv);
470 	mutex_exit(&ns->netstack_lock);
471 }
472 
473 /*
474  * Common routine to release a zone.
475  * If this was the last zone using the stack instance then prepare to
476  * have the refcnt dropping to zero free the zone.
477  */
478 /* ARGSUSED */
479 static void
480 netstack_zone_destroy(zoneid_t zoneid, void *arg)
481 {
482 	netstack_t *ns = (netstack_t *)arg;
483 
484 	ASSERT(arg != NULL);
485 
486 	mutex_enter(&ns->netstack_lock);
487 	ASSERT(ns->netstack_numzones > 0);
488 	ns->netstack_numzones--;
489 	if (ns->netstack_numzones != 0) {
490 		/* Stack instance being used by other zone */
491 		mutex_exit(&ns->netstack_lock);
492 		ASSERT(ns->netstack_stackid == GLOBAL_NETSTACKID);
493 		/* Record that we a shared stack zone has gone away */
494 		netstack_shared_zone_remove(zoneid);
495 		return;
496 	}
497 	/*
498 	 * Set CLOSING so that netstack_find_by will not find it.
499 	 */
500 	ns->netstack_flags |= NSF_CLOSING;
501 	mutex_exit(&ns->netstack_lock);
502 	DTRACE_PROBE1(netstack__dec__numzones, netstack_t *, ns);
503 	/* No other thread can call zone_destroy for this stack */
504 
505 	/*
506 	 * Decrease refcnt to account for the one in netstack_zone_init()
507 	 */
508 	netstack_rele(ns);
509 }
510 
511 /*
512  * Called when the reference count drops to zero.
513  * Call the destroy functions for each registered module.
514  */
515 static void
516 netstack_stack_inactive(netstack_t *ns)
517 {
518 	int i;
519 
520 	mutex_enter(&netstack_g_lock);
521 	mutex_enter(&ns->netstack_lock);
522 	/*
523 	 * Mark this netstack as having a DESTROY running so
524 	 * any netstack_register/netstack_unregister waits for
525 	 * the existing destroy callbacks to complete in reverse moduleid order
526 	 */
527 	ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS));
528 	ns->netstack_flags |= NSF_ZONE_DESTROY;
529 	/*
530 	 * If the shutdown callback wasn't called earlier (e.g., if this is
531 	 * a netstack shared between multiple zones), then we schedule it now.
532 	 *
533 	 * Determine the set of stacks that exist before we drop the lock.
534 	 * Set NSS_DESTROY_NEEDED for each of those. That
535 	 * ensures that when we return all the callbacks for existing
536 	 * instances have completed.
537 	 */
538 	for (i = 0; i < NS_MAX; i++) {
539 		nm_state_t *nms = &ns->netstack_m_state[i];
540 
541 		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
542 		    ns_reg[i].nr_shutdown != NULL &&
543 		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
544 		    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
545 			nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
546 			DTRACE_PROBE2(netstack__shutdown__needed,
547 			    netstack_t *, ns, int, i);
548 		}
549 
550 		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
551 		    ns_reg[i].nr_destroy != NULL &&
552 		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
553 		    (nms->nms_flags & NSS_DESTROY_ALL) == 0) {
554 			nms->nms_flags |= NSS_DESTROY_NEEDED;
555 			DTRACE_PROBE2(netstack__destroy__needed,
556 			    netstack_t *, ns, int, i);
557 		}
558 	}
559 	mutex_exit(&ns->netstack_lock);
560 	mutex_exit(&netstack_g_lock);
561 
562 	/*
563 	 * Call the shutdown and destroy functions for all registered modules
564 	 * for this netstack.
565 	 *
566 	 * Since there are some ordering dependencies between the modules we
567 	 * tear them down in the reverse order of what was used to create them.
568 	 *
569 	 * Since a netstack_t is never reused (when a zone is rebooted it gets
570 	 * a new zoneid == netstackid i.e. a new netstack_t is allocated) we
571 	 * leave nms_flags the way it is i.e. with NSS_DESTROY_COMPLETED set.
572 	 * That is different than in the netstack_unregister() case.
573 	 */
574 	apply_all_modules_reverse(ns, netstack_apply_shutdown);
575 	apply_all_modules_reverse(ns, netstack_apply_destroy);
576 
577 	/* Tell any waiting netstack_register/netstack_unregister to proceed */
578 	mutex_enter(&ns->netstack_lock);
579 	ASSERT(ns->netstack_flags & NSF_ZONE_DESTROY);
580 	ns->netstack_flags &= ~NSF_ZONE_DESTROY;
581 	cv_broadcast(&ns->netstack_cv);
582 	mutex_exit(&ns->netstack_lock);
583 }
584 
585 /*
586  * Apply a function to all netstacks for a particular moduleid.
587  *
588  * If there is any zone activity (due to a zone being created, shutdown,
589  * or destroyed) we wait for that to complete before we proceed. This ensures
590  * that the moduleids are processed in order when a zone is created or
591  * destroyed.
592  *
593  * The applyfn has to drop netstack_g_lock if it does some work.
594  * In that case we don't follow netstack_next,
595  * even if it is possible to do so without any hazards. This is
596  * because we want the design to allow for the list of netstacks threaded
597  * by netstack_next to change in any arbitrary way during the time the
598  * lock was dropped.
599  *
600  * It is safe to restart the loop at netstack_head since the applyfn
601  * changes netstack_m_state as it processes things, so a subsequent
602  * pass through will have no effect in applyfn, hence the loop will terminate
603  * in at worst O(N^2).
604  */
605 static void
606 apply_all_netstacks(int moduleid, applyfn_t *applyfn)
607 {
608 	netstack_t *ns;
609 
610 	mutex_enter(&netstack_g_lock);
611 	ns = netstack_head;
612 	while (ns != NULL) {
613 		if (wait_for_zone_creator(ns, &netstack_g_lock)) {
614 			/* Lock dropped - restart at head */
615 			ns = netstack_head;
616 		} else if ((applyfn)(&netstack_g_lock, ns, moduleid)) {
617 			/* Lock dropped - restart at head */
618 			ns = netstack_head;
619 		} else {
620 			ns = ns->netstack_next;
621 		}
622 	}
623 	mutex_exit(&netstack_g_lock);
624 }
625 
626 /*
627  * Apply a function to all moduleids for a particular netstack.
628  *
629  * Since the netstack linkage doesn't matter in this case we can
630  * ignore whether the function drops the lock.
631  */
632 static void
633 apply_all_modules(netstack_t *ns, applyfn_t *applyfn)
634 {
635 	int i;
636 
637 	mutex_enter(&netstack_g_lock);
638 	for (i = 0; i < NS_MAX; i++) {
639 		/*
640 		 * We don't care whether the lock was dropped
641 		 * since we are not iterating over netstack_head.
642 		 */
643 		(void) (applyfn)(&netstack_g_lock, ns, i);
644 	}
645 	mutex_exit(&netstack_g_lock);
646 }
647 
648 /* Like the above but in reverse moduleid order */
649 static void
650 apply_all_modules_reverse(netstack_t *ns, applyfn_t *applyfn)
651 {
652 	int i;
653 
654 	mutex_enter(&netstack_g_lock);
655 	for (i = NS_MAX-1; i >= 0; i--) {
656 		/*
657 		 * We don't care whether the lock was dropped
658 		 * since we are not iterating over netstack_head.
659 		 */
660 		(void) (applyfn)(&netstack_g_lock, ns, i);
661 	}
662 	mutex_exit(&netstack_g_lock);
663 }
664 
665 /*
666  * Call the create function for the ns and moduleid if CREATE_NEEDED
667  * is set.
668  * If some other thread gets here first and sets *_INPROGRESS, then
669  * we wait for that thread to complete so that we can ensure that
670  * all the callbacks are done when we've looped over all netstacks/moduleids.
671  *
672  * When we call the create function, we temporarily drop the netstack_lock
673  * held by the caller, and return true to tell the caller it needs to
674  * re-evalute the state.
675  */
676 static boolean_t
677 netstack_apply_create(kmutex_t *lockp, netstack_t *ns, int moduleid)
678 {
679 	void *result;
680 	netstackid_t stackid;
681 	nm_state_t *nms = &ns->netstack_m_state[moduleid];
682 	boolean_t dropped = B_FALSE;
683 
684 	ASSERT(MUTEX_HELD(lockp));
685 	mutex_enter(&ns->netstack_lock);
686 
687 	if (wait_for_nms_inprogress(ns, nms, lockp))
688 		dropped = B_TRUE;
689 
690 	if (nms->nms_flags & NSS_CREATE_NEEDED) {
691 		nms->nms_flags &= ~NSS_CREATE_NEEDED;
692 		nms->nms_flags |= NSS_CREATE_INPROGRESS;
693 		DTRACE_PROBE2(netstack__create__inprogress,
694 		    netstack_t *, ns, int, moduleid);
695 		mutex_exit(&ns->netstack_lock);
696 		mutex_exit(lockp);
697 		dropped = B_TRUE;
698 
699 		ASSERT(ns_reg[moduleid].nr_create != NULL);
700 		stackid = ns->netstack_stackid;
701 		DTRACE_PROBE2(netstack__create__start,
702 		    netstackid_t, stackid,
703 		    netstack_t *, ns);
704 		result = (ns_reg[moduleid].nr_create)(stackid, ns);
705 		DTRACE_PROBE2(netstack__create__end,
706 		    void *, result, netstack_t *, ns);
707 
708 		ASSERT(result != NULL);
709 		mutex_enter(lockp);
710 		mutex_enter(&ns->netstack_lock);
711 		ns->netstack_modules[moduleid] = result;
712 		nms->nms_flags &= ~NSS_CREATE_INPROGRESS;
713 		nms->nms_flags |= NSS_CREATE_COMPLETED;
714 		cv_broadcast(&nms->nms_cv);
715 		DTRACE_PROBE2(netstack__create__completed,
716 		    netstack_t *, ns, int, moduleid);
717 		mutex_exit(&ns->netstack_lock);
718 		return (dropped);
719 	} else {
720 		mutex_exit(&ns->netstack_lock);
721 		return (dropped);
722 	}
723 }
724 
725 /*
726  * Call the shutdown function for the ns and moduleid if SHUTDOWN_NEEDED
727  * is set.
728  * If some other thread gets here first and sets *_INPROGRESS, then
729  * we wait for that thread to complete so that we can ensure that
730  * all the callbacks are done when we've looped over all netstacks/moduleids.
731  *
732  * When we call the shutdown function, we temporarily drop the netstack_lock
733  * held by the caller, and return true to tell the caller it needs to
734  * re-evalute the state.
735  */
736 static boolean_t
737 netstack_apply_shutdown(kmutex_t *lockp, netstack_t *ns, int moduleid)
738 {
739 	netstackid_t stackid;
740 	void * netstack_module;
741 	nm_state_t *nms = &ns->netstack_m_state[moduleid];
742 	boolean_t dropped = B_FALSE;
743 
744 	ASSERT(MUTEX_HELD(lockp));
745 	mutex_enter(&ns->netstack_lock);
746 
747 	if (wait_for_nms_inprogress(ns, nms, lockp))
748 		dropped = B_TRUE;
749 
750 	if (nms->nms_flags & NSS_SHUTDOWN_NEEDED) {
751 		nms->nms_flags &= ~NSS_SHUTDOWN_NEEDED;
752 		nms->nms_flags |= NSS_SHUTDOWN_INPROGRESS;
753 		DTRACE_PROBE2(netstack__shutdown__inprogress,
754 		    netstack_t *, ns, int, moduleid);
755 		mutex_exit(&ns->netstack_lock);
756 		mutex_exit(lockp);
757 		dropped = B_TRUE;
758 
759 		ASSERT(ns_reg[moduleid].nr_shutdown != NULL);
760 		stackid = ns->netstack_stackid;
761 		netstack_module = ns->netstack_modules[moduleid];
762 		DTRACE_PROBE2(netstack__shutdown__start,
763 		    netstackid_t, stackid,
764 		    void *, netstack_module);
765 		(ns_reg[moduleid].nr_shutdown)(stackid, netstack_module);
766 		DTRACE_PROBE1(netstack__shutdown__end,
767 		    netstack_t *, ns);
768 
769 		mutex_enter(lockp);
770 		mutex_enter(&ns->netstack_lock);
771 		nms->nms_flags &= ~NSS_SHUTDOWN_INPROGRESS;
772 		nms->nms_flags |= NSS_SHUTDOWN_COMPLETED;
773 		cv_broadcast(&nms->nms_cv);
774 		DTRACE_PROBE2(netstack__shutdown__completed,
775 		    netstack_t *, ns, int, moduleid);
776 		mutex_exit(&ns->netstack_lock);
777 		return (dropped);
778 	} else {
779 		mutex_exit(&ns->netstack_lock);
780 		return (dropped);
781 	}
782 }
783 
784 /*
785  * Call the destroy function for the ns and moduleid if DESTROY_NEEDED
786  * is set.
787  * If some other thread gets here first and sets *_INPROGRESS, then
788  * we wait for that thread to complete so that we can ensure that
789  * all the callbacks are done when we've looped over all netstacks/moduleids.
790  *
791  * When we call the destroy function, we temporarily drop the netstack_lock
792  * held by the caller, and return true to tell the caller it needs to
793  * re-evalute the state.
794  */
795 static boolean_t
796 netstack_apply_destroy(kmutex_t *lockp, netstack_t *ns, int moduleid)
797 {
798 	netstackid_t stackid;
799 	void * netstack_module;
800 	nm_state_t *nms = &ns->netstack_m_state[moduleid];
801 	boolean_t dropped = B_FALSE;
802 
803 	ASSERT(MUTEX_HELD(lockp));
804 	mutex_enter(&ns->netstack_lock);
805 
806 	if (wait_for_nms_inprogress(ns, nms, lockp))
807 		dropped = B_TRUE;
808 
809 	if (nms->nms_flags & NSS_DESTROY_NEEDED) {
810 		nms->nms_flags &= ~NSS_DESTROY_NEEDED;
811 		nms->nms_flags |= NSS_DESTROY_INPROGRESS;
812 		DTRACE_PROBE2(netstack__destroy__inprogress,
813 		    netstack_t *, ns, int, moduleid);
814 		mutex_exit(&ns->netstack_lock);
815 		mutex_exit(lockp);
816 		dropped = B_TRUE;
817 
818 		ASSERT(ns_reg[moduleid].nr_destroy != NULL);
819 		stackid = ns->netstack_stackid;
820 		netstack_module = ns->netstack_modules[moduleid];
821 		DTRACE_PROBE2(netstack__destroy__start,
822 		    netstackid_t, stackid,
823 		    void *, netstack_module);
824 		(ns_reg[moduleid].nr_destroy)(stackid, netstack_module);
825 		DTRACE_PROBE1(netstack__destroy__end,
826 		    netstack_t *, ns);
827 
828 		mutex_enter(lockp);
829 		mutex_enter(&ns->netstack_lock);
830 		ns->netstack_modules[moduleid] = NULL;
831 		nms->nms_flags &= ~NSS_DESTROY_INPROGRESS;
832 		nms->nms_flags |= NSS_DESTROY_COMPLETED;
833 		cv_broadcast(&nms->nms_cv);
834 		DTRACE_PROBE2(netstack__destroy__completed,
835 		    netstack_t *, ns, int, moduleid);
836 		mutex_exit(&ns->netstack_lock);
837 		return (dropped);
838 	} else {
839 		mutex_exit(&ns->netstack_lock);
840 		return (dropped);
841 	}
842 }
843 
844 /*
845  * If somebody  is creating the netstack (due to a new zone being created)
846  * then we wait for them to complete. This ensures that any additional
847  * netstack_register() doesn't cause the create functions to run out of
848  * order.
849  * Note that we do not need such a global wait in the case of the shutdown
850  * and destroy callbacks, since in that case it is sufficient for both
851  * threads to set NEEDED and wait for INPROGRESS to ensure ordering.
852  * Returns true if lockp was temporarily dropped while waiting.
853  */
854 static boolean_t
855 wait_for_zone_creator(netstack_t *ns, kmutex_t *lockp)
856 {
857 	boolean_t dropped = B_FALSE;
858 
859 	mutex_enter(&ns->netstack_lock);
860 	while (ns->netstack_flags & NSF_ZONE_CREATE) {
861 		DTRACE_PROBE1(netstack__wait__zone__inprogress,
862 		    netstack_t *, ns);
863 		if (lockp != NULL) {
864 			dropped = B_TRUE;
865 			mutex_exit(lockp);
866 		}
867 		cv_wait(&ns->netstack_cv, &ns->netstack_lock);
868 		if (lockp != NULL) {
869 			/* First drop netstack_lock to preserve order */
870 			mutex_exit(&ns->netstack_lock);
871 			mutex_enter(lockp);
872 			mutex_enter(&ns->netstack_lock);
873 		}
874 	}
875 	mutex_exit(&ns->netstack_lock);
876 	return (dropped);
877 }
878 
879 /*
880  * Wait for any INPROGRESS flag to be cleared for the netstack/moduleid
881  * combination.
882  * Returns true if lockp was temporarily dropped while waiting.
883  */
884 static boolean_t
885 wait_for_nms_inprogress(netstack_t *ns, nm_state_t *nms, kmutex_t *lockp)
886 {
887 	boolean_t dropped = B_FALSE;
888 
889 	while (nms->nms_flags & NSS_ALL_INPROGRESS) {
890 		DTRACE_PROBE2(netstack__wait__nms__inprogress,
891 		    netstack_t *, ns, nm_state_t *, nms);
892 		if (lockp != NULL) {
893 			dropped = B_TRUE;
894 			mutex_exit(lockp);
895 		}
896 		cv_wait(&nms->nms_cv, &ns->netstack_lock);
897 		if (lockp != NULL) {
898 			/* First drop netstack_lock to preserve order */
899 			mutex_exit(&ns->netstack_lock);
900 			mutex_enter(lockp);
901 			mutex_enter(&ns->netstack_lock);
902 		}
903 	}
904 	return (dropped);
905 }
906 
907 /*
908  * Get the stack instance used in caller's zone.
909  * Increases the reference count, caller must do a netstack_rele.
910  * It can't be called after zone_destroy() has started.
911  */
912 netstack_t *
913 netstack_get_current(void)
914 {
915 	netstack_t *ns;
916 
917 	ns = curproc->p_zone->zone_netstack;
918 	ASSERT(ns != NULL);
919 	if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))
920 		return (NULL);
921 
922 	netstack_hold(ns);
923 
924 	return (ns);
925 }
926 
927 /*
928  * Find a stack instance given the cred.
929  * This is used by the modules to potentially allow for a future when
930  * something other than the zoneid is used to determine the stack.
931  */
932 netstack_t *
933 netstack_find_by_cred(const cred_t *cr)
934 {
935 	zoneid_t zoneid = crgetzoneid(cr);
936 
937 	/* Handle the case when cr_zone is NULL */
938 	if (zoneid == (zoneid_t)-1)
939 		zoneid = GLOBAL_ZONEID;
940 
941 	/* For performance ... */
942 	if (curproc->p_zone->zone_id == zoneid)
943 		return (netstack_get_current());
944 	else
945 		return (netstack_find_by_zoneid(zoneid));
946 }
947 
948 /*
949  * Find a stack instance given the zoneid.
950  * Increases the reference count if found; caller must do a
951  * netstack_rele().
952  *
953  * If there is no exact match then assume the shared stack instance
954  * matches.
955  *
956  * Skip the unitialized ones.
957  */
958 netstack_t *
959 netstack_find_by_zoneid(zoneid_t zoneid)
960 {
961 	netstack_t *ns;
962 	zone_t *zone;
963 
964 	zone = zone_find_by_id(zoneid);
965 
966 	if (zone == NULL)
967 		return (NULL);
968 
969 	ns = zone->zone_netstack;
970 	ASSERT(ns != NULL);
971 	if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))
972 		ns = NULL;
973 	else
974 		netstack_hold(ns);
975 
976 	zone_rele(zone);
977 	return (ns);
978 }
979 
980 /*
981  * Find a stack instance given the zoneid. Can only be called from
982  * the create callback. See the comments in zone_find_by_id_nolock why
983  * that limitation exists.
984  *
985  * Increases the reference count if found; caller must do a
986  * netstack_rele().
987  *
988  * If there is no exact match then assume the shared stack instance
989  * matches.
990  *
991  * Skip the unitialized ones.
992  */
993 netstack_t *
994 netstack_find_by_zoneid_nolock(zoneid_t zoneid)
995 {
996 	netstack_t *ns;
997 	zone_t *zone;
998 
999 	zone = zone_find_by_id_nolock(zoneid);
1000 
1001 	if (zone == NULL)
1002 		return (NULL);
1003 
1004 	ns = zone->zone_netstack;
1005 	ASSERT(ns != NULL);
1006 
1007 	if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))
1008 		ns = NULL;
1009 	else
1010 		netstack_hold(ns);
1011 
1012 	/* zone_find_by_id_nolock does not have a hold on the zone */
1013 	return (ns);
1014 }
1015 
1016 /*
1017  * Find a stack instance given the stackid with exact match?
1018  * Increases the reference count if found; caller must do a
1019  * netstack_rele().
1020  *
1021  * Skip the unitialized ones.
1022  */
1023 netstack_t *
1024 netstack_find_by_stackid(netstackid_t stackid)
1025 {
1026 	netstack_t *ns;
1027 
1028 	mutex_enter(&netstack_g_lock);
1029 	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
1030 		mutex_enter(&ns->netstack_lock);
1031 		if (ns->netstack_stackid == stackid &&
1032 		    !(ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))) {
1033 			mutex_exit(&ns->netstack_lock);
1034 			netstack_hold(ns);
1035 			mutex_exit(&netstack_g_lock);
1036 			return (ns);
1037 		}
1038 		mutex_exit(&ns->netstack_lock);
1039 	}
1040 	mutex_exit(&netstack_g_lock);
1041 	return (NULL);
1042 }
1043 
1044 void
1045 netstack_rele(netstack_t *ns)
1046 {
1047 	netstack_t **nsp;
1048 	boolean_t found;
1049 	int refcnt, numzones;
1050 	int i;
1051 
1052 	mutex_enter(&ns->netstack_lock);
1053 	ASSERT(ns->netstack_refcnt > 0);
1054 	ns->netstack_refcnt--;
1055 	/*
1056 	 * As we drop the lock additional netstack_rele()s can come in
1057 	 * and decrement the refcnt to zero and free the netstack_t.
1058 	 * Store pointers in local variables and if we were not the last
1059 	 * then don't reference the netstack_t after that.
1060 	 */
1061 	refcnt = ns->netstack_refcnt;
1062 	numzones = ns->netstack_numzones;
1063 	DTRACE_PROBE1(netstack__dec__ref, netstack_t *, ns);
1064 	mutex_exit(&ns->netstack_lock);
1065 
1066 	if (refcnt == 0 && numzones == 0) {
1067 		/*
1068 		 * Time to call the destroy functions and free up
1069 		 * the structure
1070 		 */
1071 		netstack_stack_inactive(ns);
1072 
1073 		/* Make sure nothing increased the references */
1074 		ASSERT(ns->netstack_refcnt == 0);
1075 		ASSERT(ns->netstack_numzones == 0);
1076 
1077 		/* Finally remove from list of netstacks */
1078 		mutex_enter(&netstack_g_lock);
1079 		found = B_FALSE;
1080 		for (nsp = &netstack_head; *nsp != NULL;
1081 		    nsp = &(*nsp)->netstack_next) {
1082 			if (*nsp == ns) {
1083 				*nsp = ns->netstack_next;
1084 				ns->netstack_next = NULL;
1085 				found = B_TRUE;
1086 				break;
1087 			}
1088 		}
1089 		ASSERT(found);
1090 		mutex_exit(&netstack_g_lock);
1091 
1092 		/* Make sure nothing increased the references */
1093 		ASSERT(ns->netstack_refcnt == 0);
1094 		ASSERT(ns->netstack_numzones == 0);
1095 
1096 		ASSERT(ns->netstack_flags & NSF_CLOSING);
1097 
1098 		for (i = 0; i < NS_MAX; i++) {
1099 			nm_state_t *nms = &ns->netstack_m_state[i];
1100 
1101 			cv_destroy(&nms->nms_cv);
1102 		}
1103 		mutex_destroy(&ns->netstack_lock);
1104 		cv_destroy(&ns->netstack_cv);
1105 		kmem_free(ns, sizeof (*ns));
1106 	}
1107 }
1108 
1109 void
1110 netstack_hold(netstack_t *ns)
1111 {
1112 	mutex_enter(&ns->netstack_lock);
1113 	ns->netstack_refcnt++;
1114 	ASSERT(ns->netstack_refcnt > 0);
1115 	mutex_exit(&ns->netstack_lock);
1116 	DTRACE_PROBE1(netstack__inc__ref, netstack_t *, ns);
1117 }
1118 
1119 /*
1120  * To support kstat_create_netstack() using kstat_zone_add we need
1121  * to track both
1122  *  - all zoneids that use the global/shared stack
1123  *  - all kstats that have been added for the shared stack
1124  */
1125 kstat_t *
1126 kstat_create_netstack(char *ks_module, int ks_instance, char *ks_name,
1127     char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags,
1128     netstackid_t ks_netstackid)
1129 {
1130 	kstat_t *ks;
1131 
1132 	if (ks_netstackid == GLOBAL_NETSTACKID) {
1133 		ks = kstat_create_zone(ks_module, ks_instance, ks_name,
1134 		    ks_class, ks_type, ks_ndata, ks_flags, GLOBAL_ZONEID);
1135 		if (ks != NULL)
1136 			netstack_shared_kstat_add(ks);
1137 		return (ks);
1138 	} else {
1139 		zoneid_t zoneid = ks_netstackid;
1140 
1141 		return (kstat_create_zone(ks_module, ks_instance, ks_name,
1142 		    ks_class, ks_type, ks_ndata, ks_flags, zoneid));
1143 	}
1144 }
1145 
1146 void
1147 kstat_delete_netstack(kstat_t *ks, netstackid_t ks_netstackid)
1148 {
1149 	if (ks_netstackid == GLOBAL_NETSTACKID) {
1150 		netstack_shared_kstat_remove(ks);
1151 	}
1152 	kstat_delete(ks);
1153 }
1154 
1155 static void
1156 netstack_shared_zone_add(zoneid_t zoneid)
1157 {
1158 	struct shared_zone_list *sz;
1159 	struct shared_kstat_list *sk;
1160 
1161 	sz = (struct shared_zone_list *)kmem_zalloc(sizeof (*sz), KM_SLEEP);
1162 	sz->sz_zoneid = zoneid;
1163 
1164 	/* Insert in list */
1165 	mutex_enter(&netstack_shared_lock);
1166 	sz->sz_next = netstack_shared_zones;
1167 	netstack_shared_zones = sz;
1168 
1169 	/*
1170 	 * Perform kstat_zone_add for each existing shared stack kstat.
1171 	 * Note: Holds netstack_shared_lock lock across kstat_zone_add.
1172 	 */
1173 	for (sk = netstack_shared_kstats; sk != NULL; sk = sk->sk_next) {
1174 		kstat_zone_add(sk->sk_kstat, zoneid);
1175 	}
1176 	mutex_exit(&netstack_shared_lock);
1177 }
1178 
1179 static void
1180 netstack_shared_zone_remove(zoneid_t zoneid)
1181 {
1182 	struct shared_zone_list **szp, *sz;
1183 	struct shared_kstat_list *sk;
1184 
1185 	/* Find in list */
1186 	mutex_enter(&netstack_shared_lock);
1187 	sz = NULL;
1188 	for (szp = &netstack_shared_zones; *szp != NULL;
1189 	    szp = &((*szp)->sz_next)) {
1190 		if ((*szp)->sz_zoneid == zoneid) {
1191 			sz = *szp;
1192 			break;
1193 		}
1194 	}
1195 	/* We must find it */
1196 	ASSERT(sz != NULL);
1197 	*szp = sz->sz_next;
1198 	sz->sz_next = NULL;
1199 
1200 	/*
1201 	 * Perform kstat_zone_remove for each existing shared stack kstat.
1202 	 * Note: Holds netstack_shared_lock lock across kstat_zone_remove.
1203 	 */
1204 	for (sk = netstack_shared_kstats; sk != NULL; sk = sk->sk_next) {
1205 		kstat_zone_remove(sk->sk_kstat, zoneid);
1206 	}
1207 	mutex_exit(&netstack_shared_lock);
1208 
1209 	kmem_free(sz, sizeof (*sz));
1210 }
1211 
1212 static void
1213 netstack_shared_kstat_add(kstat_t *ks)
1214 {
1215 	struct shared_zone_list *sz;
1216 	struct shared_kstat_list *sk;
1217 
1218 	sk = (struct shared_kstat_list *)kmem_zalloc(sizeof (*sk), KM_SLEEP);
1219 	sk->sk_kstat = ks;
1220 
1221 	/* Insert in list */
1222 	mutex_enter(&netstack_shared_lock);
1223 	sk->sk_next = netstack_shared_kstats;
1224 	netstack_shared_kstats = sk;
1225 
1226 	/*
1227 	 * Perform kstat_zone_add for each existing shared stack zone.
1228 	 * Note: Holds netstack_shared_lock lock across kstat_zone_add.
1229 	 */
1230 	for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
1231 		kstat_zone_add(ks, sz->sz_zoneid);
1232 	}
1233 	mutex_exit(&netstack_shared_lock);
1234 }
1235 
1236 static void
1237 netstack_shared_kstat_remove(kstat_t *ks)
1238 {
1239 	struct shared_zone_list *sz;
1240 	struct shared_kstat_list **skp, *sk;
1241 
1242 	/* Find in list */
1243 	mutex_enter(&netstack_shared_lock);
1244 	sk = NULL;
1245 	for (skp = &netstack_shared_kstats; *skp != NULL;
1246 	    skp = &((*skp)->sk_next)) {
1247 		if ((*skp)->sk_kstat == ks) {
1248 			sk = *skp;
1249 			break;
1250 		}
1251 	}
1252 	/* Must find it */
1253 	ASSERT(sk != NULL);
1254 	*skp = sk->sk_next;
1255 	sk->sk_next = NULL;
1256 
1257 	/*
1258 	 * Perform kstat_zone_remove for each existing shared stack kstat.
1259 	 * Note: Holds netstack_shared_lock lock across kstat_zone_remove.
1260 	 */
1261 	for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
1262 		kstat_zone_remove(ks, sz->sz_zoneid);
1263 	}
1264 	mutex_exit(&netstack_shared_lock);
1265 	kmem_free(sk, sizeof (*sk));
1266 }
1267 
1268 /*
1269  * If a zoneid is part of the shared zone, return true
1270  */
1271 static boolean_t
1272 netstack_find_shared_zoneid(zoneid_t zoneid)
1273 {
1274 	struct shared_zone_list *sz;
1275 
1276 	mutex_enter(&netstack_shared_lock);
1277 	for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
1278 		if (sz->sz_zoneid == zoneid) {
1279 			mutex_exit(&netstack_shared_lock);
1280 			return (B_TRUE);
1281 		}
1282 	}
1283 	mutex_exit(&netstack_shared_lock);
1284 	return (B_FALSE);
1285 }
1286 
1287 /*
1288  * Hide the fact that zoneids and netstackids are allocated from
1289  * the same space in the current implementation.
1290  * We currently do not check that the stackid/zoneids are valid, since there
1291  * is no need for that. But this should only be done for ids that are
1292  * valid.
1293  */
1294 zoneid_t
1295 netstackid_to_zoneid(netstackid_t stackid)
1296 {
1297 	return (stackid);
1298 }
1299 
1300 netstackid_t
1301 zoneid_to_netstackid(zoneid_t zoneid)
1302 {
1303 	if (netstack_find_shared_zoneid(zoneid))
1304 		return (GLOBAL_ZONEID);
1305 	else
1306 		return (zoneid);
1307 }
1308 
1309 zoneid_t
1310 netstack_get_zoneid(netstack_t *ns)
1311 {
1312 	return (netstackid_to_zoneid(ns->netstack_stackid));
1313 }
1314 
1315 /*
1316  * Simplistic support for walking all the handles.
1317  * Example usage:
1318  *	netstack_handle_t nh;
1319  *	netstack_t *ns;
1320  *
1321  *	netstack_next_init(&nh);
1322  *	while ((ns = netstack_next(&nh)) != NULL) {
1323  *		do something;
1324  *		netstack_rele(ns);
1325  *	}
1326  *	netstack_next_fini(&nh);
1327  */
1328 void
1329 netstack_next_init(netstack_handle_t *handle)
1330 {
1331 	*handle = 0;
1332 }
1333 
1334 /* ARGSUSED */
1335 void
1336 netstack_next_fini(netstack_handle_t *handle)
1337 {
1338 }
1339 
1340 netstack_t *
1341 netstack_next(netstack_handle_t *handle)
1342 {
1343 	netstack_t *ns;
1344 	int i, end;
1345 
1346 	end = *handle;
1347 	/* Walk skipping *handle number of instances */
1348 
1349 	/* Look if there is a matching stack instance */
1350 	mutex_enter(&netstack_g_lock);
1351 	ns = netstack_head;
1352 	for (i = 0; i < end; i++) {
1353 		if (ns == NULL)
1354 			break;
1355 		ns = ns->netstack_next;
1356 	}
1357 	/* skip those with that aren't really here */
1358 	while (ns != NULL) {
1359 		mutex_enter(&ns->netstack_lock);
1360 		if ((ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING)) == 0) {
1361 			mutex_exit(&ns->netstack_lock);
1362 			break;
1363 		}
1364 		mutex_exit(&ns->netstack_lock);
1365 		end++;
1366 		ns = ns->netstack_next;
1367 	}
1368 	if (ns != NULL) {
1369 		*handle = end + 1;
1370 		netstack_hold(ns);
1371 	}
1372 	mutex_exit(&netstack_g_lock);
1373 	return (ns);
1374 }
1375