xref: /illumos-gate/usr/src/uts/common/io/mac/mac.c (revision 56f33205c9ed776c3c909e07d52e94610a675740)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * MAC Services Module
29  *
30  * The GLDv3 framework locking -  The MAC layer
31  * --------------------------------------------
32  *
33  * The MAC layer is central to the GLD framework and can provide the locking
34  * framework needed for itself and for the use of MAC clients. MAC end points
35  * are fairly disjoint and don't share a lot of state. So a coarse grained
36  * multi-threading scheme is to single thread all create/modify/delete or set
37  * type of control operations on a per mac end point while allowing data threads
38  * concurrently.
39  *
40  * Control operations (set) that modify a mac end point are always serialized on
41  * a per mac end point basis. We have at most 1 such thread per mac end point
42  * at a time.
43  *
44  * All other operations that are not serialized are essentially multi-threaded.
45  * For example a control operation (get) like getting statistics which may not
46  * care about reading values atomically or data threads sending or receiving
47  * data. Mostly these type of operations don't modify the control state. Any
48  * state these operations care about are protected using traditional locks.
49  *
50  * The perimeter only serializes serial operations. It does not imply there
51  * aren't any other concurrent operations. However a serialized operation may
52  * sometimes need to make sure it is the only thread. In this case it needs
53  * to use reference counting mechanisms to cv_wait until any current data
54  * threads are done.
55  *
56  * The mac layer itself does not hold any locks across a call to another layer.
57  * The perimeter is however held across a down call to the driver to make the
58  * whole control operation atomic with respect to other control operations.
59  * Also the data path and get type control operations may proceed concurrently.
60  * These operations synchronize with the single serial operation on a given mac
61  * end point using regular locks. The perimeter ensures that conflicting
62  * operations like say a mac_multicast_add and a mac_multicast_remove on the
63  * same mac end point don't interfere with each other and also ensures that the
64  * changes in the mac layer and the call to the underlying driver to say add a
65  * multicast address are done atomically without interference from a thread
66  * trying to delete the same address.
67  *
68  * For example, consider
69  * mac_multicast_add()
70  * {
71  *	mac_perimeter_enter();	serialize all control operations
72  *
73  *	grab list lock		protect against access by data threads
74  *	add to list
75  *	drop list lock
76  *
77  *	call driver's mi_multicst
78  *
79  *	mac_perimeter_exit();
80  * }
81  *
82  * To lessen the number of serialization locks and simplify the lock hierarchy,
83  * we serialize all the control operations on a per mac end point by using a
84  * single serialization lock called the perimeter. We allow recursive entry into
85  * the perimeter to facilitate use of this mechanism by both the mac client and
86  * the MAC layer itself.
87  *
88  * MAC client means an entity that does an operation on a mac handle
89  * obtained from a mac_open/mac_client_open. Similarly MAC driver means
90  * an entity that does an operation on a mac handle obtained from a
91  * mac_register. An entity could be both client and driver but on different
92  * handles eg. aggr. and should only make the corresponding mac interface calls
93  * i.e. mac driver interface or mac client interface as appropriate for that
94  * mac handle.
95  *
96  * General rules.
97  * -------------
98  *
99  * R1. The lock order of upcall threads is naturally opposite to downcall
100  * threads. Hence upcalls must not hold any locks across layers for fear of
101  * recursive lock enter and lock order violation. This applies to all layers.
102  *
103  * R2. The perimeter is just another lock. Since it is held in the down
104  * direction, acquiring the perimeter in an upcall is prohibited as it would
105  * cause a deadlock. This applies to all layers.
106  *
107  * Note that upcalls that need to grab the mac perimeter (for example
108  * mac_notify upcalls) can still achieve that by posting the request to a
109  * thread, which can then grab all the required perimeters and locks in the
110  * right global order. Note that in the above example the mac layer itself
111  * won't grab the mac perimeter in the mac_notify upcall, instead the upcall
112  * to the client must do that. Please see the aggr code for an example.
113  *
114  * MAC client rules
115  * ----------------
116  *
117  * R3. A MAC client may use the MAC provided perimeter facility to serialize
118  * control operations on a per mac end point. It does this by acquiring
119  * and holding the perimeter across a sequence of calls to the mac layer.
120  * This ensures atomicity across the entire block of mac calls. In this
121  * model the MAC client must not hold any client locks across the calls to
122  * the mac layer. This model is the preferred solution.
123  *
124  * R4. However if a MAC client has a lot of global state across all mac end
125  * points the per mac end point serialization may not be sufficient. In this
126  * case the client may choose to use global locks or use its own serialization.
127  * To avoid deadlocks, these client layer locks held across the mac calls
128  * in the control path must never be acquired by the data path for the reason
129  * mentioned below.
130  *
131  * (Assume that a control operation that holds a client lock blocks in the
132  * mac layer waiting for upcall reference counts to drop to zero. If an upcall
133  * data thread that holds this reference count, tries to acquire the same
134  * client lock subsequently it will deadlock).
135  *
136  * A MAC client may follow either the R3 model or the R4 model, but can't
137  * mix both. In the former, the hierarchy is Perim -> client locks, but in
138  * the latter it is client locks -> Perim.
139  *
140  * R5. MAC clients must make MAC calls (excluding data calls) in a cv_wait'able
141  * context since they may block while trying to acquire the perimeter.
142  * In addition some calls may block waiting for upcall refcnts to come down to
143  * zero.
144  *
145  * R6. MAC clients must make sure that they are single threaded and all threads
146  * from the top (in particular data threads) have finished before calling
147  * mac_client_close. The MAC framework does not track the number of client
148  * threads using the mac client handle. Also mac clients must make sure
149  * they have undone all the control operations before calling mac_client_close.
150  * For example mac_unicast_remove/mac_multicast_remove to undo the corresponding
151  * mac_unicast_add/mac_multicast_add.
152  *
153  * MAC framework rules
154  * -------------------
155  *
156  * R7. The mac layer itself must not hold any mac layer locks (except the mac
157  * perimeter) across a call to any other layer from the mac layer. The call to
158  * any other layer could be via mi_* entry points, classifier entry points into
159  * the driver or via upcall pointers into layers above. The mac perimeter may
160  * be acquired or held only in the down direction, for e.g. when calling into
161  * a mi_* driver entry point to provide atomicity of the operation.
162  *
163  * R8. Since it is not guaranteed (see R14) that drivers won't hold locks across
164  * mac driver interfaces, the MAC layer must provide a cut out for control
165  * interfaces like upcall notifications and start them in a separate thread.
166  *
167  * R9. Note that locking order also implies a plumbing order. For example
168  * VNICs are allowed to be created over aggrs, but not vice-versa. An attempt
169  * to plumb in any other order must be failed at mac_open time, otherwise it
170  * could lead to deadlocks due to inverse locking order.
171  *
172  * R10. MAC driver interfaces must not block since the driver could call them
173  * in interrupt context.
174  *
175  * R11. Walkers must preferably not hold any locks while calling walker
176  * callbacks. Instead these can operate on reference counts. In simple
177  * callbacks it may be ok to hold a lock and call the callbacks, but this is
178  * harder to maintain in the general case of arbitrary callbacks.
179  *
180  * R12. The MAC layer must protect upcall notification callbacks using reference
181  * counts rather than holding locks across the callbacks.
182  *
183  * R13. Given the variety of drivers, it is preferable if the MAC layer can make
184  * sure that any pointers (such as mac ring pointers) it passes to the driver
185  * remain valid until mac unregister time. Currently the mac layer achieves
186  * this by using generation numbers for rings and freeing the mac rings only
187  * at unregister time.  The MAC layer must provide a layer of indirection and
188  * must not expose underlying driver rings or driver data structures/pointers
189  * directly to MAC clients.
190  *
191  * MAC driver rules
192  * ----------------
193  *
194  * R14. It would be preferable if MAC drivers don't hold any locks across any
195  * mac call. However at a minimum they must not hold any locks across data
196  * upcalls. They must also make sure that all references to mac data structures
197  * are cleaned up and that it is single threaded at mac_unregister time.
198  *
199  * R15. MAC driver interfaces don't block and so the action may be done
200  * asynchronously in a separate thread as for example handling notifications.
201  * The driver must not assume that the action is complete when the call
202  * returns.
203  *
204  * R16. Drivers must maintain a generation number per Rx ring, and pass it
205  * back to mac_rx_ring(); They are expected to increment the generation
206  * number whenever the ring's stop routine is invoked.
207  * See comments in mac_rx_ring();
208  *
209  * R17. Similarly mi_stop is another synchronization point and the driver must
210  * ensure that all upcalls are done and there won't be any future upcall
211  * before returning from mi_stop.
212  *
213  * R18. The driver may assume that all set/modify control operations via
214  * the mi_* entry points are single threaded on a per mac end point.
215  *
216  * Lock and Perimeter hierarchy scenarios
217  * ---------------------------------------
218  *
219  * i_mac_impl_lock -> mi_rw_lock -> srs_lock -> s_ring_lock[i_mac_tx_srs_notify]
220  *
221  * ft_lock -> fe_lock [mac_flow_lookup]
222  *
223  * mi_rw_lock -> fe_lock [mac_bcast_send]
224  *
225  * srs_lock -> mac_bw_lock [mac_rx_srs_drain_bw]
226  *
227  * cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind]
228  *
229  * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename]
230  *
231  * Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac
232  * client to driver. In the case of clients that explicitly use the mac provided
233  * perimeter mechanism for its serialization, the hierarchy is
234  * Perimeter -> mac layer locks, since the client never holds any locks across
235  * the mac calls. In the case of clients that use its own locks the hierarchy
236  * is Client locks -> Mac Perim -> Mac layer locks. The client never explicitly
237  * calls mac_perim_enter/exit in this case.
238  *
239  * Subflow creation rules
240  * ---------------------------
241  * o In case of a user specified cpulist present on underlying link and flows,
242  * the flows cpulist must be a subset of the underlying link.
243  * o In case of a user specified fanout mode present on link and flow, the
244  * subflow fanout count has to be less than or equal to that of the
245  * underlying link. The cpu-bindings for the subflows will be a subset of
246  * the underlying link.
247  * o In case no cpulist is specified on either underlying link or flow, the
248  * underlying link relies on a  MAC tunable to provide out of box fanout.
249  * The subflow will have no cpulist (the subflow will be unbound)
250  * o In case no cpulist is specified on the underlying link, a subflow can
251  * carry  either a user-specified cpulist or fanout count. The cpu-bindings
252  * for the subflow will not adhere to restriction that they need to be subset
253  * of the underlying link.
254  * o In case where the underlying link is carrying either a user specified
255  * cpulist or fanout mode and for a unspecified subflow, the subflow will be
256  * created unbound.
257  * o While creating unbound subflows, bandwidth mode changes attempt to
258  * figure a right fanout count. In such cases the fanout count will override
259  * the unbound cpu-binding behavior.
260  * o In addition to this, while cycling between flow and link properties, we
261  * impose a restriction that if a link property has a subflow with
262  * user-specified attributes, we will not allow changing the link property.
263  * The administrator needs to reset all the user specified properties for the
264  * subflows before attempting a link property change.
265  * Some of the above rules can be overridden by specifying additional command
266  * line options while creating or modifying link or subflow properties.
267  */
268 
269 #include <sys/types.h>
270 #include <sys/conf.h>
271 #include <sys/id_space.h>
272 #include <sys/esunddi.h>
273 #include <sys/stat.h>
274 #include <sys/mkdev.h>
275 #include <sys/stream.h>
276 #include <sys/strsun.h>
277 #include <sys/strsubr.h>
278 #include <sys/dlpi.h>
279 #include <sys/modhash.h>
280 #include <sys/mac_provider.h>
281 #include <sys/mac_client_impl.h>
282 #include <sys/mac_soft_ring.h>
283 #include <sys/mac_impl.h>
284 #include <sys/mac.h>
285 #include <sys/dls.h>
286 #include <sys/dld.h>
287 #include <sys/modctl.h>
288 #include <sys/fs/dv_node.h>
289 #include <sys/thread.h>
290 #include <sys/proc.h>
291 #include <sys/callb.h>
292 #include <sys/cpuvar.h>
293 #include <sys/atomic.h>
294 #include <sys/bitmap.h>
295 #include <sys/sdt.h>
296 #include <sys/mac_flow.h>
297 #include <sys/ddi_intr_impl.h>
298 #include <sys/disp.h>
299 #include <sys/sdt.h>
300 #include <sys/vnic.h>
301 #include <sys/vnic_impl.h>
302 #include <sys/vlan.h>
303 #include <inet/ip.h>
304 #include <inet/ip6.h>
305 #include <sys/exacct.h>
306 #include <sys/exacct_impl.h>
307 #include <inet/nd.h>
308 #include <sys/ethernet.h>
309 
310 #define	IMPL_HASHSZ	67	/* prime */
311 
312 kmem_cache_t	*i_mac_impl_cachep;
313 mod_hash_t		*i_mac_impl_hash;
314 krwlock_t		i_mac_impl_lock;
315 uint_t			i_mac_impl_count;
316 static kmem_cache_t	*mac_ring_cache;
317 static id_space_t	*minor_ids;
318 static uint32_t		minor_count;
319 
320 /*
321  * Logging stuff. Perhaps mac_logging_interval could be broken into
322  * mac_flow_log_interval and mac_link_log_interval if we want to be
323  * able to schedule them differently.
324  */
325 uint_t			mac_logging_interval;
326 boolean_t		mac_flow_log_enable;
327 boolean_t		mac_link_log_enable;
328 timeout_id_t		mac_logging_timer;
329 
330 /* for debugging, see MAC_DBG_PRT() in mac_impl.h */
331 int mac_dbg = 0;
332 
333 #define	MACTYPE_KMODDIR	"mac"
334 #define	MACTYPE_HASHSZ	67
335 static mod_hash_t	*i_mactype_hash;
336 /*
337  * i_mactype_lock synchronizes threads that obtain references to mactype_t
338  * structures through i_mactype_getplugin().
339  */
340 static kmutex_t		i_mactype_lock;
341 
342 /*
343  * mac_tx_percpu_cnt
344  *
345  * Number of per cpu locks per mac_client_impl_t. Used by the transmit side
346  * in mac_tx to reduce lock contention. This is sized at boot time in mac_init.
347  * mac_tx_percpu_cnt_max is settable in /etc/system and must be a power of 2.
348  * Per cpu locks may be disabled by setting mac_tx_percpu_cnt_max to 1.
349  */
350 int mac_tx_percpu_cnt;
351 int mac_tx_percpu_cnt_max = 128;
352 
353 /*
354  * Call back functions for the bridge module.  These are guaranteed to be valid
355  * when holding a reference on a link or when holding mip->mi_bridge_lock and
356  * mi_bridge_link is non-NULL.
357  */
358 mac_bridge_tx_t mac_bridge_tx_cb;
359 mac_bridge_rx_t mac_bridge_rx_cb;
360 mac_bridge_ref_t mac_bridge_ref_cb;
361 mac_bridge_ls_t mac_bridge_ls_cb;
362 
363 static int i_mac_constructor(void *, void *, int);
364 static void i_mac_destructor(void *, void *);
365 static int i_mac_ring_ctor(void *, void *, int);
366 static void i_mac_ring_dtor(void *, void *);
367 static mblk_t *mac_rx_classify(mac_impl_t *, mac_resource_handle_t, mblk_t *);
368 void mac_tx_client_flush(mac_client_impl_t *);
369 void mac_tx_client_block(mac_client_impl_t *);
370 static void mac_rx_ring_quiesce(mac_ring_t *, uint_t);
371 static int mac_start_group_and_rings(mac_group_t *);
372 static void mac_stop_group_and_rings(mac_group_t *);
373 
374 /*
375  * Module initialization functions.
376  */
377 
378 void
379 mac_init(void)
380 {
381 	mac_tx_percpu_cnt = ((boot_max_ncpus == -1) ? max_ncpus :
382 	    boot_max_ncpus);
383 
384 	/* Upper bound is mac_tx_percpu_cnt_max */
385 	if (mac_tx_percpu_cnt > mac_tx_percpu_cnt_max)
386 		mac_tx_percpu_cnt = mac_tx_percpu_cnt_max;
387 
388 	if (mac_tx_percpu_cnt < 1) {
389 		/* Someone set max_tx_percpu_cnt_max to 0 or less */
390 		mac_tx_percpu_cnt = 1;
391 	}
392 
393 	ASSERT(mac_tx_percpu_cnt >= 1);
394 	mac_tx_percpu_cnt = (1 << highbit(mac_tx_percpu_cnt - 1));
395 	/*
396 	 * Make it of the form 2**N - 1 in the range
397 	 * [0 .. mac_tx_percpu_cnt_max - 1]
398 	 */
399 	mac_tx_percpu_cnt--;
400 
401 	i_mac_impl_cachep = kmem_cache_create("mac_impl_cache",
402 	    sizeof (mac_impl_t), 0, i_mac_constructor, i_mac_destructor,
403 	    NULL, NULL, NULL, 0);
404 	ASSERT(i_mac_impl_cachep != NULL);
405 
406 	mac_ring_cache = kmem_cache_create("mac_ring_cache",
407 	    sizeof (mac_ring_t), 0, i_mac_ring_ctor, i_mac_ring_dtor, NULL,
408 	    NULL, NULL, 0);
409 	ASSERT(mac_ring_cache != NULL);
410 
411 	i_mac_impl_hash = mod_hash_create_extended("mac_impl_hash",
412 	    IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor,
413 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
414 	rw_init(&i_mac_impl_lock, NULL, RW_DEFAULT, NULL);
415 
416 	mac_flow_init();
417 	mac_soft_ring_init();
418 	mac_bcast_init();
419 	mac_client_init();
420 
421 	i_mac_impl_count = 0;
422 
423 	i_mactype_hash = mod_hash_create_extended("mactype_hash",
424 	    MACTYPE_HASHSZ,
425 	    mod_hash_null_keydtor, mod_hash_null_valdtor,
426 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
427 
428 	/*
429 	 * Allocate an id space to manage minor numbers. The range of the
430 	 * space will be from MAC_MAX_MINOR+1 to MAC_PRIVATE_MINOR-1.  This
431 	 * leaves half of the 32-bit minors available for driver private use.
432 	 */
433 	minor_ids = id_space_create("mac_minor_ids", MAC_MAX_MINOR+1,
434 	    MAC_PRIVATE_MINOR-1);
435 	ASSERT(minor_ids != NULL);
436 	minor_count = 0;
437 
438 	/* Let's default to 20 seconds */
439 	mac_logging_interval = 20;
440 	mac_flow_log_enable = B_FALSE;
441 	mac_link_log_enable = B_FALSE;
442 	mac_logging_timer = 0;
443 }
444 
445 int
446 mac_fini(void)
447 {
448 	if (i_mac_impl_count > 0 || minor_count > 0)
449 		return (EBUSY);
450 
451 	id_space_destroy(minor_ids);
452 	mac_flow_fini();
453 
454 	mod_hash_destroy_hash(i_mac_impl_hash);
455 	rw_destroy(&i_mac_impl_lock);
456 
457 	mac_client_fini();
458 	kmem_cache_destroy(mac_ring_cache);
459 
460 	mod_hash_destroy_hash(i_mactype_hash);
461 	mac_soft_ring_finish();
462 	return (0);
463 }
464 
465 /*
466  * Initialize a GLDv3 driver's device ops.  A driver that manages its own ops
467  * (e.g. softmac) may pass in a NULL ops argument.
468  */
469 void
470 mac_init_ops(struct dev_ops *ops, const char *name)
471 {
472 	major_t major = ddi_name_to_major((char *)name);
473 
474 	/*
475 	 * By returning on error below, we are not letting the driver continue
476 	 * in an undefined context.  The mac_register() function will faill if
477 	 * DN_GLDV3_DRIVER isn't set.
478 	 */
479 	if (major == DDI_MAJOR_T_NONE)
480 		return;
481 	LOCK_DEV_OPS(&devnamesp[major].dn_lock);
482 	devnamesp[major].dn_flags |= (DN_GLDV3_DRIVER | DN_NETWORK_DRIVER);
483 	UNLOCK_DEV_OPS(&devnamesp[major].dn_lock);
484 	if (ops != NULL)
485 		dld_init_ops(ops, name);
486 }
487 
/*
 * Counterpart of mac_init_ops(): passes the driver's device ops on to
 * dld_fini_ops().
 */
void
mac_fini_ops(struct dev_ops *ops)
{
	dld_fini_ops(ops);
}
493 
494 /*ARGSUSED*/
495 static int
496 i_mac_constructor(void *buf, void *arg, int kmflag)
497 {
498 	mac_impl_t	*mip = buf;
499 
500 	bzero(buf, sizeof (mac_impl_t));
501 
502 	mip->mi_linkstate = LINK_STATE_UNKNOWN;
503 
504 	mutex_init(&mip->mi_lock, NULL, MUTEX_DRIVER, NULL);
505 	rw_init(&mip->mi_rw_lock, NULL, RW_DRIVER, NULL);
506 	mutex_init(&mip->mi_notify_lock, NULL, MUTEX_DRIVER, NULL);
507 	mutex_init(&mip->mi_promisc_lock, NULL, MUTEX_DRIVER, NULL);
508 	mutex_init(&mip->mi_ring_lock, NULL, MUTEX_DEFAULT, NULL);
509 
510 	mip->mi_notify_cb_info.mcbi_lockp = &mip->mi_notify_lock;
511 	cv_init(&mip->mi_notify_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
512 	mip->mi_promisc_cb_info.mcbi_lockp = &mip->mi_promisc_lock;
513 	cv_init(&mip->mi_promisc_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
514 
515 	mutex_init(&mip->mi_bridge_lock, NULL, MUTEX_DEFAULT, NULL);
516 
517 	return (0);
518 }
519 
520 /*ARGSUSED*/
521 static void
522 i_mac_destructor(void *buf, void *arg)
523 {
524 	mac_impl_t	*mip = buf;
525 	mac_cb_info_t	*mcbi;
526 
527 	ASSERT(mip->mi_ref == 0);
528 	ASSERT(mip->mi_active == 0);
529 	ASSERT(mip->mi_linkstate == LINK_STATE_UNKNOWN);
530 	ASSERT(mip->mi_devpromisc == 0);
531 	ASSERT(mip->mi_ksp == NULL);
532 	ASSERT(mip->mi_kstat_count == 0);
533 	ASSERT(mip->mi_nclients == 0);
534 	ASSERT(mip->mi_nactiveclients == 0);
535 	ASSERT(mip->mi_single_active_client == NULL);
536 	ASSERT(mip->mi_state_flags == 0);
537 	ASSERT(mip->mi_factory_addr == NULL);
538 	ASSERT(mip->mi_factory_addr_num == 0);
539 	ASSERT(mip->mi_default_tx_ring == NULL);
540 
541 	mcbi = &mip->mi_notify_cb_info;
542 	ASSERT(mcbi->mcbi_del_cnt == 0 && mcbi->mcbi_walker_cnt == 0);
543 	ASSERT(mip->mi_notify_bits == 0);
544 	ASSERT(mip->mi_notify_thread == NULL);
545 	ASSERT(mcbi->mcbi_lockp == &mip->mi_notify_lock);
546 	mcbi->mcbi_lockp = NULL;
547 
548 	mcbi = &mip->mi_promisc_cb_info;
549 	ASSERT(mcbi->mcbi_del_cnt == 0 && mip->mi_promisc_list == NULL);
550 	ASSERT(mip->mi_promisc_list == NULL);
551 	ASSERT(mcbi->mcbi_lockp == &mip->mi_promisc_lock);
552 	mcbi->mcbi_lockp = NULL;
553 
554 	ASSERT(mip->mi_bcast_ngrps == 0 && mip->mi_bcast_grp == NULL);
555 	ASSERT(mip->mi_perim_owner == NULL && mip->mi_perim_ocnt == 0);
556 
557 	mutex_destroy(&mip->mi_lock);
558 	rw_destroy(&mip->mi_rw_lock);
559 
560 	mutex_destroy(&mip->mi_promisc_lock);
561 	cv_destroy(&mip->mi_promisc_cb_info.mcbi_cv);
562 	mutex_destroy(&mip->mi_notify_lock);
563 	cv_destroy(&mip->mi_notify_cb_info.mcbi_cv);
564 	mutex_destroy(&mip->mi_ring_lock);
565 
566 	ASSERT(mip->mi_bridge_link == NULL);
567 }
568 
569 /* ARGSUSED */
570 static int
571 i_mac_ring_ctor(void *buf, void *arg, int kmflag)
572 {
573 	mac_ring_t *ring = (mac_ring_t *)buf;
574 
575 	bzero(ring, sizeof (mac_ring_t));
576 	cv_init(&ring->mr_cv, NULL, CV_DEFAULT, NULL);
577 	mutex_init(&ring->mr_lock, NULL, MUTEX_DEFAULT, NULL);
578 	ring->mr_state = MR_FREE;
579 	return (0);
580 }
581 
582 /* ARGSUSED */
583 static void
584 i_mac_ring_dtor(void *buf, void *arg)
585 {
586 	mac_ring_t *ring = (mac_ring_t *)buf;
587 
588 	cv_destroy(&ring->mr_cv);
589 	mutex_destroy(&ring->mr_lock);
590 }
591 
592 /*
593  * Common functions to do mac callback addition and deletion. Currently this is
594  * used by promisc callbacks and notify callbacks. List addition and deletion
595  * need to take care of list walkers. List walkers in general, can't hold list
596  * locks and make upcall callbacks due to potential lock order and recursive
597  * reentry issues. Instead list walkers increment the list walker count to mark
598  * the presence of a walker thread. Addition can be carefully done to ensure
599  * that the list walker always sees either the old list or the new list.
600  * However the deletion can't be done while the walker is active, instead the
601  * deleting thread simply marks the entry as logically deleted. The last walker
602  * physically deletes and frees up the logically deleted entries when the walk
603  * is complete.
604  */
605 void
606 mac_callback_add(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
607     mac_cb_t *mcb_elem)
608 {
609 	mac_cb_t	*p;
610 	mac_cb_t	**pp;
611 
612 	/* Verify it is not already in the list */
613 	for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
614 		if (p == mcb_elem)
615 			break;
616 	}
617 	VERIFY(p == NULL);
618 
619 	/*
620 	 * Add it to the head of the callback list. The membar ensures that
621 	 * the following list pointer manipulations reach global visibility
622 	 * in exactly the program order below.
623 	 */
624 	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
625 
626 	mcb_elem->mcb_nextp = *mcb_head;
627 	membar_producer();
628 	*mcb_head = mcb_elem;
629 }
630 
631 /*
632  * Mark the entry as logically deleted. If there aren't any walkers unlink
633  * from the list. In either case return the corresponding status.
634  */
635 boolean_t
636 mac_callback_remove(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
637     mac_cb_t *mcb_elem)
638 {
639 	mac_cb_t	*p;
640 	mac_cb_t	**pp;
641 
642 	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
643 	/*
644 	 * Search the callback list for the entry to be removed
645 	 */
646 	for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
647 		if (p == mcb_elem)
648 			break;
649 	}
650 	VERIFY(p != NULL);
651 
652 	/*
653 	 * If there are walkers just mark it as deleted and the last walker
654 	 * will remove from the list and free it.
655 	 */
656 	if (mcbi->mcbi_walker_cnt != 0) {
657 		p->mcb_flags |= MCB_CONDEMNED;
658 		mcbi->mcbi_del_cnt++;
659 		return (B_FALSE);
660 	}
661 
662 	ASSERT(mcbi->mcbi_del_cnt == 0);
663 	*pp = p->mcb_nextp;
664 	p->mcb_nextp = NULL;
665 	return (B_TRUE);
666 }
667 
668 /*
669  * Wait for all pending callback removals to be completed
670  */
671 void
672 mac_callback_remove_wait(mac_cb_info_t *mcbi)
673 {
674 	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
675 	while (mcbi->mcbi_del_cnt != 0) {
676 		DTRACE_PROBE1(need_wait, mac_cb_info_t *, mcbi);
677 		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
678 	}
679 }
680 
681 /*
682  * The last mac callback walker does the cleanup. Walk the list and unlik
683  * all the logically deleted entries and construct a temporary list of
684  * removed entries. Return the list of removed entries to the caller.
685  */
686 mac_cb_t *
687 mac_callback_walker_cleanup(mac_cb_info_t *mcbi, mac_cb_t **mcb_head)
688 {
689 	mac_cb_t	*p;
690 	mac_cb_t	**pp;
691 	mac_cb_t	*rmlist = NULL;		/* List of removed elements */
692 	int	cnt = 0;
693 
694 	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
695 	ASSERT(mcbi->mcbi_del_cnt != 0 && mcbi->mcbi_walker_cnt == 0);
696 
697 	pp = mcb_head;
698 	while (*pp != NULL) {
699 		if ((*pp)->mcb_flags & MCB_CONDEMNED) {
700 			p = *pp;
701 			*pp = p->mcb_nextp;
702 			p->mcb_nextp = rmlist;
703 			rmlist = p;
704 			cnt++;
705 			continue;
706 		}
707 		pp = &(*pp)->mcb_nextp;
708 	}
709 
710 	ASSERT(mcbi->mcbi_del_cnt == cnt);
711 	mcbi->mcbi_del_cnt = 0;
712 	return (rmlist);
713 }
714 
715 boolean_t
716 mac_callback_lookup(mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
717 {
718 	mac_cb_t	*mcb;
719 
720 	/* Verify it is not already in the list */
721 	for (mcb = *mcb_headp; mcb != NULL; mcb = mcb->mcb_nextp) {
722 		if (mcb == mcb_elem)
723 			return (B_TRUE);
724 	}
725 
726 	return (B_FALSE);
727 }
728 
729 boolean_t
730 mac_callback_find(mac_cb_info_t *mcbi, mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
731 {
732 	boolean_t	found;
733 
734 	mutex_enter(mcbi->mcbi_lockp);
735 	found = mac_callback_lookup(mcb_headp, mcb_elem);
736 	mutex_exit(mcbi->mcbi_lockp);
737 
738 	return (found);
739 }
740 
741 /* Free the list of removed callbacks */
742 void
743 mac_callback_free(mac_cb_t *rmlist)
744 {
745 	mac_cb_t	*mcb;
746 	mac_cb_t	*mcb_next;
747 
748 	for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
749 		mcb_next = mcb->mcb_nextp;
750 		kmem_free(mcb->mcb_objp, mcb->mcb_objsize);
751 	}
752 }
753 
754 /*
755  * The promisc callbacks are in 2 lists, one off the 'mip' and another off the
756  * 'mcip' threaded by mpi_mi_link and mpi_mci_link respectively. However there
757  * is only a single shared total walker count, and an entry can't be physically
758  * unlinked if a walker is active on either list. The last walker does this
759  * cleanup of logically deleted entries.
760  */
761 void
762 i_mac_promisc_walker_cleanup(mac_impl_t *mip)
763 {
764 	mac_cb_t	*rmlist;
765 	mac_cb_t	*mcb;
766 	mac_cb_t	*mcb_next;
767 	mac_promisc_impl_t	*mpip;
768 
769 	/*
770 	 * Construct a temporary list of deleted callbacks by walking the
771 	 * the mi_promisc_list. Then for each entry in the temporary list,
772 	 * remove it from the mci_promisc_list and free the entry.
773 	 */
774 	rmlist = mac_callback_walker_cleanup(&mip->mi_promisc_cb_info,
775 	    &mip->mi_promisc_list);
776 
777 	for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
778 		mcb_next = mcb->mcb_nextp;
779 		mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
780 		VERIFY(mac_callback_remove(&mip->mi_promisc_cb_info,
781 		    &mpip->mpi_mcip->mci_promisc_list, &mpip->mpi_mci_link));
782 		mcb->mcb_flags = 0;
783 		mcb->mcb_nextp = NULL;
784 		kmem_cache_free(mac_promisc_impl_cache, mpip);
785 	}
786 }
787 
788 void
789 i_mac_notify(mac_impl_t *mip, mac_notify_type_t type)
790 {
791 	mac_cb_info_t	*mcbi;
792 
793 	/*
794 	 * Signal the notify thread even after mi_ref has become zero and
795 	 * mi_disabled is set. The synchronization with the notify thread
796 	 * happens in mac_unregister and that implies the driver must make
797 	 * sure it is single-threaded (with respect to mac calls) and that
798 	 * all pending mac calls have returned before it calls mac_unregister
799 	 */
800 	rw_enter(&i_mac_impl_lock, RW_READER);
801 	if (mip->mi_state_flags & MIS_DISABLED)
802 		goto exit;
803 
804 	/*
805 	 * Guard against incorrect notifications.  (Running a newer
806 	 * mac client against an older implementation?)
807 	 */
808 	if (type >= MAC_NNOTE)
809 		goto exit;
810 
811 	mcbi = &mip->mi_notify_cb_info;
812 	mutex_enter(mcbi->mcbi_lockp);
813 	mip->mi_notify_bits |= (1 << type);
814 	cv_broadcast(&mcbi->mcbi_cv);
815 	mutex_exit(mcbi->mcbi_lockp);
816 
817 exit:
818 	rw_exit(&i_mac_impl_lock);
819 }
820 
821 /*
822  * Mac serialization primitives. Please see the block comment at the
823  * top of the file.
824  */
825 void
826 i_mac_perim_enter(mac_impl_t *mip)
827 {
828 	mac_client_impl_t	*mcip;
829 
830 	if (mip->mi_state_flags & MIS_IS_VNIC) {
831 		/*
832 		 * This is a VNIC. Return the lower mac since that is what
833 		 * we want to serialize on.
834 		 */
835 		mcip = mac_vnic_lower(mip);
836 		mip = mcip->mci_mip;
837 	}
838 
839 	mutex_enter(&mip->mi_perim_lock);
840 	if (mip->mi_perim_owner == curthread) {
841 		mip->mi_perim_ocnt++;
842 		mutex_exit(&mip->mi_perim_lock);
843 		return;
844 	}
845 
846 	while (mip->mi_perim_owner != NULL)
847 		cv_wait(&mip->mi_perim_cv, &mip->mi_perim_lock);
848 
849 	mip->mi_perim_owner = curthread;
850 	ASSERT(mip->mi_perim_ocnt == 0);
851 	mip->mi_perim_ocnt++;
852 #ifdef DEBUG
853 	mip->mi_perim_stack_depth = getpcstack(mip->mi_perim_stack,
854 	    MAC_PERIM_STACK_DEPTH);
855 #endif
856 	mutex_exit(&mip->mi_perim_lock);
857 }
858 
859 int
860 i_mac_perim_enter_nowait(mac_impl_t *mip)
861 {
862 	/*
863 	 * The vnic is a special case, since the serialization is done based
864 	 * on the lower mac. If the lower mac is busy, it does not imply the
865 	 * vnic can't be unregistered. But in the case of other drivers,
866 	 * a busy perimeter or open mac handles implies that the mac is busy
867 	 * and can't be unregistered.
868 	 */
869 	if (mip->mi_state_flags & MIS_IS_VNIC) {
870 		i_mac_perim_enter(mip);
871 		return (0);
872 	}
873 
874 	mutex_enter(&mip->mi_perim_lock);
875 	if (mip->mi_perim_owner != NULL) {
876 		mutex_exit(&mip->mi_perim_lock);
877 		return (EBUSY);
878 	}
879 	ASSERT(mip->mi_perim_ocnt == 0);
880 	mip->mi_perim_owner = curthread;
881 	mip->mi_perim_ocnt++;
882 	mutex_exit(&mip->mi_perim_lock);
883 
884 	return (0);
885 }
886 
887 void
888 i_mac_perim_exit(mac_impl_t *mip)
889 {
890 	mac_client_impl_t *mcip;
891 
892 	if (mip->mi_state_flags & MIS_IS_VNIC) {
893 		/*
894 		 * This is a VNIC. Return the lower mac since that is what
895 		 * we want to serialize on.
896 		 */
897 		mcip = mac_vnic_lower(mip);
898 		mip = mcip->mci_mip;
899 	}
900 
901 	ASSERT(mip->mi_perim_owner == curthread && mip->mi_perim_ocnt != 0);
902 
903 	mutex_enter(&mip->mi_perim_lock);
904 	if (--mip->mi_perim_ocnt == 0) {
905 		mip->mi_perim_owner = NULL;
906 		cv_signal(&mip->mi_perim_cv);
907 	}
908 	mutex_exit(&mip->mi_perim_lock);
909 }
910 
911 /*
912  * Returns whether the current thread holds the mac perimeter. Used in making
913  * assertions.
914  */
915 boolean_t
916 mac_perim_held(mac_handle_t mh)
917 {
918 	mac_impl_t	*mip = (mac_impl_t *)mh;
919 	mac_client_impl_t *mcip;
920 
921 	if (mip->mi_state_flags & MIS_IS_VNIC) {
922 		/*
923 		 * This is a VNIC. Return the lower mac since that is what
924 		 * we want to serialize on.
925 		 */
926 		mcip = mac_vnic_lower(mip);
927 		mip = mcip->mci_mip;
928 	}
929 	return (mip->mi_perim_owner == curthread);
930 }
931 
932 /*
933  * mac client interfaces to enter the mac perimeter of a mac end point, given
934  * its mac handle, or macname or linkid.
935  */
936 void
937 mac_perim_enter_by_mh(mac_handle_t mh, mac_perim_handle_t *mphp)
938 {
939 	mac_impl_t	*mip = (mac_impl_t *)mh;
940 
941 	i_mac_perim_enter(mip);
942 	/*
943 	 * The mac_perim_handle_t returned encodes the 'mip' and whether a
944 	 * mac_open has been done internally while entering the perimeter.
945 	 * This information is used in mac_perim_exit
946 	 */
947 	MAC_ENCODE_MPH(*mphp, mip, 0);
948 }
949 
950 int
951 mac_perim_enter_by_macname(const char *name, mac_perim_handle_t *mphp)
952 {
953 	int	err;
954 	mac_handle_t	mh;
955 
956 	if ((err = mac_open(name, &mh)) != 0)
957 		return (err);
958 
959 	mac_perim_enter_by_mh(mh, mphp);
960 	MAC_ENCODE_MPH(*mphp, mh, 1);
961 	return (0);
962 }
963 
964 int
965 mac_perim_enter_by_linkid(datalink_id_t linkid, mac_perim_handle_t *mphp)
966 {
967 	int	err;
968 	mac_handle_t	mh;
969 
970 	if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
971 		return (err);
972 
973 	mac_perim_enter_by_mh(mh, mphp);
974 	MAC_ENCODE_MPH(*mphp, mh, 1);
975 	return (0);
976 }
977 
978 void
979 mac_perim_exit(mac_perim_handle_t mph)
980 {
981 	mac_impl_t	*mip;
982 	boolean_t	need_close;
983 
984 	MAC_DECODE_MPH(mph, mip, need_close);
985 	i_mac_perim_exit(mip);
986 	if (need_close)
987 		mac_close((mac_handle_t)mip);
988 }
989 
990 int
991 mac_hold(const char *macname, mac_impl_t **pmip)
992 {
993 	mac_impl_t	*mip;
994 	int		err;
995 
996 	/*
997 	 * Check the device name length to make sure it won't overflow our
998 	 * buffer.
999 	 */
1000 	if (strlen(macname) >= MAXNAMELEN)
1001 		return (EINVAL);
1002 
1003 	/*
1004 	 * Look up its entry in the global hash table.
1005 	 */
1006 	rw_enter(&i_mac_impl_lock, RW_WRITER);
1007 	err = mod_hash_find(i_mac_impl_hash, (mod_hash_key_t)macname,
1008 	    (mod_hash_val_t *)&mip);
1009 
1010 	if (err != 0) {
1011 		rw_exit(&i_mac_impl_lock);
1012 		return (ENOENT);
1013 	}
1014 
1015 	if (mip->mi_state_flags & MIS_DISABLED) {
1016 		rw_exit(&i_mac_impl_lock);
1017 		return (ENOENT);
1018 	}
1019 
1020 	if (mip->mi_state_flags & MIS_EXCLUSIVE_HELD) {
1021 		rw_exit(&i_mac_impl_lock);
1022 		return (EBUSY);
1023 	}
1024 
1025 	mip->mi_ref++;
1026 	rw_exit(&i_mac_impl_lock);
1027 
1028 	*pmip = mip;
1029 	return (0);
1030 }
1031 
1032 void
1033 mac_rele(mac_impl_t *mip)
1034 {
1035 	rw_enter(&i_mac_impl_lock, RW_WRITER);
1036 	ASSERT(mip->mi_ref != 0);
1037 	if (--mip->mi_ref == 0) {
1038 		ASSERT(mip->mi_nactiveclients == 0 &&
1039 		    !(mip->mi_state_flags & MIS_EXCLUSIVE));
1040 	}
1041 	rw_exit(&i_mac_impl_lock);
1042 }
1043 
1044 /*
1045  * Private GLDv3 function to start a MAC instance.
1046  */
1047 int
1048 mac_start(mac_handle_t mh)
1049 {
1050 	mac_impl_t	*mip = (mac_impl_t *)mh;
1051 	int		err = 0;
1052 
1053 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1054 	ASSERT(mip->mi_start != NULL);
1055 
1056 	/*
1057 	 * Check whether the device is already started.
1058 	 */
1059 	if (mip->mi_active++ == 0) {
1060 		mac_ring_t *ring = NULL;
1061 
1062 		/*
1063 		 * Start the device.
1064 		 */
1065 		err = mip->mi_start(mip->mi_driver);
1066 		if (err != 0) {
1067 			mip->mi_active--;
1068 			return (err);
1069 		}
1070 
1071 		/*
1072 		 * Start the default tx ring.
1073 		 */
1074 		if (mip->mi_default_tx_ring != NULL) {
1075 
1076 			ring = (mac_ring_t *)mip->mi_default_tx_ring;
1077 			err = mac_start_ring(ring);
1078 			if (err != 0) {
1079 				mip->mi_active--;
1080 				return (err);
1081 			}
1082 			ring->mr_state = MR_INUSE;
1083 		}
1084 
1085 		if (mip->mi_rx_groups != NULL) {
1086 			/*
1087 			 * Start the default ring, since it will be needed
1088 			 * to receive broadcast and multicast traffic for
1089 			 * both primary and non-primary MAC clients.
1090 			 */
1091 			mac_group_t *grp = &mip->mi_rx_groups[0];
1092 
1093 			ASSERT(grp->mrg_state == MAC_GROUP_STATE_REGISTERED);
1094 			err = mac_start_group_and_rings(grp);
1095 			if (err != 0) {
1096 				mip->mi_active--;
1097 				if (ring != NULL) {
1098 					mac_stop_ring(ring);
1099 					ring->mr_state = MR_FREE;
1100 				}
1101 				return (err);
1102 			}
1103 			mac_set_rx_group_state(grp, MAC_GROUP_STATE_SHARED);
1104 		}
1105 	}
1106 
1107 	return (err);
1108 }
1109 
1110 /*
1111  * Private GLDv3 function to stop a MAC instance.
1112  */
1113 void
1114 mac_stop(mac_handle_t mh)
1115 {
1116 	mac_impl_t	*mip = (mac_impl_t *)mh;
1117 
1118 	ASSERT(mip->mi_stop != NULL);
1119 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1120 
1121 	/*
1122 	 * Check whether the device is still needed.
1123 	 */
1124 	ASSERT(mip->mi_active != 0);
1125 	if (--mip->mi_active == 0) {
1126 		if (mip->mi_rx_groups != NULL) {
1127 			/*
1128 			 * There should be no more active clients since the
1129 			 * MAC is being stopped. Stop the default RX group
1130 			 * and transition it back to registered state.
1131 			 */
1132 			mac_group_t *grp = &mip->mi_rx_groups[0];
1133 
1134 			/*
1135 			 * When clients are torn down, the groups
1136 			 * are release via mac_release_rx_group which
1137 			 * knows the the default group is always in
1138 			 * started mode since broadcast uses it. So
1139 			 * we can assert that their are no clients
1140 			 * (since mac_bcast_add doesn't register itself
1141 			 * as a client) and group is in SHARED state.
1142 			 */
1143 			ASSERT(grp->mrg_state == MAC_GROUP_STATE_SHARED);
1144 			ASSERT(MAC_RX_GROUP_NO_CLIENT(grp) &&
1145 			    mip->mi_nactiveclients == 0);
1146 			mac_stop_group_and_rings(grp);
1147 			mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);
1148 		}
1149 
1150 		if (mip->mi_default_tx_ring != NULL) {
1151 			mac_ring_t *ring;
1152 
1153 			ring = (mac_ring_t *)mip->mi_default_tx_ring;
1154 			mac_stop_ring(ring);
1155 			ring->mr_state = MR_FREE;
1156 		}
1157 
1158 		/*
1159 		 * Stop the device.
1160 		 */
1161 		mip->mi_stop(mip->mi_driver);
1162 	}
1163 }
1164 
1165 int
1166 i_mac_promisc_set(mac_impl_t *mip, boolean_t on)
1167 {
1168 	int		err = 0;
1169 
1170 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1171 	ASSERT(mip->mi_setpromisc != NULL);
1172 
1173 	if (on) {
1174 		/*
1175 		 * Enable promiscuous mode on the device if not yet enabled.
1176 		 */
1177 		if (mip->mi_devpromisc++ == 0) {
1178 			err = mip->mi_setpromisc(mip->mi_driver, B_TRUE);
1179 			if (err != 0) {
1180 				mip->mi_devpromisc--;
1181 				return (err);
1182 			}
1183 			i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
1184 		}
1185 	} else {
1186 		if (mip->mi_devpromisc == 0)
1187 			return (EPROTO);
1188 
1189 		/*
1190 		 * Disable promiscuous mode on the device if this is the last
1191 		 * enabling.
1192 		 */
1193 		if (--mip->mi_devpromisc == 0) {
1194 			err = mip->mi_setpromisc(mip->mi_driver, B_FALSE);
1195 			if (err != 0) {
1196 				mip->mi_devpromisc++;
1197 				return (err);
1198 			}
1199 			i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
1200 		}
1201 	}
1202 
1203 	return (0);
1204 }
1205 
1206 /*
1207  * The promiscuity state can change any time. If the caller needs to take
1208  * actions that are atomic with the promiscuity state, then the caller needs
1209  * to bracket the entire sequence with mac_perim_enter/exit
1210  */
1211 boolean_t
1212 mac_promisc_get(mac_handle_t mh)
1213 {
1214 	mac_impl_t		*mip = (mac_impl_t *)mh;
1215 
1216 	/*
1217 	 * Return the current promiscuity.
1218 	 */
1219 	return (mip->mi_devpromisc != 0);
1220 }
1221 
1222 /*
1223  * Invoked at MAC instance attach time to initialize the list
1224  * of factory MAC addresses supported by a MAC instance. This function
1225  * builds a local cache in the mac_impl_t for the MAC addresses
1226  * supported by the underlying hardware. The MAC clients themselves
1227  * use the mac_addr_factory*() functions to query and reserve
1228  * factory MAC addresses.
1229  */
1230 void
1231 mac_addr_factory_init(mac_impl_t *mip)
1232 {
1233 	mac_capab_multifactaddr_t capab;
1234 	uint8_t *addr;
1235 	int i;
1236 
1237 	/*
1238 	 * First round to see how many factory MAC addresses are available.
1239 	 */
1240 	bzero(&capab, sizeof (capab));
1241 	if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_MULTIFACTADDR,
1242 	    &capab) || (capab.mcm_naddr == 0)) {
1243 		/*
1244 		 * The MAC instance doesn't support multiple factory
1245 		 * MAC addresses, we're done here.
1246 		 */
1247 		return;
1248 	}
1249 
1250 	/*
1251 	 * Allocate the space and get all the factory addresses.
1252 	 */
1253 	addr = kmem_alloc(capab.mcm_naddr * MAXMACADDRLEN, KM_SLEEP);
1254 	capab.mcm_getaddr(mip->mi_driver, capab.mcm_naddr, addr);
1255 
1256 	mip->mi_factory_addr_num = capab.mcm_naddr;
1257 	mip->mi_factory_addr = kmem_zalloc(mip->mi_factory_addr_num *
1258 	    sizeof (mac_factory_addr_t), KM_SLEEP);
1259 
1260 	for (i = 0; i < capab.mcm_naddr; i++) {
1261 		bcopy(addr + i * MAXMACADDRLEN,
1262 		    mip->mi_factory_addr[i].mfa_addr,
1263 		    mip->mi_type->mt_addr_length);
1264 		mip->mi_factory_addr[i].mfa_in_use = B_FALSE;
1265 	}
1266 
1267 	kmem_free(addr, capab.mcm_naddr * MAXMACADDRLEN);
1268 }
1269 
1270 void
1271 mac_addr_factory_fini(mac_impl_t *mip)
1272 {
1273 	if (mip->mi_factory_addr == NULL) {
1274 		ASSERT(mip->mi_factory_addr_num == 0);
1275 		return;
1276 	}
1277 
1278 	kmem_free(mip->mi_factory_addr, mip->mi_factory_addr_num *
1279 	    sizeof (mac_factory_addr_t));
1280 
1281 	mip->mi_factory_addr = NULL;
1282 	mip->mi_factory_addr_num = 0;
1283 }
1284 
1285 /*
1286  * Reserve a factory MAC address. If *slot is set to -1, the function
1287  * attempts to reserve any of the available factory MAC addresses and
1288  * returns the reserved slot id. If no slots are available, the function
1289  * returns ENOSPC. If *slot is not set to -1, the function reserves
1290  * the specified slot if it is available, or returns EBUSY is the slot
1291  * is already used. Returns ENOTSUP if the underlying MAC does not
1292  * support multiple factory addresses. If the slot number is not -1 but
1293  * is invalid, returns EINVAL.
1294  */
1295 int
1296 mac_addr_factory_reserve(mac_client_handle_t mch, int *slot)
1297 {
1298 	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1299 	mac_impl_t *mip = mcip->mci_mip;
1300 	int i, ret = 0;
1301 
1302 	i_mac_perim_enter(mip);
1303 	/*
1304 	 * Protect against concurrent readers that may need a self-consistent
1305 	 * view of the factory addresses
1306 	 */
1307 	rw_enter(&mip->mi_rw_lock, RW_WRITER);
1308 
1309 	if (mip->mi_factory_addr_num == 0) {
1310 		ret = ENOTSUP;
1311 		goto bail;
1312 	}
1313 
1314 	if (*slot != -1) {
1315 		/* check the specified slot */
1316 		if (*slot < 1 || *slot > mip->mi_factory_addr_num) {
1317 			ret = EINVAL;
1318 			goto bail;
1319 		}
1320 		if (mip->mi_factory_addr[*slot-1].mfa_in_use) {
1321 			ret = EBUSY;
1322 			goto bail;
1323 		}
1324 	} else {
1325 		/* pick the next available slot */
1326 		for (i = 0; i < mip->mi_factory_addr_num; i++) {
1327 			if (!mip->mi_factory_addr[i].mfa_in_use)
1328 				break;
1329 		}
1330 
1331 		if (i == mip->mi_factory_addr_num) {
1332 			ret = ENOSPC;
1333 			goto bail;
1334 		}
1335 		*slot = i+1;
1336 	}
1337 
1338 	mip->mi_factory_addr[*slot-1].mfa_in_use = B_TRUE;
1339 	mip->mi_factory_addr[*slot-1].mfa_client = mcip;
1340 
1341 bail:
1342 	rw_exit(&mip->mi_rw_lock);
1343 	i_mac_perim_exit(mip);
1344 	return (ret);
1345 }
1346 
1347 /*
1348  * Release the specified factory MAC address slot.
1349  */
1350 void
1351 mac_addr_factory_release(mac_client_handle_t mch, uint_t slot)
1352 {
1353 	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1354 	mac_impl_t *mip = mcip->mci_mip;
1355 
1356 	i_mac_perim_enter(mip);
1357 	/*
1358 	 * Protect against concurrent readers that may need a self-consistent
1359 	 * view of the factory addresses
1360 	 */
1361 	rw_enter(&mip->mi_rw_lock, RW_WRITER);
1362 
1363 	ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
1364 	ASSERT(mip->mi_factory_addr[slot-1].mfa_in_use);
1365 
1366 	mip->mi_factory_addr[slot-1].mfa_in_use = B_FALSE;
1367 
1368 	rw_exit(&mip->mi_rw_lock);
1369 	i_mac_perim_exit(mip);
1370 }
1371 
1372 /*
1373  * Stores in mac_addr the value of the specified MAC address. Returns
1374  * 0 on success, or EINVAL if the slot number is not valid for the MAC.
1375  * The caller must provide a string of at least MAXNAMELEN bytes.
1376  */
1377 void
1378 mac_addr_factory_value(mac_handle_t mh, int slot, uchar_t *mac_addr,
1379     uint_t *addr_len, char *client_name, boolean_t *in_use_arg)
1380 {
1381 	mac_impl_t *mip = (mac_impl_t *)mh;
1382 	boolean_t in_use;
1383 
1384 	ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
1385 
1386 	/*
1387 	 * Readers need to hold mi_rw_lock. Writers need to hold mac perimeter
1388 	 * and mi_rw_lock
1389 	 */
1390 	rw_enter(&mip->mi_rw_lock, RW_READER);
1391 	bcopy(mip->mi_factory_addr[slot-1].mfa_addr, mac_addr, MAXMACADDRLEN);
1392 	*addr_len = mip->mi_type->mt_addr_length;
1393 	in_use = mip->mi_factory_addr[slot-1].mfa_in_use;
1394 	if (in_use && client_name != NULL) {
1395 		bcopy(mip->mi_factory_addr[slot-1].mfa_client->mci_name,
1396 		    client_name, MAXNAMELEN);
1397 	}
1398 	if (in_use_arg != NULL)
1399 		*in_use_arg = in_use;
1400 	rw_exit(&mip->mi_rw_lock);
1401 }
1402 
1403 /*
1404  * Returns the number of factory MAC addresses (in addition to the
1405  * primary MAC address), 0 if the underlying MAC doesn't support
1406  * that feature.
1407  */
1408 uint_t
1409 mac_addr_factory_num(mac_handle_t mh)
1410 {
1411 	mac_impl_t *mip = (mac_impl_t *)mh;
1412 
1413 	return (mip->mi_factory_addr_num);
1414 }
1415 
1416 
1417 void
1418 mac_rx_group_unmark(mac_group_t *grp, uint_t flag)
1419 {
1420 	mac_ring_t	*ring;
1421 
1422 	for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next)
1423 		ring->mr_flag &= ~flag;
1424 }
1425 
1426 /*
1427  * The following mac_hwrings_xxx() functions are private mac client functions
1428  * used by the aggr driver to access and control the underlying HW Rx group
1429  * and rings. In this case, the aggr driver has exclusive control of the
1430  * underlying HW Rx group/rings, it calls the following functions to
1431  * start/stop the HW Rx rings, disable/enable polling, add/remove mac'
1432  * addresses, or set up the Rx callback.
1433  */
1434 /* ARGSUSED */
1435 static void
1436 mac_hwrings_rx_process(void *arg, mac_resource_handle_t srs,
1437     mblk_t *mp_chain, boolean_t loopback)
1438 {
1439 	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
1440 	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
1441 	mac_direct_rx_t		proc;
1442 	void			*arg1;
1443 	mac_resource_handle_t	arg2;
1444 
1445 	proc = srs_rx->sr_func;
1446 	arg1 = srs_rx->sr_arg1;
1447 	arg2 = mac_srs->srs_mrh;
1448 
1449 	proc(arg1, arg2, mp_chain, NULL);
1450 }
1451 
1452 /*
1453  * This function is called to get the list of HW rings that are reserved by
1454  * an exclusive mac client.
1455  *
1456  * Return value: the number of HW rings.
1457  */
1458 int
1459 mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh,
1460     mac_ring_handle_t *hwrh, mac_ring_type_t rtype)
1461 {
1462 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1463 	int			cnt = 0;
1464 
1465 	switch (rtype) {
1466 	case MAC_RING_TYPE_RX: {
1467 		flow_entry_t	*flent = mcip->mci_flent;
1468 		mac_group_t	*grp;
1469 		mac_ring_t	*ring;
1470 
1471 		grp = flent->fe_rx_ring_group;
1472 		/*
1473 		 * The mac client did not reserve any RX group, return directly.
1474 		 * This is probably because the underlying MAC does not support
1475 		 * any groups.
1476 		 */
1477 		*hwgh = NULL;
1478 		if (grp == NULL)
1479 			return (0);
1480 		/*
1481 		 * This group must be reserved by this mac client.
1482 		 */
1483 		ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) &&
1484 		    (mch == (mac_client_handle_t)
1485 		    (MAC_RX_GROUP_ONLY_CLIENT(grp))));
1486 		for (ring = grp->mrg_rings;
1487 		    ring != NULL; ring = ring->mr_next, cnt++) {
1488 			ASSERT(cnt < MAX_RINGS_PER_GROUP);
1489 			hwrh[cnt] = (mac_ring_handle_t)ring;
1490 		}
1491 		*hwgh = (mac_group_handle_t)grp;
1492 		return (cnt);
1493 	}
1494 	case MAC_RING_TYPE_TX: {
1495 		mac_soft_ring_set_t	*tx_srs;
1496 		mac_srs_tx_t		*tx;
1497 
1498 		tx_srs = MCIP_TX_SRS(mcip);
1499 		tx = &tx_srs->srs_tx;
1500 		for (; cnt < tx->st_ring_count; cnt++)
1501 			hwrh[cnt] = tx->st_rings[cnt];
1502 		return (cnt);
1503 	}
1504 	default:
1505 		ASSERT(B_FALSE);
1506 		return (-1);
1507 	}
1508 }
1509 
1510 /*
1511  * Setup the RX callback of the mac client which exclusively controls HW ring.
1512  */
1513 void
1514 mac_hwring_setup(mac_ring_handle_t hwrh, mac_resource_handle_t prh)
1515 {
1516 	mac_ring_t		*hw_ring = (mac_ring_t *)hwrh;
1517 	mac_soft_ring_set_t	*mac_srs = hw_ring->mr_srs;
1518 
1519 	mac_srs->srs_mrh = prh;
1520 	mac_srs->srs_rx.sr_lower_proc = mac_hwrings_rx_process;
1521 }
1522 
1523 void
1524 mac_hwring_teardown(mac_ring_handle_t hwrh)
1525 {
1526 	mac_ring_t		*hw_ring = (mac_ring_t *)hwrh;
1527 	mac_soft_ring_set_t	*mac_srs = hw_ring->mr_srs;
1528 
1529 	mac_srs->srs_rx.sr_lower_proc = mac_rx_srs_process;
1530 	mac_srs->srs_mrh = NULL;
1531 }
1532 
1533 int
1534 mac_hwring_disable_intr(mac_ring_handle_t rh)
1535 {
1536 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1537 	mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
1538 
1539 	return (intr->mi_disable(intr->mi_handle));
1540 }
1541 
1542 int
1543 mac_hwring_enable_intr(mac_ring_handle_t rh)
1544 {
1545 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1546 	mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
1547 
1548 	return (intr->mi_enable(intr->mi_handle));
1549 }
1550 
1551 int
1552 mac_hwring_start(mac_ring_handle_t rh)
1553 {
1554 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1555 
1556 	MAC_RING_UNMARK(rr_ring, MR_QUIESCE);
1557 	return (0);
1558 }
1559 
1560 void
1561 mac_hwring_stop(mac_ring_handle_t rh)
1562 {
1563 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1564 
1565 	mac_rx_ring_quiesce(rr_ring, MR_QUIESCE);
1566 }
1567 
1568 mblk_t *
1569 mac_hwring_poll(mac_ring_handle_t rh, int bytes_to_pickup)
1570 {
1571 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1572 	mac_ring_info_t *info = &rr_ring->mr_info;
1573 
1574 	return (info->mri_poll(info->mri_driver, bytes_to_pickup));
1575 }
1576 
1577 /*
1578  * Send packets through the selected tx ring.
1579  */
1580 mblk_t *
1581 mac_hwring_tx(mac_ring_handle_t rh, mblk_t *mp)
1582 {
1583 	mac_ring_t *ring = (mac_ring_t *)rh;
1584 	mac_ring_info_t *info = &ring->mr_info;
1585 
1586 	ASSERT(ring->mr_type == MAC_RING_TYPE_TX &&
1587 	    ring->mr_state >= MR_INUSE);
1588 	return (info->mri_tx(info->mri_driver, mp));
1589 }
1590 
1591 int
1592 mac_hwgroup_addmac(mac_group_handle_t gh, const uint8_t *addr)
1593 {
1594 	mac_group_t *group = (mac_group_t *)gh;
1595 
1596 	return (mac_group_addmac(group, addr));
1597 }
1598 
1599 int
1600 mac_hwgroup_remmac(mac_group_handle_t gh, const uint8_t *addr)
1601 {
1602 	mac_group_t *group = (mac_group_t *)gh;
1603 
1604 	return (mac_group_remmac(group, addr));
1605 }
1606 
1607 /*
1608  * Set the RX group to be shared/reserved. Note that the group must be
1609  * started/stopped outside of this function.
1610  */
1611 void
1612 mac_set_rx_group_state(mac_group_t *grp, mac_group_state_t state)
1613 {
1614 	/*
1615 	 * If there is no change in the group state, just return.
1616 	 */
1617 	if (grp->mrg_state == state)
1618 		return;
1619 
1620 	switch (state) {
1621 	case MAC_GROUP_STATE_RESERVED:
1622 		/*
1623 		 * Successfully reserved the group.
1624 		 *
1625 		 * Given that there is an exclusive client controlling this
1626 		 * group, we enable the group level polling when available,
1627 		 * so that SRSs get to turn on/off individual rings they's
1628 		 * assigned to.
1629 		 */
1630 		ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
1631 
1632 		if (GROUP_INTR_DISABLE_FUNC(grp) != NULL)
1633 			GROUP_INTR_DISABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
1634 
1635 		break;
1636 
1637 	case MAC_GROUP_STATE_SHARED:
1638 		/*
1639 		 * Set all rings of this group to software classified.
1640 		 * If the group has an overriding interrupt, then re-enable it.
1641 		 */
1642 		ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
1643 
1644 		if (GROUP_INTR_ENABLE_FUNC(grp) != NULL)
1645 			GROUP_INTR_ENABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
1646 
1647 		/* The ring is not available for reservations any more */
1648 		break;
1649 
1650 	case MAC_GROUP_STATE_REGISTERED:
1651 		/* Also callable from mac_register, perim is not held */
1652 		break;
1653 
1654 	default:
1655 		ASSERT(B_FALSE);
1656 		break;
1657 	}
1658 
1659 	grp->mrg_state = state;
1660 }
1661 
1662 /*
1663  * Quiesce future hardware classified packets for the specified Rx ring
1664  */
1665 static void
1666 mac_rx_ring_quiesce(mac_ring_t *rx_ring, uint_t ring_flag)
1667 {
1668 	ASSERT(rx_ring->mr_classify_type == MAC_HW_CLASSIFIER);
1669 	ASSERT(ring_flag == MR_CONDEMNED || ring_flag  == MR_QUIESCE);
1670 
1671 	mutex_enter(&rx_ring->mr_lock);
1672 	rx_ring->mr_flag |= ring_flag;
1673 	while (rx_ring->mr_refcnt != 0)
1674 		cv_wait(&rx_ring->mr_cv, &rx_ring->mr_lock);
1675 	mutex_exit(&rx_ring->mr_lock);
1676 }
1677 
1678 /*
1679  * Please see mac_tx for details about the per cpu locking scheme
1680  */
1681 static void
1682 mac_tx_lock_all(mac_client_impl_t *mcip)
1683 {
1684 	int	i;
1685 
1686 	for (i = 0; i <= mac_tx_percpu_cnt; i++)
1687 		mutex_enter(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1688 }
1689 
1690 static void
1691 mac_tx_unlock_all(mac_client_impl_t *mcip)
1692 {
1693 	int	i;
1694 
1695 	for (i = mac_tx_percpu_cnt; i >= 0; i--)
1696 		mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1697 }
1698 
1699 static void
1700 mac_tx_unlock_allbutzero(mac_client_impl_t *mcip)
1701 {
1702 	int	i;
1703 
1704 	for (i = mac_tx_percpu_cnt; i > 0; i--)
1705 		mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1706 }
1707 
1708 static int
1709 mac_tx_sum_refcnt(mac_client_impl_t *mcip)
1710 {
1711 	int	i;
1712 	int	refcnt = 0;
1713 
1714 	for (i = 0; i <= mac_tx_percpu_cnt; i++)
1715 		refcnt += mcip->mci_tx_pcpu[i].pcpu_tx_refcnt;
1716 
1717 	return (refcnt);
1718 }
1719 
1720 /*
1721  * Stop future Tx packets coming down from the client in preparation for
1722  * quiescing the Tx side. This is needed for dynamic reclaim and reassignment
1723  * of rings between clients
1724  */
1725 void
1726 mac_tx_client_block(mac_client_impl_t *mcip)
1727 {
1728 	mac_tx_lock_all(mcip);
1729 	mcip->mci_tx_flag |= MCI_TX_QUIESCE;
1730 	while (mac_tx_sum_refcnt(mcip) != 0) {
1731 		mac_tx_unlock_allbutzero(mcip);
1732 		cv_wait(&mcip->mci_tx_cv, &mcip->mci_tx_pcpu[0].pcpu_tx_lock);
1733 		mutex_exit(&mcip->mci_tx_pcpu[0].pcpu_tx_lock);
1734 		mac_tx_lock_all(mcip);
1735 	}
1736 	mac_tx_unlock_all(mcip);
1737 }
1738 
1739 void
1740 mac_tx_client_unblock(mac_client_impl_t *mcip)
1741 {
1742 	mac_tx_lock_all(mcip);
1743 	mcip->mci_tx_flag &= ~MCI_TX_QUIESCE;
1744 	mac_tx_unlock_all(mcip);
1745 	/*
1746 	 * We may fail to disable flow control for the last MAC_NOTE_TX
1747 	 * notification because the MAC client is quiesced. Send the
1748 	 * notification again.
1749 	 */
1750 	i_mac_notify(mcip->mci_mip, MAC_NOTE_TX);
1751 }
1752 
1753 /*
1754  * Wait for an SRS to quiesce. The SRS worker will signal us when the
1755  * quiesce is done.
1756  */
1757 static void
1758 mac_srs_quiesce_wait(mac_soft_ring_set_t *srs, uint_t srs_flag)
1759 {
1760 	mutex_enter(&srs->srs_lock);
1761 	while (!(srs->srs_state & srs_flag))
1762 		cv_wait(&srs->srs_quiesce_done_cv, &srs->srs_lock);
1763 	mutex_exit(&srs->srs_lock);
1764 }
1765 
1766 /*
1767  * Quiescing an Rx SRS is achieved by the following sequence. The protocol
1768  * works bottom up by cutting off packet flow from the bottommost point in the
1769  * mac, then the SRS, and then the soft rings. There are 2 use cases of this
1770  * mechanism. One is a temporary quiesce of the SRS, such as say while changing
1771  * the Rx callbacks. Another use case is Rx SRS teardown. In the former case
1772  * the QUIESCE prefix/suffix is used and in the latter the CONDEMNED is used
1773  * for the SRS and MR flags. In the former case the threads pause waiting for
1774  * a restart, while in the latter case the threads exit. The Tx SRS teardown
1775  * is also mostly similar to the above.
1776  *
1777  * 1. Stop future hardware classified packets at the lowest level in the mac.
1778  *    Remove any hardware classification rule (CONDEMNED case) and mark the
1779  *    rings as CONDEMNED or QUIESCE as appropriate. This prevents the mr_refcnt
1780  *    from increasing. Upcalls from the driver that come through hardware
1781  *    classification will be dropped in mac_rx from now on. Then we wait for
1782  *    the mr_refcnt to drop to zero. When the mr_refcnt reaches zero we are
1783  *    sure there aren't any upcall threads from the driver through hardware
1784  *    classification. In the case of SRS teardown we also remove the
1785  *    classification rule in the driver.
1786  *
1787  * 2. Stop future software classified packets by marking the flow entry with
1788  *    FE_QUIESCE or FE_CONDEMNED as appropriate which prevents the refcnt from
1789  *    increasing. We also remove the flow entry from the table in the latter
1790  *    case. Then wait for the fe_refcnt to reach an appropriate quiescent value
1791  *    that indicates there aren't any active threads using that flow entry.
1792  *
1793  * 3. Quiesce the SRS and softrings by signaling the SRS. The SRS poll thread,
1794  *    SRS worker thread, and the soft ring threads are quiesced in sequence
1795  *    with the SRS worker thread serving as a master controller. This
1796  *    mechansim is explained in mac_srs_worker_quiesce().
1797  *
1798  * The restart mechanism to reactivate the SRS and softrings is explained
1799  * in mac_srs_worker_restart(). Here we just signal the SRS worker to start the
1800  * restart sequence.
1801  */
1802 void
1803 mac_rx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
1804 {
1805 	flow_entry_t	*flent = srs->srs_flent;
1806 	uint_t	mr_flag, srs_done_flag;
1807 
1808 	ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
1809 	ASSERT(!(srs->srs_type & SRST_TX));
1810 
1811 	if (srs_quiesce_flag == SRS_CONDEMNED) {
1812 		mr_flag = MR_CONDEMNED;
1813 		srs_done_flag = SRS_CONDEMNED_DONE;
1814 		if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
1815 			mac_srs_client_poll_disable(srs->srs_mcip, srs);
1816 	} else {
1817 		ASSERT(srs_quiesce_flag == SRS_QUIESCE);
1818 		mr_flag = MR_QUIESCE;
1819 		srs_done_flag = SRS_QUIESCE_DONE;
1820 		if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
1821 			mac_srs_client_poll_quiesce(srs->srs_mcip, srs);
1822 	}
1823 
1824 	if (srs->srs_ring != NULL) {
1825 		mac_rx_ring_quiesce(srs->srs_ring, mr_flag);
1826 	} else {
1827 		/*
1828 		 * SRS is driven by software classification. In case
1829 		 * of CONDEMNED, the top level teardown functions will
1830 		 * deal with flow removal.
1831 		 */
1832 		if (srs_quiesce_flag != SRS_CONDEMNED) {
1833 			FLOW_MARK(flent, FE_QUIESCE);
1834 			mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
1835 		}
1836 	}
1837 
1838 	/*
1839 	 * Signal the SRS to quiesce itself, and then cv_wait for the
1840 	 * SRS quiesce to complete. The SRS worker thread will wake us
1841 	 * up when the quiesce is complete
1842 	 */
1843 	mac_srs_signal(srs, srs_quiesce_flag);
1844 	mac_srs_quiesce_wait(srs, srs_done_flag);
1845 }
1846 
1847 /*
1848  * Remove an SRS.
1849  */
1850 void
1851 mac_rx_srs_remove(mac_soft_ring_set_t *srs)
1852 {
1853 	flow_entry_t *flent = srs->srs_flent;
1854 	int i;
1855 
1856 	mac_rx_srs_quiesce(srs, SRS_CONDEMNED);
1857 	/*
1858 	 * Locate and remove our entry in the fe_rx_srs[] array, and
1859 	 * adjust the fe_rx_srs array entries and array count by
1860 	 * moving the last entry into the vacated spot.
1861 	 */
1862 	mutex_enter(&flent->fe_lock);
1863 	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1864 		if (flent->fe_rx_srs[i] == srs)
1865 			break;
1866 	}
1867 
1868 	ASSERT(i != 0 && i < flent->fe_rx_srs_cnt);
1869 	if (i != flent->fe_rx_srs_cnt - 1) {
1870 		flent->fe_rx_srs[i] =
1871 		    flent->fe_rx_srs[flent->fe_rx_srs_cnt - 1];
1872 		i = flent->fe_rx_srs_cnt - 1;
1873 	}
1874 
1875 	flent->fe_rx_srs[i] = NULL;
1876 	flent->fe_rx_srs_cnt--;
1877 	mutex_exit(&flent->fe_lock);
1878 
1879 	mac_srs_free(srs);
1880 }
1881 
1882 static void
1883 mac_srs_clear_flag(mac_soft_ring_set_t *srs, uint_t flag)
1884 {
1885 	mutex_enter(&srs->srs_lock);
1886 	srs->srs_state &= ~flag;
1887 	mutex_exit(&srs->srs_lock);
1888 }
1889 
1890 void
1891 mac_rx_srs_restart(mac_soft_ring_set_t *srs)
1892 {
1893 	flow_entry_t	*flent = srs->srs_flent;
1894 	mac_ring_t	*mr;
1895 
1896 	ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
1897 	ASSERT((srs->srs_type & SRST_TX) == 0);
1898 
1899 	/*
1900 	 * This handles a change in the number of SRSs between the quiesce and
1901 	 * and restart operation of a flow.
1902 	 */
1903 	if (!SRS_QUIESCED(srs))
1904 		return;
1905 
1906 	/*
1907 	 * Signal the SRS to restart itself. Wait for the restart to complete
1908 	 * Note that we only restart the SRS if it is not marked as
1909 	 * permanently quiesced.
1910 	 */
1911 	if (!SRS_QUIESCED_PERMANENT(srs)) {
1912 		mac_srs_signal(srs, SRS_RESTART);
1913 		mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
1914 		mac_srs_clear_flag(srs, SRS_RESTART_DONE);
1915 
1916 		mac_srs_client_poll_restart(srs->srs_mcip, srs);
1917 	}
1918 
1919 	/* Finally clear the flags to let the packets in */
1920 	mr = srs->srs_ring;
1921 	if (mr != NULL) {
1922 		MAC_RING_UNMARK(mr, MR_QUIESCE);
1923 		/* In case the ring was stopped, safely restart it */
1924 		(void) mac_start_ring(mr);
1925 	} else {
1926 		FLOW_UNMARK(flent, FE_QUIESCE);
1927 	}
1928 }
1929 
1930 /*
1931  * Temporary quiesce of a flow and associated Rx SRS.
1932  * Please see block comment above mac_rx_classify_flow_rem.
1933  */
1934 /* ARGSUSED */
1935 int
1936 mac_rx_classify_flow_quiesce(flow_entry_t *flent, void *arg)
1937 {
1938 	int		i;
1939 
1940 	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1941 		mac_rx_srs_quiesce((mac_soft_ring_set_t *)flent->fe_rx_srs[i],
1942 		    SRS_QUIESCE);
1943 	}
1944 	return (0);
1945 }
1946 
1947 /*
1948  * Restart a flow and associated Rx SRS that has been quiesced temporarily
1949  * Please see block comment above mac_rx_classify_flow_rem
1950  */
1951 /* ARGSUSED */
1952 int
1953 mac_rx_classify_flow_restart(flow_entry_t *flent, void *arg)
1954 {
1955 	int		i;
1956 
1957 	for (i = 0; i < flent->fe_rx_srs_cnt; i++)
1958 		mac_rx_srs_restart((mac_soft_ring_set_t *)flent->fe_rx_srs[i]);
1959 
1960 	return (0);
1961 }
1962 
1963 void
1964 mac_srs_perm_quiesce(mac_client_handle_t mch, boolean_t on)
1965 {
1966 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1967 	flow_entry_t		*flent = mcip->mci_flent;
1968 	mac_impl_t		*mip = mcip->mci_mip;
1969 	mac_soft_ring_set_t	*mac_srs;
1970 	int			i;
1971 
1972 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1973 
1974 	if (flent == NULL)
1975 		return;
1976 
1977 	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1978 		mac_srs = flent->fe_rx_srs[i];
1979 		mutex_enter(&mac_srs->srs_lock);
1980 		if (on)
1981 			mac_srs->srs_state |= SRS_QUIESCE_PERM;
1982 		else
1983 			mac_srs->srs_state &= ~SRS_QUIESCE_PERM;
1984 		mutex_exit(&mac_srs->srs_lock);
1985 	}
1986 }
1987 
1988 void
1989 mac_rx_client_quiesce(mac_client_handle_t mch)
1990 {
1991 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1992 	mac_impl_t		*mip = mcip->mci_mip;
1993 
1994 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1995 
1996 	if (MCIP_DATAPATH_SETUP(mcip)) {
1997 		(void) mac_rx_classify_flow_quiesce(mcip->mci_flent,
1998 		    NULL);
1999 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2000 		    mac_rx_classify_flow_quiesce, NULL);
2001 	}
2002 }
2003 
2004 void
2005 mac_rx_client_restart(mac_client_handle_t mch)
2006 {
2007 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
2008 	mac_impl_t		*mip = mcip->mci_mip;
2009 
2010 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2011 
2012 	if (MCIP_DATAPATH_SETUP(mcip)) {
2013 		(void) mac_rx_classify_flow_restart(mcip->mci_flent, NULL);
2014 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2015 		    mac_rx_classify_flow_restart, NULL);
2016 	}
2017 }
2018 
2019 /*
2020  * This function only quiesces the Tx SRS and softring worker threads. Callers
2021  * need to make sure that there aren't any mac client threads doing current or
2022  * future transmits in the mac before calling this function.
2023  */
2024 void
2025 mac_tx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
2026 {
2027 	mac_client_impl_t	*mcip = srs->srs_mcip;
2028 
2029 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2030 
2031 	ASSERT(srs->srs_type & SRST_TX);
2032 	ASSERT(srs_quiesce_flag == SRS_CONDEMNED ||
2033 	    srs_quiesce_flag == SRS_QUIESCE);
2034 
2035 	/*
2036 	 * Signal the SRS to quiesce itself, and then cv_wait for the
2037 	 * SRS quiesce to complete. The SRS worker thread will wake us
2038 	 * up when the quiesce is complete
2039 	 */
2040 	mac_srs_signal(srs, srs_quiesce_flag);
2041 	mac_srs_quiesce_wait(srs, srs_quiesce_flag == SRS_QUIESCE ?
2042 	    SRS_QUIESCE_DONE : SRS_CONDEMNED_DONE);
2043 }
2044 
2045 void
2046 mac_tx_srs_restart(mac_soft_ring_set_t *srs)
2047 {
2048 	/*
2049 	 * Resizing the fanout could result in creation of new SRSs.
2050 	 * They may not necessarily be in the quiesced state in which
2051 	 * case it need be restarted
2052 	 */
2053 	if (!SRS_QUIESCED(srs))
2054 		return;
2055 
2056 	mac_srs_signal(srs, SRS_RESTART);
2057 	mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
2058 	mac_srs_clear_flag(srs, SRS_RESTART_DONE);
2059 }
2060 
2061 /*
2062  * Temporary quiesce of a flow and associated Rx SRS.
2063  * Please see block comment above mac_rx_srs_quiesce
2064  */
2065 /* ARGSUSED */
2066 int
2067 mac_tx_flow_quiesce(flow_entry_t *flent, void *arg)
2068 {
2069 	/*
2070 	 * The fe_tx_srs is null for a subflow on an interface that is
2071 	 * not plumbed
2072 	 */
2073 	if (flent->fe_tx_srs != NULL)
2074 		mac_tx_srs_quiesce(flent->fe_tx_srs, SRS_QUIESCE);
2075 	return (0);
2076 }
2077 
2078 /* ARGSUSED */
2079 int
2080 mac_tx_flow_restart(flow_entry_t *flent, void *arg)
2081 {
2082 	/*
2083 	 * The fe_tx_srs is null for a subflow on an interface that is
2084 	 * not plumbed
2085 	 */
2086 	if (flent->fe_tx_srs != NULL)
2087 		mac_tx_srs_restart(flent->fe_tx_srs);
2088 	return (0);
2089 }
2090 
2091 void
2092 mac_tx_client_quiesce(mac_client_impl_t *mcip, uint_t srs_quiesce_flag)
2093 {
2094 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2095 
2096 	mac_tx_client_block(mcip);
2097 	if (MCIP_TX_SRS(mcip) != NULL) {
2098 		mac_tx_srs_quiesce(MCIP_TX_SRS(mcip), srs_quiesce_flag);
2099 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2100 		    mac_tx_flow_quiesce, NULL);
2101 	}
2102 }
2103 
2104 void
2105 mac_tx_client_restart(mac_client_impl_t *mcip)
2106 {
2107 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2108 
2109 	mac_tx_client_unblock(mcip);
2110 	if (MCIP_TX_SRS(mcip) != NULL) {
2111 		mac_tx_srs_restart(MCIP_TX_SRS(mcip));
2112 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2113 		    mac_tx_flow_restart, NULL);
2114 	}
2115 }
2116 
2117 void
2118 mac_tx_client_flush(mac_client_impl_t *mcip)
2119 {
2120 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2121 
2122 	mac_tx_client_quiesce(mcip, SRS_QUIESCE);
2123 	mac_tx_client_restart(mcip);
2124 }
2125 
2126 void
2127 mac_client_quiesce(mac_client_impl_t *mcip)
2128 {
2129 	mac_rx_client_quiesce((mac_client_handle_t)mcip);
2130 	mac_tx_client_quiesce(mcip, SRS_QUIESCE);
2131 }
2132 
2133 void
2134 mac_client_restart(mac_client_impl_t *mcip)
2135 {
2136 	mac_rx_client_restart((mac_client_handle_t)mcip);
2137 	mac_tx_client_restart(mcip);
2138 }
2139 
2140 /*
2141  * Allocate a minor number.
2142  */
2143 minor_t
2144 mac_minor_hold(boolean_t sleep)
2145 {
2146 	minor_t	minor;
2147 
2148 	/*
2149 	 * Grab a value from the arena.
2150 	 */
2151 	atomic_add_32(&minor_count, 1);
2152 
2153 	if (sleep)
2154 		minor = (uint_t)id_alloc(minor_ids);
2155 	else
2156 		minor = (uint_t)id_alloc_nosleep(minor_ids);
2157 
2158 	if (minor == 0) {
2159 		atomic_add_32(&minor_count, -1);
2160 		return (0);
2161 	}
2162 
2163 	return (minor);
2164 }
2165 
2166 /*
2167  * Release a previously allocated minor number.
2168  */
2169 void
2170 mac_minor_rele(minor_t minor)
2171 {
2172 	/*
2173 	 * Return the value to the arena.
2174 	 */
2175 	id_free(minor_ids, minor);
2176 	atomic_add_32(&minor_count, -1);
2177 }
2178 
2179 uint32_t
2180 mac_no_notification(mac_handle_t mh)
2181 {
2182 	mac_impl_t *mip = (mac_impl_t *)mh;
2183 
2184 	return (((mip->mi_state_flags & MIS_LEGACY) != 0) ?
2185 	    mip->mi_capab_legacy.ml_unsup_note : 0);
2186 }
2187 
2188 /*
2189  * Prevent any new opens of this mac in preparation for unregister
2190  */
2191 int
2192 i_mac_disable(mac_impl_t *mip)
2193 {
2194 	mac_client_impl_t	*mcip;
2195 
2196 	rw_enter(&i_mac_impl_lock, RW_WRITER);
2197 	if (mip->mi_state_flags & MIS_DISABLED) {
2198 		/* Already disabled, return success */
2199 		rw_exit(&i_mac_impl_lock);
2200 		return (0);
2201 	}
2202 	/*
2203 	 * See if there are any other references to this mac_t (e.g., VLAN's).
2204 	 * If so return failure. If all the other checks below pass, then
2205 	 * set mi_disabled atomically under the i_mac_impl_lock to prevent
2206 	 * any new VLAN's from being created or new mac client opens of this
2207 	 * mac end point.
2208 	 */
2209 	if (mip->mi_ref > 0) {
2210 		rw_exit(&i_mac_impl_lock);
2211 		return (EBUSY);
2212 	}
2213 
2214 	/*
2215 	 * mac clients must delete all multicast groups they join before
2216 	 * closing. bcast groups are reference counted, the last client
2217 	 * to delete the group will wait till the group is physically
2218 	 * deleted. Since all clients have closed this mac end point
2219 	 * mi_bcast_ngrps must be zero at this point
2220 	 */
2221 	ASSERT(mip->mi_bcast_ngrps == 0);
2222 
2223 	/*
2224 	 * Don't let go of this if it has some flows.
2225 	 * All other code guarantees no flows are added to a disabled
2226 	 * mac, therefore it is sufficient to check for the flow table
2227 	 * only here.
2228 	 */
2229 	mcip = mac_primary_client_handle(mip);
2230 	if ((mcip != NULL) && mac_link_has_flows((mac_client_handle_t)mcip)) {
2231 		rw_exit(&i_mac_impl_lock);
2232 		return (ENOTEMPTY);
2233 	}
2234 
2235 	mip->mi_state_flags |= MIS_DISABLED;
2236 	rw_exit(&i_mac_impl_lock);
2237 	return (0);
2238 }
2239 
2240 int
2241 mac_disable_nowait(mac_handle_t mh)
2242 {
2243 	mac_impl_t	*mip = (mac_impl_t *)mh;
2244 	int err;
2245 
2246 	if ((err = i_mac_perim_enter_nowait(mip)) != 0)
2247 		return (err);
2248 	err = i_mac_disable(mip);
2249 	i_mac_perim_exit(mip);
2250 	return (err);
2251 }
2252 
2253 int
2254 mac_disable(mac_handle_t mh)
2255 {
2256 	mac_impl_t	*mip = (mac_impl_t *)mh;
2257 	int err;
2258 
2259 	i_mac_perim_enter(mip);
2260 	err = i_mac_disable(mip);
2261 	i_mac_perim_exit(mip);
2262 
2263 	/*
2264 	 * Clean up notification thread and wait for it to exit.
2265 	 */
2266 	if (err == 0)
2267 		i_mac_notify_exit(mip);
2268 
2269 	return (err);
2270 }
2271 
2272 /*
2273  * Called when the MAC instance has a non empty flow table, to de-multiplex
2274  * incoming packets to the right flow.
2275  * The MAC's rw lock is assumed held as a READER.
2276  */
2277 /* ARGSUSED */
2278 static mblk_t *
2279 mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp)
2280 {
2281 	flow_entry_t	*flent = NULL;
2282 	uint_t		flags = FLOW_INBOUND;
2283 	int		err;
2284 
2285 	/*
2286 	 * If the mac is a port of an aggregation, pass FLOW_IGNORE_VLAN
2287 	 * to mac_flow_lookup() so that the VLAN packets can be successfully
2288 	 * passed to the non-VLAN aggregation flows.
2289 	 *
2290 	 * Note that there is possibly a race between this and
2291 	 * mac_unicast_remove/add() and VLAN packets could be incorrectly
2292 	 * classified to non-VLAN flows of non-aggregation mac clients. These
2293 	 * VLAN packets will be then filtered out by the mac module.
2294 	 */
2295 	if ((mip->mi_state_flags & MIS_EXCLUSIVE) != 0)
2296 		flags |= FLOW_IGNORE_VLAN;
2297 
2298 	err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent);
2299 	if (err != 0) {
2300 		/* no registered receive function */
2301 		return (mp);
2302 	} else {
2303 		mac_client_impl_t	*mcip;
2304 
2305 		/*
2306 		 * This flent might just be an additional one on the MAC client,
2307 		 * i.e. for classification purposes (different fdesc), however
2308 		 * the resources, SRS et. al., are in the mci_flent, so if
2309 		 * this isn't the mci_flent, we need to get it.
2310 		 */
2311 		if ((mcip = flent->fe_mcip) != NULL &&
2312 		    mcip->mci_flent != flent) {
2313 			FLOW_REFRELE(flent);
2314 			flent = mcip->mci_flent;
2315 			FLOW_TRY_REFHOLD(flent, err);
2316 			if (err != 0)
2317 				return (mp);
2318 		}
2319 		(flent->fe_cb_fn)(flent->fe_cb_arg1, flent->fe_cb_arg2, mp,
2320 		    B_FALSE);
2321 		FLOW_REFRELE(flent);
2322 	}
2323 	return (NULL);
2324 }
2325 
2326 mblk_t *
2327 mac_rx_flow(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
2328 {
2329 	mac_impl_t	*mip = (mac_impl_t *)mh;
2330 	mblk_t		*bp, *bp1, **bpp, *list = NULL;
2331 
2332 	/*
2333 	 * We walk the chain and attempt to classify each packet.
2334 	 * The packets that couldn't be classified will be returned
2335 	 * back to the caller.
2336 	 */
2337 	bp = mp_chain;
2338 	bpp = &list;
2339 	while (bp != NULL) {
2340 		bp1 = bp;
2341 		bp = bp->b_next;
2342 		bp1->b_next = NULL;
2343 
2344 		if (mac_rx_classify(mip, mrh, bp1) != NULL) {
2345 			*bpp = bp1;
2346 			bpp = &bp1->b_next;
2347 		}
2348 	}
2349 	return (list);
2350 }
2351 
2352 static int
2353 mac_tx_flow_srs_wakeup(flow_entry_t *flent, void *arg)
2354 {
2355 	mac_ring_handle_t ring = arg;
2356 
2357 	if (flent->fe_tx_srs)
2358 		mac_tx_srs_wakeup(flent->fe_tx_srs, ring);
2359 	return (0);
2360 }
2361 
2362 void
2363 i_mac_tx_srs_notify(mac_impl_t *mip, mac_ring_handle_t ring)
2364 {
2365 	mac_client_impl_t	*cclient;
2366 	mac_soft_ring_set_t	*mac_srs;
2367 
2368 	/*
2369 	 * After grabbing the mi_rw_lock, the list of clients can't change.
2370 	 * If there are any clients mi_disabled must be B_FALSE and can't
2371 	 * get set since there are clients. If there aren't any clients we
2372 	 * don't do anything. In any case the mip has to be valid. The driver
2373 	 * must make sure that it goes single threaded (with respect to mac
2374 	 * calls) and wait for all pending mac calls to finish before calling
2375 	 * mac_unregister.
2376 	 */
2377 	rw_enter(&i_mac_impl_lock, RW_READER);
2378 	if (mip->mi_state_flags & MIS_DISABLED) {
2379 		rw_exit(&i_mac_impl_lock);
2380 		return;
2381 	}
2382 
2383 	/*
2384 	 * Get MAC tx srs from walking mac_client_handle list.
2385 	 */
2386 	rw_enter(&mip->mi_rw_lock, RW_READER);
2387 	for (cclient = mip->mi_clients_list; cclient != NULL;
2388 	    cclient = cclient->mci_client_next) {
2389 		if ((mac_srs = MCIP_TX_SRS(cclient)) != NULL)
2390 			mac_tx_srs_wakeup(mac_srs, ring);
2391 		(void) mac_flow_walk(cclient->mci_subflow_tab,
2392 		    mac_tx_flow_srs_wakeup, ring);
2393 	}
2394 	rw_exit(&mip->mi_rw_lock);
2395 	rw_exit(&i_mac_impl_lock);
2396 }
2397 
2398 /* ARGSUSED */
2399 void
2400 mac_multicast_refresh(mac_handle_t mh, mac_multicst_t refresh, void *arg,
2401     boolean_t add)
2402 {
2403 	mac_impl_t *mip = (mac_impl_t *)mh;
2404 
2405 	i_mac_perim_enter((mac_impl_t *)mh);
2406 	/*
2407 	 * If no specific refresh function was given then default to the
2408 	 * driver's m_multicst entry point.
2409 	 */
2410 	if (refresh == NULL) {
2411 		refresh = mip->mi_multicst;
2412 		arg = mip->mi_driver;
2413 	}
2414 
2415 	mac_bcast_refresh(mip, refresh, arg, add);
2416 	i_mac_perim_exit((mac_impl_t *)mh);
2417 }
2418 
2419 void
2420 mac_promisc_refresh(mac_handle_t mh, mac_setpromisc_t refresh, void *arg)
2421 {
2422 	mac_impl_t	*mip = (mac_impl_t *)mh;
2423 
2424 	/*
2425 	 * If no specific refresh function was given then default to the
2426 	 * driver's m_promisc entry point.
2427 	 */
2428 	if (refresh == NULL) {
2429 		refresh = mip->mi_setpromisc;
2430 		arg = mip->mi_driver;
2431 	}
2432 	ASSERT(refresh != NULL);
2433 
2434 	/*
2435 	 * Call the refresh function with the current promiscuity.
2436 	 */
2437 	refresh(arg, (mip->mi_devpromisc != 0));
2438 }
2439 
2440 /*
2441  * The mac client requests that the mac not to change its margin size to
2442  * be less than the specified value.  If "current" is B_TRUE, then the client
2443  * requests the mac not to change its margin size to be smaller than the
2444  * current size. Further, return the current margin size value in this case.
2445  *
2446  * We keep every requested size in an ordered list from largest to smallest.
2447  */
2448 int
2449 mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current)
2450 {
2451 	mac_impl_t		*mip = (mac_impl_t *)mh;
2452 	mac_margin_req_t	**pp, *p;
2453 	int			err = 0;
2454 
2455 	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2456 	if (current)
2457 		*marginp = mip->mi_margin;
2458 
2459 	/*
2460 	 * If the current margin value cannot satisfy the margin requested,
2461 	 * return ENOTSUP directly.
2462 	 */
2463 	if (*marginp > mip->mi_margin) {
2464 		err = ENOTSUP;
2465 		goto done;
2466 	}
2467 
2468 	/*
2469 	 * Check whether the given margin is already in the list. If so,
2470 	 * bump the reference count.
2471 	 */
2472 	for (pp = &mip->mi_mmrp; (p = *pp) != NULL; pp = &p->mmr_nextp) {
2473 		if (p->mmr_margin == *marginp) {
2474 			/*
2475 			 * The margin requested is already in the list,
2476 			 * so just bump the reference count.
2477 			 */
2478 			p->mmr_ref++;
2479 			goto done;
2480 		}
2481 		if (p->mmr_margin < *marginp)
2482 			break;
2483 	}
2484 
2485 
2486 	p = kmem_zalloc(sizeof (mac_margin_req_t), KM_SLEEP);
2487 	p->mmr_margin = *marginp;
2488 	p->mmr_ref++;
2489 	p->mmr_nextp = *pp;
2490 	*pp = p;
2491 
2492 done:
2493 	rw_exit(&(mip->mi_rw_lock));
2494 	return (err);
2495 }
2496 
2497 /*
2498  * The mac client requests to cancel its previous mac_margin_add() request.
2499  * We remove the requested margin size from the list.
2500  */
2501 int
2502 mac_margin_remove(mac_handle_t mh, uint32_t margin)
2503 {
2504 	mac_impl_t		*mip = (mac_impl_t *)mh;
2505 	mac_margin_req_t	**pp, *p;
2506 	int			err = 0;
2507 
2508 	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2509 	/*
2510 	 * Find the entry in the list for the given margin.
2511 	 */
2512 	for (pp = &(mip->mi_mmrp); (p = *pp) != NULL; pp = &(p->mmr_nextp)) {
2513 		if (p->mmr_margin == margin) {
2514 			if (--p->mmr_ref == 0)
2515 				break;
2516 
2517 			/*
2518 			 * There is still a reference to this address so
2519 			 * there's nothing more to do.
2520 			 */
2521 			goto done;
2522 		}
2523 	}
2524 
2525 	/*
2526 	 * We did not find an entry for the given margin.
2527 	 */
2528 	if (p == NULL) {
2529 		err = ENOENT;
2530 		goto done;
2531 	}
2532 
2533 	ASSERT(p->mmr_ref == 0);
2534 
2535 	/*
2536 	 * Remove it from the list.
2537 	 */
2538 	*pp = p->mmr_nextp;
2539 	kmem_free(p, sizeof (mac_margin_req_t));
2540 done:
2541 	rw_exit(&(mip->mi_rw_lock));
2542 	return (err);
2543 }
2544 
2545 boolean_t
2546 mac_margin_update(mac_handle_t mh, uint32_t margin)
2547 {
2548 	mac_impl_t	*mip = (mac_impl_t *)mh;
2549 	uint32_t	margin_needed = 0;
2550 
2551 	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2552 
2553 	if (mip->mi_mmrp != NULL)
2554 		margin_needed = mip->mi_mmrp->mmr_margin;
2555 
2556 	if (margin_needed <= margin)
2557 		mip->mi_margin = margin;
2558 
2559 	rw_exit(&(mip->mi_rw_lock));
2560 
2561 	if (margin_needed <= margin)
2562 		i_mac_notify(mip, MAC_NOTE_MARGIN);
2563 
2564 	return (margin_needed <= margin);
2565 }
2566 
2567 /*
2568  * MAC Type Plugin functions.
2569  */
2570 
2571 mactype_t *
2572 mactype_getplugin(const char *pname)
2573 {
2574 	mactype_t	*mtype = NULL;
2575 	boolean_t	tried_modload = B_FALSE;
2576 
2577 	mutex_enter(&i_mactype_lock);
2578 
2579 find_registered_mactype:
2580 	if (mod_hash_find(i_mactype_hash, (mod_hash_key_t)pname,
2581 	    (mod_hash_val_t *)&mtype) != 0) {
2582 		if (!tried_modload) {
2583 			/*
2584 			 * If the plugin has not yet been loaded, then
2585 			 * attempt to load it now.  If modload() succeeds,
2586 			 * the plugin should have registered using
2587 			 * mactype_register(), in which case we can go back
2588 			 * and attempt to find it again.
2589 			 */
2590 			if (modload(MACTYPE_KMODDIR, (char *)pname) != -1) {
2591 				tried_modload = B_TRUE;
2592 				goto find_registered_mactype;
2593 			}
2594 		}
2595 	} else {
2596 		/*
2597 		 * Note that there's no danger that the plugin we've loaded
2598 		 * could be unloaded between the modload() step and the
2599 		 * reference count bump here, as we're holding
2600 		 * i_mactype_lock, which mactype_unregister() also holds.
2601 		 */
2602 		atomic_inc_32(&mtype->mt_ref);
2603 	}
2604 
2605 	mutex_exit(&i_mactype_lock);
2606 	return (mtype);
2607 }
2608 
2609 mactype_register_t *
2610 mactype_alloc(uint_t mactype_version)
2611 {
2612 	mactype_register_t *mtrp;
2613 
2614 	/*
2615 	 * Make sure there isn't a version mismatch between the plugin and
2616 	 * the framework.  In the future, if multiple versions are
2617 	 * supported, this check could become more sophisticated.
2618 	 */
2619 	if (mactype_version != MACTYPE_VERSION)
2620 		return (NULL);
2621 
2622 	mtrp = kmem_zalloc(sizeof (mactype_register_t), KM_SLEEP);
2623 	mtrp->mtr_version = mactype_version;
2624 	return (mtrp);
2625 }
2626 
2627 void
2628 mactype_free(mactype_register_t *mtrp)
2629 {
2630 	kmem_free(mtrp, sizeof (mactype_register_t));
2631 }
2632 
2633 int
2634 mactype_register(mactype_register_t *mtrp)
2635 {
2636 	mactype_t	*mtp;
2637 	mactype_ops_t	*ops = mtrp->mtr_ops;
2638 
2639 	/* Do some sanity checking before we register this MAC type. */
2640 	if (mtrp->mtr_ident == NULL || ops == NULL)
2641 		return (EINVAL);
2642 
2643 	/*
2644 	 * Verify that all mandatory callbacks are set in the ops
2645 	 * vector.
2646 	 */
2647 	if (ops->mtops_unicst_verify == NULL ||
2648 	    ops->mtops_multicst_verify == NULL ||
2649 	    ops->mtops_sap_verify == NULL ||
2650 	    ops->mtops_header == NULL ||
2651 	    ops->mtops_header_info == NULL) {
2652 		return (EINVAL);
2653 	}
2654 
2655 	mtp = kmem_zalloc(sizeof (*mtp), KM_SLEEP);
2656 	mtp->mt_ident = mtrp->mtr_ident;
2657 	mtp->mt_ops = *ops;
2658 	mtp->mt_type = mtrp->mtr_mactype;
2659 	mtp->mt_nativetype = mtrp->mtr_nativetype;
2660 	mtp->mt_addr_length = mtrp->mtr_addrlen;
2661 	if (mtrp->mtr_brdcst_addr != NULL) {
2662 		mtp->mt_brdcst_addr = kmem_alloc(mtrp->mtr_addrlen, KM_SLEEP);
2663 		bcopy(mtrp->mtr_brdcst_addr, mtp->mt_brdcst_addr,
2664 		    mtrp->mtr_addrlen);
2665 	}
2666 
2667 	mtp->mt_stats = mtrp->mtr_stats;
2668 	mtp->mt_statcount = mtrp->mtr_statcount;
2669 
2670 	mtp->mt_mapping = mtrp->mtr_mapping;
2671 	mtp->mt_mappingcount = mtrp->mtr_mappingcount;
2672 
2673 	if (mod_hash_insert(i_mactype_hash,
2674 	    (mod_hash_key_t)mtp->mt_ident, (mod_hash_val_t)mtp) != 0) {
2675 		kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
2676 		kmem_free(mtp, sizeof (*mtp));
2677 		return (EEXIST);
2678 	}
2679 	return (0);
2680 }
2681 
2682 int
2683 mactype_unregister(const char *ident)
2684 {
2685 	mactype_t	*mtp;
2686 	mod_hash_val_t	val;
2687 	int 		err;
2688 
2689 	/*
2690 	 * Let's not allow MAC drivers to use this plugin while we're
2691 	 * trying to unregister it.  Holding i_mactype_lock also prevents a
2692 	 * plugin from unregistering while a MAC driver is attempting to
2693 	 * hold a reference to it in i_mactype_getplugin().
2694 	 */
2695 	mutex_enter(&i_mactype_lock);
2696 
2697 	if ((err = mod_hash_find(i_mactype_hash, (mod_hash_key_t)ident,
2698 	    (mod_hash_val_t *)&mtp)) != 0) {
2699 		/* A plugin is trying to unregister, but it never registered. */
2700 		err = ENXIO;
2701 		goto done;
2702 	}
2703 
2704 	if (mtp->mt_ref != 0) {
2705 		err = EBUSY;
2706 		goto done;
2707 	}
2708 
2709 	err = mod_hash_remove(i_mactype_hash, (mod_hash_key_t)ident, &val);
2710 	ASSERT(err == 0);
2711 	if (err != 0) {
2712 		/* This should never happen, thus the ASSERT() above. */
2713 		err = EINVAL;
2714 		goto done;
2715 	}
2716 	ASSERT(mtp == (mactype_t *)val);
2717 
2718 	if (mtp->mt_brdcst_addr != NULL)
2719 		kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
2720 	kmem_free(mtp, sizeof (mactype_t));
2721 done:
2722 	mutex_exit(&i_mactype_lock);
2723 	return (err);
2724 }
2725 
2726 /*
2727  * mac_set_prop() sets mac or hardware driver properties:
2728  * 	MAC resource properties include maxbw, priority, and cpu binding list.
2729  *	Driver properties are private properties to the hardware, such as mtu
2730  *	and speed.  There's one other MAC property -- the PVID.
2731  * If the property is a driver property, mac_set_prop() calls driver's callback
2732  * function to set it.
2733  * If the property is a mac resource property, mac_set_prop() invokes
2734  * mac_set_resources() which will cache the property value in mac_impl_t and
2735  * may call mac_client_set_resource() to update property value of the primary
2736  * mac client, if it exists.
2737  */
2738 int
2739 mac_set_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize)
2740 {
2741 	int err = ENOTSUP;
2742 	mac_impl_t *mip = (mac_impl_t *)mh;
2743 
2744 	ASSERT(MAC_PERIM_HELD(mh));
2745 
2746 	switch (macprop->mp_id) {
2747 	case MAC_PROP_MAXBW:
2748 	case MAC_PROP_PRIO:
2749 	case MAC_PROP_PROTECT:
2750 	case MAC_PROP_BIND_CPU: {
2751 		mac_resource_props_t mrp;
2752 
2753 		/* If it is mac property, call mac_set_resources() */
2754 		if (valsize < sizeof (mac_resource_props_t))
2755 			return (EINVAL);
2756 		bcopy(val, &mrp, sizeof (mrp));
2757 		err = mac_set_resources(mh, &mrp);
2758 		break;
2759 	}
2760 
2761 	case MAC_PROP_PVID:
2762 		if (valsize < sizeof (uint16_t) ||
2763 		    (mip->mi_state_flags & MIS_IS_VNIC))
2764 			return (EINVAL);
2765 		err = mac_set_pvid(mh, *(uint16_t *)val);
2766 		break;
2767 
2768 	case MAC_PROP_MTU: {
2769 		uint32_t mtu;
2770 
2771 		if (valsize < sizeof (mtu))
2772 			return (EINVAL);
2773 		bcopy(val, &mtu, sizeof (mtu));
2774 		err = mac_set_mtu(mh, mtu, NULL);
2775 		break;
2776 	}
2777 
2778 	case MAC_PROP_LLIMIT:
2779 	case MAC_PROP_LDECAY: {
2780 		uint32_t learnval;
2781 
2782 		if (valsize < sizeof (learnval) ||
2783 		    (mip->mi_state_flags & MIS_IS_VNIC))
2784 			return (EINVAL);
2785 		bcopy(val, &learnval, sizeof (learnval));
2786 		if (learnval == 0 && macprop->mp_id == MAC_PROP_LDECAY)
2787 			return (EINVAL);
2788 		if (macprop->mp_id == MAC_PROP_LLIMIT)
2789 			mip->mi_llimit = learnval;
2790 		else
2791 			mip->mi_ldecay = learnval;
2792 		err = 0;
2793 		break;
2794 	}
2795 
2796 	default:
2797 		/* For other driver properties, call driver's callback */
2798 		if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) {
2799 			err = mip->mi_callbacks->mc_setprop(mip->mi_driver,
2800 			    macprop->mp_name, macprop->mp_id, valsize, val);
2801 		}
2802 	}
2803 	return (err);
2804 }
2805 
2806 /*
2807  * mac_get_prop() gets mac or hardware driver properties.
2808  *
2809  * If the property is a driver property, mac_get_prop() calls driver's callback
2810  * function to get it.
2811  * If the property is a mac property, mac_get_prop() invokes mac_get_resources()
2812  * which returns the cached value in mac_impl_t.
2813  */
2814 int
2815 mac_get_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize,
2816     uint_t *perm)
2817 {
2818 	int err = ENOTSUP;
2819 	mac_impl_t *mip = (mac_impl_t *)mh;
2820 	link_state_t link_state;
2821 	boolean_t is_getprop, is_setprop;
2822 
2823 	is_getprop = (mip->mi_callbacks->mc_callbacks & MC_GETPROP);
2824 	is_setprop = (mip->mi_callbacks->mc_callbacks & MC_SETPROP);
2825 
2826 	switch (macprop->mp_id) {
2827 	case MAC_PROP_MAXBW:
2828 	case MAC_PROP_PRIO:
2829 	case MAC_PROP_PROTECT:
2830 	case MAC_PROP_BIND_CPU: {
2831 		mac_resource_props_t mrp;
2832 
2833 		/* If mac property, read from cache */
2834 		if (valsize < sizeof (mac_resource_props_t))
2835 			return (EINVAL);
2836 		mac_get_resources(mh, &mrp);
2837 		bcopy(&mrp, val, sizeof (mac_resource_props_t));
2838 		return (0);
2839 	}
2840 
2841 	case MAC_PROP_PVID:
2842 		if (valsize < sizeof (uint16_t) ||
2843 		    (mip->mi_state_flags & MIS_IS_VNIC))
2844 			return (EINVAL);
2845 		*(uint16_t *)val = mac_get_pvid(mh);
2846 		return (0);
2847 
2848 	case MAC_PROP_LLIMIT:
2849 	case MAC_PROP_LDECAY:
2850 		if (valsize < sizeof (uint32_t) ||
2851 		    (mip->mi_state_flags & MIS_IS_VNIC))
2852 			return (EINVAL);
2853 		if (macprop->mp_id == MAC_PROP_LLIMIT)
2854 			bcopy(&mip->mi_llimit, val, sizeof (mip->mi_llimit));
2855 		else
2856 			bcopy(&mip->mi_ldecay, val, sizeof (mip->mi_ldecay));
2857 		return (0);
2858 
2859 	case MAC_PROP_MTU: {
2860 		uint32_t sdu;
2861 		mac_propval_range_t range;
2862 
2863 		if ((macprop->mp_flags & MAC_PROP_POSSIBLE) != 0) {
2864 			if (valsize < sizeof (mac_propval_range_t))
2865 				return (EINVAL);
2866 			if (is_getprop) {
2867 				err = mip->mi_callbacks->mc_getprop(mip->
2868 				    mi_driver, macprop->mp_name, macprop->mp_id,
2869 				    macprop->mp_flags, valsize, val, perm);
2870 			}
2871 			/*
2872 			 * If the driver doesn't have *_m_getprop defined or
2873 			 * if the driver doesn't support setting MTU then
2874 			 * return the CURRENT value as POSSIBLE value.
2875 			 */
2876 			if (!is_getprop || err == ENOTSUP) {
2877 				mac_sdu_get(mh, NULL, &sdu);
2878 				range.mpr_count = 1;
2879 				range.mpr_type = MAC_PROPVAL_UINT32;
2880 				range.range_uint32[0].mpur_min =
2881 				    range.range_uint32[0].mpur_max = sdu;
2882 				bcopy(&range, val, sizeof (range));
2883 				err = 0;
2884 			}
2885 			return (err);
2886 		}
2887 		if (valsize < sizeof (sdu))
2888 			return (EINVAL);
2889 		if ((macprop->mp_flags & MAC_PROP_DEFAULT) == 0) {
2890 			mac_sdu_get(mh, NULL, &sdu);
2891 			bcopy(&sdu, val, sizeof (sdu));
2892 			if (is_setprop && (mip->mi_callbacks->mc_setprop(mip->
2893 			    mi_driver, macprop->mp_name, macprop->mp_id,
2894 			    valsize, val) == 0)) {
2895 				*perm = MAC_PROP_PERM_RW;
2896 			} else {
2897 				*perm = MAC_PROP_PERM_READ;
2898 			}
2899 			return (0);
2900 		} else {
2901 			if (mip->mi_info.mi_media == DL_ETHER) {
2902 				sdu = ETHERMTU;
2903 				bcopy(&sdu, val, sizeof (sdu));
2904 
2905 				return (0);
2906 			}
2907 			/*
2908 			 * ask driver for its default.
2909 			 */
2910 			break;
2911 		}
2912 	}
2913 	case MAC_PROP_STATUS:
2914 		if (valsize < sizeof (link_state))
2915 			return (EINVAL);
2916 		*perm = MAC_PROP_PERM_READ;
2917 		link_state = mac_link_get(mh);
2918 		bcopy(&link_state, val, sizeof (link_state));
2919 		return (0);
2920 	default:
2921 		break;
2922 
2923 	}
2924 	/* If driver property, request from driver */
2925 	if (is_getprop) {
2926 		err = mip->mi_callbacks->mc_getprop(mip->mi_driver,
2927 		    macprop->mp_name, macprop->mp_id, macprop->mp_flags,
2928 		    valsize, val, perm);
2929 	}
2930 	return (err);
2931 }
2932 
2933 int
2934 mac_fastpath_disable(mac_handle_t mh)
2935 {
2936 	mac_impl_t	*mip = (mac_impl_t *)mh;
2937 
2938 	if ((mip->mi_state_flags & MIS_LEGACY) == 0)
2939 		return (0);
2940 
2941 	return (mip->mi_capab_legacy.ml_fastpath_disable(mip->mi_driver));
2942 }
2943 
2944 void
2945 mac_fastpath_enable(mac_handle_t mh)
2946 {
2947 	mac_impl_t	*mip = (mac_impl_t *)mh;
2948 
2949 	if ((mip->mi_state_flags & MIS_LEGACY) == 0)
2950 		return;
2951 
2952 	mip->mi_capab_legacy.ml_fastpath_enable(mip->mi_driver);
2953 }
2954 
2955 void
2956 mac_register_priv_prop(mac_impl_t *mip, mac_priv_prop_t *mpp, uint_t nprop)
2957 {
2958 	mac_priv_prop_t *mpriv;
2959 
2960 	if (mpp == NULL)
2961 		return;
2962 
2963 	mpriv = kmem_zalloc(nprop * sizeof (*mpriv), KM_SLEEP);
2964 	(void) memcpy(mpriv, mpp, nprop * sizeof (*mpriv));
2965 	mip->mi_priv_prop = mpriv;
2966 	mip->mi_priv_prop_count = nprop;
2967 }
2968 
2969 void
2970 mac_unregister_priv_prop(mac_impl_t *mip)
2971 {
2972 	mac_priv_prop_t	*mpriv;
2973 
2974 	mpriv = mip->mi_priv_prop;
2975 	if (mpriv != NULL) {
2976 		kmem_free(mpriv, mip->mi_priv_prop_count * sizeof (*mpriv));
2977 		mip->mi_priv_prop = NULL;
2978 	}
2979 	mip->mi_priv_prop_count = 0;
2980 }
2981 
2982 /*
2983  * mac_ring_t 'mr' macros. Some rogue drivers may access ring structure
2984  * (by invoking mac_rx()) even after processing mac_stop_ring(). In such
2985  * cases if MAC free's the ring structure after mac_stop_ring(), any
2986  * illegal access to the ring structure coming from the driver will panic
2987  * the system. In order to protect the system from such inadverent access,
2988  * we maintain a cache of rings in the mac_impl_t after they get free'd up.
2989  * When packets are received on free'd up rings, MAC (through the generation
2990  * count mechanism) will drop such packets.
2991  */
2992 static mac_ring_t *
2993 mac_ring_alloc(mac_impl_t *mip, mac_capab_rings_t *cap_rings)
2994 {
2995 	mac_ring_t *ring;
2996 
2997 	if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2998 		mutex_enter(&mip->mi_ring_lock);
2999 		if (mip->mi_ring_freelist != NULL) {
3000 			ring = mip->mi_ring_freelist;
3001 			mip->mi_ring_freelist = ring->mr_next;
3002 			bzero(ring, sizeof (mac_ring_t));
3003 		} else {
3004 			ring = kmem_cache_alloc(mac_ring_cache, KM_SLEEP);
3005 		}
3006 		mutex_exit(&mip->mi_ring_lock);
3007 	} else {
3008 		ring = kmem_zalloc(sizeof (mac_ring_t), KM_SLEEP);
3009 	}
3010 	ASSERT((ring != NULL) && (ring->mr_state == MR_FREE));
3011 	return (ring);
3012 }
3013 
3014 static void
3015 mac_ring_free(mac_impl_t *mip, mac_ring_t *ring)
3016 {
3017 	if (ring->mr_type == MAC_RING_TYPE_RX) {
3018 		mutex_enter(&mip->mi_ring_lock);
3019 		ring->mr_state = MR_FREE;
3020 		ring->mr_flag = 0;
3021 		ring->mr_next = mip->mi_ring_freelist;
3022 		mip->mi_ring_freelist = ring;
3023 		mutex_exit(&mip->mi_ring_lock);
3024 	} else {
3025 		kmem_free(ring, sizeof (mac_ring_t));
3026 	}
3027 }
3028 
3029 static void
3030 mac_ring_freeall(mac_impl_t *mip)
3031 {
3032 	mac_ring_t *ring_next;
3033 	mutex_enter(&mip->mi_ring_lock);
3034 	mac_ring_t *ring = mip->mi_ring_freelist;
3035 	while (ring != NULL) {
3036 		ring_next = ring->mr_next;
3037 		kmem_cache_free(mac_ring_cache, ring);
3038 		ring = ring_next;
3039 	}
3040 	mip->mi_ring_freelist = NULL;
3041 	mutex_exit(&mip->mi_ring_lock);
3042 }
3043 
3044 int
3045 mac_start_ring(mac_ring_t *ring)
3046 {
3047 	int rv = 0;
3048 
3049 	if (ring->mr_start != NULL)
3050 		rv = ring->mr_start(ring->mr_driver, ring->mr_gen_num);
3051 
3052 	return (rv);
3053 }
3054 
3055 void
3056 mac_stop_ring(mac_ring_t *ring)
3057 {
3058 	if (ring->mr_stop != NULL)
3059 		ring->mr_stop(ring->mr_driver);
3060 
3061 	/*
3062 	 * Increment the ring generation number for this ring.
3063 	 */
3064 	ring->mr_gen_num++;
3065 }
3066 
3067 int
3068 mac_start_group(mac_group_t *group)
3069 {
3070 	int rv = 0;
3071 
3072 	if (group->mrg_start != NULL)
3073 		rv = group->mrg_start(group->mrg_driver);
3074 
3075 	return (rv);
3076 }
3077 
3078 void
3079 mac_stop_group(mac_group_t *group)
3080 {
3081 	if (group->mrg_stop != NULL)
3082 		group->mrg_stop(group->mrg_driver);
3083 }
3084 
3085 /*
3086  * Called from mac_start() on the default Rx group. Broadcast and multicast
3087  * packets are received only on the default group. Hence the default group
3088  * needs to be up even if the primary client is not up, for the other groups
3089  * to be functional. We do this by calling this function at mac_start time
3090  * itself. However the broadcast packets that are received can't make their
3091  * way beyond mac_rx until a mac client creates a broadcast flow.
3092  */
3093 static int
3094 mac_start_group_and_rings(mac_group_t *group)
3095 {
3096 	mac_ring_t	*ring;
3097 	int		rv = 0;
3098 
3099 	ASSERT(group->mrg_state == MAC_GROUP_STATE_REGISTERED);
3100 	if ((rv = mac_start_group(group)) != 0)
3101 		return (rv);
3102 
3103 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
3104 		ASSERT(ring->mr_state == MR_FREE);
3105 		if ((rv = mac_start_ring(ring)) != 0)
3106 			goto error;
3107 		ring->mr_state = MR_INUSE;
3108 		ring->mr_classify_type = MAC_SW_CLASSIFIER;
3109 	}
3110 	return (0);
3111 
3112 error:
3113 	mac_stop_group_and_rings(group);
3114 	return (rv);
3115 }
3116 
3117 /* Called from mac_stop on the default Rx group */
3118 static void
3119 mac_stop_group_and_rings(mac_group_t *group)
3120 {
3121 	mac_ring_t	*ring;
3122 
3123 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
3124 		if (ring->mr_state != MR_FREE) {
3125 			mac_stop_ring(ring);
3126 			ring->mr_state = MR_FREE;
3127 			ring->mr_flag = 0;
3128 			ring->mr_classify_type = MAC_NO_CLASSIFIER;
3129 		}
3130 	}
3131 	mac_stop_group(group);
3132 }
3133 
3134 
3135 static mac_ring_t *
3136 mac_init_ring(mac_impl_t *mip, mac_group_t *group, int index,
3137     mac_capab_rings_t *cap_rings)
3138 {
3139 	mac_ring_t *ring;
3140 	mac_ring_info_t ring_info;
3141 
3142 	ring = mac_ring_alloc(mip, cap_rings);
3143 
3144 	/* Prepare basic information of ring */
3145 	ring->mr_index = index;
3146 	ring->mr_type = group->mrg_type;
3147 	ring->mr_gh = (mac_group_handle_t)group;
3148 
3149 	/* Insert the new ring to the list. */
3150 	ring->mr_next = group->mrg_rings;
3151 	group->mrg_rings = ring;
3152 
3153 	/* Zero to reuse the info data structure */
3154 	bzero(&ring_info, sizeof (ring_info));
3155 
3156 	/* Query ring information from driver */
3157 	cap_rings->mr_rget(mip->mi_driver, group->mrg_type, group->mrg_index,
3158 	    index, &ring_info, (mac_ring_handle_t)ring);
3159 
3160 	ring->mr_info = ring_info;
3161 
3162 	/* Update ring's status */
3163 	ring->mr_state = MR_FREE;
3164 	ring->mr_flag = 0;
3165 
3166 	/* Update the ring count of the group */
3167 	group->mrg_cur_count++;
3168 	return (ring);
3169 }
3170 
3171 /*
3172  * Rings are chained together for easy regrouping.
3173  */
3174 static void
3175 mac_init_group(mac_impl_t *mip, mac_group_t *group, int size,
3176     mac_capab_rings_t *cap_rings)
3177 {
3178 	int index;
3179 
3180 	/*
3181 	 * Initialize all ring members of this group. Size of zero will not
3182 	 * enter the loop, so it's safe for initializing an empty group.
3183 	 */
3184 	for (index = size - 1; index >= 0; index--)
3185 		(void) mac_init_ring(mip, group, index, cap_rings);
3186 }
3187 
3188 int
3189 mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype)
3190 {
3191 	mac_capab_rings_t *cap_rings;
3192 	mac_group_t *group, *groups;
3193 	mac_group_info_t group_info;
3194 	uint_t group_free = 0;
3195 	uint_t ring_left;
3196 	mac_ring_t *ring;
3197 	int g, err = 0;
3198 
3199 	switch (rtype) {
3200 	case MAC_RING_TYPE_RX:
3201 		ASSERT(mip->mi_rx_groups == NULL);
3202 
3203 		cap_rings = &mip->mi_rx_rings_cap;
3204 		cap_rings->mr_type = MAC_RING_TYPE_RX;
3205 		break;
3206 	case MAC_RING_TYPE_TX:
3207 		ASSERT(mip->mi_tx_groups == NULL);
3208 
3209 		cap_rings = &mip->mi_tx_rings_cap;
3210 		cap_rings->mr_type = MAC_RING_TYPE_TX;
3211 		break;
3212 	default:
3213 		ASSERT(B_FALSE);
3214 	}
3215 
3216 	if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_RINGS,
3217 	    cap_rings))
3218 		return (0);
3219 
3220 	/*
3221 	 * Allocate a contiguous buffer for all groups.
3222 	 */
3223 	groups = kmem_zalloc(sizeof (mac_group_t) * (cap_rings->mr_gnum + 1),
3224 	    KM_SLEEP);
3225 
3226 	ring_left = cap_rings->mr_rnum;
3227 
3228 	/*
3229 	 * Get all ring groups if any, and get their ring members
3230 	 * if any.
3231 	 */
3232 	for (g = 0; g < cap_rings->mr_gnum; g++) {
3233 		group = groups + g;
3234 
3235 		/* Prepare basic information of the group */
3236 		group->mrg_index = g;
3237 		group->mrg_type = rtype;
3238 		group->mrg_state = MAC_GROUP_STATE_UNINIT;
3239 		group->mrg_mh = (mac_handle_t)mip;
3240 		group->mrg_next = group + 1;
3241 
3242 		/* Zero to reuse the info data structure */
3243 		bzero(&group_info, sizeof (group_info));
3244 
3245 		/* Query group information from driver */
3246 		cap_rings->mr_gget(mip->mi_driver, rtype, g, &group_info,
3247 		    (mac_group_handle_t)group);
3248 
3249 		switch (cap_rings->mr_group_type) {
3250 		case MAC_GROUP_TYPE_DYNAMIC:
3251 			if (cap_rings->mr_gaddring == NULL ||
3252 			    cap_rings->mr_gremring == NULL) {
3253 				DTRACE_PROBE3(
3254 				    mac__init__rings_no_addremring,
3255 				    char *, mip->mi_name,
3256 				    mac_group_add_ring_t,
3257 				    cap_rings->mr_gaddring,
3258 				    mac_group_add_ring_t,
3259 				    cap_rings->mr_gremring);
3260 				err = EINVAL;
3261 				goto bail;
3262 			}
3263 
3264 			switch (rtype) {
3265 			case MAC_RING_TYPE_RX:
3266 				/*
3267 				 * The first RX group must have non-zero
3268 				 * rings, and the following groups must
3269 				 * have zero rings.
3270 				 */
3271 				if (g == 0 && group_info.mgi_count == 0) {
3272 					DTRACE_PROBE1(
3273 					    mac__init__rings__rx__def__zero,
3274 					    char *, mip->mi_name);
3275 					err = EINVAL;
3276 					goto bail;
3277 				}
3278 				if (g > 0 && group_info.mgi_count != 0) {
3279 					DTRACE_PROBE3(
3280 					    mac__init__rings__rx__nonzero,
3281 					    char *, mip->mi_name,
3282 					    int, g, int, group_info.mgi_count);
3283 					err = EINVAL;
3284 					goto bail;
3285 				}
3286 				break;
3287 			case MAC_RING_TYPE_TX:
3288 				/*
3289 				 * All TX ring groups must have zero rings.
3290 				 */
3291 				if (group_info.mgi_count != 0) {
3292 					DTRACE_PROBE3(
3293 					    mac__init__rings__tx__nonzero,
3294 					    char *, mip->mi_name,
3295 					    int, g, int, group_info.mgi_count);
3296 					err = EINVAL;
3297 					goto bail;
3298 				}
3299 				break;
3300 			}
3301 			break;
3302 		case MAC_GROUP_TYPE_STATIC:
3303 			/*
3304 			 * Note that an empty group is allowed, e.g., an aggr
3305 			 * would start with an empty group.
3306 			 */
3307 			break;
3308 		default:
3309 			/* unknown group type */
3310 			DTRACE_PROBE2(mac__init__rings__unknown__type,
3311 			    char *, mip->mi_name,
3312 			    int, cap_rings->mr_group_type);
3313 			err = EINVAL;
3314 			goto bail;
3315 		}
3316 
3317 
3318 		/*
3319 		 * Driver must register group->mgi_addmac/remmac() for rx groups
3320 		 * to support multiple MAC addresses.
3321 		 */
3322 		if (rtype == MAC_RING_TYPE_RX) {
3323 			if ((group_info.mgi_addmac == NULL) ||
3324 			    (group_info.mgi_addmac == NULL))
3325 				goto bail;
3326 		}
3327 
3328 		/* Cache driver-supplied information */
3329 		group->mrg_info = group_info;
3330 
3331 		/* Update the group's status and group count. */
3332 		mac_set_rx_group_state(group, MAC_GROUP_STATE_REGISTERED);
3333 		group_free++;
3334 
3335 		group->mrg_rings = NULL;
3336 		group->mrg_cur_count = 0;
3337 		mac_init_group(mip, group, group_info.mgi_count, cap_rings);
3338 		ring_left -= group_info.mgi_count;
3339 
3340 		/* The current group size should be equal to default value */
3341 		ASSERT(group->mrg_cur_count == group_info.mgi_count);
3342 	}
3343 
3344 	/* Build up a dummy group for free resources as a pool */
3345 	group = groups + cap_rings->mr_gnum;
3346 
3347 	/* Prepare basic information of the group */
3348 	group->mrg_index = -1;
3349 	group->mrg_type = rtype;
3350 	group->mrg_state = MAC_GROUP_STATE_UNINIT;
3351 	group->mrg_mh = (mac_handle_t)mip;
3352 	group->mrg_next = NULL;
3353 
3354 	/*
3355 	 * If there are ungrouped rings, allocate a continuous buffer for
3356 	 * remaining resources.
3357 	 */
3358 	if (ring_left != 0) {
3359 		group->mrg_rings = NULL;
3360 		group->mrg_cur_count = 0;
3361 		mac_init_group(mip, group, ring_left, cap_rings);
3362 
3363 		/* The current group size should be equal to ring_left */
3364 		ASSERT(group->mrg_cur_count == ring_left);
3365 
3366 		ring_left = 0;
3367 
3368 		/* Update this group's status */
3369 		mac_set_rx_group_state(group, MAC_GROUP_STATE_REGISTERED);
3370 	} else
3371 		group->mrg_rings = NULL;
3372 
3373 	ASSERT(ring_left == 0);
3374 
3375 bail:
3376 	/* Cache other important information to finalize the initialization */
3377 	switch (rtype) {
3378 	case MAC_RING_TYPE_RX:
3379 		mip->mi_rx_group_type = cap_rings->mr_group_type;
3380 		mip->mi_rx_group_count = cap_rings->mr_gnum;
3381 		mip->mi_rx_groups = groups;
3382 		break;
3383 	case MAC_RING_TYPE_TX:
3384 		mip->mi_tx_group_type = cap_rings->mr_group_type;
3385 		mip->mi_tx_group_count = cap_rings->mr_gnum;
3386 		mip->mi_tx_group_free = group_free;
3387 		mip->mi_tx_groups = groups;
3388 
3389 		/*
3390 		 * Ring 0 is used as the default one and it could be assigned
3391 		 * to a client as well.
3392 		 */
3393 		group = groups + cap_rings->mr_gnum;
3394 		ring = group->mrg_rings;
3395 		while ((ring->mr_index != 0) && (ring->mr_next != NULL))
3396 			ring = ring->mr_next;
3397 		ASSERT(ring->mr_index == 0);
3398 		mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
3399 		break;
3400 	default:
3401 		ASSERT(B_FALSE);
3402 	}
3403 
3404 	if (err != 0)
3405 		mac_free_rings(mip, rtype);
3406 
3407 	return (err);
3408 }
3409 
3410 /*
3411  * Called to free all ring groups with particular type. It's supposed all groups
3412  * have been released by clinet.
3413  */
3414 void
3415 mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype)
3416 {
3417 	mac_group_t *group, *groups;
3418 	uint_t group_count;
3419 
3420 	switch (rtype) {
3421 	case MAC_RING_TYPE_RX:
3422 		if (mip->mi_rx_groups == NULL)
3423 			return;
3424 
3425 		groups = mip->mi_rx_groups;
3426 		group_count = mip->mi_rx_group_count;
3427 
3428 		mip->mi_rx_groups = NULL;
3429 		mip->mi_rx_group_count = 0;
3430 		break;
3431 	case MAC_RING_TYPE_TX:
3432 		ASSERT(mip->mi_tx_group_count == mip->mi_tx_group_free);
3433 
3434 		if (mip->mi_tx_groups == NULL)
3435 			return;
3436 
3437 		groups = mip->mi_tx_groups;
3438 		group_count = mip->mi_tx_group_count;
3439 
3440 		mip->mi_tx_groups = NULL;
3441 		mip->mi_tx_group_count = 0;
3442 		mip->mi_tx_group_free = 0;
3443 		mip->mi_default_tx_ring = NULL;
3444 		break;
3445 	default:
3446 		ASSERT(B_FALSE);
3447 	}
3448 
3449 	for (group = groups; group != NULL; group = group->mrg_next) {
3450 		mac_ring_t *ring;
3451 
3452 		if (group->mrg_cur_count == 0)
3453 			continue;
3454 
3455 		ASSERT(group->mrg_rings != NULL);
3456 
3457 		while ((ring = group->mrg_rings) != NULL) {
3458 			group->mrg_rings = ring->mr_next;
3459 			mac_ring_free(mip, ring);
3460 		}
3461 	}
3462 
3463 	/* Free all the cached rings */
3464 	mac_ring_freeall(mip);
3465 	/* Free the block of group data strutures */
3466 	kmem_free(groups, sizeof (mac_group_t) * (group_count + 1));
3467 }
3468 
3469 /*
3470  * Associate a MAC address with a receive group.
3471  *
3472  * The return value of this function should always be checked properly, because
3473  * any type of failure could cause unexpected results. A group can be added
3474  * or removed with a MAC address only after it has been reserved. Ideally,
3475  * a successful reservation always leads to calling mac_group_addmac() to
3476  * steer desired traffic. Failure of adding an unicast MAC address doesn't
3477  * always imply that the group is functioning abnormally.
3478  *
3479  * Currently this function is called everywhere, and it reflects assumptions
3480  * about MAC addresses in the implementation. CR 6735196.
3481  */
3482 int
3483 mac_group_addmac(mac_group_t *group, const uint8_t *addr)
3484 {
3485 	ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
3486 	ASSERT(group->mrg_info.mgi_addmac != NULL);
3487 
3488 	return (group->mrg_info.mgi_addmac(group->mrg_info.mgi_driver, addr));
3489 }
3490 
3491 /*
3492  * Remove the association between MAC address and receive group.
3493  */
3494 int
3495 mac_group_remmac(mac_group_t *group, const uint8_t *addr)
3496 {
3497 	ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
3498 	ASSERT(group->mrg_info.mgi_remmac != NULL);
3499 
3500 	return (group->mrg_info.mgi_remmac(group->mrg_info.mgi_driver, addr));
3501 }
3502 
3503 /*
3504  * Release a ring in use by marking it MR_FREE.
3505  * Any other client may reserve it for its use.
3506  */
3507 void
3508 mac_release_tx_ring(mac_ring_handle_t rh)
3509 {
3510 	mac_ring_t *ring = (mac_ring_t *)rh;
3511 	mac_group_t *group = (mac_group_t *)ring->mr_gh;
3512 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
3513 
3514 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3515 	ASSERT(ring->mr_state != MR_FREE);
3516 
3517 	/*
3518 	 * Default tx ring will be released by mac_stop().
3519 	 */
3520 	if (rh == mip->mi_default_tx_ring)
3521 		return;
3522 
3523 	mac_stop_ring(ring);
3524 
3525 	ring->mr_state = MR_FREE;
3526 	ring->mr_flag = 0;
3527 }
3528 
3529 /*
3530  * This is the entry point for packets transmitted through the bridging code.
3531  * If no bridge is in place, MAC_RING_TX transmits using tx ring. The 'rh'
3532  * pointer may be NULL to select the default ring.
3533  */
3534 mblk_t *
3535 mac_bridge_tx(mac_impl_t *mip, mac_ring_handle_t rh, mblk_t *mp)
3536 {
3537 	mac_handle_t mh;
3538 
3539 	/*
3540 	 * Once we take a reference on the bridge link, the bridge
3541 	 * module itself can't unload, so the callback pointers are
3542 	 * stable.
3543 	 */
3544 	mutex_enter(&mip->mi_bridge_lock);
3545 	if ((mh = mip->mi_bridge_link) != NULL)
3546 		mac_bridge_ref_cb(mh, B_TRUE);
3547 	mutex_exit(&mip->mi_bridge_lock);
3548 	if (mh == NULL) {
3549 		MAC_RING_TX(mip, rh, mp, mp);
3550 	} else {
3551 		mp = mac_bridge_tx_cb(mh, rh, mp);
3552 		mac_bridge_ref_cb(mh, B_FALSE);
3553 	}
3554 
3555 	return (mp);
3556 }
3557 
3558 /*
3559  * Find a ring from its index.
3560  */
3561 mac_ring_t *
3562 mac_find_ring(mac_group_t *group, int index)
3563 {
3564 	mac_ring_t *ring = group->mrg_rings;
3565 
3566 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next)
3567 		if (ring->mr_index == index)
3568 			break;
3569 
3570 	return (ring);
3571 }
3572 /*
3573  * Add a ring to an existing group.
3574  *
3575  * The ring must be either passed directly (for example if the ring
3576  * movement is initiated by the framework), or specified through a driver
3577  * index (for example when the ring is added by the driver.
3578  *
3579  * The caller needs to call mac_perim_enter() before calling this function.
3580  */
3581 int
3582 i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index)
3583 {
3584 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
3585 	mac_capab_rings_t *cap_rings;
3586 	boolean_t driver_call = (ring == NULL);
3587 	mac_group_type_t group_type;
3588 	int ret = 0;
3589 
3590 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3591 
3592 	switch (group->mrg_type) {
3593 	case MAC_RING_TYPE_RX:
3594 		cap_rings = &mip->mi_rx_rings_cap;
3595 		group_type = mip->mi_rx_group_type;
3596 		break;
3597 	case MAC_RING_TYPE_TX:
3598 		cap_rings = &mip->mi_tx_rings_cap;
3599 		group_type = mip->mi_tx_group_type;
3600 		break;
3601 	default:
3602 		ASSERT(B_FALSE);
3603 	}
3604 
3605 	/*
3606 	 * There should be no ring with the same ring index in the target
3607 	 * group.
3608 	 */
3609 	ASSERT(mac_find_ring(group, driver_call ? index : ring->mr_index) ==
3610 	    NULL);
3611 
3612 	if (driver_call) {
3613 		/*
3614 		 * The function is called as a result of a request from
3615 		 * a driver to add a ring to an existing group, for example
3616 		 * from the aggregation driver. Allocate a new mac_ring_t
3617 		 * for that ring.
3618 		 */
3619 		ring = mac_init_ring(mip, group, index, cap_rings);
3620 		ASSERT(group->mrg_state > MAC_GROUP_STATE_UNINIT);
3621 	} else {
3622 		/*
3623 		 * The function is called as a result of a MAC layer request
3624 		 * to add a ring to an existing group. In this case the
3625 		 * ring is being moved between groups, which requires
3626 		 * the underlying driver to support dynamic grouping,
3627 		 * and the mac_ring_t already exists.
3628 		 */
3629 		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
3630 		ASSERT(cap_rings->mr_gaddring != NULL);
3631 		ASSERT(ring->mr_gh == NULL);
3632 	}
3633 
3634 	/*
3635 	 * At this point the ring should not be in use, and it should be
3636 	 * of the right for the target group.
3637 	 */
3638 	ASSERT(ring->mr_state < MR_INUSE);
3639 	ASSERT(ring->mr_srs == NULL);
3640 	ASSERT(ring->mr_type == group->mrg_type);
3641 
3642 	if (!driver_call) {
3643 		/*
3644 		 * Add the driver level hardware ring if the process was not
3645 		 * initiated by the driver, and the target group is not the
3646 		 * group.
3647 		 */
3648 		if (group->mrg_driver != NULL) {
3649 			cap_rings->mr_gaddring(group->mrg_driver,
3650 			    ring->mr_driver, ring->mr_type);
3651 		}
3652 
3653 		/*
3654 		 * Insert the ring ahead existing rings.
3655 		 */
3656 		ring->mr_next = group->mrg_rings;
3657 		group->mrg_rings = ring;
3658 		ring->mr_gh = (mac_group_handle_t)group;
3659 		group->mrg_cur_count++;
3660 	}
3661 
3662 	/*
3663 	 * If the group has not been actively used, we're done.
3664 	 */
3665 	if (group->mrg_index != -1 &&
3666 	    group->mrg_state < MAC_GROUP_STATE_RESERVED)
3667 		return (0);
3668 
3669 	/*
3670 	 * Set up SRS/SR according to the ring type.
3671 	 */
3672 	switch (ring->mr_type) {
3673 	case MAC_RING_TYPE_RX:
3674 		/*
3675 		 * Setup SRS on top of the new ring if the group is
3676 		 * reserved for someones exclusive use.
3677 		 */
3678 		if (group->mrg_state == MAC_GROUP_STATE_RESERVED) {
3679 			flow_entry_t *flent;
3680 			mac_client_impl_t *mcip;
3681 
3682 			mcip = MAC_RX_GROUP_ONLY_CLIENT(group);
3683 			ASSERT(mcip != NULL);
3684 			flent = mcip->mci_flent;
3685 			ASSERT(flent->fe_rx_srs_cnt > 0);
3686 			mac_srs_group_setup(mcip, flent, group, SRST_LINK);
3687 		}
3688 		break;
3689 	case MAC_RING_TYPE_TX:
3690 		/*
3691 		 * For TX this function is only invoked during the
3692 		 * initial creation of a group when a share is
3693 		 * associated with a MAC client. So the datapath is not
3694 		 * yet setup, and will be setup later after the
3695 		 * group has been reserved and populated.
3696 		 */
3697 		break;
3698 	default:
3699 		ASSERT(B_FALSE);
3700 	}
3701 
3702 	/*
3703 	 * Start the ring if needed. Failure causes to undo the grouping action.
3704 	 */
3705 	if ((ret = mac_start_ring(ring)) != 0) {
3706 		if (ring->mr_type == MAC_RING_TYPE_RX) {
3707 			if (ring->mr_srs != NULL) {
3708 				mac_rx_srs_remove(ring->mr_srs);
3709 				ring->mr_srs = NULL;
3710 			}
3711 		}
3712 		if (!driver_call) {
3713 			cap_rings->mr_gremring(group->mrg_driver,
3714 			    ring->mr_driver, ring->mr_type);
3715 		}
3716 		group->mrg_cur_count--;
3717 		group->mrg_rings = ring->mr_next;
3718 
3719 		ring->mr_gh = NULL;
3720 
3721 		if (driver_call)
3722 			mac_ring_free(mip, ring);
3723 
3724 		return (ret);
3725 	}
3726 
3727 	/*
3728 	 * Update the ring's state.
3729 	 */
3730 	ring->mr_state = MR_INUSE;
3731 	MAC_RING_UNMARK(ring, MR_INCIPIENT);
3732 	return (0);
3733 }
3734 
3735 /*
3736  * Remove a ring from it's current group. MAC internal function for dynamic
3737  * grouping.
3738  *
3739  * The caller needs to call mac_perim_enter() before calling this function.
3740  */
3741 void
3742 i_mac_group_rem_ring(mac_group_t *group, mac_ring_t *ring,
3743     boolean_t driver_call)
3744 {
3745 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
3746 	mac_capab_rings_t *cap_rings = NULL;
3747 	mac_group_type_t group_type;
3748 
3749 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3750 
3751 	ASSERT(mac_find_ring(group, ring->mr_index) == ring);
3752 	ASSERT((mac_group_t *)ring->mr_gh == group);
3753 	ASSERT(ring->mr_type == group->mrg_type);
3754 
3755 	switch (ring->mr_type) {
3756 	case MAC_RING_TYPE_RX:
3757 		group_type = mip->mi_rx_group_type;
3758 		cap_rings = &mip->mi_rx_rings_cap;
3759 
3760 		if (group->mrg_state >= MAC_GROUP_STATE_RESERVED)
3761 			mac_stop_ring(ring);
3762 
3763 		/*
3764 		 * Only hardware classified packets hold a reference to the
3765 		 * ring all the way up the Rx path. mac_rx_srs_remove()
3766 		 * will take care of quiescing the Rx path and removing the
3767 		 * SRS. The software classified path neither holds a reference
3768 		 * nor any association with the ring in mac_rx.
3769 		 */
3770 		if (ring->mr_srs != NULL) {
3771 			mac_rx_srs_remove(ring->mr_srs);
3772 			ring->mr_srs = NULL;
3773 		}
3774 		ring->mr_state = MR_FREE;
3775 		ring->mr_flag = 0;
3776 
3777 		break;
3778 	case MAC_RING_TYPE_TX:
3779 		/*
3780 		 * For TX this function is only invoked in two
3781 		 * cases:
3782 		 *
3783 		 * 1) In the case of a failure during the
3784 		 * initial creation of a group when a share is
3785 		 * associated with a MAC client. So the SRS is not
3786 		 * yet setup, and will be setup later after the
3787 		 * group has been reserved and populated.
3788 		 *
3789 		 * 2) From mac_release_tx_group() when freeing
3790 		 * a TX SRS.
3791 		 *
3792 		 * In both cases the SRS and its soft rings are
3793 		 * already quiesced.
3794 		 */
3795 		ASSERT(!driver_call);
3796 		group_type = mip->mi_tx_group_type;
3797 		cap_rings = &mip->mi_tx_rings_cap;
3798 		break;
3799 	default:
3800 		ASSERT(B_FALSE);
3801 	}
3802 
3803 	/*
3804 	 * Remove the ring from the group.
3805 	 */
3806 	if (ring == group->mrg_rings)
3807 		group->mrg_rings = ring->mr_next;
3808 	else {
3809 		mac_ring_t *pre;
3810 
3811 		pre = group->mrg_rings;
3812 		while (pre->mr_next != ring)
3813 			pre = pre->mr_next;
3814 		pre->mr_next = ring->mr_next;
3815 	}
3816 	group->mrg_cur_count--;
3817 
3818 	if (!driver_call) {
3819 		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
3820 		ASSERT(cap_rings->mr_gremring != NULL);
3821 
3822 		/*
3823 		 * Remove the driver level hardware ring.
3824 		 */
3825 		if (group->mrg_driver != NULL) {
3826 			cap_rings->mr_gremring(group->mrg_driver,
3827 			    ring->mr_driver, ring->mr_type);
3828 		}
3829 	}
3830 
3831 	ring->mr_gh = NULL;
3832 	if (driver_call) {
3833 		mac_ring_free(mip, ring);
3834 	} else {
3835 		ring->mr_state = MR_FREE;
3836 		ring->mr_flag = 0;
3837 	}
3838 }
3839 
3840 /*
3841  * Move a ring to the target group. If needed, remove the ring from the group
3842  * that it currently belongs to.
3843  *
3844  * The caller need to enter MAC's perimeter by calling mac_perim_enter().
3845  */
3846 static int
3847 mac_group_mov_ring(mac_impl_t *mip, mac_group_t *d_group, mac_ring_t *ring)
3848 {
3849 	mac_group_t *s_group = (mac_group_t *)ring->mr_gh;
3850 	int rv;
3851 
3852 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3853 	ASSERT(d_group != NULL);
3854 	ASSERT(s_group->mrg_mh == d_group->mrg_mh);
3855 
3856 	if (s_group == d_group)
3857 		return (0);
3858 
3859 	/*
3860 	 * Remove it from current group first.
3861 	 */
3862 	if (s_group != NULL)
3863 		i_mac_group_rem_ring(s_group, ring, B_FALSE);
3864 
3865 	/*
3866 	 * Add it to the new group.
3867 	 */
3868 	rv = i_mac_group_add_ring(d_group, ring, 0);
3869 	if (rv != 0) {
3870 		/*
3871 		 * Failed to add ring back to source group. If
3872 		 * that fails, the ring is stuck in limbo, log message.
3873 		 */
3874 		if (i_mac_group_add_ring(s_group, ring, 0)) {
3875 			cmn_err(CE_WARN, "%s: failed to move ring %p\n",
3876 			    mip->mi_name, (void *)ring);
3877 		}
3878 	}
3879 
3880 	return (rv);
3881 }
3882 
3883 /*
3884  * Find a MAC address according to its value.
3885  */
3886 mac_address_t *
3887 mac_find_macaddr(mac_impl_t *mip, uint8_t *mac_addr)
3888 {
3889 	mac_address_t *map;
3890 
3891 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3892 
3893 	for (map = mip->mi_addresses; map != NULL; map = map->ma_next) {
3894 		if (bcmp(mac_addr, map->ma_addr, map->ma_len) == 0)
3895 			break;
3896 	}
3897 
3898 	return (map);
3899 }
3900 
3901 /*
3902  * Check whether the MAC address is shared by multiple clients.
3903  */
3904 boolean_t
3905 mac_check_macaddr_shared(mac_address_t *map)
3906 {
3907 	ASSERT(MAC_PERIM_HELD((mac_handle_t)map->ma_mip));
3908 
3909 	return (map->ma_nusers > 1);
3910 }
3911 
3912 /*
3913  * Remove the specified MAC address from the MAC address list and free it.
3914  */
3915 static void
3916 mac_free_macaddr(mac_address_t *map)
3917 {
3918 	mac_impl_t *mip = map->ma_mip;
3919 
3920 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3921 	ASSERT(mip->mi_addresses != NULL);
3922 
3923 	map = mac_find_macaddr(mip, map->ma_addr);
3924 
3925 	ASSERT(map != NULL);
3926 	ASSERT(map->ma_nusers == 0);
3927 
3928 	if (map == mip->mi_addresses) {
3929 		mip->mi_addresses = map->ma_next;
3930 	} else {
3931 		mac_address_t *pre;
3932 
3933 		pre = mip->mi_addresses;
3934 		while (pre->ma_next != map)
3935 			pre = pre->ma_next;
3936 		pre->ma_next = map->ma_next;
3937 	}
3938 
3939 	kmem_free(map, sizeof (mac_address_t));
3940 }
3941 
3942 /*
3943  * Add a MAC address reference for a client. If the desired MAC address
3944  * exists, add a reference to it. Otherwise, add the new address by adding
3945  * it to a reserved group or setting promiscuous mode. Won't try different
3946  * group is the group is non-NULL, so the caller must explictly share
3947  * default group when needed.
3948  *
3949  * Note, the primary MAC address is initialized at registration time, so
3950  * to add it to default group only need to activate it if its reference
3951  * count is still zero. Also, some drivers may not have advertised RINGS
3952  * capability.
3953  */
3954 int
3955 mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr,
3956     boolean_t use_hw)
3957 {
3958 	mac_address_t *map;
3959 	int err = 0;
3960 	boolean_t allocated_map = B_FALSE;
3961 
3962 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3963 
3964 	map = mac_find_macaddr(mip, mac_addr);
3965 
3966 	/*
3967 	 * If the new MAC address has not been added. Allocate a new one
3968 	 * and set it up.
3969 	 */
3970 	if (map == NULL) {
3971 		map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
3972 		map->ma_len = mip->mi_type->mt_addr_length;
3973 		bcopy(mac_addr, map->ma_addr, map->ma_len);
3974 		map->ma_nusers = 0;
3975 		map->ma_group = group;
3976 		map->ma_mip = mip;
3977 
3978 		/* add the new MAC address to the head of the address list */
3979 		map->ma_next = mip->mi_addresses;
3980 		mip->mi_addresses = map;
3981 
3982 		allocated_map = B_TRUE;
3983 	}
3984 
3985 	ASSERT(map->ma_group == group);
3986 
3987 	/*
3988 	 * If the MAC address is already in use, simply account for the
3989 	 * new client.
3990 	 */
3991 	if (map->ma_nusers++ > 0)
3992 		return (0);
3993 
3994 	/*
3995 	 * Activate this MAC address by adding it to the reserved group.
3996 	 */
3997 	if (group != NULL) {
3998 		err = mac_group_addmac(group, (const uint8_t *)mac_addr);
3999 		if (err == 0) {
4000 			map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
4001 			return (0);
4002 		}
4003 	}
4004 
4005 	/*
4006 	 * The MAC address addition failed. If the client requires a
4007 	 * hardware classified MAC address, fail the operation.
4008 	 */
4009 	if (use_hw) {
4010 		err = ENOSPC;
4011 		goto bail;
4012 	}
4013 
4014 	/*
4015 	 * Try promiscuous mode.
4016 	 *
4017 	 * For drivers that don't advertise RINGS capability, do
4018 	 * nothing for the primary address.
4019 	 */
4020 	if ((group == NULL) &&
4021 	    (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0)) {
4022 		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
4023 		return (0);
4024 	}
4025 
4026 	/*
4027 	 * Enable promiscuous mode in order to receive traffic
4028 	 * to the new MAC address.
4029 	 */
4030 	if ((err = i_mac_promisc_set(mip, B_TRUE)) == 0) {
4031 		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_PROMISC;
4032 		return (0);
4033 	}
4034 
4035 	/*
4036 	 * Free the MAC address that could not be added. Don't free
4037 	 * a pre-existing address, it could have been the entry
4038 	 * for the primary MAC address which was pre-allocated by
4039 	 * mac_init_macaddr(), and which must remain on the list.
4040 	 */
4041 bail:
4042 	map->ma_nusers--;
4043 	if (allocated_map)
4044 		mac_free_macaddr(map);
4045 	return (err);
4046 }
4047 
4048 /*
4049  * Remove a reference to a MAC address. This may cause to remove the MAC
4050  * address from an associated group or to turn off promiscuous mode.
4051  * The caller needs to handle the failure properly.
4052  */
4053 int
4054 mac_remove_macaddr(mac_address_t *map)
4055 {
4056 	mac_impl_t *mip = map->ma_mip;
4057 	int err = 0;
4058 
4059 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4060 
4061 	ASSERT(map == mac_find_macaddr(mip, map->ma_addr));
4062 
4063 	/*
4064 	 * If it's not the last client using this MAC address, only update
4065 	 * the MAC clients count.
4066 	 */
4067 	if (--map->ma_nusers > 0)
4068 		return (0);
4069 
4070 	/*
4071 	 * The MAC address is no longer used by any MAC client, so remove
4072 	 * it from its associated group, or turn off promiscuous mode
4073 	 * if it was enabled for the MAC address.
4074 	 */
4075 	switch (map->ma_type) {
4076 	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
4077 		/*
4078 		 * Don't free the preset primary address for drivers that
4079 		 * don't advertise RINGS capability.
4080 		 */
4081 		if (map->ma_group == NULL)
4082 			return (0);
4083 
4084 		err = mac_group_remmac(map->ma_group, map->ma_addr);
4085 		break;
4086 	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
4087 		err = i_mac_promisc_set(mip, B_FALSE);
4088 		break;
4089 	default:
4090 		ASSERT(B_FALSE);
4091 	}
4092 
4093 	if (err != 0)
4094 		return (err);
4095 
4096 	/*
4097 	 * We created MAC address for the primary one at registration, so we
4098 	 * won't free it here. mac_fini_macaddr() will take care of it.
4099 	 */
4100 	if (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) != 0)
4101 		mac_free_macaddr(map);
4102 
4103 	return (0);
4104 }
4105 
4106 /*
4107  * Update an existing MAC address. The caller need to make sure that the new
4108  * value has not been used.
4109  */
4110 int
4111 mac_update_macaddr(mac_address_t *map, uint8_t *mac_addr)
4112 {
4113 	mac_impl_t *mip = map->ma_mip;
4114 	int err = 0;
4115 
4116 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4117 	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
4118 
4119 	switch (map->ma_type) {
4120 	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
4121 		/*
4122 		 * Update the primary address for drivers that are not
4123 		 * RINGS capable.
4124 		 */
4125 		if (map->ma_group == NULL) {
4126 			err = mip->mi_unicst(mip->mi_driver, (const uint8_t *)
4127 			    mac_addr);
4128 			if (err != 0)
4129 				return (err);
4130 			break;
4131 		}
4132 
4133 		/*
4134 		 * If this MAC address is not currently in use,
4135 		 * simply break out and update the value.
4136 		 */
4137 		if (map->ma_nusers == 0)
4138 			break;
4139 
4140 		/*
4141 		 * Need to replace the MAC address associated with a group.
4142 		 */
4143 		err = mac_group_remmac(map->ma_group, map->ma_addr);
4144 		if (err != 0)
4145 			return (err);
4146 
4147 		err = mac_group_addmac(map->ma_group, mac_addr);
4148 
4149 		/*
4150 		 * Failure hints hardware error. The MAC layer needs to
4151 		 * have error notification facility to handle this.
4152 		 * Now, simply try to restore the value.
4153 		 */
4154 		if (err != 0)
4155 			(void) mac_group_addmac(map->ma_group, map->ma_addr);
4156 
4157 		break;
4158 	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
4159 		/*
4160 		 * Need to do nothing more if in promiscuous mode.
4161 		 */
4162 		break;
4163 	default:
4164 		ASSERT(B_FALSE);
4165 	}
4166 
4167 	/*
4168 	 * Successfully replaced the MAC address.
4169 	 */
4170 	if (err == 0)
4171 		bcopy(mac_addr, map->ma_addr, map->ma_len);
4172 
4173 	return (err);
4174 }
4175 
4176 /*
4177  * Freshen the MAC address with new value. Its caller must have updated the
4178  * hardware MAC address before calling this function.
4179  * This funcitons is supposed to be used to handle the MAC address change
4180  * notification from underlying drivers.
4181  */
4182 void
4183 mac_freshen_macaddr(mac_address_t *map, uint8_t *mac_addr)
4184 {
4185 	mac_impl_t *mip = map->ma_mip;
4186 
4187 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4188 	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
4189 
4190 	/*
4191 	 * Freshen the MAC address with new value.
4192 	 */
4193 	bcopy(mac_addr, map->ma_addr, map->ma_len);
4194 	bcopy(mac_addr, mip->mi_addr, map->ma_len);
4195 
4196 	/*
4197 	 * Update all MAC clients that share this MAC address.
4198 	 */
4199 	mac_unicast_update_clients(mip, map);
4200 }
4201 
4202 /*
4203  * Set up the primary MAC address.
4204  */
4205 void
4206 mac_init_macaddr(mac_impl_t *mip)
4207 {
4208 	mac_address_t *map;
4209 
4210 	/*
4211 	 * The reference count is initialized to zero, until it's really
4212 	 * activated.
4213 	 */
4214 	map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
4215 	map->ma_len = mip->mi_type->mt_addr_length;
4216 	bcopy(mip->mi_addr, map->ma_addr, map->ma_len);
4217 
4218 	/*
4219 	 * If driver advertises RINGS capability, it shouldn't have initialized
4220 	 * its primary MAC address. For other drivers, including VNIC, the
4221 	 * primary address must work after registration.
4222 	 */
4223 	if (mip->mi_rx_groups == NULL)
4224 		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
4225 
4226 	/*
4227 	 * The primary MAC address is reserved for default group according
4228 	 * to current design.
4229 	 */
4230 	map->ma_group = mip->mi_rx_groups;
4231 	map->ma_mip = mip;
4232 
4233 	mip->mi_addresses = map;
4234 }
4235 
4236 /*
4237  * Clean up the primary MAC address. Note, only one primary MAC address
4238  * is allowed. All other MAC addresses must have been freed appropriately.
4239  */
4240 void
4241 mac_fini_macaddr(mac_impl_t *mip)
4242 {
4243 	mac_address_t *map = mip->mi_addresses;
4244 
4245 	if (map == NULL)
4246 		return;
4247 
4248 	/*
4249 	 * If mi_addresses is initialized, there should be exactly one
4250 	 * entry left on the list with no users.
4251 	 */
4252 	ASSERT(map->ma_nusers == 0);
4253 	ASSERT(map->ma_next == NULL);
4254 
4255 	kmem_free(map, sizeof (mac_address_t));
4256 	mip->mi_addresses = NULL;
4257 }
4258 
4259 /*
4260  * Logging related functions.
4261  */
4262 
4263 /* Write the Flow description to the log file */
4264 int
4265 mac_write_flow_desc(flow_entry_t *flent, mac_client_impl_t *mcip)
4266 {
4267 	flow_desc_t		*fdesc;
4268 	mac_resource_props_t	*mrp;
4269 	net_desc_t		ndesc;
4270 
4271 	bzero(&ndesc, sizeof (net_desc_t));
4272 
4273 	/*
4274 	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
4275 	 * Updates to the fe_flow_desc are done under the fe_lock
4276 	 */
4277 	mutex_enter(&flent->fe_lock);
4278 	fdesc = &flent->fe_flow_desc;
4279 	mrp = &flent->fe_resource_props;
4280 
4281 	ndesc.nd_name = flent->fe_flow_name;
4282 	ndesc.nd_devname = mcip->mci_name;
4283 	bcopy(fdesc->fd_src_mac, ndesc.nd_ehost, ETHERADDRL);
4284 	bcopy(fdesc->fd_dst_mac, ndesc.nd_edest, ETHERADDRL);
4285 	ndesc.nd_sap = htonl(fdesc->fd_sap);
4286 	ndesc.nd_isv4 = (uint8_t)fdesc->fd_ipversion == IPV4_VERSION;
4287 	ndesc.nd_bw_limit = mrp->mrp_maxbw;
4288 	if (ndesc.nd_isv4) {
4289 		ndesc.nd_saddr[3] = htonl(fdesc->fd_local_addr.s6_addr32[3]);
4290 		ndesc.nd_daddr[3] = htonl(fdesc->fd_remote_addr.s6_addr32[3]);
4291 	} else {
4292 		bcopy(&fdesc->fd_local_addr, ndesc.nd_saddr, IPV6_ADDR_LEN);
4293 		bcopy(&fdesc->fd_remote_addr, ndesc.nd_daddr, IPV6_ADDR_LEN);
4294 	}
4295 	ndesc.nd_sport = htons(fdesc->fd_local_port);
4296 	ndesc.nd_dport = htons(fdesc->fd_remote_port);
4297 	ndesc.nd_protocol = (uint8_t)fdesc->fd_protocol;
4298 	mutex_exit(&flent->fe_lock);
4299 
4300 	return (exacct_commit_netinfo((void *)&ndesc, EX_NET_FLDESC_REC));
4301 }
4302 
4303 /* Write the Flow statistics to the log file */
4304 int
4305 mac_write_flow_stats(flow_entry_t *flent)
4306 {
4307 	flow_stats_t	*fl_stats;
4308 	net_stat_t	nstat;
4309 
4310 	fl_stats = &flent->fe_flowstats;
4311 	nstat.ns_name = flent->fe_flow_name;
4312 	nstat.ns_ibytes = fl_stats->fs_rbytes;
4313 	nstat.ns_obytes = fl_stats->fs_obytes;
4314 	nstat.ns_ipackets = fl_stats->fs_ipackets;
4315 	nstat.ns_opackets = fl_stats->fs_opackets;
4316 	nstat.ns_ierrors = fl_stats->fs_ierrors;
4317 	nstat.ns_oerrors = fl_stats->fs_oerrors;
4318 
4319 	return (exacct_commit_netinfo((void *)&nstat, EX_NET_FLSTAT_REC));
4320 }
4321 
4322 /* Write the Link Description to the log file */
4323 int
4324 mac_write_link_desc(mac_client_impl_t *mcip)
4325 {
4326 	net_desc_t		ndesc;
4327 	flow_entry_t		*flent = mcip->mci_flent;
4328 
4329 	bzero(&ndesc, sizeof (net_desc_t));
4330 
4331 	ndesc.nd_name = mcip->mci_name;
4332 	ndesc.nd_devname = mcip->mci_name;
4333 	ndesc.nd_isv4 = B_TRUE;
4334 	/*
4335 	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
4336 	 * Updates to the fe_flow_desc are done under the fe_lock
4337 	 * after removing the flent from the flow table.
4338 	 */
4339 	mutex_enter(&flent->fe_lock);
4340 	bcopy(flent->fe_flow_desc.fd_src_mac, ndesc.nd_ehost, ETHERADDRL);
4341 	mutex_exit(&flent->fe_lock);
4342 
4343 	return (exacct_commit_netinfo((void *)&ndesc, EX_NET_LNDESC_REC));
4344 }
4345 
4346 /* Write the Link statistics to the log file */
4347 int
4348 mac_write_link_stats(mac_client_impl_t *mcip)
4349 {
4350 	net_stat_t	nstat;
4351 
4352 	nstat.ns_name = mcip->mci_name;
4353 	nstat.ns_ibytes = mcip->mci_stat_ibytes;
4354 	nstat.ns_obytes = mcip->mci_stat_obytes;
4355 	nstat.ns_ipackets = mcip->mci_stat_ipackets;
4356 	nstat.ns_opackets = mcip->mci_stat_opackets;
4357 	nstat.ns_ierrors = mcip->mci_stat_ierrors;
4358 	nstat.ns_oerrors = mcip->mci_stat_oerrors;
4359 
4360 	return (exacct_commit_netinfo((void *)&nstat, EX_NET_LNSTAT_REC));
4361 }
4362 
4363 /*
4364  * For a given flow, if the descrition has not been logged before, do it now.
4365  * If it is a VNIC, then we have collected information about it from the MAC
4366  * table, so skip it.
4367  */
4368 /*ARGSUSED*/
4369 static int
4370 mac_log_flowinfo(flow_entry_t *flent, void *args)
4371 {
4372 	mac_client_impl_t	*mcip = flent->fe_mcip;
4373 
4374 	if (mcip == NULL)
4375 		return (0);
4376 
4377 	/*
4378 	 * If the name starts with "vnic", and fe_user_generated is true (to
4379 	 * exclude the mcast and active flow entries created implicitly for
4380 	 * a vnic, it is a VNIC flow.  i.e. vnic1 is a vnic flow,
4381 	 * vnic/bge1/mcast1 is not and neither is vnic/bge1/active.
4382 	 */
4383 	if (strncasecmp(flent->fe_flow_name, "vnic", 4) == 0 &&
4384 	    (flent->fe_type & FLOW_USER) != 0) {
4385 		return (0);
4386 	}
4387 
4388 	if (!flent->fe_desc_logged) {
4389 		/*
4390 		 * We don't return error because we want to continu the
4391 		 * walk in case this is the last walk which means we
4392 		 * need to reset fe_desc_logged in all the flows.
4393 		 */
4394 		if (mac_write_flow_desc(flent, mcip) != 0)
4395 			return (0);
4396 		flent->fe_desc_logged = B_TRUE;
4397 	}
4398 
4399 	/*
4400 	 * Regardless of the error, we want to proceed in case we have to
4401 	 * reset fe_desc_logged.
4402 	 */
4403 	(void) mac_write_flow_stats(flent);
4404 
4405 	if (mcip != NULL && !(mcip->mci_state_flags & MCIS_DESC_LOGGED))
4406 		flent->fe_desc_logged = B_FALSE;
4407 
4408 	return (0);
4409 }
4410 
/*
 * Argument passed to i_mac_log_walker() for one walk of the MAC hash.
 */
typedef struct i_mac_log_state_s {
	/* B_TRUE for the final walk done when logging is being stopped */
	boolean_t	mi_last;
	/* copy of mac_flow_log_enable taken by the caller of the walk */
	int		mi_fenable;
	/* copy of mac_link_log_enable taken by the caller of the walk */
	int		mi_lenable;
} i_mac_log_state_t;
4416 
4417 /*
4418  * Walk the mac_impl_ts and log the description for each mac client of this mac,
4419  * if it hasn't already been done. Additionally, log statistics for the link as
4420  * well. Walk the flow table and log information for each flow as well.
4421  * If it is the last walk (mci_last), then we turn off mci_desc_logged (and
4422  * also fe_desc_logged, if flow logging is on) since we want to log the
4423  * description if and when logging is restarted.
4424  */
4425 /*ARGSUSED*/
4426 static uint_t
4427 i_mac_log_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
4428 {
4429 	mac_impl_t		*mip = (mac_impl_t *)val;
4430 	i_mac_log_state_t	*lstate = (i_mac_log_state_t *)arg;
4431 	int			ret;
4432 	mac_client_impl_t	*mcip;
4433 
4434 	/*
4435 	 * Only walk the client list for NIC and etherstub
4436 	 */
4437 	if ((mip->mi_state_flags & MIS_DISABLED) ||
4438 	    ((mip->mi_state_flags & MIS_IS_VNIC) &&
4439 	    (mac_get_lower_mac_handle((mac_handle_t)mip) != NULL)))
4440 		return (MH_WALK_CONTINUE);
4441 
4442 	for (mcip = mip->mi_clients_list; mcip != NULL;
4443 	    mcip = mcip->mci_client_next) {
4444 		if (!MCIP_DATAPATH_SETUP(mcip))
4445 			continue;
4446 		if (lstate->mi_lenable) {
4447 			if (!(mcip->mci_state_flags & MCIS_DESC_LOGGED)) {
4448 				ret = mac_write_link_desc(mcip);
4449 				if (ret != 0) {
4450 				/*
4451 				 * We can't terminate it if this is the last
4452 				 * walk, else there might be some links with
4453 				 * mi_desc_logged set to true, which means
4454 				 * their description won't be logged the next
4455 				 * time logging is started (similarly for the
4456 				 * flows within such links). We can continue
4457 				 * without walking the flow table (i.e. to
4458 				 * set fe_desc_logged to false) because we
4459 				 * won't have written any flow stuff for this
4460 				 * link as we haven't logged the link itself.
4461 				 */
4462 					if (lstate->mi_last)
4463 						return (MH_WALK_CONTINUE);
4464 					else
4465 						return (MH_WALK_TERMINATE);
4466 				}
4467 				mcip->mci_state_flags |= MCIS_DESC_LOGGED;
4468 			}
4469 		}
4470 
4471 		if (mac_write_link_stats(mcip) != 0 && !lstate->mi_last)
4472 			return (MH_WALK_TERMINATE);
4473 
4474 		if (lstate->mi_last)
4475 			mcip->mci_state_flags &= ~MCIS_DESC_LOGGED;
4476 
4477 		if (lstate->mi_fenable) {
4478 			if (mcip->mci_subflow_tab != NULL) {
4479 				(void) mac_flow_walk(mcip->mci_subflow_tab,
4480 				    mac_log_flowinfo, mip);
4481 			}
4482 		}
4483 	}
4484 	return (MH_WALK_CONTINUE);
4485 }
4486 
4487 /*
4488  * The timer thread that runs every mac_logging_interval seconds and logs
4489  * link and/or flow information.
4490  */
4491 /* ARGSUSED */
4492 void
4493 mac_log_linkinfo(void *arg)
4494 {
4495 	i_mac_log_state_t	lstate;
4496 
4497 	rw_enter(&i_mac_impl_lock, RW_READER);
4498 	if (!mac_flow_log_enable && !mac_link_log_enable) {
4499 		rw_exit(&i_mac_impl_lock);
4500 		return;
4501 	}
4502 	lstate.mi_fenable = mac_flow_log_enable;
4503 	lstate.mi_lenable = mac_link_log_enable;
4504 	lstate.mi_last = B_FALSE;
4505 	rw_exit(&i_mac_impl_lock);
4506 
4507 	mod_hash_walk(i_mac_impl_hash, i_mac_log_walker, &lstate);
4508 
4509 	rw_enter(&i_mac_impl_lock, RW_WRITER);
4510 	if (mac_flow_log_enable || mac_link_log_enable) {
4511 		mac_logging_timer = timeout(mac_log_linkinfo, NULL,
4512 		    SEC_TO_TICK(mac_logging_interval));
4513 	}
4514 	rw_exit(&i_mac_impl_lock);
4515 }
4516 
/*
 * Argument passed to i_mac_fastpath_disable_walker().
 */
typedef struct i_mac_fastpath_state_s {
	/* B_TRUE to disable the fastpath on each MAC, B_FALSE to enable it */
	boolean_t	mf_disable;
	/* result of the last mac_fastpath_disable() call made by the walker */
	int		mf_err;
} i_mac_fastpath_state_t;
4521 
4522 /*ARGSUSED*/
4523 static uint_t
4524 i_mac_fastpath_disable_walker(mod_hash_key_t key, mod_hash_val_t *val,
4525     void *arg)
4526 {
4527 	i_mac_fastpath_state_t	*state = arg;
4528 	mac_handle_t		mh = (mac_handle_t)val;
4529 
4530 	if (state->mf_disable)
4531 		state->mf_err = mac_fastpath_disable(mh);
4532 	else
4533 		mac_fastpath_enable(mh);
4534 
4535 	return (state->mf_err == 0 ? MH_WALK_CONTINUE : MH_WALK_TERMINATE);
4536 }
4537 
4538 /*
4539  * Start the logging timer.
4540  */
4541 int
4542 mac_start_logusage(mac_logtype_t type, uint_t interval)
4543 {
4544 	i_mac_fastpath_state_t state = {B_TRUE, 0};
4545 	int err;
4546 
4547 	rw_enter(&i_mac_impl_lock, RW_WRITER);
4548 	switch (type) {
4549 	case MAC_LOGTYPE_FLOW:
4550 		if (mac_flow_log_enable) {
4551 			rw_exit(&i_mac_impl_lock);
4552 			return (0);
4553 		}
4554 		/* FALLTHRU */
4555 	case MAC_LOGTYPE_LINK:
4556 		if (mac_link_log_enable) {
4557 			rw_exit(&i_mac_impl_lock);
4558 			return (0);
4559 		}
4560 		break;
4561 	default:
4562 		ASSERT(0);
4563 	}
4564 
4565 	/* Disable fastpath */
4566 	mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_disable_walker, &state);
4567 	if ((err = state.mf_err) != 0) {
4568 		/* Reenable fastpath  */
4569 		state.mf_disable = B_FALSE;
4570 		state.mf_err = 0;
4571 		mod_hash_walk(i_mac_impl_hash,
4572 		    i_mac_fastpath_disable_walker, &state);
4573 		rw_exit(&i_mac_impl_lock);
4574 		return (err);
4575 	}
4576 
4577 	switch (type) {
4578 	case MAC_LOGTYPE_FLOW:
4579 		mac_flow_log_enable = B_TRUE;
4580 		/* FALLTHRU */
4581 	case MAC_LOGTYPE_LINK:
4582 		mac_link_log_enable = B_TRUE;
4583 		break;
4584 	}
4585 
4586 	mac_logging_interval = interval;
4587 	rw_exit(&i_mac_impl_lock);
4588 	mac_log_linkinfo(NULL);
4589 	return (0);
4590 }
4591 
/*
 * Stop logging of the given type. When logging actually gets turned off,
 * the fastpath is re-enabled on all MACs, the logging timer is cancelled
 * and one last walk is done to log the final values.
 *
 * MAC_LOGTYPE_FLOW with flow logging on turns off both flow and link
 * logging. Otherwise (MAC_LOGTYPE_LINK, or MAC_LOGTYPE_FLOW with flow
 * logging off) link logging is turned off, unless it is already off or
 * flow logging is on, in which case nothing is done.
 */
void
mac_stop_logusage(mac_logtype_t type)
{
	i_mac_log_state_t	lstate;
	i_mac_fastpath_state_t	state = {B_FALSE, 0};

	rw_enter(&i_mac_impl_lock, RW_WRITER);
	/* Remember what was enabled, the last walk below logs exactly that */
	lstate.mi_fenable = mac_flow_log_enable;
	lstate.mi_lenable = mac_link_log_enable;

	/* Last walk */
	lstate.mi_last = B_TRUE;

	switch (type) {
	case MAC_LOGTYPE_FLOW:
		if (lstate.mi_fenable) {
			ASSERT(mac_link_log_enable);
			mac_flow_log_enable = B_FALSE;
			mac_link_log_enable = B_FALSE;
			break;
		}
		/* FALLTHRU */
	case MAC_LOGTYPE_LINK:
		/* Nothing to stop, or link logging still needed by flows */
		if (!lstate.mi_lenable || mac_flow_log_enable) {
			rw_exit(&i_mac_impl_lock);
			return;
		}
		mac_link_log_enable = B_FALSE;
		break;
	default:
		ASSERT(0);
	}

	/* Reenable fastpath */
	mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_disable_walker, &state);

	/* The timer is cancelled only after i_mac_impl_lock is dropped */
	rw_exit(&i_mac_impl_lock);
	(void) untimeout(mac_logging_timer);
	mac_logging_timer = 0;

	/* Last walk */
	mod_hash_walk(i_mac_impl_hash, i_mac_log_walker, &lstate);
}
4638 
4639 /*
4640  * Walk the rx and tx SRS/SRs for a flow and update the priority value.
4641  */
4642 void
4643 mac_flow_update_priority(mac_client_impl_t *mcip, flow_entry_t *flent)
4644 {
4645 	pri_t			pri;
4646 	int			count;
4647 	mac_soft_ring_set_t	*mac_srs;
4648 
4649 	if (flent->fe_rx_srs_cnt <= 0)
4650 		return;
4651 
4652 	if (((mac_soft_ring_set_t *)flent->fe_rx_srs[0])->srs_type ==
4653 	    SRST_FLOW) {
4654 		pri = FLOW_PRIORITY(mcip->mci_min_pri,
4655 		    mcip->mci_max_pri,
4656 		    flent->fe_resource_props.mrp_priority);
4657 	} else {
4658 		pri = mcip->mci_max_pri;
4659 	}
4660 
4661 	for (count = 0; count < flent->fe_rx_srs_cnt; count++) {
4662 		mac_srs = flent->fe_rx_srs[count];
4663 		mac_update_srs_priority(mac_srs, pri);
4664 	}
4665 	/*
4666 	 * If we have a Tx SRS, we need to modify all the threads associated
4667 	 * with it.
4668 	 */
4669 	if (flent->fe_tx_srs != NULL)
4670 		mac_update_srs_priority(flent->fe_tx_srs, pri);
4671 }
4672 
4673 /*
4674  * RX and TX rings are reserved according to different semantics depending
4675  * on the requests from the MAC clients and type of rings:
4676  *
4677  * On the Tx side, by default we reserve individual rings, independently from
4678  * the groups.
4679  *
4680  * On the Rx side, the reservation is at the granularity of the group
4681  * of rings, and used for v12n level 1 only. It has a special case for the
4682  * primary client.
4683  *
4684  * If a share is allocated to a MAC client, we allocate a TX group and an
4685  * RX group to the client, and assign TX rings and RX rings to these
4686  * groups according to information gathered from the driver through
4687  * the share capability.
4688  *
 * The foreseeable evolution of Rx rings will handle v12n level 2 and higher
4690  * to allocate individual rings out of a group and program the hw classifier
4691  * based on IP address or higher level criteria.
4692  */
4693 
/*
 * mac_reserve_tx_ring()
 * Reserve an unused ring by marking it with MR_INUSE state.
 * As reserved, the ring is ready to function.
 *
 * Returns the reserved ring, or NULL if the MAC has no TX groups, if no
 * suitable ring was found, or if the ring could not be started.
 *
 * Notes for Hybrid I/O:
 *
 * If a specific ring is needed, it is specified through the desired_ring
 * argument. Otherwise that argument is set to NULL.
 * If the desired ring was previously allocated to another client, this
 * function swaps it with a new ring from the group of unassigned rings
 * (obtained by calling itself recursively with a NULL desired_ring).
 */
mac_ring_t *
mac_reserve_tx_ring(mac_impl_t *mip, mac_ring_t *desired_ring)
{
	mac_group_t *group;
	mac_ring_t *ring;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	if (mip->mi_tx_groups == NULL)
		return (NULL);

	/*
	 * Find an available ring and start it before changing its status.
	 * The unassigned rings are at the end of the mi_tx_groups
	 * array.
	 */
	group = mip->mi_tx_groups + mip->mi_tx_group_count;

	/*
	 * On exit from this loop, ring is either the ring to start and
	 * return, or NULL if the end of the list was reached.
	 */
	for (ring = group->mrg_rings; ring != NULL;
	    ring = ring->mr_next) {
		if (desired_ring == NULL) {
			if (ring->mr_state == MR_FREE)
				/* wanted any free ring and found one */
				break;
		} else {
			mac_ring_t *sring;
			mac_client_impl_t *client;
			mac_soft_ring_set_t *srs;

			if (ring != desired_ring)
				/* wants a desired ring but this one ain't it */
				continue;

			if (ring->mr_state == MR_FREE)
				break;

			/*
			 * Found the desired ring but it's already in use.
			 * Swap it with a new ring.
			 */

			/* find the client which owns that ring */
			for (client = mip->mi_clients_list; client != NULL;
			    client = client->mci_client_next) {
				srs = MCIP_TX_SRS(client);
				if (srs != NULL && mac_tx_srs_ring_present(srs,
				    desired_ring)) {
					/* found our ring */
					break;
				}
			}
			if (client == NULL) {
				/*
				 * The TX ring is in use, but it's not
				 * associated with any clients, so it
				 * has to be the default ring. In that
				 * case we can simply assign a new ring
				 * as the default ring, and we're done.
				 */
				ASSERT(mip->mi_default_tx_ring ==
				    (mac_ring_handle_t)desired_ring);

				/*
				 * Quiesce all clients on top of
				 * the NIC to make sure there are no
				 * pending threads still relying on
				 * that default ring, for example
				 * the multicast path.
				 */
				for (client = mip->mi_clients_list;
				    client != NULL;
				    client = client->mci_client_next) {
					mac_tx_client_quiesce(client,
					    SRS_QUIESCE);
				}

				/*
				 * NOTE(review): the result is not checked,
				 * so the default ring becomes NULL if no
				 * free ring could be reserved.
				 */
				mip->mi_default_tx_ring = (mac_ring_handle_t)
				    mac_reserve_tx_ring(mip, NULL);

				/* resume the clients */
				for (client = mip->mi_clients_list;
				    client != NULL;
				    client = client->mci_client_next)
					mac_tx_client_restart(client);

				break;
			}

			/*
			 * Note that we cannot simply invoke the group
			 * add/rem routines since the client doesn't have a
			 * TX group. So we need to instead add/remove
			 * the rings from the SRS.
			 */
			ASSERT(client->mci_share == NULL);

			/* first quiesce the client */
			mac_tx_client_quiesce(client, SRS_QUIESCE);

			/*
			 * Give a new ring to the client. If sring is NULL
			 * there are no other available rings on that MAC
			 * instance; nothing is added then and the client
			 * will fall back to the shared TX ring.
			 */
			sring = mac_reserve_tx_ring(mip, NULL);
			if (sring != NULL) {
				mac_tx_srs_add_ring(srs, sring);
			}

			/* ... in exchange for our desired ring */
			mac_tx_srs_del_ring(srs, desired_ring);

			/* restart the client */
			mac_tx_client_restart(client);

			if (mip->mi_default_tx_ring ==
			    (mac_ring_handle_t)desired_ring) {
				/*
				 * The desired ring is the default ring,
				 * and there are one or more clients
				 * using that default ring directly.
				 */
				mip->mi_default_tx_ring =
				    (mac_ring_handle_t)sring;
				/*
				 * Find clients using default ring and
				 * swap it with the new default ring.
				 */
				for (client = mip->mi_clients_list;
				    client != NULL;
				    client = client->mci_client_next) {
					srs = MCIP_TX_SRS(client);
					if (srs != NULL &&
					    mac_tx_srs_ring_present(srs,
					    desired_ring)) {
						/* first quiesce the client */
						mac_tx_client_quiesce(client,
						    SRS_QUIESCE);

						/*
						 * Give it the new default
						 * ring, and remove the old
						 * one.
						 */
						if (sring != NULL) {
							mac_tx_srs_add_ring(srs,
							    sring);
						}
						mac_tx_srs_del_ring(srs,
						    desired_ring);

						/* restart the client */
						mac_tx_client_restart(client);
					}
				}
			}
			break;
		}
	}

	/* Start the chosen ring; it is marked in use only if that works */
	if (ring != NULL) {
		if (mac_start_ring(ring) != 0)
			return (NULL);
		ring->mr_state = MR_INUSE;
	}

	return (ring);
}
4876 
/*
 * Minimum number of rings to leave in the default RX group when allocating
 * rings to new clients.
 */
static uint_t mac_min_rx_default_rings = 1;
4882 
4883 /*
4884  * Populate a zero-ring group with rings. If the share is non-NULL,
4885  * the rings are chosen according to that share.
4886  * Invoked after allocating a new RX or TX group through
4887  * mac_reserve_rx_group() or mac_reserve_tx_group(), respectively.
4888  * Returns zero on success, an errno otherwise.
4889  */
4890 int
4891 i_mac_group_allocate_rings(mac_impl_t *mip, mac_ring_type_t ring_type,
4892     mac_group_t *src_group, mac_group_t *new_group, mac_share_handle_t share)
4893 {
4894 	mac_ring_t **rings, *tmp_ring[1], *ring;
4895 	uint_t nrings;
4896 	int rv, i, j;
4897 
4898 	ASSERT(mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC &&
4899 	    mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC);
4900 	ASSERT(new_group->mrg_cur_count == 0);
4901 
4902 	/*
4903 	 * First find the rings to allocate to the group.
4904 	 */
4905 	if (share != NULL) {
4906 		/* get rings through ms_squery() */
4907 		mip->mi_share_capab.ms_squery(share, ring_type, NULL, &nrings);
4908 		ASSERT(nrings != 0);
4909 		rings = kmem_alloc(nrings * sizeof (mac_ring_handle_t),
4910 		    KM_SLEEP);
4911 		mip->mi_share_capab.ms_squery(share, ring_type,
4912 		    (mac_ring_handle_t *)rings, &nrings);
4913 	} else {
4914 		/* this function is called for TX only with a share */
4915 		ASSERT(ring_type == MAC_RING_TYPE_RX);
4916 		/*
4917 		 * Pick one ring from default group.
4918 		 *
4919 		 * for now pick the second ring which requires the first ring
4920 		 * at index 0 to stay in the default group, since it is the
4921 		 * ring which carries the multicast traffic.
4922 		 * We need a better way for a driver to indicate this,
4923 		 * for example a per-ring flag.
4924 		 */
4925 		for (ring = src_group->mrg_rings; ring != NULL;
4926 		    ring = ring->mr_next) {
4927 			if (ring->mr_index != 0)
4928 				break;
4929 		}
4930 		ASSERT(ring != NULL);
4931 		nrings = 1;
4932 		tmp_ring[0] = ring;
4933 		rings = tmp_ring;
4934 	}
4935 
4936 	switch (ring_type) {
4937 	case MAC_RING_TYPE_RX:
4938 		if (src_group->mrg_cur_count - nrings <
4939 		    mac_min_rx_default_rings) {
4940 			/* we ran out of rings */
4941 			return (ENOSPC);
4942 		}
4943 
4944 		/* move receive rings to new group */
4945 		for (i = 0; i < nrings; i++) {
4946 			rv = mac_group_mov_ring(mip, new_group, rings[i]);
4947 			if (rv != 0) {
4948 				/* move rings back on failure */
4949 				for (j = 0; j < i; j++) {
4950 					(void) mac_group_mov_ring(mip,
4951 					    src_group, rings[j]);
4952 				}
4953 				return (rv);
4954 			}
4955 		}
4956 		break;
4957 
4958 	case MAC_RING_TYPE_TX: {
4959 		mac_ring_t *tmp_ring;
4960 
4961 		/* move the TX rings to the new group */
4962 		ASSERT(src_group == NULL);
4963 		for (i = 0; i < nrings; i++) {
4964 			/* get the desired ring */
4965 			tmp_ring = mac_reserve_tx_ring(mip, rings[i]);
4966 			ASSERT(tmp_ring == rings[i]);
4967 			rv = mac_group_mov_ring(mip, new_group, rings[i]);
4968 			if (rv != 0) {
4969 				/* cleanup on failure */
4970 				for (j = 0; j < i; j++) {
4971 					(void) mac_group_mov_ring(mip,
4972 					    mip->mi_tx_groups +
4973 					    mip->mi_tx_group_count, rings[j]);
4974 				}
4975 			}
4976 		}
4977 		break;
4978 	}
4979 	}
4980 
4981 	if (share != NULL) {
4982 		/* add group to share */
4983 		mip->mi_share_capab.ms_sadd(share, new_group->mrg_driver);
4984 		/* free temporary array of rings */
4985 		kmem_free(rings, nrings * sizeof (mac_ring_handle_t));
4986 	}
4987 
4988 	return (0);
4989 }
4990 
4991 void
4992 mac_rx_group_add_client(mac_group_t *grp, mac_client_impl_t *mcip)
4993 {
4994 	mac_grp_client_t *mgcp;
4995 
4996 	for (mgcp = grp->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
4997 		if (mgcp->mgc_client == mcip)
4998 			break;
4999 	}
5000 
5001 	VERIFY(mgcp == NULL);
5002 
5003 	mgcp = kmem_zalloc(sizeof (mac_grp_client_t), KM_SLEEP);
5004 	mgcp->mgc_client = mcip;
5005 	mgcp->mgc_next = grp->mrg_clients;
5006 	grp->mrg_clients = mgcp;
5007 
5008 }
5009 
5010 void
5011 mac_rx_group_remove_client(mac_group_t *grp, mac_client_impl_t *mcip)
5012 {
5013 	mac_grp_client_t *mgcp, **pprev;
5014 
5015 	for (pprev = &grp->mrg_clients, mgcp = *pprev; mgcp != NULL;
5016 	    pprev = &mgcp->mgc_next, mgcp = *pprev) {
5017 		if (mgcp->mgc_client == mcip)
5018 			break;
5019 	}
5020 
5021 	ASSERT(mgcp != NULL);
5022 
5023 	*pprev = mgcp->mgc_next;
5024 	kmem_free(mgcp, sizeof (mac_grp_client_t));
5025 }
5026 
/*
 * mac_reserve_rx_group()
 *
 * Finds an available group and exclusively reserves it for a client.
 * The group is chosen to suit the flow's resource controls (bandwidth and
 * fanout requirements) and the address type.
 * If the requestor is the primary MAC then return the group with the
 * largest number of rings, otherwise the default ring when available.
 *
 * If the MAC address is already known to this MAC, the group recorded for
 * that address is returned right away. Returns NULL if the MAC has no RX
 * groups, if rtype is MAC_RX_NO_RESERVE, or if no group could be reserved.
 */
mac_group_t *
mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr,
    mac_rx_group_reserve_type_t rtype)
{
	mac_share_handle_t	share = mcip->mci_share;
	mac_impl_t		*mip = mcip->mci_mip;
	mac_group_t		*grp = NULL;
	int			i, start, loopcount;
	int			err;
	mac_address_t		*map;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	/* Check if a group already has this mac address (case of VLANs) */
	if ((map = mac_find_macaddr(mip, mac_addr)) != NULL)
		return (map->ma_group);

	if (mip->mi_rx_groups == NULL || mip->mi_rx_group_count == 0 ||
	    rtype == MAC_RX_NO_RESERVE)
		return (NULL);

	/*
	 * Try to exclusively reserve a RX group.
	 *
	 * For flows requires SW_RING it always goes to the default group
	 * (Until we can explicitly call out default groups (CR 6695600),
	 * we assume that the default group is always at position zero);
	 *
	 * For flows requires HW_DEFAULT_RING (unicast flow of the primary
	 * client), try to reserve the default RX group only.
	 *
	 * For flows requires HW_RING (unicast flow of other clients), try
	 * to reserve non-default RX group then the default group.
	 */
	switch (rtype) {
	case MAC_RX_RESERVE_DEFAULT:
		/* only look at the group at position zero */
		start = 0;
		loopcount = 1;
		break;
	case MAC_RX_RESERVE_NONDEFAULT:
		/*
		 * Start at position one and look at every group; the index
		 * wraps around below so that position zero comes last.
		 */
		start = 1;
		loopcount = mip->mi_rx_group_count;
	}

	for (i = start; i < start + loopcount; i++) {
		grp = &mip->mi_rx_groups[i % mip->mi_rx_group_count];

		DTRACE_PROBE3(rx__group__trying, char *, mip->mi_name,
		    int, grp->mrg_index, mac_group_state_t, grp->mrg_state);

		/*
		 * Check to see whether this mac client is the only client
		 * on this RX group. If not, we cannot exclusively reserve
		 * this RX group.
		 */
		if (!MAC_RX_GROUP_NO_CLIENT(grp) &&
		    (MAC_RX_GROUP_ONLY_CLIENT(grp) != mcip)) {
			continue;
		}

		/*
		 * This group could already be SHARED by other multicast
		 * flows on this client. In that case, the group would
		 * be shared and has already been started.
		 */
		ASSERT(grp->mrg_state != MAC_GROUP_STATE_UNINIT);

		/* A group that is only registered has to be started first */
		if ((grp->mrg_state == MAC_GROUP_STATE_REGISTERED) &&
		    (mac_start_group(grp) != 0)) {
			continue;
		}

		/*
		 * The group at position zero, and any group of a MAC whose
		 * RX groups are not dynamic, is used as is: no rings need
		 * to be moved into it.
		 */
		if ((i % mip->mi_rx_group_count) == 0 ||
		    mip->mi_rx_group_type != MAC_GROUP_TYPE_DYNAMIC) {
			break;
		}

		ASSERT(grp->mrg_cur_count == 0);

		/*
		 * Populate the group. Rings should be taken
		 * from the default group at position 0 for now.
		 */

		err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
		    &mip->mi_rx_groups[0], grp, share);
		if (err == 0)
			break;

		DTRACE_PROBE3(rx__group__reserve__alloc__rings, char *,
		    mip->mi_name, int, grp->mrg_index, int, err);

		/*
		 * It's a dynamic group but the grouping operation failed.
		 */
		mac_stop_group(grp);
	}

	/* The loop ran to completion: no group could be reserved */
	if (i == start + loopcount)
		return (NULL);

	ASSERT(grp != NULL);

	DTRACE_PROBE2(rx__group__reserved,
	    char *, mip->mi_name, int, grp->mrg_index);
	return (grp);
}
5143 
5144 /*
5145  * mac_rx_release_group()
5146  *
5147  * This is called when there are no clients left for the group.
5148  * The group is stopped and marked MAC_GROUP_STATE_REGISTERED,
5149  * and if it is a non default group, the shares are removed and
5150  * all rings are assigned back to default group.
5151  */
5152 void
5153 mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group)
5154 {
5155 	mac_impl_t	*mip = mcip->mci_mip;
5156 	mac_ring_t	*ring;
5157 
5158 	ASSERT(group != &mip->mi_rx_groups[0]);
5159 
5160 	/*
5161 	 * This is the case where there are no clients left. Any
5162 	 * SRS etc on this group have also be quiesced.
5163 	 */
5164 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
5165 		if (ring->mr_classify_type == MAC_HW_CLASSIFIER) {
5166 			ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
5167 			/*
5168 			 * Remove the SRS associated with the HW ring.
5169 			 * As a result, polling will be disabled.
5170 			 */
5171 			ring->mr_srs = NULL;
5172 		}
5173 		ASSERT(ring->mr_state == MR_INUSE);
5174 		mac_stop_ring(ring);
5175 		ring->mr_state = MR_FREE;
5176 		ring->mr_flag = 0;
5177 	}
5178 
5179 	/* remove group from share */
5180 	if (mcip->mci_share != NULL) {
5181 		mip->mi_share_capab.ms_sremove(mcip->mci_share,
5182 		    group->mrg_driver);
5183 	}
5184 
5185 	if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
5186 		mac_ring_t *ring;
5187 
5188 		/*
5189 		 * Rings were dynamically allocated to group.
5190 		 * Move rings back to default group.
5191 		 */
5192 		while ((ring = group->mrg_rings) != NULL) {
5193 			(void) mac_group_mov_ring(mip,
5194 			    &mip->mi_rx_groups[0], ring);
5195 		}
5196 	}
5197 	mac_stop_group(group);
5198 	/*
5199 	 * Possible improvement: See if we can assign the group just released
5200 	 * to a another client of the mip
5201 	 */
5202 }
5203 
5204 /*
5205  * Reserves a TX group for the specified share. Invoked by mac_tx_srs_setup()
5206  * when a share was allocated to the client.
5207  */
5208 mac_group_t *
5209 mac_reserve_tx_group(mac_impl_t *mip, mac_share_handle_t share)
5210 {
5211 	mac_group_t *grp;
5212 	int rv, i;
5213 
5214 	/*
5215 	 * TX groups are currently allocated only to MAC clients
5216 	 * which are associated with a share. Since we have a fixed
5217 	 * number of share and groups, and we already successfully
5218 	 * allocated a share, find an available TX group.
5219 	 */
5220 	ASSERT(share != NULL);
5221 	ASSERT(mip->mi_tx_group_free > 0);
5222 
5223 	for (i = 0; i <  mip->mi_tx_group_count; i++) {
5224 		grp = &mip->mi_tx_groups[i];
5225 
5226 		if ((grp->mrg_state == MAC_GROUP_STATE_RESERVED) ||
5227 		    (grp->mrg_state == MAC_GROUP_STATE_UNINIT))
5228 			continue;
5229 
5230 		rv = mac_start_group(grp);
5231 		ASSERT(rv == 0);
5232 
5233 		grp->mrg_state = MAC_GROUP_STATE_RESERVED;
5234 		break;
5235 	}
5236 
5237 	ASSERT(grp != NULL);
5238 
5239 	/*
5240 	 * Populate the group. Rings should be taken from the group
5241 	 * of unassigned rings, which is past the array of TX
5242 	 * groups adversized by the driver.
5243 	 */
5244 	rv = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX, NULL,
5245 	    grp, share);
5246 	if (rv != 0) {
5247 		DTRACE_PROBE3(tx__group__reserve__alloc__rings,
5248 		    char *, mip->mi_name, int, grp->mrg_index, int, rv);
5249 
5250 		mac_stop_group(grp);
5251 		grp->mrg_state = MAC_GROUP_STATE_UNINIT;
5252 
5253 		return (NULL);
5254 	}
5255 
5256 	mip->mi_tx_group_free--;
5257 
5258 	return (grp);
5259 }
5260 
5261 void
5262 mac_release_tx_group(mac_impl_t *mip, mac_group_t *grp)
5263 {
5264 	mac_client_impl_t *mcip = grp->mrg_tx_client;
5265 	mac_share_handle_t share = mcip->mci_share;
5266 	mac_ring_t *ring;
5267 
5268 	ASSERT(mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC);
5269 	ASSERT(share != NULL);
5270 	ASSERT(grp->mrg_state == MAC_GROUP_STATE_RESERVED);
5271 
5272 	mip->mi_share_capab.ms_sremove(share, grp->mrg_driver);
5273 	while ((ring = grp->mrg_rings) != NULL) {
5274 		/* move the ring back to the pool */
5275 		(void) mac_group_mov_ring(mip, mip->mi_tx_groups +
5276 		    mip->mi_tx_group_count, ring);
5277 	}
5278 	mac_stop_group(grp);
5279 	mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);
5280 	grp->mrg_tx_client = NULL;
5281 	mip->mi_tx_group_free++;
5282 }
5283 
5284 /*
5285  * This is a 1-time control path activity initiated by the client (IP).
5286  * The mac perimeter protects against other simultaneous control activities,
5287  * for example an ioctl that attempts to change the degree of fanout and
5288  * increase or decrease the number of softrings associated with this Tx SRS.
5289  */
5290 static mac_tx_notify_cb_t *
5291 mac_client_tx_notify_add(mac_client_impl_t *mcip,
5292     mac_tx_notify_t notify, void *arg)
5293 {
5294 	mac_cb_info_t *mcbi;
5295 	mac_tx_notify_cb_t *mtnfp;
5296 
5297 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
5298 
5299 	mtnfp = kmem_zalloc(sizeof (mac_tx_notify_cb_t), KM_SLEEP);
5300 	mtnfp->mtnf_fn = notify;
5301 	mtnfp->mtnf_arg = arg;
5302 	mtnfp->mtnf_link.mcb_objp = mtnfp;
5303 	mtnfp->mtnf_link.mcb_objsize = sizeof (mac_tx_notify_cb_t);
5304 	mtnfp->mtnf_link.mcb_flags = MCB_TX_NOTIFY_CB_T;
5305 
5306 	mcbi = &mcip->mci_tx_notify_cb_info;
5307 	mutex_enter(mcbi->mcbi_lockp);
5308 	mac_callback_add(mcbi, &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link);
5309 	mutex_exit(mcbi->mcbi_lockp);
5310 	return (mtnfp);
5311 }
5312 
5313 static void
5314 mac_client_tx_notify_remove(mac_client_impl_t *mcip, mac_tx_notify_cb_t *mtnfp)
5315 {
5316 	mac_cb_info_t	*mcbi;
5317 	mac_cb_t	**cblist;
5318 
5319 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
5320 
5321 	if (!mac_callback_find(&mcip->mci_tx_notify_cb_info,
5322 	    &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link)) {
5323 		cmn_err(CE_WARN,
5324 		    "mac_client_tx_notify_remove: callback not "
5325 		    "found, mcip 0x%p mtnfp 0x%p", (void *)mcip, (void *)mtnfp);
5326 		return;
5327 	}
5328 
5329 	mcbi = &mcip->mci_tx_notify_cb_info;
5330 	cblist = &mcip->mci_tx_notify_cb_list;
5331 	mutex_enter(mcbi->mcbi_lockp);
5332 	if (mac_callback_remove(mcbi, cblist, &mtnfp->mtnf_link))
5333 		kmem_free(mtnfp, sizeof (mac_tx_notify_cb_t));
5334 	else
5335 		mac_callback_remove_wait(&mcip->mci_tx_notify_cb_info);
5336 	mutex_exit(mcbi->mcbi_lockp);
5337 }
5338 
5339 /*
5340  * mac_client_tx_notify():
5341  * call to add and remove flow control callback routine.
5342  */
5343 mac_tx_notify_handle_t
5344 mac_client_tx_notify(mac_client_handle_t mch, mac_tx_notify_t callb_func,
5345     void *ptr)
5346 {
5347 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
5348 	mac_tx_notify_cb_t	*mtnfp = NULL;
5349 
5350 	i_mac_perim_enter(mcip->mci_mip);
5351 
5352 	if (callb_func != NULL) {
5353 		/* Add a notify callback */
5354 		mtnfp = mac_client_tx_notify_add(mcip, callb_func, ptr);
5355 	} else {
5356 		mac_client_tx_notify_remove(mcip, (mac_tx_notify_cb_t *)ptr);
5357 	}
5358 	i_mac_perim_exit(mcip->mci_mip);
5359 
5360 	return ((mac_tx_notify_handle_t)mtnfp);
5361 }
5362 
5363 void
5364 mac_bridge_vectors(mac_bridge_tx_t txf, mac_bridge_rx_t rxf,
5365     mac_bridge_ref_t reff, mac_bridge_ls_t lsf)
5366 {
5367 	mac_bridge_tx_cb = txf;
5368 	mac_bridge_rx_cb = rxf;
5369 	mac_bridge_ref_cb = reff;
5370 	mac_bridge_ls_cb = lsf;
5371 }
5372 
5373 int
5374 mac_bridge_set(mac_handle_t mh, mac_handle_t link)
5375 {
5376 	mac_impl_t *mip = (mac_impl_t *)mh;
5377 	int retv;
5378 
5379 	mutex_enter(&mip->mi_bridge_lock);
5380 	if (mip->mi_bridge_link == NULL) {
5381 		mip->mi_bridge_link = link;
5382 		retv = 0;
5383 	} else {
5384 		retv = EBUSY;
5385 	}
5386 	mutex_exit(&mip->mi_bridge_lock);
5387 	if (retv == 0) {
5388 		mac_poll_state_change(mh, B_FALSE);
5389 		mac_capab_update(mh);
5390 	}
5391 	return (retv);
5392 }
5393 
5394 /*
5395  * Disable bridging on the indicated link.
5396  */
5397 void
5398 mac_bridge_clear(mac_handle_t mh, mac_handle_t link)
5399 {
5400 	mac_impl_t *mip = (mac_impl_t *)mh;
5401 
5402 	mutex_enter(&mip->mi_bridge_lock);
5403 	ASSERT(mip->mi_bridge_link == link);
5404 	mip->mi_bridge_link = NULL;
5405 	mutex_exit(&mip->mi_bridge_lock);
5406 	mac_poll_state_change(mh, B_TRUE);
5407 	mac_capab_update(mh);
5408 }
5409 
5410 void
5411 mac_no_active(mac_handle_t mh)
5412 {
5413 	mac_impl_t *mip = (mac_impl_t *)mh;
5414 
5415 	i_mac_perim_enter(mip);
5416 	mip->mi_state_flags |= MIS_NO_ACTIVE;
5417 	i_mac_perim_exit(mip);
5418 }
5419