xref: /illumos-gate/usr/src/uts/common/io/aggr/aggr_grp.c (revision 5ee6ac27d4fd4c9412183aa8cc1143f36ae04a8c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
27  *
28  * An instance of the structure aggr_grp_t is allocated for each
29  * link aggregation group. When created, aggr_grp_t objects are
30  * entered into the aggr_grp_hash hash table maintained by the modhash
31  * module. The hash key is the linkid associated with the link
32  * aggregation group.
33  *
34  * A set of MAC ports is associated with each aggregation group.
35  *
36  * Aggr pseudo TX rings
37  * --------------------
38  * The underlying ports (NICs) in an aggregation can have TX rings. To
39  * enhance aggr's performance, these TX rings are made available to the
40  * aggr layer as pseudo TX rings. The concept of pseudo rings is not new:
41  * they are already present and implemented on the RX side, where they are
42  * called pseudo RX rings. The same concept is extended to the TX side where
43  * each TX ring of an underlying port is reflected in aggr as a pseudo
44  * TX ring. Thus each pseudo TX ring will map to a specific hardware TX
45  * ring. Even in the case of a NIC that does not have a TX ring, a pseudo
46  * TX ring is given to the aggregation layer.
47  *
48  * With this change, the outgoing stack depth looks much better:
49  *
50  * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
51  * mac_tx_send() -> aggr_ring_rx() -> <driver>_ring_tx()
52  *
53  * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings:
54  * SRS_TX_AGGR and SRS_TX_BW_AGGR.
55  *
56  * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
57  * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) TX
58  * ring belonging to a port on which the packet has to be sent.
59  * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
60  * policy and then uses the fanout_hint passed to it to pick a TX ring from
61  * the selected port.
62  *
63  * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
64  * bandwidth limit is applied first on the outgoing packet and the packets
65  * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
66  * particular TX ring.
67  */
68 
69 #include <sys/types.h>
70 #include <sys/sysmacros.h>
71 #include <sys/conf.h>
72 #include <sys/cmn_err.h>
73 #include <sys/disp.h>
74 #include <sys/list.h>
75 #include <sys/ksynch.h>
76 #include <sys/kmem.h>
77 #include <sys/stream.h>
78 #include <sys/modctl.h>
79 #include <sys/ddi.h>
80 #include <sys/sunddi.h>
81 #include <sys/atomic.h>
82 #include <sys/stat.h>
83 #include <sys/modhash.h>
84 #include <sys/id_space.h>
85 #include <sys/strsun.h>
86 #include <sys/cred.h>
87 #include <sys/dlpi.h>
88 #include <sys/zone.h>
89 #include <sys/mac_provider.h>
90 #include <sys/dls.h>
91 #include <sys/vlan.h>
92 #include <sys/aggr.h>
93 #include <sys/aggr_impl.h>
94 
/*
 * Forward declarations of the MAC callback entry points. All of them
 * except aggr_m_unicst() are installed in the aggr_m_callbacks table
 * below.
 */
static int aggr_m_start(void *);
static void aggr_m_stop(void *);
static int aggr_m_promisc(void *, boolean_t);
static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
static int aggr_m_unicst(void *, const uint8_t *);
static int aggr_m_stat(void *, uint_t, uint64_t *);
static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *);
static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
    const void *);
static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
    mac_prop_info_handle_t);

/* Port lookup and removal helpers. */
static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
    boolean_t *);

/* Capability, SDU and margin helpers for a group and its ports. */
static void aggr_grp_capab_set(aggr_grp_t *);
static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
static uint_t aggr_grp_max_sdu(aggr_grp_t *);
static uint32_t aggr_grp_max_margin(aggr_grp_t *);
static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);

/* Pseudo RX group/ring support and the ring/group callbacks. */
static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static int aggr_pseudo_disable_intr(mac_intr_handle_t);
static int aggr_pseudo_enable_intr(mac_intr_handle_t);
static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t);
static void aggr_pseudo_stop_ring(mac_ring_driver_t);
static int aggr_addmac(void *, const uint8_t *);
static int aggr_remmac(void *, const uint8_t *);
static mblk_t *aggr_rx_poll(void *, int);
static void aggr_fill_ring(void *, mac_ring_type_t, const int,
    const int, mac_ring_info_t *, mac_ring_handle_t);
static void aggr_fill_group(void *, mac_ring_type_t, const int,
    mac_group_info_t *, mac_group_handle_t);
132 
/*
 * Module-wide state, set up by aggr_grp_init() and torn down by
 * aggr_grp_fini():
 *   aggr_grp_cache - kmem cache from which aggr_grp_t objects come.
 *   aggr_grp_hash  - hash of groups, looked up by linkid.
 *   aggr_grp_lock  - held as reader around hash lookups and reads of
 *                    aggr_grp_cnt in the code visible here.
 *   aggr_grp_cnt   - group counter reported by aggr_grp_count().
 *   key_ids        - id space for keys in (AGGR_MAX_KEY + 1)..UINT16_MAX.
 */
static kmem_cache_t	*aggr_grp_cache;
static mod_hash_t	*aggr_grp_hash;
static krwlock_t	aggr_grp_lock;
static uint_t		aggr_grp_cnt;
static id_space_t	*key_ids;

/* Size passed to mod_hash_create_idhash() for aggr_grp_hash. */
#define	GRP_HASHSZ		64
/* Convert a linkid into the key type expected by the modhash routines. */
#define	GRP_HASH_KEY(linkid)	((mod_hash_key_t)(uintptr_t)linkid)
#define	AGGR_PORT_NAME_DELIMIT '-'

/* An all-zero Ethernet address. */
static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};

/* Flags placed in the first slot of aggr_m_callbacks. */
#define	AGGR_M_CALLBACK_FLAGS	\
	(MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO)
147 
/*
 * MAC callback vector for aggregation groups. The initializer is
 * positional: the first slot carries AGGR_M_CALLBACK_FLAGS and the
 * remaining slots carry entry points, with NULL for every slot this
 * table does not fill in.
 *
 * NOTE(review): the meaning of each slot is fixed by the field order of
 * mac_callbacks_t, which is not visible in this file -- confirm against
 * <sys/mac_provider.h> before adding or reordering entries.
 */
static mac_callbacks_t aggr_m_callbacks = {
	AGGR_M_CALLBACK_FLAGS,
	aggr_m_stat,
	aggr_m_start,
	aggr_m_stop,
	aggr_m_promisc,
	aggr_m_multicst,
	NULL,
	NULL,
	NULL,
	aggr_m_ioctl,
	aggr_m_capab_get,
	NULL,
	NULL,
	aggr_m_setprop,
	NULL,
	aggr_m_propinfo
};
166 
167 /*ARGSUSED*/
168 static int
169 aggr_grp_constructor(void *buf, void *arg, int kmflag)
170 {
171 	aggr_grp_t *grp = buf;
172 
173 	bzero(grp, sizeof (*grp));
174 	mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
175 	cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
176 	rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
177 	mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
178 	cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
179 	mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL);
180 	cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL);
181 	grp->lg_link_state = LINK_STATE_UNKNOWN;
182 	return (0);
183 }
184 
185 /*ARGSUSED*/
186 static void
187 aggr_grp_destructor(void *buf, void *arg)
188 {
189 	aggr_grp_t *grp = buf;
190 
191 	if (grp->lg_tx_ports != NULL) {
192 		kmem_free(grp->lg_tx_ports,
193 		    grp->lg_tx_ports_size * sizeof (aggr_port_t *));
194 	}
195 
196 	mutex_destroy(&grp->lg_lacp_lock);
197 	cv_destroy(&grp->lg_lacp_cv);
198 	mutex_destroy(&grp->lg_port_lock);
199 	cv_destroy(&grp->lg_port_cv);
200 	rw_destroy(&grp->lg_tx_lock);
201 	mutex_destroy(&grp->lg_tx_flowctl_lock);
202 	cv_destroy(&grp->lg_tx_flowctl_cv);
203 }
204 
205 void
206 aggr_grp_init(void)
207 {
208 	aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
209 	    sizeof (aggr_grp_t), 0, aggr_grp_constructor,
210 	    aggr_grp_destructor, NULL, NULL, NULL, 0);
211 
212 	aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
213 	    GRP_HASHSZ, mod_hash_null_valdtor);
214 	rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
215 	aggr_grp_cnt = 0;
216 
217 	/*
218 	 * Allocate an id space to manage key values (when key is not
219 	 * specified). The range of the id space will be from
220 	 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol
221 	 * uses a 16-bit key.
222 	 */
223 	key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX);
224 	ASSERT(key_ids != NULL);
225 }
226 
227 void
228 aggr_grp_fini(void)
229 {
230 	id_space_destroy(key_ids);
231 	rw_destroy(&aggr_grp_lock);
232 	mod_hash_destroy_idhash(aggr_grp_hash);
233 	kmem_cache_destroy(aggr_grp_cache);
234 }
235 
236 uint_t
237 aggr_grp_count(void)
238 {
239 	uint_t	count;
240 
241 	rw_enter(&aggr_grp_lock, RW_READER);
242 	count = aggr_grp_cnt;
243 	rw_exit(&aggr_grp_lock);
244 	return (count);
245 }
246 
247 /*
248  * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions
249  * requires the mac perimeter, this function holds a reference of the aggr
250  * and aggr won't call mac_unregister() until this reference drops to 0.
251  */
252 void
253 aggr_grp_port_hold(aggr_port_t *port)
254 {
255 	aggr_grp_t	*grp = port->lp_grp;
256 
257 	AGGR_PORT_REFHOLD(port);
258 	mutex_enter(&grp->lg_port_lock);
259 	grp->lg_port_ref++;
260 	mutex_exit(&grp->lg_port_lock);
261 }
262 
263 /*
264  * Release the reference of the grp and inform aggr_grp_delete() calling
265  * mac_unregister() is now safe.
266  */
267 void
268 aggr_grp_port_rele(aggr_port_t *port)
269 {
270 	aggr_grp_t	*grp = port->lp_grp;
271 
272 	mutex_enter(&grp->lg_port_lock);
273 	if (--grp->lg_port_ref == 0)
274 		cv_signal(&grp->lg_port_cv);
275 	mutex_exit(&grp->lg_port_lock);
276 	AGGR_PORT_REFRELE(port);
277 }
278 
279 /*
280  * Wait for the port's lacp timer thread and the port's notification callback
281  * to exit.
282  */
283 void
284 aggr_grp_port_wait(aggr_grp_t *grp)
285 {
286 	mutex_enter(&grp->lg_port_lock);
287 	if (grp->lg_port_ref != 0)
288 		cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
289 	mutex_exit(&grp->lg_port_lock);
290 }
291 
292 /*
293  * Attach a port to a link aggregation group.
294  *
295  * A port is attached to a link aggregation group once its speed
296  * and link state have been verified.
297  *
298  * Returns B_TRUE if the group link state or speed has changed. If
299  * it's the case, the caller must notify the MAC layer via a call
300  * to mac_link().
301  */
302 boolean_t
303 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
304 {
305 	boolean_t link_state_changed = B_FALSE;
306 
307 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
308 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
309 
310 	if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
311 		return (B_FALSE);
312 
313 	/*
314 	 * Validate the MAC port link speed and update the group
315 	 * link speed if needed.
316 	 */
317 	if (port->lp_ifspeed == 0 ||
318 	    port->lp_link_state != LINK_STATE_UP ||
319 	    port->lp_link_duplex != LINK_DUPLEX_FULL) {
320 		/*
321 		 * Can't attach a MAC port with unknown link speed,
322 		 * down link, or not in full duplex mode.
323 		 */
324 		return (B_FALSE);
325 	}
326 
327 	if (grp->lg_ifspeed == 0) {
328 		/*
329 		 * The group inherits the speed of the first link being
330 		 * attached.
331 		 */
332 		grp->lg_ifspeed = port->lp_ifspeed;
333 		link_state_changed = B_TRUE;
334 	} else if (grp->lg_ifspeed != port->lp_ifspeed) {
335 		/*
336 		 * The link speed of the MAC port must be the same as
337 		 * the group link speed, as per 802.3ad. Since it is
338 		 * not, the attach is cancelled.
339 		 */
340 		return (B_FALSE);
341 	}
342 
343 	grp->lg_nattached_ports++;
344 
345 	/*
346 	 * Update the group link state.
347 	 */
348 	if (grp->lg_link_state != LINK_STATE_UP) {
349 		grp->lg_link_state = LINK_STATE_UP;
350 		grp->lg_link_duplex = LINK_DUPLEX_FULL;
351 		link_state_changed = B_TRUE;
352 	}
353 
354 	/*
355 	 * Update port's state.
356 	 */
357 	port->lp_state = AGGR_PORT_STATE_ATTACHED;
358 
359 	aggr_grp_multicst_port(port, B_TRUE);
360 
361 	/*
362 	 * Set port's receive callback
363 	 */
364 	mac_rx_set(port->lp_mch, aggr_recv_cb, port);
365 
366 	/*
367 	 * If LACP is OFF, the port can be used to send data as soon
368 	 * as its link is up and verified to be compatible with the
369 	 * aggregation.
370 	 *
371 	 * If LACP is active or passive, notify the LACP subsystem, which
372 	 * will enable sending on the port following the LACP protocol.
373 	 */
374 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
375 		aggr_send_port_enable(port);
376 	else
377 		aggr_lacp_port_attached(port);
378 
379 	return (link_state_changed);
380 }
381 
382 boolean_t
383 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
384 {
385 	boolean_t link_state_changed = B_FALSE;
386 
387 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
388 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
389 
390 	/* update state */
391 	if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
392 		return (B_FALSE);
393 
394 	mac_rx_clear(port->lp_mch);
395 
396 	aggr_grp_multicst_port(port, B_FALSE);
397 
398 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
399 		aggr_send_port_disable(port);
400 	else
401 		aggr_lacp_port_detached(port);
402 
403 	port->lp_state = AGGR_PORT_STATE_STANDBY;
404 
405 	grp->lg_nattached_ports--;
406 	if (grp->lg_nattached_ports == 0) {
407 		/* the last attached MAC port of the group is being detached */
408 		grp->lg_ifspeed = 0;
409 		grp->lg_link_state = LINK_STATE_DOWN;
410 		grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
411 		link_state_changed = B_TRUE;
412 	}
413 
414 	return (link_state_changed);
415 }
416 
417 /*
418  * Update the MAC addresses of the constituent ports of the specified
419  * group. This function is invoked:
420  * - after creating a new aggregation group.
421  * - after adding new ports to an aggregation group.
422  * - after removing a port from a group when the MAC address of
423  *   that port was used for the MAC address of the group.
424  * - after the MAC address of a port changed when the MAC address
425  *   of that port was used for the MAC address of the group.
426  *
427  * Return true if the link state of the aggregation changed, for example
428  * as a result of a failure changing the MAC address of one of the
429  * constituent ports.
430  */
431 boolean_t
432 aggr_grp_update_ports_mac(aggr_grp_t *grp)
433 {
434 	aggr_port_t *cport;
435 	boolean_t link_state_changed = B_FALSE;
436 	mac_perim_handle_t mph;
437 
438 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
439 
440 	for (cport = grp->lg_ports; cport != NULL;
441 	    cport = cport->lp_next) {
442 		mac_perim_enter_by_mh(cport->lp_mh, &mph);
443 		if (aggr_port_unicst(cport) != 0) {
444 			if (aggr_grp_detach_port(grp, cport))
445 				link_state_changed = B_TRUE;
446 		} else {
447 			/*
448 			 * If a port was detached because of a previous
449 			 * failure changing the MAC address, the port is
450 			 * reattached when it successfully changes the MAC
451 			 * address now, and this might cause the link state
452 			 * of the aggregation to change.
453 			 */
454 			if (aggr_grp_attach_port(grp, cport))
455 				link_state_changed = B_TRUE;
456 		}
457 		mac_perim_exit(mph);
458 	}
459 	return (link_state_changed);
460 }
461 
462 /*
463  * Invoked when the MAC address of a port has changed. If the port's
464  * MAC address was used for the group MAC address, set mac_addr_changedp
465  * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST
466  * notification. If the link state changes due to detach/attach of
467  * the constituent port, set link_state_changedp to B_TRUE to indicate
468  * to the caller that it should send a MAC_NOTE_LINK notification. In both
469  * cases, it is the responsibility of the caller to invoke notification
470  * functions after releasing the the port lock.
471  */
472 void
473 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
474     boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
475 {
476 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
477 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
478 	ASSERT(mac_addr_changedp != NULL);
479 	ASSERT(link_state_changedp != NULL);
480 
481 	*mac_addr_changedp = B_FALSE;
482 	*link_state_changedp = B_FALSE;
483 
484 	if (grp->lg_addr_fixed) {
485 		/*
486 		 * The group is using a fixed MAC address or an automatic
487 		 * MAC address has not been set.
488 		 */
489 		return;
490 	}
491 
492 	if (grp->lg_mac_addr_port == port) {
493 		/*
494 		 * The MAC address of the port was assigned to the group
495 		 * MAC address. Update the group MAC address.
496 		 */
497 		bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
498 		*mac_addr_changedp = B_TRUE;
499 	} else {
500 		/*
501 		 * Update the actual port MAC address to the MAC address
502 		 * of the group.
503 		 */
504 		if (aggr_port_unicst(port) != 0) {
505 			*link_state_changedp = aggr_grp_detach_port(grp, port);
506 		} else {
507 			/*
508 			 * If a port was detached because of a previous
509 			 * failure changing the MAC address, the port is
510 			 * reattached when it successfully changes the MAC
511 			 * address now, and this might cause the link state
512 			 * of the aggregation to change.
513 			 */
514 			*link_state_changedp = aggr_grp_attach_port(grp, port);
515 		}
516 	}
517 }
518 
519 /*
520  * Add a port to a link aggregation group.
521  */
522 static int
523 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
524     aggr_port_t **pp)
525 {
526 	aggr_port_t *port, **cport;
527 	mac_perim_handle_t mph;
528 	zoneid_t port_zoneid = ALL_ZONES;
529 	int err;
530 
531 	/* The port must be int the same zone as the aggregation. */
532 	if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
533 		port_zoneid = GLOBAL_ZONEID;
534 	if (grp->lg_zoneid != port_zoneid)
535 		return (EBUSY);
536 
537 	/*
538 	 * lg_mh could be NULL when the function is called during the creation
539 	 * of the aggregation.
540 	 */
541 	ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
542 
543 	/* create new port */
544 	err = aggr_port_create(grp, port_linkid, force, &port);
545 	if (err != 0)
546 		return (err);
547 
548 	mac_perim_enter_by_mh(port->lp_mh, &mph);
549 
550 	/* add port to list of group constituent ports */
551 	cport = &grp->lg_ports;
552 	while (*cport != NULL)
553 		cport = &((*cport)->lp_next);
554 	*cport = port;
555 
556 	/*
557 	 * Back reference to the group it is member of. A port always
558 	 * holds a reference to its group to ensure that the back
559 	 * reference is always valid.
560 	 */
561 	port->lp_grp = grp;
562 	AGGR_GRP_REFHOLD(grp);
563 	grp->lg_nports++;
564 
565 	aggr_lacp_init_port(port);
566 	mac_perim_exit(mph);
567 
568 	if (pp != NULL)
569 		*pp = port;
570 
571 	return (0);
572 }
573 
574 /*
575  * Add a pseudo RX ring for the given HW ring handle.
576  */
577 static int
578 aggr_add_pseudo_rx_ring(aggr_port_t *port,
579     aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
580 {
581 	aggr_pseudo_rx_ring_t	*ring;
582 	int			err;
583 	int			j;
584 
585 	for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
586 		ring = rx_grp->arg_rings + j;
587 		if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
588 			break;
589 	}
590 
591 	/*
592 	 * No slot for this new RX ring.
593 	 */
594 	if (j == MAX_RINGS_PER_GROUP)
595 		return (EIO);
596 
597 	ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
598 	ring->arr_hw_rh = hw_rh;
599 	ring->arr_port = port;
600 	rx_grp->arg_ring_cnt++;
601 
602 	/*
603 	 * The group is already registered, dynamically add a new ring to the
604 	 * mac group.
605 	 */
606 	if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
607 		ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
608 		ring->arr_hw_rh = NULL;
609 		ring->arr_port = NULL;
610 		rx_grp->arg_ring_cnt--;
611 	} else {
612 		mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
613 		    mac_find_ring(rx_grp->arg_gh, j));
614 	}
615 	return (err);
616 }
617 
618 /*
619  * Remove the pseudo RX ring of the given HW ring handle.
620  */
621 static void
622 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
623 {
624 	aggr_pseudo_rx_ring_t	*ring;
625 	int			j;
626 
627 	for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
628 		ring = rx_grp->arg_rings + j;
629 		if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
630 		    ring->arr_hw_rh != hw_rh) {
631 			continue;
632 		}
633 
634 		mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
635 
636 		ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
637 		ring->arr_hw_rh = NULL;
638 		ring->arr_port = NULL;
639 		rx_grp->arg_ring_cnt--;
640 		mac_hwring_teardown(hw_rh);
641 		break;
642 	}
643 }
644 
645 /*
646  * This function is called to create pseudo rings over the hardware rings of
647  * the underlying device. Note that there is a 1:1 mapping between the pseudo
648  * RX rings of the aggr and the hardware rings of the underlying port.
649  */
650 static int
651 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
652 {
653 	aggr_grp_t		*grp = port->lp_grp;
654 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP];
655 	aggr_unicst_addr_t	*addr, *a;
656 	mac_perim_handle_t	pmph;
657 	int			hw_rh_cnt, i = 0, j;
658 	int			err = 0;
659 
660 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
661 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
662 
663 	/*
664 	 * This function must be called after the aggr registers its mac
665 	 * and its RX group has been initialized.
666 	 */
667 	ASSERT(rx_grp->arg_gh != NULL);
668 
669 	/*
670 	 * Get the list the the underlying HW rings.
671 	 */
672 	hw_rh_cnt = mac_hwrings_get(port->lp_mch,
673 	    &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX);
674 
675 	if (port->lp_hwgh != NULL) {
676 		/*
677 		 * Quiesce the HW ring and the mac srs on the ring. Note
678 		 * that the HW ring will be restarted when the pseudo ring
679 		 * is started. At that time all the packets will be
680 		 * directly passed up to the pseudo RX ring and handled
681 		 * by mac srs created over the pseudo RX ring.
682 		 */
683 		mac_rx_client_quiesce(port->lp_mch);
684 		mac_srs_perm_quiesce(port->lp_mch, B_TRUE);
685 	}
686 
687 	/*
688 	 * Add all the unicast addresses to the newly added port.
689 	 */
690 	for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
691 		if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0)
692 			break;
693 	}
694 
695 	for (i = 0; err == 0 && i < hw_rh_cnt; i++)
696 		err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
697 
698 	if (err != 0) {
699 		for (j = 0; j < i; j++)
700 			aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
701 
702 		for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
703 			aggr_port_remmac(port, a->aua_addr);
704 
705 		if (port->lp_hwgh != NULL) {
706 			mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
707 			mac_rx_client_restart(port->lp_mch);
708 			port->lp_hwgh = NULL;
709 		}
710 	} else {
711 		port->lp_rx_grp_added = B_TRUE;
712 	}
713 done:
714 	mac_perim_exit(pmph);
715 	return (err);
716 }
717 
718 /*
719  * This function is called by aggr to remove pseudo RX rings over the
720  * HW rings of the underlying port.
721  */
722 static void
723 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
724 {
725 	aggr_grp_t		*grp = port->lp_grp;
726 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP];
727 	aggr_unicst_addr_t	*addr;
728 	mac_group_handle_t	hwgh;
729 	mac_perim_handle_t	pmph;
730 	int			hw_rh_cnt, i;
731 
732 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
733 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
734 
735 	if (!port->lp_rx_grp_added)
736 		goto done;
737 
738 	ASSERT(rx_grp->arg_gh != NULL);
739 	hw_rh_cnt = mac_hwrings_get(port->lp_mch,
740 	    &hwgh, hw_rh, MAC_RING_TYPE_RX);
741 
742 	/*
743 	 * If hw_rh_cnt is 0, it means that the underlying port does not
744 	 * support RX rings. Directly return in this case.
745 	 */
746 	for (i = 0; i < hw_rh_cnt; i++)
747 		aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
748 
749 	for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
750 		aggr_port_remmac(port, addr->aua_addr);
751 
752 	if (port->lp_hwgh != NULL) {
753 		port->lp_hwgh = NULL;
754 
755 		/*
756 		 * First clear the permanent-quiesced flag of the RX srs then
757 		 * restart the HW ring and the mac srs on the ring. Note that
758 		 * the HW ring and associated SRS will soon been removed when
759 		 * the port is removed from the aggr.
760 		 */
761 		mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
762 		mac_rx_client_restart(port->lp_mch);
763 	}
764 
765 	port->lp_rx_grp_added = B_FALSE;
766 done:
767 	mac_perim_exit(pmph);
768 }
769 
770 /*
771  * Add a pseudo TX ring for the given HW ring handle.
772  */
773 static int
774 aggr_add_pseudo_tx_ring(aggr_port_t *port,
775     aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
776     mac_ring_handle_t *pseudo_rh)
777 {
778 	aggr_pseudo_tx_ring_t	*ring;
779 	int			err;
780 	int			i;
781 
782 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
783 	for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
784 		ring = tx_grp->atg_rings + i;
785 		if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
786 			break;
787 	}
788 	/*
789 	 * No slot for this new TX ring.
790 	 */
791 	if (i == MAX_RINGS_PER_GROUP)
792 		return (EIO);
793 	/*
794 	 * The following 4 statements needs to be done before
795 	 * calling mac_group_add_ring(). Otherwise it will
796 	 * result in an assertion failure in mac_init_ring().
797 	 */
798 	ring->atr_flags |= MAC_PSEUDO_RING_INUSE;
799 	ring->atr_hw_rh = hw_rh;
800 	ring->atr_port = port;
801 	tx_grp->atg_ring_cnt++;
802 
803 	/*
804 	 * The TX side has no concept of ring groups unlike RX groups.
805 	 * There is just a single group which stores all the TX rings.
806 	 * This group will be used to store aggr's pseudo TX rings.
807 	 */
808 	if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) {
809 		ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
810 		ring->atr_hw_rh = NULL;
811 		ring->atr_port = NULL;
812 		tx_grp->atg_ring_cnt--;
813 	} else {
814 		*pseudo_rh = mac_find_ring(tx_grp->atg_gh, i);
815 		if (hw_rh != NULL) {
816 			mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
817 			    mac_find_ring(tx_grp->atg_gh, i));
818 		}
819 	}
820 	return (err);
821 }
822 
823 /*
824  * Remove the pseudo TX ring of the given HW ring handle.
825  */
826 static void
827 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp,
828     mac_ring_handle_t pseudo_hw_rh)
829 {
830 	aggr_pseudo_tx_ring_t	*ring;
831 	int			i;
832 
833 	for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
834 		ring = tx_grp->atg_rings + i;
835 		if (ring->atr_rh != pseudo_hw_rh)
836 			continue;
837 
838 		ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE);
839 		mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh);
840 		ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
841 		mac_hwring_teardown(ring->atr_hw_rh);
842 		ring->atr_hw_rh = NULL;
843 		ring->atr_port = NULL;
844 		tx_grp->atg_ring_cnt--;
845 		break;
846 	}
847 }
848 
849 /*
850  * This function is called to create pseudo rings over hardware rings of
851  * the underlying device. There is a 1:1 mapping between the pseudo TX
852  * rings of the aggr and the hardware rings of the underlying port.
853  */
854 static int
855 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
856 {
857 	aggr_grp_t		*grp = port->lp_grp;
858 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
859 	mac_perim_handle_t	pmph;
860 	int			hw_rh_cnt, i = 0, j;
861 	int			err = 0;
862 
863 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
864 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
865 
866 	/*
867 	 * Get the list the the underlying HW rings.
868 	 */
869 	hw_rh_cnt = mac_hwrings_get(port->lp_mch,
870 	    NULL, hw_rh, MAC_RING_TYPE_TX);
871 
872 	/*
873 	 * Even if the underlying NIC does not have TX rings, we
874 	 * still make a psuedo TX ring for that NIC with NULL as
875 	 * the ring handle.
876 	 */
877 	if (hw_rh_cnt == 0)
878 		port->lp_tx_ring_cnt = 1;
879 	else
880 		port->lp_tx_ring_cnt = hw_rh_cnt;
881 
882 	port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
883 	    port->lp_tx_ring_cnt), KM_SLEEP);
884 	port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
885 	    port->lp_tx_ring_cnt), KM_SLEEP);
886 
887 	if (hw_rh_cnt == 0) {
888 		if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
889 		    NULL, &pseudo_rh)) == 0) {
890 			port->lp_tx_rings[0] = NULL;
891 			port->lp_pseudo_tx_rings[0] = pseudo_rh;
892 		}
893 	} else {
894 		for (i = 0; err == 0 && i < hw_rh_cnt; i++) {
895 			err = aggr_add_pseudo_tx_ring(port,
896 			    tx_grp, hw_rh[i], &pseudo_rh);
897 			if (err != 0)
898 				break;
899 			port->lp_tx_rings[i] = hw_rh[i];
900 			port->lp_pseudo_tx_rings[i] = pseudo_rh;
901 		}
902 	}
903 
904 	if (err != 0) {
905 		if (hw_rh_cnt != 0) {
906 			for (j = 0; j < i; j++) {
907 				aggr_rem_pseudo_tx_ring(tx_grp,
908 				    port->lp_pseudo_tx_rings[j]);
909 			}
910 		}
911 		kmem_free(port->lp_tx_rings,
912 		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
913 		kmem_free(port->lp_pseudo_tx_rings,
914 		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
915 		port->lp_tx_ring_cnt = 0;
916 	} else {
917 		port->lp_tx_grp_added = B_TRUE;
918 		port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch,
919 		    aggr_tx_ring_update, port);
920 	}
921 	mac_perim_exit(pmph);
922 	return (err);
923 }
924 
925 /*
926  * This function is called by aggr to remove pseudo TX rings over the
927  * HW rings of the underlying port.
928  */
929 static void
930 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
931 {
932 	aggr_grp_t		*grp = port->lp_grp;
933 	mac_perim_handle_t	pmph;
934 	int			i;
935 
936 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
937 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
938 
939 	if (!port->lp_tx_grp_added)
940 		goto done;
941 
942 	ASSERT(tx_grp->atg_gh != NULL);
943 
944 	for (i = 0; i < port->lp_tx_ring_cnt; i++)
945 		aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]);
946 
947 	kmem_free(port->lp_tx_rings,
948 	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
949 	kmem_free(port->lp_pseudo_tx_rings,
950 	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
951 
952 	port->lp_tx_ring_cnt = 0;
953 	(void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh);
954 	port->lp_tx_grp_added = B_FALSE;
955 done:
956 	mac_perim_exit(pmph);
957 }
958 
959 static int
960 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
961 {
962 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
963 	return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
964 }
965 
966 static int
967 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
968 {
969 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
970 	return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
971 }
972 
973 static int
974 aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen)
975 {
976 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
977 	int err;
978 
979 	err = mac_hwring_start(rr_ring->arr_hw_rh);
980 	if (err == 0)
981 		rr_ring->arr_gen = mr_gen;
982 	return (err);
983 }
984 
985 static void
986 aggr_pseudo_stop_ring(mac_ring_driver_t arg)
987 {
988 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
989 	mac_hwring_stop(rr_ring->arr_hw_rh);
990 }
991 
992 /*
993  * Add one or more ports to an existing link aggregation group.
994  */
995 int
996 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
997     laioc_port_t *ports)
998 {
999 	int rc, i, nadded = 0;
1000 	aggr_grp_t *grp = NULL;
1001 	aggr_port_t *port;
1002 	boolean_t link_state_changed = B_FALSE;
1003 	mac_perim_handle_t mph, pmph;
1004 
1005 	/* get group corresponding to linkid */
1006 	rw_enter(&aggr_grp_lock, RW_READER);
1007 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1008 	    (mod_hash_val_t *)&grp) != 0) {
1009 		rw_exit(&aggr_grp_lock);
1010 		return (ENOENT);
1011 	}
1012 	AGGR_GRP_REFHOLD(grp);
1013 
1014 	/*
1015 	 * Hold the perimeter so that the aggregation won't be destroyed.
1016 	 */
1017 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1018 	rw_exit(&aggr_grp_lock);
1019 
1020 	/* add the specified ports to group */
1021 	for (i = 0; i < nports; i++) {
1022 		/* add port to group */
1023 		if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1024 		    force, &port)) != 0) {
1025 			goto bail;
1026 		}
1027 		ASSERT(port != NULL);
1028 		nadded++;
1029 
1030 		/* check capabilities */
1031 		if (!aggr_grp_capab_check(grp, port) ||
1032 		    !aggr_grp_sdu_check(grp, port) ||
1033 		    !aggr_grp_margin_check(grp, port)) {
1034 			rc = ENOTSUP;
1035 			goto bail;
1036 		}
1037 
1038 		/*
1039 		 * Create the pseudo ring for each HW ring of the underlying
1040 		 * port.
1041 		 */
1042 		rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
1043 		if (rc != 0)
1044 			goto bail;
1045 		rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group);
1046 		if (rc != 0)
1047 			goto bail;
1048 
1049 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1050 
1051 		/* set LACP mode */
1052 		aggr_port_lacp_set_mode(grp, port);
1053 
1054 		/* start port if group has already been started */
1055 		if (grp->lg_started) {
1056 			rc = aggr_port_start(port);
1057 			if (rc != 0) {
1058 				mac_perim_exit(pmph);
1059 				goto bail;
1060 			}
1061 
1062 			/*
1063 			 * Turn on the promiscuous mode over the port when it
1064 			 * is requested to be turned on to receive the
1065 			 * non-primary address over a port, or the promiscous
1066 			 * mode is enabled over the aggr.
1067 			 */
1068 			if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1069 				rc = aggr_port_promisc(port, B_TRUE);
1070 				if (rc != 0) {
1071 					mac_perim_exit(pmph);
1072 					goto bail;
1073 				}
1074 			}
1075 		}
1076 		mac_perim_exit(pmph);
1077 
1078 		/*
1079 		 * Attach each port if necessary.
1080 		 */
1081 		if (aggr_port_notify_link(grp, port))
1082 			link_state_changed = B_TRUE;
1083 
1084 		/*
1085 		 * Initialize the callback functions for this port.
1086 		 */
1087 		aggr_port_init_callbacks(port);
1088 	}
1089 
1090 	/* update the MAC address of the constituent ports */
1091 	if (aggr_grp_update_ports_mac(grp))
1092 		link_state_changed = B_TRUE;
1093 
1094 	if (link_state_changed)
1095 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1096 
1097 bail:
1098 	if (rc != 0) {
1099 		/* stop and remove ports that have been added */
1100 		for (i = 0; i < nadded; i++) {
1101 			port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1102 			ASSERT(port != NULL);
1103 			if (grp->lg_started) {
1104 				mac_perim_enter_by_mh(port->lp_mh, &pmph);
1105 				(void) aggr_port_promisc(port, B_FALSE);
1106 				aggr_port_stop(port);
1107 				mac_perim_exit(pmph);
1108 			}
1109 			aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1110 			aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1111 			(void) aggr_grp_rem_port(grp, port, NULL, NULL);
1112 		}
1113 	}
1114 
1115 	mac_perim_exit(mph);
1116 	AGGR_GRP_REFRELE(grp);
1117 	return (rc);
1118 }
1119 
1120 static int
1121 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1122     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1123     aggr_lacp_timer_t lacp_timer)
1124 {
1125 	boolean_t mac_addr_changed = B_FALSE;
1126 	boolean_t link_state_changed = B_FALSE;
1127 	mac_perim_handle_t pmph;
1128 
1129 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1130 
1131 	/* validate fixed address if specified */
1132 	if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
1133 	    ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
1134 	    (mac_addr[0] & 0x01))) {
1135 		return (EINVAL);
1136 	}
1137 
1138 	/* update policy if requested */
1139 	if (update_mask & AGGR_MODIFY_POLICY)
1140 		aggr_send_update_policy(grp, policy);
1141 
1142 	/* update unicast MAC address if requested */
1143 	if (update_mask & AGGR_MODIFY_MAC) {
1144 		if (mac_fixed) {
1145 			/* user-supplied MAC address */
1146 			grp->lg_mac_addr_port = NULL;
1147 			if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
1148 				bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1149 				mac_addr_changed = B_TRUE;
1150 			}
1151 		} else if (grp->lg_addr_fixed) {
1152 			/* switch from user-supplied to automatic */
1153 			aggr_port_t *port = grp->lg_ports;
1154 
1155 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1156 			bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
1157 			grp->lg_mac_addr_port = port;
1158 			mac_addr_changed = B_TRUE;
1159 			mac_perim_exit(pmph);
1160 		}
1161 		grp->lg_addr_fixed = mac_fixed;
1162 	}
1163 
1164 	if (mac_addr_changed)
1165 		link_state_changed = aggr_grp_update_ports_mac(grp);
1166 
1167 	if (update_mask & AGGR_MODIFY_LACP_MODE)
1168 		aggr_lacp_update_mode(grp, lacp_mode);
1169 
1170 	if (update_mask & AGGR_MODIFY_LACP_TIMER)
1171 		aggr_lacp_update_timer(grp, lacp_timer);
1172 
1173 	if (link_state_changed)
1174 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1175 
1176 	if (mac_addr_changed)
1177 		mac_unicst_update(grp->lg_mh, grp->lg_addr);
1178 
1179 	return (0);
1180 }
1181 
1182 /*
1183  * Update properties of an existing link aggregation group.
1184  */
1185 int
1186 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy,
1187     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1188     aggr_lacp_timer_t lacp_timer)
1189 {
1190 	aggr_grp_t *grp = NULL;
1191 	mac_perim_handle_t mph;
1192 	int err;
1193 
1194 	/* get group corresponding to linkid */
1195 	rw_enter(&aggr_grp_lock, RW_READER);
1196 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1197 	    (mod_hash_val_t *)&grp) != 0) {
1198 		rw_exit(&aggr_grp_lock);
1199 		return (ENOENT);
1200 	}
1201 	AGGR_GRP_REFHOLD(grp);
1202 
1203 	/*
1204 	 * Hold the perimeter so that the aggregation won't be destroyed.
1205 	 */
1206 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1207 	rw_exit(&aggr_grp_lock);
1208 
1209 	err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
1210 	    mac_addr, lacp_mode, lacp_timer);
1211 
1212 	mac_perim_exit(mph);
1213 	AGGR_GRP_REFRELE(grp);
1214 	return (err);
1215 }
1216 
1217 /*
1218  * Create a new link aggregation group upon request from administrator.
1219  * Returns 0 on success, an errno on failure.
1220  */
1221 int
1222 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
1223     laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force,
1224     uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer,
1225     cred_t *credp)
1226 {
1227 	aggr_grp_t *grp = NULL;
1228 	aggr_port_t *port;
1229 	mac_register_t *mac;
1230 	boolean_t link_state_changed;
1231 	mac_perim_handle_t mph;
1232 	int err;
1233 	int i;
1234 	kt_did_t tid = 0;
1235 
1236 	/* need at least one port */
1237 	if (nports == 0)
1238 		return (EINVAL);
1239 
1240 	rw_enter(&aggr_grp_lock, RW_WRITER);
1241 
1242 	/* does a group with the same linkid already exist? */
1243 	err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1244 	    (mod_hash_val_t *)&grp);
1245 	if (err == 0) {
1246 		rw_exit(&aggr_grp_lock);
1247 		return (EEXIST);
1248 	}
1249 
1250 	grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);
1251 
1252 	grp->lg_refs = 1;
1253 	grp->lg_closing = B_FALSE;
1254 	grp->lg_force = force;
1255 	grp->lg_linkid = linkid;
1256 	grp->lg_zoneid = crgetzoneid(credp);
1257 	grp->lg_ifspeed = 0;
1258 	grp->lg_link_state = LINK_STATE_UNKNOWN;
1259 	grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
1260 	grp->lg_started = B_FALSE;
1261 	grp->lg_promisc = B_FALSE;
1262 	grp->lg_lacp_done = B_FALSE;
1263 	grp->lg_tx_notify_done = B_FALSE;
1264 	grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
1265 	grp->lg_lacp_rx_thread = thread_create(NULL, 0,
1266 	    aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1267 	grp->lg_tx_notify_thread = thread_create(NULL, 0,
1268 	    aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1269 	grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
1270 	    MAX_RINGS_PER_GROUP), KM_SLEEP);
1271 	grp->lg_tx_blocked_cnt = 0;
1272 	bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t));
1273 	bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
1274 	aggr_lacp_init_grp(grp);
1275 
1276 	/* add MAC ports to group */
1277 	grp->lg_ports = NULL;
1278 	grp->lg_nports = 0;
1279 	grp->lg_nattached_ports = 0;
1280 	grp->lg_ntx_ports = 0;
1281 
1282 	/*
1283 	 * If key is not specified by the user, allocate the key.
1284 	 */
1285 	if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
1286 		err = ENOMEM;
1287 		goto bail;
1288 	}
1289 	grp->lg_key = key;
1290 
1291 	for (i = 0; i < nports; i++) {
1292 		err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, NULL);
1293 		if (err != 0)
1294 			goto bail;
1295 	}
1296 
1297 	/*
1298 	 * If no explicit MAC address was specified by the administrator,
1299 	 * set it to the MAC address of the first port.
1300 	 */
1301 	grp->lg_addr_fixed = mac_fixed;
1302 	if (grp->lg_addr_fixed) {
1303 		/* validate specified address */
1304 		if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
1305 			err = EINVAL;
1306 			goto bail;
1307 		}
1308 		bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1309 	} else {
1310 		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1311 		grp->lg_mac_addr_port = grp->lg_ports;
1312 	}
1313 
1314 	/* set the initial group capabilities */
1315 	aggr_grp_capab_set(grp);
1316 
1317 	if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
1318 		err = ENOMEM;
1319 		goto bail;
1320 	}
1321 	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1322 	mac->m_driver = grp;
1323 	mac->m_dip = aggr_dip;
1324 	mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
1325 	mac->m_src_addr = grp->lg_addr;
1326 	mac->m_callbacks = &aggr_m_callbacks;
1327 	mac->m_min_sdu = 0;
1328 	mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
1329 	mac->m_margin = aggr_grp_max_margin(grp);
1330 	mac->m_v12n = MAC_VIRT_LEVEL1;
1331 	err = mac_register(mac, &grp->lg_mh);
1332 	mac_free(mac);
1333 	if (err != 0)
1334 		goto bail;
1335 
1336 	err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
1337 	if (err != 0) {
1338 		(void) mac_unregister(grp->lg_mh);
1339 		grp->lg_mh = NULL;
1340 		goto bail;
1341 	}
1342 
1343 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1344 
1345 	/*
1346 	 * Update the MAC address of the constituent ports.
1347 	 * None of the port is attached at this time, the link state of the
1348 	 * aggregation will not change.
1349 	 */
1350 	link_state_changed = aggr_grp_update_ports_mac(grp);
1351 	ASSERT(!link_state_changed);
1352 
1353 	/* update outbound load balancing policy */
1354 	aggr_send_update_policy(grp, policy);
1355 
1356 	/* set LACP mode */
1357 	aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
1358 
1359 	/*
1360 	 * Attach each port if necessary.
1361 	 */
1362 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1363 		/*
1364 		 * Create the pseudo ring for each HW ring of the underlying
1365 		 * port. Note that this is done after the aggr registers the
1366 		 * mac.
1367 		 */
1368 		VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0);
1369 		VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0);
1370 		if (aggr_port_notify_link(grp, port))
1371 			link_state_changed = B_TRUE;
1372 
1373 		/*
1374 		 * Initialize the callback functions for this port.
1375 		 */
1376 		aggr_port_init_callbacks(port);
1377 	}
1378 
1379 	if (link_state_changed)
1380 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1381 
1382 	/* add new group to hash table */
1383 	err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
1384 	    (mod_hash_val_t)grp);
1385 	ASSERT(err == 0);
1386 	aggr_grp_cnt++;
1387 
1388 	mac_perim_exit(mph);
1389 	rw_exit(&aggr_grp_lock);
1390 	return (0);
1391 
1392 bail:
1393 
1394 	grp->lg_closing = B_TRUE;
1395 
1396 	port = grp->lg_ports;
1397 	while (port != NULL) {
1398 		aggr_port_t *cport;
1399 
1400 		cport = port->lp_next;
1401 		aggr_port_delete(port);
1402 		port = cport;
1403 	}
1404 
1405 	/*
1406 	 * Inform the lacp_rx thread to exit.
1407 	 */
1408 	mutex_enter(&grp->lg_lacp_lock);
1409 	grp->lg_lacp_done = B_TRUE;
1410 	cv_signal(&grp->lg_lacp_cv);
1411 	while (grp->lg_lacp_rx_thread != NULL)
1412 		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1413 	mutex_exit(&grp->lg_lacp_lock);
1414 	/*
1415 	 * Inform the tx_notify thread to exit.
1416 	 */
1417 	mutex_enter(&grp->lg_tx_flowctl_lock);
1418 	if (grp->lg_tx_notify_thread != NULL) {
1419 		tid = grp->lg_tx_notify_thread->t_did;
1420 		grp->lg_tx_notify_done = B_TRUE;
1421 		cv_signal(&grp->lg_tx_flowctl_cv);
1422 	}
1423 	mutex_exit(&grp->lg_tx_flowctl_lock);
1424 	if (tid != 0)
1425 		thread_join(tid);
1426 
1427 	kmem_free(grp->lg_tx_blocked_rings,
1428 	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1429 	rw_exit(&aggr_grp_lock);
1430 	AGGR_GRP_REFRELE(grp);
1431 	return (err);
1432 }
1433 
1434 /*
1435  * Return a pointer to the member of a group with specified linkid.
1436  */
1437 static aggr_port_t *
1438 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
1439 {
1440 	aggr_port_t *port;
1441 
1442 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1443 
1444 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1445 		if (port->lp_linkid == linkid)
1446 			break;
1447 	}
1448 
1449 	return (port);
1450 }
1451 
1452 /*
1453  * Stop, detach and remove a port from a link aggregation group.
1454  */
1455 static int
1456 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
1457     boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
1458 {
1459 	int rc = 0;
1460 	aggr_port_t **pport;
1461 	boolean_t mac_addr_changed = B_FALSE;
1462 	boolean_t link_state_changed = B_FALSE;
1463 	mac_perim_handle_t mph;
1464 	uint64_t val;
1465 	uint_t i;
1466 	uint_t stat;
1467 
1468 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1469 	ASSERT(grp->lg_nports > 1);
1470 	ASSERT(!grp->lg_closing);
1471 
1472 	/* unlink port */
1473 	for (pport = &grp->lg_ports; *pport != port;
1474 	    pport = &(*pport)->lp_next) {
1475 		if (*pport == NULL) {
1476 			rc = ENOENT;
1477 			goto done;
1478 		}
1479 	}
1480 	*pport = port->lp_next;
1481 
1482 	mac_perim_enter_by_mh(port->lp_mh, &mph);
1483 
1484 	/*
1485 	 * If the MAC address of the port being removed was assigned
1486 	 * to the group, update the group MAC address
1487 	 * using the MAC address of a different port.
1488 	 */
1489 	if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
1490 		/*
1491 		 * Set the MAC address of the group to the
1492 		 * MAC address of its first port.
1493 		 */
1494 		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1495 		grp->lg_mac_addr_port = grp->lg_ports;
1496 		mac_addr_changed = B_TRUE;
1497 	}
1498 
1499 	link_state_changed = aggr_grp_detach_port(grp, port);
1500 
1501 	/*
1502 	 * Add the counter statistics of the ports while it was aggregated
1503 	 * to the group's residual statistics.  This is done by obtaining
1504 	 * the current counter from the underlying MAC then subtracting the
1505 	 * value of the counter at the moment it was added to the
1506 	 * aggregation.
1507 	 */
1508 	for (i = 0; i < MAC_NSTAT; i++) {
1509 		stat = i + MAC_STAT_MIN;
1510 		if (!MAC_STAT_ISACOUNTER(stat))
1511 			continue;
1512 		val = aggr_port_stat(port, stat);
1513 		val -= port->lp_stat[i];
1514 		grp->lg_stat[i] += val;
1515 	}
1516 	for (i = 0; i < ETHER_NSTAT; i++) {
1517 		stat = i + MACTYPE_STAT_MIN;
1518 		if (!ETHER_STAT_ISACOUNTER(stat))
1519 			continue;
1520 		val = aggr_port_stat(port, stat);
1521 		val -= port->lp_ether_stat[i];
1522 		grp->lg_ether_stat[i] += val;
1523 	}
1524 
1525 	grp->lg_nports--;
1526 	mac_perim_exit(mph);
1527 
1528 	aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1529 	aggr_port_delete(port);
1530 
1531 	/*
1532 	 * If the group MAC address has changed, update the MAC address of
1533 	 * the remaining constituent ports according to the new MAC
1534 	 * address of the group.
1535 	 */
1536 	if (mac_addr_changed && aggr_grp_update_ports_mac(grp))
1537 		link_state_changed = B_TRUE;
1538 
1539 done:
1540 	if (mac_addr_changedp != NULL)
1541 		*mac_addr_changedp = mac_addr_changed;
1542 	if (link_state_changedp != NULL)
1543 		*link_state_changedp = link_state_changed;
1544 
1545 	return (rc);
1546 }
1547 
1548 /*
1549  * Remove one or more ports from an existing link aggregation group.
1550  */
1551 int
1552 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
1553 {
1554 	int rc = 0, i;
1555 	aggr_grp_t *grp = NULL;
1556 	aggr_port_t *port;
1557 	boolean_t mac_addr_update = B_FALSE, mac_addr_changed;
1558 	boolean_t link_state_update = B_FALSE, link_state_changed;
1559 	mac_perim_handle_t mph, pmph;
1560 
1561 	/* get group corresponding to linkid */
1562 	rw_enter(&aggr_grp_lock, RW_READER);
1563 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1564 	    (mod_hash_val_t *)&grp) != 0) {
1565 		rw_exit(&aggr_grp_lock);
1566 		return (ENOENT);
1567 	}
1568 	AGGR_GRP_REFHOLD(grp);
1569 
1570 	/*
1571 	 * Hold the perimeter so that the aggregation won't be destroyed.
1572 	 */
1573 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1574 	rw_exit(&aggr_grp_lock);
1575 
1576 	/* we need to keep at least one port per group */
1577 	if (nports >= grp->lg_nports) {
1578 		rc = EINVAL;
1579 		goto bail;
1580 	}
1581 
1582 	/* first verify that all the groups are valid */
1583 	for (i = 0; i < nports; i++) {
1584 		if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) {
1585 			/* port not found */
1586 			rc = ENOENT;
1587 			goto bail;
1588 		}
1589 	}
1590 
1591 	/* clear the promiscous mode for the specified ports */
1592 	for (i = 0; i < nports && rc == 0; i++) {
1593 		/* lookup port */
1594 		port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1595 		ASSERT(port != NULL);
1596 
1597 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1598 		rc = aggr_port_promisc(port, B_FALSE);
1599 		mac_perim_exit(pmph);
1600 	}
1601 	if (rc != 0) {
1602 		for (i = 0; i < nports; i++) {
1603 			port = aggr_grp_port_lookup(grp,
1604 			    ports[i].lp_linkid);
1605 			ASSERT(port != NULL);
1606 
1607 			/*
1608 			 * Turn the promiscuous mode back on if it is required
1609 			 * to receive the non-primary address over a port, or
1610 			 * the promiscous mode is enabled over the aggr.
1611 			 */
1612 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1613 			if (port->lp_started && (grp->lg_promisc ||
1614 			    port->lp_prom_addr != NULL)) {
1615 				(void) aggr_port_promisc(port, B_TRUE);
1616 			}
1617 			mac_perim_exit(pmph);
1618 		}
1619 		goto bail;
1620 	}
1621 
1622 	/* remove the specified ports from group */
1623 	for (i = 0; i < nports; i++) {
1624 		/* lookup port */
1625 		port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1626 		ASSERT(port != NULL);
1627 
1628 		/* stop port if group has already been started */
1629 		if (grp->lg_started) {
1630 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1631 			aggr_port_stop(port);
1632 			mac_perim_exit(pmph);
1633 		}
1634 
1635 		/*
1636 		 * aggr_rem_pseudo_tx_group() is not called here. Instead
1637 		 * it is called from inside aggr_grp_rem_port() after the
1638 		 * port has been detached. The reason is that
1639 		 * aggr_rem_pseudo_tx_group() removes one ring at a time
1640 		 * and if there is still traffic going on, then there
1641 		 * is the possibility of aggr_find_tx_ring() returning a
1642 		 * removed ring for transmission. Once the port has been
1643 		 * detached, that port will not be used and
1644 		 * aggr_find_tx_ring() will not return any rings
1645 		 * belonging to it.
1646 		 */
1647 		aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1648 
1649 		/* remove port from group */
1650 		rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
1651 		    &link_state_changed);
1652 		ASSERT(rc == 0);
1653 		mac_addr_update = mac_addr_update || mac_addr_changed;
1654 		link_state_update = link_state_update || link_state_changed;
1655 	}
1656 
1657 bail:
1658 	if (mac_addr_update)
1659 		mac_unicst_update(grp->lg_mh, grp->lg_addr);
1660 	if (link_state_update)
1661 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1662 
1663 	mac_perim_exit(mph);
1664 	AGGR_GRP_REFRELE(grp);
1665 
1666 	return (rc);
1667 }
1668 
1669 int
1670 aggr_grp_delete(datalink_id_t linkid, cred_t *cred)
1671 {
1672 	aggr_grp_t *grp = NULL;
1673 	aggr_port_t *port, *cport;
1674 	datalink_id_t tmpid;
1675 	mod_hash_val_t val;
1676 	mac_perim_handle_t mph, pmph;
1677 	int err;
1678 	kt_did_t tid = 0;
1679 
1680 	rw_enter(&aggr_grp_lock, RW_WRITER);
1681 
1682 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1683 	    (mod_hash_val_t *)&grp) != 0) {
1684 		rw_exit(&aggr_grp_lock);
1685 		return (ENOENT);
1686 	}
1687 
1688 	/*
1689 	 * Note that dls_devnet_destroy() must be called before lg_lock is
1690 	 * held. Otherwise, it will deadlock if another thread is in
1691 	 * aggr_m_stat() and thus has a kstat_hold() on the kstats that
1692 	 * dls_devnet_destroy() needs to delete.
1693 	 */
1694 	if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) {
1695 		rw_exit(&aggr_grp_lock);
1696 		return (err);
1697 	}
1698 	ASSERT(linkid == tmpid);
1699 
1700 	/*
1701 	 * Unregister from the MAC service module. Since this can
1702 	 * fail if a client hasn't closed the MAC port, we gracefully
1703 	 * fail the operation.
1704 	 */
1705 	if ((err = mac_disable(grp->lg_mh)) != 0) {
1706 		(void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred));
1707 		rw_exit(&aggr_grp_lock);
1708 		return (err);
1709 	}
1710 	(void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
1711 	ASSERT(grp == (aggr_grp_t *)val);
1712 
1713 	ASSERT(aggr_grp_cnt > 0);
1714 	aggr_grp_cnt--;
1715 	rw_exit(&aggr_grp_lock);
1716 
1717 	/*
1718 	 * Inform the lacp_rx thread to exit.
1719 	 */
1720 	mutex_enter(&grp->lg_lacp_lock);
1721 	grp->lg_lacp_done = B_TRUE;
1722 	cv_signal(&grp->lg_lacp_cv);
1723 	while (grp->lg_lacp_rx_thread != NULL)
1724 		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1725 	mutex_exit(&grp->lg_lacp_lock);
1726 	/*
1727 	 * Inform the tx_notify_thread to exit.
1728 	 */
1729 	mutex_enter(&grp->lg_tx_flowctl_lock);
1730 	if (grp->lg_tx_notify_thread != NULL) {
1731 		tid = grp->lg_tx_notify_thread->t_did;
1732 		grp->lg_tx_notify_done = B_TRUE;
1733 		cv_signal(&grp->lg_tx_flowctl_cv);
1734 	}
1735 	mutex_exit(&grp->lg_tx_flowctl_lock);
1736 	if (tid != 0)
1737 		thread_join(tid);
1738 
1739 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1740 
1741 	grp->lg_closing = B_TRUE;
1742 	/* detach and free MAC ports associated with group */
1743 	port = grp->lg_ports;
1744 	while (port != NULL) {
1745 		cport = port->lp_next;
1746 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1747 		if (grp->lg_started)
1748 			aggr_port_stop(port);
1749 		(void) aggr_grp_detach_port(grp, port);
1750 		mac_perim_exit(pmph);
1751 		aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1752 		aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1753 		aggr_port_delete(port);
1754 		port = cport;
1755 	}
1756 
1757 	mac_perim_exit(mph);
1758 
1759 	kmem_free(grp->lg_tx_blocked_rings,
1760 	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1761 	/*
1762 	 * Wait for the port's lacp timer thread and its notification callback
1763 	 * to exit before calling mac_unregister() since both needs to access
1764 	 * the mac perimeter of the grp.
1765 	 */
1766 	aggr_grp_port_wait(grp);
1767 
1768 	VERIFY(mac_unregister(grp->lg_mh) == 0);
1769 	grp->lg_mh = NULL;
1770 
1771 	AGGR_GRP_REFRELE(grp);
1772 	return (0);
1773 }
1774 
1775 void
1776 aggr_grp_free(aggr_grp_t *grp)
1777 {
1778 	ASSERT(grp->lg_refs == 0);
1779 	ASSERT(grp->lg_port_ref == 0);
1780 	if (grp->lg_key > AGGR_MAX_KEY) {
1781 		id_free(key_ids, grp->lg_key);
1782 		grp->lg_key = 0;
1783 	}
1784 	kmem_cache_free(aggr_grp_cache, grp);
1785 }
1786 
1787 int
1788 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
1789     aggr_grp_info_new_grp_fn_t new_grp_fn,
1790     aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred)
1791 {
1792 	aggr_grp_t	*grp;
1793 	aggr_port_t	*port;
1794 	mac_perim_handle_t mph, pmph;
1795 	int		rc = 0;
1796 
1797 	/*
1798 	 * Make sure that the aggregation link is visible from the caller's
1799 	 * zone.
1800 	 */
1801 	if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred)))
1802 		return (ENOENT);
1803 
1804 	rw_enter(&aggr_grp_lock, RW_READER);
1805 
1806 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1807 	    (mod_hash_val_t *)&grp) != 0) {
1808 		rw_exit(&aggr_grp_lock);
1809 		return (ENOENT);
1810 	}
1811 	AGGR_GRP_REFHOLD(grp);
1812 
1813 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1814 	rw_exit(&aggr_grp_lock);
1815 
1816 	rc = new_grp_fn(fn_arg, grp->lg_linkid,
1817 	    (grp->lg_key > AGGR_MAX_KEY) ? 0 : grp->lg_key, grp->lg_addr,
1818 	    grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy,
1819 	    grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);
1820 
1821 	if (rc != 0)
1822 		goto bail;
1823 
1824 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1825 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1826 		rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr,
1827 		    port->lp_state, &port->lp_lacp.ActorOperPortState);
1828 		mac_perim_exit(pmph);
1829 
1830 		if (rc != 0)
1831 			goto bail;
1832 	}
1833 
1834 bail:
1835 	mac_perim_exit(mph);
1836 	AGGR_GRP_REFRELE(grp);
1837 	return (rc);
1838 }
1839 
1840 /*ARGSUSED*/
1841 static void
1842 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
1843 {
1844 	miocnak(q, mp, 0, ENOTSUP);
1845 }
1846 
1847 static int
1848 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
1849 {
1850 	aggr_port_t	*port;
1851 	uint_t		stat_index;
1852 
1853 	/* We only aggregate counter statistics. */
1854 	if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) ||
1855 	    IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) {
1856 		return (ENOTSUP);
1857 	}
1858 
1859 	/*
1860 	 * Counter statistics for a group are computed by aggregating the
1861 	 * counters of the members MACs while they were aggregated, plus
1862 	 * the residual counter of the group itself, which is updated each
1863 	 * time a MAC is removed from the group.
1864 	 */
1865 	*val = 0;
1866 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1867 		/* actual port statistic */
1868 		*val += aggr_port_stat(port, stat);
1869 		/*
1870 		 * minus the port stat when it was added, plus any residual
1871 		 * amount for the group.
1872 		 */
1873 		if (IS_MAC_STAT(stat)) {
1874 			stat_index = stat - MAC_STAT_MIN;
1875 			*val -= port->lp_stat[stat_index];
1876 			*val += grp->lg_stat[stat_index];
1877 		} else if (IS_MACTYPE_STAT(stat)) {
1878 			stat_index = stat - MACTYPE_STAT_MIN;
1879 			*val -= port->lp_ether_stat[stat_index];
1880 			*val += grp->lg_ether_stat[stat_index];
1881 		}
1882 	}
1883 	return (0);
1884 }
1885 
1886 int
1887 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1888 {
1889 	aggr_pseudo_rx_ring_t   *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver;
1890 
1891 	if (rx_ring->arr_hw_rh != NULL) {
1892 		*val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat);
1893 	} else {
1894 		aggr_port_t	*port = rx_ring->arr_port;
1895 
1896 		*val = mac_stat_get(port->lp_mh, stat);
1897 
1898 	}
1899 	return (0);
1900 }
1901 
1902 int
1903 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1904 {
1905 	aggr_pseudo_tx_ring_t   *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver;
1906 
1907 	if (tx_ring->atr_hw_rh != NULL) {
1908 		*val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat);
1909 	} else {
1910 		aggr_port_t	*port = tx_ring->atr_port;
1911 
1912 		*val = mac_stat_get(port->lp_mh, stat);
1913 	}
1914 	return (0);
1915 }
1916 
1917 static int
1918 aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
1919 {
1920 	aggr_grp_t		*grp = arg;
1921 	mac_perim_handle_t	mph;
1922 	int			rval = 0;
1923 
1924 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1925 
1926 	switch (stat) {
1927 	case MAC_STAT_IFSPEED:
1928 		*val = grp->lg_ifspeed;
1929 		break;
1930 
1931 	case ETHER_STAT_LINK_DUPLEX:
1932 		*val = grp->lg_link_duplex;
1933 		break;
1934 
1935 	default:
1936 		/*
1937 		 * For all other statistics, we return the aggregated stat
1938 		 * from the underlying ports.  aggr_grp_stat() will set
1939 		 * rval appropriately if the statistic isn't a counter.
1940 		 */
1941 		rval = aggr_grp_stat(grp, stat, val);
1942 	}
1943 
1944 	mac_perim_exit(mph);
1945 	return (rval);
1946 }
1947 
1948 static int
1949 aggr_m_start(void *arg)
1950 {
1951 	aggr_grp_t *grp = arg;
1952 	aggr_port_t *port;
1953 	mac_perim_handle_t mph, pmph;
1954 
1955 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1956 
1957 	/*
1958 	 * Attempts to start all configured members of the group.
1959 	 * Group members will be attached when their link-up notification
1960 	 * is received.
1961 	 */
1962 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1963 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1964 		if (aggr_port_start(port) != 0) {
1965 			mac_perim_exit(pmph);
1966 			continue;
1967 		}
1968 
1969 		/*
1970 		 * Turn on the promiscuous mode if it is required to receive
1971 		 * the non-primary address over a port, or the promiscous
1972 		 * mode is enabled over the aggr.
1973 		 */
1974 		if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1975 			if (aggr_port_promisc(port, B_TRUE) != 0)
1976 				aggr_port_stop(port);
1977 		}
1978 		mac_perim_exit(pmph);
1979 	}
1980 
1981 	grp->lg_started = B_TRUE;
1982 
1983 	mac_perim_exit(mph);
1984 	return (0);
1985 }
1986 
1987 static void
1988 aggr_m_stop(void *arg)
1989 {
1990 	aggr_grp_t *grp = arg;
1991 	aggr_port_t *port;
1992 	mac_perim_handle_t mph, pmph;
1993 
1994 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1995 
1996 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1997 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1998 
1999 		/* reset port promiscuous mode */
2000 		(void) aggr_port_promisc(port, B_FALSE);
2001 
2002 		aggr_port_stop(port);
2003 		mac_perim_exit(pmph);
2004 	}
2005 
2006 	grp->lg_started = B_FALSE;
2007 	mac_perim_exit(mph);
2008 }
2009 
2010 static int
2011 aggr_m_promisc(void *arg, boolean_t on)
2012 {
2013 	aggr_grp_t *grp = arg;
2014 	aggr_port_t *port;
2015 	boolean_t link_state_changed = B_FALSE;
2016 	mac_perim_handle_t mph, pmph;
2017 
2018 	AGGR_GRP_REFHOLD(grp);
2019 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2020 
2021 	ASSERT(!grp->lg_closing);
2022 
2023 	if (on == grp->lg_promisc)
2024 		goto bail;
2025 
2026 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2027 		int	err = 0;
2028 
2029 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2030 		AGGR_PORT_REFHOLD(port);
2031 		if (!on && (port->lp_prom_addr == NULL))
2032 			err = aggr_port_promisc(port, B_FALSE);
2033 		else if (on && port->lp_started)
2034 			err = aggr_port_promisc(port, B_TRUE);
2035 
2036 		if (err != 0) {
2037 			if (aggr_grp_detach_port(grp, port))
2038 				link_state_changed = B_TRUE;
2039 		} else {
2040 			/*
2041 			 * If a port was detached because of a previous
2042 			 * failure changing the promiscuity, the port
2043 			 * is reattached when it successfully changes
2044 			 * the promiscuity now, and this might cause
2045 			 * the link state of the aggregation to change.
2046 			 */
2047 			if (aggr_grp_attach_port(grp, port))
2048 				link_state_changed = B_TRUE;
2049 		}
2050 		mac_perim_exit(pmph);
2051 		AGGR_PORT_REFRELE(port);
2052 	}
2053 
2054 	grp->lg_promisc = on;
2055 
2056 	if (link_state_changed)
2057 		mac_link_update(grp->lg_mh, grp->lg_link_state);
2058 
2059 bail:
2060 	mac_perim_exit(mph);
2061 	AGGR_GRP_REFRELE(grp);
2062 
2063 	return (0);
2064 }
2065 
2066 static void
2067 aggr_grp_port_rename(const char *new_name, void *arg)
2068 {
2069 	/*
2070 	 * aggr port's mac client name is the format of "aggr link name" plus
2071 	 * AGGR_PORT_NAME_DELIMIT plus "underneath link name".
2072 	 */
2073 	int aggr_len, link_len, clnt_name_len, i;
2074 	char *str_end, *str_st, *str_del;
2075 	char aggr_name[MAXNAMELEN];
2076 	char link_name[MAXNAMELEN];
2077 	char *clnt_name;
2078 	aggr_grp_t *aggr_grp = arg;
2079 	aggr_port_t *aggr_port = aggr_grp->lg_ports;
2080 
2081 	for (i = 0; i < aggr_grp->lg_nports; i++) {
2082 		clnt_name = mac_client_name(aggr_port->lp_mch);
2083 		clnt_name_len = strlen(clnt_name);
2084 		str_st = clnt_name;
2085 		str_end = &(clnt_name[clnt_name_len]);
2086 		str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
2087 		ASSERT(str_del != NULL);
2088 		aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
2089 		link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
2090 		bzero(aggr_name, MAXNAMELEN);
2091 		bzero(link_name, MAXNAMELEN);
2092 		bcopy(clnt_name, aggr_name, aggr_len);
2093 		bcopy(str_del, link_name, link_len + 1);
2094 		bzero(clnt_name, MAXNAMELEN);
2095 		(void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
2096 		    link_name);
2097 
2098 		(void) mac_rename_primary(aggr_port->lp_mh, NULL);
2099 		aggr_port = aggr_port->lp_next;
2100 	}
2101 }
2102 
2103 /*
2104  * Initialize the capabilities that are advertised for the group
2105  * according to the capabilities of the constituent ports.
2106  */
2107 static boolean_t
2108 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
2109 {
2110 	aggr_grp_t *grp = arg;
2111 
2112 	switch (cap) {
2113 	case MAC_CAPAB_HCKSUM: {
2114 		uint32_t *hcksum_txflags = cap_data;
2115 		*hcksum_txflags = grp->lg_hcksum_txflags;
2116 		break;
2117 	}
2118 	case MAC_CAPAB_LSO: {
2119 		mac_capab_lso_t *cap_lso = cap_data;
2120 
2121 		if (grp->lg_lso) {
2122 			*cap_lso = grp->lg_cap_lso;
2123 			break;
2124 		} else {
2125 			return (B_FALSE);
2126 		}
2127 	}
2128 	case MAC_CAPAB_NO_NATIVEVLAN:
2129 		return (!grp->lg_vlan);
2130 	case MAC_CAPAB_NO_ZCOPY:
2131 		return (!grp->lg_zcopy);
2132 	case MAC_CAPAB_RINGS: {
2133 		mac_capab_rings_t *cap_rings = cap_data;
2134 
2135 		if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2136 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2137 			cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt;
2138 
2139 			/*
2140 			 * An aggregation advertises only one (pseudo) RX
2141 			 * group, which virtualizes the main/primary group of
2142 			 * the underlying devices.
2143 			 */
2144 			cap_rings->mr_gnum = 1;
2145 			cap_rings->mr_gaddring = NULL;
2146 			cap_rings->mr_gremring = NULL;
2147 		} else {
2148 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2149 			cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2150 			cap_rings->mr_gnum = 0;
2151 		}
2152 		cap_rings->mr_rget = aggr_fill_ring;
2153 		cap_rings->mr_gget = aggr_fill_group;
2154 		break;
2155 	}
2156 	case MAC_CAPAB_AGGR:
2157 	{
2158 		mac_capab_aggr_t *aggr_cap;
2159 
2160 		if (cap_data != NULL) {
2161 			aggr_cap = cap_data;
2162 			aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2163 			aggr_cap->mca_unicst = aggr_m_unicst;
2164 			aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2165 			aggr_cap->mca_arg = arg;
2166 		}
2167 		return (B_TRUE);
2168 	}
2169 	default:
2170 		return (B_FALSE);
2171 	}
2172 	return (B_TRUE);
2173 }
2174 
2175 /*
2176  * Callback funtion for MAC layer to register groups.
2177  */
2178 static void
2179 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2180     mac_group_info_t *infop, mac_group_handle_t gh)
2181 {
2182 	aggr_grp_t *grp = arg;
2183 	aggr_pseudo_rx_group_t *rx_group;
2184 	aggr_pseudo_tx_group_t *tx_group;
2185 
2186 	ASSERT(index == 0);
2187 	if (rtype == MAC_RING_TYPE_RX) {
2188 		rx_group = &grp->lg_rx_group;
2189 		rx_group->arg_gh = gh;
2190 		rx_group->arg_grp = grp;
2191 
2192 		infop->mgi_driver = (mac_group_driver_t)rx_group;
2193 		infop->mgi_start = NULL;
2194 		infop->mgi_stop = NULL;
2195 		infop->mgi_addmac = aggr_addmac;
2196 		infop->mgi_remmac = aggr_remmac;
2197 		infop->mgi_count = rx_group->arg_ring_cnt;
2198 	} else {
2199 		tx_group = &grp->lg_tx_group;
2200 		tx_group->atg_gh = gh;
2201 	}
2202 }
2203 
2204 /*
2205  * Callback funtion for MAC layer to register all rings.
2206  */
2207 static void
2208 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2209     const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2210 {
2211 	aggr_grp_t	*grp = arg;
2212 
2213 	switch (rtype) {
2214 	case MAC_RING_TYPE_RX: {
2215 		aggr_pseudo_rx_group_t	*rx_group = &grp->lg_rx_group;
2216 		aggr_pseudo_rx_ring_t	*rx_ring;
2217 		mac_intr_t		aggr_mac_intr;
2218 
2219 		ASSERT(rg_index == 0);
2220 
2221 		ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt));
2222 		rx_ring = rx_group->arg_rings + index;
2223 		rx_ring->arr_rh = rh;
2224 
2225 		/*
2226 		 * Entrypoint to enable interrupt (disable poll) and
2227 		 * disable interrupt (enable poll).
2228 		 */
2229 		aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2230 		aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2231 		aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2232 		aggr_mac_intr.mi_ddi_handle = NULL;
2233 
2234 		infop->mri_driver = (mac_ring_driver_t)rx_ring;
2235 		infop->mri_start = aggr_pseudo_start_ring;
2236 		infop->mri_stop = aggr_pseudo_stop_ring;
2237 
2238 		infop->mri_intr = aggr_mac_intr;
2239 		infop->mri_poll = aggr_rx_poll;
2240 
2241 		infop->mri_stat = aggr_rx_ring_stat;
2242 		break;
2243 	}
2244 	case MAC_RING_TYPE_TX: {
2245 		aggr_pseudo_tx_group_t	*tx_group = &grp->lg_tx_group;
2246 		aggr_pseudo_tx_ring_t	*tx_ring;
2247 
2248 		ASSERT(rg_index == -1);
2249 		ASSERT(index < tx_group->atg_ring_cnt);
2250 
2251 		tx_ring = &tx_group->atg_rings[index];
2252 		tx_ring->atr_rh = rh;
2253 
2254 		infop->mri_driver = (mac_ring_driver_t)tx_ring;
2255 		infop->mri_start = NULL;
2256 		infop->mri_stop = NULL;
2257 		infop->mri_tx = aggr_ring_tx;
2258 		infop->mri_stat = aggr_tx_ring_stat;
2259 		/*
2260 		 * Use the hw TX ring handle to find if the ring needs
2261 		 * serialization or not. For NICs that do not expose
2262 		 * Tx rings, atr_hw_rh will be NULL.
2263 		 */
2264 		if (tx_ring->atr_hw_rh != NULL) {
2265 			infop->mri_flags =
2266 			    mac_hwring_getinfo(tx_ring->atr_hw_rh);
2267 		}
2268 		break;
2269 	}
2270 	default:
2271 		break;
2272 	}
2273 }
2274 
2275 static mblk_t *
2276 aggr_rx_poll(void *arg, int bytes_to_pickup)
2277 {
2278 	aggr_pseudo_rx_ring_t *rr_ring = arg;
2279 	aggr_port_t *port = rr_ring->arr_port;
2280 	aggr_grp_t *grp = port->lp_grp;
2281 	mblk_t *mp_chain, *mp, **mpp;
2282 
2283 	mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup);
2284 
2285 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
2286 		return (mp_chain);
2287 
2288 	mpp = &mp_chain;
2289 	while ((mp = *mpp) != NULL) {
2290 		if (MBLKL(mp) >= sizeof (struct ether_header)) {
2291 			struct ether_header *ehp;
2292 
2293 			ehp = (struct ether_header *)mp->b_rptr;
2294 			if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) {
2295 				*mpp = mp->b_next;
2296 				mp->b_next = NULL;
2297 				aggr_recv_lacp(port,
2298 				    (mac_resource_handle_t)rr_ring, mp);
2299 				continue;
2300 			}
2301 		}
2302 
2303 		if (!port->lp_collector_enabled) {
2304 			*mpp = mp->b_next;
2305 			mp->b_next = NULL;
2306 			freemsg(mp);
2307 			continue;
2308 		}
2309 		mpp = &mp->b_next;
2310 	}
2311 	return (mp_chain);
2312 }
2313 
2314 static int
2315 aggr_addmac(void *arg, const uint8_t *mac_addr)
2316 {
2317 	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)arg;
2318 	aggr_unicst_addr_t	*addr, **pprev;
2319 	aggr_grp_t		*grp = rx_group->arg_grp;
2320 	aggr_port_t		*port, *p;
2321 	mac_perim_handle_t	mph;
2322 	int			err = 0;
2323 
2324 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2325 
2326 	if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2327 		mac_perim_exit(mph);
2328 		return (0);
2329 	}
2330 
2331 	/*
2332 	 * Insert this mac address into the list of mac addresses owned by
2333 	 * the aggregation pseudo group.
2334 	 */
2335 	pprev = &rx_group->arg_macaddr;
2336 	while ((addr = *pprev) != NULL) {
2337 		if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2338 			mac_perim_exit(mph);
2339 			return (EEXIST);
2340 		}
2341 		pprev = &addr->aua_next;
2342 	}
2343 	addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2344 	bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2345 	addr->aua_next = NULL;
2346 	*pprev = addr;
2347 
2348 	for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2349 		if ((err = aggr_port_addmac(port, mac_addr)) != 0)
2350 			break;
2351 
2352 	if (err != 0) {
2353 		for (p = grp->lg_ports; p != port; p = p->lp_next)
2354 			aggr_port_remmac(p, mac_addr);
2355 
2356 		*pprev = NULL;
2357 		kmem_free(addr, sizeof (aggr_unicst_addr_t));
2358 	}
2359 
2360 	mac_perim_exit(mph);
2361 	return (err);
2362 }
2363 
2364 static int
2365 aggr_remmac(void *arg, const uint8_t *mac_addr)
2366 {
2367 	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)arg;
2368 	aggr_unicst_addr_t	*addr, **pprev;
2369 	aggr_grp_t		*grp = rx_group->arg_grp;
2370 	aggr_port_t		*port;
2371 	mac_perim_handle_t	mph;
2372 	int			err = 0;
2373 
2374 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2375 
2376 	if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2377 		mac_perim_exit(mph);
2378 		return (0);
2379 	}
2380 
2381 	/*
2382 	 * Insert this mac address into the list of mac addresses owned by
2383 	 * the aggregation pseudo group.
2384 	 */
2385 	pprev = &rx_group->arg_macaddr;
2386 	while ((addr = *pprev) != NULL) {
2387 		if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2388 			pprev = &addr->aua_next;
2389 			continue;
2390 		}
2391 		break;
2392 	}
2393 	if (addr == NULL) {
2394 		mac_perim_exit(mph);
2395 		return (EINVAL);
2396 	}
2397 
2398 	for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2399 		aggr_port_remmac(port, mac_addr);
2400 
2401 	*pprev = addr->aua_next;
2402 	kmem_free(addr, sizeof (aggr_unicst_addr_t));
2403 
2404 	mac_perim_exit(mph);
2405 	return (err);
2406 }
2407 
2408 /*
2409  * Add or remove the multicast addresses that are defined for the group
2410  * to or from the specified port.
2411  *
2412  * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port
2413  * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is
2414  * called when the port is either stopped or detached.
2415  */
2416 void
2417 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
2418 {
2419 	aggr_grp_t *grp = port->lp_grp;
2420 
2421 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
2422 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2423 
2424 	if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED)
2425 		return;
2426 
2427 	mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
2428 }
2429 
2430 static int
2431 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
2432 {
2433 	aggr_grp_t *grp = arg;
2434 	aggr_port_t *port = NULL, *errport = NULL;
2435 	mac_perim_handle_t mph;
2436 	int err = 0;
2437 
2438 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2439 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2440 		if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2441 		    !port->lp_started) {
2442 			continue;
2443 		}
2444 		err = aggr_port_multicst(port, add, addrp);
2445 		if (err != 0) {
2446 			errport = port;
2447 			break;
2448 		}
2449 	}
2450 
2451 	/*
2452 	 * At least one port caused error return and this error is returned to
2453 	 * mac, eventually a NAK would be sent upwards.
2454 	 * Some ports have this multicast address listed now, and some don't.
2455 	 * Treat this error as a whole aggr failure not individual port failure.
2456 	 * Therefore remove this multicast address from other ports.
2457 	 */
2458 	if ((err != 0) && add) {
2459 		for (port = grp->lg_ports; port != errport;
2460 		    port = port->lp_next) {
2461 			if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2462 			    !port->lp_started) {
2463 				continue;
2464 			}
2465 			(void) aggr_port_multicst(port, B_FALSE, addrp);
2466 		}
2467 	}
2468 	mac_perim_exit(mph);
2469 	return (err);
2470 }
2471 
2472 static int
2473 aggr_m_unicst(void *arg, const uint8_t *macaddr)
2474 {
2475 	aggr_grp_t *grp = arg;
2476 	mac_perim_handle_t mph;
2477 	int err;
2478 
2479 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2480 	err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
2481 	    0, 0);
2482 	mac_perim_exit(mph);
2483 	return (err);
2484 }
2485 
2486 /*
2487  * Initialize the capabilities that are advertised for the group
2488  * according to the capabilities of the constituent ports.
2489  */
2490 static void
2491 aggr_grp_capab_set(aggr_grp_t *grp)
2492 {
2493 	uint32_t cksum;
2494 	aggr_port_t *port;
2495 	mac_capab_lso_t cap_lso;
2496 
2497 	ASSERT(grp->lg_mh == NULL);
2498 	ASSERT(grp->lg_ports != NULL);
2499 
2500 	grp->lg_hcksum_txflags = (uint32_t)-1;
2501 	grp->lg_zcopy = B_TRUE;
2502 	grp->lg_vlan = B_TRUE;
2503 
2504 	grp->lg_lso = B_TRUE;
2505 	grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1;
2506 	grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1;
2507 
2508 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2509 		if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum))
2510 			cksum = 0;
2511 		grp->lg_hcksum_txflags &= cksum;
2512 
2513 		grp->lg_vlan &=
2514 		    !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL);
2515 
2516 		grp->lg_zcopy &=
2517 		    !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL);
2518 
2519 		grp->lg_lso &=
2520 		    mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso);
2521 		if (grp->lg_lso) {
2522 			grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags;
2523 			if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2524 			    cap_lso.lso_basic_tcp_ipv4.lso_max)
2525 				grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max =
2526 				    cap_lso.lso_basic_tcp_ipv4.lso_max;
2527 		}
2528 	}
2529 }
2530 
2531 /*
2532  * Checks whether the capabilities of the port being added are compatible
2533  * with the current capabilities of the aggregation.
2534  */
2535 static boolean_t
2536 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
2537 {
2538 	uint32_t hcksum_txflags;
2539 
2540 	ASSERT(grp->lg_ports != NULL);
2541 
2542 	if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) &
2543 	    grp->lg_vlan) != grp->lg_vlan) {
2544 		return (B_FALSE);
2545 	}
2546 
2547 	if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) &
2548 	    grp->lg_zcopy) != grp->lg_zcopy) {
2549 		return (B_FALSE);
2550 	}
2551 
2552 	if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) {
2553 		if (grp->lg_hcksum_txflags != 0)
2554 			return (B_FALSE);
2555 	} else if ((hcksum_txflags & grp->lg_hcksum_txflags) !=
2556 	    grp->lg_hcksum_txflags) {
2557 		return (B_FALSE);
2558 	}
2559 
2560 	if (grp->lg_lso) {
2561 		mac_capab_lso_t cap_lso;
2562 
2563 		if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) {
2564 			if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) !=
2565 			    grp->lg_cap_lso.lso_flags)
2566 				return (B_FALSE);
2567 			if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2568 			    cap_lso.lso_basic_tcp_ipv4.lso_max)
2569 				return (B_FALSE);
2570 		} else {
2571 			return (B_FALSE);
2572 		}
2573 	}
2574 
2575 	return (B_TRUE);
2576 }
2577 
2578 /*
2579  * Returns the maximum SDU according to the SDU of the constituent ports.
2580  */
2581 static uint_t
2582 aggr_grp_max_sdu(aggr_grp_t *grp)
2583 {
2584 	uint_t max_sdu = (uint_t)-1;
2585 	aggr_port_t *port;
2586 
2587 	ASSERT(grp->lg_ports != NULL);
2588 
2589 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2590 		uint_t port_sdu_max;
2591 
2592 		mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2593 		if (max_sdu > port_sdu_max)
2594 			max_sdu = port_sdu_max;
2595 	}
2596 
2597 	return (max_sdu);
2598 }
2599 
2600 /*
2601  * Checks if the maximum SDU of the specified port is compatible
2602  * with the maximum SDU of the specified aggregation group, returns
2603  * B_TRUE if it is, B_FALSE otherwise.
2604  */
2605 static boolean_t
2606 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port)
2607 {
2608 	uint_t port_sdu_max;
2609 
2610 	mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2611 	return (port_sdu_max >= grp->lg_max_sdu);
2612 }
2613 
2614 /*
2615  * Returns the maximum margin according to the margin of the constituent ports.
2616  */
2617 static uint32_t
2618 aggr_grp_max_margin(aggr_grp_t *grp)
2619 {
2620 	uint32_t margin = UINT32_MAX;
2621 	aggr_port_t *port;
2622 
2623 	ASSERT(grp->lg_mh == NULL);
2624 	ASSERT(grp->lg_ports != NULL);
2625 
2626 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2627 		if (margin > port->lp_margin)
2628 			margin = port->lp_margin;
2629 	}
2630 
2631 	grp->lg_margin = margin;
2632 	return (margin);
2633 }
2634 
2635 /*
2636  * Checks if the maximum margin of the specified port is compatible
2637  * with the maximum margin of the specified aggregation group, returns
2638  * B_TRUE if it is, B_FALSE otherwise.
2639  */
2640 static boolean_t
2641 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port)
2642 {
2643 	if (port->lp_margin >= grp->lg_margin)
2644 		return (B_TRUE);
2645 
2646 	/*
2647 	 * See whether the current margin value is allowed to be changed to
2648 	 * the new value.
2649 	 */
2650 	if (!mac_margin_update(grp->lg_mh, port->lp_margin))
2651 		return (B_FALSE);
2652 
2653 	grp->lg_margin = port->lp_margin;
2654 	return (B_TRUE);
2655 }
2656 
2657 /*
2658  * Set MTU on individual ports of an aggregation group
2659  */
2660 static int
2661 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu,
2662     uint32_t *old_mtu)
2663 {
2664 	boolean_t 		removed = B_FALSE;
2665 	mac_perim_handle_t	mph;
2666 	mac_diag_t		diag;
2667 	int			err, rv, retry = 0;
2668 
2669 	if (port->lp_mah != NULL) {
2670 		(void) mac_unicast_remove(port->lp_mch, port->lp_mah);
2671 		port->lp_mah = NULL;
2672 		removed = B_TRUE;
2673 	}
2674 	err = mac_set_mtu(port->lp_mh, sdu, old_mtu);
2675 try_again:
2676 	if (removed && (rv = mac_unicast_add(port->lp_mch, NULL,
2677 	    MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK,
2678 	    &port->lp_mah, 0, &diag)) != 0) {
2679 		/*
2680 		 * following is a workaround for a bug in 'bge' driver.
2681 		 * See CR 6794654 for more information and this work around
2682 		 * will be removed once the CR is fixed.
2683 		 */
2684 		if (rv == EIO && retry++ < 3) {
2685 			delay(2 * hz);
2686 			goto try_again;
2687 		}
2688 		/*
2689 		 * if mac_unicast_add() failed while setting the MTU,
2690 		 * detach the port from the group.
2691 		 */
2692 		mac_perim_enter_by_mh(port->lp_mh, &mph);
2693 		(void) aggr_grp_detach_port(grp, port);
2694 		mac_perim_exit(mph);
2695 		cmn_err(CE_WARN, "Unable to restart the port %s while "
2696 		    "setting MTU. Detaching the port from the aggregation.",
2697 		    mac_client_name(port->lp_mch));
2698 	}
2699 	return (err);
2700 }
2701 
2702 static int
2703 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu)
2704 {
2705 	int			err = 0, i, rv;
2706 	aggr_port_t		*port;
2707 	uint32_t		*mtu;
2708 
2709 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2710 
2711 	/*
2712 	 * If the MTU being set is equal to aggr group's maximum
2713 	 * allowable value, then there is nothing to change
2714 	 */
2715 	if (sdu == grp->lg_max_sdu)
2716 		return (0);
2717 
2718 	/* 0 is aggr group's min sdu */
2719 	if (sdu == 0)
2720 		return (EINVAL);
2721 
2722 	mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP);
2723 	for (port = grp->lg_ports, i = 0; port != NULL && err == 0;
2724 	    port = port->lp_next, i++) {
2725 		err = aggr_set_port_sdu(grp, port, sdu, mtu + i);
2726 	}
2727 	if (err != 0) {
2728 		/* recover from error: reset the mtus of the ports */
2729 		aggr_port_t *tmp;
2730 
2731 		for (tmp = grp->lg_ports, i = 0; tmp != port;
2732 		    tmp = tmp->lp_next, i++) {
2733 			(void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL);
2734 		}
2735 		goto bail;
2736 	}
2737 	grp->lg_max_sdu = aggr_grp_max_sdu(grp);
2738 	rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu);
2739 	ASSERT(rv == 0);
2740 bail:
2741 	kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports);
2742 	return (err);
2743 }
2744 
2745 /*
2746  * Callback functions for set/get of properties
2747  */
2748 /*ARGSUSED*/
2749 static int
2750 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
2751     uint_t pr_valsize, const void *pr_val)
2752 {
2753 	int 		err = ENOTSUP;
2754 	aggr_grp_t 	*grp = m_driver;
2755 
2756 	switch (pr_num) {
2757 	case MAC_PROP_MTU: {
2758 		uint32_t 	mtu;
2759 
2760 		if (pr_valsize < sizeof (mtu)) {
2761 			err = EINVAL;
2762 			break;
2763 		}
2764 		bcopy(pr_val, &mtu, sizeof (mtu));
2765 		err = aggr_sdu_update(grp, mtu);
2766 		break;
2767 	}
2768 	default:
2769 		break;
2770 	}
2771 	return (err);
2772 }
2773 
2774 typedef struct rboundary {
2775 	uint32_t	bval;
2776 	int		btype;
2777 } rboundary_t;
2778 
2779 /*
2780  * This function finds the intersection of mtu ranges stored in arrays -
2781  * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval.
2782  * Individual arrays are assumed to contain non-overlapping ranges.
2783  * Algorithm:
2784  *   A range has two boundaries - min and max. We scan all arrays and store
2785  * each boundary as a separate element in a temporary array. We also store
2786  * the boundary types, min or max, as +1 or -1 respectively in the temporary
2787  * array. Then we sort the temporary array in ascending order. We scan the
2788  * sorted array from lower to higher values and keep a cumulative sum of
2789  * boundary types. Element in the temporary array for which the sum reaches
2790  * mcount is a min boundary of a range in the result and next element will be
2791  * max boundary.
2792  *
2793  * Example for mcount = 3,
2794  *
2795  *  ----|_________|-------|_______|----|__|------ mrange[0]
2796  *
2797  *  -------|________|--|____________|-----|___|-- mrange[1]
2798  *
2799  *  --------|________________|-------|____|------ mrange[2]
2800  *
2801  *                                      3 2 1
2802  *                                       \|/
2803  *      1  23     2 1  2  3  2    1 01 2  V   0  <- the sum
2804  *  ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array
2805  *
2806  *                                 same min and max
2807  *                                        V
2808  *  --------|_____|-------|__|------------|------ intersecting ranges
2809  */
2810 void
2811 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount,
2812     mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount)
2813 {
2814 	mac_propval_uint32_range_t	*rval, *ur;
2815 	int				rmaxcnt, rcount;
2816 	size_t				sz_range32;
2817 	rboundary_t			*ta; /* temporary array */
2818 	rboundary_t			temp;
2819 	boolean_t			range_started = B_FALSE;
2820 	int				i, j, m, sum;
2821 
2822 	sz_range32 = sizeof (mac_propval_uint32_range_t);
2823 
2824 	for (i = 0, rmaxcnt = 0; i < mcount; i++)
2825 		rmaxcnt += mrange[i]->mpr_count;
2826 
2827 	/* Allocate enough space to store the results */
2828 	rval = kmem_alloc(rmaxcnt * sz_range32, KM_SLEEP);
2829 
2830 	/* Number of boundaries are twice as many as ranges */
2831 	ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP);
2832 
2833 	for (i = 0, m = 0; i < mcount; i++) {
2834 		ur = &(mrange[i]->mpr_range_uint32[0]);
2835 		for (j = 0; j < mrange[i]->mpr_count; j++) {
2836 			ta[m].bval = ur[j].mpur_min;
2837 			ta[m++].btype = 1;
2838 			ta[m].bval = ur[j].mpur_max;
2839 			ta[m++].btype = -1;
2840 		}
2841 	}
2842 
2843 	/*
2844 	 * Sort the temporary array in ascending order of bval;
2845 	 * if boundary values are same then sort on btype.
2846 	 */
2847 	for (i = 0; i < m-1; i++) {
2848 		for (j = i+1; j < m; j++) {
2849 			if ((ta[i].bval > ta[j].bval) ||
2850 			    ((ta[i].bval == ta[j].bval) &&
2851 			    (ta[i].btype < ta[j].btype))) {
2852 				temp = ta[i];
2853 				ta[i] = ta[j];
2854 				ta[j] = temp;
2855 			}
2856 		}
2857 	}
2858 
2859 	/* Walk through temporary array to find all ranges in the results */
2860 	for (i = 0, sum = 0, rcount = 0; i < m; i++) {
2861 		sum += ta[i].btype;
2862 		if (sum == mcount) {
2863 			rval[rcount].mpur_min = ta[i].bval;
2864 			range_started = B_TRUE;
2865 		} else if (sum < mcount && range_started) {
2866 			rval[rcount++].mpur_max = ta[i].bval;
2867 			range_started = B_FALSE;
2868 		}
2869 	}
2870 
2871 	*prval = rval;
2872 	*prmaxcnt = rmaxcnt;
2873 	*prcount = rcount;
2874 
2875 	kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t));
2876 }
2877 
2878 /*
2879  * Returns the mtu ranges which could be supported by aggr group.
2880  * prmaxcnt returns the size of the buffer prval, prcount returns
2881  * the number of valid entries in prval. Caller is responsible
2882  * for freeing up prval.
2883  */
2884 int
2885 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval,
2886     int *prmaxcnt, int *prcount)
2887 {
2888 	mac_propval_range_t		**vals;
2889 	aggr_port_t			*port;
2890 	mac_perim_handle_t		mph;
2891 	uint_t 				i, numr;
2892 	int 				err = 0;
2893 	size_t				sz_propval, sz_range32;
2894 	size_t				size;
2895 
2896 	sz_propval = sizeof (mac_propval_range_t);
2897 	sz_range32 = sizeof (mac_propval_uint32_range_t);
2898 
2899 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2900 
2901 	vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports,
2902 	    KM_SLEEP);
2903 
2904 	for (port = grp->lg_ports, i = 0; port != NULL;
2905 	    port = port->lp_next, i++) {
2906 
2907 		size = sz_propval;
2908 		vals[i] = kmem_alloc(size, KM_SLEEP);
2909 		vals[i]->mpr_count = 1;
2910 
2911 		mac_perim_enter_by_mh(port->lp_mh, &mph);
2912 
2913 		err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
2914 		    NULL, 0, vals[i], NULL);
2915 		if (err == ENOSPC) {
2916 			/*
2917 			 * Not enough space to hold all ranges.
2918 			 * Allocate extra space as indicated and retry.
2919 			 */
2920 			numr = vals[i]->mpr_count;
2921 			kmem_free(vals[i], sz_propval);
2922 			size = sz_propval + (numr - 1) * sz_range32;
2923 			vals[i] = kmem_alloc(size, KM_SLEEP);
2924 			vals[i]->mpr_count = numr;
2925 			err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
2926 			    NULL, 0, vals[i], NULL);
2927 			ASSERT(err != ENOSPC);
2928 		}
2929 		mac_perim_exit(mph);
2930 		if (err != 0) {
2931 			kmem_free(vals[i], size);
2932 			vals[i] = NULL;
2933 			break;
2934 		}
2935 	}
2936 
2937 	/*
2938 	 * if any of the underlying ports does not support changing MTU then
2939 	 * just return ENOTSUP
2940 	 */
2941 	if (port != NULL) {
2942 		ASSERT(err != 0);
2943 		goto done;
2944 	}
2945 
2946 	aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt,
2947 	    prcount);
2948 
2949 done:
2950 	for (i = 0; i < grp->lg_nports; i++) {
2951 		if (vals[i] != NULL) {
2952 			numr = vals[i]->mpr_count;
2953 			size = sz_propval + (numr - 1) * sz_range32;
2954 			kmem_free(vals[i], size);
2955 		}
2956 	}
2957 
2958 	kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports);
2959 	return (err);
2960 }
2961 
2962 static void
2963 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
2964     mac_prop_info_handle_t prh)
2965 {
2966 	aggr_grp_t			*grp = m_driver;
2967 	mac_propval_uint32_range_t	*rval = NULL;
2968 	int				i, rcount, rmaxcnt;
2969 	int				err = 0;
2970 
2971 	_NOTE(ARGUNUSED(pr_name));
2972 
2973 	switch (pr_num) {
2974 	case MAC_PROP_MTU:
2975 
2976 		err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt,
2977 		    &rcount);
2978 		if (err != 0) {
2979 			ASSERT(rval == NULL);
2980 			return;
2981 		}
2982 		for (i = 0; i < rcount; i++) {
2983 			mac_prop_info_set_range_uint32(prh,
2984 			    rval[i].mpur_min, rval[i].mpur_max);
2985 		}
2986 		kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt);
2987 		break;
2988 	}
2989 }
2990