xref: /illumos-gate/usr/src/uts/sun4v/io/vsw_switching.c (revision 257873cfc1dd3337766407f80397db60a56f2f5a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/errno.h>
29 #include <sys/debug.h>
30 #include <sys/time.h>
31 #include <sys/sysmacros.h>
32 #include <sys/systm.h>
33 #include <sys/user.h>
34 #include <sys/stropts.h>
35 #include <sys/stream.h>
36 #include <sys/strlog.h>
37 #include <sys/strsubr.h>
38 #include <sys/cmn_err.h>
39 #include <sys/cpu.h>
40 #include <sys/kmem.h>
41 #include <sys/conf.h>
42 #include <sys/ddi.h>
43 #include <sys/sunddi.h>
44 #include <sys/ksynch.h>
45 #include <sys/stat.h>
46 #include <sys/kstat.h>
47 #include <sys/vtrace.h>
48 #include <sys/strsun.h>
49 #include <sys/dlpi.h>
50 #include <sys/ethernet.h>
51 #include <net/if.h>
52 #include <sys/varargs.h>
53 #include <sys/machsystm.h>
54 #include <sys/modctl.h>
55 #include <sys/modhash.h>
56 #include <sys/mac.h>
57 #include <sys/mac_ether.h>
58 #include <sys/taskq.h>
59 #include <sys/note.h>
60 #include <sys/mach_descrip.h>
61 #include <sys/mac.h>
62 #include <sys/mdeg.h>
63 #include <sys/ldc.h>
64 #include <sys/vsw_fdb.h>
65 #include <sys/vsw.h>
66 #include <sys/vio_mailbox.h>
67 #include <sys/vnet_mailbox.h>
68 #include <sys/vnet_common.h>
69 #include <sys/vio_util.h>
70 #include <sys/sdt.h>
71 #include <sys/atomic.h>
72 #include <sys/vlan.h>
73 
74 /* Switching setup routines */
75 void vsw_setup_switching_timeout(void *arg);
76 void vsw_stop_switching_timeout(vsw_t *vswp);
77 int vsw_setup_switching(vsw_t *);
78 void vsw_setup_layer2_post_process(vsw_t *vswp);
79 void vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller,
80     vsw_port_t *port, mac_resource_handle_t mrh);
81 static	int vsw_setup_layer2(vsw_t *);
82 static	int vsw_setup_layer3(vsw_t *);
83 
84 /* Switching/data transmit routines */
85 static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
86 	vsw_port_t *port, mac_resource_handle_t);
87 static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
88 	vsw_port_t *port, mac_resource_handle_t);
89 static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp,
90 	int caller, vsw_port_t *port);
91 static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp,
92     int caller, vsw_port_t *port);
93 
94 /* VLAN routines */
95 void vsw_create_vlans(void *arg, int type);
96 void vsw_destroy_vlans(void *arg, int type);
97 void vsw_vlan_add_ids(void *arg, int type);
98 void vsw_vlan_remove_ids(void *arg, int type);
99 static	void vsw_vlan_create_hash(void *arg, int type);
100 static	void vsw_vlan_destroy_hash(void *arg, int type);
101 boolean_t vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
102 	uint16_t *vidp);
103 mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
104 uint32_t vsw_vlan_frames_untag(void *arg, int type, mblk_t **np, mblk_t **npt);
105 boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
106 
107 /* Forwarding database (FDB) routines */
108 void vsw_fdbe_add(vsw_t *vswp, void *port);
109 void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
110 static	vsw_fdbe_t *vsw_fdbe_find(vsw_t *vswp, struct ether_addr *);
111 static void vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val);
112 
113 int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
114 int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
115 int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
116 void vsw_del_mcst_vsw(vsw_t *);
117 
118 /* Support functions */
119 static mblk_t *vsw_dupmsgchain(mblk_t *mp);
120 static uint32_t vsw_get_same_dest_list(struct ether_header *ehp,
121     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
122 
123 
124 /*
125  * Functions imported from other files.
126  */
127 extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
128 extern mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t);
129 extern int vsw_mac_open(vsw_t *vswp);
130 extern void vsw_mac_close(vsw_t *vswp);
131 extern void vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh,
132     mblk_t *mp, vsw_macrx_flags_t flags);
133 extern void vsw_set_addrs(vsw_t *vswp);
134 extern int vsw_get_hw_maddr(vsw_t *);
135 extern int vsw_mac_attach(vsw_t *vswp);
136 extern int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt,
137 	uint32_t count);
138 extern void vsw_hio_init(vsw_t *vswp);
139 extern void vsw_hio_start_ports(vsw_t *vswp);
140 
141 /*
142  * Tunables used in this file.
143  */
144 extern	int vsw_setup_switching_delay;
145 extern	uint32_t vsw_vlan_nchains;
146 extern	uint32_t vsw_fdbe_refcnt_delay;
147 
/*
 * Take a reference on the FDB entry 'p': atomically increment its refcnt
 * and assert that the resulting count is non-zero.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement which must be terminated by the caller's semicolon. A
 * bare { } block would break when used as the unbraced body of an if that
 * is followed by an else.
 */
#define	VSW_FDBE_REFHOLD(p)						\
do {									\
	atomic_inc_32(&(p)->refcnt);					\
	ASSERT((p)->refcnt != 0);					\
} while (0)
153 
/*
 * Drop a reference on the FDB entry 'p': assert that a reference is
 * currently held (refcnt is non-zero) and atomically decrement refcnt.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement which must be terminated by the caller's semicolon. A
 * bare { } block would break when used as the unbraced body of an if that
 * is followed by an else.
 */
#define	VSW_FDBE_REFRELE(p)						\
do {									\
	ASSERT((p)->refcnt != 0);					\
	atomic_dec_32(&(p)->refcnt);					\
} while (0)
159 
160 /*
161  * Timeout routine to setup switching mode:
162  * vsw_setup_switching() is invoked from vsw_attach() or vsw_update_md_prop()
163  * initially. If it fails and the error is EAGAIN, then this timeout handler
164  * is started to retry vsw_setup_switching(). vsw_setup_switching() is retried
165  * until we successfully finish it; or the returned error is not EAGAIN.
166  */
167 void
168 vsw_setup_switching_timeout(void *arg)
169 {
170 	vsw_t		*vswp = (vsw_t *)arg;
171 	int		rv;
172 
173 	if (vswp->swtmout_enabled == B_FALSE)
174 		return;
175 
176 	rv = vsw_setup_switching(vswp);
177 
178 	if (rv == 0) {
179 		vsw_setup_layer2_post_process(vswp);
180 	}
181 
182 	mutex_enter(&vswp->swtmout_lock);
183 
184 	if (rv == EAGAIN && vswp->swtmout_enabled == B_TRUE) {
185 		/*
186 		 * Reschedule timeout() if the error is EAGAIN and the
187 		 * timeout is still enabled. For errors other than EAGAIN,
188 		 * we simply return without rescheduling timeout().
189 		 */
190 		vswp->swtmout_id =
191 		    timeout(vsw_setup_switching_timeout, vswp,
192 		    (vsw_setup_switching_delay * drv_usectohz(MICROSEC)));
193 		goto exit;
194 	}
195 
196 	/* timeout handler completed */
197 	vswp->swtmout_enabled = B_FALSE;
198 	vswp->swtmout_id = 0;
199 
200 exit:
201 	mutex_exit(&vswp->swtmout_lock);
202 }
203 
204 /*
205  * Cancel the timeout handler to setup switching mode.
206  */
207 void
208 vsw_stop_switching_timeout(vsw_t *vswp)
209 {
210 	timeout_id_t tid;
211 
212 	mutex_enter(&vswp->swtmout_lock);
213 
214 	tid = vswp->swtmout_id;
215 
216 	if (tid != 0) {
217 		/* signal timeout handler to stop */
218 		vswp->swtmout_enabled = B_FALSE;
219 		vswp->swtmout_id = 0;
220 		mutex_exit(&vswp->swtmout_lock);
221 
222 		(void) untimeout(tid);
223 	} else {
224 		mutex_exit(&vswp->swtmout_lock);
225 	}
226 
227 	(void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE);
228 
229 	WRITE_ENTER(&vswp->mac_rwlock);
230 	vswp->mac_open_retries = 0;
231 	RW_EXIT(&vswp->mac_rwlock);
232 }
233 
234 /*
235  * Setup the required switching mode.
236  * This routine is invoked from vsw_attach() or vsw_update_md_prop()
237  * initially. If it fails and the error is EAGAIN, then a timeout handler
238  * is started to retry vsw_setup_switching(), until it successfully finishes;
239  * or the returned error is not EAGAIN.
240  *
241  * Returns:
242  *  0 on success.
243  *  EAGAIN if retry is needed.
244  *  1 on all other failures.
245  */
246 int
247 vsw_setup_switching(vsw_t *vswp)
248 {
249 	int	i, rv = 1;
250 
251 	D1(vswp, "%s: enter", __func__);
252 
253 	/*
254 	 * Select best switching mode.
255 	 * Note that we start from the saved smode_idx. This is done as
256 	 * this routine can be called from the timeout handler to retry
257 	 * setting up a specific mode. Currently only the function which
258 	 * sets up layer2/promisc mode returns EAGAIN if the underlying
259 	 * physical device is not available yet, causing retries.
260 	 */
261 	for (i = vswp->smode_idx; i < vswp->smode_num; i++) {
262 		vswp->smode_idx = i;
263 		switch (vswp->smode[i]) {
264 		case VSW_LAYER2:
265 		case VSW_LAYER2_PROMISC:
266 			rv = vsw_setup_layer2(vswp);
267 			break;
268 
269 		case VSW_LAYER3:
270 			rv = vsw_setup_layer3(vswp);
271 			break;
272 
273 		default:
274 			DERR(vswp, "unknown switch mode");
275 			break;
276 		}
277 
278 		if ((rv == 0) || (rv == EAGAIN))
279 			break;
280 
281 		/* all other errors(rv != 0): continue & select the next mode */
282 		rv = 1;
283 	}
284 
285 	if (rv && (rv != EAGAIN)) {
286 		cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
287 		    "switching mode", vswp->instance);
288 	} else if (rv == 0) {
289 		(void) atomic_swap_32(&vswp->switching_setup_done, B_TRUE);
290 	}
291 
292 	D2(vswp, "%s: Operating in mode %d", __func__,
293 	    vswp->smode[vswp->smode_idx]);
294 
295 	D1(vswp, "%s: exit", __func__);
296 
297 	return (rv);
298 }
299 
300 /*
301  * Setup for layer 2 switching.
302  *
303  * Returns:
304  *  0 on success.
305  *  EAGAIN if retry is needed.
306  *  EIO on all other failures.
307  */
308 static int
309 vsw_setup_layer2(vsw_t *vswp)
310 {
311 	int	rv;
312 
313 	D1(vswp, "%s: enter", __func__);
314 
315 	vswp->vsw_switch_frame = vsw_switch_l2_frame;
316 
317 	rv = strlen(vswp->physname);
318 	if (rv == 0) {
319 		/*
320 		 * Physical device name is NULL, which is
321 		 * required for layer 2.
322 		 */
323 		cmn_err(CE_WARN, "!vsw%d: no physical device name specified",
324 		    vswp->instance);
325 		return (EIO);
326 	}
327 
328 	WRITE_ENTER(&vswp->mac_rwlock);
329 
330 	rv = vsw_mac_open(vswp);
331 	if (rv != 0) {
332 		if (rv != EAGAIN) {
333 			cmn_err(CE_WARN, "!vsw%d: Unable to open physical "
334 			    "device: %s\n", vswp->instance, vswp->physname);
335 		}
336 		RW_EXIT(&vswp->mac_rwlock);
337 		return (rv);
338 	}
339 
340 	if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) {
341 		/*
342 		 * Verify that underlying device can support multiple
343 		 * unicast mac addresses.
344 		 */
345 		rv = vsw_get_hw_maddr(vswp);
346 		if (rv != 0) {
347 			goto exit_error;
348 		}
349 	}
350 
351 	/*
352 	 * Attempt to link into the MAC layer so we can get
353 	 * and send packets out over the physical adapter.
354 	 */
355 	rv = vsw_mac_attach(vswp);
356 	if (rv != 0) {
357 		/*
358 		 * Registration with the MAC layer has failed,
359 		 * so return error so that can fall back to next
360 		 * prefered switching method.
361 		 */
362 		cmn_err(CE_WARN, "!vsw%d: Unable to setup physical device: "
363 		    "%s\n", vswp->instance, vswp->physname);
364 		goto exit_error;
365 	}
366 
367 	D1(vswp, "%s: exit", __func__);
368 
369 	RW_EXIT(&vswp->mac_rwlock);
370 
371 	/* Initialize HybridIO related stuff */
372 	vsw_hio_init(vswp);
373 	return (0);
374 
375 exit_error:
376 	vsw_mac_close(vswp);
377 	RW_EXIT(&vswp->mac_rwlock);
378 	return (EIO);
379 }
380 
381 static int
382 vsw_setup_layer3(vsw_t *vswp)
383 {
384 	D1(vswp, "%s: enter", __func__);
385 
386 	D2(vswp, "%s: operating in layer 3 mode", __func__);
387 	vswp->vsw_switch_frame = vsw_switch_l3_frame;
388 
389 	D1(vswp, "%s: exit", __func__);
390 
391 	return (0);
392 }
393 
/*
 * No-op frame switching routine: the chain of frames passed in mp is freed
 * without being forwarded anywhere. The parameter list is the same as that
 * of vsw_switch_l2_frame() and vsw_switch_l3_frame(); every argument other
 * than mp is ignored.
 */
/* ARGSUSED */
void
vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *port,
			mac_resource_handle_t mrh)
{
	freemsgchain(mp);
}
401 
402 /*
403  * Switch the given ethernet frame when operating in layer 2 mode.
404  *
405  * vswp: pointer to the vsw instance
406  * mp: pointer to chain of ethernet frame(s) to be switched
407  * caller: identifies the source of this frame as:
408  * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
409  *		2. VSW_PHYSDEV - the physical ethernet device
410  *		3. VSW_LOCALDEV - vsw configured as a virtual interface
411  * arg: argument provided by the caller.
412  *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
413  *		2. for PHYSDEV - NULL
414  *		3. for LOCALDEV - pointer to to this vsw_t(self)
415  */
416 void
417 vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
418 			vsw_port_t *arg, mac_resource_handle_t mrh)
419 {
420 	struct ether_header	*ehp;
421 	mblk_t			*bp, *ret_m;
422 	mblk_t			*mpt = NULL;
423 	uint32_t		count;
424 	vsw_fdbe_t		*fp;
425 
426 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
427 
428 	/*
429 	 * PERF: rather than breaking up the chain here, scan it
430 	 * to find all mblks heading to same destination and then
431 	 * pass that sub-chain to the lower transmit functions.
432 	 */
433 
434 	/* process the chain of packets */
435 	bp = mp;
436 	while (bp) {
437 		ehp = (struct ether_header *)bp->b_rptr;
438 		count = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp);
439 		ASSERT(count != 0);
440 
441 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
442 		    __func__, MBLKSIZE(mp), MBLKL(mp));
443 
444 		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
445 			/*
446 			 * If destination is VSW_LOCALDEV (vsw as an eth
447 			 * interface) and if the device is up & running,
448 			 * send the packet up the stack on this host.
449 			 * If the virtual interface is down, drop the packet.
450 			 */
451 			if (caller != VSW_LOCALDEV) {
452 				vsw_mac_rx(vswp, mrh, mp, VSW_MACRX_FREEMSG);
453 			} else {
454 				freemsgchain(mp);
455 			}
456 			continue;
457 		}
458 
459 		/*
460 		 * Find fdb entry for the destination
461 		 * and hold a reference to it.
462 		 */
463 		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
464 		if (fp != NULL) {
465 
466 			/*
467 			 * If plumbed and in promisc mode then copy msg
468 			 * and send up the stack.
469 			 */
470 			vsw_mac_rx(vswp, mrh, mp,
471 			    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
472 
473 			/*
474 			 * If the destination is in FDB, the packet
475 			 * should be forwarded to the correponding
476 			 * vsw_port (connected to a vnet device -
477 			 * VSW_VNETPORT)
478 			 */
479 			(void) vsw_portsend(fp->portp, mp, mpt, count);
480 
481 			/* Release the reference on the fdb entry */
482 			VSW_FDBE_REFRELE(fp);
483 		} else {
484 			/*
485 			 * Destination not in FDB.
486 			 *
487 			 * If the destination is broadcast or
488 			 * multicast forward the packet to all
489 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
490 			 * except the caller.
491 			 */
492 			if (IS_BROADCAST(ehp)) {
493 				D2(vswp, "%s: BROADCAST pkt", __func__);
494 				(void) vsw_forward_all(vswp, mp, caller, arg);
495 			} else if (IS_MULTICAST(ehp)) {
496 				D2(vswp, "%s: MULTICAST pkt", __func__);
497 				(void) vsw_forward_grp(vswp, mp, caller, arg);
498 			} else {
499 				/*
500 				 * If the destination is unicast, and came
501 				 * from either a logical network device or
502 				 * the switch itself when it is plumbed, then
503 				 * send it out on the physical device and also
504 				 * up the stack if the logical interface is
505 				 * in promiscious mode.
506 				 *
507 				 * NOTE:  The assumption here is that if we
508 				 * cannot find the destination in our fdb, its
509 				 * a unicast address, and came from either a
510 				 * vnet or down the stack (when plumbed) it
511 				 * must be destinded for an ethernet device
512 				 * outside our ldoms.
513 				 */
514 				if (caller == VSW_VNETPORT) {
515 					/* promisc check copy etc */
516 					vsw_mac_rx(vswp, mrh, mp,
517 					    VSW_MACRX_PROMISC |
518 					    VSW_MACRX_COPYMSG);
519 
520 					if ((ret_m = vsw_tx_msg(vswp, mp))
521 					    != NULL) {
522 						DERR(vswp, "%s: drop mblks to "
523 						    "phys dev", __func__);
524 						freemsgchain(ret_m);
525 					}
526 
527 				} else if (caller == VSW_PHYSDEV) {
528 					/*
529 					 * Pkt seen because card in promisc
530 					 * mode. Send up stack if plumbed in
531 					 * promisc mode, else drop it.
532 					 */
533 					vsw_mac_rx(vswp, mrh, mp,
534 					    VSW_MACRX_PROMISC |
535 					    VSW_MACRX_FREEMSG);
536 
537 				} else if (caller == VSW_LOCALDEV) {
538 					/*
539 					 * Pkt came down the stack, send out
540 					 * over physical device.
541 					 */
542 					if ((ret_m = vsw_tx_msg(vswp, mp))
543 					    != NULL) {
544 						DERR(vswp, "%s: drop mblks to "
545 						    "phys dev", __func__);
546 						freemsgchain(ret_m);
547 					}
548 				}
549 			}
550 		}
551 	}
552 	D1(vswp, "%s: exit\n", __func__);
553 }
554 
555 /*
556  * Switch ethernet frame when in layer 3 mode (i.e. using IP
557  * layer to do the routing).
558  *
559  * There is a large amount of overlap between this function and
560  * vsw_switch_l2_frame. At some stage we need to revisit and refactor
561  * both these functions.
562  */
563 void
564 vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
565 			vsw_port_t *arg, mac_resource_handle_t mrh)
566 {
567 	struct ether_header	*ehp;
568 	mblk_t			*bp = NULL;
569 	mblk_t			*mpt;
570 	uint32_t		count;
571 	vsw_fdbe_t		*fp;
572 
573 	D1(vswp, "%s: enter (caller %d)", __func__, caller);
574 
575 	/*
576 	 * In layer 3 mode should only ever be switching packets
577 	 * between IP layer and vnet devices. So make sure thats
578 	 * who is invoking us.
579 	 */
580 	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
581 		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
582 		freemsgchain(mp);
583 		return;
584 	}
585 
586 	/* process the chain of packets */
587 	bp = mp;
588 	while (bp) {
589 		ehp = (struct ether_header *)bp->b_rptr;
590 		count = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp);
591 		ASSERT(count != 0);
592 
593 		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
594 		    __func__, MBLKSIZE(mp), MBLKL(mp));
595 
596 		/*
597 		 * Find fdb entry for the destination
598 		 * and hold a reference to it.
599 		 */
600 		fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
601 		if (fp != NULL) {
602 
603 			D2(vswp, "%s: sending to target port", __func__);
604 			(void) vsw_portsend(fp->portp, mp, mpt, count);
605 
606 			/* Release the reference on the fdb entry */
607 			VSW_FDBE_REFRELE(fp);
608 		} else {
609 			/*
610 			 * Destination not in FDB
611 			 *
612 			 * If the destination is broadcast or
613 			 * multicast forward the packet to all
614 			 * (VNETPORTs, PHYSDEV, LOCALDEV),
615 			 * except the caller.
616 			 */
617 			if (IS_BROADCAST(ehp)) {
618 				D2(vswp, "%s: BROADCAST pkt", __func__);
619 				(void) vsw_forward_all(vswp, mp, caller, arg);
620 			} else if (IS_MULTICAST(ehp)) {
621 				D2(vswp, "%s: MULTICAST pkt", __func__);
622 				(void) vsw_forward_grp(vswp, mp, caller, arg);
623 			} else {
624 				/*
625 				 * Unicast pkt from vnet that we don't have
626 				 * an FDB entry for, so must be destinded for
627 				 * the outside world. Attempt to send up to the
628 				 * IP layer to allow it to deal with it.
629 				 */
630 				if (caller == VSW_VNETPORT) {
631 					vsw_mac_rx(vswp, mrh,
632 					    mp, VSW_MACRX_FREEMSG);
633 				}
634 			}
635 		}
636 	}
637 
638 	D1(vswp, "%s: exit", __func__);
639 }
640 
641 /*
642  * Setup mac addrs and hio resources for layer 2 switching only.
643  */
644 void
645 vsw_setup_layer2_post_process(vsw_t *vswp)
646 {
647 	if ((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
648 	    (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) {
649 		/*
650 		 * Program unicst, mcst addrs of vsw
651 		 * interface and ports in the physdev.
652 		 */
653 		vsw_set_addrs(vswp);
654 
655 		/* Start HIO for ports that have already connected */
656 		vsw_hio_start_ports(vswp);
657 	}
658 }
659 
660 /*
661  * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
662  * except the caller (port on which frame arrived).
663  */
664 static int
665 vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
666 {
667 	vsw_port_list_t	*plist = &vswp->plist;
668 	vsw_port_t	*portp;
669 	mblk_t		*nmp = NULL;
670 	mblk_t		*ret_m = NULL;
671 	int		skip_port = 0;
672 
673 	D1(vswp, "vsw_forward_all: enter\n");
674 
675 	/*
676 	 * Broadcast message from inside ldoms so send to outside
677 	 * world if in either of layer 2 modes.
678 	 */
679 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
680 	    (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
681 	    ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
682 
683 		nmp = vsw_dupmsgchain(mp);
684 		if (nmp) {
685 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
686 				DERR(vswp, "%s: dropping pkt(s) "
687 				    "consisting of %ld bytes of data for"
688 				    " physical device", __func__, MBLKL(ret_m));
689 				freemsgchain(ret_m);
690 			}
691 		}
692 	}
693 
694 	if (caller == VSW_VNETPORT)
695 		skip_port = 1;
696 
697 	/*
698 	 * Broadcast message from other vnet (layer 2 or 3) or outside
699 	 * world (layer 2 only), send up stack if plumbed.
700 	 */
701 	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
702 		vsw_mac_rx(vswp, NULL, mp, VSW_MACRX_COPYMSG);
703 	}
704 
705 	/* send it to all VNETPORTs */
706 	READ_ENTER(&plist->lockrw);
707 	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
708 		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
709 		/*
710 		 * Caution ! - don't reorder these two checks as arg
711 		 * will be NULL if the caller is PHYSDEV. skip_port is
712 		 * only set if caller is VNETPORT.
713 		 */
714 		if ((skip_port) && (portp == arg)) {
715 			continue;
716 		} else {
717 			nmp = vsw_dupmsgchain(mp);
718 			if (nmp) {
719 				mblk_t	*mpt = nmp;
720 				uint32_t count = 1;
721 
722 				/* Find tail */
723 				while (mpt->b_next != NULL) {
724 					mpt = mpt->b_next;
725 					count++;
726 				}
727 				/*
728 				 * The plist->lockrw is protecting the
729 				 * portp from getting destroyed here.
730 				 * So, no ref_cnt is incremented here.
731 				 */
732 				(void) vsw_portsend(portp, nmp, mpt, count);
733 			} else {
734 				DERR(vswp, "vsw_forward_all: nmp NULL");
735 			}
736 		}
737 	}
738 	RW_EXIT(&plist->lockrw);
739 
740 	freemsgchain(mp);
741 
742 	D1(vswp, "vsw_forward_all: exit\n");
743 	return (0);
744 }
745 
746 /*
747  * Forward pkts to any devices or interfaces which have registered
748  * an interest in them (i.e. multicast groups).
749  */
750 static int
751 vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
752 {
753 	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
754 	mfdb_ent_t		*entp = NULL;
755 	mfdb_ent_t		*tpp = NULL;
756 	vsw_port_t 		*port;
757 	uint64_t		key = 0;
758 	mblk_t			*nmp = NULL;
759 	mblk_t			*ret_m = NULL;
760 	boolean_t		check_if = B_TRUE;
761 
762 	/*
763 	 * Convert address to hash table key
764 	 */
765 	KEY_HASH(key, &ehp->ether_dhost);
766 
767 	D1(vswp, "%s: key 0x%llx", __func__, key);
768 
769 	/*
770 	 * If pkt came from either a vnet or down the stack (if we are
771 	 * plumbed) and we are in layer 2 mode, then we send the pkt out
772 	 * over the physical adapter, and then check to see if any other
773 	 * vnets are interested in it.
774 	 */
775 	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
776 	    (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
777 	    ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
778 		nmp = vsw_dupmsgchain(mp);
779 		if (nmp) {
780 			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
781 				DERR(vswp, "%s: dropping pkt(s) consisting of "
782 				    "%ld bytes of data for physical device",
783 				    __func__, MBLKL(ret_m));
784 				freemsgchain(ret_m);
785 			}
786 		}
787 	}
788 
789 	READ_ENTER(&vswp->mfdbrw);
790 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
791 	    (mod_hash_val_t *)&entp) != 0) {
792 		D3(vswp, "%s: no table entry found for addr 0x%llx",
793 		    __func__, key);
794 	} else {
795 		/*
796 		 * Send to list of devices associated with this address...
797 		 */
798 		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
799 
800 			/* dont send to ourselves */
801 			if ((caller == VSW_VNETPORT) &&
802 			    (tpp->d_addr == (void *)arg)) {
803 				port = (vsw_port_t *)tpp->d_addr;
804 				D3(vswp, "%s: not sending to ourselves"
805 				    " : port %d", __func__, port->p_instance);
806 				continue;
807 
808 			} else if ((caller == VSW_LOCALDEV) &&
809 			    (tpp->d_type == VSW_LOCALDEV)) {
810 				D2(vswp, "%s: not sending back up stack",
811 				    __func__);
812 				continue;
813 			}
814 
815 			if (tpp->d_type == VSW_VNETPORT) {
816 				port = (vsw_port_t *)tpp->d_addr;
817 				D3(vswp, "%s: sending to port %ld for addr "
818 				    "0x%llx", __func__, port->p_instance, key);
819 
820 				nmp = vsw_dupmsgchain(mp);
821 				if (nmp) {
822 					mblk_t	*mpt = nmp;
823 					uint32_t count = 1;
824 
825 					/* Find tail */
826 					while (mpt->b_next != NULL) {
827 						mpt = mpt->b_next;
828 						count++;
829 					}
830 					/*
831 					 * The vswp->mfdbrw is protecting the
832 					 * portp from getting destroyed here.
833 					 * So, no ref_cnt is incremented here.
834 					 */
835 					(void) vsw_portsend(port, nmp, mpt,
836 					    count);
837 				}
838 			} else {
839 				vsw_mac_rx(vswp, NULL,
840 				    mp, VSW_MACRX_COPYMSG);
841 				D2(vswp, "%s: sending up stack"
842 				    " for addr 0x%llx", __func__, key);
843 				check_if = B_FALSE;
844 			}
845 		}
846 	}
847 
848 	RW_EXIT(&vswp->mfdbrw);
849 
850 	/*
851 	 * If the pkt came from either a vnet or from physical device,
852 	 * and if we havent already sent the pkt up the stack then we
853 	 * check now if we can/should (i.e. the interface is plumbed
854 	 * and in promisc mode).
855 	 */
856 	if ((check_if) &&
857 	    ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
858 		vsw_mac_rx(vswp, NULL, mp,
859 		    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
860 	}
861 
862 	freemsgchain(mp);
863 
864 	D1(vswp, "%s: exit", __func__);
865 
866 	return (0);
867 }
868 
/*
 * This function creates the vlan id hash table for the given vsw device or
 * port. It then adds each vlan that the device or port has been assigned
 * (its pvid and its vids), into this hash table.
 * Arguments:
 *   arg:  vsw device or port.
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 *         For any other type neither step does anything.
 */
void
vsw_create_vlans(void *arg, int type)
{
	/* create vlan hash table; must exist before the ids are inserted */
	vsw_vlan_create_hash(arg, type);

	/* add vlan ids of the vsw device or port into its hash table */
	vsw_vlan_add_ids(arg, type);
}
886 
/*
 * This function removes the vlan ids of the vsw device or port from its hash
 * table. It then destroys the vlan hash table.
 * Arguments:
 *   arg:  vsw device or port.
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 *         For any other type neither step does anything.
 */
void
vsw_destroy_vlans(void *arg, int type)
{
	/* remove vlan ids from the hash table while it still exists */
	vsw_vlan_remove_ids(arg, type);

	/* destroy vlan-hash-table */
	vsw_vlan_destroy_hash(arg, type);
}
903 
904 /*
905  * Create a vlan-id hash table for the given vsw device or port.
906  */
907 static void
908 vsw_vlan_create_hash(void *arg, int type)
909 {
910 	char		hashname[MAXNAMELEN];
911 
912 	if (type == VSW_LOCALDEV) {
913 		vsw_t		*vswp = (vsw_t *)arg;
914 
915 		(void) snprintf(hashname, MAXNAMELEN, "vsw%d-vlan-hash",
916 		    vswp->instance);
917 
918 		vswp->vlan_nchains = vsw_vlan_nchains;
919 		vswp->vlan_hashp = mod_hash_create_idhash(hashname,
920 		    vswp->vlan_nchains, mod_hash_null_valdtor);
921 
922 	} else if (type == VSW_VNETPORT) {
923 		vsw_port_t	*portp = (vsw_port_t *)arg;
924 
925 		(void) snprintf(hashname, MAXNAMELEN, "port%d-vlan-hash",
926 		    portp->p_instance);
927 
928 		portp->vlan_nchains = vsw_vlan_nchains;
929 		portp->vlan_hashp = mod_hash_create_idhash(hashname,
930 		    portp->vlan_nchains, mod_hash_null_valdtor);
931 
932 	} else {
933 		return;
934 	}
935 }
936 
937 /*
938  * Destroy the vlan-id hash table for the given vsw device or port.
939  */
940 static void
941 vsw_vlan_destroy_hash(void *arg, int type)
942 {
943 	if (type == VSW_LOCALDEV) {
944 		vsw_t		*vswp = (vsw_t *)arg;
945 
946 		mod_hash_destroy_hash(vswp->vlan_hashp);
947 		vswp->vlan_nchains = 0;
948 	} else if (type == VSW_VNETPORT) {
949 		vsw_port_t	*portp = (vsw_port_t *)arg;
950 
951 		mod_hash_destroy_hash(portp->vlan_hashp);
952 		portp->vlan_nchains = 0;
953 	} else {
954 		return;
955 	}
956 }
957 
958 /*
959  * Add vlan ids of the given vsw device or port into its hash table.
960  */
961 void
962 vsw_vlan_add_ids(void *arg, int type)
963 {
964 	int	rv;
965 	int	i;
966 
967 	if (type == VSW_LOCALDEV) {
968 		vsw_t		*vswp = (vsw_t *)arg;
969 
970 		rv = mod_hash_insert(vswp->vlan_hashp,
971 		    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
972 		    (mod_hash_val_t)B_TRUE);
973 		ASSERT(rv == 0);
974 
975 		for (i = 0; i < vswp->nvids; i++) {
976 			rv = mod_hash_insert(vswp->vlan_hashp,
977 			    (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i]),
978 			    (mod_hash_val_t)B_TRUE);
979 			ASSERT(rv == 0);
980 		}
981 
982 	} else if (type == VSW_VNETPORT) {
983 		vsw_port_t	*portp = (vsw_port_t *)arg;
984 
985 		rv = mod_hash_insert(portp->vlan_hashp,
986 		    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
987 		    (mod_hash_val_t)B_TRUE);
988 		ASSERT(rv == 0);
989 
990 		for (i = 0; i < portp->nvids; i++) {
991 			rv = mod_hash_insert(portp->vlan_hashp,
992 			    (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i]),
993 			    (mod_hash_val_t)B_TRUE);
994 			ASSERT(rv == 0);
995 		}
996 
997 	} else {
998 		return;
999 	}
1000 }
1001 
1002 /*
1003  * Remove vlan ids of the given vsw device or port from its hash table.
1004  */
1005 void
1006 vsw_vlan_remove_ids(void *arg, int type)
1007 {
1008 	mod_hash_val_t	vp;
1009 	int		rv;
1010 	int		i;
1011 
1012 	if (type == VSW_LOCALDEV) {
1013 		vsw_t		*vswp = (vsw_t *)arg;
1014 
1015 		rv = vsw_vlan_lookup(vswp->vlan_hashp, vswp->pvid);
1016 		if (rv == B_TRUE) {
1017 			rv = mod_hash_remove(vswp->vlan_hashp,
1018 			    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
1019 			    (mod_hash_val_t *)&vp);
1020 			ASSERT(rv == 0);
1021 		}
1022 
1023 		for (i = 0; i < vswp->nvids; i++) {
1024 			rv = vsw_vlan_lookup(vswp->vlan_hashp, vswp->vids[i]);
1025 			if (rv == B_TRUE) {
1026 				rv = mod_hash_remove(vswp->vlan_hashp,
1027 				    (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i]),
1028 				    (mod_hash_val_t *)&vp);
1029 				ASSERT(rv == 0);
1030 			}
1031 		}
1032 
1033 	} else if (type == VSW_VNETPORT) {
1034 		vsw_port_t	*portp = (vsw_port_t *)arg;
1035 
1036 		portp = (vsw_port_t *)arg;
1037 		rv = vsw_vlan_lookup(portp->vlan_hashp, portp->pvid);
1038 		if (rv == B_TRUE) {
1039 			rv = mod_hash_remove(portp->vlan_hashp,
1040 			    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
1041 			    (mod_hash_val_t *)&vp);
1042 			ASSERT(rv == 0);
1043 		}
1044 
1045 		for (i = 0; i < portp->nvids; i++) {
1046 			rv = vsw_vlan_lookup(portp->vlan_hashp, portp->vids[i]);
1047 			if (rv == B_TRUE) {
1048 				rv = mod_hash_remove(portp->vlan_hashp,
1049 				    (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i]),
1050 				    (mod_hash_val_t *)&vp);
1051 				ASSERT(rv == 0);
1052 			}
1053 		}
1054 
1055 	} else {
1056 		return;
1057 	}
1058 }
1059 
1060 /*
1061  * Find the given vlan id in the hash table.
1062  * Return: B_TRUE if the id is found; B_FALSE if not found.
1063  */
1064 boolean_t
1065 vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid)
1066 {
1067 	int		rv;
1068 	mod_hash_val_t	vp;
1069 
1070 	rv = mod_hash_find(vlan_hashp, VLAN_ID_KEY(vid), (mod_hash_val_t *)&vp);
1071 
1072 	if (rv != 0)
1073 		return (B_FALSE);
1074 
1075 	return (B_TRUE);
1076 }
1077 
1078 /*
1079  * Add an entry into FDB for the given vsw.
1080  */
1081 void
1082 vsw_fdbe_add(vsw_t *vswp, void *port)
1083 {
1084 	uint64_t	addr = 0;
1085 	vsw_port_t	*portp;
1086 	vsw_fdbe_t	*fp;
1087 	int		rv;
1088 
1089 	portp = (vsw_port_t *)port;
1090 	KEY_HASH(addr, &portp->p_macaddr);
1091 
1092 	fp = kmem_zalloc(sizeof (vsw_fdbe_t), KM_SLEEP);
1093 	fp->portp = port;
1094 
1095 	/*
1096 	 * Note: duplicate keys will be rejected by mod_hash.
1097 	 */
1098 	rv = mod_hash_insert(vswp->fdb_hashp, (mod_hash_key_t)addr,
1099 	    (mod_hash_val_t)fp);
1100 	ASSERT(rv == 0);
1101 }
1102 
1103 /*
1104  * Remove an entry from FDB.
1105  */
1106 void
1107 vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr)
1108 {
1109 	uint64_t	addr = 0;
1110 	vsw_fdbe_t	*fp;
1111 	int		rv;
1112 
1113 	KEY_HASH(addr, eaddr);
1114 
1115 	/*
1116 	 * Remove the entry from fdb hash table.
1117 	 * This prevents further references to this fdb entry.
1118 	 */
1119 	rv = mod_hash_remove(vswp->fdb_hashp, (mod_hash_key_t)addr,
1120 	    (mod_hash_val_t *)&fp);
1121 	if (rv != 0) {
1122 		/* invalid key? */
1123 		return;
1124 	}
1125 
1126 	/*
1127 	 * If there are threads already ref holding before the entry was
1128 	 * removed from hash table, then wait for ref count to drop to zero.
1129 	 */
1130 	while (fp->refcnt != 0) {
1131 		delay(drv_usectohz(vsw_fdbe_refcnt_delay));
1132 	}
1133 
1134 	kmem_free(fp, sizeof (*fp));
1135 }
1136 
1137 /*
1138  * Search fdb for a given mac address. If an entry is found, hold
1139  * a reference to it and return the entry, else returns NULL.
1140  */
1141 static vsw_fdbe_t *
1142 vsw_fdbe_find(vsw_t *vswp, struct ether_addr *addrp)
1143 {
1144 	uint64_t	key = 0;
1145 	vsw_fdbe_t	*fp;
1146 	int		rv;
1147 
1148 	KEY_HASH(key, addrp);
1149 
1150 	rv = mod_hash_find_cb(vswp->fdb_hashp, (mod_hash_key_t)key,
1151 	    (mod_hash_val_t *)&fp, vsw_fdbe_find_cb);
1152 
1153 	if (rv != 0)
1154 		return (NULL);
1155 
1156 	return (fp);
1157 }
1158 
1159 /*
1160  * Callback function provided to mod_hash_find_cb(). After finding the fdb
1161  * entry corresponding to the key (macaddr), this callback will be invoked by
1162  * mod_hash_find_cb() to atomically increment the reference count on the fdb
1163  * entry before returning the found entry.
1164  */
1165 static void
1166 vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val)
1167 {
1168 	_NOTE(ARGUNUSED(key))
1169 	VSW_FDBE_REFHOLD((vsw_fdbe_t *)val);
1170 }
1171 
1172 /*
1173  * A given frame must be always tagged with the appropriate vlan id (unless it
1174  * is in the default-vlan) before the mac address switching function is called.
1175  * Otherwise, after switching function determines the destination, we cannot
1176  * figure out if the destination belongs to the the same vlan that the frame
1177  * originated from and if it needs tag/untag. Frames which are inbound from
1178  * the external(physical) network over a vlan trunk link are always tagged.
1179  * However frames which are received from a vnet-port over ldc or frames which
1180  * are coming down the stack on the service domain over vsw interface may be
1181  * untagged. These frames must be tagged with the appropriate pvid of the
1182  * sender (vnet-port or vsw device), before invoking the switching function.
1183  *
1184  * Arguments:
1185  *   arg:    caller of the function.
1186  *   type:   type of arg(caller): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1187  *   mp:     frame(s) to be tagged.
1188  */
1189 mblk_t *
1190 vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp)
1191 {
1192 	vsw_t			*vswp;
1193 	vsw_port_t		*portp;
1194 	struct ether_header	*ehp;
1195 	mblk_t			*bp;
1196 	mblk_t			*bpt;
1197 	mblk_t			*bph;
1198 	mblk_t			*bpn;
1199 	uint16_t		pvid;
1200 
1201 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1202 
1203 	if (type == VSW_LOCALDEV) {
1204 		vswp = (vsw_t *)arg;
1205 		pvid = vswp->pvid;
1206 		portp = NULL;
1207 	} else {
1208 		/* VSW_VNETPORT */
1209 		portp = (vsw_port_t *)arg;
1210 		pvid = portp->pvid;
1211 		vswp = portp->p_vswp;
1212 	}
1213 
1214 	bpn = bph = bpt = NULL;
1215 
1216 	for (bp = mp; bp != NULL; bp = bpn) {
1217 
1218 		bpn = bp->b_next;
1219 		bp->b_next = bp->b_prev = NULL;
1220 
1221 		/* Determine if it is an untagged frame */
1222 		ehp = (struct ether_header *)bp->b_rptr;
1223 
1224 		if (ehp->ether_type != ETHERTYPE_VLAN) {	/* untagged */
1225 
1226 			/* no need to tag if the frame is in default vlan */
1227 			if (pvid != vswp->default_vlan_id) {
1228 				bp = vnet_vlan_insert_tag(bp, pvid);
1229 				if (bp == NULL) {
1230 					continue;
1231 				}
1232 			}
1233 		}
1234 
1235 		/* build a chain of processed packets */
1236 		if (bph == NULL) {
1237 			bph = bpt = bp;
1238 		} else {
1239 			bpt->b_next = bp;
1240 			bpt = bp;
1241 		}
1242 
1243 	}
1244 
1245 	return (bph);
1246 }
1247 
1248 /*
1249  * Frames destined to a vnet-port or to the local vsw interface, must be
1250  * untagged if necessary before sending. This function first checks that the
1251  * frame can be sent to the destination in the vlan identified by the frame
1252  * tag. Note that when this function is invoked the frame must have been
1253  * already tagged (unless it is in the default-vlan). Because, this function is
1254  * called when the switching function determines the destination and invokes
1255  * its send function (vnet-port or vsw interface) and all frames would have
1256  * been tagged by this time (see comments in vsw_vlan_frame_pretag()).
1257  *
1258  * Arguments:
1259  *   arg:    destination device.
1260  *   type:   type of arg(destination): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
1261  *   np:     head of pkt chain to be validated and untagged.
1262  *   npt:    tail of pkt chain to be validated and untagged.
1263  *
1264  * Returns:
1265  *   np:     head of updated chain of packets
1266  *   npt:    tail of updated chain of packets
1267  *   rv:     count of any packets dropped
1268  */
1269 uint32_t
1270 vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt)
1271 {
1272 	mblk_t			*bp;
1273 	mblk_t			*bpt;
1274 	mblk_t			*bph;
1275 	mblk_t			*bpn;
1276 	vsw_port_t		*portp;
1277 	vsw_t			*vswp;
1278 	uint32_t		count;
1279 	struct ether_header	*ehp;
1280 	boolean_t		is_tagged;
1281 	boolean_t		rv;
1282 	uint16_t		vlan_id;
1283 	uint16_t		pvid;
1284 	mod_hash_t		*vlan_hashp;
1285 
1286 	ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
1287 
1288 	if (type == VSW_LOCALDEV) {
1289 		vswp = (vsw_t *)arg;
1290 		pvid = vswp->pvid;
1291 		vlan_hashp = vswp->vlan_hashp;
1292 		portp = NULL;
1293 	} else {
1294 		/* type == VSW_VNETPORT */
1295 		portp = (vsw_port_t *)arg;
1296 		vswp = portp->p_vswp;
1297 		vlan_hashp = portp->vlan_hashp;
1298 		pvid = portp->pvid;
1299 	}
1300 
1301 	bpn = bph = bpt = NULL;
1302 	count = 0;
1303 
1304 	for (bp = *np; bp != NULL; bp = bpn) {
1305 
1306 		bpn = bp->b_next;
1307 		bp->b_next = bp->b_prev = NULL;
1308 
1309 		/*
1310 		 * Determine the vlan id that the frame belongs to.
1311 		 */
1312 		ehp = (struct ether_header *)bp->b_rptr;
1313 		is_tagged = vsw_frame_lookup_vid(arg, type, ehp, &vlan_id);
1314 
1315 		/*
1316 		 * Check if the destination is in the same vlan.
1317 		 */
1318 		rv = vsw_vlan_lookup(vlan_hashp, vlan_id);
1319 		if (rv == B_FALSE) {
1320 			/* drop the packet */
1321 			freemsg(bp);
1322 			count++;
1323 			continue;
1324 		}
1325 
1326 		/*
1327 		 * Check the frame header if tag/untag is  needed.
1328 		 */
1329 		if (is_tagged == B_FALSE) {
1330 			/*
1331 			 * Untagged frame. We shouldn't have an untagged
1332 			 * packet at this point, unless the destination's
1333 			 * vlan id is default-vlan-id; if it is not the
1334 			 * default-vlan-id, we drop the packet.
1335 			 */
1336 			if (vlan_id != vswp->default_vlan_id) {
1337 				/* drop the packet */
1338 				freemsg(bp);
1339 				count++;
1340 				continue;
1341 			}
1342 		} else {
1343 			/*
1344 			 * Tagged frame, untag if it's the destination's pvid.
1345 			 */
1346 			if (vlan_id == pvid) {
1347 
1348 				bp = vnet_vlan_remove_tag(bp);
1349 				if (bp == NULL) {
1350 					/* packet dropped */
1351 					count++;
1352 					continue;
1353 				}
1354 			}
1355 		}
1356 
1357 		/* build a chain of processed packets */
1358 		if (bph == NULL) {
1359 			bph = bpt = bp;
1360 		} else {
1361 			bpt->b_next = bp;
1362 			bpt = bp;
1363 		}
1364 
1365 	}
1366 
1367 	*np = bph;
1368 	*npt = bpt;
1369 
1370 	return (count);
1371 }
1372 
1373 /*
1374  * Lookup the vlan id of the given frame. If it is a vlan-tagged frame,
1375  * then the vlan-id is available in the tag; otherwise, its vlan id is
1376  * implicitly obtained based on the caller (destination of the frame:
1377  * VSW_VNETPORT or VSW_LOCALDEV).
1378  * The vlan id determined is returned in vidp.
1379  * Returns: B_TRUE if it is a tagged frame; B_FALSE if it is untagged.
1380  */
1381 boolean_t
1382 vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
1383 	uint16_t *vidp)
1384 {
1385 	struct ether_vlan_header	*evhp;
1386 	vsw_t				*vswp;
1387 	vsw_port_t			*portp;
1388 
1389 	/* If it's a tagged frame, get the vid from vlan header */
1390 	if (ehp->ether_type == ETHERTYPE_VLAN) {
1391 
1392 		evhp = (struct ether_vlan_header *)ehp;
1393 		*vidp = VLAN_ID(ntohs(evhp->ether_tci));
1394 		return (B_TRUE);
1395 	}
1396 
1397 	/* Untagged frame; determine vlan id based on caller */
1398 	switch (caller) {
1399 
1400 	case VSW_VNETPORT:
1401 		/*
1402 		 * packet destined to a vnet; vlan-id is pvid of vnet-port.
1403 		 */
1404 		portp = (vsw_port_t *)arg;
1405 		*vidp = portp->pvid;
1406 		break;
1407 
1408 	case VSW_LOCALDEV:
1409 
1410 		/*
1411 		 * packet destined to vsw interface;
1412 		 * vlan-id is port-vlan-id of vsw device.
1413 		 */
1414 		vswp = (vsw_t *)arg;
1415 		*vidp = vswp->pvid;
1416 		break;
1417 	}
1418 
1419 	return (B_FALSE);
1420 }
1421 
1422 /*
1423  * Add or remove multicast address(es).
1424  *
1425  * Returns 0 on success, 1 on failure.
1426  */
1427 int
1428 vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
1429 {
1430 	mcst_addr_t		*mcst_p = NULL;
1431 	vsw_t			*vswp = port->p_vswp;
1432 	uint64_t		addr = 0x0;
1433 	int			i;
1434 
1435 	D1(vswp, "%s: enter", __func__);
1436 
1437 	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
1438 
1439 	for (i = 0; i < mcst_pkt->count; i++) {
1440 		/*
1441 		 * Convert address into form that can be used
1442 		 * as hash table key.
1443 		 */
1444 		KEY_HASH(addr, &(mcst_pkt->mca[i]));
1445 
1446 		/*
1447 		 * Add or delete the specified address/port combination.
1448 		 */
1449 		if (mcst_pkt->set == 0x1) {
1450 			D3(vswp, "%s: adding multicast address 0x%llx for "
1451 			    "port %ld", __func__, addr, port->p_instance);
1452 			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1453 				/*
1454 				 * Update the list of multicast
1455 				 * addresses contained within the
1456 				 * port structure to include this new
1457 				 * one.
1458 				 */
1459 				mcst_p = kmem_zalloc(sizeof (mcst_addr_t),
1460 				    KM_NOSLEEP);
1461 				if (mcst_p == NULL) {
1462 					DERR(vswp, "%s: unable to alloc mem",
1463 					    __func__);
1464 					(void) vsw_del_mcst(vswp,
1465 					    VSW_VNETPORT, addr, port);
1466 					return (1);
1467 				}
1468 
1469 				mcst_p->nextp = NULL;
1470 				mcst_p->addr = addr;
1471 				ether_copy(&mcst_pkt->mca[i], &mcst_p->mca);
1472 
1473 				/*
1474 				 * Program the address into HW. If the addr
1475 				 * has already been programmed then the MAC
1476 				 * just increments a ref counter (which is
1477 				 * used when the address is being deleted)
1478 				 */
1479 				WRITE_ENTER(&vswp->mac_rwlock);
1480 				if (vswp->mh != NULL) {
1481 					if (mac_multicst_add(vswp->mh,
1482 					    (uchar_t *)&mcst_pkt->mca[i])) {
1483 						RW_EXIT(&vswp->mac_rwlock);
1484 						cmn_err(CE_WARN, "!vsw%d: "
1485 						    "unable to add multicast "
1486 						    "address: %s\n",
1487 						    vswp->instance,
1488 						    ether_sprintf((void *)
1489 						    &mcst_p->mca));
1490 						(void) vsw_del_mcst(vswp,
1491 						    VSW_VNETPORT, addr, port);
1492 						kmem_free(mcst_p,
1493 						    sizeof (*mcst_p));
1494 						return (1);
1495 					}
1496 					mcst_p->mac_added = B_TRUE;
1497 				}
1498 				RW_EXIT(&vswp->mac_rwlock);
1499 
1500 				mutex_enter(&port->mca_lock);
1501 				mcst_p->nextp = port->mcap;
1502 				port->mcap = mcst_p;
1503 				mutex_exit(&port->mca_lock);
1504 
1505 			} else {
1506 				DERR(vswp, "%s: error adding multicast "
1507 				    "address 0x%llx for port %ld",
1508 				    __func__, addr, port->p_instance);
1509 				return (1);
1510 			}
1511 		} else {
1512 			/*
1513 			 * Delete an entry from the multicast hash
1514 			 * table and update the address list
1515 			 * appropriately.
1516 			 */
1517 			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
1518 				D3(vswp, "%s: deleting multicast address "
1519 				    "0x%llx for port %ld", __func__, addr,
1520 				    port->p_instance);
1521 
1522 				mcst_p = vsw_del_addr(VSW_VNETPORT, port, addr);
1523 				ASSERT(mcst_p != NULL);
1524 
1525 				/*
1526 				 * Remove the address from HW. The address
1527 				 * will actually only be removed once the ref
1528 				 * count within the MAC layer has dropped to
1529 				 * zero. I.e. we can safely call this fn even
1530 				 * if other ports are interested in this
1531 				 * address.
1532 				 */
1533 				WRITE_ENTER(&vswp->mac_rwlock);
1534 				if (vswp->mh != NULL && mcst_p->mac_added) {
1535 					if (mac_multicst_remove(vswp->mh,
1536 					    (uchar_t *)&mcst_pkt->mca[i])) {
1537 						RW_EXIT(&vswp->mac_rwlock);
1538 						cmn_err(CE_WARN, "!vsw%d: "
1539 						    "unable to remove mcast "
1540 						    "address: %s\n",
1541 						    vswp->instance,
1542 						    ether_sprintf((void *)
1543 						    &mcst_p->mca));
1544 						kmem_free(mcst_p,
1545 						    sizeof (*mcst_p));
1546 						return (1);
1547 					}
1548 					mcst_p->mac_added = B_FALSE;
1549 				}
1550 				RW_EXIT(&vswp->mac_rwlock);
1551 				kmem_free(mcst_p, sizeof (*mcst_p));
1552 
1553 			} else {
1554 				DERR(vswp, "%s: error deleting multicast "
1555 				    "addr 0x%llx for port %ld",
1556 				    __func__, addr, port->p_instance);
1557 				return (1);
1558 			}
1559 		}
1560 	}
1561 	D1(vswp, "%s: exit", __func__);
1562 	return (0);
1563 }
1564 
1565 /*
1566  * Add a new multicast entry.
1567  *
1568  * Search hash table based on address. If match found then
1569  * update associated val (which is chain of ports), otherwise
1570  * create new key/val (addr/port) pair and insert into table.
1571  */
1572 int
1573 vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1574 {
1575 	int		dup = 0;
1576 	int		rv = 0;
1577 	mfdb_ent_t	*ment = NULL;
1578 	mfdb_ent_t	*tmp_ent = NULL;
1579 	mfdb_ent_t	*new_ent = NULL;
1580 	void		*tgt = NULL;
1581 
1582 	if (devtype == VSW_VNETPORT) {
1583 		/*
1584 		 * Being invoked from a vnet.
1585 		 */
1586 		ASSERT(arg != NULL);
1587 		tgt = arg;
1588 		D2(NULL, "%s: port %d : address 0x%llx", __func__,
1589 		    ((vsw_port_t *)arg)->p_instance, addr);
1590 	} else {
1591 		/*
1592 		 * We are being invoked via the m_multicst mac entry
1593 		 * point.
1594 		 */
1595 		D2(NULL, "%s: address 0x%llx", __func__, addr);
1596 		tgt = (void *)vswp;
1597 	}
1598 
1599 	WRITE_ENTER(&vswp->mfdbrw);
1600 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1601 	    (mod_hash_val_t *)&ment) != 0) {
1602 
1603 		/* address not currently in table */
1604 		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1605 		ment->d_addr = (void *)tgt;
1606 		ment->d_type = devtype;
1607 		ment->nextp = NULL;
1608 
1609 		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
1610 		    (mod_hash_val_t)ment) != 0) {
1611 			DERR(vswp, "%s: hash table insertion failed", __func__);
1612 			kmem_free(ment, sizeof (mfdb_ent_t));
1613 			rv = 1;
1614 		} else {
1615 			D2(vswp, "%s: added initial entry for 0x%llx to "
1616 			    "table", __func__, addr);
1617 		}
1618 	} else {
1619 		/*
1620 		 * Address in table. Check to see if specified port
1621 		 * is already associated with the address. If not add
1622 		 * it now.
1623 		 */
1624 		tmp_ent = ment;
1625 		while (tmp_ent != NULL) {
1626 			if (tmp_ent->d_addr == (void *)tgt) {
1627 				if (devtype == VSW_VNETPORT) {
1628 					DERR(vswp, "%s: duplicate port entry "
1629 					    "found for portid %ld and key "
1630 					    "0x%llx", __func__,
1631 					    ((vsw_port_t *)arg)->p_instance,
1632 					    addr);
1633 				} else {
1634 					DERR(vswp, "%s: duplicate entry found"
1635 					    "for key 0x%llx", __func__, addr);
1636 				}
1637 				rv = 1;
1638 				dup = 1;
1639 				break;
1640 			}
1641 			tmp_ent = tmp_ent->nextp;
1642 		}
1643 
1644 		/*
1645 		 * Port not on list so add it to end now.
1646 		 */
1647 		if (0 == dup) {
1648 			D2(vswp, "%s: added entry for 0x%llx to table",
1649 			    __func__, addr);
1650 			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
1651 			new_ent->d_addr = (void *)tgt;
1652 			new_ent->d_type = devtype;
1653 			new_ent->nextp = NULL;
1654 
1655 			tmp_ent = ment;
1656 			while (tmp_ent->nextp != NULL)
1657 				tmp_ent = tmp_ent->nextp;
1658 
1659 			tmp_ent->nextp = new_ent;
1660 		}
1661 	}
1662 
1663 	RW_EXIT(&vswp->mfdbrw);
1664 	return (rv);
1665 }
1666 
1667 /*
1668  * Remove a multicast entry from the hashtable.
1669  *
1670  * Search hash table based on address. If match found, scan
1671  * list of ports associated with address. If specified port
1672  * found remove it from list.
1673  */
1674 int
1675 vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
1676 {
1677 	mfdb_ent_t	*ment = NULL;
1678 	mfdb_ent_t	*curr_p, *prev_p;
1679 	void		*tgt = NULL;
1680 
1681 	D1(vswp, "%s: enter", __func__);
1682 
1683 	if (devtype == VSW_VNETPORT) {
1684 		tgt = (vsw_port_t *)arg;
1685 		D2(vswp, "%s: removing port %d from mFDB for address"
1686 		    " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, addr);
1687 	} else {
1688 		D2(vswp, "%s: removing entry", __func__);
1689 		tgt = (void *)vswp;
1690 	}
1691 
1692 	WRITE_ENTER(&vswp->mfdbrw);
1693 	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
1694 	    (mod_hash_val_t *)&ment) != 0) {
1695 		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
1696 		RW_EXIT(&vswp->mfdbrw);
1697 		return (1);
1698 	}
1699 
1700 	prev_p = curr_p = ment;
1701 
1702 	while (curr_p != NULL) {
1703 		if (curr_p->d_addr == (void *)tgt) {
1704 			if (devtype == VSW_VNETPORT) {
1705 				D2(vswp, "%s: port %d found", __func__,
1706 				    ((vsw_port_t *)tgt)->p_instance);
1707 			} else {
1708 				D2(vswp, "%s: instance found", __func__);
1709 			}
1710 
1711 			if (prev_p == curr_p) {
1712 				/*
1713 				 * head of list, if no other element is in
1714 				 * list then destroy this entry, otherwise
1715 				 * just replace it with updated value.
1716 				 */
1717 				ment = curr_p->nextp;
1718 				if (ment == NULL) {
1719 					(void) mod_hash_destroy(vswp->mfdb,
1720 					    (mod_hash_val_t)addr);
1721 				} else {
1722 					(void) mod_hash_replace(vswp->mfdb,
1723 					    (mod_hash_key_t)addr,
1724 					    (mod_hash_val_t)ment);
1725 				}
1726 			} else {
1727 				/*
1728 				 * Not head of list, no need to do
1729 				 * replacement, just adjust list pointers.
1730 				 */
1731 				prev_p->nextp = curr_p->nextp;
1732 			}
1733 			break;
1734 		}
1735 
1736 		prev_p = curr_p;
1737 		curr_p = curr_p->nextp;
1738 	}
1739 
1740 	RW_EXIT(&vswp->mfdbrw);
1741 
1742 	D1(vswp, "%s: exit", __func__);
1743 
1744 	if (curr_p == NULL)
1745 		return (1);
1746 	kmem_free(curr_p, sizeof (mfdb_ent_t));
1747 	return (0);
1748 }
1749 
1750 /*
1751  * Port is being deleted, but has registered an interest in one
1752  * or more multicast groups. Using the list of addresses maintained
1753  * within the port structure find the appropriate entry in the hash
1754  * table and remove this port from the list of interested ports.
1755  */
1756 void
1757 vsw_del_mcst_port(vsw_port_t *port)
1758 {
1759 	mcst_addr_t	*mcap = NULL;
1760 	vsw_t		*vswp = port->p_vswp;
1761 
1762 	D1(vswp, "%s: enter", __func__);
1763 
1764 	mutex_enter(&port->mca_lock);
1765 
1766 	while ((mcap = port->mcap) != NULL) {
1767 
1768 		port->mcap = mcap->nextp;
1769 
1770 		mutex_exit(&port->mca_lock);
1771 
1772 		(void) vsw_del_mcst(vswp, VSW_VNETPORT,
1773 		    mcap->addr, port);
1774 
1775 		/*
1776 		 * Remove the address from HW. The address
1777 		 * will actually only be removed once the ref
1778 		 * count within the MAC layer has dropped to
1779 		 * zero. I.e. we can safely call this fn even
1780 		 * if other ports are interested in this
1781 		 * address.
1782 		 */
1783 		WRITE_ENTER(&vswp->mac_rwlock);
1784 		if (vswp->mh != NULL && mcap->mac_added) {
1785 			(void) mac_multicst_remove(vswp->mh,
1786 			    (uchar_t *)&mcap->mca);
1787 		}
1788 		RW_EXIT(&vswp->mac_rwlock);
1789 
1790 		kmem_free(mcap, sizeof (*mcap));
1791 
1792 		mutex_enter(&port->mca_lock);
1793 
1794 	}
1795 
1796 	mutex_exit(&port->mca_lock);
1797 
1798 	D1(vswp, "%s: exit", __func__);
1799 }
1800 
1801 /*
1802  * This vsw instance is detaching, but has registered an interest in one
1803  * or more multicast groups. Using the list of addresses maintained
1804  * within the vsw structure find the appropriate entry in the hash
1805  * table and remove this instance from the list of interested ports.
1806  */
1807 void
1808 vsw_del_mcst_vsw(vsw_t *vswp)
1809 {
1810 	mcst_addr_t	*next_p = NULL;
1811 
1812 	D1(vswp, "%s: enter", __func__);
1813 
1814 	mutex_enter(&vswp->mca_lock);
1815 
1816 	while (vswp->mcap != NULL) {
1817 		DERR(vswp, "%s: deleting addr 0x%llx",
1818 		    __func__, vswp->mcap->addr);
1819 		(void) vsw_del_mcst(vswp, VSW_LOCALDEV, vswp->mcap->addr, NULL);
1820 
1821 		next_p = vswp->mcap->nextp;
1822 		kmem_free(vswp->mcap, sizeof (mcst_addr_t));
1823 		vswp->mcap = next_p;
1824 	}
1825 
1826 	vswp->mcap = NULL;
1827 	mutex_exit(&vswp->mca_lock);
1828 
1829 	D1(vswp, "%s: exit", __func__);
1830 }
1831 
1832 static uint32_t
1833 vsw_get_same_dest_list(struct ether_header *ehp,
1834     mblk_t **rhead, mblk_t **rtail, mblk_t **mpp)
1835 {
1836 	uint32_t		count = 0;
1837 	mblk_t			*bp;
1838 	mblk_t			*nbp;
1839 	mblk_t			*head = NULL;
1840 	mblk_t			*tail = NULL;
1841 	mblk_t			*prev = NULL;
1842 	struct ether_header	*behp;
1843 
1844 	/* process the chain of packets */
1845 	bp = *mpp;
1846 	while (bp) {
1847 		nbp = bp->b_next;
1848 		behp = (struct ether_header *)bp->b_rptr;
1849 		bp->b_prev = NULL;
1850 		if (ether_cmp(&ehp->ether_dhost, &behp->ether_dhost) == 0) {
1851 			if (prev == NULL) {
1852 				*mpp = nbp;
1853 			} else {
1854 				prev->b_next = nbp;
1855 			}
1856 			bp->b_next =  NULL;
1857 			if (head == NULL) {
1858 				head = tail = bp;
1859 			} else {
1860 				tail->b_next = bp;
1861 				tail = bp;
1862 			}
1863 			count++;
1864 		} else {
1865 			prev = bp;
1866 		}
1867 		bp = nbp;
1868 	}
1869 	*rhead = head;
1870 	*rtail = tail;
1871 	DTRACE_PROBE1(vsw_same_dest, int, count);
1872 	return (count);
1873 }
1874 
1875 static mblk_t *
1876 vsw_dupmsgchain(mblk_t *mp)
1877 {
1878 	mblk_t	*nmp = NULL;
1879 	mblk_t	**nmpp = &nmp;
1880 
1881 	for (; mp != NULL; mp = mp->b_next) {
1882 		if ((*nmpp = dupmsg(mp)) == NULL) {
1883 			freemsgchain(nmp);
1884 			return (NULL);
1885 		}
1886 
1887 		nmpp = &((*nmpp)->b_next);
1888 	}
1889 
1890 	return (nmp);
1891 }
1892