xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_mroute.c (revision 23a1ccea6aac035f084a7a4cdc968687d1b02daf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 /*
28  * Procedures for the kernel part of DVMRP,
29  * a Distance-Vector Multicast Routing Protocol.
30  * (See RFC-1075)
31  * Written by David Waitzman, BBN Labs, August 1988.
32  * Modified by Steve Deering, Stanford, February 1989.
33  * Modified by Mark J. Steiglitz, Stanford, May, 1991
34  * Modified by Van Jacobson, LBL, January 1993
35  * Modified by Ajit Thyagarajan, PARC, August 1993
36  * Modified by Bill Fenner, PARC, April 1995
37  *
38  * MROUTING 3.5
39  */
40 
41 /*
42  * TODO
43  * - function pointer field in vif, void *vif_sendit()
44  */
45 
46 #include <sys/types.h>
47 #include <sys/stream.h>
48 #include <sys/stropts.h>
49 #include <sys/strlog.h>
50 #include <sys/systm.h>
51 #include <sys/ddi.h>
52 #include <sys/cmn_err.h>
53 #include <sys/zone.h>
54 
55 #include <sys/param.h>
56 #include <sys/socket.h>
57 #include <sys/vtrace.h>
58 #include <sys/debug.h>
59 #include <net/if.h>
60 #include <sys/sockio.h>
61 #include <netinet/in.h>
62 #include <net/if_dl.h>
63 
64 #include <inet/ipsec_impl.h>
65 #include <inet/common.h>
66 #include <inet/mi.h>
67 #include <inet/nd.h>
68 #include <inet/tunables.h>
69 #include <inet/mib2.h>
70 #include <netinet/ip6.h>
71 #include <inet/ip.h>
72 #include <inet/snmpcom.h>
73 
74 #include <netinet/igmp.h>
75 #include <netinet/igmp_var.h>
76 #include <netinet/udp.h>
77 #include <netinet/ip_mroute.h>
78 #include <inet/ip_multi.h>
79 #include <inet/ip_ire.h>
80 #include <inet/ip_ndp.h>
81 #include <inet/ip_if.h>
82 #include <inet/ipclassifier.h>
83 
84 #include <netinet/pim.h>
85 
86 
87 /*
88  * MT Design:
89  *
90  * There are three main data structures viftable, mfctable and tbftable that
91  * need to be protected against MT races.
92  *
 * viftable is a fixed length array of vif structs. There is no lock to protect
 * the whole array, instead each struct is protected by its own individual lock.
 * The value of v_marks in conjunction with the value of v_refcnt determines the
 * current state of a vif structure. One special state that needs mention
 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
 * that vif is being initialized.
 * Each structure is freed when the refcnt goes down to zero. If a delete comes
 * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
101  * which prevents the struct from further use.  When the refcnt goes to zero
102  * the struct is freed and is marked VIF_MARK_NOTINUSE.
103  * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill
104  * from  going away a refhold is put on the ipif before using it. see
105  * lock_good_vif() and unlock_good_vif().
106  *
107  * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
108  * of the vif struct.
109  *
110  * tbftable is also a fixed length array of tbf structs and is only accessed
111  * via v_tbf.  It is protected by its own lock tbf_lock.
112  *
113  * Lock Ordering is
114  * v_lock --> tbf_lock
 * v_lock --> ill_lock
116  *
 * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
118  * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
119  * it also maintains a state. These fields are protected by a lock (mfcb_lock).
120  * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
121  * protect the struct elements.
122  *
123  * mfc structs are dynamically allocated and are singly linked
124  * at the head of the chain. When an mfc structure is to be deleted
125  * it is marked condemned and so is the state in the bucket struct.
 * When the last walker of the hash bucket exits all the mfc structs
 * marked condemned are freed.
128  *
129  * Locking Hierarchy:
130  * The bucket lock should be acquired before the mfc struct lock.
131  * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
132  * operations on the bucket struct.
133  *
 * last_encap_lock and numvifs_mutex should be acquired after
 * acquiring vif or mfc locks. These locks protect some global variables.
136  *
 * The statistics are not currently protected by a lock,
 * causing the stats to be approximate, not exact.
139  */
140 
/*
 * Sentinel vif index.  MAXVIFS is one past the largest index that add_vif()
 * accepts, so this value can never name a real vif.
 */
#define	NO_VIF	MAXVIFS 	/* from mrouted, no route for src */
142 
143 /*
144  * Timeouts:
145  * 	Upcall timeouts - BSD uses boolean_t mfc->expire and
146  *	nexpire[MFCTBLSIZE], the number of times expire has been called.
147  *	SunOS 5.x uses mfc->timeout for each mfc.
148  *	Some Unixes are limited in the number of simultaneous timeouts
149  * 	that can be run, SunOS 5.x does not have this restriction.
150  */
151 
152 /*
153  * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
 * UPCALL_EXPIRE is the number of timeouts before a particular upcall
155  * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE
156  */
/* presumably hz is clock ticks per second, making this a quarter second */
#define		EXPIRE_TIMEOUT	(hz/4)	/* 4x / second	*/
/* EXPIRE_TIMEOUT * UPCALL_EXPIRE is the BSD-style lifetime of an upcall */
#define		UPCALL_EXPIRE	6	/* number of timeouts	*/
159 
/*
 * Hash function for a source, group entry.
 * XORs each address with copies of itself shifted right by 10 and 20 bits,
 * combines the two results and hands the value to MFCHASHMOD (defined
 * elsewhere; presumably it reduces the value to an mfctable index - confirm
 * against its definition).  The result is used to index ips_mfcs.
 */
#define	MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
	((g) >> 20) ^ ((g) >> 10) ^ (g))
165 
/* Token bucket filter reprocess interval; presumably in clock ticks. */
#define			TBF_REPROCESS	(hz / 100)	/* 100x /second	*/
167 
/*
 * Identify PIM packet that came on a Register interface.
 * All-ones 32-bit marker value; where it is stored is not visible from this
 * definition alone.
 */
#define	PIM_REGISTER_MARKER	0xffffffff
170 
/*
 * Function declarations.
 * Forward declarations for the file-local routines of the multicast
 * routing code.
 */
static int	add_mfc(struct mfcctl *, ip_stack_t *);
static int	add_vif(struct vifctl *, conn_t *, ip_stack_t *);
static int	del_mfc(struct mfcctl *, ip_stack_t *);
static int	del_vif(vifi_t *, ip_stack_t *);
static void	del_vifp(struct vif *);
static void	encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static void	expire_upcalls(void *);
static void	fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
static void	free_queue(struct mfc *);
static int	get_assert(uchar_t *, ip_stack_t *);
static int	get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
static int	get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
static int	get_version(uchar_t *);
static int	get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
static int	ip_mdq(mblk_t *, ipha_t *, ill_t *,
		    ipaddr_t, struct mfc *);
static int	ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *);
static void	phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static int	register_mforward(mblk_t *, ip_recv_attr_t *);
static void	register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static int	set_assert(int *, ip_stack_t *);

/*
 * Token Bucket Filter functions
 */
static int  priority(struct vif *, ipha_t *);
static void tbf_control(struct vif *, mblk_t *, ipha_t *);
static int  tbf_dq_sel(struct vif *, ipha_t *);
static void tbf_process_q(struct vif *);
static void tbf_queue(struct vif *, mblk_t *);
static void tbf_reprocess_q(void *);
static void tbf_send_packet(struct vif *, mblk_t *);
static void tbf_update_tokens(struct vif *);
/*
 * Not a token bucket routine: MFCB_REFRELE calls this when the last
 * reference on a condemned mfc bucket is dropped.
 */
static void release_mfc(struct mfcb *);

/* B_TRUE when no multicast router conn is registered or it is marked off. */
static boolean_t is_mrouter_off(ip_stack_t *);
208 /*
209  * Encapsulation packets
210  */
211 
/* TTL placed in the prototype outer IP header, multicast_encap_iphdr. */
#define	ENCAP_TTL	64
213 
/*
 * prototype IP hdr for encapsulated packets
 * Positional initializer: version/header length, tos, total length, id,
 * fragment offset, ttl, protocol, checksum.  Any ipha_t members after the
 * checksum are left zero.
 * NOTE(review): the total length is the bare header size and the checksum
 * is zero, so presumably the sender patches both per packet - confirm in
 * encap_send().
 */
static ipha_t multicast_encap_iphdr = {
	IP_SIMPLE_HDR_VERSION,
	0,				/* tos */
	sizeof (ipha_t),		/* total length */
	0,				/* id */
	0,				/* frag offset */
	ENCAP_TTL, IPPROTO_ENCAP,
	0,				/* checksum */
};
224 
/*
 * Rate limit for assert notification messages, in nsec.
 * 3000000000 nsec is 3 seconds.  The value does not fit in a 32-bit int, so
 * it should only be compared against 64-bit quantities.
 */
#define	ASSERT_MSG_TIME		3000000000
229 
230 
/* Take one reference on a vif; v_lock is acquired and dropped internally. */
#define	VIF_REFHOLD(vifp) {			\
	mutex_enter(&(vifp)->v_lock);		\
	++(vifp)->v_refcnt;			\
	mutex_exit(&(vifp)->v_lock);		\
}
236 
/*
 * Drop one reference on a vif whose v_lock the caller already holds.
 * If that was the last reference and the vif is condemned, del_vifp() is
 * called with the lock still held; otherwise v_lock is released here.
 */
#define	VIF_REFRELE_LOCKED(vifp) {				\
	if (--(vifp)->v_refcnt != 0 ||				\
	    !((vifp)->v_marks & VIF_MARK_CONDEMNED)) {		\
		mutex_exit(&(vifp)->v_lock);			\
	} else {						\
		del_vifp(vifp);					\
	}							\
}
246 
/*
 * Drop one reference on a vif, taking v_lock first.
 * If that was the last reference and the vif is condemned, del_vifp() is
 * called with the lock still held; otherwise v_lock is released here.
 */
#define	VIF_REFRELE(vifp) {					\
	mutex_enter(&(vifp)->v_lock);				\
	if (--(vifp)->v_refcnt != 0 ||				\
	    !((vifp)->v_marks & VIF_MARK_CONDEMNED)) {		\
		mutex_exit(&(vifp)->v_lock);			\
	} else {						\
		del_vifp(vifp);					\
	}							\
}
257 
/*
 * Register a walker on an mfc bucket: bump mfcb_refcnt under mfcb_lock.
 * The ASSERT catches the counter wrapping back to zero.
 */
#define	MFCB_REFHOLD(mfcb) {				\
	mutex_enter(&(mfcb)->mfcb_lock);		\
	(mfcb)->mfcb_refcnt += 1;			\
	ASSERT((mfcb)->mfcb_refcnt != 0);		\
	mutex_exit(&(mfcb)->mfcb_lock);			\
}
264 
/*
 * Unregister a walker from an mfc bucket.  When the last walker leaves a
 * bucket marked MFCB_MARK_CONDEMNED, release_mfc() is run while mfcb_lock
 * is still held; the lock is dropped afterwards in every case.
 */
#define	MFCB_REFRELE(mfcb) {					\
	mutex_enter(&(mfcb)->mfcb_lock);			\
	ASSERT((mfcb)->mfcb_refcnt != 0);			\
	(mfcb)->mfcb_refcnt--;					\
	if ((mfcb)->mfcb_refcnt == 0 &&				\
	    ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED) != 0) {	\
		release_mfc(mfcb);				\
	}							\
	mutex_exit(&(mfcb)->mfcb_lock);				\
}
274 
/*
 * MFCFIND:
 * Find a route for a given origin IP address and multicast group address.
 * Skip entries with pending upcalls (mfc_rte != NULL) and entries that are
 * marked condemned.
 * Type of service parameter to be added in the future!
 *
 *	mfcbp	- pointer to the hash bucket (struct mfcb) to search
 *	o, g	- origin and group addresses to match against s_addr
 *	rt	- lvalue that receives the matching struct mfc, or NULL
 *
 * Every argument is parenthesized in the expansion so that expressions
 * such as "a | b" or "p + 1" may be passed safely.  The macro does no
 * locking of its own.
 */
#define	MFCFIND(mfcbp, o, g, rt) { \
	struct mfc *_mb_rt = NULL; \
	(rt) = NULL; \
	_mb_rt = (mfcbp)->mfcb_mfc; \
	while (_mb_rt) { \
		if ((_mb_rt->mfc_origin.s_addr == (o)) && \
		    (_mb_rt->mfc_mcastgrp.s_addr == (g)) && \
		    (_mb_rt->mfc_rte == NULL) && \
		    (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) {        \
		    (rt) = _mb_rt; \
		    break; \
		} \
	_mb_rt = _mb_rt->mfc_next; \
	} \
}
296 
297 /*
298  * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
299  * are inefficient. We use gethrestime() which returns a timespec_t with
300  * sec and nsec, the resolution is machine dependent.
301  * The following 2 macros have been changed to use nsec instead of usec.
302  */
/*
 * Macros to compute elapsed time efficiently.
 * Borrowed from Van Jacobson's scheduling code.
 * Delta should be a hrtime_t.
 *
 * TV_DELTA(a, b, delta) stores (a - b) in nanoseconds into delta, where a
 * and b have tv_sec and tv_nsec members.  Differences of one or two seconds
 * take the add-only fast path.  The general case multiplies in 64 bits:
 * 1000000000 * xxs does not fit in a 32-bit int once the gap reaches three
 * seconds.
 */
#define	TV_DELTA(a, b, delta) { \
	int xxs; \
 \
	(delta) = (a).tv_nsec - (b).tv_nsec; \
	if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \
		switch (xxs) { \
		case 2: \
		    (delta) += 1000000000; \
		    /*FALLTHROUGH*/ \
		case 1: \
		    (delta) += 1000000000; \
		    break; \
		default: \
		    (delta) += (1000000000LL * xxs); \
		} \
	} \
}
325 
/* True when time a is strictly earlier than time b (tv_sec, then tv_nsec). */
#define	TV_LT(a, b) ((a).tv_sec < (b).tv_sec || \
	((a).tv_sec == (b).tv_sec && (a).tv_nsec < (b).tv_nsec))
328 
329 /*
330  * Handle MRT setsockopt commands to modify the multicast routing tables.
331  */
332 int
333 ip_mrouter_set(int cmd, conn_t *connp, int checkonly, uchar_t *data,
334     int datalen)
335 {
336 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
337 
338 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
339 	if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) {
340 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
341 		return (EACCES);
342 	}
343 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
344 
345 	if (checkonly) {
346 		/*
347 		 * do not do operation, just pretend to - new T_CHECK
348 		 * Note: Even routines further on can probably fail but
349 		 * this T_CHECK stuff is only to please XTI so it not
350 		 * necessary to be perfect.
351 		 */
352 		switch (cmd) {
353 		case MRT_INIT:
354 		case MRT_DONE:
355 		case MRT_ADD_VIF:
356 		case MRT_DEL_VIF:
357 		case MRT_ADD_MFC:
358 		case MRT_DEL_MFC:
359 		case MRT_ASSERT:
360 			return (0);
361 		default:
362 			return (EOPNOTSUPP);
363 		}
364 	}
365 
366 	/*
367 	 * make sure no command is issued after multicast routing has been
368 	 * turned off.
369 	 */
370 	if (cmd != MRT_INIT && cmd != MRT_DONE) {
371 		if (is_mrouter_off(ipst))
372 			return (EINVAL);
373 	}
374 
375 	switch (cmd) {
376 	case MRT_INIT:	return (ip_mrouter_init(connp, data, datalen, ipst));
377 	case MRT_DONE:	return (ip_mrouter_done(ipst));
378 	case MRT_ADD_VIF:  return (add_vif((struct vifctl *)data, connp, ipst));
379 	case MRT_DEL_VIF:  return (del_vif((vifi_t *)data, ipst));
380 	case MRT_ADD_MFC:  return (add_mfc((struct mfcctl *)data, ipst));
381 	case MRT_DEL_MFC:  return (del_mfc((struct mfcctl *)data, ipst));
382 	case MRT_ASSERT:   return (set_assert((int *)data, ipst));
383 	default:	   return (EOPNOTSUPP);
384 	}
385 }
386 
387 /*
388  * Handle MRT getsockopt commands
389  */
390 int
391 ip_mrouter_get(int cmd, conn_t *connp, uchar_t *data)
392 {
393 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
394 
395 	if (connp != ipst->ips_ip_g_mrouter)
396 		return (EACCES);
397 
398 	switch (cmd) {
399 	case MRT_VERSION:	return (get_version((uchar_t *)data));
400 	case MRT_ASSERT:	return (get_assert((uchar_t *)data, ipst));
401 	default:		return (EOPNOTSUPP);
402 	}
403 }
404 
405 /*
406  * Handle ioctl commands to obtain information from the cache.
407  * Called with shared access to IP. These are read_only ioctls.
408  */
409 /* ARGSUSED */
410 int
411 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
412     ip_ioctl_cmd_t *ipip, void *if_req)
413 {
414 	mblk_t	*mp1;
415 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
416 	conn_t		*connp = Q_TO_CONN(q);
417 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
418 
419 	/* Existence verified in ip_wput_nondata */
420 	mp1 = mp->b_cont->b_cont;
421 
422 	switch (iocp->ioc_cmd) {
423 	case (SIOCGETVIFCNT):
424 		return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst));
425 	case (SIOCGETSGCNT):
426 		return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst));
427 	case (SIOCGETLSGCNT):
428 		return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst));
429 	default:
430 		return (EINVAL);
431 	}
432 }
433 
434 /*
435  * Returns the packet, byte, rpf-failure count for the source, group provided.
436  */
437 static int
438 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst)
439 {
440 	struct mfc *rt;
441 	struct mfcb *mfcbp;
442 
443 	mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)];
444 	MFCB_REFHOLD(mfcbp);
445 	MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt);
446 
447 	if (rt != NULL) {
448 		mutex_enter(&rt->mfc_mutex);
449 		req->pktcnt   = rt->mfc_pkt_cnt;
450 		req->bytecnt  = rt->mfc_byte_cnt;
451 		req->wrong_if = rt->mfc_wrong_if;
452 		mutex_exit(&rt->mfc_mutex);
453 	} else
454 		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;
455 
456 	MFCB_REFRELE(mfcbp);
457 	return (0);
458 }
459 
/*
 * Returns the packet, byte, rpf-failure count for the source, group provided.
 * Uses larger counters and IPv6 addresses.
 *
 * Not implemented yet: both arguments are ignored and ENXIO is always
 * returned.
 */
/* ARGSUSED XXX until implemented */
static int
get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst)
{
	/* XXX TODO SIOCGETLSGCNT */
	return (ENXIO);
}
471 
472 /*
473  * Returns the input and output packet and byte counts on the vif provided.
474  */
475 static int
476 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst)
477 {
478 	vifi_t vifi = req->vifi;
479 
480 	if (vifi >= ipst->ips_numvifs)
481 		return (EINVAL);
482 
483 	/*
484 	 * No locks here, an approximation is fine.
485 	 */
486 	req->icount = ipst->ips_vifs[vifi].v_pkt_in;
487 	req->ocount = ipst->ips_vifs[vifi].v_pkt_out;
488 	req->ibytes = ipst->ips_vifs[vifi].v_bytes_in;
489 	req->obytes = ipst->ips_vifs[vifi].v_bytes_out;
490 
491 	return (0);
492 }
493 
494 static int
495 get_version(uchar_t *data)
496 {
497 	int *v = (int *)data;
498 
499 	*v = 0x0305;	/* XXX !!!! */
500 
501 	return (0);
502 }
503 
504 /*
505  * Set PIM assert processing global.
506  */
507 static int
508 set_assert(int *i, ip_stack_t *ipst)
509 {
510 	if ((*i != 1) && (*i != 0))
511 		return (EINVAL);
512 
513 	ipst->ips_pim_assert = *i;
514 
515 	return (0);
516 }
517 
518 /*
519  * Get PIM assert processing global.
520  */
521 static int
522 get_assert(uchar_t *data, ip_stack_t *ipst)
523 {
524 	int *i = (int *)data;
525 
526 	*i = ipst->ips_pim_assert;
527 
528 	return (0);
529 }
530 
531 /*
532  * Enable multicast routing.
533  */
534 static int
535 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst)
536 {
537 	int	*v;
538 
539 	if (data == NULL || (datalen != sizeof (int)))
540 		return (ENOPROTOOPT);
541 
542 	v = (int *)data;
543 	if (*v != 1)
544 		return (ENOPROTOOPT);
545 
546 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
547 	if (ipst->ips_ip_g_mrouter != NULL) {
548 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
549 		return (EADDRINUSE);
550 	}
551 
552 	/*
553 	 * MRT_INIT should only be allowed for RAW sockets, but we double
554 	 * check.
555 	 */
556 	if (!IPCL_IS_RAWIP(connp)) {
557 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
558 		return (EINVAL);
559 	}
560 
561 	ipst->ips_ip_g_mrouter = connp;
562 	connp->conn_multi_router = 1;
563 	/* In order for tunnels to work we have to turn ip_g_forward on */
564 	if (!WE_ARE_FORWARDING(ipst)) {
565 		if (ipst->ips_ip_mrtdebug > 1) {
566 			(void) mi_strlog(connp->conn_rq, 1, SL_TRACE,
567 			    "ip_mrouter_init: turning on forwarding");
568 		}
569 		ipst->ips_saved_ip_forwarding = ipst->ips_ip_forwarding;
570 		ipst->ips_ip_forwarding = IP_FORWARD_ALWAYS;
571 	}
572 
573 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
574 	return (0);
575 }
576 
577 void
578 ip_mrouter_stack_init(ip_stack_t *ipst)
579 {
580 	mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL);
581 
582 	ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1),
583 	    KM_SLEEP);
584 	ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP);
585 	/*
586 	 * mfctable:
587 	 * Includes all mfcs, including waiting upcalls.
588 	 * Multiple mfcs per bucket.
589 	 */
590 	ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ,
591 	    KM_SLEEP);
592 	/*
593 	 * Define the token bucket filter structures.
594 	 * tbftable -> each vif has one of these for storing info.
595 	 */
596 	ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP);
597 
598 	mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL);
599 
600 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
601 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
602 }
603 
604 /*
605  * Disable multicast routing.
606  * Didn't use global timeout_val (BSD version), instead check the mfctable.
607  */
608 int
609 ip_mrouter_done(ip_stack_t *ipst)
610 {
611 	conn_t		*mrouter;
612 	vifi_t 		vifi;
613 	struct mfc	*mfc_rt;
614 	int		i;
615 
616 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
617 	if (ipst->ips_ip_g_mrouter == NULL) {
618 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
619 		return (EINVAL);
620 	}
621 
622 	mrouter = ipst->ips_ip_g_mrouter;
623 
624 	if (ipst->ips_saved_ip_forwarding != -1) {
625 		if (ipst->ips_ip_mrtdebug > 1) {
626 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
627 			    "ip_mrouter_done: turning off forwarding");
628 		}
629 		ipst->ips_ip_forwarding = ipst->ips_saved_ip_forwarding;
630 		ipst->ips_saved_ip_forwarding = -1;
631 	}
632 
633 	/*
634 	 * Always clear cache when vifs change.
635 	 * No need to get ipst->ips_last_encap_lock since we are running as
636 	 * a writer.
637 	 */
638 	mutex_enter(&ipst->ips_last_encap_lock);
639 	ipst->ips_last_encap_src = 0;
640 	ipst->ips_last_encap_vif = NULL;
641 	mutex_exit(&ipst->ips_last_encap_lock);
642 	mrouter->conn_multi_router = 0;
643 
644 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
645 
646 	/*
647 	 * For each phyint in use,
648 	 * disable promiscuous reception of all IP multicasts.
649 	 */
650 	for (vifi = 0; vifi < MAXVIFS; vifi++) {
651 		struct vif *vifp = ipst->ips_vifs + vifi;
652 
653 		mutex_enter(&vifp->v_lock);
654 		/*
655 		 * if the vif is active mark it condemned.
656 		 */
657 		if (vifp->v_marks & VIF_MARK_GOOD) {
658 			ASSERT(vifp->v_ipif != NULL);
659 			ipif_refhold(vifp->v_ipif);
660 			/* Phyint only */
661 			if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
662 				ipif_t *ipif = vifp->v_ipif;
663 				ilm_t *ilm = vifp->v_ilm;
664 
665 				vifp->v_ilm = NULL;
666 				vifp->v_marks &= ~VIF_MARK_GOOD;
667 				vifp->v_marks |= VIF_MARK_CONDEMNED;
668 
669 				mutex_exit(&(vifp)->v_lock);
670 				if (ilm != NULL) {
671 					ill_t *ill = ipif->ipif_ill;
672 
673 					(void) ip_delmulti(ilm);
674 					ASSERT(ill->ill_mrouter_cnt > 0);
675 					atomic_dec_32(&ill->ill_mrouter_cnt);
676 				}
677 				mutex_enter(&vifp->v_lock);
678 			}
679 			ipif_refrele(vifp->v_ipif);
680 			/*
681 			 * decreases the refcnt added in add_vif.
682 			 * and release v_lock.
683 			 */
684 			VIF_REFRELE_LOCKED(vifp);
685 		} else {
686 			mutex_exit(&vifp->v_lock);
687 			continue;
688 		}
689 	}
690 
691 	mutex_enter(&ipst->ips_numvifs_mutex);
692 	ipst->ips_numvifs = 0;
693 	ipst->ips_pim_assert = 0;
694 	ipst->ips_reg_vif_num = ALL_VIFS;
695 	mutex_exit(&ipst->ips_numvifs_mutex);
696 
697 	/*
698 	 * Free upcall msgs.
699 	 * Go through mfctable and stop any outstanding upcall
700 	 * timeouts remaining on mfcs.
701 	 */
702 	for (i = 0; i < MFCTBLSIZ; i++) {
703 		mutex_enter(&ipst->ips_mfcs[i].mfcb_lock);
704 		ipst->ips_mfcs[i].mfcb_refcnt++;
705 		ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED;
706 		mutex_exit(&ipst->ips_mfcs[i].mfcb_lock);
707 		mfc_rt = ipst->ips_mfcs[i].mfcb_mfc;
708 		while (mfc_rt) {
709 			/* Free upcalls */
710 			mutex_enter(&mfc_rt->mfc_mutex);
711 			if (mfc_rt->mfc_rte != NULL) {
712 				if (mfc_rt->mfc_timeout_id != 0) {
713 					/*
714 					 * OK to drop the lock as we have
715 					 * a refcnt on the bucket. timeout
716 					 * can fire but it will see that
717 					 * mfc_timeout_id == 0 and not do
718 					 * anything. see expire_upcalls().
719 					 */
720 					mfc_rt->mfc_timeout_id = 0;
721 					mutex_exit(&mfc_rt->mfc_mutex);
722 					(void) untimeout(
723 					    mfc_rt->mfc_timeout_id);
724 						mfc_rt->mfc_timeout_id = 0;
725 					mutex_enter(&mfc_rt->mfc_mutex);
726 
727 					/*
728 					 * all queued upcall packets
729 					 * and mblk will be freed in
730 					 * release_mfc().
731 					 */
732 				}
733 			}
734 
735 			mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
736 
737 			mutex_exit(&mfc_rt->mfc_mutex);
738 			mfc_rt = mfc_rt->mfc_next;
739 		}
740 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
741 	}
742 
743 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
744 	ipst->ips_ip_g_mrouter = NULL;
745 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
746 	return (0);
747 }
748 
749 void
750 ip_mrouter_stack_destroy(ip_stack_t *ipst)
751 {
752 	struct mfcb *mfcbp;
753 	struct mfc  *rt;
754 	int i;
755 
756 	for (i = 0; i < MFCTBLSIZ; i++) {
757 		mfcbp = &ipst->ips_mfcs[i];
758 
759 		while ((rt = mfcbp->mfcb_mfc) != NULL) {
760 			(void) printf("ip_mrouter_stack_destroy: free for %d\n",
761 			    i);
762 
763 			mfcbp->mfcb_mfc = rt->mfc_next;
764 			free_queue(rt);
765 			mi_free(rt);
766 		}
767 	}
768 	kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1));
769 	ipst->ips_vifs = NULL;
770 	kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat));
771 	ipst->ips_mrtstat = NULL;
772 	kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ);
773 	ipst->ips_mfcs = NULL;
774 	kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS);
775 	ipst->ips_tbfs = NULL;
776 
777 	mutex_destroy(&ipst->ips_last_encap_lock);
778 	mutex_destroy(&ipst->ips_ip_g_mrouter_mutex);
779 }
780 
781 static boolean_t
782 is_mrouter_off(ip_stack_t *ipst)
783 {
784 	conn_t	*mrouter;
785 
786 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
787 	if (ipst->ips_ip_g_mrouter == NULL) {
788 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
789 		return (B_TRUE);
790 	}
791 
792 	mrouter = ipst->ips_ip_g_mrouter;
793 	if (mrouter->conn_multi_router == 0) {
794 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
795 		return (B_TRUE);
796 	}
797 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
798 	return (B_FALSE);
799 }
800 
801 static void
802 unlock_good_vif(struct vif *vifp)
803 {
804 	ASSERT(vifp->v_ipif != NULL);
805 	ipif_refrele(vifp->v_ipif);
806 	VIF_REFRELE(vifp);
807 }
808 
809 static boolean_t
810 lock_good_vif(struct vif *vifp)
811 {
812 	mutex_enter(&vifp->v_lock);
813 	if (!(vifp->v_marks & VIF_MARK_GOOD)) {
814 		mutex_exit(&vifp->v_lock);
815 		return (B_FALSE);
816 	}
817 
818 	ASSERT(vifp->v_ipif != NULL);
819 	mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
820 	if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
821 		mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
822 		mutex_exit(&vifp->v_lock);
823 		return (B_FALSE);
824 	}
825 	ipif_refhold_locked(vifp->v_ipif);
826 	mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
827 	vifp->v_refcnt++;
828 	mutex_exit(&vifp->v_lock);
829 	return (B_TRUE);
830 }
831 
832 /*
833  * Add a vif to the vif table.
834  */
835 static int
836 add_vif(struct vifctl *vifcp, conn_t *connp, ip_stack_t *ipst)
837 {
838 	struct vif	*vifp = ipst->ips_vifs + vifcp->vifc_vifi;
839 	ipif_t		*ipif;
840 	int		error = 0;
841 	struct tbf	*v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi;
842 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
843 	ilm_t		*ilm;
844 	ill_t		*ill;
845 
846 	ASSERT(connp != NULL);
847 
848 	if (vifcp->vifc_vifi >= MAXVIFS)
849 		return (EINVAL);
850 
851 	if (is_mrouter_off(ipst))
852 		return (EINVAL);
853 
854 	mutex_enter(&vifp->v_lock);
855 	/*
856 	 * Viftable entry should be 0.
857 	 * if v_marks == 0 but v_refcnt != 0 means struct is being
858 	 * initialized.
859 	 *
860 	 * Also note that it is very unlikely that we will get a MRT_ADD_VIF
861 	 * request while the delete is in progress, mrouted only sends add
862 	 * requests when a new interface is added and the new interface cannot
863 	 * have the same vifi as an existing interface. We make sure that
864 	 * ill_delete will block till the vif is deleted by adding a refcnt
865 	 * to ipif in del_vif().
866 	 */
867 	if (vifp->v_lcl_addr.s_addr != 0 ||
868 	    vifp->v_marks != 0 ||
869 	    vifp->v_refcnt != 0) {
870 		mutex_exit(&vifp->v_lock);
871 		return (EADDRINUSE);
872 	}
873 
874 	/* Incoming vif should not be 0 */
875 	if (vifcp->vifc_lcl_addr.s_addr == 0) {
876 		mutex_exit(&vifp->v_lock);
877 		return (EINVAL);
878 	}
879 
880 	vifp->v_refcnt++;
881 	mutex_exit(&vifp->v_lock);
882 	/* Find the interface with the local address */
883 	ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
884 	    IPCL_ZONEID(connp), ipst);
885 	if (ipif == NULL) {
886 		VIF_REFRELE(vifp);
887 		return (EADDRNOTAVAIL);
888 	}
889 
890 	if (ipst->ips_ip_mrtdebug > 1) {
891 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
892 		    "add_vif: src 0x%x enter",
893 		    vifcp->vifc_lcl_addr.s_addr);
894 	}
895 
896 	mutex_enter(&vifp->v_lock);
897 	/*
898 	 * Always clear cache when vifs change.
899 	 * Needed to ensure that src isn't left over from before vif was added.
900 	 * No need to get last_encap_lock, since we are running as a writer.
901 	 */
902 
903 	mutex_enter(&ipst->ips_last_encap_lock);
904 	ipst->ips_last_encap_src = 0;
905 	ipst->ips_last_encap_vif = NULL;
906 	mutex_exit(&ipst->ips_last_encap_lock);
907 
908 	if (vifcp->vifc_flags & VIFF_TUNNEL) {
909 		if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
910 			cmn_err(CE_WARN,
911 			    "add_vif: source route tunnels not supported\n");
912 			VIF_REFRELE_LOCKED(vifp);
913 			ipif_refrele(ipif);
914 			return (EOPNOTSUPP);
915 		}
916 		vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
917 
918 	} else {
919 		/* Phyint or Register vif */
920 		if (vifcp->vifc_flags & VIFF_REGISTER) {
921 			/*
922 			 * Note: Since all IPPROTO_IP level options (including
923 			 * MRT_ADD_VIF) are done exclusively via
924 			 * ip_optmgmt_writer(), a lock is not necessary to
925 			 * protect reg_vif_num.
926 			 */
927 			mutex_enter(&ipst->ips_numvifs_mutex);
928 			if (ipst->ips_reg_vif_num == ALL_VIFS) {
929 				ipst->ips_reg_vif_num = vifcp->vifc_vifi;
930 				mutex_exit(&ipst->ips_numvifs_mutex);
931 			} else {
932 				mutex_exit(&ipst->ips_numvifs_mutex);
933 				VIF_REFRELE_LOCKED(vifp);
934 				ipif_refrele(ipif);
935 				return (EADDRINUSE);
936 			}
937 		}
938 
939 		/* Make sure the interface supports multicast */
940 		if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
941 			VIF_REFRELE_LOCKED(vifp);
942 			ipif_refrele(ipif);
943 			if (vifcp->vifc_flags & VIFF_REGISTER) {
944 				mutex_enter(&ipst->ips_numvifs_mutex);
945 				ipst->ips_reg_vif_num = ALL_VIFS;
946 				mutex_exit(&ipst->ips_numvifs_mutex);
947 			}
948 			return (EOPNOTSUPP);
949 		}
950 		/* Enable promiscuous reception of all IP mcasts from the if */
951 		mutex_exit(&vifp->v_lock);
952 
953 		ill = ipif->ipif_ill;
954 		if (IS_UNDER_IPMP(ill))
955 			ill = ipmp_ill_hold_ipmp_ill(ill);
956 
957 		if (ill == NULL) {
958 			ilm = NULL;
959 		} else {
960 			ilm = ip_addmulti(&ipv6_all_zeros, ill,
961 			    ipif->ipif_zoneid, &error);
962 			if (ilm != NULL)
963 				atomic_inc_32(&ill->ill_mrouter_cnt);
964 			if (IS_UNDER_IPMP(ipif->ipif_ill)) {
965 				ill_refrele(ill);
966 				ill = ipif->ipif_ill;
967 			}
968 		}
969 
970 		mutex_enter(&vifp->v_lock);
971 		/*
972 		 * since we released the lock lets make sure that
973 		 * ip_mrouter_done() has not been called.
974 		 */
975 		if (ilm == NULL || is_mrouter_off(ipst)) {
976 			if (ilm != NULL) {
977 				(void) ip_delmulti(ilm);
978 				ASSERT(ill->ill_mrouter_cnt > 0);
979 				atomic_dec_32(&ill->ill_mrouter_cnt);
980 			}
981 			if (vifcp->vifc_flags & VIFF_REGISTER) {
982 				mutex_enter(&ipst->ips_numvifs_mutex);
983 				ipst->ips_reg_vif_num = ALL_VIFS;
984 				mutex_exit(&ipst->ips_numvifs_mutex);
985 			}
986 			VIF_REFRELE_LOCKED(vifp);
987 			ipif_refrele(ipif);
988 			return (error?error:EINVAL);
989 		}
990 		vifp->v_ilm = ilm;
991 	}
992 	/* Define parameters for the tbf structure */
993 	vifp->v_tbf = v_tbf;
994 	gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
995 	vifp->v_tbf->tbf_n_tok = 0;
996 	vifp->v_tbf->tbf_q_len = 0;
997 	vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
998 	vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
999 
1000 	vifp->v_flags = vifcp->vifc_flags;
1001 	vifp->v_threshold = vifcp->vifc_threshold;
1002 	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
1003 	vifp->v_ipif = ipif;
1004 	ipif_refrele(ipif);
1005 	/* Scaling up here, allows division by 1024 in critical code.	*/
1006 	vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000);
1007 	vifp->v_timeout_id = 0;
1008 	/* initialize per vif pkt counters */
1009 	vifp->v_pkt_in = 0;
1010 	vifp->v_pkt_out = 0;
1011 	vifp->v_bytes_in = 0;
1012 	vifp->v_bytes_out = 0;
1013 	mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);
1014 
1015 	/* Adjust numvifs up, if the vifi is higher than numvifs */
1016 	mutex_enter(&ipst->ips_numvifs_mutex);
1017 	if (ipst->ips_numvifs <= vifcp->vifc_vifi)
1018 		ipst->ips_numvifs = vifcp->vifc_vifi + 1;
1019 	mutex_exit(&ipst->ips_numvifs_mutex);
1020 
1021 	if (ipst->ips_ip_mrtdebug > 1) {
1022 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1023 		    "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
1024 		    vifcp->vifc_vifi,
1025 		    ntohl(vifcp->vifc_lcl_addr.s_addr),
1026 		    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
1027 		    ntohl(vifcp->vifc_rmt_addr.s_addr),
1028 		    vifcp->vifc_threshold, vifcp->vifc_rate_limit);
1029 	}
1030 
1031 	vifp->v_marks = VIF_MARK_GOOD;
1032 	mutex_exit(&vifp->v_lock);
1033 	return (0);
1034 }
1035 
1036 
1037 /* Delete a vif from the vif table. */
1038 static void
1039 del_vifp(struct vif *vifp)
1040 {
1041 	struct tbf	*t = vifp->v_tbf;
1042 	mblk_t  *mp0;
1043 	vifi_t  vifi;
1044 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
1045 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1046 
1047 	ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
1048 	ASSERT(t != NULL);
1049 
1050 	if (ipst->ips_ip_mrtdebug > 1) {
1051 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1052 		    "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
1053 	}
1054 
1055 	if (vifp->v_timeout_id != 0) {
1056 		(void) untimeout(vifp->v_timeout_id);
1057 		vifp->v_timeout_id = 0;
1058 	}
1059 
1060 	/*
1061 	 * Free packets queued at the interface.
1062 	 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
1063 	 */
1064 	mutex_enter(&t->tbf_lock);
1065 	while (t->tbf_q != NULL) {
1066 		mp0 = t->tbf_q;
1067 		t->tbf_q = t->tbf_q->b_next;
1068 		mp0->b_prev = mp0->b_next = NULL;
1069 		freemsg(mp0);
1070 	}
1071 	mutex_exit(&t->tbf_lock);
1072 
1073 	/*
1074 	 * Always clear cache when vifs change.
1075 	 * No need to get last_encap_lock since we are running as a writer.
1076 	 */
1077 	mutex_enter(&ipst->ips_last_encap_lock);
1078 	if (vifp == ipst->ips_last_encap_vif) {
1079 		ipst->ips_last_encap_vif = NULL;
1080 		ipst->ips_last_encap_src = 0;
1081 	}
1082 	mutex_exit(&ipst->ips_last_encap_lock);
1083 
1084 	mutex_destroy(&t->tbf_lock);
1085 
1086 	bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));
1087 
1088 	/* Adjust numvifs down */
1089 	mutex_enter(&ipst->ips_numvifs_mutex);
1090 	for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */
1091 		if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0)
1092 			break;
1093 	ipst->ips_numvifs = vifi;
1094 	mutex_exit(&ipst->ips_numvifs_mutex);
1095 
1096 	bzero(vifp, sizeof (*vifp));
1097 }
1098 
1099 static int
1100 del_vif(vifi_t *vifip, ip_stack_t *ipst)
1101 {
1102 	struct vif	*vifp = ipst->ips_vifs + *vifip;
1103 
1104 	if (*vifip >= ipst->ips_numvifs)
1105 		return (EINVAL);
1106 
1107 	mutex_enter(&vifp->v_lock);
1108 	/*
1109 	 * Not initialized
1110 	 * Here we are not looking at the vif that is being initialized
1111 	 * i.e vifp->v_marks == 0 and refcnt > 0.
1112 	 */
1113 	if (vifp->v_lcl_addr.s_addr == 0 ||
1114 	    !(vifp->v_marks & VIF_MARK_GOOD)) {
1115 		mutex_exit(&vifp->v_lock);
1116 		return (EADDRNOTAVAIL);
1117 	}
1118 
1119 	/* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
1120 	vifp->v_marks &= ~VIF_MARK_GOOD;
1121 	vifp->v_marks |= VIF_MARK_CONDEMNED;
1122 
1123 	/* Phyint only */
1124 	if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1125 		ipif_t *ipif = vifp->v_ipif;
1126 		ilm_t *ilm = vifp->v_ilm;
1127 
1128 		vifp->v_ilm = NULL;
1129 
1130 		ASSERT(ipif != NULL);
1131 		/*
1132 		 * should be OK to drop the lock as we
1133 		 * have marked this as CONDEMNED.
1134 		 */
1135 		mutex_exit(&(vifp)->v_lock);
1136 		if (ilm != NULL) {
1137 			(void) ip_delmulti(ilm);
1138 			ASSERT(ipif->ipif_ill->ill_mrouter_cnt > 0);
1139 			atomic_dec_32(&ipif->ipif_ill->ill_mrouter_cnt);
1140 		}
1141 		mutex_enter(&(vifp)->v_lock);
1142 	}
1143 
1144 	if (vifp->v_flags & VIFF_REGISTER) {
1145 		mutex_enter(&ipst->ips_numvifs_mutex);
1146 		ipst->ips_reg_vif_num = ALL_VIFS;
1147 		mutex_exit(&ipst->ips_numvifs_mutex);
1148 	}
1149 
1150 	/*
1151 	 * decreases the refcnt added in add_vif.
1152 	 */
1153 	VIF_REFRELE_LOCKED(vifp);
1154 	return (0);
1155 }
1156 
1157 /*
1158  * Add an mfc entry.
1159  */
1160 static int
1161 add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1162 {
1163 	struct mfc *rt;
1164 	struct rtdetq *rte;
1165 	ushort_t nstl;
1166 	int i;
1167 	struct mfcb *mfcbp;
1168 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1169 
1170 	/*
1171 	 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
1172 	 * did not have a real route for pkt.
1173 	 * We want this pkt without rt installed in the mfctable to prevent
1174 	 * multiiple tries, so go ahead and put it in mfctable, it will
1175 	 * be discarded later in ip_mdq() because the child is NULL.
1176 	 */
1177 
1178 	/* Error checking, out of bounds? */
1179 	if (mfccp->mfcc_parent > MAXVIFS) {
1180 		ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
1181 		    (int)mfccp->mfcc_parent));
1182 		return (EINVAL);
1183 	}
1184 
1185 	if ((mfccp->mfcc_parent != NO_VIF) &&
1186 	    (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) {
1187 		ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
1188 		    (int)mfccp->mfcc_parent));
1189 		return (EINVAL);
1190 	}
1191 
1192 	if (is_mrouter_off(ipst)) {
1193 		return (EINVAL);
1194 	}
1195 
1196 	mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr,
1197 	    mfccp->mfcc_mcastgrp.s_addr)];
1198 	MFCB_REFHOLD(mfcbp);
1199 	MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
1200 	    mfccp->mfcc_mcastgrp.s_addr, rt);
1201 
1202 	/* If an entry already exists, just update the fields */
1203 	if (rt) {
1204 		if (ipst->ips_ip_mrtdebug > 1) {
1205 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1206 			    "add_mfc: update o %x grp %x parent %x",
1207 			    ntohl(mfccp->mfcc_origin.s_addr),
1208 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1209 			    mfccp->mfcc_parent);
1210 		}
1211 		mutex_enter(&rt->mfc_mutex);
1212 		rt->mfc_parent = mfccp->mfcc_parent;
1213 
1214 		mutex_enter(&ipst->ips_numvifs_mutex);
1215 		for (i = 0; i < (int)ipst->ips_numvifs; i++)
1216 			rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1217 		mutex_exit(&ipst->ips_numvifs_mutex);
1218 		mutex_exit(&rt->mfc_mutex);
1219 
1220 		MFCB_REFRELE(mfcbp);
1221 		return (0);
1222 	}
1223 
1224 	/*
1225 	 * Find the entry for which the upcall was made and update.
1226 	 */
1227 	for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
1228 		mutex_enter(&rt->mfc_mutex);
1229 		if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
1230 		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
1231 		    (rt->mfc_rte != NULL) &&
1232 		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
1233 			if (nstl++ != 0)
1234 				cmn_err(CE_WARN,
1235 				    "add_mfc: %s o %x g %x p %x",
1236 				    "multiple kernel entries",
1237 				    ntohl(mfccp->mfcc_origin.s_addr),
1238 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1239 				    mfccp->mfcc_parent);
1240 
1241 			if (ipst->ips_ip_mrtdebug > 1) {
1242 				(void) mi_strlog(mrouter->conn_rq, 1,
1243 				    SL_TRACE,
1244 				    "add_mfc: o %x g %x p %x",
1245 				    ntohl(mfccp->mfcc_origin.s_addr),
1246 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1247 				    mfccp->mfcc_parent);
1248 			}
1249 			fill_route(rt, mfccp, ipst);
1250 
1251 			/*
1252 			 * Prevent cleanup of cache entry.
1253 			 * Timer starts in ip_mforward.
1254 			 */
1255 			if (rt->mfc_timeout_id != 0) {
1256 				timeout_id_t id;
1257 				id = rt->mfc_timeout_id;
1258 				/*
1259 				 * setting id to zero will avoid this
1260 				 * entry from being cleaned up in
1261 				 * expire_up_calls().
1262 				 */
1263 				rt->mfc_timeout_id = 0;
1264 				/*
1265 				 * dropping the lock is fine as we
1266 				 * have a refhold on the bucket.
1267 				 * so mfc cannot be freed.
1268 				 * The timeout can fire but it will see
1269 				 * that mfc_timeout_id == 0 and not cleanup.
1270 				 */
1271 				mutex_exit(&rt->mfc_mutex);
1272 				(void) untimeout(id);
1273 				mutex_enter(&rt->mfc_mutex);
1274 			}
1275 
1276 			/*
1277 			 * Send all pkts that are queued waiting for the upcall.
1278 			 * ip_mdq param tun set to 0 -
1279 			 * the return value of ip_mdq() isn't used here,
1280 			 * so value we send doesn't matter.
1281 			 */
1282 			while (rt->mfc_rte != NULL) {
1283 				rte = rt->mfc_rte;
1284 				rt->mfc_rte = rte->rte_next;
1285 				mutex_exit(&rt->mfc_mutex);
1286 				(void) ip_mdq(rte->mp, (ipha_t *)
1287 				    rte->mp->b_rptr, rte->ill, 0, rt);
1288 				freemsg(rte->mp);
1289 				mi_free((char *)rte);
1290 				mutex_enter(&rt->mfc_mutex);
1291 			}
1292 		}
1293 		mutex_exit(&rt->mfc_mutex);
1294 	}
1295 
1296 
1297 	/*
1298 	 * It is possible that an entry is being inserted without an upcall
1299 	 */
1300 	if (nstl == 0) {
1301 		mutex_enter(&(mfcbp->mfcb_lock));
1302 		if (ipst->ips_ip_mrtdebug > 1) {
1303 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1304 			    "add_mfc: no upcall o %x g %x p %x",
1305 			    ntohl(mfccp->mfcc_origin.s_addr),
1306 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1307 			    mfccp->mfcc_parent);
1308 		}
1309 		if (is_mrouter_off(ipst)) {
1310 			mutex_exit(&mfcbp->mfcb_lock);
1311 			MFCB_REFRELE(mfcbp);
1312 			return (EINVAL);
1313 		}
1314 
1315 		for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {
1316 
1317 			mutex_enter(&rt->mfc_mutex);
1318 			if ((rt->mfc_origin.s_addr ==
1319 			    mfccp->mfcc_origin.s_addr) &&
1320 			    (rt->mfc_mcastgrp.s_addr ==
1321 			    mfccp->mfcc_mcastgrp.s_addr) &&
1322 			    (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
1323 				fill_route(rt, mfccp, ipst);
1324 				mutex_exit(&rt->mfc_mutex);
1325 				break;
1326 			}
1327 			mutex_exit(&rt->mfc_mutex);
1328 		}
1329 
1330 		/* No upcall, so make a new entry into mfctable */
1331 		if (rt == NULL) {
1332 			rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
1333 			if (rt == NULL) {
1334 				ip1dbg(("add_mfc: out of memory\n"));
1335 				mutex_exit(&mfcbp->mfcb_lock);
1336 				MFCB_REFRELE(mfcbp);
1337 				return (ENOBUFS);
1338 			}
1339 
1340 			/* Insert new entry at head of hash chain */
1341 			mutex_enter(&rt->mfc_mutex);
1342 			fill_route(rt, mfccp, ipst);
1343 
1344 			/* Link into table */
1345 			rt->mfc_next   = mfcbp->mfcb_mfc;
1346 			mfcbp->mfcb_mfc = rt;
1347 			mutex_exit(&rt->mfc_mutex);
1348 		}
1349 		mutex_exit(&mfcbp->mfcb_lock);
1350 	}
1351 
1352 	MFCB_REFRELE(mfcbp);
1353 	return (0);
1354 }
1355 
1356 /*
1357  * Fills in mfc structure from mrouted mfcctl.
1358  */
1359 static void
1360 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst)
1361 {
1362 	int i;
1363 
1364 	rt->mfc_origin		= mfccp->mfcc_origin;
1365 	rt->mfc_mcastgrp	= mfccp->mfcc_mcastgrp;
1366 	rt->mfc_parent		= mfccp->mfcc_parent;
1367 	mutex_enter(&ipst->ips_numvifs_mutex);
1368 	for (i = 0; i < (int)ipst->ips_numvifs; i++) {
1369 		rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1370 	}
1371 	mutex_exit(&ipst->ips_numvifs_mutex);
1372 	/* Initialize pkt counters per src-grp */
1373 	rt->mfc_pkt_cnt	= 0;
1374 	rt->mfc_byte_cnt	= 0;
1375 	rt->mfc_wrong_if	= 0;
1376 	rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0;
1377 
1378 }
1379 
1380 static void
1381 free_queue(struct mfc *mfcp)
1382 {
1383 	struct rtdetq *rte0;
1384 
1385 	/*
1386 	 * Drop all queued upcall packets.
1387 	 * Free the mbuf with the pkt.
1388 	 */
1389 	while ((rte0 = mfcp->mfc_rte) != NULL) {
1390 		mfcp->mfc_rte = rte0->rte_next;
1391 		freemsg(rte0->mp);
1392 		mi_free((char *)rte0);
1393 	}
1394 }
1395 /*
1396  * go thorugh the hash bucket and free all the entries marked condemned.
1397  */
1398 void
1399 release_mfc(struct mfcb *mfcbp)
1400 {
1401 	struct mfc *current_mfcp;
1402 	struct mfc *prev_mfcp;
1403 
1404 	prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1405 
1406 	while (current_mfcp != NULL) {
1407 		if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) {
1408 			if (current_mfcp == mfcbp->mfcb_mfc) {
1409 				mfcbp->mfcb_mfc = current_mfcp->mfc_next;
1410 				free_queue(current_mfcp);
1411 				mi_free(current_mfcp);
1412 				prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1413 				continue;
1414 			}
1415 			ASSERT(prev_mfcp != NULL);
1416 			prev_mfcp->mfc_next = current_mfcp->mfc_next;
1417 			free_queue(current_mfcp);
1418 			mi_free(current_mfcp);
1419 			current_mfcp = NULL;
1420 		} else {
1421 			prev_mfcp = current_mfcp;
1422 		}
1423 
1424 		current_mfcp = prev_mfcp->mfc_next;
1425 
1426 	}
1427 	mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED;
1428 	ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0);
1429 }
1430 
1431 /*
1432  * Delete an mfc entry.
1433  */
1434 static int
1435 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1436 {
1437 	struct in_addr	origin;
1438 	struct in_addr	mcastgrp;
1439 	struct mfc 	*rt;
1440 	uint_t		hash;
1441 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1442 
1443 	origin = mfccp->mfcc_origin;
1444 	mcastgrp = mfccp->mfcc_mcastgrp;
1445 	hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
1446 
1447 	if (ipst->ips_ip_mrtdebug > 1) {
1448 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1449 		    "del_mfc: o %x g %x",
1450 		    ntohl(origin.s_addr),
1451 		    ntohl(mcastgrp.s_addr));
1452 	}
1453 
1454 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1455 
1456 	/* Find mfc in mfctable, finds only entries without upcalls */
1457 	for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) {
1458 		mutex_enter(&rt->mfc_mutex);
1459 		if (origin.s_addr == rt->mfc_origin.s_addr &&
1460 		    mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
1461 		    rt->mfc_rte == NULL &&
1462 		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED))
1463 			break;
1464 		mutex_exit(&rt->mfc_mutex);
1465 	}
1466 
1467 	/*
1468 	 * Return if there was an upcall (mfc_rte != NULL,
1469 	 * or rt not in mfctable.
1470 	 */
1471 	if (rt == NULL) {
1472 		MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1473 		return (EADDRNOTAVAIL);
1474 	}
1475 
1476 
1477 	/*
1478 	 * no need to hold lock as we have a reference.
1479 	 */
1480 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1481 	/* error checking */
1482 	if (rt->mfc_timeout_id != 0) {
1483 		ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null"));
1484 		/*
1485 		 * Its ok to drop the lock,  the struct cannot be freed
1486 		 * since we have a ref on the hash bucket.
1487 		 */
1488 		rt->mfc_timeout_id = 0;
1489 		mutex_exit(&rt->mfc_mutex);
1490 		(void) untimeout(rt->mfc_timeout_id);
1491 		mutex_enter(&rt->mfc_mutex);
1492 	}
1493 
1494 	ASSERT(rt->mfc_rte == NULL);
1495 
1496 
1497 	/*
1498 	 * Delete the entry from the cache
1499 	 */
1500 	rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1501 	mutex_exit(&rt->mfc_mutex);
1502 
1503 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1504 
1505 	return (0);
1506 }
1507 
1508 #define	TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
1509 
1510 /*
1511  * IP multicast forwarding function. This function assumes that the packet
1512  * pointed to by ipha has arrived on (or is about to be sent to) the interface
1513  * pointed to by "ill", and the packet is to be relayed to other networks
1514  * that have members of the packet's destination IP multicast group.
1515  *
1516  * The packet is returned unscathed to the caller, unless it is
1517  * erroneous, in which case a -1 value tells the caller (IP)
1518  * to discard it.
1519  *
1520  * Unlike BSD, SunOS 5.x needs to return to IP info about
1521  * whether pkt came in thru a tunnel, so it can be discarded, unless
1522  * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try
1523  * to be delivered.
1524  * Return values are 0 - pkt is okay and phyint
1525  *		    -1 - pkt is malformed and to be tossed
1526  *                   1 - pkt came in on tunnel
1527  */
1528 int
1529 ip_mforward(mblk_t *mp, ip_recv_attr_t *ira)
1530 {
1531 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
1532 	ill_t		*ill = ira->ira_ill;
1533 	struct mfc 	*rt;
1534 	ipaddr_t	src, dst, tunnel_src = 0;
1535 	static int	srctun = 0;
1536 	vifi_t		vifi;
1537 	boolean_t	pim_reg_packet = B_FALSE;
1538 	struct mfcb	*mfcbp;
1539 	ip_stack_t	*ipst = ill->ill_ipst;
1540 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1541 	ill_t		*rill = ira->ira_rill;
1542 
1543 	ASSERT(ira->ira_pktlen == msgdsize(mp));
1544 
1545 	if (ipst->ips_ip_mrtdebug > 1) {
1546 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1547 		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
1548 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
1549 		    ill->ill_name);
1550 	}
1551 
1552 	dst = ipha->ipha_dst;
1553 	if (ira->ira_flags & IRAF_PIM_REGISTER)
1554 		pim_reg_packet = B_TRUE;
1555 	else if (ira->ira_flags & IRAF_MROUTE_TUNNEL_SET)
1556 		tunnel_src = ira->ira_mroute_tunnel;
1557 
1558 	/*
1559 	 * Don't forward a packet with time-to-live of zero or one,
1560 	 * or a packet destined to a local-only group.
1561 	 */
1562 	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
1563 	    (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
1564 		if (ipst->ips_ip_mrtdebug > 1) {
1565 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1566 			    "ip_mforward: not forwarded ttl %d,"
1567 			    " dst 0x%x ill %s",
1568 			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
1569 		}
1570 		if (tunnel_src != 0)
1571 			return (1);
1572 		else
1573 			return (0);
1574 	}
1575 
1576 	if ((tunnel_src != 0) || pim_reg_packet) {
1577 		/*
1578 		 * Packet arrived over an encapsulated tunnel or via a PIM
1579 		 * register message.
1580 		 */
1581 		if (ipst->ips_ip_mrtdebug > 1) {
1582 			if (tunnel_src != 0) {
1583 				(void) mi_strlog(mrouter->conn_rq, 1,
1584 				    SL_TRACE,
1585 				    "ip_mforward: ill %s arrived via ENCAP TUN",
1586 				    ill->ill_name);
1587 			} else if (pim_reg_packet) {
1588 				(void) mi_strlog(mrouter->conn_rq, 1,
1589 				    SL_TRACE,
1590 				    "ip_mforward: ill %s arrived via"
1591 				    "  REGISTER VIF",
1592 				    ill->ill_name);
1593 			}
1594 		}
1595 	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
1596 	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
1597 	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
1598 		/* Packet arrived via a physical interface. */
1599 		if (ipst->ips_ip_mrtdebug > 1) {
1600 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1601 			    "ip_mforward: ill %s arrived via PHYINT",
1602 			    ill->ill_name);
1603 		}
1604 
1605 	} else {
1606 		/*
1607 		 * Packet arrived through a SRCRT tunnel.
1608 		 * Source-route tunnels are no longer supported.
1609 		 * Error message printed every 1000 times.
1610 		 */
1611 		if ((srctun++ % 1000) == 0) {
1612 			cmn_err(CE_WARN,
1613 			    "ip_mforward: received source-routed pkt from %x",
1614 			    ntohl(ipha->ipha_src));
1615 		}
1616 		return (-1);
1617 	}
1618 
1619 	ipst->ips_mrtstat->mrts_fwd_in++;
1620 	src = ipha->ipha_src;
1621 
1622 	/* Find route in cache, return NULL if not there or upcalls q'ed. */
1623 
1624 	/*
1625 	 * Lock the mfctable against changes made by ip_mforward.
1626 	 * Note that only add_mfc and del_mfc can remove entries and
1627 	 * they run with exclusive access to IP. So we do not need to
1628 	 * guard against the rt being deleted, so release lock after reading.
1629 	 */
1630 
1631 	if (is_mrouter_off(ipst))
1632 		return (-1);
1633 
1634 	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
1635 	MFCB_REFHOLD(mfcbp);
1636 	MFCFIND(mfcbp, src, dst, rt);
1637 
1638 	/* Entry exists, so forward if necessary */
1639 	if (rt != NULL) {
1640 		int ret = 0;
1641 		ipst->ips_mrtstat->mrts_mfc_hits++;
1642 		if (pim_reg_packet) {
1643 			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
1644 			ret = ip_mdq(mp, ipha,
1645 			    ipst->ips_vifs[ipst->ips_reg_vif_num].
1646 			    v_ipif->ipif_ill,
1647 			    0, rt);
1648 		} else {
1649 			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
1650 		}
1651 
1652 		MFCB_REFRELE(mfcbp);
1653 		return (ret);
1654 
1655 		/*
1656 		 * Don't forward if we don't have a cache entry.  Mrouted will
1657 		 * always provide a cache entry in response to an upcall.
1658 		 */
1659 	} else {
1660 		/*
1661 		 * If we don't have a route for packet's origin, make a copy
1662 		 * of the packet and send message to routing daemon.
1663 		 */
1664 		struct mfc	*mfc_rt	 = NULL;
1665 		mblk_t		*mp0	 = NULL;
1666 		mblk_t		*mp_copy = NULL;
1667 		struct rtdetq	*rte	 = NULL;
1668 		struct rtdetq	*rte_m, *rte1, *prev_rte;
1669 		uint_t		hash;
1670 		int		npkts;
1671 		boolean_t	new_mfc = B_FALSE;
1672 		ipst->ips_mrtstat->mrts_mfc_misses++;
1673 		/* BSD uses mrts_no_route++ */
1674 		if (ipst->ips_ip_mrtdebug > 1) {
1675 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1676 			    "ip_mforward: no rte ill %s src %x g %x misses %d",
1677 			    ill->ill_name, ntohl(src), ntohl(dst),
1678 			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
1679 		}
1680 		/*
1681 		 * The order of the following code differs from the BSD code.
1682 		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
1683 		 * code works, so SunOS 5.x wasn't changed to conform to the
1684 		 * BSD version.
1685 		 */
1686 
1687 		/* Lock mfctable. */
1688 		hash = MFCHASH(src, dst);
1689 		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));
1690 
1691 		/*
1692 		 * If we are turning off mrouted return an error
1693 		 */
1694 		if (is_mrouter_off(ipst)) {
1695 			mutex_exit(&mfcbp->mfcb_lock);
1696 			MFCB_REFRELE(mfcbp);
1697 			return (-1);
1698 		}
1699 
1700 		/* Is there an upcall waiting for this packet? */
1701 		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
1702 		    mfc_rt = mfc_rt->mfc_next) {
1703 			mutex_enter(&mfc_rt->mfc_mutex);
1704 			if (ipst->ips_ip_mrtdebug > 1) {
1705 				(void) mi_strlog(mrouter->conn_rq, 1,
1706 				    SL_TRACE,
1707 				    "ip_mforward: MFCTAB hash %d o 0x%x"
1708 				    " g 0x%x\n",
1709 				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
1710 				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1711 			}
1712 			/* There is an upcall */
1713 			if ((src == mfc_rt->mfc_origin.s_addr) &&
1714 			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
1715 			    (mfc_rt->mfc_rte != NULL) &&
1716 			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
1717 				break;
1718 			}
1719 			mutex_exit(&mfc_rt->mfc_mutex);
1720 		}
1721 		/* No upcall, so make a new entry into mfctable */
1722 		if (mfc_rt == NULL) {
1723 			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
1724 			if (mfc_rt == NULL) {
1725 				ipst->ips_mrtstat->mrts_fwd_drop++;
1726 				ip1dbg(("ip_mforward: out of memory "
1727 				    "for mfc, mfc_rt\n"));
1728 				goto error_return;
1729 			} else
1730 				new_mfc = B_TRUE;
1731 			/* Get resources */
1732 			/* TODO could copy header and dup rest */
1733 			mp_copy = copymsg(mp);
1734 			if (mp_copy == NULL) {
1735 				ipst->ips_mrtstat->mrts_fwd_drop++;
1736 				ip1dbg(("ip_mforward: out of memory for "
1737 				    "mblk, mp_copy\n"));
1738 				goto error_return;
1739 			}
1740 			mutex_enter(&mfc_rt->mfc_mutex);
1741 		}
1742 		/* Get resources for rte, whether first rte or not first. */
1743 		/* Add this packet into rtdetq */
1744 		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
1745 		if (rte == NULL) {
1746 			ipst->ips_mrtstat->mrts_fwd_drop++;
1747 			mutex_exit(&mfc_rt->mfc_mutex);
1748 			ip1dbg(("ip_mforward: out of memory for"
1749 			    " rtdetq, rte\n"));
1750 			goto error_return;
1751 		}
1752 
1753 		mp0 = copymsg(mp);
1754 		if (mp0 == NULL) {
1755 			ipst->ips_mrtstat->mrts_fwd_drop++;
1756 			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
1757 			mutex_exit(&mfc_rt->mfc_mutex);
1758 			goto error_return;
1759 		}
1760 		rte->mp		= mp0;
1761 		if (pim_reg_packet) {
1762 			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
1763 			rte->ill =
1764 			    ipst->ips_vifs[ipst->ips_reg_vif_num].
1765 			    v_ipif->ipif_ill;
1766 		} else {
1767 			rte->ill = ill;
1768 		}
1769 		rte->rte_next	= NULL;
1770 
1771 		/*
1772 		 * Determine if upcall q (rtdetq) has overflowed.
1773 		 * mfc_rt->mfc_rte is null by mi_zalloc
1774 		 * if it is the first message.
1775 		 */
1776 		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
1777 		    rte_m = rte_m->rte_next)
1778 			npkts++;
1779 		if (ipst->ips_ip_mrtdebug > 1) {
1780 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1781 			    "ip_mforward: upcalls %d\n", npkts);
1782 		}
1783 		if (npkts > MAX_UPQ) {
1784 			ipst->ips_mrtstat->mrts_upq_ovflw++;
1785 			mutex_exit(&mfc_rt->mfc_mutex);
1786 			goto error_return;
1787 		}
1788 
1789 		if (npkts == 0) {	/* first upcall */
1790 			int i = 0;
1791 			/*
1792 			 * Now finish installing the new mfc! Now that we have
1793 			 * resources!  Insert new entry at head of hash chain.
1794 			 * Use src and dst which are ipaddr_t's.
1795 			 */
1796 			mfc_rt->mfc_origin.s_addr = src;
1797 			mfc_rt->mfc_mcastgrp.s_addr = dst;
1798 
1799 			mutex_enter(&ipst->ips_numvifs_mutex);
1800 			for (i = 0; i < (int)ipst->ips_numvifs; i++)
1801 				mfc_rt->mfc_ttls[i] = 0;
1802 			mutex_exit(&ipst->ips_numvifs_mutex);
1803 			mfc_rt->mfc_parent = ALL_VIFS;
1804 
1805 			/* Link into table */
1806 			if (ipst->ips_ip_mrtdebug > 1) {
1807 				(void) mi_strlog(mrouter->conn_rq, 1,
1808 				    SL_TRACE,
1809 				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
1810 				    "g 0x%x\n", hash,
1811 				    ntohl(mfc_rt->mfc_origin.s_addr),
1812 				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1813 			}
1814 			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
1815 			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
1816 			mfc_rt->mfc_rte = NULL;
1817 		}
1818 
1819 		/* Link in the upcall */
1820 		/* First upcall */
1821 		if (mfc_rt->mfc_rte == NULL)
1822 			mfc_rt->mfc_rte = rte;
1823 		else {
1824 			/* not the first upcall */
1825 			prev_rte = mfc_rt->mfc_rte;
1826 			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
1827 			    prev_rte = rte1, rte1 = rte1->rte_next)
1828 				;
1829 			prev_rte->rte_next = rte;
1830 		}
1831 
1832 		/*
1833 		 * No upcalls waiting, this is first one, so send a message to
1834 		 * routing daemon to install a route into kernel table.
1835 		 */
1836 		if (npkts == 0) {
1837 			struct igmpmsg	*im;
1838 			/* ipha_protocol is 0, for upcall */
1839 			ASSERT(mp_copy != NULL);
1840 			im = (struct igmpmsg *)mp_copy->b_rptr;
1841 			im->im_msgtype	= IGMPMSG_NOCACHE;
1842 			im->im_mbz = 0;
1843 			mutex_enter(&ipst->ips_numvifs_mutex);
1844 			if (pim_reg_packet) {
1845 				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
1846 				mutex_exit(&ipst->ips_numvifs_mutex);
1847 			} else {
1848 				/*
1849 				 * XXX do we need to hold locks here ?
1850 				 */
1851 				for (vifi = 0;
1852 				    vifi < ipst->ips_numvifs;
1853 				    vifi++) {
1854 					if (ipst->ips_vifs[vifi].v_ipif == NULL)
1855 						continue;
1856 					if (ipst->ips_vifs[vifi].
1857 					    v_ipif->ipif_ill == ill) {
1858 						im->im_vif = (uchar_t)vifi;
1859 						break;
1860 					}
1861 				}
1862 				mutex_exit(&ipst->ips_numvifs_mutex);
1863 				ASSERT(vifi < ipst->ips_numvifs);
1864 			}
1865 
1866 			ipst->ips_mrtstat->mrts_upcalls++;
1867 			/* Timer to discard upcalls if mrouted is too slow */
1868 			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
1869 			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
1870 			mutex_exit(&mfc_rt->mfc_mutex);
1871 			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
1872 			/* Pass to RAWIP */
1873 			ira->ira_ill = ira->ira_rill = NULL;
1874 			(mrouter->conn_recv)(mrouter, mp_copy, NULL, ira);
1875 			ira->ira_ill = ill;
1876 			ira->ira_rill = rill;
1877 		} else {
1878 			mutex_exit(&mfc_rt->mfc_mutex);
1879 			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
1880 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1881 			ip_drop_input("ip_mforward - upcall already waiting",
1882 			    mp_copy, ill);
1883 			freemsg(mp_copy);
1884 		}
1885 
1886 		MFCB_REFRELE(mfcbp);
1887 		if (tunnel_src != 0)
1888 			return (1);
1889 		else
1890 			return (0);
1891 	error_return:
1892 		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
1893 		MFCB_REFRELE(mfcbp);
1894 		if (mfc_rt != NULL && (new_mfc == B_TRUE))
1895 			mi_free((char *)mfc_rt);
1896 		if (rte != NULL)
1897 			mi_free((char *)rte);
1898 		if (mp_copy != NULL) {
1899 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1900 			ip_drop_input("ip_mforward error", mp_copy, ill);
1901 			freemsg(mp_copy);
1902 		}
1903 		if (mp0 != NULL)
1904 			freemsg(mp0);
1905 		return (-1);
1906 	}
1907 }
1908 
1909 /*
1910  * Clean up the mfctable cache entry if upcall is not serviced.
1911  * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1912  */
1913 static void
1914 expire_upcalls(void *arg)
1915 {
1916 	struct mfc *mfc_rt = arg;
1917 	uint_t hash;
1918 	struct mfc *prev_mfc, *mfc0;
1919 	ip_stack_t	*ipst;
1920 	conn_t		*mrouter;
1921 
1922 	if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) {
1923 		cmn_err(CE_WARN, "expire_upcalls: no ILL\n");
1924 		return;
1925 	}
1926 	ipst = mfc_rt->mfc_rte->ill->ill_ipst;
1927 	mrouter = ipst->ips_ip_g_mrouter;
1928 
1929 	hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr);
1930 	if (ipst->ips_ip_mrtdebug > 1) {
1931 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1932 		    "expire_upcalls: hash %d s %x g %x",
1933 		    hash, ntohl(mfc_rt->mfc_origin.s_addr),
1934 		    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1935 	}
1936 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1937 	mutex_enter(&mfc_rt->mfc_mutex);
1938 	/*
1939 	 * if timeout has been set to zero, than the
1940 	 * entry has been filled, no need to delete it.
1941 	 */
1942 	if (mfc_rt->mfc_timeout_id == 0)
1943 		goto done;
1944 	ipst->ips_mrtstat->mrts_cache_cleanups++;
1945 	mfc_rt->mfc_timeout_id = 0;
1946 
1947 	/* Determine entry to be cleaned up in cache table. */
1948 	for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0;
1949 	    prev_mfc = mfc0, mfc0 = mfc0->mfc_next)
1950 		if (mfc0 == mfc_rt)
1951 			break;
1952 
1953 	/* del_mfc takes care of gone mfcs */
1954 	ASSERT(prev_mfc != NULL);
1955 	ASSERT(mfc0 != NULL);
1956 
1957 	/*
1958 	 * Delete the entry from the cache
1959 	 */
1960 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1961 	mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1962 
1963 	/*
1964 	 * release_mfc will drop all queued upcall packets.
1965 	 * and will free the mbuf with the pkt, if, timing info.
1966 	 */
1967 done:
1968 	mutex_exit(&mfc_rt->mfc_mutex);
1969 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1970 }
1971 
1972 /*
1973  * Packet forwarding routine once entry in the cache is made.
1974  */
1975 static int
1976 ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
1977     struct mfc *rt)
1978 {
1979 	vifi_t vifi;
1980 	struct vif *vifp;
1981 	ipaddr_t dst = ipha->ipha_dst;
1982 	size_t  plen = msgdsize(mp);
1983 	vifi_t num_of_vifs;
1984 	ip_stack_t	*ipst = ill->ill_ipst;
1985 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1986 	ip_recv_attr_t	iras;
1987 
1988 	if (ipst->ips_ip_mrtdebug > 1) {
1989 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1990 		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
1991 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
1992 		    ill->ill_name);
1993 	}
1994 
1995 	/* Macro to send packet on vif */
1996 #define	MC_SEND(ipha, mp, vifp, dst) { \
1997 	if ((vifp)->v_flags & VIFF_TUNNEL) \
1998 		encap_send((ipha), (mp), (vifp), (dst)); \
1999 	else if ((vifp)->v_flags & VIFF_REGISTER) \
2000 		register_send((ipha), (mp), (vifp), (dst)); \
2001 	else \
2002 		phyint_send((ipha), (mp), (vifp), (dst)); \
2003 }
2004 
2005 	vifi = rt->mfc_parent;
2006 
2007 	/*
2008 	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
2009 	 * Mrouted had no route.
2010 	 * We wanted the route installed in the mfctable to prevent multiple
2011 	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
2012 	 * NULL so we don't want to check the ill. Still needed as of Mrouted
2013 	 * 3.6.
2014 	 */
2015 	if (vifi == NO_VIF) {
2016 		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
2017 		    ill->ill_name));
2018 		if (ipst->ips_ip_mrtdebug > 1) {
2019 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2020 			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
2021 		}
2022 		return (-1);	/* drop pkt */
2023 	}
2024 
2025 	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
2026 		return (-1);
2027 	/*
2028 	 * The MFC entries are not cleaned up when an ipif goes
2029 	 * away thus this code has to guard against an MFC referencing
2030 	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
2031 	 * sets the v_ipif to NULL when the ipif disappears.
2032 	 */
2033 	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);
2034 
2035 	if (vifi >= ipst->ips_numvifs) {
2036 		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
2037 		    "%d ill %s viftable ill %s\n",
2038 		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
2039 		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
2040 		unlock_good_vif(&ipst->ips_vifs[vifi]);
2041 		return (-1);
2042 	}
2043 	/*
2044 	 * Don't forward if it didn't arrive from the parent vif for its
2045 	 * origin.
2046 	 */
2047 	if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill) ||
2048 	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
2049 		/* Came in the wrong interface */
2050 		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
2051 			"numvifs %d ill %s viftable ill %s\n",
2052 			(int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
2053 			ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
2054 		if (ipst->ips_ip_mrtdebug > 1) {
2055 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2056 			    "ip_mdq: arrived wrong if, vifi %d ill "
2057 			    "%s viftable ill %s\n",
2058 			    (int)vifi, ill->ill_name,
2059 			    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
2060 		}
2061 		ipst->ips_mrtstat->mrts_wrong_if++;
2062 		rt->mfc_wrong_if++;
2063 
2064 		/*
2065 		 * If we are doing PIM assert processing and we are forwarding
2066 		 * packets on this interface, and it is a broadcast medium
2067 		 * interface (and not a tunnel), send a message to the routing.
2068 		 *
2069 		 * We use the first ipif on the list, since it's all we have.
2070 		 * Chances are the ipif_flags are the same for ipifs on the ill.
2071 		 */
2072 		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
2073 		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
2074 		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
2075 			mblk_t		*mp_copy;
2076 			struct igmpmsg	*im;
2077 
2078 			/* TODO could copy header and dup rest */
2079 			mp_copy = copymsg(mp);
2080 			if (mp_copy == NULL) {
2081 				ipst->ips_mrtstat->mrts_fwd_drop++;
2082 				ip1dbg(("ip_mdq: out of memory "
2083 				    "for mblk, mp_copy\n"));
2084 				unlock_good_vif(&ipst->ips_vifs[vifi]);
2085 				return (-1);
2086 			}
2087 
2088 			im = (struct igmpmsg *)mp_copy->b_rptr;
2089 			im->im_msgtype = IGMPMSG_WRONGVIF;
2090 			im->im_mbz = 0;
2091 			im->im_vif = (ushort_t)vifi;
2092 			/* Pass to RAWIP */
2093 
2094 			bzero(&iras, sizeof (iras));
2095 			iras.ira_flags = IRAF_IS_IPV4;
2096 			iras.ira_ip_hdr_length =
2097 			    IPH_HDR_LENGTH(mp_copy->b_rptr);
2098 			iras.ira_pktlen = msgdsize(mp_copy);
2099 			(mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
2100 			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2101 		}
2102 		unlock_good_vif(&ipst->ips_vifs[vifi]);
2103 		if (tunnel_src != 0)
2104 			return (1);
2105 		else
2106 			return (0);
2107 	}
2108 	/*
2109 	 * If I sourced this packet, it counts as output, else it was input.
2110 	 */
2111 	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
2112 		ipst->ips_vifs[vifi].v_pkt_out++;
2113 		ipst->ips_vifs[vifi].v_bytes_out += plen;
2114 	} else {
2115 		ipst->ips_vifs[vifi].v_pkt_in++;
2116 		ipst->ips_vifs[vifi].v_bytes_in += plen;
2117 	}
2118 	mutex_enter(&rt->mfc_mutex);
2119 	rt->mfc_pkt_cnt++;
2120 	rt->mfc_byte_cnt += plen;
2121 	mutex_exit(&rt->mfc_mutex);
2122 	unlock_good_vif(&ipst->ips_vifs[vifi]);
2123 	/*
2124 	 * For each vif, decide if a copy of the packet should be forwarded.
2125 	 * Forward if:
2126 	 *		- the vif threshold ttl is non-zero AND
2127 	 *		- the pkt ttl exceeds the vif's threshold
2128 	 * A non-zero mfc_ttl indicates that the vif is part of
2129 	 * the output set for the mfc entry.
2130 	 */
2131 	mutex_enter(&ipst->ips_numvifs_mutex);
2132 	num_of_vifs = ipst->ips_numvifs;
2133 	mutex_exit(&ipst->ips_numvifs_mutex);
2134 	for (vifp = ipst->ips_vifs, vifi = 0;
2135 	    vifi < num_of_vifs;
2136 	    vifp++, vifi++) {
2137 		if (!lock_good_vif(vifp))
2138 			continue;
2139 		if ((rt->mfc_ttls[vifi] > 0) &&
2140 		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
2141 			/*
2142 			 * lock_good_vif should not have succedded if
2143 			 * v_ipif is null.
2144 			 */
2145 			ASSERT(vifp->v_ipif != NULL);
2146 			vifp->v_pkt_out++;
2147 			vifp->v_bytes_out += plen;
2148 			MC_SEND(ipha, mp, vifp, dst);
2149 			ipst->ips_mrtstat->mrts_fwd_out++;
2150 		}
2151 		unlock_good_vif(vifp);
2152 	}
2153 	if (tunnel_src != 0)
2154 		return (1);
2155 	else
2156 		return (0);
2157 }
2158 
2159 /*
2160  * Send the packet on physical interface.
2161  * Caller assumes can continue to use mp on return.
2162  */
2163 /* ARGSUSED */
2164 static void
2165 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2166 {
2167 	mblk_t 	*mp_copy;
2168 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2169 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2170 
2171 	/* Make a new reference to the packet */
2172 	mp_copy = copymsg(mp);	/* TODO could copy header and dup rest */
2173 	if (mp_copy == NULL) {
2174 		ipst->ips_mrtstat->mrts_fwd_drop++;
2175 		ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
2176 		return;
2177 	}
2178 	if (vifp->v_rate_limit <= 0)
2179 		tbf_send_packet(vifp, mp_copy);
2180 	else  {
2181 		if (ipst->ips_ip_mrtdebug > 1) {
2182 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2183 			    "phyint_send: tbf_contr rate %d "
2184 			    "vifp 0x%p mp 0x%p dst 0x%x",
2185 			    vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
2186 		}
2187 		tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
2188 	}
2189 }
2190 
2191 /*
2192  * Send the whole packet for REGISTER encapsulation to PIM daemon
2193  * Caller assumes it can continue to use mp on return.
2194  */
2195 /* ARGSUSED */
2196 static void
2197 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2198 {
2199 	struct igmpmsg	*im;
2200 	mblk_t		*mp_copy;
2201 	ipha_t		*ipha_copy;
2202 	ill_t		*ill = vifp->v_ipif->ipif_ill;
2203 	ip_stack_t	*ipst = ill->ill_ipst;
2204 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2205 	ip_recv_attr_t	iras;
2206 
2207 	if (ipst->ips_ip_mrtdebug > 1) {
2208 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2209 		    "register_send: src %x, dst %x\n",
2210 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2211 	}
2212 
2213 	/*
2214 	 * Copy the old packet & pullup its IP header into the new mblk_t so we
2215 	 * can modify it.  Try to fill the new mblk_t since if we don't the
2216 	 * ethernet driver will.
2217 	 */
2218 	mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
2219 	if (mp_copy == NULL) {
2220 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2221 		if (ipst->ips_ip_mrtdebug > 3) {
2222 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2223 			    "register_send: allocb failure.");
2224 		}
2225 		return;
2226 	}
2227 
2228 	/*
2229 	 * Bump write pointer to account for igmpmsg being added.
2230 	 */
2231 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg);
2232 
2233 	/*
2234 	 * Chain packet to new mblk_t.
2235 	 */
2236 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2237 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2238 		if (ipst->ips_ip_mrtdebug > 3) {
2239 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2240 			    "register_send: copymsg failure.");
2241 		}
2242 		freeb(mp_copy);
2243 		return;
2244 	}
2245 
2246 	/*
2247 	 * icmp_input() asserts that IP version field is set to an
2248 	 * appropriate version. Hence, the struct igmpmsg that this really
2249 	 * becomes, needs to have the correct IP version field.
2250 	 */
2251 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
2252 	*ipha_copy = multicast_encap_iphdr;
2253 
2254 	/*
2255 	 * The kernel uses the struct igmpmsg header to encode the messages to
2256 	 * the multicast routing daemon. Fill in the fields in the header
2257 	 * starting with the message type which is IGMPMSG_WHOLEPKT
2258 	 */
2259 	im = (struct igmpmsg *)mp_copy->b_rptr;
2260 	im->im_msgtype = IGMPMSG_WHOLEPKT;
2261 	im->im_src.s_addr = ipha->ipha_src;
2262 	im->im_dst.s_addr = ipha->ipha_dst;
2263 
2264 	/*
2265 	 * Must Be Zero. This is because the struct igmpmsg is really an IP
2266 	 * header with renamed fields and the multicast routing daemon uses
2267 	 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
2268 	 */
2269 	im->im_mbz = 0;
2270 
2271 	++ipst->ips_mrtstat->mrts_upcalls;
2272 	if (IPCL_IS_NONSTR(mrouter) ? mrouter->conn_flow_cntrld :
2273 	    !canputnext(mrouter->conn_rq)) {
2274 		++ipst->ips_mrtstat->mrts_pim_regsend_drops;
2275 		if (ipst->ips_ip_mrtdebug > 3) {
2276 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2277 			    "register_send: register upcall failure.");
2278 		}
2279 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2280 		ip_drop_input("mrts_pim_regsend_drops", mp_copy, ill);
2281 		freemsg(mp_copy);
2282 	} else {
2283 		/* Pass to RAWIP */
2284 		bzero(&iras, sizeof (iras));
2285 		iras.ira_flags = IRAF_IS_IPV4;
2286 		iras.ira_ip_hdr_length = sizeof (ipha_t);
2287 		iras.ira_pktlen = msgdsize(mp_copy);
2288 		(mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
2289 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2290 	}
2291 }
2292 
2293 /*
2294  * pim_validate_cksum handles verification of the checksum in the
2295  * pim header.  For PIM Register packets, the checksum is calculated
2296  * across the PIM header only.  For all other packets, the checksum
2297  * is for the PIM header and remainder of the packet.
2298  *
2299  * returns: B_TRUE, if checksum is okay.
2300  *          B_FALSE, if checksum is not valid.
2301  */
2302 static boolean_t
2303 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
2304 {
2305 	mblk_t *mp_dup;
2306 
2307 	if ((mp_dup = dupmsg(mp)) == NULL)
2308 		return (B_FALSE);
2309 
2310 	mp_dup->b_rptr += IPH_HDR_LENGTH(ip);
2311 	if (pimp->pim_type == PIM_REGISTER)
2312 		mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN;
2313 	if (IP_CSUM(mp_dup, 0, 0)) {
2314 		freemsg(mp_dup);
2315 		return (B_FALSE);
2316 	}
2317 	freemsg(mp_dup);
2318 	return (B_TRUE);
2319 }
2320 
2321 /*
2322  * Process PIM protocol packets i.e. IP Protocol 103.
2323  * Register messages are decapsulated and sent onto multicast forwarding.
2324  *
2325  * Return NULL for a bad packet that is discarded here.
2326  * Return mp if the message is OK and should be handed to "raw" receivers.
2327  * Callers of pim_input() may need to reinitialize variables that were copied
2328  * from the mblk as this calls pullupmsg().
2329  */
2330 mblk_t *
2331 pim_input(mblk_t *mp, ip_recv_attr_t *ira)
2332 {
2333 	ipha_t		*eip, *ip;
2334 	int		iplen, pimlen, iphlen;
2335 	struct pim	*pimp;	/* pointer to a pim struct */
2336 	uint32_t	*reghdr;
2337 	ill_t		*ill = ira->ira_ill;
2338 	ip_stack_t	*ipst = ill->ill_ipst;
2339 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2340 
2341 	/*
2342 	 * Pullup the msg for PIM protocol processing.
2343 	 */
2344 	if (pullupmsg(mp, -1) == 0) {
2345 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2346 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2347 		ip_drop_input("mrts_pim_nomemory", mp, ill);
2348 		freemsg(mp);
2349 		return (NULL);
2350 	}
2351 
2352 	ip = (ipha_t *)mp->b_rptr;
2353 	iplen = ip->ipha_length;
2354 	iphlen = IPH_HDR_LENGTH(ip);
2355 	pimlen = ntohs(iplen) - iphlen;
2356 
2357 	/*
2358 	 * Validate lengths
2359 	 */
2360 	if (pimlen < PIM_MINLEN) {
2361 		++ipst->ips_mrtstat->mrts_pim_malformed;
2362 		if (ipst->ips_ip_mrtdebug > 1) {
2363 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2364 			    "pim_input: length not at least minlen");
2365 		}
2366 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2367 		ip_drop_input("mrts_pim_malformed", mp, ill);
2368 		freemsg(mp);
2369 		return (NULL);
2370 	}
2371 
2372 	/*
2373 	 * Point to the PIM header.
2374 	 */
2375 	pimp = (struct pim *)((caddr_t)ip + iphlen);
2376 
2377 	/*
2378 	 * Check the version number.
2379 	 */
2380 	if (pimp->pim_vers != PIM_VERSION) {
2381 		++ipst->ips_mrtstat->mrts_pim_badversion;
2382 		if (ipst->ips_ip_mrtdebug > 1) {
2383 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2384 			    "pim_input: unknown version of PIM");
2385 		}
2386 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2387 		ip_drop_input("mrts_pim_badversion", mp, ill);
2388 		freemsg(mp);
2389 		return (NULL);
2390 	}
2391 
2392 	/*
2393 	 * Validate the checksum
2394 	 */
2395 	if (!pim_validate_cksum(mp, ip, pimp)) {
2396 		++ipst->ips_mrtstat->mrts_pim_rcv_badcsum;
2397 		if (ipst->ips_ip_mrtdebug > 1) {
2398 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2399 			    "pim_input: invalid checksum");
2400 		}
2401 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2402 		ip_drop_input("pim_rcv_badcsum", mp, ill);
2403 		freemsg(mp);
2404 		return (NULL);
2405 	}
2406 
2407 	if (pimp->pim_type != PIM_REGISTER)
2408 		return (mp);
2409 
2410 	reghdr = (uint32_t *)(pimp + 1);
2411 	eip = (ipha_t *)(reghdr + 1);
2412 
2413 	/*
2414 	 * check if the inner packet is destined to mcast group
2415 	 */
2416 	if (!CLASSD(eip->ipha_dst)) {
2417 		++ipst->ips_mrtstat->mrts_pim_badregisters;
2418 		if (ipst->ips_ip_mrtdebug > 1) {
2419 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2420 			    "pim_input: Inner pkt not mcast .. !");
2421 		}
2422 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2423 		ip_drop_input("mrts_pim_badregisters", mp, ill);
2424 		freemsg(mp);
2425 		return (NULL);
2426 	}
2427 	if (ipst->ips_ip_mrtdebug > 1) {
2428 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2429 		    "register from %x, to %x, len %d",
2430 		    ntohl(eip->ipha_src),
2431 		    ntohl(eip->ipha_dst),
2432 		    ntohs(eip->ipha_length));
2433 	}
2434 	/*
2435 	 * If the null register bit is not set, decapsulate
2436 	 * the packet before forwarding it.
2437 	 * Avoid this in no register vif
2438 	 */
2439 	if (!(ntohl(*reghdr) & PIM_NULL_REGISTER) &&
2440 	    ipst->ips_reg_vif_num != ALL_VIFS) {
2441 		mblk_t *mp_copy;
2442 		uint_t saved_pktlen;
2443 
2444 		/* Copy the message */
2445 		if ((mp_copy = copymsg(mp)) == NULL) {
2446 			++ipst->ips_mrtstat->mrts_pim_nomemory;
2447 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2448 			ip_drop_input("mrts_pim_nomemory", mp, ill);
2449 			freemsg(mp);
2450 			return (NULL);
2451 		}
2452 
2453 		/*
2454 		 * Decapsulate the packet and give it to
2455 		 * register_mforward.
2456 		 */
2457 		mp_copy->b_rptr += iphlen + sizeof (pim_t) + sizeof (*reghdr);
2458 		saved_pktlen = ira->ira_pktlen;
2459 		ira->ira_pktlen -= iphlen + sizeof (pim_t) + sizeof (*reghdr);
2460 		if (register_mforward(mp_copy, ira) != 0) {
2461 			/* register_mforward already called ip_drop_input */
2462 			freemsg(mp);
2463 			ira->ira_pktlen = saved_pktlen;
2464 			return (NULL);
2465 		}
2466 		ira->ira_pktlen = saved_pktlen;
2467 	}
2468 
2469 	/*
2470 	 * Pass all valid PIM packets up to any process(es) listening on a raw
2471 	 * PIM socket. For Solaris it is done right after pim_input() is
2472 	 * called.
2473 	 */
2474 	return (mp);
2475 }
2476 
2477 /*
2478  * PIM sparse mode hook.  Called by pim_input after decapsulating
2479  * the packet. Loop back the packet, as if we have received it.
2480  * In pim_input() we have to check if the destination is a multicast address.
2481  */
2482 static int
2483 register_mforward(mblk_t *mp, ip_recv_attr_t *ira)
2484 {
2485 	ire_t		*ire;
2486 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2487 	ill_t		*ill = ira->ira_ill;
2488 	ip_stack_t	*ipst = ill->ill_ipst;
2489 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2490 
2491 	ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs);
2492 
2493 	if (ipst->ips_ip_mrtdebug > 3) {
2494 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2495 		    "register_mforward: src %x, dst %x\n",
2496 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2497 	}
2498 	/*
2499 	 * Need to pass in to ip_mforward() the information that the
2500 	 * packet has arrived on the register_vif. We mark it with
2501 	 * the IRAF_PIM_REGISTER attribute.
2502 	 * pim_input verified that the (inner) destination is multicast,
2503 	 * hence we skip the generic code in ip_input.
2504 	 */
2505 	ira->ira_flags |= IRAF_PIM_REGISTER;
2506 	++ipst->ips_mrtstat->mrts_pim_regforwards;
2507 
2508 	if (!CLASSD(ipha->ipha_dst)) {
2509 		ire = ire_route_recursive_v4(ipha->ipha_dst, 0, NULL, ALL_ZONES,
2510 		    ira->ira_tsl, MATCH_IRE_SECATTR, IRR_ALLOCATE, 0, ipst,
2511 		    NULL, NULL, NULL);
2512 	} else {
2513 		ire = ire_multicast(ill);
2514 	}
2515 	ASSERT(ire != NULL);
2516 	/* Normally this will return the IRE_MULTICAST */
2517 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2518 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2519 		ip_drop_input("mrts_pim RTF_REJECT", mp, ill);
2520 		freemsg(mp);
2521 		ire_refrele(ire);
2522 		return (-1);
2523 	}
2524 	ASSERT(ire->ire_type & IRE_MULTICAST);
2525 	(*ire->ire_recvfn)(ire, mp, ipha, ira);
2526 	ire_refrele(ire);
2527 
2528 	return (0);
2529 }
2530 
2531 /*
2532  * Send an encapsulated packet.
2533  * Caller assumes can continue to use mp when routine returns.
2534  */
2535 /* ARGSUSED */
2536 static void
2537 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2538 {
2539 	mblk_t 	*mp_copy;
2540 	ipha_t 	*ipha_copy;
2541 	size_t	len;
2542 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2543 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2544 
2545 	if (ipst->ips_ip_mrtdebug > 1) {
2546 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2547 		    "encap_send: vif %ld enter",
2548 		    (ptrdiff_t)(vifp - ipst->ips_vifs));
2549 	}
2550 	len = ntohs(ipha->ipha_length);
2551 
2552 	/*
2553 	 * Copy the old packet & pullup it's IP header into the
2554 	 * new mbuf so we can modify it.  Try to fill the new
2555 	 * mbuf since if we don't the ethernet driver will.
2556 	 */
2557 	mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED);
2558 	if (mp_copy == NULL)
2559 		return;
2560 	mp_copy->b_rptr += 32;
2561 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr);
2562 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2563 		freeb(mp_copy);
2564 		return;
2565 	}
2566 
2567 	/*
2568 	 * Fill in the encapsulating IP header.
2569 	 * Remote tunnel dst in rmt_addr, from add_vif().
2570 	 */
2571 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
2572 	*ipha_copy = multicast_encap_iphdr;
2573 	ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET);
2574 	ipha_copy->ipha_length = htons(len + sizeof (ipha_t));
2575 	ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr;
2576 	ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr;
2577 	ASSERT(ipha_copy->ipha_ident == 0);
2578 
2579 	/* Turn the encapsulated IP header back into a valid one. */
2580 	ipha = (ipha_t *)mp_copy->b_cont->b_rptr;
2581 	ipha->ipha_ttl--;
2582 	ipha->ipha_hdr_checksum = 0;
2583 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2584 
2585 	ipha_copy->ipha_ttl = ipha->ipha_ttl;
2586 
2587 	if (ipst->ips_ip_mrtdebug > 1) {
2588 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2589 		    "encap_send: group 0x%x", ntohl(ipha->ipha_dst));
2590 	}
2591 	if (vifp->v_rate_limit <= 0)
2592 		tbf_send_packet(vifp, mp_copy);
2593 	else
2594 		/* ipha is from the original header */
2595 		tbf_control(vifp, mp_copy, ipha);
2596 }
2597 
2598 /*
2599  * De-encapsulate a packet and feed it back through IP input if it
2600  * matches one of our multicast tunnels.
2601  *
2602  * This routine is called whenever IP gets a packet with prototype
2603  * IPPROTO_ENCAP and a local destination address and the packet didn't
2604  * match one of our configured IP-in-IP tunnels.
2605  */
2606 void
2607 ip_mroute_decap(mblk_t *mp, ip_recv_attr_t *ira)
2608 {
2609 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2610 	ipha_t		*ipha_encap;
2611 	int		hlen = IPH_HDR_LENGTH(ipha);
2612 	int		hlen_encap;
2613 	ipaddr_t	src;
2614 	struct vif	*vifp;
2615 	ire_t		*ire;
2616 	ill_t		*ill = ira->ira_ill;
2617 	ip_stack_t	*ipst = ill->ill_ipst;
2618 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2619 
2620 	/* Make sure we have all of the inner header */
2621 	ipha_encap = (ipha_t *)((char *)ipha + hlen);
2622 	if (mp->b_wptr - mp->b_rptr < hlen + IP_SIMPLE_HDR_LENGTH) {
2623 		ipha = ip_pullup(mp, hlen + IP_SIMPLE_HDR_LENGTH, ira);
2624 		if (ipha == NULL) {
2625 			ipst->ips_mrtstat->mrts_bad_tunnel++;
2626 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2627 			ip_drop_input("ip_mroute_decap: too short", mp, ill);
2628 			freemsg(mp);
2629 			return;
2630 		}
2631 		ipha_encap = (ipha_t *)((char *)ipha + hlen);
2632 	}
2633 	hlen_encap = IPH_HDR_LENGTH(ipha_encap);
2634 	if (mp->b_wptr - mp->b_rptr < hlen + hlen_encap) {
2635 		ipha = ip_pullup(mp, hlen + hlen_encap, ira);
2636 		if (ipha == NULL) {
2637 			ipst->ips_mrtstat->mrts_bad_tunnel++;
2638 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2639 			ip_drop_input("ip_mroute_decap: too short", mp, ill);
2640 			freemsg(mp);
2641 			return;
2642 		}
2643 		ipha_encap = (ipha_t *)((char *)ipha + hlen);
2644 	}
2645 
2646 	/*
2647 	 * Dump the packet if it's not to a multicast destination or if
2648 	 * we don't have an encapsulating tunnel with the source.
2649 	 * Note:  This code assumes that the remote site IP address
2650 	 * uniquely identifies the tunnel (i.e., that this site has
2651 	 * at most one tunnel with the remote site).
2652 	 */
2653 	if (!CLASSD(ipha_encap->ipha_dst)) {
2654 		ipst->ips_mrtstat->mrts_bad_tunnel++;
2655 		ip1dbg(("ip_mroute_decap: bad tunnel\n"));
2656 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2657 		ip_drop_input("mrts_bad_tunnel", mp, ill);
2658 		freemsg(mp);
2659 		return;
2660 	}
2661 	src = (ipaddr_t)ipha->ipha_src;
2662 	mutex_enter(&ipst->ips_last_encap_lock);
2663 	if (src != ipst->ips_last_encap_src) {
2664 		struct vif *vife;
2665 
2666 		vifp = ipst->ips_vifs;
2667 		vife = vifp + ipst->ips_numvifs;
2668 		ipst->ips_last_encap_src = src;
2669 		ipst->ips_last_encap_vif = 0;
2670 		for (; vifp < vife; ++vifp) {
2671 			if (!lock_good_vif(vifp))
2672 				continue;
2673 			if (vifp->v_rmt_addr.s_addr == src) {
2674 				if (vifp->v_flags & VIFF_TUNNEL)
2675 					ipst->ips_last_encap_vif = vifp;
2676 				if (ipst->ips_ip_mrtdebug > 1) {
2677 					(void) mi_strlog(mrouter->conn_rq,
2678 					    1, SL_TRACE,
2679 					    "ip_mroute_decap: good tun "
2680 					    "vif %ld with %x",
2681 					    (ptrdiff_t)(vifp - ipst->ips_vifs),
2682 					    ntohl(src));
2683 				}
2684 				unlock_good_vif(vifp);
2685 				break;
2686 			}
2687 			unlock_good_vif(vifp);
2688 		}
2689 	}
2690 	if ((vifp = ipst->ips_last_encap_vif) == 0) {
2691 		mutex_exit(&ipst->ips_last_encap_lock);
2692 		ipst->ips_mrtstat->mrts_bad_tunnel++;
2693 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2694 		ip_drop_input("mrts_bad_tunnel", mp, ill);
2695 		freemsg(mp);
2696 		ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
2697 		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src)));
2698 		return;
2699 	}
2700 	mutex_exit(&ipst->ips_last_encap_lock);
2701 
2702 	/*
2703 	 * Need to pass in the tunnel source to ip_mforward (so that it can
2704 	 * verify that the packet arrived over the correct vif.)
2705 	 */
2706 	ira->ira_flags |= IRAF_MROUTE_TUNNEL_SET;
2707 	ira->ira_mroute_tunnel = src;
2708 	mp->b_rptr += hlen;
2709 	ira->ira_pktlen -= hlen;
2710 	ira->ira_ip_hdr_length = hlen_encap;
2711 
2712 	/*
2713 	 * We don't redo any of the filtering in ill_input_full_v4 and we
2714 	 * have checked that all of ipha_encap and any IP options are
2715 	 * pulled up. Hence we call ire_recv_multicast_v4 directly.
2716 	 * However, we have to check for RSVP as in ip_input_full_v4
2717 	 * and if so we pass it to ire_recv_broadcast_v4 for local delivery
2718 	 * to the rsvpd.
2719 	 */
2720 	if (ipha_encap->ipha_protocol == IPPROTO_RSVP &&
2721 	    ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) {
2722 		ire = ire_route_recursive_v4(INADDR_BROADCAST, 0, ill,
2723 		    ALL_ZONES, ira->ira_tsl, MATCH_IRE_ILL|MATCH_IRE_SECATTR,
2724 		    IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
2725 	} else {
2726 		ire = ire_multicast(ill);
2727 	}
2728 	ASSERT(ire != NULL);
2729 	/* Normally this will return the IRE_MULTICAST or IRE_BROADCAST */
2730 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2731 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2732 		ip_drop_input("ip_mroute_decap: RTF_REJECT", mp, ill);
2733 		freemsg(mp);
2734 		ire_refrele(ire);
2735 		return;
2736 	}
2737 	ire->ire_ib_pkt_count++;
2738 	ASSERT(ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST));
2739 	(*ire->ire_recvfn)(ire, mp, ipha_encap, ira);
2740 	ire_refrele(ire);
2741 }
2742 
2743 /*
2744  * Remove all records with v_ipif == ipif.  Called when an interface goes away
2745  * (stream closed).  Called as writer.
2746  */
2747 void
2748 reset_mrt_vif_ipif(ipif_t *ipif)
2749 {
2750 	vifi_t vifi, tmp_vifi;
2751 	vifi_t num_of_vifs;
2752 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
2753 
2754 	/* Can't check vifi >= 0 since vifi_t is unsigned! */
2755 
2756 	mutex_enter(&ipst->ips_numvifs_mutex);
2757 	num_of_vifs = ipst->ips_numvifs;
2758 	mutex_exit(&ipst->ips_numvifs_mutex);
2759 
2760 	for (vifi = num_of_vifs; vifi != 0; vifi--) {
2761 		tmp_vifi = vifi - 1;
2762 		if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) {
2763 			(void) del_vif(&tmp_vifi, ipst);
2764 		}
2765 	}
2766 }
2767 
2768 /* Remove pending upcall msgs when ill goes away.  Called by ill_delete.  */
2769 void
2770 reset_mrt_ill(ill_t *ill)
2771 {
2772 	struct mfc	*rt;
2773 	struct rtdetq	*rte;
2774 	int		i;
2775 	ip_stack_t	*ipst = ill->ill_ipst;
2776 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2777 	timeout_id_t	id;
2778 
2779 	for (i = 0; i < MFCTBLSIZ; i++) {
2780 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
2781 		if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) {
2782 			if (ipst->ips_ip_mrtdebug > 1) {
2783 				(void) mi_strlog(mrouter->conn_rq, 1,
2784 				    SL_TRACE,
2785 				    "reset_mrt_ill: mfctable [%d]", i);
2786 			}
2787 			while (rt != NULL) {
2788 				mutex_enter(&rt->mfc_mutex);
2789 				while ((rte = rt->mfc_rte) != NULL) {
2790 					if (rte->ill == ill &&
2791 					    (id = rt->mfc_timeout_id) != 0) {
2792 						/*
2793 						 * Its ok to drop the lock,  the
2794 						 * struct cannot be freed since
2795 						 * we have a ref on the hash
2796 						 * bucket.
2797 						 */
2798 						mutex_exit(&rt->mfc_mutex);
2799 						(void) untimeout(id);
2800 						mutex_enter(&rt->mfc_mutex);
2801 					}
2802 					if (rte->ill == ill) {
2803 						if (ipst->ips_ip_mrtdebug > 1) {
2804 						(void) mi_strlog(
2805 						    mrouter->conn_rq,
2806 						    1, SL_TRACE,
2807 						    "reset_mrt_ill: "
2808 						    "ill 0x%p", (void *)ill);
2809 						}
2810 						rt->mfc_rte = rte->rte_next;
2811 						freemsg(rte->mp);
2812 						mi_free((char *)rte);
2813 					}
2814 				}
2815 				mutex_exit(&rt->mfc_mutex);
2816 				rt = rt->mfc_next;
2817 			}
2818 		}
2819 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
2820 	}
2821 }
2822 
2823 /*
2824  * Token bucket filter module.
2825  * The ipha is for mcastgrp destination for phyint and encap.
2826  */
2827 static void
2828 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
2829 {
2830 	size_t 	p_len =  msgdsize(mp);
2831 	struct tbf	*t    = vifp->v_tbf;
2832 	timeout_id_t id = 0;
2833 	ill_t		*ill = vifp->v_ipif->ipif_ill;
2834 	ip_stack_t	*ipst = ill->ill_ipst;
2835 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2836 
2837 	/* Drop if packet is too large */
2838 	if (p_len > MAX_BKT_SIZE) {
2839 		ipst->ips_mrtstat->mrts_pkt2large++;
2840 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2841 		ip_drop_output("tbf_control - too large", mp, ill);
2842 		freemsg(mp);
2843 		return;
2844 	}
2845 	if (ipst->ips_ip_mrtdebug > 1) {
2846 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2847 		    "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
2848 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len,
2849 		    ntohl(ipha->ipha_dst));
2850 	}
2851 
2852 	mutex_enter(&t->tbf_lock);
2853 
2854 	tbf_update_tokens(vifp);
2855 
2856 	/*
2857 	 * If there are enough tokens,
2858 	 * and the queue is empty, send this packet out.
2859 	 */
2860 	if (ipst->ips_ip_mrtdebug > 1) {
2861 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2862 		    "tbf_control: vif %ld, TOKENS  %d, pkt len  %lu, qlen  %d",
2863 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len,
2864 		    t->tbf_q_len);
2865 	}
2866 	/* No packets are queued */
2867 	if (t->tbf_q_len == 0) {
2868 		/* queue empty, send packet if enough tokens */
2869 		if (p_len <= t->tbf_n_tok) {
2870 			t->tbf_n_tok -= p_len;
2871 			mutex_exit(&t->tbf_lock);
2872 			tbf_send_packet(vifp, mp);
2873 			return;
2874 		} else {
2875 			/* Queue packet and timeout till later */
2876 			tbf_queue(vifp, mp);
2877 			ASSERT(vifp->v_timeout_id == 0);
2878 			vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2879 			    TBF_REPROCESS);
2880 		}
2881 	} else if (t->tbf_q_len < t->tbf_max_q_len) {
2882 		/* Finite queue length, so queue pkts and process queue */
2883 		tbf_queue(vifp, mp);
2884 		tbf_process_q(vifp);
2885 	} else {
2886 		/* Check that we have UDP header with IP header */
2887 		size_t hdr_length = IPH_HDR_LENGTH(ipha) +
2888 		    sizeof (struct udphdr);
2889 
2890 		if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
2891 			if (!pullupmsg(mp, hdr_length)) {
2892 				BUMP_MIB(ill->ill_ip_mib,
2893 				    ipIfStatsOutDiscards);
2894 				ip_drop_output("tbf_control - pullup", mp, ill);
2895 				freemsg(mp);
2896 				ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
2897 				    "vif %ld src 0x%x dst 0x%x\n",
2898 				    (ptrdiff_t)(vifp - ipst->ips_vifs),
2899 				    ntohl(ipha->ipha_src),
2900 				    ntohl(ipha->ipha_dst)));
2901 				mutex_exit(&vifp->v_tbf->tbf_lock);
2902 				return;
2903 			} else
2904 				/* Have to reassign ipha after pullupmsg */
2905 				ipha = (ipha_t *)mp->b_rptr;
2906 		}
2907 		/*
2908 		 * Queue length too much,
2909 		 * try to selectively dq, or queue and process
2910 		 */
2911 		if (!tbf_dq_sel(vifp, ipha)) {
2912 			ipst->ips_mrtstat->mrts_q_overflow++;
2913 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2914 			ip_drop_output("mrts_q_overflow", mp, ill);
2915 			freemsg(mp);
2916 		} else {
2917 			tbf_queue(vifp, mp);
2918 			tbf_process_q(vifp);
2919 		}
2920 	}
2921 	if (t->tbf_q_len == 0) {
2922 		id = vifp->v_timeout_id;
2923 		vifp->v_timeout_id = 0;
2924 	}
2925 	mutex_exit(&vifp->v_tbf->tbf_lock);
2926 	if (id != 0)
2927 		(void) untimeout(id);
2928 }
2929 
2930 /*
2931  * Adds a packet to the tbf queue at the interface.
2932  * The ipha is for mcastgrp destination for phyint and encap.
2933  */
2934 static void
2935 tbf_queue(struct vif *vifp, mblk_t *mp)
2936 {
2937 	struct tbf	*t = vifp->v_tbf;
2938 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2939 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2940 
2941 	if (ipst->ips_ip_mrtdebug > 1) {
2942 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2943 		    "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs));
2944 	}
2945 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2946 
2947 	if (t->tbf_t == NULL) {
2948 		/* Queue was empty */
2949 		t->tbf_q = mp;
2950 	} else {
2951 		/* Insert at tail */
2952 		t->tbf_t->b_next = mp;
2953 	}
2954 	/* set new tail pointer */
2955 	t->tbf_t = mp;
2956 
2957 	mp->b_next = mp->b_prev = NULL;
2958 
2959 	t->tbf_q_len++;
2960 }
2961 
2962 /*
2963  * Process the queue at the vif interface.
2964  * Drops the tbf_lock when sending packets.
2965  *
2966  * NOTE : The caller should quntimeout if the queue length is 0.
2967  */
2968 static void
2969 tbf_process_q(struct vif *vifp)
2970 {
2971 	mblk_t	*mp;
2972 	struct tbf	*t = vifp->v_tbf;
2973 	size_t	len;
2974 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2975 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2976 
2977 	if (ipst->ips_ip_mrtdebug > 1) {
2978 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2979 		    "tbf_process_q 1: vif %ld qlen = %d",
2980 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len);
2981 	}
2982 
2983 	/*
2984 	 * Loop through the queue at the interface and send
2985 	 * as many packets as possible.
2986 	 */
2987 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2988 
2989 	while (t->tbf_q_len > 0) {
2990 		mp = t->tbf_q;
2991 		len = (size_t)msgdsize(mp); /* length of ip pkt */
2992 
2993 		/* Determine if the packet can be sent */
2994 		if (len <= t->tbf_n_tok) {
2995 			/*
2996 			 * If so, reduce no. of tokens, dequeue the packet,
2997 			 * send the packet.
2998 			 */
2999 			t->tbf_n_tok -= len;
3000 
3001 			t->tbf_q = mp->b_next;
3002 			if (--t->tbf_q_len == 0) {
3003 				t->tbf_t = NULL;
3004 			}
3005 			mp->b_next = NULL;
3006 			/* Exit mutex before sending packet, then re-enter */
3007 			mutex_exit(&t->tbf_lock);
3008 			tbf_send_packet(vifp, mp);
3009 			mutex_enter(&t->tbf_lock);
3010 		} else
3011 			break;
3012 	}
3013 }
3014 
3015 /* Called at tbf timeout to update tokens, process q and reset timer.  */
3016 static void
3017 tbf_reprocess_q(void *arg)
3018 {
3019 	struct vif *vifp = arg;
3020 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3021 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3022 
3023 	mutex_enter(&vifp->v_tbf->tbf_lock);
3024 	vifp->v_timeout_id = 0;
3025 	tbf_update_tokens(vifp);
3026 
3027 	tbf_process_q(vifp);
3028 
3029 	if (vifp->v_tbf->tbf_q_len > 0) {
3030 		vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
3031 		    TBF_REPROCESS);
3032 	}
3033 	mutex_exit(&vifp->v_tbf->tbf_lock);
3034 
3035 	if (ipst->ips_ip_mrtdebug > 1) {
3036 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3037 		    "tbf_reprcess_q: vif %ld timeout id = %p",
3038 		    (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id);
3039 	}
3040 }
3041 
3042 /*
3043  * Function that will selectively discard a member of the tbf queue,
3044  * based on the precedence value and the priority.
3045  *
3046  * NOTE : The caller should quntimeout if the queue length is 0.
3047  */
3048 static int
3049 tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
3050 {
3051 	uint_t		p;
3052 	struct tbf		*t = vifp->v_tbf;
3053 	mblk_t		**np;
3054 	mblk_t		*last, *mp;
3055 	ill_t		*ill = vifp->v_ipif->ipif_ill;
3056 	ip_stack_t	*ipst = ill->ill_ipst;
3057 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3058 
3059 	if (ipst->ips_ip_mrtdebug > 1) {
3060 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3061 		    "dq_sel: vif %ld dst 0x%x",
3062 		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst));
3063 	}
3064 
3065 	ASSERT(MUTEX_HELD(&t->tbf_lock));
3066 	p = priority(vifp, ipha);
3067 
3068 	np = &t->tbf_q;
3069 	last = NULL;
3070 	while ((mp = *np) != NULL) {
3071 		if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) {
3072 			*np = mp->b_next;
3073 			/* If removing the last packet, fix the tail pointer */
3074 			if (mp == t->tbf_t)
3075 				t->tbf_t = last;
3076 			mp->b_prev = mp->b_next = NULL;
3077 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3078 			ip_drop_output("tbf_dq_send", mp, ill);
3079 			freemsg(mp);
3080 			/*
3081 			 * It's impossible for the queue to be empty, but
3082 			 * we check anyway.
3083 			 */
3084 			if (--t->tbf_q_len == 0) {
3085 				t->tbf_t = NULL;
3086 			}
3087 			ipst->ips_mrtstat->mrts_drop_sel++;
3088 			return (1);
3089 		}
3090 		np = &mp->b_next;
3091 		last = mp;
3092 	}
3093 	return (0);
3094 }
3095 
3096 /* Sends packet, 2 cases - encap tunnel, phyint.  */
3097 static void
3098 tbf_send_packet(struct vif *vifp, mblk_t *mp)
3099 {
3100 	ipif_t		*ipif = vifp->v_ipif;
3101 	ill_t		*ill = ipif->ipif_ill;
3102 	ip_stack_t	*ipst = ill->ill_ipst;
3103 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3104 	ipha_t		*ipha;
3105 
3106 	ipha = (ipha_t *)mp->b_rptr;
3107 	/* If encap tunnel options */
3108 	if (vifp->v_flags & VIFF_TUNNEL)  {
3109 		ip_xmit_attr_t	ixas;
3110 
3111 		if (ipst->ips_ip_mrtdebug > 1) {
3112 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3113 			    "tbf_send_packet: ENCAP tunnel vif %ld",
3114 			    (ptrdiff_t)(vifp - ipst->ips_vifs));
3115 		}
3116 		bzero(&ixas, sizeof (ixas));
3117 		ixas.ixa_flags =
3118 		    IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE | IXAF_VERIFY_SOURCE;
3119 		ixas.ixa_ipst = ipst;
3120 		ixas.ixa_ifindex = 0;
3121 		ixas.ixa_cred = kcred;
3122 		ixas.ixa_cpid = NOPID;
3123 		ixas.ixa_tsl = NULL;
3124 		ixas.ixa_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
3125 		ixas.ixa_pktlen = ntohs(ipha->ipha_length);
3126 		ixas.ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
3127 
3128 		/*
3129 		 * Feed into ip_output_simple which will set the ident field
3130 		 * and checksum the encapsulating header.
3131 		 * BSD gets the cached route vifp->v_route from ip_output()
3132 		 * to speed up route table lookups. Not necessary in SunOS 5.x.
3133 		 * One could make multicast forwarding faster by putting an
3134 		 * ip_xmit_attr_t in each vif thereby caching the ire/nce.
3135 		 */
3136 		(void) ip_output_simple(mp, &ixas);
3137 		ixa_cleanup(&ixas);
3138 		return;
3139 
3140 		/* phyint */
3141 	} else {
3142 		/* Need to loop back to members on the outgoing interface. */
3143 		ipaddr_t	dst;
3144 		ip_recv_attr_t	iras;
3145 		nce_t		*nce;
3146 
3147 		bzero(&iras, sizeof (iras));
3148 		iras.ira_flags = IRAF_IS_IPV4;
3149 		iras.ira_ill = iras.ira_rill = ill;
3150 		iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3151 		iras.ira_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
3152 		iras.ira_pktlen = ntohs(ipha->ipha_length);
3153 		iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
3154 
3155 		dst = ipha->ipha_dst;
3156 		if (ill_hasmembers_v4(ill, dst)) {
3157 			iras.ira_flags |= IRAF_LOOPBACK_COPY;
3158 		}
3159 		if (ipst->ips_ip_mrtdebug > 1) {
3160 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3161 			    "tbf_send_pkt: phyint forward  vif %ld dst = 0x%x",
3162 			    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst));
3163 		}
3164 		/*
3165 		 * Find an NCE which matches the nexthop.
3166 		 * For a pt-pt interface we use the other end of the pt-pt
3167 		 * link.
3168 		 */
3169 		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
3170 			dst = ipif->ipif_pp_dst_addr;
3171 			nce = arp_nce_init(ill, dst, ill->ill_net_type);
3172 		} else {
3173 			nce = arp_nce_init(ill, dst, IRE_MULTICAST);
3174 		}
3175 		if (nce == NULL) {
3176 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3177 			ip_drop_output("tbf_send_packet - no nce", mp, ill);
3178 			freemsg(mp);
3179 			return;
3180 		}
3181 
3182 		/*
3183 		 * We don't remeber the incoming ill. Thus we
3184 		 * pretend the  packet arrived on the outbound ill. This means
3185 		 * statistics for input errors will be increased on the wrong
3186 		 * ill but that isn't a big deal.
3187 		 */
3188 		ip_forward_xmit_v4(nce, ill, mp, ipha, &iras, ill->ill_mtu, 0);
3189 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3190 
3191 		nce_refrele(nce);
3192 	}
3193 }
3194 
3195 /*
3196  * Determine the current time and then the elapsed time (between the last time
3197  * and time now).  Update the no. of tokens in the bucket.
3198  */
3199 static void
3200 tbf_update_tokens(struct vif *vifp)
3201 {
3202 	timespec_t	tp;
3203 	hrtime_t	tm;
3204 	struct tbf	*t = vifp->v_tbf;
3205 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3206 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3207 
3208 	ASSERT(MUTEX_HELD(&t->tbf_lock));
3209 
3210 	/* Time in secs and nsecs, rate limit in kbits/sec */
3211 	gethrestime(&tp);
3212 
3213 	/*LINTED*/
3214 	TV_DELTA(tp, t->tbf_last_pkt_t, tm);
3215 
3216 	/*
3217 	 * This formula is actually
3218 	 * "time in seconds" * "bytes/second".  Scaled for nsec.
3219 	 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
3220 	 *
3221 	 * The (1000/1024) was introduced in add_vif to optimize
3222 	 * this divide into a shift.
3223 	 */
3224 	t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8;
3225 	t->tbf_last_pkt_t = tp;
3226 
3227 	if (t->tbf_n_tok > MAX_BKT_SIZE)
3228 		t->tbf_n_tok = MAX_BKT_SIZE;
3229 	if (ipst->ips_ip_mrtdebug > 1) {
3230 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3231 		    "tbf_update_tok: tm %lld tok %d vif %ld",
3232 		    tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs));
3233 	}
3234 }
3235 
3236 /*
3237  * Priority currently is based on port nos.
3238  * Different forwarding mechanisms have different ways
3239  * of obtaining the port no. Hence, the vif must be
3240  * given along with the packet itself.
3241  *
3242  */
3243 static int
3244 priority(struct vif *vifp, ipha_t *ipha)
3245 {
3246 	int prio;
3247 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3248 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3249 
3250 	/* Temporary hack; may add general packet classifier some day */
3251 
3252 	ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock));
3253 
3254 	/*
3255 	 * The UDP port space is divided up into four priority ranges:
3256 	 * [0, 16384)	: unclassified - lowest priority
3257 	 * [16384, 32768)	: audio - highest priority
3258 	 * [32768, 49152)	: whiteboard - medium priority
3259 	 * [49152, 65536)	: video - low priority
3260 	 */
3261 
3262 	if (ipha->ipha_protocol == IPPROTO_UDP) {
3263 		struct udphdr *udp =
3264 		    (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha));
3265 		switch (ntohs(udp->uh_dport) & 0xc000) {
3266 		case 0x4000:
3267 			prio = 70;
3268 			break;
3269 		case 0x8000:
3270 			prio = 60;
3271 			break;
3272 		case 0xc000:
3273 			prio = 55;
3274 			break;
3275 		default:
3276 			prio = 50;
3277 			break;
3278 		}
3279 		if (ipst->ips_ip_mrtdebug > 1) {
3280 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3281 			    "priority: port %x prio %d\n",
3282 			    ntohs(udp->uh_dport), prio);
3283 		}
3284 	} else
3285 		prio = 50;  /* default priority */
3286 	return (prio);
3287 }
3288 
3289 /*
3290  * End of token bucket filter modifications
3291  */
3292 
3293 
3294 
3295 /*
3296  * Produces data for netstat -M.
3297  */
3298 int
3299 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst)
3300 {
3301 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
3302 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
3303 	if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat,
3304 		sizeof (struct mrtstat))) {
3305 		ip0dbg(("ip_mroute_stats: failed %ld bytes\n",
3306 		    (size_t)sizeof (struct mrtstat)));
3307 		return (0);
3308 	}
3309 	return (1);
3310 }
3311 
3312 /*
3313  * Sends info for SNMP's MIB.
3314  */
3315 int
3316 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst)
3317 {
3318 	struct vifctl 	vi;
3319 	vifi_t		vifi;
3320 
3321 	mutex_enter(&ipst->ips_numvifs_mutex);
3322 	for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) {
3323 		if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0)
3324 			continue;
3325 		/*
3326 		 * No locks here, an approximation is fine.
3327 		 */
3328 		vi.vifc_vifi = vifi;
3329 		vi.vifc_flags = ipst->ips_vifs[vifi].v_flags;
3330 		vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold;
3331 		vi.vifc_rate_limit	= ipst->ips_vifs[vifi].v_rate_limit;
3332 		vi.vifc_lcl_addr	= ipst->ips_vifs[vifi].v_lcl_addr;
3333 		vi.vifc_rmt_addr	= ipst->ips_vifs[vifi].v_rmt_addr;
3334 		vi.vifc_pkt_in		= ipst->ips_vifs[vifi].v_pkt_in;
3335 		vi.vifc_pkt_out		= ipst->ips_vifs[vifi].v_pkt_out;
3336 
3337 		if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) {
3338 			ip0dbg(("ip_mroute_vif: failed %ld bytes\n",
3339 			    (size_t)sizeof (vi)));
3340 			mutex_exit(&ipst->ips_numvifs_mutex);
3341 			return (0);
3342 		}
3343 	}
3344 	mutex_exit(&ipst->ips_numvifs_mutex);
3345 	return (1);
3346 }
3347 
3348 /*
3349  * Called by ip_snmp_get to send up multicast routing table.
3350  */
3351 int
3352 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst)
3353 {
3354 	int			i, j;
3355 	struct mfc		*rt;
3356 	struct mfcctl	mfcc;
3357 
3358 	/*
3359 	 * Make sure multicast has not been turned off.
3360 	 */
3361 	if (is_mrouter_off(ipst))
3362 		return (1);
3363 
3364 	/* Loop over all hash buckets and their chains */
3365 	for (i = 0; i < MFCTBLSIZ; i++) {
3366 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
3367 		for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) {
3368 			mutex_enter(&rt->mfc_mutex);
3369 			if (rt->mfc_rte != NULL ||
3370 			    (rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
3371 				mutex_exit(&rt->mfc_mutex);
3372 				continue;
3373 			}
3374 			mfcc.mfcc_origin = rt->mfc_origin;
3375 			mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp;
3376 			mfcc.mfcc_parent = rt->mfc_parent;
3377 			mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt;
3378 			mutex_enter(&ipst->ips_numvifs_mutex);
3379 			for (j = 0; j < (int)ipst->ips_numvifs; j++)
3380 				mfcc.mfcc_ttls[j] = rt->mfc_ttls[j];
3381 			for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++)
3382 				mfcc.mfcc_ttls[j] = 0;
3383 			mutex_exit(&ipst->ips_numvifs_mutex);
3384 
3385 			mutex_exit(&rt->mfc_mutex);
3386 			if (!snmp_append_data(mp, (char *)&mfcc,
3387 			    sizeof (mfcc))) {
3388 				MFCB_REFRELE(&ipst->ips_mfcs[i]);
3389 				ip0dbg(("ip_mroute_mrt: failed %ld bytes\n",
3390 				    (size_t)sizeof (mfcc)));
3391 				return (0);
3392 			}
3393 		}
3394 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
3395 	}
3396 	return (1);
3397 }
3398