xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_rts.c (revision b6805bf78d2bbbeeaea8909a05623587b42d58b3)
1 /*
2  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
3  */
4 
5 /*
6  * Copyright (c) 1988, 1991, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed by the University of
20  *	California, Berkeley and its contributors.
21  * 4. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *	@(#)rtsock.c	8.6 (Berkeley) 2/11/95
38  */
39 
40 /*
41  * This file contains routines that process routing socket requests.
42  */
43 
44 #include <sys/types.h>
45 #include <sys/stream.h>
46 #include <sys/stropts.h>
47 #include <sys/ddi.h>
48 #include <sys/strsubr.h>
49 #include <sys/cmn_err.h>
50 #include <sys/debug.h>
51 #include <sys/policy.h>
52 #include <sys/zone.h>
53 
54 #include <sys/systm.h>
55 #include <sys/param.h>
56 #include <sys/socket.h>
57 #include <sys/strsun.h>
58 #include <net/if.h>
59 #include <net/route.h>
60 #include <netinet/in.h>
61 #include <net/if_dl.h>
62 #include <netinet/ip6.h>
63 
64 #include <inet/common.h>
65 #include <inet/ip.h>
66 #include <inet/ip6.h>
67 #include <inet/ip_if.h>
68 #include <inet/ip_ire.h>
69 #include <inet/ip_ftable.h>
70 #include <inet/ip_rts.h>
71 
72 #include <inet/ipclassifier.h>
73 
74 #include <sys/tsol/tndb.h>
75 #include <sys/tsol/tnet.h>
76 
/*
 * Size of a routing socket message: the sum of rts_header_msg_size(type)
 * and rts_data_msg_size(rtm_addrs, af, sacnt).
 */
#define	RTS_MSG_SIZE(type, rtm_addrs, af, sacnt) \
	(rts_header_msg_size(type) + rts_data_msg_size(rtm_addrs, af, sacnt))
79 
80 static size_t	rts_copyfromsockaddr(struct sockaddr *sa, in6_addr_t *addrp);
81 static void	rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst,
82     ipaddr_t mask, ipaddr_t gateway, ipaddr_t src_addr, ipaddr_t brd_addr,
83     ipaddr_t author, ipaddr_t ifaddr, const ill_t *ill, mblk_t *mp,
84     const tsol_gc_t *);
85 static int	rts_getaddrs(rt_msghdr_t *rtm, in6_addr_t *dst_addrp,
86     in6_addr_t *gw_addrp, in6_addr_t *net_maskp, in6_addr_t *authorp,
87     in6_addr_t *if_addrp, in6_addr_t *src_addrp, ushort_t *indexp,
88     sa_family_t *afp, tsol_rtsecattr_t *rtsecattr, int *error);
89 static void	rts_getifdata(if_data_t *if_data, const ipif_t *ipif);
90 static int	rts_getmetrics(ire_t *ire, ill_t *ill, rt_metrics_t *metrics);
91 static mblk_t	*rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *ifire,
92     const in6_addr_t *setsrc, tsol_ire_gw_secattr_t *attrp, sa_family_t af);
93 static void	rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics);
94 static ire_t	*ire_lookup_v4(ipaddr_t dst_addr, ipaddr_t net_mask,
95     ipaddr_t gw_addr, const ill_t *ill, zoneid_t zoneid,
96     const ts_label_t *tsl, int match_flags, ip_stack_t *ipst, ire_t **pifire,
97     ipaddr_t *v4setsrcp, tsol_ire_gw_secattr_t **gwattrp);
98 static ire_t	*ire_lookup_v6(const in6_addr_t *dst_addr_v6,
99     const in6_addr_t *net_mask_v6, const in6_addr_t *gw_addr_v6,
100     const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, int match_flags,
101     ip_stack_t *ipst, ire_t **pifire,
102     in6_addr_t *v6setsrcp, tsol_ire_gw_secattr_t **gwattrp);
103 
104 /*
105  * Send `mp' to all eligible routing queues.  A queue is ineligible if:
106  *
107  *  1. SO_USELOOPBACK is off and it is not the originating queue.
108  *  2. RTA_UNDER_IPMP is on and RTSQ_UNDER_IPMP is not set in `flags'.
109  *  3. RTA_UNDER_IPMP is off and RTSQ_NORMAL is not set in `flags'.
110  *  4. It is not the same address family as `af', and `af' isn't AF_UNSPEC.
111  */
112 void
113 rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags,
114     ip_stack_t *ipst)
115 {
116 	mblk_t	*mp1;
117 	conn_t 	*connp, *next_connp;
118 
119 	/*
120 	 * Since we don't have an ill_t here, RTSQ_DEFAULT must already be
121 	 * resolved to one or more of RTSQ_NORMAL|RTSQ_UNDER_IPMP at this point.
122 	 */
123 	ASSERT(!(flags & RTSQ_DEFAULT));
124 
125 	mutex_enter(&ipst->ips_rts_clients->connf_lock);
126 	connp = ipst->ips_rts_clients->connf_head;
127 
128 	for (; connp != NULL; connp = next_connp) {
129 		next_connp = connp->conn_next;
130 		/*
131 		 * If there was a family specified when this routing socket was
132 		 * created and it doesn't match the family of the message to
133 		 * copy, then continue.
134 		 */
135 		if ((connp->conn_proto != AF_UNSPEC) &&
136 		    (connp->conn_proto != af))
137 			continue;
138 
139 		/*
140 		 * Queue the message only if the conn_t and flags match.
141 		 */
142 		if (connp->conn_rtaware & RTAW_UNDER_IPMP) {
143 			if (!(flags & RTSQ_UNDER_IPMP))
144 				continue;
145 		} else {
146 			if (!(flags & RTSQ_NORMAL))
147 				continue;
148 		}
149 		/*
150 		 * For the originating queue, we only copy the message upstream
151 		 * if loopback is set.  For others reading on the routing
152 		 * socket, we check if there is room upstream for a copy of the
153 		 * message.
154 		 */
155 		if ((o_connp == connp) && connp->conn_useloopback == 0) {
156 			connp = connp->conn_next;
157 			continue;
158 		}
159 		CONN_INC_REF(connp);
160 		mutex_exit(&ipst->ips_rts_clients->connf_lock);
161 		/* Pass to rts_input */
162 		if (IPCL_IS_NONSTR(connp) ? !connp->conn_flow_cntrld :
163 		    canputnext(connp->conn_rq)) {
164 			mp1 = dupmsg(mp);
165 			if (mp1 == NULL)
166 				mp1 = copymsg(mp);
167 			/* Note that we pass a NULL ira to rts_input */
168 			if (mp1 != NULL)
169 				(connp->conn_recv)(connp, mp1, NULL, NULL);
170 		}
171 
172 		mutex_enter(&ipst->ips_rts_clients->connf_lock);
173 		/* reload next_connp since conn_next may have changed */
174 		next_connp = connp->conn_next;
175 		CONN_DEC_REF(connp);
176 	}
177 	mutex_exit(&ipst->ips_rts_clients->connf_lock);
178 	freemsg(mp);
179 }
180 
181 /*
182  * Takes an ire and sends an ack to all the routing sockets. This
183  * routine is used
184  * - when a route is created/deleted through the ioctl interface.
185  * - when a stale redirect is deleted
186  */
187 void
188 ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst)
189 {
190 	mblk_t		*mp;
191 	rt_msghdr_t	*rtm;
192 	int		rtm_addrs = (RTA_DST | RTA_NETMASK | RTA_GATEWAY);
193 	sa_family_t	af;
194 	in6_addr_t	gw_addr_v6;
195 
196 	if (ire == NULL)
197 		return;
198 	ASSERT(ire->ire_ipversion == IPV4_VERSION ||
199 	    ire->ire_ipversion == IPV6_VERSION);
200 
201 	ASSERT(!(ire->ire_type & IRE_IF_CLONE));
202 
203 	if (ire->ire_flags & RTF_SETSRC)
204 		rtm_addrs |= RTA_SRC;
205 
206 	switch (ire->ire_ipversion) {
207 	case IPV4_VERSION:
208 		af = AF_INET;
209 		mp = rts_alloc_msg(type, rtm_addrs, af, 0);
210 		if (mp == NULL)
211 			return;
212 		rts_fill_msg(type, rtm_addrs, ire->ire_addr, ire->ire_mask,
213 		    ire->ire_gateway_addr, ire->ire_setsrc_addr, 0, 0, 0, NULL,
214 		    mp, NULL);
215 		break;
216 	case IPV6_VERSION:
217 		af = AF_INET6;
218 		mp = rts_alloc_msg(type, rtm_addrs, af, 0);
219 		if (mp == NULL)
220 			return;
221 		mutex_enter(&ire->ire_lock);
222 		gw_addr_v6 = ire->ire_gateway_addr_v6;
223 		mutex_exit(&ire->ire_lock);
224 		rts_fill_msg_v6(type, rtm_addrs, &ire->ire_addr_v6,
225 		    &ire->ire_mask_v6, &gw_addr_v6,
226 		    &ire->ire_setsrc_addr_v6, &ipv6_all_zeros, &ipv6_all_zeros,
227 		    &ipv6_all_zeros, NULL, mp, NULL);
228 		break;
229 	}
230 	rtm = (rt_msghdr_t *)mp->b_rptr;
231 	mp->b_wptr = (uchar_t *)&mp->b_rptr[rtm->rtm_msglen];
232 	rtm->rtm_addrs = rtm_addrs;
233 	rtm->rtm_flags = ire->ire_flags;
234 	if (error != 0)
235 		rtm->rtm_errno = error;
236 	else
237 		rtm->rtm_flags |= RTF_DONE;
238 	rts_queue_input(mp, NULL, af, RTSQ_ALL, ipst);
239 }
240 
241 /*
242  * This is a call from the RTS module
243  * indicating that this is a Routing Socket
244  * Stream. Insert this conn_t in routing
245  * socket client list.
246  */
247 void
248 ip_rts_register(conn_t *connp)
249 {
250 	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
251 
252 	connp->conn_useloopback = 1;
253 	ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp);
254 }
255 
256 /*
257  * This is a call from the RTS module indicating that it is closing.
258  */
259 void
260 ip_rts_unregister(conn_t *connp)
261 {
262 	ipcl_hash_remove(connp);
263 }
264 
265 /*
266  * Processes requests received on a routing socket. It extracts all the
267  * arguments and calls the appropriate function to process the request.
268  *
269  * RTA_SRC bit flag requests are sent by 'route -setsrc'.
270  *
271  * In general, this function does not consume the message supplied but rather
272  * sends the message upstream with an appropriate UNIX errno.
273  */
274 int
275 ip_rts_request_common(mblk_t *mp, conn_t *connp, cred_t *ioc_cr)
276 {
277 	rt_msghdr_t	*rtm = NULL;
278 	in6_addr_t	dst_addr_v6;
279 	in6_addr_t	src_addr_v6;
280 	in6_addr_t	gw_addr_v6;
281 	in6_addr_t	net_mask_v6;
282 	in6_addr_t	author_v6;
283 	in6_addr_t	if_addr_v6;
284 	mblk_t		*mp1;
285 	ire_t		*ire = NULL;
286 	ire_t		*ifire = NULL;
287 	ipaddr_t	v4setsrc;
288 	in6_addr_t	v6setsrc = ipv6_all_zeros;
289 	tsol_ire_gw_secattr_t *gwattr = NULL;
290 	int		error = 0;
291 	int		match_flags = MATCH_IRE_DSTONLY;
292 	int		match_flags_local = MATCH_IRE_TYPE | MATCH_IRE_GW;
293 	int		found_addrs;
294 	sa_family_t	af;
295 	ipaddr_t	dst_addr;
296 	ipaddr_t	gw_addr;
297 	ipaddr_t	src_addr;
298 	ipaddr_t	net_mask;
299 	ushort_t	index;
300 	boolean_t	gcgrp_xtraref = B_FALSE;
301 	tsol_gcgrp_addr_t ga;
302 	tsol_rtsecattr_t rtsecattr;
303 	struct rtsa_s	*rtsap = NULL;
304 	tsol_gcgrp_t	*gcgrp = NULL;
305 	tsol_gc_t	*gc = NULL;
306 	ts_label_t	*tsl = NULL;
307 	zoneid_t	zoneid;
308 	ip_stack_t	*ipst;
309 	ill_t   	*ill = NULL;
310 
311 	zoneid = connp->conn_zoneid;
312 	ipst = connp->conn_netstack->netstack_ip;
313 
314 	if (mp->b_cont != NULL && !pullupmsg(mp, -1)) {
315 		freemsg(mp);
316 		error =  EINVAL;
317 		goto done;
318 	}
319 	if ((mp->b_wptr - mp->b_rptr) < sizeof (rt_msghdr_t)) {
320 		freemsg(mp);
321 		error = EINVAL;
322 		goto done;
323 	}
324 
325 	/*
326 	 * Check the routing message for basic consistency including the
327 	 * version number and that the number of octets written is the same
328 	 * as specified by the rtm_msglen field.
329 	 *
330 	 * At this point, an error can be delivered back via rtm_errno.
331 	 */
332 	rtm = (rt_msghdr_t *)mp->b_rptr;
333 	if ((mp->b_wptr - mp->b_rptr) != rtm->rtm_msglen) {
334 		error = EINVAL;
335 		goto done;
336 	}
337 	if (rtm->rtm_version != RTM_VERSION) {
338 		error = EPROTONOSUPPORT;
339 		goto done;
340 	}
341 
342 	/* Only allow RTM_GET or RTM_RESOLVE for unprivileged process */
343 	if (rtm->rtm_type != RTM_GET &&
344 	    rtm->rtm_type != RTM_RESOLVE &&
345 	    (ioc_cr == NULL ||
346 	    secpolicy_ip_config(ioc_cr, B_FALSE) != 0)) {
347 		error = EPERM;
348 		goto done;
349 	}
350 
351 	found_addrs = rts_getaddrs(rtm, &dst_addr_v6, &gw_addr_v6, &net_mask_v6,
352 	    &author_v6, &if_addr_v6, &src_addr_v6, &index, &af, &rtsecattr,
353 	    &error);
354 
355 	if (error != 0)
356 		goto done;
357 
358 	if ((found_addrs & RTA_DST) == 0) {
359 		error = EINVAL;
360 		goto done;
361 	}
362 
363 	/*
364 	 * Based on the address family of the destination address, determine
365 	 * the destination, gateway and netmask and return the appropriate error
366 	 * if an unknown address family was specified (following the errno
367 	 * values that 4.4BSD-Lite2 returns.)
368 	 */
369 	switch (af) {
370 	case AF_INET:
371 		IN6_V4MAPPED_TO_IPADDR(&dst_addr_v6, dst_addr);
372 		IN6_V4MAPPED_TO_IPADDR(&src_addr_v6, src_addr);
373 		IN6_V4MAPPED_TO_IPADDR(&gw_addr_v6, gw_addr);
374 		if (((found_addrs & RTA_NETMASK) == 0) ||
375 		    (rtm->rtm_flags & RTF_HOST))
376 			net_mask = IP_HOST_MASK;
377 		else
378 			IN6_V4MAPPED_TO_IPADDR(&net_mask_v6, net_mask);
379 		break;
380 	case AF_INET6:
381 		if (((found_addrs & RTA_NETMASK) == 0) ||
382 		    (rtm->rtm_flags & RTF_HOST))
383 			net_mask_v6 = ipv6_all_ones;
384 		break;
385 	default:
386 		/*
387 		 * These errno values are meant to be compatible with
388 		 * 4.4BSD-Lite2 for the given message types.
389 		 */
390 		switch (rtm->rtm_type) {
391 		case RTM_ADD:
392 		case RTM_DELETE:
393 			error = ESRCH;
394 			goto done;
395 		case RTM_GET:
396 		case RTM_CHANGE:
397 			error = EAFNOSUPPORT;
398 			goto done;
399 		default:
400 			error = EOPNOTSUPP;
401 			goto done;
402 		}
403 	}
404 
405 	/*
406 	 * At this point, the address family must be something known.
407 	 */
408 	ASSERT(af == AF_INET || af == AF_INET6);
409 
410 	/* Handle RTA_IFP */
411 	if (index != 0) {
412 		ipif_t		*ipif;
413 lookup:
414 		ill = ill_lookup_on_ifindex(index, af == AF_INET6, ipst);
415 		if (ill == NULL) {
416 			error = EINVAL;
417 			goto done;
418 		}
419 
420 		/*
421 		 * Since all interfaces in an IPMP group must be equivalent,
422 		 * we prevent changes to a specific underlying interface's
423 		 * routing configuration.  However, for backward compatibility,
424 		 * we intepret a request to add a route on an underlying
425 		 * interface as a request to add a route on its IPMP interface.
426 		 */
427 		if (IS_UNDER_IPMP(ill)) {
428 			switch (rtm->rtm_type) {
429 			case RTM_CHANGE:
430 			case RTM_DELETE:
431 				error = EINVAL;
432 				goto done;
433 			case RTM_ADD:
434 				index = ipmp_ill_get_ipmp_ifindex(ill);
435 				ill_refrele(ill);
436 				if (index == 0) {
437 					ill = NULL; /* already refrele'd */
438 					error = EINVAL;
439 					goto done;
440 				}
441 				goto lookup;
442 			}
443 		}
444 
445 		match_flags |= MATCH_IRE_ILL;
446 		/*
447 		 * This provides the same zoneid as in Solaris 10
448 		 * that -ifp picks the zoneid from the first ipif on the ill.
449 		 * But it might not be useful since the first ipif will always
450 		 * have the same zoneid as the ill.
451 		 */
452 		ipif = ipif_get_next_ipif(NULL, ill);
453 		if (ipif != NULL) {
454 			zoneid = ipif->ipif_zoneid;
455 			ipif_refrele(ipif);
456 		}
457 	}
458 
459 	/*
460 	 * If a netmask was supplied in the message, then subsequent route
461 	 * lookups will attempt to match on the netmask as well.
462 	 */
463 	if ((found_addrs & RTA_NETMASK) != 0)
464 		match_flags |= MATCH_IRE_MASK;
465 
466 	/*
467 	 * We only process any passed-in route security attributes for
468 	 * either RTM_ADD or RTM_CHANGE message; We overload them
469 	 * to do an RTM_GET as a different label; ignore otherwise.
470 	 */
471 	if (rtm->rtm_type == RTM_ADD || rtm->rtm_type == RTM_CHANGE ||
472 	    rtm->rtm_type == RTM_GET) {
473 		ASSERT(rtsecattr.rtsa_cnt <= TSOL_RTSA_REQUEST_MAX);
474 		if (rtsecattr.rtsa_cnt > 0)
475 			rtsap = &rtsecattr.rtsa_attr[0];
476 	}
477 
478 	switch (rtm->rtm_type) {
479 	case RTM_ADD:
480 		/* if we are adding a route, gateway is a must */
481 		if ((found_addrs & RTA_GATEWAY) == 0) {
482 			error = EINVAL;
483 			goto done;
484 		}
485 
486 		/* Multirouting does not support net routes. */
487 		if ((rtm->rtm_flags & (RTF_MULTIRT | RTF_HOST)) ==
488 		    RTF_MULTIRT) {
489 			error = EADDRNOTAVAIL;
490 			goto done;
491 		}
492 
493 		/*
494 		 * Multirouting and user-specified source addresses
495 		 * do not support interface based routing.
496 		 * Assigning a source address to an interface based
497 		 * route is achievable by plumbing a new ipif and
498 		 * setting up the interface route via this ipif,
499 		 * though.
500 		 */
501 		if (rtm->rtm_flags & (RTF_MULTIRT | RTF_SETSRC)) {
502 			if ((rtm->rtm_flags & RTF_GATEWAY) == 0) {
503 				error = EADDRNOTAVAIL;
504 				goto done;
505 			}
506 		}
507 
508 		switch (af) {
509 		case AF_INET:
510 			if (src_addr != INADDR_ANY) {
511 				uint_t type;
512 
513 				/*
514 				 * The RTF_SETSRC flag is present, check that
515 				 * the supplied src address is not the loopback
516 				 * address. This would produce martian packets.
517 				 */
518 				if (src_addr == htonl(INADDR_LOOPBACK)) {
519 					error = EINVAL;
520 					goto done;
521 				}
522 				/*
523 				 * Also check that the supplied address is a
524 				 * valid, local one. Only allow IFF_UP ones
525 				 */
526 				type = ip_type_v4(src_addr, ipst);
527 				if (!(type & (IRE_LOCAL|IRE_LOOPBACK))) {
528 					error = EADDRNOTAVAIL;
529 					goto done;
530 				}
531 			} else {
532 				/*
533 				 * The RTF_SETSRC modifier must be associated
534 				 * to a non-null source address.
535 				 */
536 				if (rtm->rtm_flags & RTF_SETSRC) {
537 					error = EINVAL;
538 					goto done;
539 				}
540 			}
541 
542 			error = ip_rt_add(dst_addr, net_mask, gw_addr, src_addr,
543 			    rtm->rtm_flags, ill, &ire, B_FALSE,
544 			    rtsap, ipst, zoneid);
545 			if (ill != NULL)
546 				ASSERT(!MUTEX_HELD(&ill->ill_lock));
547 			break;
548 		case AF_INET6:
549 			if (!IN6_IS_ADDR_UNSPECIFIED(&src_addr_v6)) {
550 				uint_t type;
551 
552 				/*
553 				 * The RTF_SETSRC flag is present, check that
554 				 * the supplied src address is not the loopback
555 				 * address. This would produce martian packets.
556 				 */
557 				if (IN6_IS_ADDR_LOOPBACK(&src_addr_v6)) {
558 					error = EINVAL;
559 					goto done;
560 				}
561 				/*
562 				 * Also check that the supplied address is a
563 				 * valid, local one. Only allow UP ones.
564 				 */
565 				type = ip_type_v6(&src_addr_v6, ipst);
566 				if (!(type & (IRE_LOCAL|IRE_LOOPBACK))) {
567 					error = EADDRNOTAVAIL;
568 					goto done;
569 				}
570 
571 				error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6,
572 				    &gw_addr_v6, &src_addr_v6, rtm->rtm_flags,
573 				    ill, &ire, rtsap, ipst, zoneid);
574 				break;
575 			}
576 			/*
577 			 * The RTF_SETSRC modifier must be associated
578 			 * to a non-null source address.
579 			 */
580 			if (rtm->rtm_flags & RTF_SETSRC) {
581 				error = EINVAL;
582 				goto done;
583 			}
584 			error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6,
585 			    &gw_addr_v6, NULL, rtm->rtm_flags,
586 			    ill, &ire, rtsap, ipst, zoneid);
587 			if (ill != NULL)
588 				ASSERT(!MUTEX_HELD(&ill->ill_lock));
589 			break;
590 		}
591 		if (error != 0)
592 			goto done;
593 		ASSERT(ire != NULL);
594 		rts_setmetrics(ire, rtm->rtm_inits, &rtm->rtm_rmx);
595 		break;
596 	case RTM_DELETE:
597 		/* if we are deleting a route, gateway is a must */
598 		if ((found_addrs & RTA_GATEWAY) == 0) {
599 			error = EINVAL;
600 			goto done;
601 		}
602 		/*
603 		 * The RTF_SETSRC modifier does not make sense
604 		 * when deleting a route.
605 		 */
606 		if (rtm->rtm_flags & RTF_SETSRC) {
607 			error = EINVAL;
608 			goto done;
609 		}
610 
611 		switch (af) {
612 		case AF_INET:
613 			error = ip_rt_delete(dst_addr, net_mask, gw_addr,
614 			    found_addrs, rtm->rtm_flags, ill, B_FALSE,
615 			    ipst, zoneid);
616 			break;
617 		case AF_INET6:
618 			error = ip_rt_delete_v6(&dst_addr_v6, &net_mask_v6,
619 			    &gw_addr_v6, found_addrs, rtm->rtm_flags, ill,
620 			    ipst, zoneid);
621 			break;
622 		}
623 		break;
624 	case RTM_GET:
625 	case RTM_CHANGE:
626 		/*
627 		 * In the case of RTM_GET, the forwarding table should be
628 		 * searched recursively.  Also, if a gateway was
629 		 * specified then the gateway address must also be matched.
630 		 *
631 		 * In the case of RTM_CHANGE, the gateway address (if supplied)
632 		 * is the new gateway address so matching on the gateway address
633 		 * is not done.  This can lead to ambiguity when looking up the
634 		 * route to change as usually only the destination (and netmask,
635 		 * if supplied) is used for the lookup.  However if a RTA_IFP
636 		 * sockaddr is also supplied, it can disambiguate which route to
637 		 * change provided the ambigous routes are tied to distinct
638 		 * ill's (or interface indices).  If the routes are not tied to
639 		 * any particular interfaces (for example, with traditional
640 		 * gateway routes), then a RTA_IFP sockaddr will be of no use as
641 		 * it won't match any such routes.
642 		 * RTA_SRC is not supported for RTM_GET and RTM_CHANGE,
643 		 * except when RTM_CHANGE is combined to RTF_SETSRC.
644 		 */
645 		if (((found_addrs & RTA_SRC) != 0) &&
646 		    ((rtm->rtm_type == RTM_GET) ||
647 		    !(rtm->rtm_flags & RTF_SETSRC))) {
648 			error = EOPNOTSUPP;
649 			goto done;
650 		}
651 
652 		if (rtm->rtm_type == RTM_GET) {
653 			match_flags |= MATCH_IRE_SECATTR;
654 			match_flags_local |= MATCH_IRE_SECATTR;
655 			if ((found_addrs & RTA_GATEWAY) != 0)
656 				match_flags |= MATCH_IRE_GW;
657 			if (ioc_cr)
658 				tsl = crgetlabel(ioc_cr);
659 			if (rtsap != NULL) {
660 				if (rtsa_validate(rtsap) != 0) {
661 					error = EINVAL;
662 					goto done;
663 				}
664 				if (tsl != NULL &&
665 				    crgetzoneid(ioc_cr) != GLOBAL_ZONEID &&
666 				    (tsl->tsl_doi != rtsap->rtsa_doi ||
667 				    !bldominates(&tsl->tsl_label,
668 				    &rtsap->rtsa_slrange.lower_bound))) {
669 					error = EPERM;
670 					goto done;
671 				}
672 				tsl = labelalloc(
673 				    &rtsap->rtsa_slrange.lower_bound,
674 				    rtsap->rtsa_doi, KM_NOSLEEP);
675 			}
676 		}
677 		if (rtm->rtm_type == RTM_CHANGE) {
678 			if ((found_addrs & RTA_GATEWAY) &&
679 			    (rtm->rtm_flags & RTF_SETSRC)) {
680 				/*
681 				 * Do not want to change the gateway,
682 				 * but rather the source address.
683 				 */
684 				match_flags |= MATCH_IRE_GW;
685 			}
686 		}
687 
688 		/*
689 		 * If the netmask is all ones (either as supplied or as derived
690 		 * above), then first check for an IRE_LOOPBACK or
691 		 * IRE_LOCAL entry.
692 		 *
693 		 * If we didn't check for or find an IRE_LOOPBACK or IRE_LOCAL
694 		 * entry, then look for any other type of IRE.
695 		 */
696 		switch (af) {
697 		case AF_INET:
698 			if (net_mask == IP_HOST_MASK) {
699 				ire = ire_ftable_lookup_v4(dst_addr, 0, gw_addr,
700 				    IRE_LOCAL | IRE_LOOPBACK, NULL, zoneid,
701 				    tsl, match_flags_local, 0, ipst, NULL);
702 			}
703 			if (ire == NULL) {
704 				ire = ire_lookup_v4(dst_addr, net_mask,
705 				    gw_addr, ill, zoneid, tsl, match_flags,
706 				    ipst, &ifire, &v4setsrc, &gwattr);
707 				IN6_IPADDR_TO_V4MAPPED(v4setsrc, &v6setsrc);
708 			}
709 			break;
710 		case AF_INET6:
711 			if (IN6_ARE_ADDR_EQUAL(&net_mask_v6, &ipv6_all_ones)) {
712 				ire = ire_ftable_lookup_v6(&dst_addr_v6, NULL,
713 				    &gw_addr_v6, IRE_LOCAL | IRE_LOOPBACK, NULL,
714 				    zoneid, tsl, match_flags_local, 0, ipst,
715 				    NULL);
716 			}
717 			if (ire == NULL) {
718 				ire = ire_lookup_v6(&dst_addr_v6,
719 				    &net_mask_v6, &gw_addr_v6, ill, zoneid,
720 				    tsl, match_flags, ipst, &ifire, &v6setsrc,
721 				    &gwattr);
722 			}
723 			break;
724 		}
725 		if (tsl != NULL && tsl != crgetlabel(ioc_cr))
726 			label_rele(tsl);
727 
728 		if (ire == NULL) {
729 			error = ESRCH;
730 			goto done;
731 		}
732 		/*
733 		 * Want to return failure if we get an IRE_NOROUTE from
734 		 * ire_route_recursive
735 		 */
736 		if (ire->ire_type & IRE_NOROUTE) {
737 			ire_refrele(ire);
738 			ire = NULL;
739 			error = ESRCH;
740 			goto done;
741 		}
742 
743 		/* we know the IRE before we come here */
744 		switch (rtm->rtm_type) {
745 		case RTM_GET:
746 			mp1 = rts_rtmget(mp, ire, ifire, &v6setsrc, gwattr, af);
747 			if (mp1 == NULL) {
748 				error = ENOBUFS;
749 				goto done;
750 			}
751 			freemsg(mp);
752 			mp = mp1;
753 			rtm = (rt_msghdr_t *)mp->b_rptr;
754 			break;
755 		case RTM_CHANGE:
756 			/*
757 			 * Do not allow to the multirouting state of a route
758 			 * to be changed. This aims to prevent undesirable
759 			 * stages where both multirt and non-multirt routes
760 			 * for the same destination are declared.
761 			 */
762 			if ((ire->ire_flags & RTF_MULTIRT) !=
763 			    (rtm->rtm_flags & RTF_MULTIRT)) {
764 				error = EINVAL;
765 				goto done;
766 			}
767 			/*
768 			 * Note that we do not need to do
769 			 * ire_flush_cache_*(IRE_FLUSH_ADD) as a change
770 			 * in metrics or gateway will not affect existing
771 			 * routes since it does not create a more specific
772 			 * route.
773 			 */
774 			switch (af) {
775 			case AF_INET:
776 				if ((found_addrs & RTA_GATEWAY) != 0 &&
777 				    (ire->ire_gateway_addr != gw_addr)) {
778 					ire->ire_gateway_addr = gw_addr;
779 				}
780 
781 				if (rtsap != NULL) {
782 					ga.ga_af = AF_INET;
783 					IN6_IPADDR_TO_V4MAPPED(
784 					    ire->ire_gateway_addr, &ga.ga_addr);
785 
786 					gcgrp = gcgrp_lookup(&ga, B_TRUE);
787 					if (gcgrp == NULL) {
788 						error = ENOMEM;
789 						goto done;
790 					}
791 				}
792 
793 				if ((found_addrs & RTA_SRC) != 0 &&
794 				    (rtm->rtm_flags & RTF_SETSRC) != 0 &&
795 				    (ire->ire_setsrc_addr != src_addr)) {
796 					if (src_addr != INADDR_ANY) {
797 						uint_t type;
798 
799 						/*
800 						 * The RTF_SETSRC flag is
801 						 * present, check that the
802 						 * supplied src address is not
803 						 * the loopback address. This
804 						 * would produce martian
805 						 * packets.
806 						 */
807 						if (src_addr ==
808 						    htonl(INADDR_LOOPBACK)) {
809 							error = EINVAL;
810 							goto done;
811 						}
812 						/*
813 						 * Also check that the
814 						 * supplied addr is a valid
815 						 * local address.
816 						 */
817 						type = ip_type_v4(src_addr,
818 						    ipst);
819 						if (!(type &
820 						    (IRE_LOCAL|IRE_LOOPBACK))) {
821 							error = EADDRNOTAVAIL;
822 							goto done;
823 						}
824 						ire->ire_flags |= RTF_SETSRC;
825 						ire->ire_setsrc_addr =
826 						    src_addr;
827 					} else {
828 						ire->ire_flags &= ~RTF_SETSRC;
829 						ire->ire_setsrc_addr =
830 						    INADDR_ANY;
831 					}
832 					/*
833 					 * Let conn_ixa caching know that
834 					 * source address selection changed
835 					 */
836 					ip_update_source_selection(ipst);
837 				}
838 				ire_flush_cache_v4(ire, IRE_FLUSH_GWCHANGE);
839 				break;
840 			case AF_INET6:
841 				mutex_enter(&ire->ire_lock);
842 				if ((found_addrs & RTA_GATEWAY) != 0 &&
843 				    !IN6_ARE_ADDR_EQUAL(
844 				    &ire->ire_gateway_addr_v6, &gw_addr_v6)) {
845 					ire->ire_gateway_addr_v6 = gw_addr_v6;
846 				}
847 				mutex_exit(&ire->ire_lock);
848 
849 				if (rtsap != NULL) {
850 					ga.ga_af = AF_INET6;
851 					mutex_enter(&ire->ire_lock);
852 					ga.ga_addr = ire->ire_gateway_addr_v6;
853 					mutex_exit(&ire->ire_lock);
854 
855 					gcgrp = gcgrp_lookup(&ga, B_TRUE);
856 					if (gcgrp == NULL) {
857 						error = ENOMEM;
858 						goto done;
859 					}
860 				}
861 
862 				if ((found_addrs & RTA_SRC) != 0 &&
863 				    (rtm->rtm_flags & RTF_SETSRC) != 0 &&
864 				    !IN6_ARE_ADDR_EQUAL(
865 				    &ire->ire_setsrc_addr_v6, &src_addr_v6)) {
866 					if (!IN6_IS_ADDR_UNSPECIFIED(
867 					    &src_addr_v6)) {
868 						uint_t type;
869 
870 						/*
871 						 * The RTF_SETSRC flag is
872 						 * present, check that the
873 						 * supplied src address is not
874 						 * the loopback address. This
875 						 * would produce martian
876 						 * packets.
877 						 */
878 						if (IN6_IS_ADDR_LOOPBACK(
879 						    &src_addr_v6)) {
880 							error = EINVAL;
881 							goto done;
882 						}
883 						/*
884 						 * Also check that the
885 						 * supplied addr is a valid
886 						 * local address.
887 						 */
888 						type = ip_type_v6(&src_addr_v6,
889 						    ipst);
890 						if (!(type &
891 						    (IRE_LOCAL|IRE_LOOPBACK))) {
892 							error = EADDRNOTAVAIL;
893 							goto done;
894 						}
895 						mutex_enter(&ire->ire_lock);
896 						ire->ire_flags |= RTF_SETSRC;
897 						ire->ire_setsrc_addr_v6 =
898 						    src_addr_v6;
899 						mutex_exit(&ire->ire_lock);
900 					} else {
901 						mutex_enter(&ire->ire_lock);
902 						ire->ire_flags &= ~RTF_SETSRC;
903 						ire->ire_setsrc_addr_v6 =
904 						    ipv6_all_zeros;
905 						mutex_exit(&ire->ire_lock);
906 					}
907 					/*
908 					 * Let conn_ixa caching know that
909 					 * source address selection changed
910 					 */
911 					ip_update_source_selection(ipst);
912 				}
913 				ire_flush_cache_v6(ire, IRE_FLUSH_GWCHANGE);
914 				break;
915 			}
916 
917 			if (rtsap != NULL) {
918 				ASSERT(gcgrp != NULL);
919 
920 				/*
921 				 * Create and add the security attribute to
922 				 * prefix IRE; it will add a reference to the
923 				 * group upon allocating a new entry.  If it
924 				 * finds an already-existing entry for the
925 				 * security attribute, it simply returns it
926 				 * and no new group reference is made.
927 				 */
928 				gc = gc_create(rtsap, gcgrp, &gcgrp_xtraref);
929 				if (gc == NULL ||
930 				    (error = tsol_ire_init_gwattr(ire,
931 				    ire->ire_ipversion, gc)) != 0) {
932 					if (gc != NULL) {
933 						GC_REFRELE(gc);
934 					} else {
935 						/* gc_create failed */
936 						error = ENOMEM;
937 					}
938 					goto done;
939 				}
940 			}
941 			rts_setmetrics(ire, rtm->rtm_inits, &rtm->rtm_rmx);
942 			break;
943 		}
944 		break;
945 	default:
946 		error = EOPNOTSUPP;
947 		break;
948 	}
949 done:
950 	if (ire != NULL)
951 		ire_refrele(ire);
952 	if (ifire != NULL)
953 		ire_refrele(ifire);
954 	if (ill != NULL)
955 		ill_refrele(ill);
956 
957 	if (gcgrp_xtraref)
958 		GCGRP_REFRELE(gcgrp);
959 
960 	if (rtm != NULL) {
961 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
962 		if (error != 0) {
963 			rtm->rtm_errno = error;
964 			/* Send error ACK */
965 			ip1dbg(("ip_rts_request: error %d\n", error));
966 		} else {
967 			rtm->rtm_flags |= RTF_DONE;
968 			/* OK ACK already set up by caller except this */
969 			ip2dbg(("ip_rts_request: OK ACK\n"));
970 		}
971 		rts_queue_input(mp, connp, af, RTSQ_ALL, ipst);
972 	}
973 	return (error);
974 }
975 
976 /*
977  * Helper function that can do recursive lookups including when
978  * MATCH_IRE_GW and/or MATCH_IRE_MASK is set.
979  */
980 static ire_t *
981 ire_lookup_v4(ipaddr_t dst_addr, ipaddr_t net_mask, ipaddr_t gw_addr,
982     const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
983     int match_flags, ip_stack_t *ipst, ire_t **pifire, ipaddr_t *v4setsrcp,
984     tsol_ire_gw_secattr_t **gwattrp)
985 {
986 	ire_t		*ire;
987 	ire_t		*ifire = NULL;
988 	uint_t		ire_type;
989 
990 	*pifire = NULL;
991 	*v4setsrcp = INADDR_ANY;
992 	*gwattrp = NULL;
993 
994 	/* Skip IRE_IF_CLONE */
995 	match_flags |= MATCH_IRE_TYPE;
996 	ire_type = (IRE_ONLINK|IRE_OFFLINK) & ~IRE_IF_CLONE;
997 
998 	/*
999 	 * ire_route_recursive can't match gateway or mask thus if they are
1000 	 * set we have to do two steps of lookups
1001 	 */
1002 	if (match_flags & (MATCH_IRE_GW|MATCH_IRE_MASK)) {
1003 		ire = ire_ftable_lookup_v4(dst_addr, net_mask, gw_addr,
1004 		    ire_type, ill, zoneid, tsl, match_flags, 0, ipst, NULL);
1005 
1006 		if (ire == NULL ||(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)))
1007 			return (ire);
1008 
1009 		if (ire->ire_type & IRE_ONLINK)
1010 			return (ire);
1011 
1012 		if (ire->ire_flags & RTF_SETSRC) {
1013 			ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
1014 			*v4setsrcp = ire->ire_setsrc_addr;
1015 			v4setsrcp = NULL;
1016 		}
1017 
1018 		/* The first ire_gw_secattr is passed back */
1019 		if (ire->ire_gw_secattr != NULL) {
1020 			*gwattrp = ire->ire_gw_secattr;
1021 			gwattrp = NULL;
1022 		}
1023 
1024 		/* Look for an interface ire recursively based on the gateway */
1025 		dst_addr = ire->ire_gateway_addr;
1026 		match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_MASK);
1027 		/*
1028 		 * Don't allow anything unusual past the first iteration.
1029 		 * After the first lookup, we should no longer look for
1030 		 * (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST) or RTF_INDIRECT
1031 		 * routes.
1032 		 *
1033 		 * In addition, after we have found a direct IRE_OFFLINK,
1034 		 * we should only look for interface or clone routes.
1035 		 */
1036 		match_flags |= MATCH_IRE_DIRECT; /* no more RTF_INDIRECTs */
1037 
1038 		if ((ire->ire_type & IRE_OFFLINK) &&
1039 		    !(ire->ire_flags & RTF_INDIRECT)) {
1040 			ire_type = IRE_IF_ALL;
1041 		} else {
1042 			/*
1043 			 * no more local, loopback, broadcast routes
1044 			 */
1045 			if (!(match_flags & MATCH_IRE_TYPE))
1046 				ire_type = (IRE_OFFLINK|IRE_ONLINK);
1047 			ire_type &= ~(IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST);
1048 		}
1049 		match_flags |= MATCH_IRE_TYPE;
1050 
1051 		ifire = ire_route_recursive_v4(dst_addr, ire_type, ill, zoneid,
1052 		    tsl, match_flags, IRR_INCOMPLETE, 0, ipst, v4setsrcp,
1053 		    gwattrp, NULL);
1054 	} else {
1055 		ire = ire_route_recursive_v4(dst_addr, ire_type, ill, zoneid,
1056 		    tsl, match_flags, IRR_INCOMPLETE, 0, ipst, v4setsrcp,
1057 		    gwattrp, NULL);
1058 	}
1059 	*pifire = ifire;
1060 	return (ire);
1061 }
1062 
1063 static ire_t *
1064 ire_lookup_v6(const in6_addr_t *dst_addr_v6,
1065     const in6_addr_t *net_mask_v6, const in6_addr_t *gw_addr_v6,
1066     const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, int match_flags,
1067     ip_stack_t *ipst, ire_t **pifire,
1068     in6_addr_t *v6setsrcp, tsol_ire_gw_secattr_t **gwattrp)
1069 {
1070 	ire_t		*ire;
1071 	ire_t		*ifire = NULL;
1072 	uint_t		ire_type;
1073 
1074 	*pifire = NULL;
1075 	*v6setsrcp = ipv6_all_zeros;
1076 	*gwattrp = NULL;
1077 
1078 	/* Skip IRE_IF_CLONE */
1079 	match_flags |= MATCH_IRE_TYPE;
1080 	ire_type = (IRE_ONLINK|IRE_OFFLINK) & ~IRE_IF_CLONE;
1081 
1082 	/*
1083 	 * ire_route_recursive can't match gateway or mask thus if they are
1084 	 * set we have to do two steps of lookups
1085 	 */
1086 	if (match_flags & (MATCH_IRE_GW|MATCH_IRE_MASK)) {
1087 		in6_addr_t dst;
1088 
1089 		ire = ire_ftable_lookup_v6(dst_addr_v6, net_mask_v6,
1090 		    gw_addr_v6, ire_type, ill, zoneid, tsl, match_flags, 0,
1091 		    ipst, NULL);
1092 
1093 		if (ire == NULL ||(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)))
1094 			return (ire);
1095 
1096 		if (ire->ire_type & IRE_ONLINK)
1097 			return (ire);
1098 
1099 		if (ire->ire_flags & RTF_SETSRC) {
1100 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(
1101 			    &ire->ire_setsrc_addr_v6));
1102 			*v6setsrcp = ire->ire_setsrc_addr_v6;
1103 			v6setsrcp = NULL;
1104 		}
1105 
1106 		/* The first ire_gw_secattr is passed back */
1107 		if (ire->ire_gw_secattr != NULL) {
1108 			*gwattrp = ire->ire_gw_secattr;
1109 			gwattrp = NULL;
1110 		}
1111 
1112 		mutex_enter(&ire->ire_lock);
1113 		dst = ire->ire_gateway_addr_v6;
1114 		mutex_exit(&ire->ire_lock);
1115 		match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_MASK);
1116 		/*
1117 		 * Don't allow anything unusual past the first iteration.
1118 		 * After the first lookup, we should no longer look for
1119 		 * (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST) or RTF_INDIRECT
1120 		 * routes.
1121 		 *
1122 		 * In addition, after we have found a direct IRE_OFFLINK,
1123 		 * we should only look for interface or clone routes.
1124 		 */
1125 		match_flags |= MATCH_IRE_DIRECT; /* no more RTF_INDIRECTs */
1126 
1127 		if ((ire->ire_type & IRE_OFFLINK) &&
1128 		    !(ire->ire_flags & RTF_INDIRECT)) {
1129 			ire_type = IRE_IF_ALL;
1130 		} else {
1131 			/*
1132 			 * no more local, loopback routes
1133 			 */
1134 			if (!(match_flags & MATCH_IRE_TYPE))
1135 				ire_type = (IRE_OFFLINK|IRE_ONLINK);
1136 			ire_type &= ~(IRE_LOCAL|IRE_LOOPBACK);
1137 		}
1138 		match_flags |= MATCH_IRE_TYPE;
1139 
1140 		ifire = ire_route_recursive_v6(&dst, ire_type, ill, zoneid, tsl,
1141 		    match_flags, IRR_INCOMPLETE, 0, ipst, v6setsrcp, gwattrp,
1142 		    NULL);
1143 	} else {
1144 		ire = ire_route_recursive_v6(dst_addr_v6, ire_type, ill, zoneid,
1145 		    tsl, match_flags, IRR_INCOMPLETE, 0, ipst, v6setsrcp,
1146 		    gwattrp, NULL);
1147 	}
1148 	*pifire = ifire;
1149 	return (ire);
1150 }
1151 
1152 
1153 /*
1154  * Handle IP_IOC_RTS_REQUEST ioctls
1155  */
1156 int
1157 ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
1158 {
1159 	conn_t	*connp = Q_TO_CONN(q);
1160 	IOCP	iocp = (IOCP)mp->b_rptr;
1161 	mblk_t	*mp1, *ioc_mp = mp;
1162 	int	error = 0;
1163 	ip_stack_t	*ipst;
1164 
1165 	ipst = connp->conn_netstack->netstack_ip;
1166 
1167 	ASSERT(mp->b_cont != NULL);
1168 	/* ioc_mp holds mp */
1169 	mp = mp->b_cont;
1170 
1171 	/*
1172 	 * The Routing Socket data starts on
1173 	 * next block. If there is no next block
1174 	 * this is an indication from routing module
1175 	 * that it is a routing socket stream queue.
1176 	 * We need to support that for compatibility with SDP since
1177 	 * it has a contract private interface to use IP_IOC_RTS_REQUEST.
1178 	 * Note: SDP no longer uses IP_IOC_RTS_REQUEST - we can remove this.
1179 	 */
1180 	if (mp->b_cont == NULL) {
1181 		/*
1182 		 * This is a message from SDP
1183 		 * indicating that this is a Routing Socket
1184 		 * Stream. Insert this conn_t in routing
1185 		 * socket client list.
1186 		 */
1187 		connp->conn_useloopback = 1;
1188 		ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp);
1189 		goto done;
1190 	}
1191 	mp1 = dupmsg(mp->b_cont);
1192 	if (mp1 == NULL) {
1193 		error  = ENOBUFS;
1194 		goto done;
1195 	}
1196 	mp = mp1;
1197 
1198 	error = ip_rts_request_common(mp, connp, ioc_cr);
1199 done:
1200 	iocp->ioc_error = error;
1201 	ioc_mp->b_datap->db_type = M_IOCACK;
1202 	if (iocp->ioc_error != 0)
1203 		iocp->ioc_count = 0;
1204 	/* Note that we pass a NULL ira to rts_input */
1205 	(connp->conn_recv)(connp, ioc_mp, NULL, NULL);
1206 
1207 	/* conn was refheld in ip_wput_ioctl. */
1208 	CONN_DEC_IOCTLREF(connp);
1209 	CONN_OPER_PENDING_DONE(connp);
1210 
1211 	return (error);
1212 }
1213 
1214 /*
1215  * Build a reply to the RTM_GET request contained in the given message block
1216  * using the retrieved IRE of the destination address, the parent IRE (if it
1217  * exists) and the address family.
1218  *
1219  * Returns a pointer to a message block containing the reply if successful,
1220  * otherwise NULL is returned.
1221  */
1222 static mblk_t *
1223 rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *ifire, const in6_addr_t *setsrc,
1224     tsol_ire_gw_secattr_t *attrp, sa_family_t af)
1225 {
1226 	rt_msghdr_t	*rtm;
1227 	rt_msghdr_t	*new_rtm;
1228 	mblk_t		*new_mp;
1229 	int		rtm_addrs;
1230 	int		rtm_flags;
1231 	tsol_gc_t	*gc = NULL;
1232 	tsol_gcgrp_t	*gcgrp = NULL;
1233 	ill_t		*ill;
1234 	ipif_t		*ipif = NULL;
1235 	ipaddr_t	brdaddr;	/* IFF_POINTOPOINT destination */
1236 	ipaddr_t	ifaddr;
1237 	in6_addr_t	brdaddr6;	/* IFF_POINTOPOINT destination */
1238 	in6_addr_t	ifaddr6;
1239 	ipaddr_t	v4setsrc;
1240 
1241 	rtm = (rt_msghdr_t *)mp->b_rptr;
1242 
1243 	/*
1244 	 * Find the ill used to send packets. This will be NULL in case
1245 	 * of a reject or blackhole.
1246 	 */
1247 	if (ifire != NULL)
1248 		ill = ire_nexthop_ill(ifire);
1249 	else
1250 		ill = ire_nexthop_ill(ire);
1251 
1252 	if (attrp != NULL) {
1253 		mutex_enter(&attrp->igsa_lock);
1254 		if ((gc = attrp->igsa_gc) != NULL) {
1255 			gcgrp = gc->gc_grp;
1256 			ASSERT(gcgrp != NULL);
1257 			rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
1258 		}
1259 		mutex_exit(&attrp->igsa_lock);
1260 	}
1261 
1262 	/*
1263 	 * Always return RTA_DST, RTA_GATEWAY and RTA_NETMASK.
1264 	 *
1265 	 * The 4.4BSD-Lite2 code (net/rtsock.c) returns both
1266 	 * RTA_IFP and RTA_IFA if either is defined, and also
1267 	 * returns RTA_BRD if the appropriate interface is
1268 	 * point-to-point.
1269 	 */
1270 	rtm_addrs = (RTA_DST | RTA_GATEWAY | RTA_NETMASK);
1271 	if ((rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) && ill != NULL) {
1272 		rtm_addrs |= (RTA_IFP | RTA_IFA);
1273 		/*
1274 		 * We associate an IRE with an ILL, hence we don't exactly
1275 		 * know what might make sense for RTA_IFA and RTA_BRD. We
1276 		 * pick the first ipif on the ill.
1277 		 */
1278 		ipif = ipif_get_next_ipif(NULL, ill);
1279 		if (ipif != NULL) {
1280 			if (ipif->ipif_isv6)
1281 				ifaddr6 = ipif->ipif_v6lcl_addr;
1282 			else
1283 				ifaddr = ipif->ipif_lcl_addr;
1284 			if (ipif->ipif_flags & IPIF_POINTOPOINT) {
1285 				rtm_addrs |= RTA_BRD;
1286 				if (ipif->ipif_isv6)
1287 					brdaddr6 = ipif->ipif_v6pp_dst_addr;
1288 				else
1289 					brdaddr = ipif->ipif_pp_dst_addr;
1290 			}
1291 			ipif_refrele(ipif);
1292 		}
1293 	}
1294 
1295 	new_mp = rts_alloc_msg(RTM_GET, rtm_addrs, af, gc != NULL ? 1 : 0);
1296 	if (new_mp == NULL) {
1297 		if (gcgrp != NULL)
1298 			rw_exit(&gcgrp->gcgrp_rwlock);
1299 		if (ill != NULL)
1300 			ill_refrele(ill);
1301 		return (NULL);
1302 	}
1303 
1304 	/*
1305 	 * We set the destination address, gateway address,
1306 	 * netmask and flags in the RTM_GET response depending
1307 	 * on whether we found a parent IRE or not.
1308 	 * In particular, if we did find a parent IRE during the
1309 	 * recursive search, use that IRE's gateway address.
1310 	 * Otherwise, we use the IRE's source address for the
1311 	 * gateway address.
1312 	 */
1313 	ASSERT(af == AF_INET || af == AF_INET6);
1314 	switch (af) {
1315 	case AF_INET:
1316 		IN6_V4MAPPED_TO_IPADDR(setsrc, v4setsrc);
1317 		if (v4setsrc != INADDR_ANY)
1318 			rtm_addrs |= RTA_SRC;
1319 
1320 		rtm_flags = ire->ire_flags;
1321 		rts_fill_msg(RTM_GET, rtm_addrs, ire->ire_addr,
1322 		    ire->ire_mask, ire->ire_gateway_addr, v4setsrc,
1323 		    brdaddr, 0, ifaddr, ill, new_mp, gc);
1324 		break;
1325 	case AF_INET6:
1326 		if (!IN6_IS_ADDR_UNSPECIFIED(setsrc))
1327 			rtm_addrs |= RTA_SRC;
1328 
1329 		rtm_flags = ire->ire_flags;
1330 		rts_fill_msg_v6(RTM_GET, rtm_addrs, &ire->ire_addr_v6,
1331 		    &ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
1332 		    setsrc, &brdaddr6, &ipv6_all_zeros,
1333 		    &ifaddr6, ill, new_mp, gc);
1334 		break;
1335 	}
1336 
1337 	if (gcgrp != NULL)
1338 		rw_exit(&gcgrp->gcgrp_rwlock);
1339 
1340 	new_rtm = (rt_msghdr_t *)new_mp->b_rptr;
1341 
1342 	/*
1343 	 * The rtm_msglen, rtm_version and rtm_type fields in
1344 	 * RTM_GET response are filled in by rts_fill_msg.
1345 	 *
1346 	 * rtm_addrs and rtm_flags are filled in based on what
1347 	 * was requested and the state of the IREs looked up
1348 	 * above.
1349 	 *
1350 	 * rtm_inits and rtm_rmx are filled in with metrics
1351 	 * based on whether a parent IRE was found or not.
1352 	 *
1353 	 * TODO: rtm_index and rtm_use should probably be
1354 	 * filled in with something resonable here and not just
1355 	 * copied from the request.
1356 	 */
1357 	new_rtm->rtm_index = rtm->rtm_index;
1358 	new_rtm->rtm_pid = rtm->rtm_pid;
1359 	new_rtm->rtm_seq = rtm->rtm_seq;
1360 	new_rtm->rtm_use = rtm->rtm_use;
1361 	new_rtm->rtm_addrs = rtm_addrs;
1362 	new_rtm->rtm_flags = rtm_flags;
1363 	new_rtm->rtm_inits = rts_getmetrics(ire, ill, &new_rtm->rtm_rmx);
1364 	if (ill != NULL)
1365 		ill_refrele(ill);
1366 	return (new_mp);
1367 }
1368 
1369 /*
1370  * Fill the given if_data_t with interface statistics.
1371  */
1372 static void
1373 rts_getifdata(if_data_t *if_data, const ipif_t *ipif)
1374 {
1375 	if_data->ifi_type = ipif->ipif_ill->ill_type;
1376 						/* ethernet, tokenring, etc */
1377 	if_data->ifi_addrlen = 0;		/* media address length */
1378 	if_data->ifi_hdrlen = 0;		/* media header length */
1379 	if_data->ifi_mtu = ipif->ipif_ill->ill_mtu;	/* mtu */
1380 						/* metric (external only) */
1381 	if_data->ifi_metric = ipif->ipif_ill->ill_metric;
1382 	if_data->ifi_baudrate = 0;		/* linespeed */
1383 
1384 	if_data->ifi_ipackets = 0;		/* packets received on if */
1385 	if_data->ifi_ierrors = 0;		/* input errors on interface */
1386 	if_data->ifi_opackets = 0;		/* packets sent on interface */
1387 	if_data->ifi_oerrors = 0;		/* output errors on if */
1388 	if_data->ifi_collisions = 0;		/* collisions on csma if */
1389 	if_data->ifi_ibytes = 0;		/* total number received */
1390 	if_data->ifi_obytes = 0;		/* total number sent */
1391 	if_data->ifi_imcasts = 0;		/* multicast packets received */
1392 	if_data->ifi_omcasts = 0;		/* multicast packets sent */
1393 	if_data->ifi_iqdrops = 0;		/* dropped on input */
1394 	if_data->ifi_noproto = 0;		/* destined for unsupported */
1395 						/* protocol. */
1396 }
1397 
1398 /*
1399  * Set the metrics on a forwarding table route.
1400  */
1401 static void
1402 rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics)
1403 {
1404 	clock_t		rtt;
1405 	clock_t		rtt_sd;
1406 	ill_t		*ill;
1407 	ifrt_t		*ifrt;
1408 	mblk_t		*mp;
1409 	in6_addr_t	gw_addr_v6;
1410 
1411 	/* Need to add back some metrics to the IRE? */
1412 	/*
1413 	 * Bypass obtaining the lock and searching ill_saved_ire_mp in the
1414 	 * common case of no metrics.
1415 	 */
1416 	if (which == 0)
1417 		return;
1418 	ire->ire_metrics.iulp_set = B_TRUE;
1419 
1420 	/*
1421 	 * iulp_rtt and iulp_rtt_sd are in milliseconds, but 4.4BSD-Lite2's
1422 	 * <net/route.h> says: rmx_rtt and rmx_rttvar are stored as
1423 	 * microseconds.
1424 	 */
1425 	if (which & RTV_RTT)
1426 		rtt = metrics->rmx_rtt / 1000;
1427 	if (which & RTV_RTTVAR)
1428 		rtt_sd = metrics->rmx_rttvar / 1000;
1429 
1430 	/*
1431 	 * Update the metrics in the IRE itself.
1432 	 */
1433 	mutex_enter(&ire->ire_lock);
1434 	if (which & RTV_MTU)
1435 		ire->ire_metrics.iulp_mtu = metrics->rmx_mtu;
1436 	if (which & RTV_RTT)
1437 		ire->ire_metrics.iulp_rtt = rtt;
1438 	if (which & RTV_SSTHRESH)
1439 		ire->ire_metrics.iulp_ssthresh = metrics->rmx_ssthresh;
1440 	if (which & RTV_RTTVAR)
1441 		ire->ire_metrics.iulp_rtt_sd = rtt_sd;
1442 	if (which & RTV_SPIPE)
1443 		ire->ire_metrics.iulp_spipe = metrics->rmx_sendpipe;
1444 	if (which & RTV_RPIPE)
1445 		ire->ire_metrics.iulp_rpipe = metrics->rmx_recvpipe;
1446 	mutex_exit(&ire->ire_lock);
1447 
1448 	/*
1449 	 * Search through the ifrt_t chain hanging off the ILL in order to
1450 	 * reflect the metric change there.
1451 	 */
1452 	ill = ire->ire_ill;
1453 	if (ill == NULL)
1454 		return;
1455 	ASSERT((ill->ill_isv6 && ire->ire_ipversion == IPV6_VERSION) ||
1456 	    ((!ill->ill_isv6 && ire->ire_ipversion == IPV4_VERSION)));
1457 	if (ill->ill_isv6) {
1458 		mutex_enter(&ire->ire_lock);
1459 		gw_addr_v6 = ire->ire_gateway_addr_v6;
1460 		mutex_exit(&ire->ire_lock);
1461 	}
1462 	mutex_enter(&ill->ill_saved_ire_lock);
1463 	for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
1464 		/*
1465 		 * On a given ill, the tuple of address, gateway, mask,
1466 		 * ire_type and zoneid unique for each saved IRE.
1467 		 */
1468 		ifrt = (ifrt_t *)mp->b_rptr;
1469 		if (ill->ill_isv6) {
1470 			if (!IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr,
1471 			    &ire->ire_addr_v6) ||
1472 			    !IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr,
1473 			    &gw_addr_v6) ||
1474 			    !IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask,
1475 			    &ire->ire_mask_v6))
1476 				continue;
1477 		} else {
1478 			if (ifrt->ifrt_addr != ire->ire_addr ||
1479 			    ifrt->ifrt_gateway_addr != ire->ire_gateway_addr ||
1480 			    ifrt->ifrt_mask != ire->ire_mask)
1481 				continue;
1482 		}
1483 		if (ifrt->ifrt_zoneid != ire->ire_zoneid ||
1484 		    ifrt->ifrt_type != ire->ire_type)
1485 			continue;
1486 
1487 		if (which & RTV_MTU)
1488 			ifrt->ifrt_metrics.iulp_mtu = metrics->rmx_mtu;
1489 		if (which & RTV_RTT)
1490 			ifrt->ifrt_metrics.iulp_rtt = rtt;
1491 		if (which & RTV_SSTHRESH) {
1492 			ifrt->ifrt_metrics.iulp_ssthresh =
1493 			    metrics->rmx_ssthresh;
1494 		}
1495 		if (which & RTV_RTTVAR)
1496 			ifrt->ifrt_metrics.iulp_rtt_sd = metrics->rmx_rttvar;
1497 		if (which & RTV_SPIPE)
1498 			ifrt->ifrt_metrics.iulp_spipe = metrics->rmx_sendpipe;
1499 		if (which & RTV_RPIPE)
1500 			ifrt->ifrt_metrics.iulp_rpipe = metrics->rmx_recvpipe;
1501 		break;
1502 	}
1503 	mutex_exit(&ill->ill_saved_ire_lock);
1504 
1505 	/*
1506 	 * Update any IRE_IF_CLONE hanging created from this IRE_IF so they
1507 	 * get any new iulp_mtu.
1508 	 * We do that by deleting them; ire_create_if_clone will pick
1509 	 * up the new metrics.
1510 	 */
1511 	if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0)
1512 		ire_dep_delete_if_clone(ire);
1513 }
1514 
1515 /*
1516  * Get the metrics from a forwarding table route.
1517  */
1518 static int
1519 rts_getmetrics(ire_t *ire, ill_t *ill, rt_metrics_t *metrics)
1520 {
1521 	int	metrics_set = 0;
1522 
1523 	bzero(metrics, sizeof (rt_metrics_t));
1524 
1525 	/*
1526 	 * iulp_rtt and iulp_rtt_sd are in milliseconds, but 4.4BSD-Lite2's
1527 	 * <net/route.h> says: rmx_rtt and rmx_rttvar are stored as
1528 	 * microseconds.
1529 	 */
1530 	metrics->rmx_rtt = ire->ire_metrics.iulp_rtt * 1000;
1531 	metrics_set |= RTV_RTT;
1532 	if (ire->ire_metrics.iulp_mtu != 0) {
1533 		metrics->rmx_mtu = ire->ire_metrics.iulp_mtu;
1534 		metrics_set |= RTV_MTU;
1535 	} else if (ill != NULL) {
1536 		metrics->rmx_mtu = ill->ill_mtu;
1537 		metrics_set |= RTV_MTU;
1538 	}
1539 	metrics->rmx_ssthresh = ire->ire_metrics.iulp_ssthresh;
1540 	metrics_set |= RTV_SSTHRESH;
1541 	metrics->rmx_rttvar = ire->ire_metrics.iulp_rtt_sd * 1000;
1542 	metrics_set |= RTV_RTTVAR;
1543 	metrics->rmx_sendpipe = ire->ire_metrics.iulp_spipe;
1544 	metrics_set |= RTV_SPIPE;
1545 	metrics->rmx_recvpipe = ire->ire_metrics.iulp_rpipe;
1546 	metrics_set |= RTV_RPIPE;
1547 	return (metrics_set);
1548 }
1549 
1550 /*
1551  * Given two sets of metrics (src and dst), use the dst values if they are
1552  * set. If a dst value is not set but the src value is set, then we use
1553  * the src value.
1554  * dst is updated with the new values.
1555  * This is used to merge information from a dce_t and ire_metrics, where the
1556  * dce values takes precedence.
1557  */
1558 void
1559 rts_merge_metrics(iulp_t *dst, const iulp_t *src)
1560 {
1561 	if (!src->iulp_set)
1562 		return;
1563 
1564 	if (dst->iulp_ssthresh == 0)
1565 		dst->iulp_ssthresh = src->iulp_ssthresh;
1566 	if (dst->iulp_rtt == 0)
1567 		dst->iulp_rtt = src->iulp_rtt;
1568 	if (dst->iulp_rtt_sd == 0)
1569 		dst->iulp_rtt_sd = src->iulp_rtt_sd;
1570 	if (dst->iulp_spipe == 0)
1571 		dst->iulp_spipe = src->iulp_spipe;
1572 	if (dst->iulp_rpipe == 0)
1573 		dst->iulp_rpipe = src->iulp_rpipe;
1574 	if (dst->iulp_rtomax == 0)
1575 		dst->iulp_rtomax = src->iulp_rtomax;
1576 	if (dst->iulp_sack == 0)
1577 		dst->iulp_sack = src->iulp_sack;
1578 	if (dst->iulp_tstamp_ok == 0)
1579 		dst->iulp_tstamp_ok = src->iulp_tstamp_ok;
1580 	if (dst->iulp_wscale_ok == 0)
1581 		dst->iulp_wscale_ok = src->iulp_wscale_ok;
1582 	if (dst->iulp_ecn_ok == 0)
1583 		dst->iulp_ecn_ok = src->iulp_ecn_ok;
1584 	if (dst->iulp_pmtud_ok == 0)
1585 		dst->iulp_pmtud_ok = src->iulp_pmtud_ok;
1586 	if (dst->iulp_mtu == 0)
1587 		dst->iulp_mtu = src->iulp_mtu;
1588 }
1589 
1590 
1591 /*
1592  * Takes a pointer to a routing message and extracts necessary info by looking
1593  * at the rtm->rtm_addrs bits and store the requested sockaddrs in the pointers
1594  * passed (all of which must be valid).
1595  *
1596  * The bitmask of sockaddrs actually found in the message is returned, or zero
1597  * is returned in the case of an error.
1598  */
1599 static int
1600 rts_getaddrs(rt_msghdr_t *rtm, in6_addr_t *dst_addrp, in6_addr_t *gw_addrp,
1601     in6_addr_t *net_maskp, in6_addr_t *authorp, in6_addr_t *if_addrp,
1602     in6_addr_t *in_src_addrp, ushort_t *indexp, sa_family_t *afp,
1603     tsol_rtsecattr_t *rtsecattr, int *error)
1604 {
1605 	struct sockaddr *sa;
1606 	int	i;
1607 	int	addr_bits;
1608 	int	length;
1609 	int	found_addrs = 0;
1610 	caddr_t	cp;
1611 	size_t	size;
1612 	struct sockaddr_dl *sdl;
1613 
1614 	*dst_addrp = ipv6_all_zeros;
1615 	*gw_addrp = ipv6_all_zeros;
1616 	*net_maskp = ipv6_all_zeros;
1617 	*authorp = ipv6_all_zeros;
1618 	*if_addrp = ipv6_all_zeros;
1619 	*in_src_addrp = ipv6_all_zeros;
1620 	*indexp = 0;
1621 	*afp = AF_UNSPEC;
1622 	rtsecattr->rtsa_cnt = 0;
1623 	*error = 0;
1624 
1625 	/*
1626 	 * At present we handle only RTA_DST, RTA_GATEWAY, RTA_NETMASK, RTA_IFP,
1627 	 * RTA_IFA and RTA_AUTHOR.  The rest will be added as we need them.
1628 	 */
1629 	cp = (caddr_t)&rtm[1];
1630 	length = rtm->rtm_msglen;
1631 	for (i = 0; (i < RTA_NUMBITS) && ((cp - (caddr_t)rtm) < length); i++) {
1632 		/*
1633 		 * The address family we are working with starts out as
1634 		 * AF_UNSPEC, but is set to the one specified with the
1635 		 * destination address.
1636 		 *
1637 		 * If the "working" address family that has been set to
1638 		 * something other than AF_UNSPEC, then the address family of
1639 		 * subsequent sockaddrs must either be AF_UNSPEC (for
1640 		 * compatibility with older programs) or must be the same as our
1641 		 * "working" one.
1642 		 *
1643 		 * This code assumes that RTA_DST (1) comes first in the loop.
1644 		 */
1645 		sa = (struct sockaddr *)cp;
1646 		addr_bits = (rtm->rtm_addrs & (1 << i));
1647 		if (addr_bits == 0)
1648 			continue;
1649 		switch (addr_bits) {
1650 		case RTA_DST:
1651 			size = rts_copyfromsockaddr(sa, dst_addrp);
1652 			*afp = sa->sa_family;
1653 			break;
1654 		case RTA_GATEWAY:
1655 			if (sa->sa_family != *afp && sa->sa_family != AF_UNSPEC)
1656 				return (0);
1657 			size = rts_copyfromsockaddr(sa, gw_addrp);
1658 			break;
1659 		case RTA_NETMASK:
1660 			if (sa->sa_family != *afp && sa->sa_family != AF_UNSPEC)
1661 				return (0);
1662 			size = rts_copyfromsockaddr(sa, net_maskp);
1663 			break;
1664 		case RTA_IFP:
1665 			if (sa->sa_family != AF_LINK &&
1666 			    sa->sa_family != AF_UNSPEC)
1667 				return (0);
1668 			sdl = (struct sockaddr_dl *)cp;
1669 			*indexp = sdl->sdl_index;
1670 			size = sizeof (struct sockaddr_dl);
1671 			break;
1672 		case RTA_SRC:
1673 			/* Source address of the incoming packet */
1674 			size = rts_copyfromsockaddr(sa, in_src_addrp);
1675 			*afp = sa->sa_family;
1676 			break;
1677 		case RTA_IFA:
1678 			if (sa->sa_family != *afp && sa->sa_family != AF_UNSPEC)
1679 				return (0);
1680 			size = rts_copyfromsockaddr(sa, if_addrp);
1681 			break;
1682 		case RTA_AUTHOR:
1683 			if (sa->sa_family != *afp && sa->sa_family != AF_UNSPEC)
1684 				return (0);
1685 			size = rts_copyfromsockaddr(sa, authorp);
1686 			break;
1687 		default:
1688 			return (0);
1689 		}
1690 		if (size == 0)
1691 			return (0);
1692 		cp += size;
1693 		found_addrs |= addr_bits;
1694 	}
1695 
1696 	/*
1697 	 * Parse the routing message and look for any security-
1698 	 * related attributes for the route.  For each valid
1699 	 * attribute, allocate/obtain the corresponding kernel
1700 	 * route security attributes.
1701 	 */
1702 	if (((cp - (caddr_t)rtm) < length) && is_system_labeled()) {
1703 		*error = tsol_rtsa_init(rtm, rtsecattr, cp);
1704 		ASSERT(rtsecattr->rtsa_cnt <= TSOL_RTSA_REQUEST_MAX);
1705 	}
1706 
1707 	return (found_addrs);
1708 }
1709 
1710 /*
1711  * Fills the message with the given info.
1712  */
1713 static void
1714 rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask,
1715     ipaddr_t gateway, ipaddr_t src_addr, ipaddr_t brd_addr, ipaddr_t author,
1716     ipaddr_t ifaddr, const ill_t *ill, mblk_t *mp,
1717     const tsol_gc_t *gc)
1718 {
1719 	rt_msghdr_t	*rtm;
1720 	sin_t		*sin;
1721 	size_t		data_size, header_size;
1722 	uchar_t		*cp;
1723 	int		i;
1724 
1725 	ASSERT(mp != NULL);
1726 	/*
1727 	 * First find the type of the message
1728 	 * and its length.
1729 	 */
1730 	header_size = rts_header_msg_size(type);
1731 	/*
1732 	 * Now find the size of the data
1733 	 * that follows the message header.
1734 	 */
1735 	data_size = rts_data_msg_size(rtm_addrs, AF_INET, gc != NULL ? 1 : 0);
1736 
1737 	rtm = (rt_msghdr_t *)mp->b_rptr;
1738 	mp->b_wptr = &mp->b_rptr[header_size];
1739 	cp = mp->b_wptr;
1740 	bzero(cp, data_size);
1741 	for (i = 0; i < RTA_NUMBITS; i++) {
1742 		sin = (sin_t *)cp;
1743 		switch (rtm_addrs & (1 << i)) {
1744 		case RTA_DST:
1745 			sin->sin_addr.s_addr = dst;
1746 			sin->sin_family = AF_INET;
1747 			cp += sizeof (sin_t);
1748 			break;
1749 		case RTA_GATEWAY:
1750 			sin->sin_addr.s_addr = gateway;
1751 			sin->sin_family = AF_INET;
1752 			cp += sizeof (sin_t);
1753 			break;
1754 		case RTA_NETMASK:
1755 			sin->sin_addr.s_addr = mask;
1756 			sin->sin_family = AF_INET;
1757 			cp += sizeof (sin_t);
1758 			break;
1759 		case RTA_IFP:
1760 			cp += ill_dls_info((struct sockaddr_dl *)cp, ill);
1761 			break;
1762 		case RTA_IFA:
1763 			sin->sin_addr.s_addr = ifaddr;
1764 			sin->sin_family = AF_INET;
1765 			cp += sizeof (sin_t);
1766 			break;
1767 		case RTA_SRC:
1768 			sin->sin_addr.s_addr = src_addr;
1769 			sin->sin_family = AF_INET;
1770 			cp += sizeof (sin_t);
1771 			break;
1772 		case RTA_AUTHOR:
1773 			sin->sin_addr.s_addr = author;
1774 			sin->sin_family = AF_INET;
1775 			cp += sizeof (sin_t);
1776 			break;
1777 		case RTA_BRD:
1778 			/*
1779 			 * RTA_BRD is used typically to specify a point-to-point
1780 			 * destination address.
1781 			 */
1782 			sin->sin_addr.s_addr = brd_addr;
1783 			sin->sin_family = AF_INET;
1784 			cp += sizeof (sin_t);
1785 			break;
1786 		}
1787 	}
1788 
1789 	if (gc != NULL) {
1790 		rtm_ext_t *rtm_ext;
1791 		struct rtsa_s *rp_dst;
1792 		tsol_rtsecattr_t *rsap;
1793 
1794 		ASSERT(gc->gc_grp != NULL);
1795 		ASSERT(RW_LOCK_HELD(&gc->gc_grp->gcgrp_rwlock));
1796 
1797 		rtm_ext = (rtm_ext_t *)cp;
1798 		rtm_ext->rtmex_type = RTMEX_GATEWAY_SECATTR;
1799 		rtm_ext->rtmex_len = TSOL_RTSECATTR_SIZE(1);
1800 
1801 		rsap = (tsol_rtsecattr_t *)(rtm_ext + 1);
1802 		rsap->rtsa_cnt = 1;
1803 		rp_dst = rsap->rtsa_attr;
1804 
1805 		ASSERT(gc->gc_db != NULL);
1806 		bcopy(&gc->gc_db->gcdb_attr, rp_dst, sizeof (*rp_dst));
1807 		cp = (uchar_t *)rp_dst;
1808 	}
1809 
1810 	mp->b_wptr = cp;
1811 	mp->b_cont = NULL;
1812 	/*
1813 	 * set the fields that are common to
1814 	 * to different messages.
1815 	 */
1816 	rtm->rtm_msglen = (short)(header_size + data_size);
1817 	rtm->rtm_version = RTM_VERSION;
1818 	rtm->rtm_type = (uchar_t)type;
1819 }
1820 
1821 /*
1822  * Allocates and initializes a routing socket message.
1823  * Note that sacnt is either zero or one.
1824  */
1825 mblk_t *
1826 rts_alloc_msg(int type, int rtm_addrs, sa_family_t af, uint_t sacnt)
1827 {
1828 	size_t	length;
1829 	mblk_t	*mp;
1830 
1831 	length = RTS_MSG_SIZE(type, rtm_addrs, af, sacnt);
1832 	mp = allocb(length, BPRI_MED);
1833 	if (mp == NULL)
1834 		return (mp);
1835 	bzero(mp->b_rptr, length);
1836 	return (mp);
1837 }
1838 
1839 /*
1840  * Returns the size of the routing
1841  * socket message header size.
1842  */
1843 size_t
1844 rts_header_msg_size(int type)
1845 {
1846 	switch (type) {
1847 	case RTM_DELADDR:
1848 	case RTM_NEWADDR:
1849 	case RTM_CHGADDR:
1850 	case RTM_FREEADDR:
1851 		return (sizeof (ifa_msghdr_t));
1852 	case RTM_IFINFO:
1853 		return (sizeof (if_msghdr_t));
1854 	default:
1855 		return (sizeof (rt_msghdr_t));
1856 	}
1857 }
1858 
1859 /*
1860  * Returns the size of the message needed with the given rtm_addrs and family.
1861  *
1862  * It is assumed that all of the sockaddrs (with the exception of RTA_IFP) are
1863  * of the same family (currently either AF_INET or AF_INET6).
1864  */
1865 size_t
1866 rts_data_msg_size(int rtm_addrs, sa_family_t af, uint_t sacnt)
1867 {
1868 	int	i;
1869 	size_t	length = 0;
1870 
1871 	for (i = 0; i < RTA_NUMBITS; i++) {
1872 		switch (rtm_addrs & (1 << i)) {
1873 		case RTA_IFP:
1874 			length += sizeof (struct sockaddr_dl);
1875 			break;
1876 		case RTA_DST:
1877 		case RTA_GATEWAY:
1878 		case RTA_NETMASK:
1879 		case RTA_SRC:
1880 		case RTA_IFA:
1881 		case RTA_AUTHOR:
1882 		case RTA_BRD:
1883 			ASSERT(af == AF_INET || af == AF_INET6);
1884 			switch (af) {
1885 			case AF_INET:
1886 				length += sizeof (sin_t);
1887 				break;
1888 			case AF_INET6:
1889 				length += sizeof (sin6_t);
1890 				break;
1891 			}
1892 			break;
1893 		}
1894 	}
1895 	if (sacnt > 0)
1896 		length += sizeof (rtm_ext_t) + TSOL_RTSECATTR_SIZE(sacnt);
1897 
1898 	return (length);
1899 }
1900 
1901 /*
1902  * This routine is called to generate a message to the routing
1903  * socket indicating that a redirect has occured, a routing lookup
1904  * has failed, or that a protocol has detected timeouts to a particular
1905  * destination. This routine is called for message types RTM_LOSING,
1906  * RTM_REDIRECT, and RTM_MISS.
1907  */
1908 void
1909 ip_rts_change(int type, ipaddr_t dst_addr, ipaddr_t gw_addr, ipaddr_t net_mask,
1910     ipaddr_t source, ipaddr_t author, int flags, int error, int rtm_addrs,
1911     ip_stack_t *ipst)
1912 {
1913 	rt_msghdr_t	*rtm;
1914 	mblk_t		*mp;
1915 
1916 	if (rtm_addrs == 0)
1917 		return;
1918 	mp = rts_alloc_msg(type, rtm_addrs, AF_INET, 0);
1919 	if (mp == NULL)
1920 		return;
1921 	rts_fill_msg(type, rtm_addrs, dst_addr, net_mask, gw_addr, source, 0,
1922 	    author, 0, NULL, mp, NULL);
1923 	rtm = (rt_msghdr_t *)mp->b_rptr;
1924 	rtm->rtm_flags = flags;
1925 	rtm->rtm_errno = error;
1926 	rtm->rtm_flags |= RTF_DONE;
1927 	rtm->rtm_addrs = rtm_addrs;
1928 	rts_queue_input(mp, NULL, AF_INET, RTSQ_ALL, ipst);
1929 }
1930 
1931 /*
1932  * This routine is called to generate a message to the routing
1933  * socket indicating that the status of a network interface has changed.
1934  * Message type generated RTM_IFINFO.
1935  */
1936 void
1937 ip_rts_ifmsg(const ipif_t *ipif, uint_t flags)
1938 {
1939 	ip_rts_xifmsg(ipif, 0, 0, flags);
1940 }
1941 
1942 void
1943 ip_rts_xifmsg(const ipif_t *ipif, uint64_t set, uint64_t clear, uint_t flags)
1944 {
1945 	if_msghdr_t	*ifm;
1946 	mblk_t		*mp;
1947 	sa_family_t	af;
1948 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
1949 
1950 	/*
1951 	 * This message should be generated only
1952 	 * when the physical device is changing
1953 	 * state.
1954 	 */
1955 	if (ipif->ipif_id != 0)
1956 		return;
1957 	if (ipif->ipif_isv6) {
1958 		af = AF_INET6;
1959 		mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af, 0);
1960 		if (mp == NULL)
1961 			return;
1962 		rts_fill_msg_v6(RTM_IFINFO, RTA_IFP, &ipv6_all_zeros,
1963 		    &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros,
1964 		    &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros,
1965 		    ipif->ipif_ill, mp, NULL);
1966 	} else {
1967 		af = AF_INET;
1968 		mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af, 0);
1969 		if (mp == NULL)
1970 			return;
1971 		rts_fill_msg(RTM_IFINFO, RTA_IFP, 0, 0, 0, 0, 0, 0, 0,
1972 		    ipif->ipif_ill, mp, NULL);
1973 	}
1974 	ifm = (if_msghdr_t *)mp->b_rptr;
1975 	ifm->ifm_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
1976 	ifm->ifm_flags = (ipif->ipif_flags | ipif->ipif_ill->ill_flags |
1977 	    ipif->ipif_ill->ill_phyint->phyint_flags | set) & ~clear;
1978 	rts_getifdata(&ifm->ifm_data, ipif);
1979 	ifm->ifm_addrs = RTA_IFP;
1980 
1981 	if (flags & RTSQ_DEFAULT) {
1982 		flags = RTSQ_ALL;
1983 		/*
1984 		 * If this message is for an underlying interface, prevent
1985 		 * "normal" (IPMP-unaware) routing sockets from seeing it.
1986 		 */
1987 		if (IS_UNDER_IPMP(ipif->ipif_ill))
1988 			flags &= ~RTSQ_NORMAL;
1989 	}
1990 
1991 	rts_queue_input(mp, NULL, af, flags, ipst);
1992 }
1993 
1994 /*
1995  * If cmd is RTM_ADD or RTM_DELETE, generate the rt_msghdr_t message;
1996  * otherwise (RTM_NEWADDR, RTM_DELADDR, RTM_CHGADDR and RTM_FREEADDR)
1997  * generate the ifa_msghdr_t message.
1998  */
1999 static void
2000 rts_new_rtsmsg(int cmd, int error, const ipif_t *ipif, uint_t flags)
2001 {
2002 	int		rtm_addrs;
2003 	mblk_t		*mp;
2004 	ifa_msghdr_t	*ifam;
2005 	rt_msghdr_t	*rtm;
2006 	sa_family_t	af;
2007 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
2008 
2009 	/*
2010 	 * Do not report unspecified address if this is the RTM_CHGADDR or
2011 	 * RTM_FREEADDR message.
2012 	 */
2013 	if (cmd == RTM_CHGADDR || cmd == RTM_FREEADDR) {
2014 		if (!ipif->ipif_isv6) {
2015 			if (ipif->ipif_lcl_addr == INADDR_ANY)
2016 				return;
2017 		} else if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
2018 			return;
2019 		}
2020 	}
2021 
2022 	if (ipif->ipif_isv6)
2023 		af = AF_INET6;
2024 	else
2025 		af = AF_INET;
2026 
2027 	if (cmd == RTM_ADD || cmd == RTM_DELETE)
2028 		rtm_addrs = (RTA_DST | RTA_NETMASK);
2029 	else
2030 		rtm_addrs = (RTA_IFA | RTA_NETMASK | RTA_BRD | RTA_IFP);
2031 
2032 	mp = rts_alloc_msg(cmd, rtm_addrs, af, 0);
2033 	if (mp == NULL)
2034 		return;
2035 
2036 	if (cmd != RTM_ADD && cmd != RTM_DELETE) {
2037 		switch (af) {
2038 		case AF_INET:
2039 			rts_fill_msg(cmd, rtm_addrs, 0,
2040 			    ipif->ipif_net_mask, 0, ipif->ipif_lcl_addr,
2041 			    ipif->ipif_pp_dst_addr, 0,
2042 			    ipif->ipif_lcl_addr, ipif->ipif_ill,
2043 			    mp, NULL);
2044 			break;
2045 		case AF_INET6:
2046 			rts_fill_msg_v6(cmd, rtm_addrs,
2047 			    &ipv6_all_zeros, &ipif->ipif_v6net_mask,
2048 			    &ipv6_all_zeros, &ipif->ipif_v6lcl_addr,
2049 			    &ipif->ipif_v6pp_dst_addr, &ipv6_all_zeros,
2050 			    &ipif->ipif_v6lcl_addr, ipif->ipif_ill,
2051 			    mp, NULL);
2052 			break;
2053 		}
2054 		ifam = (ifa_msghdr_t *)mp->b_rptr;
2055 		ifam->ifam_index =
2056 		    ipif->ipif_ill->ill_phyint->phyint_ifindex;
2057 		ifam->ifam_metric = ipif->ipif_ill->ill_metric;
2058 		ifam->ifam_flags = ((cmd == RTM_NEWADDR) ? RTF_UP : 0);
2059 		ifam->ifam_addrs = rtm_addrs;
2060 	} else {
2061 		switch (af) {
2062 		case AF_INET:
2063 			rts_fill_msg(cmd, rtm_addrs,
2064 			    ipif->ipif_lcl_addr, ipif->ipif_net_mask, 0,
2065 			    0, 0, 0, 0, NULL, mp, NULL);
2066 			break;
2067 		case AF_INET6:
2068 			rts_fill_msg_v6(cmd, rtm_addrs,
2069 			    &ipif->ipif_v6lcl_addr,
2070 			    &ipif->ipif_v6net_mask, &ipv6_all_zeros,
2071 			    &ipv6_all_zeros, &ipv6_all_zeros,
2072 			    &ipv6_all_zeros, &ipv6_all_zeros,
2073 			    NULL, mp, NULL);
2074 			break;
2075 		}
2076 		rtm = (rt_msghdr_t *)mp->b_rptr;
2077 		rtm->rtm_index =
2078 		    ipif->ipif_ill->ill_phyint->phyint_ifindex;
2079 		rtm->rtm_flags = ((cmd == RTM_ADD) ? RTF_UP : 0);
2080 		rtm->rtm_errno = error;
2081 		if (error == 0)
2082 			rtm->rtm_flags |= RTF_DONE;
2083 		rtm->rtm_addrs = rtm_addrs;
2084 	}
2085 	rts_queue_input(mp, NULL, af, flags, ipst);
2086 }
2087 
2088 /*
2089  * This is called to generate messages to the routing socket
2090  * indicating a network interface has had addresses associated with it.
2091  * The structure of the code is based on the 4.4BSD-Lite2 <net/rtsock.c>.
2092  */
2093 void
2094 ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags)
2095 {
2096 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
2097 
2098 	if (flags & RTSQ_DEFAULT) {
2099 		flags = RTSQ_ALL;
2100 		/*
2101 		 * If this message is for an underlying interface, prevent
2102 		 * "normal" (IPMP-unaware) routing sockets from seeing it.
2103 		 */
2104 		if (IS_UNDER_IPMP(ipif->ipif_ill))
2105 			flags &= ~RTSQ_NORMAL;
2106 	}
2107 
2108 	/*
2109 	 * Let conn_ixa caching know that source address selection
2110 	 * changed
2111 	 */
2112 	if (cmd == RTM_ADD || cmd == RTM_DELETE)
2113 		ip_update_source_selection(ipst);
2114 
2115 	/*
2116 	 * If the request is DELETE, send RTM_DELETE and RTM_DELADDR.
2117 	 * if the request is ADD, send RTM_NEWADDR and RTM_ADD.
2118 	 * otherwise simply send the request.
2119 	 */
2120 	switch (cmd) {
2121 	case RTM_ADD:
2122 		rts_new_rtsmsg(RTM_NEWADDR, error, ipif, flags);
2123 		rts_new_rtsmsg(RTM_ADD, error, ipif, flags);
2124 		break;
2125 	case RTM_DELETE:
2126 		rts_new_rtsmsg(RTM_DELETE, error, ipif, flags);
2127 		rts_new_rtsmsg(RTM_DELADDR, error, ipif, flags);
2128 		break;
2129 	default:
2130 		rts_new_rtsmsg(cmd, error, ipif, flags);
2131 		break;
2132 	}
2133 }
2134 
2135 /*
2136  * Based on the address family specified in a sockaddr, copy the address field
2137  * into an in6_addr_t.
2138  *
2139  * In the case of AF_UNSPEC, we assume the family is actually AF_INET for
2140  * compatibility with programs that leave the family cleared in the sockaddr.
2141  * Callers of rts_copyfromsockaddr should check the family themselves if they
2142  * wish to verify its value.
2143  *
2144  * In the case of AF_INET6, a check is made to ensure that address is not an
2145  * IPv4-mapped address.
2146  */
2147 size_t
2148 rts_copyfromsockaddr(struct sockaddr *sa, in6_addr_t *addrp)
2149 {
2150 	switch (sa->sa_family) {
2151 	case AF_INET:
2152 	case AF_UNSPEC:
2153 		IN6_IPADDR_TO_V4MAPPED(((sin_t *)sa)->sin_addr.s_addr, addrp);
2154 		return (sizeof (sin_t));
2155 	case AF_INET6:
2156 		*addrp = ((sin6_t *)sa)->sin6_addr;
2157 		if (IN6_IS_ADDR_V4MAPPED(addrp))
2158 			return (0);
2159 		return (sizeof (sin6_t));
2160 	default:
2161 		return (0);
2162 	}
2163 }
2164