xref: /illumos-gate/usr/src/uts/common/inet/ip/ipclassifier.c (revision c94be9439c4f0773ef60e2cec21d548359cfea20)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
24  */
25 
26 /*
27  * IP PACKET CLASSIFIER
28  *
29  * The IP packet classifier provides mapping between IP packets and persistent
30  * connection state for connection-oriented protocols. It also provides
31  * interface for managing connection states.
32  *
33  * The connection state is kept in conn_t data structure and contains, among
34  * other things:
35  *
36  *	o local/remote address and ports
37  *	o Transport protocol
38  *	o squeue for the connection (for TCP only)
39  *	o reference counter
40  *	o Connection state
41  *	o hash table linkage
42  *	o interface/ire information
43  *	o credentials
44  *	o ipsec policy
45  *	o send and receive functions.
46  *	o mutex lock.
47  *
48  * Connections use a reference counting scheme. They are freed when the
49  * reference counter drops to zero. A reference is incremented when connection
50  * is placed in a list or table, when incoming packet for the connection arrives
51  * and when connection is processed via squeue (squeue processing may be
52  * asynchronous and the reference protects the connection from being destroyed
53  * before its processing is finished).
54  *
55  * conn_recv is used to pass up packets to the ULP.
56  * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
57  * a listener, and changes to tcp_input_listener as the listener has picked a
58  * good squeue. For other cases it is set to tcp_input_data.
59  *
60  * conn_recvicmp is used to pass up ICMP errors to the ULP.
61  *
62  * Classifier uses several hash tables:
63  *
64  *	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
65  *	ipcl_bind_fanout:	contains all connections in BOUND state
66  *	ipcl_proto_fanout:	IPv4 protocol fanout
67  *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
68  *	ipcl_udp_fanout:	contains all UDP connections
69  *	ipcl_iptun_fanout:	contains all IP tunnel connections
70  *	ipcl_globalhash_fanout:	contains all connections
71  *
72  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
73  * which need to view all existing connections.
74  *
75  * All tables are protected by per-bucket locks. When both per-bucket lock and
76  * connection lock need to be held, the per-bucket lock should be acquired
77  * first, followed by the connection lock.
78  *
79  * All functions doing search in one of these tables increment a reference
80  * counter on the connection found (if any). This reference should be dropped
81  * when the caller has finished processing the connection.
82  *
83  *
84  * INTERFACES:
85  * ===========
86  *
87  * Connection Lookup:
88  * ------------------
89  *
90  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
91  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
92  *
93  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
94  * it can't find any associated connection. If the connection is found, its
95  * reference counter is incremented.
96  *
97  *	mp:	mblock, containing packet header. The full header should fit
98  *		into a single mblock. It should also contain at least full IP
99  *		and TCP or UDP header.
100  *
101  *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
102  *
103  *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
104  *		 the packet.
105  *
106  *	ira->ira_zoneid: The zone in which the returned connection must be; the
107  *		zoneid corresponding to the ire_zoneid on the IRE located for
108  *		the packet's destination address.
109  *
110  *	ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
111  *		IRAF_TX_SHARED_ADDR flags
112  *
113  *	For TCP connections, the lookup order is as follows:
114  *		5-tuple {src, dst, protocol, local port, remote port}
115  *			lookup in ipcl_conn_fanout table.
116  *		3-tuple {dst, remote port, protocol} lookup in
117  *			ipcl_bind_fanout table.
118  *
119  *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
120  *	remote port} lookup is done on ipcl_udp_fanout. Note that,
121  *	these interfaces do not handle cases where a packet belongs
122  *	to multiple UDP clients, which is handled in IP itself.
123  *
124  * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
125  * determine which actual zone gets the segment.  This is used only in a
126  * labeled environment.  The matching rules are:
127  *
128  *	- If it's not a multilevel port, then the label on the packet selects
129  *	  the zone.  Unlabeled packets are delivered to the global zone.
130  *
131  *	- If it's a multilevel port, then only the zone registered to receive
132  *	  packets on that port matches.
133  *
134  * Also, in a labeled environment, packet labels need to be checked.  For fully
135  * bound TCP connections, we can assume that the packet label was checked
136  * during connection establishment, and doesn't need to be checked on each
137  * packet.  For others, though, we need to check for strict equality or, for
138  * multilevel ports, membership in the range or set.  This part currently does
139  * a tnrh lookup on each packet, but could be optimized to use cached results
140  * if that were necessary.  (SCTP doesn't come through here, but if it did,
141  * we would apply the same rules as TCP.)
142  *
143  * An implication of the above is that fully-bound TCP sockets must always use
144  * distinct 4-tuples; they can't be discriminated by label alone.
145  *
146  * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
147  * as there's no connection set-up handshake and no shared state.
148  *
149  * Labels on looped-back packets within a single zone do not need to be
150  * checked, as all processes in the same zone have the same label.
151  *
152  * Finally, for unlabeled packets received by a labeled system, special rules
153  * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
154  * socket in the zone whose label matches the default label of the sender, if
155  * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
156  * receiver's label must dominate the sender's default label.
157  *
158  * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
159  * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
160  *					 ip_stack);
161  *
162  *	Lookup routine to find an exact match for {src, dst, local port,
163  *	remote port} for TCP connections in ipcl_conn_fanout. The address and
164  *	ports are read from the IP and TCP header respectively.
165  *
166  * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
167  *					 zoneid, ip_stack);
168  * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
169  *					 zoneid, ip_stack);
170  *
171  *	Lookup routine to find a listener with the tuple {lport, laddr,
172  *	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
173  *	parameter interface index is also compared.
174  *
175  * void ipcl_walk(func, arg, ip_stack)
176  *
177  *	Apply 'func' to every connection available. The 'func' is called as
178  *	(*func)(connp, arg). The walk is non-atomic so connections may be
179  *	created and destroyed during the walk. The CONN_CONDEMNED and
180  *	CONN_INCIPIENT flags ensure that connections which are newly created
181  *	or being destroyed are not selected by the walker.
182  *
183  * Table Updates
184  * -------------
185  *
186  * int ipcl_conn_insert(connp);
187  * int ipcl_conn_insert_v4(connp);
188  * int ipcl_conn_insert_v6(connp);
189  *
190  *	Insert 'connp' in the ipcl_conn_fanout.
191  *	Arguments :
192  *		connp		conn_t to be inserted
193  *
194  *	Return value :
195  *		0		if connp was inserted
196  *		EADDRINUSE	if the connection with the same tuple
197  *				already exists.
198  *
199  * int ipcl_bind_insert(connp);
200  * int ipcl_bind_insert_v4(connp);
201  * int ipcl_bind_insert_v6(connp);
202  *
203  *	Insert 'connp' in ipcl_bind_fanout.
204  *	Arguments :
205  *		connp		conn_t to be inserted
206  *
207  *
208  * void ipcl_hash_remove(connp);
209  *
210  *	Removes the 'connp' from the connection fanout table.
211  *
212  * Connection Creation/Destruction
213  * -------------------------------
214  *
215  * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
216  *
217  *	Creates a new conn based on the type flag, inserts it into
218  *	globalhash table.
219  *
220  *	type:	This flag determines the type of conn_t which needs to be
221  *		created i.e., which kmem_cache it comes from.
222  *		IPCL_TCPCONN	indicates a TCP connection
223  *		IPCL_SCTPCONN	indicates a SCTP connection
224  *		IPCL_UDPCONN	indicates a UDP conn_t.
225  *		IPCL_RAWIPCONN	indicates a RAWIP/ICMP conn_t.
226  *		IPCL_RTSCONN	indicates a RTS conn_t.
227  *		IPCL_IPCCONN	indicates all other connections.
228  *
229  * void ipcl_conn_destroy(connp)
230  *
231  *	Destroys the connection state, removes it from the global
232  *	connection hash table and frees its memory.
233  */
234 
235 #include <sys/types.h>
236 #include <sys/stream.h>
237 #include <sys/stropts.h>
238 #include <sys/sysmacros.h>
239 #include <sys/strsubr.h>
240 #include <sys/strsun.h>
241 #define	_SUN_TPI_VERSION 2
242 #include <sys/ddi.h>
243 #include <sys/cmn_err.h>
244 #include <sys/debug.h>
245 
246 #include <sys/systm.h>
247 #include <sys/param.h>
248 #include <sys/kmem.h>
249 #include <sys/isa_defs.h>
250 #include <inet/common.h>
251 #include <netinet/ip6.h>
252 #include <netinet/icmp6.h>
253 
254 #include <inet/ip.h>
255 #include <inet/ip_if.h>
256 #include <inet/ip_ire.h>
257 #include <inet/ip6.h>
258 #include <inet/ip_ndp.h>
259 #include <inet/ip_impl.h>
260 #include <inet/udp_impl.h>
261 #include <inet/sctp_ip.h>
262 #include <inet/sctp/sctp_impl.h>
263 #include <inet/rawip_impl.h>
264 #include <inet/rts_impl.h>
265 #include <inet/iptun/iptun_impl.h>
266 
267 #include <sys/cpuvar.h>
268 
269 #include <inet/ipclassifier.h>
270 #include <inet/tcp.h>
271 #include <inet/ipsec_impl.h>
272 
273 #include <sys/tsol/tnet.h>
274 #include <sys/sockio.h>
275 
276 /* Old value for compatibility. Settable in /etc/system */
277 uint_t tcp_conn_hash_size = 0;
278 
279 /* New value. Zero means choose automatically.  Settable in /etc/system */
280 uint_t ipcl_conn_hash_size = 0;
281 uint_t ipcl_conn_hash_memfactor = 8192;
282 uint_t ipcl_conn_hash_maxsize = 82500;
283 
284 /* bind/udp fanout table size */
285 uint_t ipcl_bind_fanout_size = 512;
286 uint_t ipcl_udp_fanout_size = 16384;
287 
288 /* Raw socket fanout size.  Must be a power of 2. */
289 uint_t ipcl_raw_fanout_size = 256;
290 
291 /*
292  * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
293  * expect that most large deployments would have hundreds of tunnels, and
294  * thousands in the extreme case.
295  */
296 uint_t ipcl_iptun_fanout_size = 6143;
297 
298 /*
299  * Power of 2^N Primes useful for hashing for N of 0-28,
300  * these primes are the nearest prime <= 2^N - 2^(N-2).
301  */
302 
303 #define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,	\
304 		6143, 12281, 24571, 49139, 98299, 196597, 393209,	\
305 		786431, 1572853, 3145721, 6291449, 12582893, 25165813,	\
306 		50331599, 100663291, 201326557, 0}
307 
/*
 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 * are aligned on cache lines.
 */
typedef union itc_s {
	conn_t	itc_conn;	/* the conn; transport state is appended */
	char	itcu_filler[CACHE_ALIGN(conn_s)];	/* cache-line pad */
} itc_t;
316 
317 struct kmem_cache  *tcp_conn_cache;
318 struct kmem_cache  *ip_conn_cache;
319 extern struct kmem_cache  *sctp_conn_cache;
320 struct kmem_cache  *udp_conn_cache;
321 struct kmem_cache  *rawip_conn_cache;
322 struct kmem_cache  *rts_conn_cache;
323 
324 extern void	tcp_timermp_free(tcp_t *);
325 extern mblk_t	*tcp_timermp_alloc(int);
326 
327 static int	ip_conn_constructor(void *, void *, int);
328 static void	ip_conn_destructor(void *, void *);
329 
330 static int	tcp_conn_constructor(void *, void *, int);
331 static void	tcp_conn_destructor(void *, void *);
332 
333 static int	udp_conn_constructor(void *, void *, int);
334 static void	udp_conn_destructor(void *, void *);
335 
336 static int	rawip_conn_constructor(void *, void *, int);
337 static void	rawip_conn_destructor(void *, void *);
338 
339 static int	rts_conn_constructor(void *, void *, int);
340 static void	rts_conn_destructor(void *, void *);
341 
342 /*
343  * Global (for all stack instances) init routine
344  */
345 void
346 ipcl_g_init(void)
347 {
348 	ip_conn_cache = kmem_cache_create("ip_conn_cache",
349 	    sizeof (conn_t), CACHE_ALIGN_SIZE,
350 	    ip_conn_constructor, ip_conn_destructor,
351 	    NULL, NULL, NULL, 0);
352 
353 	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
354 	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
355 	    tcp_conn_constructor, tcp_conn_destructor,
356 	    tcp_conn_reclaim, NULL, NULL, 0);
357 
358 	udp_conn_cache = kmem_cache_create("udp_conn_cache",
359 	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
360 	    udp_conn_constructor, udp_conn_destructor,
361 	    NULL, NULL, NULL, 0);
362 
363 	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
364 	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
365 	    rawip_conn_constructor, rawip_conn_destructor,
366 	    NULL, NULL, NULL, 0);
367 
368 	rts_conn_cache = kmem_cache_create("rts_conn_cache",
369 	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
370 	    rts_conn_constructor, rts_conn_destructor,
371 	    NULL, NULL, NULL, 0);
372 }
373 
374 /*
375  * ipclassifier initialization routine, sets up hash tables.
376  */
void
ipcl_init(ip_stack_t *ipst)
{
	int i;
	int sizes[] = P2Ps();

	/*
	 * Calculate size of conn fanout table from /etc/system settings:
	 * ipcl_conn_hash_size wins over the legacy tcp_conn_hash_size;
	 * if neither is set, scale with available memory.
	 */
	if (ipcl_conn_hash_size != 0) {
		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
	} else if (tcp_conn_hash_size != 0) {
		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
	} else {
		extern pgcnt_t freemem;

		/* Neither tunable set: derive from free memory ... */
		ipst->ips_ipcl_conn_fanout_size =
		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;

		/* ... capped at ipcl_conn_hash_maxsize. */
		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
			ipst->ips_ipcl_conn_fanout_size =
			    ipcl_conn_hash_maxsize;
		}
	}

	/*
	 * Round the requested size up to the nearest prime in the P2Ps()
	 * table; candidates below index 9 are never considered.  The
	 * table is 0-terminated, so running off the usable range leaves
	 * us on a zero entry, handled below.
	 */
	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
			break;
		}
	}
	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
		/* Out of range, use the 2^16 value */
		ipst->ips_ipcl_conn_fanout_size = sizes[16];
	}

	/* Take values from /etc/system */
	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
	ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;

	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);

	/*
	 * Allocate each fanout table and initialize its per-bucket locks.
	 */
	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);

	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);

	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	/* Protocol fanouts are indexed directly by protocol number. */
	ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
	    sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < IPPROTO_MAX; i++) {
		mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
	    sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < IPPROTO_MAX; i++) {
		mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	/* Routing-socket clients live on a single dedicated bucket. */
	ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
	mutex_init(&ipst->ips_rts_clients->connf_lock,
	    NULL, MUTEX_DEFAULT, NULL);

	ipst->ips_ipcl_udp_fanout = kmem_zalloc(
	    ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_raw_fanout = kmem_zalloc(
	    ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
		mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
	    sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}
482 
483 void
484 ipcl_g_destroy(void)
485 {
486 	kmem_cache_destroy(ip_conn_cache);
487 	kmem_cache_destroy(tcp_conn_cache);
488 	kmem_cache_destroy(udp_conn_cache);
489 	kmem_cache_destroy(rawip_conn_cache);
490 	kmem_cache_destroy(rts_conn_cache);
491 }
492 
/*
 * All user-level and kernel use of the stack must be gone
 * by now.
 *
 * For each fanout table: every bucket must already be empty (asserted),
 * so we only tear down the per-bucket locks and free the table memory.
 */
void
ipcl_destroy(ip_stack_t *ipst)
{
	int i;

	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_conn_fanout = NULL;

	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_bind_fanout = NULL;

	for (i = 0; i < IPPROTO_MAX; i++) {
		ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_proto_fanout_v4,
	    IPPROTO_MAX * sizeof (connf_t));
	ipst->ips_ipcl_proto_fanout_v4 = NULL;

	for (i = 0; i < IPPROTO_MAX; i++) {
		ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_proto_fanout_v6,
	    IPPROTO_MAX * sizeof (connf_t));
	ipst->ips_ipcl_proto_fanout_v6 = NULL;

	for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_udp_fanout = NULL;

	for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_iptun_fanout,
	    ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
	ipst->ips_ipcl_iptun_fanout = NULL;

	for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
		ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
	    sizeof (connf_t));
	ipst->ips_ipcl_raw_fanout = NULL;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
		mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
	}
	kmem_free(ipst->ips_ipcl_globalhash_fanout,
	    sizeof (connf_t) * CONN_G_HASH_SIZE);
	ipst->ips_ipcl_globalhash_fanout = NULL;

	/* Single routing-socket client bucket. */
	ASSERT(ipst->ips_rts_clients->connf_head == NULL);
	mutex_destroy(&ipst->ips_rts_clients->connf_lock);
	kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
	ipst->ips_rts_clients = NULL;
}
571 
572 /*
573  * conn creation routine. initialize the conn, sets the reference
574  * and inserts it in the global hash table.
575  */
576 conn_t *
577 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
578 {
579 	conn_t	*connp;
580 	struct kmem_cache *conn_cache;
581 
582 	switch (type) {
583 	case IPCL_SCTPCONN:
584 		if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
585 			return (NULL);
586 		sctp_conn_init(connp);
587 		netstack_hold(ns);
588 		connp->conn_netstack = ns;
589 		connp->conn_ixa->ixa_ipst = ns->netstack_ip;
590 		connp->conn_ixa->ixa_conn_id = (long)connp;
591 		ipcl_globalhash_insert(connp);
592 		return (connp);
593 
594 	case IPCL_TCPCONN:
595 		conn_cache = tcp_conn_cache;
596 		break;
597 
598 	case IPCL_UDPCONN:
599 		conn_cache = udp_conn_cache;
600 		break;
601 
602 	case IPCL_RAWIPCONN:
603 		conn_cache = rawip_conn_cache;
604 		break;
605 
606 	case IPCL_RTSCONN:
607 		conn_cache = rts_conn_cache;
608 		break;
609 
610 	case IPCL_IPCCONN:
611 		conn_cache = ip_conn_cache;
612 		break;
613 
614 	default:
615 		conn_cache = NULL;
616 		connp = NULL;
617 		ASSERT(0);
618 	}
619 
620 	if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
621 		return (NULL);
622 
623 	connp->conn_ref = 1;
624 	netstack_hold(ns);
625 	connp->conn_netstack = ns;
626 	connp->conn_ixa->ixa_ipst = ns->netstack_ip;
627 	connp->conn_ixa->ixa_conn_id = (long)connp;
628 	ipcl_globalhash_insert(connp);
629 	return (connp);
630 }
631 
/*
 * Tear down a conn whose last reference has been dropped and return it
 * to its kmem cache: release credentials, cached header template, IPsec
 * state and the netstack hold, and remove it from the global hash.
 * TCP and SCTP conns take special paths that preserve cached state for
 * reuse.
 */
void
ipcl_conn_destroy(conn_t *connp)
{
	mblk_t	*mp;
	netstack_t	*ns = connp->conn_netstack;

	ASSERT(!MUTEX_HELD(&connp->conn_lock));
	ASSERT(connp->conn_ref == 0);
	ASSERT(connp->conn_ioctlref == 0);

	DTRACE_PROBE1(conn__destroy, conn_t *, connp);

	if (connp->conn_cred != NULL) {
		crfree(connp->conn_cred);
		connp->conn_cred = NULL;
		/* ixa_cred done in ipcl_conn_cleanup below */
	}

	/* Free the cached IP+transport header template, if any. */
	if (connp->conn_ht_iphc != NULL) {
		kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
		connp->conn_ht_iphc = NULL;
		connp->conn_ht_iphc_allocated = 0;
		connp->conn_ht_iphc_len = 0;
		connp->conn_ht_ulp = NULL;
		connp->conn_ht_ulp_len = 0;
	}
	ip_pkt_free(&connp->conn_xmit_ipp);

	ipcl_globalhash_remove(connp);

	/* Release any latched IPsec state and policy references. */
	if (connp->conn_latch != NULL) {
		IPLATCH_REFRELE(connp->conn_latch);
		connp->conn_latch = NULL;
	}
	if (connp->conn_latch_in_policy != NULL) {
		IPPOL_REFRELE(connp->conn_latch_in_policy);
		connp->conn_latch_in_policy = NULL;
	}
	if (connp->conn_latch_in_action != NULL) {
		IPACT_REFRELE(connp->conn_latch_in_action);
		connp->conn_latch_in_action = NULL;
	}
	if (connp->conn_policy != NULL) {
		IPPH_REFRELE(connp->conn_policy, ns);
		connp->conn_policy = NULL;
	}

	if (connp->conn_ipsec_opt_mp != NULL) {
		freemsg(connp->conn_ipsec_opt_mp);
		connp->conn_ipsec_opt_mp = NULL;
	}

	if (connp->conn_flags & IPCL_TCPCONN) {
		tcp_t *tcp = connp->conn_tcp;

		tcp_free(tcp);
		/*
		 * Save the timer mblk across the bzero of the tcp_t below
		 * so it stays with the cached object.
		 */
		mp = tcp->tcp_timercache;

		tcp->tcp_tcps = NULL;

		/*
		 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
		 * the mblk.
		 */
		if (tcp->tcp_rsrv_mp != NULL) {
			freeb(tcp->tcp_rsrv_mp);
			tcp->tcp_rsrv_mp = NULL;
			mutex_destroy(&tcp->tcp_rsrv_mp_lock);
		}

		ipcl_conn_cleanup(connp);
		/* Reset flags so the cached conn is a pristine TCP conn. */
		connp->conn_flags = IPCL_TCPCONN;
		if (ns != NULL) {
			ASSERT(tcp->tcp_tcps == NULL);
			connp->conn_netstack = NULL;
			connp->conn_ixa->ixa_ipst = NULL;
			netstack_rele(ns);
		}

		bzero(tcp, sizeof (tcp_t));

		/* Restore the fields that survive in the cached object. */
		tcp->tcp_timercache = mp;
		tcp->tcp_connp = connp;
		kmem_cache_free(tcp_conn_cache, connp);
		return;
	}

	if (connp->conn_flags & IPCL_SCTPCONN) {
		ASSERT(ns != NULL);
		/* SCTP has its own teardown path. */
		sctp_free(connp);
		return;
	}

	ipcl_conn_cleanup(connp);
	if (ns != NULL) {
		connp->conn_netstack = NULL;
		connp->conn_ixa->ixa_ipst = NULL;
		netstack_rele(ns);
	}

	/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
	if (connp->conn_flags & IPCL_UDPCONN) {
		connp->conn_flags = IPCL_UDPCONN;
		kmem_cache_free(udp_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RAWIPCONN) {
		connp->conn_flags = IPCL_RAWIPCONN;
		/* Raw conns are reset back to ICMP before being cached. */
		connp->conn_proto = IPPROTO_ICMP;
		connp->conn_ixa->ixa_protocol = connp->conn_proto;
		kmem_cache_free(rawip_conn_cache, connp);
	} else if (connp->conn_flags & IPCL_RTSCONN) {
		connp->conn_flags = IPCL_RTSCONN;
		kmem_cache_free(rts_conn_cache, connp);
	} else {
		connp->conn_flags = IPCL_IPCCONN;
		ASSERT(connp->conn_flags & IPCL_IPCCONN);
		ASSERT(connp->conn_priv == NULL);
		kmem_cache_free(ip_conn_cache, connp);
	}
}
751 
752 /*
753  * Running in cluster mode - deregister listener information
754  */
755 static void
756 ipcl_conn_unlisten(conn_t *connp)
757 {
758 	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
759 	ASSERT(connp->conn_lport != 0);
760 
761 	if (cl_inet_unlisten != NULL) {
762 		sa_family_t	addr_family;
763 		uint8_t		*laddrp;
764 
765 		if (connp->conn_ipversion == IPV6_VERSION) {
766 			addr_family = AF_INET6;
767 			laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
768 		} else {
769 			addr_family = AF_INET;
770 			laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
771 		}
772 		(*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
773 		    IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
774 	}
775 	connp->conn_flags &= ~IPCL_CL_LISTENER;
776 }
777 
/*
 * Unlink 'connp' from whatever fanout bucket it is currently on (no-op
 * if it is on none) and drop the hash table's reference on it.
 *
 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 * which table the conn belonged to). So for debugging we can see which hash
 * table this connection was in.
 *
 * The bucket's connf_lock is acquired here, so the caller must not hold
 * conn_lock (per-bucket lock is taken before the conn lock).  If the
 * conn was a registered cluster listener, ipcl_conn_unlisten() is
 * called before the reference is dropped.
 */
#define	IPCL_HASH_REMOVE(connp)	{					\
	connf_t	*connfp = (connp)->conn_fanout;				\
	ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));			\
	if (connfp != NULL) {						\
		mutex_enter(&connfp->connf_lock);			\
		if ((connp)->conn_next != NULL)				\
			(connp)->conn_next->conn_prev =			\
			    (connp)->conn_prev;				\
		if ((connp)->conn_prev != NULL)				\
			(connp)->conn_prev->conn_next =			\
			    (connp)->conn_next;				\
		else							\
			connfp->connf_head = (connp)->conn_next;	\
		(connp)->conn_fanout = NULL;				\
		(connp)->conn_next = NULL;				\
		(connp)->conn_prev = NULL;				\
		(connp)->conn_flags |= IPCL_REMOVED;			\
		if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)	\
			ipcl_conn_unlisten((connp));			\
		CONN_DEC_REF((connp));					\
		mutex_exit(&connfp->connf_lock);			\
	}								\
}
806 
807 void
808 ipcl_hash_remove(conn_t *connp)
809 {
810 	uint8_t		protocol = connp->conn_proto;
811 
812 	IPCL_HASH_REMOVE(connp);
813 	if (protocol == IPPROTO_RSVP)
814 		ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
815 }
816 
/*
 * The whole purpose of this function is allow removal of
 * a conn_t from the connected hash for timewait reclaim.
 * This is essentially a TW reclaim fastpath where timewait
 * collector checks under fanout lock (so no one else can
 * get access to the conn_t) that refcnt is 2 i.e. one for
 * TCP and one for the classifier hash list. If ref count
 * is indeed 2, we can just remove the conn under lock and
 * avoid cleaning up the conn under squeue. This gives us
 * improved performance.
 */
void
ipcl_hash_remove_locked(conn_t *connp, connf_t	*connfp)
{
	ASSERT(MUTEX_HELD(&connfp->connf_lock));
	ASSERT(MUTEX_HELD(&connp->conn_lock));
	ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);

	/* Unlink from the doubly-linked bucket list. */
	if ((connp)->conn_next != NULL) {
		(connp)->conn_next->conn_prev = (connp)->conn_prev;
	}
	if ((connp)->conn_prev != NULL) {
		(connp)->conn_prev->conn_next = (connp)->conn_next;
	} else {
		connfp->connf_head = (connp)->conn_next;
	}
	(connp)->conn_fanout = NULL;
	(connp)->conn_next = NULL;
	(connp)->conn_prev = NULL;
	(connp)->conn_flags |= IPCL_REMOVED;
	/*
	 * Drop the hash table's reference directly rather than via
	 * CONN_DEC_REF: both locks are held and the caller has verified
	 * only TCP and the hash list hold references.
	 */
	ASSERT((connp)->conn_ref == 2);
	(connp)->conn_ref--;
}
850 
/*
 * Insert 'connp' at the head of bucket 'connfp', mark it IPCL_CONNECTED
 * (clearing IPCL_REMOVED) and take a reference for the hash table.
 * Caller holds connf_lock; the conn must not already be on a fanout
 * list (asserted).
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}
865 
/*
 * Remove 'connp' from any bucket it is currently on, then insert it
 * into 'connfp' as a connected conn under the bucket lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
872 
/*
 * Insert 'connp' into bucket 'connfp' and mark it IPCL_BOUND, taking a
 * reference for the hash table.  Conns bound to a specific local
 * address are kept ahead of those matching any address
 * (_IPCL_V4_MATCH_ANY), so the new conn is inserted just before the
 * first match-any entry; lookups therefore see the most specific
 * binding first.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {		\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}
899 
/*
 * Insert (connp), bound to the wildcard address, at the tail of fanout
 * bucket (connfp) -- with one exception: an IPv4-mapped wildcard (a v4
 * socket bound to INADDR_ANY) is inserted just before the first IPv6
 * unspecified-address (::) wildcard in the same zone, so v4 wildcard
 * binds are matched ahead of v6 any-address listeners.  (When that
 * insertion point is found, prev already equals next->conn_prev, so the
 * re-read under "if (prev != NULL)" is redundant but harmless.)  Marks
 * the conn IPCL_BOUND and takes a reference for the hash linkage.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&	\
		    connp->conn_zoneid == next->conn_zoneid) {		\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
929 
/*
 * Non-static function wrapper around IPCL_HASH_INSERT_WILDCARD so that
 * code outside this file can perform a wildcard fanout insertion.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
935 
936 /*
937  * Because the classifier is used to classify inbound packets, the destination
938  * address is meant to be our local tunnel address (tunnel source), and the
939  * source the remote tunnel address (tunnel destination).
940  *
941  * Note that conn_proto can't be used for fanout since the upper protocol
942  * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
943  */
944 conn_t *
945 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
946 {
947 	connf_t	*connfp;
948 	conn_t	*connp;
949 
950 	/* first look for IPv4 tunnel links */
951 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
952 	mutex_enter(&connfp->connf_lock);
953 	for (connp = connfp->connf_head; connp != NULL;
954 	    connp = connp->conn_next) {
955 		if (IPCL_IPTUN_MATCH(connp, *dst, *src))
956 			break;
957 	}
958 	if (connp != NULL)
959 		goto done;
960 
961 	mutex_exit(&connfp->connf_lock);
962 
963 	/* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
964 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
965 	    INADDR_ANY)];
966 	mutex_enter(&connfp->connf_lock);
967 	for (connp = connfp->connf_head; connp != NULL;
968 	    connp = connp->conn_next) {
969 		if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
970 			break;
971 	}
972 done:
973 	if (connp != NULL)
974 		CONN_INC_REF(connp);
975 	mutex_exit(&connfp->connf_lock);
976 	return (connp);
977 }
978 
979 conn_t *
980 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
981 {
982 	connf_t	*connfp;
983 	conn_t	*connp;
984 
985 	/* Look for an IPv6 tunnel link */
986 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
987 	mutex_enter(&connfp->connf_lock);
988 	for (connp = connfp->connf_head; connp != NULL;
989 	    connp = connp->conn_next) {
990 		if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
991 			CONN_INC_REF(connp);
992 			break;
993 		}
994 	}
995 	mutex_exit(&connfp->connf_lock);
996 	return (connp);
997 }
998 
999 /*
1000  * This function is used only for inserting SCTP raw socket now.
1001  * This may change later.
1002  *
1003  * Note that only one raw socket can be bound to a port.  The param
1004  * lport is in network byte order.
1005  */
static int
ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
{
	connf_t	*connfp;
	conn_t	*oconnp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Check for existing raw socket already bound to the port. */
	mutex_enter(&connfp->connf_lock);
	for (oconnp = connfp->connf_head; oconnp != NULL;
	    oconnp = oconnp->conn_next) {
		/*
		 * A conflict exists when another raw socket in the same
		 * zone and address family is bound to the same port and
		 * either side's local address is a wildcard (unspecified
		 * or v4-mapped any) or both are bound to the same address.
		 */
		if (oconnp->conn_lport == lport &&
		    oconnp->conn_zoneid == connp->conn_zoneid &&
		    oconnp->conn_family == connp->conn_family &&
		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
		    &connp->conn_laddr_v6))) {
			break;
		}
	}
	mutex_exit(&connfp->connf_lock);
	if (oconnp != NULL)
		return (EADDRNOTAVAIL);

	/*
	 * Choose the insertion flavor by how fully the conn is specified:
	 * connected (remote address known), bound (specific local address
	 * only), or wildcard (neither end specific).
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		} else {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		}
	} else {
		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
	}
	return (0);
}
1048 
1049 static int
1050 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1051 {
1052 	connf_t	*connfp;
1053 	conn_t	*tconnp;
1054 	ipaddr_t laddr = connp->conn_laddr_v4;
1055 	ipaddr_t faddr = connp->conn_faddr_v4;
1056 
1057 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1058 	mutex_enter(&connfp->connf_lock);
1059 	for (tconnp = connfp->connf_head; tconnp != NULL;
1060 	    tconnp = tconnp->conn_next) {
1061 		if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1062 			/* A tunnel is already bound to these addresses. */
1063 			mutex_exit(&connfp->connf_lock);
1064 			return (EADDRINUSE);
1065 		}
1066 	}
1067 	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1068 	mutex_exit(&connfp->connf_lock);
1069 	return (0);
1070 }
1071 
1072 static int
1073 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1074 {
1075 	connf_t	*connfp;
1076 	conn_t	*tconnp;
1077 	in6_addr_t *laddr = &connp->conn_laddr_v6;
1078 	in6_addr_t *faddr = &connp->conn_faddr_v6;
1079 
1080 	connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1081 	mutex_enter(&connfp->connf_lock);
1082 	for (tconnp = connfp->connf_head; tconnp != NULL;
1083 	    tconnp = tconnp->conn_next) {
1084 		if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1085 			/* A tunnel is already bound to these addresses. */
1086 			mutex_exit(&connfp->connf_lock);
1087 			return (EADDRINUSE);
1088 		}
1089 	}
1090 	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1091 	mutex_exit(&connfp->connf_lock);
1092 	return (0);
1093 }
1094 
1095 /*
1096  * Check for a MAC exemption conflict on a labeled system.  Note that for
1097  * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1098  * transport layer.  This check is for binding all other protocols.
1099  *
1100  * Returns true if there's a conflict.
1101  */
1102 static boolean_t
1103 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1104 {
1105 	connf_t	*connfp;
1106 	conn_t *tconn;
1107 
1108 	connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1109 	mutex_enter(&connfp->connf_lock);
1110 	for (tconn = connfp->connf_head; tconn != NULL;
1111 	    tconn = tconn->conn_next) {
1112 		/* We don't allow v4 fallback for v6 raw socket */
1113 		if (connp->conn_family != tconn->conn_family)
1114 			continue;
1115 		/* If neither is exempt, then there's no conflict */
1116 		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1117 		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1118 			continue;
1119 		/* We are only concerned about sockets for a different zone */
1120 		if (connp->conn_zoneid == tconn->conn_zoneid)
1121 			continue;
1122 		/* If both are bound to different specific addrs, ok */
1123 		if (connp->conn_laddr_v4 != INADDR_ANY &&
1124 		    tconn->conn_laddr_v4 != INADDR_ANY &&
1125 		    connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1126 			continue;
1127 		/* These two conflict; fail */
1128 		break;
1129 	}
1130 	mutex_exit(&connfp->connf_lock);
1131 	return (tconn != NULL);
1132 }
1133 
1134 static boolean_t
1135 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1136 {
1137 	connf_t	*connfp;
1138 	conn_t *tconn;
1139 
1140 	connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1141 	mutex_enter(&connfp->connf_lock);
1142 	for (tconn = connfp->connf_head; tconn != NULL;
1143 	    tconn = tconn->conn_next) {
1144 		/* We don't allow v4 fallback for v6 raw socket */
1145 		if (connp->conn_family != tconn->conn_family)
1146 			continue;
1147 		/* If neither is exempt, then there's no conflict */
1148 		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1149 		    (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1150 			continue;
1151 		/* We are only concerned about sockets for a different zone */
1152 		if (connp->conn_zoneid == tconn->conn_zoneid)
1153 			continue;
1154 		/* If both are bound to different addrs, ok */
1155 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1156 		    !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1157 		    !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1158 		    &tconn->conn_laddr_v6))
1159 			continue;
1160 		/* These two conflict; fail */
1161 		break;
1162 	}
1163 	mutex_exit(&connfp->connf_lock);
1164 	return (tconn != NULL);
1165 }
1166 
1167 /*
1168  * (v4, v6) bind hash insertion routines
1169  * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1170  */
1171 
1172 int
1173 ipcl_bind_insert(conn_t *connp)
1174 {
1175 	if (connp->conn_ipversion == IPV6_VERSION)
1176 		return (ipcl_bind_insert_v6(connp));
1177 	else
1178 		return (ipcl_bind_insert_v4(connp));
1179 }
1180 
/*
 * Insert a conn into the appropriate v4 bind-time fanout table for its
 * protocol: tunnels in the iptun fanout, UDP in the UDP fanout, TCP in
 * the bind fanout, SCTP in the raw fanout (via ipcl_sctp_hash_insert),
 * and all other protocols in the per-protocol fanout.  Returns 0 on
 * success or an errno (e.g. EADDRINUSE) on conflict.
 */
int
ipcl_bind_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t		protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	default:
		/*
		 * On labeled systems, MAC-exempt conflicts for protocols
		 * without port numbers are checked here; UDP/TCP/SCTP do
		 * this check in the transport layer instead.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		/* Insertion flavor depends on how specific the bind is. */
		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* An RSVP bind requires refreshing every ill's input fn. */
		if (protocol == IPPROTO_RSVP)
			ill_set_inputfn_all(ipst);
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Notify the cluster listen hook, if one is registered. */
		if (cl_inet_listen != NULL) {
			ASSERT(connp->conn_ipversion == IPV4_VERSION);
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, AF_INET,
			    (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1245 
/*
 * IPv6 counterpart of ipcl_bind_insert_v4(): insert a conn into the
 * appropriate v6 bind-time fanout table for its protocol.  Returns 0 on
 * success or an errno (e.g. EADDRINUSE) on conflict.
 */
int
ipcl_bind_insert_v6(conn_t *connp)
{
	connf_t		*connfp;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t		protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp)) {
		return (ipcl_iptun_hash_insert_v6(connp, ipst));
	}

	switch (protocol) {
	default:
		/*
		 * On labeled systems, MAC-exempt conflicts for protocols
		 * without port numbers are checked here; UDP/TCP/SCTP do
		 * this check in the transport layer instead.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/* Insertion flavor depends on how specific the bind is. */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/*
		 * Notify the cluster listen hook, if one is registered,
		 * passing the address family matching the conn's actual
		 * IP version (an AF_INET6 socket may be v4 after bind).
		 */
		if (cl_inet_listen != NULL) {
			sa_family_t	addr_family;
			uint8_t		*laddrp;

			if (connp->conn_ipversion == IPV6_VERSION) {
				addr_family = AF_INET6;
				laddrp =
				    (uint8_t *)&connp->conn_bound_addr_v6;
			} else {
				addr_family = AF_INET;
				laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
			}
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1318 
1319 /*
1320  * ipcl_conn_hash insertion routines.
1321  * The caller has already set conn_proto and the addresses/ports in the conn_t.
1322  */
1323 
1324 int
1325 ipcl_conn_insert(conn_t *connp)
1326 {
1327 	if (connp->conn_ipversion == IPV6_VERSION)
1328 		return (ipcl_conn_insert_v6(connp));
1329 	else
1330 		return (ipcl_conn_insert_v4(connp));
1331 }
1332 
/*
 * Insert a connected v4 conn into the appropriate fanout table.  For
 * TCP the full connection tuple (plus zone) must be unique in the conn
 * fanout; EADDRINUSE is returned if a matching entry already exists.
 * Returns 0 on success or an errno on conflict.
 */
int
ipcl_conn_insert_v4(conn_t *connp)
{
	connf_t		*connfp;
	conn_t		*tconnp;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t		protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:
		/*
		 * For TCP, we check whether the connection tuple already
		 * exists before allowing the connection to proceed.  We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * INADDR_LOOPBACK as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_faddr_v4,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
			    connp->conn_faddr_v4, connp->conn_laddr_v4,
			    connp->conn_ports) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * IPCL_HASH_REMOVE takes its own locks, so the
			 * bucket lock must be dropped around it and then
			 * reacquired for the insertion below.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);
		ASSERT(connp->conn_recvicmp != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings.  For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic.  For all others, it's
		 * done here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		/* Insertion flavor depends on how specific the conn is. */
		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1430 
/*
 * IPv6 counterpart of ipcl_conn_insert_v4().  TCP uniqueness here also
 * considers conn_bound_if: an existing entry only conflicts if it is
 * unbound or bound to the same interface index.  Returns 0 on success
 * or an errno on conflict.
 */
int
ipcl_conn_insert_v6(conn_t *connp)
{
	connf_t		*connfp;
	conn_t		*tconnp;
	int		ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t		protocol = connp->conn_proto;
	uint_t		ifindex = connp->conn_bound_if;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert_v6(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed.  We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * ipv6_loopback as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			/* NOTE: need to match zoneid. Bug in onnv-gate */
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
			    connp->conn_faddr_v6, connp->conn_laddr_v6,
			    connp->conn_ports) &&
			    (tconnp->conn_bound_if == 0 ||
			    tconnp->conn_bound_if == ifindex) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * IPCL_HASH_REMOVE takes its own locks, so the
			 * bucket lock must be dropped around it and then
			 * reacquired for the insertion below.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/* The raw socket may already be bound; unhash it first. */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * On labeled systems, MAC-exempt conflicts for protocols
		 * without port numbers are checked here; the transports
		 * with ports do this check themselves.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/* Insertion flavor depends on how specific the conn is. */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1518 
1519 /*
1520  * v4 packet classifying function. looks up the fanout table to
1521  * find the conn, the packet belongs to. returns the conn with
1522  * the reference held, null otherwise.
1523  *
1524  * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1525  * Lookup" comment block are applied.  Labels are also checked as described
1526  * above.  If the packet is from the inside (looped back), and is from the same
1527  * zone, then label checks are omitted.
1528  */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t	*connp;
	uint16_t  *up;
	zoneid_t	zoneid = ira->ira_zoneid;

	ipha = (ipha_t *)mp->b_rptr;
	/*
	 * up points at the (source, destination) port pair that begins
	 * both the TCP and UDP headers, immediately past the IP header.
	 */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		/* Both ports read as one 32-bit value for hash and match. */
		ports = *(uint32_t *)up;
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			/*
			 * A conn matches if the 5-tuple matches and it is
			 * in our zone, in all zones, or MAC-exempt with a
			 * shared-address exemptable packet.
			 */
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/* No established connection; fall back to listeners. */
		mutex_exit(&connfp->connf_lock);
		/* up[1] is the packet's destination (our local) port. */
		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_local.
		 *
		 * Note that we don't check tsol_receive_local for
		 * the connected case.
		 */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		/* up[0] is the remote port, up[1] our local port. */
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);

		break;

	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		/* IP-in-IP tunnels are classified by address pair only. */
		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
		    &ipha->ipha_dst, ipst));
	}

	return (NULL);
}
1668 
/*
 * v6 counterpart of ipcl_classify_v4(): look up the conn that should
 * receive an inbound IPv6 packet and return it with a reference held,
 * or NULL.  Zone and label matching follow the same rules as v4.
 */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ip6_t		*ip6h;
	connf_t		*connfp, *bind_connfp;
	uint16_t	lport;
	uint16_t	fport;
	tcpha_t		*tcpha;
	uint32_t	ports;
	conn_t		*connp;
	uint16_t	*up;
	zoneid_t	zoneid = ira->ira_zoneid;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
		up = &tcpha->tha_lport;
		/* Both ports read as one 32-bit value for hash and match. */
		ports = *(uint32_t *)up;

		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			/*
			 * A conn matches if the 5-tuple matches and it is
			 * in our zone, in all zones, or MAC-exempt with a
			 * shared-address exemptable packet.
			 */
			if (IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here.  It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/* No established connection; fall back to listeners. */
		mutex_exit(&connfp->connf_lock);

		/* up[1] is the packet's destination (our local) port. */
		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		/* up[0] is the remote port, up[1] our local port. */
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		break;
	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		/* IP-in-IP tunnels are classified by address pair only. */
		return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
		    &ip6h->ip6_dst, ipst));
	}

	return (NULL);
}
1803 
1804 /*
1805  * wrapper around ipcl_classify_(v4,v6) routines.
1806  */
1807 conn_t *
1808 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1809 {
1810 	if (ira->ira_flags & IRAF_IS_IPV4) {
1811 		return (ipcl_classify_v4(mp, ira->ira_protocol,
1812 		    ira->ira_ip_hdr_length, ira, ipst));
1813 	} else {
1814 		return (ipcl_classify_v6(mp, ira->ira_protocol,
1815 		    ira->ira_ip_hdr_length, ira, ipst));
1816 	}
1817 }
1818 
1819 /*
1820  * Only used to classify SCTP RAW sockets
1821  */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
    ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	connf_t		*connfp;
	conn_t		*connp;
	in_port_t	lport;
	int		ipversion;
	const void	*dst;
	zoneid_t	zoneid = ira->ira_zoneid;

	/* Local port is the second half of the combined ports word. */
	lport = ((uint16_t *)&ports)[1];
	if (ira->ira_flags & IRAF_IS_IPV4) {
		dst = (const void *)&ipha->ipha_dst;
		ipversion = IPV4_VERSION;
	} else {
		dst = (const void *)&ip6h->ip6_dst;
		ipversion = IPV6_VERSION;
	}

	/*
	 * First pass: the bucket hashed by the local port.  A socket with
	 * a specific foreign address must match the full connection tuple;
	 * one with a wildcard foreign address matches on local addr/port.
	 */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    ipha->ipha_src, ipha->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ip6h->ip6_src, ip6h->ip6_dst, ports))
					continue;
			}
		} else {
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    ipha->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ip6h->ip6_dst, lport))
					continue;
			}
		}

		/* Same zone / allzones / MAC-exempt rules as elsewhere. */
		if (connp->conn_zoneid == zoneid ||
		    connp->conn_allzones ||
		    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
		    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
		    (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
			break;
	}

	if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
	    !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/*
	 * Second pass: try to look for a wildcard SCTP RAW socket match
	 * in the port-0 bucket.
	 */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IPCL_ZONE_MATCH(connp, zoneid))
			continue;

		if (ipversion == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	ASSERT(connp != NULL);
	/* Reference is taken while the bucket lock is still held. */
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}
1925 
1926 /* ARGSUSED */
static int
tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	/*
	 * kmem cache constructor for TCP conns.  The cache object is an
	 * itc_t with the tcp_t placed immediately after it; cross-link
	 * the two and pre-initialize the state that persists across
	 * alloc/free cycles of the cached object.
	 */
	bzero(connp, sizeof (conn_t));
	bzero(tcp, sizeof (tcp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
	/* Preallocate the TCP timer mblk; can fail under KM_NOSLEEP. */
	tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
	if (tcp->tcp_timercache == NULL)
		return (ENOMEM);
	connp->conn_tcp = tcp;
	connp->conn_flags = IPCL_TCPCONN;
	connp->conn_proto = IPPROTO_TCP;
	tcp->tcp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL) {
		/* Undo the timer allocation; conn_ixa stays NULL. */
		tcp_timermp_free(tcp);
		return (ENOMEM);
	}
	/* Transmit attributes start with a single reference. */
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
1959 
1960 /* ARGSUSED */
static void
tcp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	/*
	 * kmem cache destructor for TCP conns: release everything the
	 * constructor set up.  Sanity-check the conn/tcp cross linkage
	 * first.
	 */
	ASSERT(connp->conn_flags & IPCL_TCPCONN);
	ASSERT(tcp->tcp_connp == connp);
	ASSERT(connp->conn_tcp == tcp);
	tcp_timermp_free(tcp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	cv_destroy(&connp->conn_sq_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		/* Nothing else may still hold the transmit attributes. */
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
1985 
1986 /* ARGSUSED */
static int
ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;

	/*
	 * kmem cache constructor for plain IP conns; no transport
	 * specific structure follows the itc_t for this variety.
	 */
	bzero(connp, sizeof (conn_t));
	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_flags = IPCL_IPCCONN;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	/* Transmit attributes start with a single reference. */
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2006 
2007 /* ARGSUSED */
static void
ip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;

	/* kmem cache destructor for plain IP conns. */
	ASSERT(connp->conn_flags & IPCL_IPCCONN);
	ASSERT(connp->conn_priv == NULL);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		/* Nothing else may still hold the transmit attributes. */
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2028 
2029 /* ARGSUSED */
static int
udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];

	/*
	 * kmem cache constructor for UDP conns.  The udp_t is placed
	 * immediately after the itc_t; cross-link the two.
	 */
	bzero(connp, sizeof (conn_t));
	bzero(udp, sizeof (udp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_udp = udp;
	connp->conn_flags = IPCL_UDPCONN;
	connp->conn_proto = IPPROTO_UDP;
	udp->udp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	/* Transmit attributes start with a single reference. */
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2055 
2056 /* ARGSUSED */
static void
udp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];

	/* kmem cache destructor for UDP conns; verify the cross linkage. */
	ASSERT(connp->conn_flags & IPCL_UDPCONN);
	ASSERT(udp->udp_connp == connp);
	ASSERT(connp->conn_udp == udp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		/* Nothing else may still hold the transmit attributes. */
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2079 
2080 /* ARGSUSED */
static int
rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	/*
	 * kmem cache constructor for raw IP (ICMP) conns.  The icmp_t is
	 * placed immediately after the itc_t; cross-link the two.
	 */
	bzero(connp, sizeof (conn_t));
	bzero(icmp, sizeof (icmp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_icmp = icmp;
	connp->conn_flags = IPCL_RAWIPCONN;
	/* Default protocol; the socket may rebind to another raw protocol. */
	connp->conn_proto = IPPROTO_ICMP;
	icmp->icmp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	/* Transmit attributes start with a single reference. */
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2106 
2107 /* ARGSUSED */
static void
rawip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	/* kmem cache destructor for raw IP conns; verify cross linkage. */
	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
	ASSERT(icmp->icmp_connp == connp);
	ASSERT(connp->conn_icmp == icmp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		/* Nothing else may still hold the transmit attributes. */
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2130 
2131 /* ARGSUSED */
static int
rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t *)&itc[1];

	/*
	 * kmem cache constructor for routing socket conns.  The rts_t is
	 * placed immediately after the itc_t; cross-link the two.
	 */
	bzero(connp, sizeof (conn_t));
	bzero(rts, sizeof (rts_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_rts = rts;
	connp->conn_flags = IPCL_RTSCONN;
	rts->rts_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	/* Transmit attributes start with a single reference. */
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2155 
2156 /* ARGSUSED */
static void
rts_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t *)&itc[1];

	/* kmem cache destructor for routing socket conns. */
	ASSERT(connp->conn_flags & IPCL_RTSCONN);
	ASSERT(rts->rts_connp == connp);
	ASSERT(connp->conn_rts == rts);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		/* Nothing else may still hold the transmit attributes. */
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2179 
2180 /*
2181  * Called as part of ipcl_conn_destroy to assert and clear any pointers
2182  * in the conn_t.
2183  *
2184  * Below we list all the pointers in the conn_t as a documentation aid.
2185  * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2186  * If you add any pointers to the conn_t please add an ASSERT here
2187  * and #ifdef it out if it can't be actually asserted to be NULL.
2188  * In any case, we bzero most of the conn_t at the end of the function.
2189  */
void
ipcl_conn_cleanup(conn_t *connp)
{
	ip_xmit_attr_t	*ixa;

	/* Latched IPsec state must already have been released. */
	ASSERT(connp->conn_latch == NULL);
	ASSERT(connp->conn_latch_in_policy == NULL);
	ASSERT(connp->conn_latch_in_action == NULL);
#ifdef notdef
	ASSERT(connp->conn_rq == NULL);
	ASSERT(connp->conn_wq == NULL);
#endif
	ASSERT(connp->conn_cred == NULL);
	/* Global-list and fanout linkage must already be torn down. */
	ASSERT(connp->conn_g_fanout == NULL);
	ASSERT(connp->conn_g_next == NULL);
	ASSERT(connp->conn_g_prev == NULL);
	ASSERT(connp->conn_policy == NULL);
	ASSERT(connp->conn_fanout == NULL);
	ASSERT(connp->conn_next == NULL);
	ASSERT(connp->conn_prev == NULL);
	ASSERT(connp->conn_oper_pending_ill == NULL);
	ASSERT(connp->conn_ilg == NULL);
	ASSERT(connp->conn_drain_next == NULL);
	ASSERT(connp->conn_drain_prev == NULL);
#ifdef notdef
	/* conn_idl is not cleared when removed from idl list */
	ASSERT(connp->conn_idl == NULL);
#endif
	ASSERT(connp->conn_ipsec_opt_mp == NULL);
#ifdef notdef
	/* conn_netstack is cleared by the caller; needed by ixa_cleanup */
	ASSERT(connp->conn_netstack == NULL);
#endif

	ASSERT(connp->conn_helper_info == NULL);
	ASSERT(connp->conn_ixa != NULL);
	ixa = connp->conn_ixa;
	ASSERT(ixa->ixa_refcnt == 1);
	/* Need to preserve ixa_protocol */
	ixa_cleanup(ixa);
	ixa->ixa_flags = 0;

	/*
	 * Clear out the conn_t fields that are not preserved: everything
	 * from conn_start_clr to the end of the structure.  Fields that
	 * live before conn_start_clr survive across cache reuse.
	 */
	bzero(&connp->conn_start_clr,
	    sizeof (conn_t) -
	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
}
2237 
2238 /*
2239  * All conns are inserted in a global multi-list for the benefit of
2240  * walkers. The walk is guaranteed to walk all open conns at the time
2241  * of the start of the walk exactly once. This property is needed to
2242  * achieve some cleanups during unplumb of interfaces. This is achieved
2243  * as follows.
2244  *
2245  * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2246  * call the insert and delete functions below at creation and deletion
2247  * time respectively. The conn never moves or changes its position in this
2248  * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2249  * won't increase due to walkers, once the conn deletion has started. Note
2250  * that we can't remove the conn from the global list and then wait for
2251  * the refcnt to drop to zero, since walkers would then see a truncated
2252  * list. CONN_INCIPIENT ensures that walkers don't start looking at
2253  * conns until ip_open is ready to make them globally visible.
2254  * The global round robin multi-list locks are held only to get the
2255  * next member/insertion/deletion and contention should be negligible
2256  * if the multi-list is much greater than the number of cpus.
2257  */
2258 void
2259 ipcl_globalhash_insert(conn_t *connp)
2260 {
2261 	int	index;
2262 	struct connf_s	*connfp;
2263 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
2264 
2265 	/*
2266 	 * No need for atomic here. Approximate even distribution
2267 	 * in the global lists is sufficient.
2268 	 */
2269 	ipst->ips_conn_g_index++;
2270 	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2271 
2272 	connp->conn_g_prev = NULL;
2273 	/*
2274 	 * Mark as INCIPIENT, so that walkers will ignore this
2275 	 * for now, till ip_open is ready to make it visible globally.
2276 	 */
2277 	connp->conn_state_flags |= CONN_INCIPIENT;
2278 
2279 	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2280 	/* Insert at the head of the list */
2281 	mutex_enter(&connfp->connf_lock);
2282 	connp->conn_g_next = connfp->connf_head;
2283 	if (connp->conn_g_next != NULL)
2284 		connp->conn_g_next->conn_g_prev = connp;
2285 	connfp->connf_head = connp;
2286 
2287 	/* The fanout bucket this conn points to */
2288 	connp->conn_g_fanout = connfp;
2289 
2290 	mutex_exit(&connfp->connf_lock);
2291 }
2292 
2293 void
2294 ipcl_globalhash_remove(conn_t *connp)
2295 {
2296 	struct connf_s	*connfp;
2297 
2298 	/*
2299 	 * We were never inserted in the global multi list.
2300 	 * IPCL_NONE variety is never inserted in the global multilist
2301 	 * since it is presumed to not need any cleanup and is transient.
2302 	 */
2303 	if (connp->conn_g_fanout == NULL)
2304 		return;
2305 
2306 	connfp = connp->conn_g_fanout;
2307 	mutex_enter(&connfp->connf_lock);
2308 	if (connp->conn_g_prev != NULL)
2309 		connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2310 	else
2311 		connfp->connf_head = connp->conn_g_next;
2312 	if (connp->conn_g_next != NULL)
2313 		connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2314 	mutex_exit(&connfp->connf_lock);
2315 
2316 	/* Better to stumble on a null pointer than to corrupt memory */
2317 	connp->conn_g_next = NULL;
2318 	connp->conn_g_prev = NULL;
2319 	connp->conn_g_fanout = NULL;
2320 }
2321 
2322 /*
 * Walk the list of all conn_t's in the system, calling the function
 * provided with the specified argument for each.
2325  * Applies to both IPv4 and IPv6.
2326  *
2327  * CONNs may hold pointers to ills (conn_dhcpinit_ill and
2328  * conn_oper_pending_ill). To guard against stale pointers
2329  * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2330  * unplumbed or removed. New conn_t's that are created while we are walking
2331  * may be missed by this walk, because they are not necessarily inserted
2332  * at the tail of the list. They are new conn_t's and thus don't have any
2333  * stale pointers. The CONN_CLOSING flag ensures that no new reference
2334  * is created to the struct that is going away.
2335  */
void
ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
	int	i;
	conn_t	*connp;
	conn_t	*prev_connp;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		prev_connp = NULL;
		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
		while (connp != NULL) {
			mutex_enter(&connp->conn_lock);
			/* Skip conns being torn down or not yet visible. */
			if (connp->conn_state_flags &
			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
				mutex_exit(&connp->conn_lock);
				connp = connp->conn_g_next;
				continue;
			}
			/*
			 * Hold a reference so the conn stays valid while
			 * we drop the bucket lock to run the callback
			 * without holding any list locks.
			 */
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&connp->conn_lock);
			mutex_exit(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			(*func)(connp, arg);
			/* The previous conn's reference is no longer needed. */
			if (prev_connp != NULL)
				CONN_DEC_REF(prev_connp);
			mutex_enter(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			/*
			 * Keep the reference on connp until we have
			 * advanced past it, so conn_g_next stays safe
			 * to read under the retaken bucket lock.
			 */
			prev_connp = connp;
			connp = connp->conn_g_next;
		}
		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		/* Release the last conn visited in this bucket. */
		if (prev_connp != NULL)
			CONN_DEC_REF(prev_connp);
	}
}
2372 
2373 /*
2374  * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2375  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2376  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2377  * (peer tcp in ESTABLISHED state).
2378  */
2379 conn_t *
2380 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
2381     ip_stack_t *ipst)
2382 {
2383 	uint32_t ports;
2384 	uint16_t *pports = (uint16_t *)&ports;
2385 	connf_t	*connfp;
2386 	conn_t	*tconnp;
2387 	boolean_t zone_chk;
2388 
2389 	/*
2390 	 * If either the source of destination address is loopback, then
2391 	 * both endpoints must be in the same Zone.  Otherwise, both of
2392 	 * the addresses are system-wide unique (tcp is in ESTABLISHED
2393 	 * state) and the endpoints may reside in different Zones.
2394 	 */
2395 	zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2396 	    ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2397 
2398 	pports[0] = tcpha->tha_fport;
2399 	pports[1] = tcpha->tha_lport;
2400 
2401 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2402 	    ports, ipst)];
2403 
2404 	mutex_enter(&connfp->connf_lock);
2405 	for (tconnp = connfp->connf_head; tconnp != NULL;
2406 	    tconnp = tconnp->conn_next) {
2407 
2408 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2409 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2410 		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2411 		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2412 
2413 			ASSERT(tconnp != connp);
2414 			CONN_INC_REF(tconnp);
2415 			mutex_exit(&connfp->connf_lock);
2416 			return (tconnp);
2417 		}
2418 	}
2419 	mutex_exit(&connfp->connf_lock);
2420 	return (NULL);
2421 }
2422 
2423 /*
2424  * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2425  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2426  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2427  * (peer tcp in ESTABLISHED state).
2428  */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t ports;
	uint16_t *pports = (uint16_t *)&ports;
	connf_t	*connfp;
	conn_t	*tconnp;
	boolean_t zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone.  Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones.  We
	 * don't do Zone check for link local address(es) because the
	 * current Zone implementation treats each link local address as
	 * being unique per system node, i.e. they belong to global Zone.
	 */
	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));

	/* Reversed port pair: the peer's local port is our foreign port. */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* We skip conn_bound_if check here as this is loopback tcp */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			/* The peer can never be the conn we started from. */
			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2476 
2477 /*
2478  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2479  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2480  * Only checks for connected entries i.e. no INADDR_ANY checks.
2481  */
2482 conn_t *
2483 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2484     ip_stack_t *ipst)
2485 {
2486 	uint32_t ports;
2487 	uint16_t *pports;
2488 	connf_t	*connfp;
2489 	conn_t	*tconnp;
2490 
2491 	pports = (uint16_t *)&ports;
2492 	pports[0] = tcpha->tha_fport;
2493 	pports[1] = tcpha->tha_lport;
2494 
2495 	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2496 	    ports, ipst)];
2497 
2498 	mutex_enter(&connfp->connf_lock);
2499 	for (tconnp = connfp->connf_head; tconnp != NULL;
2500 	    tconnp = tconnp->conn_next) {
2501 
2502 		if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2503 		    ipha->ipha_dst, ipha->ipha_src, ports) &&
2504 		    tconnp->conn_tcp->tcp_state >= min_state) {
2505 
2506 			CONN_INC_REF(tconnp);
2507 			mutex_exit(&connfp->connf_lock);
2508 			return (tconnp);
2509 		}
2510 	}
2511 	mutex_exit(&connfp->connf_lock);
2512 	return (NULL);
2513 }
2514 
2515 /*
2516  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2517  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2518  * Only checks for connected entries i.e. no INADDR_ANY checks.
2519  * Match on ifindex in addition to addresses.
2520  */
conn_t *
ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
    uint_t ifindex, ip_stack_t *ipst)
{
	tcp_t	*tcp;
	uint32_t ports;
	uint16_t *pports;
	connf_t	*connfp;
	conn_t	*tconnp;

	/* Reversed port pair: the peer's local port is our foreign port. */
	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		tcp = tconnp->conn_tcp;
		/*
		 * Exact reversed 5-tuple match, at least min_state, and
		 * if the conn is bound to an interface it must match the
		 * given ifindex.
		 */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tcp->tcp_state >= min_state &&
		    (tconnp->conn_bound_if == 0 ||
		    tconnp->conn_bound_if == ifindex)) {

			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2557 
2558 /*
2559  * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2560  * a listener when changing state.
2561  */
2562 conn_t *
2563 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2564     ip_stack_t *ipst)
2565 {
2566 	connf_t		*bind_connfp;
2567 	conn_t		*connp;
2568 	tcp_t		*tcp;
2569 
2570 	/*
2571 	 * Avoid false matches for packets sent to an IP destination of
2572 	 * all zeros.
2573 	 */
2574 	if (laddr == 0)
2575 		return (NULL);
2576 
2577 	ASSERT(zoneid != ALL_ZONES);
2578 
2579 	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2580 	mutex_enter(&bind_connfp->connf_lock);
2581 	for (connp = bind_connfp->connf_head; connp != NULL;
2582 	    connp = connp->conn_next) {
2583 		tcp = connp->conn_tcp;
2584 		if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2585 		    IPCL_ZONE_MATCH(connp, zoneid) &&
2586 		    (tcp->tcp_listener == NULL)) {
2587 			CONN_INC_REF(connp);
2588 			mutex_exit(&bind_connfp->connf_lock);
2589 			return (connp);
2590 		}
2591 	}
2592 	mutex_exit(&bind_connfp->connf_lock);
2593 	return (NULL);
2594 }
2595 
2596 /*
2597  * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2598  * a listener when changing state.
2599  */
conn_t *
ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
    zoneid_t zoneid, ip_stack_t *ipst)
{
	connf_t		*bind_connfp;
	conn_t		*connp = NULL;
	tcp_t		*tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		/*
		 * Bound-address/port and zone must match; if the conn is
		 * bound to an interface it must match the given ifindex;
		 * and the tcp must itself be a listener, not an eager
		 * (tcp_listener == NULL).
		 */
		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (connp->conn_bound_if == 0 ||
		    connp->conn_bound_if == ifindex) &&
		    tcp->tcp_listener == NULL) {
			/* Reference for the caller (CONN_DEC_REF later). */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}
2635 
2636 /*
2637  * ipcl_get_next_conn
2638  *	get the next entry in the conn global list
2639  *	and put a reference on the next_conn.
2640  *	decrement the reference on the current conn.
2641  *
2642  * This is an iterator based walker function that also provides for
2643  * some selection by the caller. It walks through the conn_hash bucket
2644  * searching for the next valid connp in the list, and selects connections
2645  * that are neither closed nor condemned. It also REFHOLDS the conn
2646  * thus ensuring that the conn exists when the caller uses the conn.
2647  */
conn_t *
ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
{
	conn_t	*next_connp;

	if (connfp == NULL)
		return (NULL);

	mutex_enter(&connfp->connf_lock);

	/* NULL connp means start from the head of this bucket. */
	next_connp = (connp == NULL) ?
	    connfp->connf_head : connp->conn_g_next;

	while (next_connp != NULL) {
		mutex_enter(&next_connp->conn_lock);
		if (!(next_connp->conn_flags & conn_flags) ||
		    (next_connp->conn_state_flags &
		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
			/*
			 * This conn has been condemned or
			 * is closing, or the flags don't match
			 */
			mutex_exit(&next_connp->conn_lock);
			next_connp = next_connp->conn_g_next;
			continue;
		}
		/* Hold the conn for the caller before dropping its lock. */
		CONN_INC_REF_LOCKED(next_connp);
		mutex_exit(&next_connp->conn_lock);
		break;
	}

	mutex_exit(&connfp->connf_lock);

	/* Drop the reference the previous iteration step was holding. */
	if (connp != NULL)
		CONN_DEC_REF(connp);

	return (next_connp);
}
2686 
2687 #ifdef CONN_DEBUG
2688 /*
2689  * Trace of the last NBUF refhold/refrele
2690  */
2691 int
2692 conn_trace_ref(conn_t *connp)
2693 {
2694 	int	last;
2695 	conn_trace_t	*ctb;
2696 
2697 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2698 	last = connp->conn_trace_last;
2699 	last++;
2700 	if (last == CONN_TRACE_MAX)
2701 		last = 0;
2702 
2703 	ctb = &connp->conn_trace_buf[last];
2704 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2705 	connp->conn_trace_last = last;
2706 	return (1);
2707 }
2708 
2709 int
2710 conn_untrace_ref(conn_t *connp)
2711 {
2712 	int	last;
2713 	conn_trace_t	*ctb;
2714 
2715 	ASSERT(MUTEX_HELD(&connp->conn_lock));
2716 	last = connp->conn_trace_last;
2717 	last++;
2718 	if (last == CONN_TRACE_MAX)
2719 		last = 0;
2720 
2721 	ctb = &connp->conn_trace_buf[last];
2722 	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2723 	connp->conn_trace_last = last;
2724 	return (1);
2725 }
2726 #endif
2727 
/*
 * Fill in *sie with the identity (inode, dev) of the socket vnode
 * underlying this conn, for MIB reporting.  Returns sie on success,
 * NULL if the conn is closing or no vnode/attributes are available.
 */
mib2_socketInfoEntry_t *
conn_get_socket_info(conn_t *connp, mib2_socketInfoEntry_t *sie)
{
	vnode_t *vn = NULL;
	vattr_t attr;
	uint64_t flags = 0;

	/*
	 * If the connection is closing, it is not safe to make an upcall or
	 * access the stream associated with the connection.
	 * The callers of this function have a reference on connp itself
	 * so, as long as it is not closing, it's safe to continue.
	 */
	mutex_enter(&connp->conn_lock);

	if ((connp->conn_state_flags & CONN_CLOSING)) {
		mutex_exit(&connp->conn_lock);
		return (NULL);
	}

	mutex_exit(&connp->conn_lock);

	/*
	 * NOTE(review): conn_upper_handle and conn_rq are read after
	 * conn_lock has been dropped — presumably stable while the conn
	 * is not closing; confirm against the close path.
	 */
	if (connp->conn_upper_handle != NULL) {
		/* Non-STREAMS socket: ask the upper layer for its vnode. */
		vn = (*connp->conn_upcalls->su_get_vnode)
		    (connp->conn_upper_handle);
	} else if (!IPCL_IS_NONSTR(connp) && connp->conn_rq != NULL) {
		/* STREAMS socket: use the stream head's vnode, if any. */
		vn = STREAM(connp->conn_rq)->sd_pvnode;
		if (vn != NULL)
			VN_HOLD(vn);
		flags |= MIB2_SOCKINFO_STREAM;
	}

	if (vn == NULL || VOP_GETATTR(vn, &attr, 0, CRED(), NULL) != 0) {
		if (vn != NULL)
			VN_RELE(vn);
		return (NULL);
	}

	VN_RELE(vn);

	bzero(sie, sizeof (*sie));

	sie->sie_flags = flags;
	sie->sie_inode = attr.va_nodeid;
	sie->sie_dev = attr.va_rdev;

	return (sie);
}
2776