xref: /illumos-gate/usr/src/uts/common/fs/sockfs/socktpi.c (revision 5626beece2e5dedec7197ecf325cfaa1854a6c2e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
25  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/t_lock.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/buf.h>
33 #include <sys/conf.h>
34 #include <sys/cred.h>
35 #include <sys/kmem.h>
36 #include <sys/kmem_impl.h>
37 #include <sys/sysmacros.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/debug.h>
41 #include <sys/errno.h>
42 #include <sys/time.h>
43 #include <sys/file.h>
44 #include <sys/open.h>
45 #include <sys/user.h>
46 #include <sys/termios.h>
47 #include <sys/stream.h>
48 #include <sys/strsubr.h>
49 #include <sys/strsun.h>
50 #include <sys/suntpi.h>
51 #include <sys/ddi.h>
52 #include <sys/esunddi.h>
53 #include <sys/flock.h>
54 #include <sys/modctl.h>
55 #include <sys/vtrace.h>
56 #include <sys/cmn_err.h>
57 #include <sys/pathname.h>
58 
59 #include <sys/socket.h>
60 #include <sys/socketvar.h>
61 #include <sys/sockio.h>
62 #include <netinet/in.h>
63 #include <sys/un.h>
64 #include <sys/strsun.h>
65 
66 #include <sys/tiuser.h>
67 #define	_SUN_TPI_VERSION	2
68 #include <sys/tihdr.h>
69 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
70 
71 #include <c2/audit.h>
72 
73 #include <inet/common.h>
74 #include <inet/ip.h>
75 #include <inet/ip6.h>
76 #include <inet/tcp.h>
77 #include <inet/udp_impl.h>
78 
79 #include <sys/zone.h>
80 
81 #include <fs/sockfs/nl7c.h>
82 #include <fs/sockfs/nl7curi.h>
83 
84 #include <fs/sockfs/sockcommon.h>
85 #include <fs/sockfs/socktpi.h>
86 #include <fs/sockfs/socktpi_impl.h>
87 
88 /*
89  * Possible failures when memory can't be allocated. The documented behavior:
90  *
91  * 		5.5:			4.X:		XNET:
92  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
93  *							EINTR
94  *	(4.X does not document EINTR but returns it)
95  * bind:	ENOSR			-		ENOBUFS/ENOSR
96  * connect: 	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
97  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
98  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
99  *	(4.X getpeername and getsockname do not fail in practice)
100  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
101  * listen:	-			-		ENOBUFS
102  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
103  *							EINTR
104  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
105  *							EINTR
106  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
107  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
108  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
109  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
110  *
111  * Resolution. When allocation fails:
112  *	recv: return EINTR
113  *	send: return EINTR
114  *	connect, accept: EINTR
115  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
116  *	socket, socketpair: ENOBUFS
117  *	getpeername, getsockname: sleep
118  *	getsockopt, setsockopt: sleep
119  */
120 
#ifdef SOCK_TEST
/*
 * Variables that make sockfs do something other than the standard TPI
 * for the AF_INET transports.
 *
 * These are real, patchable variables only when sockfs is built with
 * SOCK_TEST; otherwise they are compile-time constants (see the #else
 * branch below) carrying the same default values.
 *
 * solisten_tpi_tcp:
 *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
 *	the transport is already bound. This is needed to avoid losing the
 *	port number should listen() do a T_UNBIND_REQ followed by a
 *	O_T_BIND_REQ.
 *
 * soconnect_tpi_udp:
 *	UDP and ICMP can handle a T_CONN_REQ.
 *	This is needed to make the sequence of connect(), getsockname()
 *	return the local IP address used to send packets to the connected to
 *	destination.
 *
 * soconnect_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
 *	Set this to non-zero to send TPI conformant messages to TCP in this
 *	respect. This is a performance optimization.
 *
 * soaccept_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without the acceptor being bound.
 *	This is a performance optimization that has been picked up in XTI.
 *
 * soaccept_tpi_multioptions:
 *	When inheriting SOL_SOCKET options from the listener to the accepting
 *	socket send them as a single message for AF_INET{,6}.
 */
int solisten_tpi_tcp = 0;
int soconnect_tpi_udp = 0;
int soconnect_tpi_tcp = 0;
int soaccept_tpi_tcp = 0;
int soaccept_tpi_multioptions = 1;
#else /* SOCK_TEST */
/*
 * Non-test build: the knobs collapse to constants equal to the defaults
 * above, so code that tests them can be resolved at compile time.
 */
#define	soconnect_tpi_tcp	0
#define	soconnect_tpi_udp	0
#define	solisten_tpi_tcp	0
#define	soaccept_tpi_tcp	0
#define	soaccept_tpi_multioptions	1
#endif /* SOCK_TEST */
163 
#ifdef SOCK_TEST
/*
 * Test-only hooks, declared only when built with SOCK_TEST and defined
 * outside this file.
 */
extern int do_useracc;
extern clock_t sock_test_timelimit;
#endif /* SOCK_TEST */

/*
 * Defined outside this file.
 * NOTE(review): presumably the size in bytes of a ucred -- confirm at the
 * definition.
 */
extern uint32_t ucredsize;
170 
/*
 * Some X/Open added checks might have to be backed out to keep SunOS 4.X
 * applications working. Turn on this flag to disable these checks.
 *
 * xnet_skip_checks:
 *	When non-zero the X/Open state check in the bind path (EINVAL for a
 *	socket that has SS_CANTSENDMORE set) is skipped.
 *
 * xnet_check_print:
 *	When non-zero a message is printed whenever that check causes
 *	EINVAL to be returned.
 *
 * xnet_truncate_print:
 *	Not referenced in this part of the file.
 *	NOTE(review): presumably enables a diagnostic about truncation --
 *	confirm at its users.
 */
int xnet_skip_checks = 0;
int xnet_check_print = 0;
int xnet_truncate_print = 0;
178 
/*
 * Forward declarations: allocation and teardown of a TPI sonode, and the
 * helpers that manage the sotpi_info_t associated with a sonode.
 */
static void sotpi_destroy(struct sonode *);
static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
    int, int *, cred_t *cr);

static boolean_t	sotpi_info_create(struct sonode *, int);
static void		sotpi_info_init(struct sonode *);
static void 		sotpi_info_fini(struct sonode *);
static void 		sotpi_info_destroy(struct sonode *);
187 
/*
 * Do direct function call to the transport layer below; this would
 * also allow the transport to utilize read-side synchronous stream
 * interface if necessary.  This is a /etc/system tunable that must
 * not be modified on a running system.  By default this is enabled
 * for performance reasons and may be disabled for debugging purposes.
 *
 * sotpi_init() consults this flag when deciding whether a socket keeps
 * its sti_direct setting.
 */
boolean_t socktpi_direct = B_TRUE;

/*
 * kmem caches backing TPI sonodes. sotpi_create() allocates AF_UNIX
 * sockets from socktpi_unix_cache and all other families from
 * socktpi_cache; sotpi_destroy() frees a non-fallback sonode back to the
 * cache selected by the same rule.
 */
static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;

/* Signal helpers defined outside this file. */
extern	void sigintr(k_sigset_t *, int);
extern	void sigunintr(k_sigset_t *);

/* Forward declaration; called from the rebind path of sotpi_bindlisten(). */
static int	sotpi_unbind(struct sonode *, int);
203 
/*
 * TPI sockfs sonode operations
 *
 * Prototypes for the entry points installed in sotpi_sonodeops below,
 * together with helpers (sosend_dgramcmsg, sodgram_direct,
 * socktpi_plumbioctl) that are not themselves part of the ops vector.
 * Routines declared without "static" (sotpi_init) or with "extern"
 * have external linkage; the rest are private to this file.
 */
int 		sotpi_init(struct sonode *, struct sonode *, struct cred *,
		    int);
static int	sotpi_accept(struct sonode *, int, struct cred *,
		    struct sonode **);
static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
		    int, struct cred *);
static int	sotpi_listen(struct sonode *, int, struct cred *);
static int	sotpi_connect(struct sonode *, struct sockaddr *,
		    socklen_t, int, int, struct cred *);
extern int	sotpi_recvmsg(struct sonode *, struct nmsghdr *,
		    struct uio *, struct cred *);
static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
		    struct uio *, struct cred *);
static int	sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
		    struct cred *, mblk_t **);
static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
		    struct uio *, void *, t_uscalar_t, int);
static int	sodgram_direct(struct sonode *, struct sockaddr *,
		    socklen_t, struct uio *, int);
extern int	sotpi_getpeername(struct sonode *, struct sockaddr *,
		    socklen_t *, boolean_t, struct cred *);
static int	sotpi_getsockname(struct sonode *, struct sockaddr *,
		    socklen_t *, struct cred *);
static int	sotpi_shutdown(struct sonode *, int, struct cred *);
extern int	sotpi_getsockopt(struct sonode *, int, int, void *,
		    socklen_t *, int, struct cred *);
extern int	sotpi_setsockopt(struct sonode *, int, int, const void *,
		    socklen_t, struct cred *);
static int 	sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
		    int32_t *);
static int 	socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
		    struct cred *, int32_t *);
static int 	sotpi_poll(struct sonode *, short, int, short *,
		    struct pollhead **);
static int 	sotpi_close(struct sonode *, int, struct cred *);

/* Constructor/destructor pair for sotpi_info_t objects. */
static int	i_sotpi_info_constructor(sotpi_info_t *);
static void 	i_sotpi_info_destructor(sotpi_info_t *);
243 
/*
 * Operations vector for TPI-based sockets. sotpi_create() hands this to
 * sonode_init() for every sonode it allocates, and sotpi_destroy() asserts
 * that a sonode's so_ops still points here before tearing it down.
 *
 * The initializer is positional: entries must stay in the order of the
 * sonodeops_t members named in the trailing comments.
 */
sonodeops_t sotpi_sonodeops = {
	sotpi_init,		/* sop_init		*/
	sotpi_accept,		/* sop_accept		*/
	sotpi_bind,		/* sop_bind		*/
	sotpi_listen,		/* sop_listen		*/
	sotpi_connect,		/* sop_connect		*/
	sotpi_recvmsg,		/* sop_recvmsg		*/
	sotpi_sendmsg,		/* sop_sendmsg		*/
	sotpi_sendmblk,		/* sop_sendmblk		*/
	sotpi_getpeername,	/* sop_getpeername	*/
	sotpi_getsockname,	/* sop_getsockname	*/
	sotpi_shutdown,		/* sop_shutdown		*/
	sotpi_getsockopt,	/* sop_getsockopt	*/
	sotpi_setsockopt,	/* sop_setsockopt	*/
	sotpi_ioctl,		/* sop_ioctl		*/
	sotpi_poll,		/* sop_poll		*/
	sotpi_close,		/* sop_close		*/
};
262 
263 /*
264  * Return a TPI socket vnode.
265  *
266  * Note that sockets assume that the driver will clone (either itself
267  * or by using the clone driver) i.e. a socket() call will always
268  * result in a new vnode being created.
269  */
270 
271 /*
272  * Common create code for socket and accept. If tso is set, the values
273  * from that node are used instead of issuing a T_INFO_REQ.
274  */
275 
276 /* ARGSUSED */
277 static struct sonode *
278 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
279     int version, int sflags, int *errorp, cred_t *cr)
280 {
281 	struct sonode	*so;
282 	kmem_cache_t 	*cp;
283 	int		sfamily = family;
284 
285 	ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
286 
287 	if (family == AF_NCA) {
288 		/*
289 		 * The request is for an NCA socket so for NL7C use the
290 		 * INET domain instead and mark NL7C_AF_NCA below.
291 		 */
292 		family = AF_INET;
293 		/*
294 		 * NL7C is not supported in the non-global zone,
295 		 * we enforce this restriction here.
296 		 */
297 		if (getzoneid() != GLOBAL_ZONEID) {
298 			*errorp = ENOTSUP;
299 			return (NULL);
300 		}
301 	}
302 
303 	/*
304 	 * to be compatible with old tpi socket implementation ignore
305 	 * sleep flag (sflags) passed in
306 	 */
307 	cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
308 	so = kmem_cache_alloc(cp, KM_SLEEP);
309 	if (so == NULL) {
310 		*errorp = ENOMEM;
311 		return (NULL);
312 	}
313 
314 	sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
315 	sotpi_info_init(so);
316 
317 	if (sfamily == AF_NCA) {
318 		SOTOTPI(so)->sti_nl7c_flags = NL7C_AF_NCA;
319 	}
320 
321 	if (version == SOV_DEFAULT)
322 		version = so_default_version;
323 
324 	so->so_version = (short)version;
325 	*errorp = 0;
326 
327 	return (so);
328 }
329 
330 static void
331 sotpi_destroy(struct sonode *so)
332 {
333 	kmem_cache_t *cp;
334 	struct sockparams *origsp;
335 
336 	/*
337 	 * If there is a new dealloc function (ie. smod_destroy_func),
338 	 * then it should check the correctness of the ops.
339 	 */
340 
341 	ASSERT(so->so_ops == &sotpi_sonodeops);
342 
343 	origsp = SOTOTPI(so)->sti_orig_sp;
344 
345 	sotpi_info_fini(so);
346 
347 	if (so->so_state & SS_FALLBACK_COMP) {
348 		/*
349 		 * A fallback happend, which means that a sotpi_info_t struct
350 		 * was allocated (as opposed to being allocated from the TPI
351 		 * sonode cache. Therefore we explicitly free the struct
352 		 * here.
353 		 */
354 		sotpi_info_destroy(so);
355 		ASSERT(origsp != NULL);
356 
357 		origsp->sp_smod_info->smod_sock_destroy_func(so);
358 		SOCKPARAMS_DEC_REF(origsp);
359 	} else {
360 		sonode_fini(so);
361 		cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
362 		    socktpi_cache;
363 		kmem_cache_free(cp, so);
364 	}
365 }
366 
367 /* ARGSUSED1 */
368 int
369 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
370 {
371 	major_t maj;
372 	dev_t newdev;
373 	struct vnode *vp;
374 	int error = 0;
375 	struct stdata *stp;
376 
377 	sotpi_info_t *sti = SOTOTPI(so);
378 
379 	dprint(1, ("sotpi_init()\n"));
380 
381 	/*
382 	 * over write the sleep flag passed in but that is ok
383 	 * as tpi socket does not honor sleep flag.
384 	 */
385 	flags |= FREAD|FWRITE;
386 
387 	/*
388 	 * Record in so_flag that it is a clone.
389 	 */
390 	if (getmajor(sti->sti_dev) == clone_major)
391 		so->so_flag |= SOCLONE;
392 
393 	if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
394 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
395 	    (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
396 	    so->so_protocol == IPPROTO_IP)) {
397 		/* Tell tcp or udp that it's talking to sockets */
398 		flags |= SO_SOCKSTR;
399 
400 		/*
401 		 * Here we indicate to socktpi_open() our attempt to
402 		 * make direct calls between sockfs and transport.
403 		 * The final decision is left to socktpi_open().
404 		 */
405 		sti->sti_direct = 1;
406 
407 		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
408 		if (so->so_type == SOCK_STREAM && tso != NULL) {
409 			if (SOTOTPI(tso)->sti_direct) {
410 				/*
411 				 * Inherit sti_direct from listener and pass
412 				 * SO_ACCEPTOR open flag to tcp, indicating
413 				 * that this is an accept fast-path instance.
414 				 */
415 				flags |= SO_ACCEPTOR;
416 			} else {
417 				/*
418 				 * sti_direct is not set on listener, meaning
419 				 * that the listener has been converted from
420 				 * a socket to a stream.  Ensure that the
421 				 * acceptor inherits these settings.
422 				 */
423 				sti->sti_direct = 0;
424 				flags &= ~SO_SOCKSTR;
425 			}
426 		}
427 	}
428 
429 	/*
430 	 * Tell local transport that it is talking to sockets.
431 	 */
432 	if (so->so_family == AF_UNIX) {
433 		flags |= SO_SOCKSTR;
434 	}
435 
436 	vp = SOTOV(so);
437 	newdev = vp->v_rdev;
438 	maj = getmajor(newdev);
439 	ASSERT(STREAMSTAB(maj));
440 
441 	error = stropen(vp, &newdev, flags, cr);
442 
443 	stp = vp->v_stream;
444 	if (error == 0) {
445 		if (so->so_flag & SOCLONE)
446 			ASSERT(newdev != vp->v_rdev);
447 		mutex_enter(&so->so_lock);
448 		sti->sti_dev = newdev;
449 		vp->v_rdev = newdev;
450 		mutex_exit(&so->so_lock);
451 
452 		if (stp->sd_flag & STRISTTY) {
453 			/*
454 			 * this is a post SVR4 tty driver - a socket can not
455 			 * be a controlling terminal. Fail the open.
456 			 */
457 			(void) sotpi_close(so, flags, cr);
458 			return (ENOTTY);	/* XXX */
459 		}
460 
461 		ASSERT(stp->sd_wrq != NULL);
462 		sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
463 
464 		/*
465 		 * If caller is interested in doing direct function call
466 		 * interface to/from transport module, probe the module
467 		 * directly beneath the streamhead to see if it qualifies.
468 		 *
469 		 * We turn off the direct interface when qualifications fail.
470 		 * In the acceptor case, we simply turn off the sti_direct
471 		 * flag on the socket. We do the fallback after the accept
472 		 * has completed, before the new socket is returned to the
473 		 * application.
474 		 */
475 		if (sti->sti_direct) {
476 			queue_t *tq = stp->sd_wrq->q_next;
477 
478 			/*
479 			 * sti_direct is currently supported and tested
480 			 * only for tcp/udp; this is the main reason to
481 			 * have the following assertions.
482 			 */
483 			ASSERT(so->so_family == AF_INET ||
484 			    so->so_family == AF_INET6);
485 			ASSERT(so->so_protocol == IPPROTO_UDP ||
486 			    so->so_protocol == IPPROTO_TCP ||
487 			    so->so_protocol == IPPROTO_IP);
488 			ASSERT(so->so_type == SOCK_DGRAM ||
489 			    so->so_type == SOCK_STREAM);
490 
491 			/*
492 			 * Abort direct call interface if the module directly
493 			 * underneath the stream head is not defined with the
494 			 * _D_DIRECT flag.  This could happen in the tcp or
495 			 * udp case, when some other module is autopushed
496 			 * above it, or for some reasons the expected module
497 			 * isn't purely D_MP (which is the main requirement).
498 			 */
499 			if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
500 			    !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
501 				int rval;
502 
503 				/* Continue on without direct calls */
504 				sti->sti_direct = 0;
505 
506 				/*
507 				 * Cannot issue ioctl on fallback socket since
508 				 * there is no conn associated with the queue.
509 				 * The fallback downcall will notify the proto
510 				 * of the change.
511 				 */
512 				if (!(flags & SO_ACCEPTOR) &&
513 				    !(flags & SO_FALLBACK)) {
514 					if ((error = strioctl(vp,
515 					    _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
516 					    cr, &rval)) != 0) {
517 						(void) sotpi_close(so, flags,
518 						    cr);
519 						return (error);
520 					}
521 				}
522 			}
523 		}
524 
525 		if (flags & SO_FALLBACK) {
526 			/*
527 			 * The stream created does not have a conn.
528 			 * do stream set up after conn has been assigned
529 			 */
530 			return (error);
531 		}
532 		if (error = so_strinit(so, tso)) {
533 			(void) sotpi_close(so, flags, cr);
534 			return (error);
535 		}
536 
537 		/* Wildcard */
538 		if (so->so_protocol != so->so_sockparams->sp_protocol) {
539 			int protocol = so->so_protocol;
540 			/*
541 			 * Issue SO_PROTOTYPE setsockopt.
542 			 */
543 			error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
544 			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
545 			if (error != 0) {
546 				(void) sotpi_close(so, flags, cr);
547 				/*
548 				 * Setsockopt often fails with ENOPROTOOPT but
549 				 * socket() should fail with
550 				 * EPROTONOSUPPORT/EPROTOTYPE.
551 				 */
552 				return (EPROTONOSUPPORT);
553 			}
554 		}
555 
556 	} else {
557 		/*
558 		 * While the same socket can not be reopened (unlike specfs)
559 		 * the stream head sets STREOPENFAIL when the autopush fails.
560 		 */
561 		if ((stp != NULL) &&
562 		    (stp->sd_flag & STREOPENFAIL)) {
563 			/*
564 			 * Open failed part way through.
565 			 */
566 			mutex_enter(&stp->sd_lock);
567 			stp->sd_flag &= ~STREOPENFAIL;
568 			mutex_exit(&stp->sd_lock);
569 			(void) sotpi_close(so, flags, cr);
570 			return (error);
571 			/*NOTREACHED*/
572 		}
573 		ASSERT(stp == NULL);
574 	}
575 	TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
576 	    "sockfs open:maj %d vp %p so %p error %d",
577 	    maj, vp, so, error);
578 	return (error);
579 }
580 
581 /*
582  * Bind the socket to an unspecified address in sockfs only.
583  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
584  * required in all cases.
585  */
586 static void
587 so_automatic_bind(struct sonode *so)
588 {
589 	sotpi_info_t *sti = SOTOTPI(so);
590 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
591 
592 	ASSERT(MUTEX_HELD(&so->so_lock));
593 	ASSERT(!(so->so_state & SS_ISBOUND));
594 	ASSERT(sti->sti_unbind_mp);
595 
596 	ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
597 	bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
598 	sti->sti_laddr_sa->sa_family = so->so_family;
599 	so->so_state |= SS_ISBOUND;
600 }
601 
602 
603 /*
604  * bind the socket.
605  *
606  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
607  * are passed in we allow rebinding. Note that for backwards compatibility
608  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
609  * Thus the rebinding code is currently not executed.
610  *
611  * The constraints for rebinding are:
612  * - it is a SOCK_DGRAM, or
613  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
614  *   and no listen() has been done.
615  * This rebinding code was added based on some language in the XNET book
616  * about not returning EINVAL if the protocol allows rebinding. However,
617  * this language is not present in the Posix socket draft. Thus maybe the
618  * rebinding logic should be deleted from the source.
619  *
620  * A null "name" can be used to unbind the socket if:
621  * - it is a SOCK_DGRAM, or
622  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
623  *   and no listen() has been done.
624  */
625 /* ARGSUSED */
626 static int
627 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
628     socklen_t namelen, int backlog, int flags, struct cred *cr)
629 {
630 	struct T_bind_req	bind_req;
631 	struct T_bind_ack	*bind_ack;
632 	int			error = 0;
633 	mblk_t			*mp;
634 	void			*addr;
635 	t_uscalar_t		addrlen;
636 	int			unbind_on_err = 1;
637 	boolean_t		clear_acceptconn_on_err = B_FALSE;
638 	boolean_t		restore_backlog_on_err = B_FALSE;
639 	int			save_so_backlog;
640 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
641 	boolean_t		tcp_udp_xport;
642 	void			*nl7c = NULL;
643 	sotpi_info_t		*sti = SOTOTPI(so);
644 
645 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
646 	    (void *)so, (void *)name, namelen, backlog, flags,
647 	    pr_state(so->so_state, so->so_mode)));
648 
649 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
650 
651 	if (!(flags & _SOBIND_LOCK_HELD)) {
652 		mutex_enter(&so->so_lock);
653 		so_lock_single(so);	/* Set SOLOCKED */
654 	} else {
655 		ASSERT(MUTEX_HELD(&so->so_lock));
656 		ASSERT(so->so_flag & SOLOCKED);
657 	}
658 
659 	/*
660 	 * Make sure that there is a preallocated unbind_req message
661 	 * before binding. This message allocated when the socket is
662 	 * created  but it might be have been consumed.
663 	 */
664 	if (sti->sti_unbind_mp == NULL) {
665 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
666 		/* NOTE: holding so_lock while sleeping */
667 		sti->sti_unbind_mp =
668 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
669 		    cr);
670 	}
671 
672 	if (flags & _SOBIND_REBIND) {
673 		/*
674 		 * Called from solisten after doing an sotpi_unbind() or
675 		 * potentially without the unbind (latter for AF_INET{,6}).
676 		 */
677 		ASSERT(name == NULL && namelen == 0);
678 
679 		if (so->so_family == AF_UNIX) {
680 			ASSERT(sti->sti_ux_bound_vp);
681 			addr = &sti->sti_ux_laddr;
682 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
683 			dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
684 			    "addr 0x%p, vp %p\n",
685 			    addrlen,
686 			    (void *)((struct so_ux_addr *)addr)->soua_vp,
687 			    (void *)sti->sti_ux_bound_vp));
688 		} else {
689 			addr = sti->sti_laddr_sa;
690 			addrlen = (t_uscalar_t)sti->sti_laddr_len;
691 		}
692 	} else if (flags & _SOBIND_UNSPEC) {
693 		ASSERT(name == NULL && namelen == 0);
694 
695 		/*
696 		 * The caller checked SS_ISBOUND but not necessarily
697 		 * under so_lock
698 		 */
699 		if (so->so_state & SS_ISBOUND) {
700 			/* No error */
701 			goto done;
702 		}
703 
704 		/* Set an initial local address */
705 		switch (so->so_family) {
706 		case AF_UNIX:
707 			/*
708 			 * Use an address with same size as struct sockaddr
709 			 * just like BSD.
710 			 */
711 			sti->sti_laddr_len =
712 			    (socklen_t)sizeof (struct sockaddr);
713 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
714 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
715 			sti->sti_laddr_sa->sa_family = so->so_family;
716 
717 			/*
718 			 * Pass down an address with the implicit bind
719 			 * magic number and the rest all zeros.
720 			 * The transport will return a unique address.
721 			 */
722 			sti->sti_ux_laddr.soua_vp = NULL;
723 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
724 			addr = &sti->sti_ux_laddr;
725 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
726 			break;
727 
728 		case AF_INET:
729 		case AF_INET6:
730 			/*
731 			 * An unspecified bind in TPI has a NULL address.
732 			 * Set the address in sockfs to have the sa_family.
733 			 */
734 			sti->sti_laddr_len = (so->so_family == AF_INET) ?
735 			    (socklen_t)sizeof (sin_t) :
736 			    (socklen_t)sizeof (sin6_t);
737 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
738 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
739 			sti->sti_laddr_sa->sa_family = so->so_family;
740 			addr = NULL;
741 			addrlen = 0;
742 			break;
743 
744 		default:
745 			/*
746 			 * An unspecified bind in TPI has a NULL address.
747 			 * Set the address in sockfs to be zero length.
748 			 *
749 			 * Can not assume there is a sa_family for all
750 			 * protocol families. For example, AF_X25 does not
751 			 * have a family field.
752 			 */
753 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
754 			sti->sti_laddr_len = 0;	/* XXX correct? */
755 			addr = NULL;
756 			addrlen = 0;
757 			break;
758 		}
759 
760 	} else {
761 		if (so->so_state & SS_ISBOUND) {
762 			/*
763 			 * If it is ok to rebind the socket, first unbind
764 			 * with the transport. A rebind to the NULL address
765 			 * is interpreted as an unbind.
766 			 * Note that a bind to NULL in BSD does unbind the
767 			 * socket but it fails with EINVAL.
768 			 * Note that regular sockets set SOV_SOCKBSD i.e.
769 			 * _SOBIND_SOCKBSD gets set here hence no type of
770 			 * socket does currently allow rebinding.
771 			 *
772 			 * If the name is NULL just do an unbind.
773 			 */
774 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
775 			    name != NULL) {
776 				error = EINVAL;
777 				unbind_on_err = 0;
778 				eprintsoline(so, error);
779 				goto done;
780 			}
781 			if ((so->so_mode & SM_CONNREQUIRED) &&
782 			    (so->so_state & SS_CANTREBIND)) {
783 				error = EINVAL;
784 				unbind_on_err = 0;
785 				eprintsoline(so, error);
786 				goto done;
787 			}
788 			error = sotpi_unbind(so, 0);
789 			if (error) {
790 				eprintsoline(so, error);
791 				goto done;
792 			}
793 			ASSERT(!(so->so_state & SS_ISBOUND));
794 			if (name == NULL) {
795 				so->so_state &=
796 				    ~(SS_ISCONNECTED|SS_ISCONNECTING);
797 				goto done;
798 			}
799 		}
800 
801 		/* X/Open requires this check */
802 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
803 			if (xnet_check_print) {
804 				printf("sockfs: X/Open bind state check "
805 				    "caused EINVAL\n");
806 			}
807 			error = EINVAL;
808 			goto done;
809 		}
810 
811 		switch (so->so_family) {
812 		case AF_UNIX:
813 			/*
814 			 * All AF_UNIX addresses are nul terminated
815 			 * when copied (copyin_name) in so the minimum
816 			 * length is 3 bytes.
817 			 */
818 			if (name == NULL ||
819 			    (ssize_t)namelen <= sizeof (short) + 1) {
820 				error = EISDIR;
821 				eprintsoline(so, error);
822 				goto done;
823 			}
824 			/*
825 			 * Verify so_family matches the bound family.
826 			 * BSD does not check this for AF_UNIX resulting
827 			 * in funny mknods.
828 			 */
829 			if (name->sa_family != so->so_family) {
830 				error = EAFNOSUPPORT;
831 				goto done;
832 			}
833 			break;
834 		case AF_INET:
835 			if (name == NULL) {
836 				error = EINVAL;
837 				eprintsoline(so, error);
838 				goto done;
839 			}
840 			if ((size_t)namelen != sizeof (sin_t)) {
841 				error = name->sa_family != so->so_family ?
842 				    EAFNOSUPPORT : EINVAL;
843 				eprintsoline(so, error);
844 				goto done;
845 			}
846 			if ((flags & _SOBIND_XPG4_2) &&
847 			    (name->sa_family != so->so_family)) {
848 				/*
849 				 * This check has to be made for X/Open
850 				 * sockets however application failures have
851 				 * been observed when it is applied to
852 				 * all sockets.
853 				 */
854 				error = EAFNOSUPPORT;
855 				eprintsoline(so, error);
856 				goto done;
857 			}
858 			/*
859 			 * Force a zero sa_family to match so_family.
860 			 *
861 			 * Some programs like inetd(1M) don't set the
862 			 * family field. Other programs leave
863 			 * sin_family set to garbage - SunOS 4.X does
864 			 * not check the family field on a bind.
865 			 * We use the family field that
866 			 * was passed in to the socket() call.
867 			 */
868 			name->sa_family = so->so_family;
869 			break;
870 
871 		case AF_INET6: {
872 #ifdef DEBUG
873 			sin6_t *sin6 = (sin6_t *)name;
874 #endif /* DEBUG */
875 
876 			if (name == NULL) {
877 				error = EINVAL;
878 				eprintsoline(so, error);
879 				goto done;
880 			}
881 			if ((size_t)namelen != sizeof (sin6_t)) {
882 				error = name->sa_family != so->so_family ?
883 				    EAFNOSUPPORT : EINVAL;
884 				eprintsoline(so, error);
885 				goto done;
886 			}
887 			if (name->sa_family != so->so_family) {
888 				/*
889 				 * With IPv6 we require the family to match
890 				 * unlike in IPv4.
891 				 */
892 				error = EAFNOSUPPORT;
893 				eprintsoline(so, error);
894 				goto done;
895 			}
896 #ifdef DEBUG
897 			/*
898 			 * Verify that apps don't forget to clear
899 			 * sin6_scope_id etc
900 			 */
901 			if (sin6->sin6_scope_id != 0 &&
902 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
903 				zcmn_err(getzoneid(), CE_WARN,
904 				    "bind with uninitialized sin6_scope_id "
905 				    "(%d) on socket. Pid = %d\n",
906 				    (int)sin6->sin6_scope_id,
907 				    (int)curproc->p_pid);
908 			}
909 			if (sin6->__sin6_src_id != 0) {
910 				zcmn_err(getzoneid(), CE_WARN,
911 				    "bind with uninitialized __sin6_src_id "
912 				    "(%d) on socket. Pid = %d\n",
913 				    (int)sin6->__sin6_src_id,
914 				    (int)curproc->p_pid);
915 			}
916 #endif /* DEBUG */
917 			break;
918 		}
919 		default:
920 			/*
921 			 * Don't do any length or sa_family check to allow
922 			 * non-sockaddr style addresses.
923 			 */
924 			if (name == NULL) {
925 				error = EINVAL;
926 				eprintsoline(so, error);
927 				goto done;
928 			}
929 			break;
930 		}
931 
932 		if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
933 			error = ENAMETOOLONG;
934 			eprintsoline(so, error);
935 			goto done;
936 		}
937 		/*
938 		 * Save local address.
939 		 */
940 		sti->sti_laddr_len = (socklen_t)namelen;
941 		ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
942 		bcopy(name, sti->sti_laddr_sa, namelen);
943 
944 		addr = sti->sti_laddr_sa;
945 		addrlen = (t_uscalar_t)sti->sti_laddr_len;
946 		switch (so->so_family) {
947 		case AF_INET6:
948 		case AF_INET:
949 			break;
950 		case AF_UNIX: {
951 			struct sockaddr_un *soun =
952 			    (struct sockaddr_un *)sti->sti_laddr_sa;
953 			struct vnode *vp, *rvp;
954 			struct vattr vattr;
955 
956 			ASSERT(sti->sti_ux_bound_vp == NULL);
957 			/*
958 			 * Create vnode for the specified path name.
959 			 * Keep vnode held with a reference in sti_ux_bound_vp.
960 			 * Use the vnode pointer as the address used in the
961 			 * bind with the transport.
962 			 *
963 			 * Use the same mode as in BSD. In particular this does
964 			 * not observe the umask.
965 			 */
966 			/* MAXPATHLEN + soun_family + nul termination */
967 			if (sti->sti_laddr_len >
968 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
969 				error = ENAMETOOLONG;
970 				eprintsoline(so, error);
971 				goto done;
972 			}
973 			vattr.va_type = VSOCK;
974 			vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
975 			vattr.va_mask = AT_TYPE|AT_MODE;
976 			/* NOTE: holding so_lock */
977 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
978 			    EXCL, 0, &vp, CRMKNOD, 0, 0);
979 			if (error) {
980 				if (error == EEXIST)
981 					error = EADDRINUSE;
982 				eprintsoline(so, error);
983 				goto done;
984 			}
985 			/*
986 			 * Establish pointer from the underlying filesystem
987 			 * vnode to the socket node.
988 			 * sti_ux_bound_vp and v_stream->sd_vnode form the
989 			 * cross-linkage between the underlying filesystem
990 			 * node and the socket node.
991 			 */
992 
993 			if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
994 				VN_HOLD(rvp);
995 				VN_RELE(vp);
996 				vp = rvp;
997 			}
998 
999 			ASSERT(SOTOV(so)->v_stream);
1000 			mutex_enter(&vp->v_lock);
1001 			vp->v_stream = SOTOV(so)->v_stream;
1002 			sti->sti_ux_bound_vp = vp;
1003 			mutex_exit(&vp->v_lock);
1004 
1005 			/*
1006 			 * Use the vnode pointer value as a unique address
1007 			 * (together with the magic number to avoid conflicts
1008 			 * with implicit binds) in the transport provider.
1009 			 */
1010 			sti->sti_ux_laddr.soua_vp =
1011 			    (void *)sti->sti_ux_bound_vp;
1012 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
1013 			addr = &sti->sti_ux_laddr;
1014 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1015 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1016 			    addrlen,
1017 			    (void *)((struct so_ux_addr *)addr)->soua_vp));
1018 			break;
1019 		}
1020 		} /* end switch (so->so_family) */
1021 	}
1022 
1023 	/*
1024 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1025 	 * the transport can start passing up T_CONN_IND messages
1026 	 * as soon as it receives the bind req and strsock_proto()
1027 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1028 	 */
1029 	if (flags & _SOBIND_LISTEN) {
1030 		if ((so->so_state & SS_ACCEPTCONN) == 0)
1031 			clear_acceptconn_on_err = B_TRUE;
1032 		save_so_backlog = so->so_backlog;
1033 		restore_backlog_on_err = B_TRUE;
1034 		so->so_state |= SS_ACCEPTCONN;
1035 		so->so_backlog = backlog;
1036 	}
1037 
1038 	/*
1039 	 * If NL7C addr(s) have been configured check for addr/port match,
1040 	 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
1041 	 *
1042 	 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
1043 	 * family sockets only. If match mark as such.
1044 	 */
1045 	if (nl7c_enabled && ((addr != NULL &&
1046 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
1047 	    (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
1048 	    sti->sti_nl7c_flags == NL7C_AF_NCA)) {
1049 		/*
1050 		 * NL7C is not supported in non-global zones,
1051 		 * we enforce this restriction here.
1052 		 */
1053 		if (so->so_zoneid == GLOBAL_ZONEID) {
1054 			/* An NL7C socket, mark it */
1055 			sti->sti_nl7c_flags |= NL7C_ENABLED;
1056 			if (nl7c == NULL) {
1057 				/*
1058 				 * Was an AF_NCA bind() so add it to the
1059 				 * addr list for reporting purposes.
1060 				 */
1061 				nl7c = nl7c_add_addr(addr, addrlen);
1062 			}
1063 		} else
1064 			nl7c = NULL;
1065 	}
1066 
1067 	/*
1068 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1069 	 * for other transports we will send in a O_T_BIND_REQ.
1070 	 */
1071 	if (tcp_udp_xport &&
1072 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
1073 		PRIM_type = T_BIND_REQ;
1074 
1075 	bind_req.PRIM_type = PRIM_type;
1076 	bind_req.ADDR_length = addrlen;
1077 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1078 	bind_req.CONIND_number = backlog;
1079 	/* NOTE: holding so_lock while sleeping */
1080 	mp = soallocproto2(&bind_req, sizeof (bind_req),
1081 	    addr, addrlen, 0, _ALLOC_SLEEP, cr);
1082 	sti->sti_laddr_valid = 0;
1083 
1084 	/* Done using sti_laddr_sa - can drop the lock */
1085 	mutex_exit(&so->so_lock);
1086 
1087 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1088 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1089 	if (error) {
1090 		eprintsoline(so, error);
1091 		mutex_enter(&so->so_lock);
1092 		goto done;
1093 	}
1094 
1095 	mutex_enter(&so->so_lock);
1096 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1097 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1098 	if (error) {
1099 		eprintsoline(so, error);
1100 		goto done;
1101 	}
1102 	ASSERT(mp);
1103 	/*
1104 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1105 	 * strsock_proto while the lock was dropped above, the bind
1106 	 * is allowed to complete.
1107 	 */
1108 
1109 	/* Mark as bound. This will be undone if we detect errors below. */
1110 	if (flags & _SOBIND_NOXLATE) {
1111 		ASSERT(so->so_family == AF_UNIX);
1112 		sti->sti_faddr_noxlate = 1;
1113 	}
1114 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1115 	so->so_state |= SS_ISBOUND;
1116 	ASSERT(sti->sti_unbind_mp);
1117 
1118 	/* note that we've already set SS_ACCEPTCONN above */
1119 
1120 	/*
1121 	 * Recompute addrlen - an unspecied bind sent down an
1122 	 * address of length zero but we expect the appropriate length
1123 	 * in return.
1124 	 */
1125 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1126 	    sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1127 
1128 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
1129 	/*
1130 	 * The alignment restriction is really too strict but
1131 	 * we want enough alignment to inspect the fields of
1132 	 * a sockaddr_in.
1133 	 */
1134 	addr = sogetoff(mp, bind_ack->ADDR_offset,
1135 	    bind_ack->ADDR_length,
1136 	    __TPI_ALIGN_SIZE);
1137 	if (addr == NULL) {
1138 		freemsg(mp);
1139 		error = EPROTO;
1140 		eprintsoline(so, error);
1141 		goto done;
1142 	}
1143 	if (!(flags & _SOBIND_UNSPEC)) {
1144 		/*
1145 		 * Verify that the transport didn't return something we
1146 		 * did not want e.g. an address other than what we asked for.
1147 		 *
1148 		 * NOTE: These checks would go away if/when we switch to
1149 		 * using the new TPI (in which the transport would fail
1150 		 * the request instead of assigning a different address).
1151 		 *
1152 		 * NOTE2: For protocols that we don't know (i.e. any
1153 		 * other than AF_INET6, AF_INET and AF_UNIX), we
1154 		 * cannot know if the transport should be expected to
1155 		 * return the same address as that requested.
1156 		 *
1157 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1158 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1159 		 *
1160 		 * For example, in the case of netatalk it may be
1161 		 * inappropriate for the transport to return the
1162 		 * requested address (as it may have allocated a local
1163 		 * port number in behaviour similar to that of an
1164 		 * AF_INET bind request with a port number of zero).
1165 		 *
1166 		 * Given the definition of O_T_BIND_REQ, where the
1167 		 * transport may bind to an address other than the
1168 		 * requested address, it's not possible to determine
1169 		 * whether a returned address that differs from the
1170 		 * requested address is a reason to fail (because the
1171 		 * requested address was not available) or succeed
1172 		 * (because the transport allocated an appropriate
1173 		 * address and/or port).
1174 		 *
1175 		 * sockfs currently requires that the transport return
1176 		 * the requested address in the T_BIND_ACK, unless
1177 		 * there is code here to allow for any discrepancy.
1178 		 * Such code exists for AF_INET and AF_INET6.
1179 		 *
1180 		 * Netatalk chooses to return the requested address
1181 		 * rather than the (correct) allocated address.  This
1182 		 * means that netatalk violates the TPI specification
1183 		 * (and would not function correctly if used from a
1184 		 * TLI application), but it does mean that it works
1185 		 * with sockfs.
1186 		 *
1187 		 * As noted above, using the newer XTI bind primitive
1188 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1189 		 * allow sockfs to be more sure about whether or not
1190 		 * the bind request had succeeded (as transports are
1191 		 * not permitted to bind to a different address than
1192 		 * that requested - they must return failure).
1193 		 * Unfortunately, support for T_BIND_REQ may not be
1194 		 * present in all transport implementations (netatalk,
1195 		 * for example, doesn't have it), making the
1196 		 * transition difficult.
1197 		 */
1198 		if (bind_ack->ADDR_length != addrlen) {
1199 			/* Assumes that the requested address was in use */
1200 			freemsg(mp);
1201 			error = EADDRINUSE;
1202 			eprintsoline(so, error);
1203 			goto done;
1204 		}
1205 
1206 		switch (so->so_family) {
1207 		case AF_INET6:
1208 		case AF_INET: {
1209 			sin_t *rname, *aname;
1210 
1211 			rname = (sin_t *)addr;
1212 			aname = (sin_t *)sti->sti_laddr_sa;
1213 
1214 			/*
1215 			 * Take advantage of the alignment
1216 			 * of sin_port and sin6_port which fall
1217 			 * in the same place in their data structures.
1218 			 * Just use sin_port for either address family.
1219 			 *
1220 			 * This may become a problem if (heaven forbid)
1221 			 * there's a separate ipv6port_reserved... :-P
1222 			 *
1223 			 * Binding to port 0 has the semantics of letting
1224 			 * the transport bind to any port.
1225 			 *
1226 			 * If the transport is TCP or UDP since we had sent
1227 			 * a T_BIND_REQ we would not get a port other than
1228 			 * what we asked for.
1229 			 */
1230 			if (tcp_udp_xport) {
1231 				/*
1232 				 * Pick up the new port number if we bound to
1233 				 * port 0.
1234 				 */
1235 				if (aname->sin_port == 0)
1236 					aname->sin_port = rname->sin_port;
1237 				sti->sti_laddr_valid = 1;
1238 				break;
1239 			}
1240 			if (aname->sin_port != 0 &&
1241 			    aname->sin_port != rname->sin_port) {
1242 				freemsg(mp);
1243 				error = EADDRINUSE;
1244 				eprintsoline(so, error);
1245 				goto done;
1246 			}
1247 			/*
1248 			 * Pick up the new port number if we bound to port 0.
1249 			 */
1250 			aname->sin_port = rname->sin_port;
1251 
1252 			/*
1253 			 * Unfortunately, addresses aren't _quite_ the same.
1254 			 */
1255 			if (so->so_family == AF_INET) {
1256 				if (aname->sin_addr.s_addr !=
1257 				    rname->sin_addr.s_addr) {
1258 					freemsg(mp);
1259 					error = EADDRNOTAVAIL;
1260 					eprintsoline(so, error);
1261 					goto done;
1262 				}
1263 			} else {
1264 				sin6_t *rname6 = (sin6_t *)rname;
1265 				sin6_t *aname6 = (sin6_t *)aname;
1266 
1267 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1268 				    &rname6->sin6_addr)) {
1269 					freemsg(mp);
1270 					error = EADDRNOTAVAIL;
1271 					eprintsoline(so, error);
1272 					goto done;
1273 				}
1274 			}
1275 			break;
1276 		}
1277 		case AF_UNIX:
1278 			if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1279 				freemsg(mp);
1280 				error = EADDRINUSE;
1281 				eprintsoline(so, error);
1282 				eprintso(so,
1283 				    ("addrlen %d, addr 0x%x, vp %p\n",
1284 				    addrlen, *((int *)addr),
1285 				    (void *)sti->sti_ux_bound_vp));
1286 				goto done;
1287 			}
1288 			sti->sti_laddr_valid = 1;
1289 			break;
1290 		default:
1291 			/*
1292 			 * NOTE: This assumes that addresses can be
1293 			 * byte-compared for equivalence.
1294 			 */
1295 			if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1296 				freemsg(mp);
1297 				error = EADDRINUSE;
1298 				eprintsoline(so, error);
1299 				goto done;
1300 			}
1301 			/*
1302 			 * Don't mark sti_laddr_valid, as we cannot be
1303 			 * sure that the returned address is the real
1304 			 * bound address when talking to an unknown
1305 			 * transport.
1306 			 */
1307 			break;
1308 		}
1309 	} else {
1310 		/*
1311 		 * Save for returned address for getsockname.
1312 		 * Needed for unspecific bind unless transport supports
1313 		 * the TI_GETMYNAME ioctl.
1314 		 * Do this for AF_INET{,6} even though they do, as
1315 		 * caching info here is much better performance than
1316 		 * a TPI/STREAMS trip to the transport for getsockname.
1317 		 * Any which can't for some reason _must_ _not_ set
1318 		 * sti_laddr_valid here for the caching version of
1319 		 * getsockname to not break;
1320 		 */
1321 		switch (so->so_family) {
1322 		case AF_UNIX:
1323 			/*
1324 			 * Record the address bound with the transport
1325 			 * for use by socketpair.
1326 			 */
1327 			bcopy(addr, &sti->sti_ux_laddr, addrlen);
1328 			sti->sti_laddr_valid = 1;
1329 			break;
1330 		case AF_INET:
1331 		case AF_INET6:
1332 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1333 			bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1334 			sti->sti_laddr_valid = 1;
1335 			break;
1336 		default:
1337 			/*
1338 			 * Don't mark sti_laddr_valid, as we cannot be
1339 			 * sure that the returned address is the real
1340 			 * bound address when talking to an unknown
1341 			 * transport.
1342 			 */
1343 			break;
1344 		}
1345 	}
1346 
1347 	if (nl7c != NULL) {
1348 		/* Register listen()er sonode pointer with NL7C */
1349 		nl7c_listener_addr(nl7c, so);
1350 	}
1351 
1352 	freemsg(mp);
1353 
1354 done:
1355 	if (error) {
1356 		/* reset state & backlog to values held on entry */
1357 		if (clear_acceptconn_on_err == B_TRUE)
1358 			so->so_state &= ~SS_ACCEPTCONN;
1359 		if (restore_backlog_on_err == B_TRUE)
1360 			so->so_backlog = save_so_backlog;
1361 
1362 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1363 			int err;
1364 
1365 			err = sotpi_unbind(so, 0);
1366 			/* LINTED - statement has no consequent: if */
1367 			if (err) {
1368 				eprintsoline(so, error);
1369 			} else {
1370 				ASSERT(!(so->so_state & SS_ISBOUND));
1371 			}
1372 		}
1373 	}
1374 	if (!(flags & _SOBIND_LOCK_HELD)) {
1375 		so_unlock_single(so, SOLOCKED);
1376 		mutex_exit(&so->so_lock);
1377 	} else {
1378 		ASSERT(MUTEX_HELD(&so->so_lock));
1379 		ASSERT(so->so_flag & SOLOCKED);
1380 	}
1381 	return (error);
1382 }
1383 
1384 /* bind the socket */
1385 static int
1386 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1387     int flags, struct cred *cr)
1388 {
1389 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1390 		return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1391 
1392 	flags &= ~_SOBIND_SOCKETPAIR;
1393 	return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1394 }
1395 
1396 /*
1397  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1398  * address, or when listen needs to unbind and bind.
1399  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1400  * so that a sobind can pick them up.
1401  */
1402 static int
1403 sotpi_unbind(struct sonode *so, int flags)
1404 {
1405 	struct T_unbind_req	unbind_req;
1406 	int			error = 0;
1407 	mblk_t			*mp;
1408 	sotpi_info_t		*sti = SOTOTPI(so);
1409 
1410 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1411 	    (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1412 
1413 	ASSERT(MUTEX_HELD(&so->so_lock));
1414 	ASSERT(so->so_flag & SOLOCKED);
1415 
1416 	if (!(so->so_state & SS_ISBOUND)) {
1417 		error = EINVAL;
1418 		eprintsoline(so, error);
1419 		goto done;
1420 	}
1421 
1422 	mutex_exit(&so->so_lock);
1423 
1424 	/*
1425 	 * Flush the read and write side (except stream head read queue)
1426 	 * and send down T_UNBIND_REQ.
1427 	 */
1428 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1429 
1430 	unbind_req.PRIM_type = T_UNBIND_REQ;
1431 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1432 	    0, _ALLOC_SLEEP, CRED());
1433 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1434 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1435 	mutex_enter(&so->so_lock);
1436 	if (error) {
1437 		eprintsoline(so, error);
1438 		goto done;
1439 	}
1440 
1441 	error = sowaitokack(so, T_UNBIND_REQ);
1442 	if (error) {
1443 		eprintsoline(so, error);
1444 		goto done;
1445 	}
1446 
1447 	/*
1448 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1449 	 * strsock_proto while the lock was dropped above, the unbind
1450 	 * is allowed to complete.
1451 	 */
1452 	if (!(flags & _SOUNBIND_REBIND)) {
1453 		/*
1454 		 * Clear out bound address.
1455 		 */
1456 		vnode_t *vp;
1457 
1458 		if ((vp = sti->sti_ux_bound_vp) != NULL) {
1459 			sti->sti_ux_bound_vp = NULL;
1460 			vn_rele_stream(vp);
1461 		}
1462 		/* Clear out address */
1463 		sti->sti_laddr_len = 0;
1464 	}
1465 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1466 	sti->sti_laddr_valid = 0;
1467 
1468 done:
1469 
1470 	/* If the caller held the lock don't release it here */
1471 	ASSERT(MUTEX_HELD(&so->so_lock));
1472 	ASSERT(so->so_flag & SOLOCKED);
1473 
1474 	return (error);
1475 }
1476 
1477 /*
1478  * listen on the socket.
1479  * For TPI conforming transports this has to first unbind with the transport
1480  * and then bind again using the new backlog.
1481  */
1482 /* ARGSUSED */
1483 int
1484 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1485 {
1486 	int		error = 0;
1487 	sotpi_info_t	*sti = SOTOTPI(so);
1488 
1489 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1490 	    (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1491 
1492 	if (sti->sti_serv_type == T_CLTS)
1493 		return (EOPNOTSUPP);
1494 
1495 	/*
1496 	 * If the socket is ready to accept connections already, then
1497 	 * return without doing anything.  This avoids a problem where
1498 	 * a second listen() call fails if a connection is pending and
1499 	 * leaves the socket unbound. Only when we are not unbinding
1500 	 * with the transport can we safely increase the backlog.
1501 	 */
1502 	if (so->so_state & SS_ACCEPTCONN &&
1503 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1504 	    /*CONSTCOND*/
1505 	    !solisten_tpi_tcp))
1506 		return (0);
1507 
1508 	if (so->so_state & SS_ISCONNECTED)
1509 		return (EINVAL);
1510 
1511 	mutex_enter(&so->so_lock);
1512 	so_lock_single(so);	/* Set SOLOCKED */
1513 
1514 	/*
1515 	 * If the listen doesn't change the backlog we do nothing.
1516 	 * This avoids an EPROTO error from the transport.
1517 	 */
1518 	if ((so->so_state & SS_ACCEPTCONN) &&
1519 	    so->so_backlog == backlog)
1520 		goto done;
1521 
1522 	if (!(so->so_state & SS_ISBOUND)) {
1523 		/*
1524 		 * Must have been explicitly bound in the UNIX domain.
1525 		 */
1526 		if (so->so_family == AF_UNIX) {
1527 			error = EINVAL;
1528 			goto done;
1529 		}
1530 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1531 		    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1532 	} else if (backlog > 0) {
1533 		/*
1534 		 * AF_INET{,6} hack to avoid losing the port.
1535 		 * Assumes that all AF_INET{,6} transports can handle a
1536 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1537 		 * has already bound thus it is possible to avoid the unbind.
1538 		 */
1539 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1540 		    /*CONSTCOND*/
1541 		    !solisten_tpi_tcp)) {
1542 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1543 			if (error)
1544 				goto done;
1545 		}
1546 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1547 		    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1548 	} else {
1549 		so->so_state |= SS_ACCEPTCONN;
1550 		so->so_backlog = backlog;
1551 	}
1552 	if (error)
1553 		goto done;
1554 	ASSERT(so->so_state & SS_ACCEPTCONN);
1555 done:
1556 	so_unlock_single(so, SOLOCKED);
1557 	mutex_exit(&so->so_lock);
1558 	return (error);
1559 }
1560 
1561 /*
1562  * Disconnect either a specified seqno or all (-1).
1563  * The former is used on listening sockets only.
1564  *
1565  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1566  * the current use of sodisconnect(seqno == -1) is only for shutdown
1567  * so there is no point (and potentially incorrect) to unbind.
1568  */
1569 static int
1570 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1571 {
1572 	struct T_discon_req	discon_req;
1573 	int			error = 0;
1574 	mblk_t			*mp;
1575 
1576 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1577 	    (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1578 
1579 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1580 		mutex_enter(&so->so_lock);
1581 		so_lock_single(so);	/* Set SOLOCKED */
1582 	} else {
1583 		ASSERT(MUTEX_HELD(&so->so_lock));
1584 		ASSERT(so->so_flag & SOLOCKED);
1585 	}
1586 
1587 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1588 		error = EINVAL;
1589 		eprintsoline(so, error);
1590 		goto done;
1591 	}
1592 
1593 	mutex_exit(&so->so_lock);
1594 	/*
1595 	 * Flush the write side (unless this is a listener)
1596 	 * and then send down a T_DISCON_REQ.
1597 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1598 	 * and other messages.)
1599 	 */
1600 	if (!(so->so_state & SS_ACCEPTCONN))
1601 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1602 
1603 	discon_req.PRIM_type = T_DISCON_REQ;
1604 	discon_req.SEQ_number = seqno;
1605 	mp = soallocproto1(&discon_req, sizeof (discon_req),
1606 	    0, _ALLOC_SLEEP, CRED());
1607 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1608 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1609 	mutex_enter(&so->so_lock);
1610 	if (error) {
1611 		eprintsoline(so, error);
1612 		goto done;
1613 	}
1614 
1615 	error = sowaitokack(so, T_DISCON_REQ);
1616 	if (error) {
1617 		eprintsoline(so, error);
1618 		goto done;
1619 	}
1620 	/*
1621 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1622 	 * strsock_proto while the lock was dropped above, the disconnect
1623 	 * is allowed to complete. However, it is not possible to
1624 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1625 	 */
1626 	so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1627 	SOTOTPI(so)->sti_laddr_valid = 0;
1628 	SOTOTPI(so)->sti_faddr_valid = 0;
1629 done:
1630 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1631 		so_unlock_single(so, SOLOCKED);
1632 		mutex_exit(&so->so_lock);
1633 	} else {
1634 		/* If the caller held the lock don't release it here */
1635 		ASSERT(MUTEX_HELD(&so->so_lock));
1636 		ASSERT(so->so_flag & SOLOCKED);
1637 	}
1638 	return (error);
1639 }
1640 
1641 /* ARGSUSED */
1642 int
1643 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1644     struct sonode **nsop)
1645 {
1646 	struct T_conn_ind	*conn_ind;
1647 	struct T_conn_res	*conn_res;
1648 	int			error = 0;
1649 	mblk_t			*mp, *ack_mp;
1650 	struct sonode		*nso;
1651 	vnode_t			*nvp;
1652 	void			*src;
1653 	t_uscalar_t		srclen;
1654 	void			*opt;
1655 	t_uscalar_t		optlen;
1656 	t_scalar_t		PRIM_type;
1657 	t_scalar_t		SEQ_number;
1658 	size_t			sinlen;
1659 	sotpi_info_t		*sti = SOTOTPI(so);
1660 	sotpi_info_t		*nsti;
1661 
1662 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1663 	    (void *)so, fflag, (void *)nsop,
1664 	    pr_state(so->so_state, so->so_mode)));
1665 
1666 	/*
1667 	 * Defer single-threading the accepting socket until
1668 	 * the T_CONN_IND has been received and parsed and the
1669 	 * new sonode has been opened.
1670 	 */
1671 
1672 	/* Check that we are not already connected */
1673 	if ((so->so_state & SS_ACCEPTCONN) == 0)
1674 		goto conn_bad;
1675 again:
1676 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1677 		goto e_bad;
1678 
1679 	ASSERT(mp != NULL);
1680 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1681 
1682 	/*
1683 	 * Save SEQ_number for error paths.
1684 	 */
1685 	SEQ_number = conn_ind->SEQ_number;
1686 
1687 	srclen = conn_ind->SRC_length;
1688 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1689 	if (src == NULL) {
1690 		error = EPROTO;
1691 		freemsg(mp);
1692 		eprintsoline(so, error);
1693 		goto disconnect_unlocked;
1694 	}
1695 	optlen = conn_ind->OPT_length;
1696 	switch (so->so_family) {
1697 	case AF_INET:
1698 	case AF_INET6:
1699 		if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1700 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1701 			    &opt, conn_ind->OPT_length);
1702 		} else {
1703 			/*
1704 			 * The transport (in this case TCP) hasn't sent up
1705 			 * a pointer to an instance for the accept fast-path.
1706 			 * Disable fast-path completely because the call to
1707 			 * sotpi_create() below would otherwise create an
1708 			 * incomplete TCP instance, which would lead to
1709 			 * problems when sockfs sends a normal T_CONN_RES
1710 			 * message down the new stream.
1711 			 */
1712 			if (sti->sti_direct) {
1713 				int rval;
1714 				/*
1715 				 * For consistency we inform tcp to disable
1716 				 * direct interface on the listener, though
1717 				 * we can certainly live without doing this
1718 				 * because no data will ever travel upstream
1719 				 * on the listening socket.
1720 				 */
1721 				sti->sti_direct = 0;
1722 				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1723 				    0, 0, K_TO_K, cr, &rval);
1724 			}
1725 			opt = NULL;
1726 			optlen = 0;
1727 		}
1728 		break;
1729 	case AF_UNIX:
1730 	default:
1731 		if (optlen != 0) {
1732 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1733 			    __TPI_ALIGN_SIZE);
1734 			if (opt == NULL) {
1735 				error = EPROTO;
1736 				freemsg(mp);
1737 				eprintsoline(so, error);
1738 				goto disconnect_unlocked;
1739 			}
1740 		}
1741 		if (so->so_family == AF_UNIX) {
1742 			if (!sti->sti_faddr_noxlate) {
1743 				src = NULL;
1744 				srclen = 0;
1745 			}
1746 			/* Extract src address from options */
1747 			if (optlen != 0)
1748 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1749 		}
1750 		break;
1751 	}
1752 
1753 	/*
1754 	 * Create the new socket.
1755 	 */
1756 	nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1757 	if (nso == NULL) {
1758 		ASSERT(error != 0);
1759 		/*
1760 		 * Accept can not fail with ENOBUFS. sotpi_create
1761 		 * sleeps waiting for memory until a signal is caught
1762 		 * so return EINTR.
1763 		 */
1764 		freemsg(mp);
1765 		if (error == ENOBUFS)
1766 			error = EINTR;
1767 		goto e_disc_unl;
1768 	}
1769 	nvp = SOTOV(nso);
1770 	nsti = SOTOTPI(nso);
1771 
1772 #ifdef DEBUG
1773 	/*
1774 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1775 	 * it's inherited early to allow debugging of the accept code itself.
1776 	 */
1777 	nso->so_options |= so->so_options & SO_DEBUG;
1778 #endif /* DEBUG */
1779 
1780 	/*
1781 	 * Save the SRC address from the T_CONN_IND
1782 	 * for getpeername to work on AF_UNIX and on transports that do not
1783 	 * support TI_GETPEERNAME.
1784 	 *
1785 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1786 	 * copyin_name().
1787 	 */
1788 	if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1789 		error = EINVAL;
1790 		freemsg(mp);
1791 		eprintsoline(so, error);
1792 		goto disconnect_vp_unlocked;
1793 	}
1794 	nsti->sti_faddr_len = (socklen_t)srclen;
1795 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1796 	bcopy(src, nsti->sti_faddr_sa, srclen);
1797 	nsti->sti_faddr_valid = 1;
1798 
1799 	/*
1800 	 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1801 	 */
1802 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1803 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1804 		cred_t	*cr;
1805 		pid_t	cpid;
1806 
1807 		cr = msg_getcred(mp, &cpid);
1808 		if (cr != NULL) {
1809 			crhold(cr);
1810 			nso->so_peercred = cr;
1811 			nso->so_cpid = cpid;
1812 		}
1813 		freemsg(mp);
1814 
1815 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1816 		    sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1817 		if (mp == NULL) {
1818 			/*
1819 			 * Accept can not fail with ENOBUFS.
1820 			 * A signal was caught so return EINTR.
1821 			 */
1822 			error = EINTR;
1823 			eprintsoline(so, error);
1824 			goto disconnect_vp_unlocked;
1825 		}
1826 		conn_res = (struct T_conn_res *)mp->b_rptr;
1827 	} else {
1828 		/*
1829 		 * For efficency reasons we use msg_extractcred; no crhold
1830 		 * needed since db_credp is cleared (i.e., we move the cred
1831 		 * from the message to so_peercred.
1832 		 */
1833 		nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1834 
1835 		mp->b_rptr = DB_BASE(mp);
1836 		conn_res = (struct T_conn_res *)mp->b_rptr;
1837 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1838 
1839 		mblk_setcred(mp, cr, curproc->p_pid);
1840 	}
1841 
1842 	/*
1843 	 * New socket must be bound at least in sockfs and, except for AF_INET,
1844 	 * (or AF_INET6) it also has to be bound in the transport provider.
1845 	 * We set the local address in the sonode from the T_OK_ACK of the
1846 	 * T_CONN_RES. For this reason the address we bind to here isn't
1847 	 * important.
1848 	 */
1849 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1850 	    /*CONSTCOND*/
1851 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1852 		/*
1853 		 * Optimization for AF_INET{,6} transports
1854 		 * that can handle a T_CONN_RES without being bound.
1855 		 */
1856 		mutex_enter(&nso->so_lock);
1857 		so_automatic_bind(nso);
1858 		mutex_exit(&nso->so_lock);
1859 	} else {
1860 		/* Perform NULL bind with the transport provider. */
1861 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1862 		    cr)) != 0) {
1863 			ASSERT(error != ENOBUFS);
1864 			freemsg(mp);
1865 			eprintsoline(nso, error);
1866 			goto disconnect_vp_unlocked;
1867 		}
1868 	}
1869 
1870 	/*
1871 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1872 	 * so that any data arriving on the new socket will cause the
1873 	 * appropriate signals to be delivered for the new socket.
1874 	 *
1875 	 * No other thread (except strsock_proto and strsock_misc)
1876 	 * can access the new socket thus we relax the locking.
1877 	 */
1878 	nso->so_pgrp = so->so_pgrp;
1879 	nso->so_state |= so->so_state & SS_ASYNC;
1880 	nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1881 
1882 	if (nso->so_pgrp != 0) {
1883 		if ((error = so_set_events(nso, nvp, cr)) != 0) {
1884 			eprintsoline(nso, error);
1885 			error = 0;
1886 			nso->so_pgrp = 0;
1887 		}
1888 	}
1889 
1890 	/*
1891 	 * Make note of the socket level options. TCP and IP level options
1892 	 * are already inherited. We could do all this after accept is
1893 	 * successful but doing it here simplifies code and no harm done
1894 	 * for error case.
1895 	 */
1896 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1897 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1898 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1899 	nso->so_sndbuf = so->so_sndbuf;
1900 	nso->so_rcvbuf = so->so_rcvbuf;
1901 	if (nso->so_options & SO_LINGER)
1902 		nso->so_linger = so->so_linger;
1903 
1904 	/*
1905 	 * Note that the following sti_direct code path should be
1906 	 * removed once we are confident that the direct sockets
1907 	 * do not result in any degradation.
1908 	 */
1909 	if (sti->sti_direct) {
1910 
1911 		ASSERT(opt != NULL);
1912 
1913 		conn_res->OPT_length = optlen;
1914 		conn_res->OPT_offset = MBLKL(mp);
1915 		bcopy(&opt, mp->b_wptr, optlen);
1916 		mp->b_wptr += optlen;
1917 		conn_res->PRIM_type = T_CONN_RES;
1918 		conn_res->ACCEPTOR_id = 0;
1919 		PRIM_type = T_CONN_RES;
1920 
1921 		/* Send down the T_CONN_RES on acceptor STREAM */
1922 		error = kstrputmsg(SOTOV(nso), mp, NULL,
1923 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1924 		if (error) {
1925 			mutex_enter(&so->so_lock);
1926 			so_lock_single(so);
1927 			eprintsoline(so, error);
1928 			goto disconnect_vp;
1929 		}
1930 		mutex_enter(&nso->so_lock);
1931 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1932 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1933 		if (error) {
1934 			mutex_exit(&nso->so_lock);
1935 			mutex_enter(&so->so_lock);
1936 			so_lock_single(so);
1937 			eprintsoline(so, error);
1938 			goto disconnect_vp;
1939 		}
1940 		if (nso->so_family == AF_INET) {
1941 			sin_t *sin;
1942 
1943 			sin = (sin_t *)(ack_mp->b_rptr +
1944 			    sizeof (struct T_ok_ack));
1945 			bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
1946 			nsti->sti_laddr_len = sizeof (sin_t);
1947 		} else {
1948 			sin6_t *sin6;
1949 
1950 			sin6 = (sin6_t *)(ack_mp->b_rptr +
1951 			    sizeof (struct T_ok_ack));
1952 			bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
1953 			nsti->sti_laddr_len = sizeof (sin6_t);
1954 		}
1955 		freemsg(ack_mp);
1956 
1957 		nso->so_state |= SS_ISCONNECTED;
1958 		nso->so_proto_handle = (sock_lower_handle_t)opt;
1959 		nsti->sti_laddr_valid = 1;
1960 
1961 		if (sti->sti_nl7c_flags & NL7C_ENABLED) {
1962 			/*
1963 			 * A NL7C marked listen()er so the new socket
1964 			 * inherits the listen()er's NL7C state, except
1965 			 * for NL7C_POLLIN.
1966 			 *
1967 			 * Only call NL7C to process the new socket if
1968 			 * the listen socket allows blocking i/o.
1969 			 */
1970 			nsti->sti_nl7c_flags =
1971 			    sti->sti_nl7c_flags & (~NL7C_POLLIN);
1972 			if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
1973 				/*
1974 				 * Nonblocking accept() just make it
1975 				 * persist to defer processing to the
1976 				 * read-side syscall (e.g. read).
1977 				 */
1978 				nsti->sti_nl7c_flags |= NL7C_SOPERSIST;
1979 			} else if (nl7c_process(nso, B_FALSE)) {
1980 				/*
1981 				 * NL7C has completed processing on the
1982 				 * socket, close the socket and back to
1983 				 * the top to await the next T_CONN_IND.
1984 				 */
1985 				mutex_exit(&nso->so_lock);
1986 				(void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
1987 				    cr, NULL);
1988 				VN_RELE(nvp);
1989 				goto again;
1990 			}
1991 			/* Pass the new socket out */
1992 		}
1993 
1994 		mutex_exit(&nso->so_lock);
1995 
1996 		/*
1997 		 * It's possible, through the use of autopush for example,
1998 		 * that the acceptor stream may not support sti_direct
1999 		 * semantics. If the new socket does not support sti_direct
2000 		 * we issue a _SIOCSOCKFALLBACK to inform the transport
2001 		 * as we would in the I_PUSH case.
2002 		 */
2003 		if (nsti->sti_direct == 0) {
2004 			int	rval;
2005 
2006 			if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
2007 			    0, 0, K_TO_K, cr, &rval)) != 0) {
2008 				mutex_enter(&so->so_lock);
2009 				so_lock_single(so);
2010 				eprintsoline(so, error);
2011 				goto disconnect_vp;
2012 			}
2013 		}
2014 
2015 		/*
2016 		 * Pass out new socket.
2017 		 */
2018 		if (nsop != NULL)
2019 			*nsop = nso;
2020 
2021 		return (0);
2022 	}
2023 
2024 	/*
2025 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
2026 	 * which don't support the FireEngine accept fast-path. It is also
2027 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
2028 	 * again. Neither sockfs nor TCP attempt to find out if some other
2029 	 * random module has been inserted in between (in which case we
2030 	 * should follow TLI accept behaviour). We blindly assume the worst
2031 	 * case and revert back to old behaviour i.e. TCP will not send us
2032 	 * any option (eager) and the accept should happen on the listener
2033 	 * queue. Any queued T_conn_ind have already got their options removed
2034 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
2035 	 */
2036 	/*
2037 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
2038 	 */
2039 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
2040 #ifdef	_ILP32
2041 		queue_t	*q;
2042 
2043 		/*
2044 		 * Find read queue in driver
2045 		 * Can safely do this since we "own" nso/nvp.
2046 		 */
2047 		q = strvp2wq(nvp)->q_next;
2048 		while (SAMESTR(q))
2049 			q = q->q_next;
2050 		q = RD(q);
2051 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
2052 #else
2053 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
2054 #endif	/* _ILP32 */
2055 		conn_res->PRIM_type = O_T_CONN_RES;
2056 		PRIM_type = O_T_CONN_RES;
2057 	} else {
2058 		conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
2059 		conn_res->PRIM_type = T_CONN_RES;
2060 		PRIM_type = T_CONN_RES;
2061 	}
2062 	conn_res->SEQ_number = SEQ_number;
2063 	conn_res->OPT_length = 0;
2064 	conn_res->OPT_offset = 0;
2065 
2066 	mutex_enter(&so->so_lock);
2067 	so_lock_single(so);	/* Set SOLOCKED */
2068 	mutex_exit(&so->so_lock);
2069 
2070 	error = kstrputmsg(SOTOV(so), mp, NULL,
2071 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2072 	mutex_enter(&so->so_lock);
2073 	if (error) {
2074 		eprintsoline(so, error);
2075 		goto disconnect_vp;
2076 	}
2077 	error = sowaitprim(so, PRIM_type, T_OK_ACK,
2078 	    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2079 	if (error) {
2080 		eprintsoline(so, error);
2081 		goto disconnect_vp;
2082 	}
2083 	mutex_exit(&so->so_lock);
2084 	/*
2085 	 * If there is a sin/sin6 appended onto the T_OK_ACK use
2086 	 * that to set the local address. If this is not present
2087 	 * then we zero out the address and don't set the
2088 	 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2089 	 * the pathname from the listening socket.
2090 	 * In the case where this is TCP or an AF_UNIX socket the
2091 	 * client side may have queued data or a T_ORDREL in the
2092 	 * transport. Having now sent the T_CONN_RES we may receive
2093 	 * those queued messages at any time. Hold the acceptor
2094 	 * so_lock until its state and laddr are finalized.
2095 	 */
2096 	mutex_enter(&nso->so_lock);
2097 	sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2098 	if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
2099 	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2100 		ack_mp->b_rptr += sizeof (struct T_ok_ack);
2101 		bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2102 		nsti->sti_laddr_len = sinlen;
2103 		nsti->sti_laddr_valid = 1;
2104 	} else if (nso->so_family == AF_UNIX) {
2105 		ASSERT(so->so_family == AF_UNIX);
2106 		nsti->sti_laddr_len = sti->sti_laddr_len;
2107 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2108 		bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2109 		    nsti->sti_laddr_len);
2110 		nsti->sti_laddr_valid = 1;
2111 	} else {
2112 		nsti->sti_laddr_len = sti->sti_laddr_len;
2113 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2114 		bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2115 		nsti->sti_laddr_sa->sa_family = nso->so_family;
2116 	}
2117 	nso->so_state |= SS_ISCONNECTED;
2118 	mutex_exit(&nso->so_lock);
2119 
2120 	freemsg(ack_mp);
2121 
2122 	mutex_enter(&so->so_lock);
2123 	so_unlock_single(so, SOLOCKED);
2124 	mutex_exit(&so->so_lock);
2125 
2126 	/*
2127 	 * Pass out new socket.
2128 	 */
2129 	if (nsop != NULL)
2130 		*nsop = nso;
2131 
2132 	return (0);
2133 
2134 
2135 eproto_disc_unl:
2136 	error = EPROTO;
2137 e_disc_unl:
2138 	eprintsoline(so, error);
2139 	goto disconnect_unlocked;
2140 
2141 pr_disc_vp_unl:
2142 	eprintsoline(so, error);
2143 disconnect_vp_unlocked:
2144 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2145 	VN_RELE(nvp);
2146 disconnect_unlocked:
2147 	(void) sodisconnect(so, SEQ_number, 0);
2148 	return (error);
2149 
2150 pr_disc_vp:
2151 	eprintsoline(so, error);
2152 disconnect_vp:
2153 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2154 	so_unlock_single(so, SOLOCKED);
2155 	mutex_exit(&so->so_lock);
2156 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2157 	VN_RELE(nvp);
2158 	return (error);
2159 
2160 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2161 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2162 	    ? EOPNOTSUPP : EINVAL;
2163 e_bad:
2164 	eprintsoline(so, error);
2165 	return (error);
2166 }
2167 
2168 /*
2169  * connect a socket.
2170  *
2171  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2172  * unconnect (by specifying a null address).
 *
 * Parameters:
 *	so	- socket to connect.
 *	name	- peer address. A name whose sa_family is AF_UNSPEC (with
 *		  namelen >= sizeof (sa_family_t)) is handled exactly like a
 *		  null address.
 *	namelen	- length of name in bytes.
 *	fflag	- passed to sowaitconnected(); forced to 0 on the datagram
 *		  path that sends a T_CONN_REQ.
 *	flags	- _SOCONNECT_* bits. _SOCONNECT_NOXLATE and _SOCONNECT_XPG4_2
 *		  are read here; _SOCONNECT_DID_BIND is set locally when this
 *		  call performs the bind.
 *	cr	- credentials handed to soallocproto(), sotpi_bind() and
 *		  sotpi_setsockopt().
 *
 * Returns 0 on success, otherwise an errno value. Errors generated directly
 * in this function are EINTR (an mblk allocation returned NULL), EOPNOTSUPP
 * (SS_ACCEPTCONN is set), EISCONN/EALREADY (connection-oriented socket that
 * is already connected/connecting) and EINVAL (null or zero-length name, or
 * a name length that fails the checks below). Other values come from the
 * helpers called here.
 *
 * Locking: so_lock is acquired here and SOLOCKED is set with
 * so_lock_single(). Both are dropped around the sotpi_setsockopt() calls,
 * and so_lock alone is dropped around kstrputmsg(). need_unlock records
 * whether SOLOCKED must be released at the done label.
2173  */
2174 int
2175 sotpi_connect(struct sonode *so,
2176 	struct sockaddr *name,
2177 	socklen_t namelen,
2178 	int fflag,
2179 	int flags,
2180 	struct cred *cr)
2181 {
2182 	struct T_conn_req	conn_req;
2183 	int			error = 0;
2184 	mblk_t			*mp;
2185 	void			*src;
2186 	socklen_t		srclen;
2187 	void			*addr;
2188 	socklen_t		addrlen;
2189 	boolean_t		need_unlock;
2190 	sotpi_info_t		*sti = SOTOTPI(so);
2191 
2192 	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2193 	    (void *)so, (void *)name, namelen, fflag, flags,
2194 	    pr_state(so->so_state, so->so_mode)));
2195 
2196 	/*
2197 	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2198 	 * avoid sleeping for memory with SOLOCKED held.
2199 	 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2200 	 * + sizeof (struct T_opthdr).
2201 	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
2202 	 * exceed sti_faddr_maxlen).
2203 	 */
2204 	mp = soallocproto(sizeof (struct T_conn_req) +
2205 	    2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2206 	    cr);
2207 	if (mp == NULL) {
2208 		/*
2209 		 * Connect can not fail with ENOBUFS. A signal was
2210 		 * caught so return EINTR.
2211 		 */
2212 		error = EINTR;
2213 		eprintsoline(so, error);
2214 		return (error);
2215 	}
2216 
2217 	mutex_enter(&so->so_lock);
2218 	/*
2219 	 * Make sure there is a preallocated T_unbind_req message
2220 	 * before any binding. This message is allocated when the
2221 	 * socket is created. Since another thread can consume
2222 	 * so_unbind_mp by the time we return from so_lock_single(),
2223 	 * we should check the availability of so_unbind_mp after
2224 	 * we return from so_lock_single().
2225 	 */
2226 
2227 	so_lock_single(so);	/* Set SOLOCKED */
	/* From here on every exit goes through done, which drops SOLOCKED. */
2228 	need_unlock = B_TRUE;
2229 
2230 	if (sti->sti_unbind_mp == NULL) {
2231 		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2232 		/* NOTE: holding so_lock while sleeping */
2233 		sti->sti_unbind_mp =
2234 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2235 		if (sti->sti_unbind_mp == NULL) {
2236 			error = EINTR;
2237 			goto done;
2238 		}
2239 	}
2240 
2241 	/*
2242 	 * Can't have done a listen before connecting.
2243 	 */
2244 	if (so->so_state & SS_ACCEPTCONN) {
2245 		error = EOPNOTSUPP;
2246 		goto done;
2247 	}
2248 
2249 	/*
2250 	 * Must be bound with the transport
2251 	 */
2252 	if (!(so->so_state & SS_ISBOUND)) {
2253 		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2254 		    /*CONSTCOND*/
2255 		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2256 			/*
2257 			 * Optimization for AF_INET{,6} transports
2258 			 * that can handle a T_CONN_REQ without being bound.
2259 			 */
2260 			so_automatic_bind(so);
2261 		} else {
2262 			error = sotpi_bind(so, NULL, 0,
2263 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2264 			if (error)
2265 				goto done;
2266 		}
2267 		ASSERT(so->so_state & SS_ISBOUND);
		/*
		 * Remember that this call did the bind so that the fatal
		 * error path at done can undo it with sotpi_unbind().
		 */
2268 		flags |= _SOCONNECT_DID_BIND;
2269 	}
2270 
2271 	/*
2272 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
2273 	 * connect to a null address. This is the portable method to
2274 	 * unconnect a socket.
2275 	 */
2276 	if ((namelen >= sizeof (sa_family_t)) &&
2277 	    (name->sa_family == AF_UNSPEC)) {
2278 		name = NULL;
2279 		namelen = 0;
2280 	}
2281 
2282 	/*
2283 	 * Check that we are not already connected.
2284 	 * A connection-oriented socket cannot be reconnected.
2285 	 * A connected connection-less socket can be
2286 	 * - connected to a different address by a subsequent connect
2287 	 * - "unconnected" by a connect to the NULL address
2288 	 */
2289 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2290 		ASSERT(!(flags & _SOCONNECT_DID_BIND));
2291 		if (so->so_mode & SM_CONNREQUIRED) {
2292 			/* Connection-oriented socket */
2293 			error = so->so_state & SS_ISCONNECTED ?
2294 			    EISCONN : EALREADY;
2295 			goto done;
2296 		}
2297 		/* Connection-less socket */
2298 		if (name == NULL) {
2299 			/*
2300 			 * Remove the connected state and clear SO_DGRAM_ERRIND
2301 			 * since it was set when the socket was connected.
2302 			 * If this is UDP also send down a T_DISCON_REQ.
2303 			 */
2304 			int val;
2305 
2306 			if ((so->so_family == AF_INET ||
2307 			    so->so_family == AF_INET6) &&
2308 			    (so->so_type == SOCK_DGRAM ||
2309 			    so->so_type == SOCK_RAW) &&
2310 			    /*CONSTCOND*/
2311 			    !soconnect_tpi_udp) {
2312 				/* XXX What about implicitly unbinding here? */
2313 				error = sodisconnect(so, -1,
2314 				    _SODISCONNECT_LOCK_HELD);
2315 			} else {
2316 				so->so_state &=
2317 				    ~(SS_ISCONNECTED | SS_ISCONNECTING);
2318 				sti->sti_faddr_valid = 0;
2319 				sti->sti_faddr_len = 0;
2320 			}
2321 
2322 			/* Remove SOLOCKED since setsockopt will grab it */
2323 			so_unlock_single(so, SOLOCKED);
2324 			mutex_exit(&so->so_lock);
2325 
2326 			val = 0;
2327 			(void) sotpi_setsockopt(so, SOL_SOCKET,
2328 			    SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2329 			    cr);
2330 
2331 			mutex_enter(&so->so_lock);
2332 			so_lock_single(so);	/* Set SOLOCKED */
2333 			goto done;
2334 		}
2335 	}
2336 	ASSERT(so->so_state & SS_ISBOUND);
2337 
2338 	if (name == NULL || namelen == 0) {
2339 		error = EINVAL;
2340 		goto done;
2341 	}
2342 	/*
2343 	 * Mark the socket if sti_faddr_sa represents the transport level
2344 	 * address.
2345 	 */
2346 	if (flags & _SOCONNECT_NOXLATE) {
2347 		struct sockaddr_ux	*soaddr_ux;
2348 
2349 		ASSERT(so->so_family == AF_UNIX);
2350 		if (namelen != sizeof (struct sockaddr_ux)) {
2351 			error = EINVAL;
2352 			goto done;
2353 		}
		/* Continue with the embedded sou_addr as the name. */
2354 		soaddr_ux = (struct sockaddr_ux *)name;
2355 		name = (struct sockaddr *)&soaddr_ux->sou_addr;
2356 		namelen = sizeof (soaddr_ux->sou_addr);
2357 		sti->sti_faddr_noxlate = 1;
2358 	}
2359 
2360 	/*
2361 	 * Length and family checks.
2362 	 */
2363 	error = so_addr_verify(so, name, namelen);
2364 	if (error)
2365 		goto bad;
2366 
2367 	/*
2368 	 * Save foreign address. Needed for AF_UNIX as well as
2369 	 * transport providers that do not support TI_GETPEERNAME.
2370 	 * Also used for cached foreign address for TCP and UDP.
2371 	 */
2372 	if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2373 		error = EINVAL;
2374 		goto done;
2375 	}
2376 	sti->sti_faddr_len = (socklen_t)namelen;
2377 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2378 	bcopy(name, sti->sti_faddr_sa, namelen);
2379 	sti->sti_faddr_valid = 1;
2380 
	/*
	 * Choose the destination (addr/addrlen) and the optional source
	 * address (src/srclen) that go into the T_CONN_REQ built below.
	 */
2381 	if (so->so_family == AF_UNIX) {
2382 		if (sti->sti_faddr_noxlate) {
2383 			/*
2384 			 * Already have a transport internal address. Do not
2385 			 * pass any (transport internal) source address.
2386 			 */
2387 			addr = sti->sti_faddr_sa;
2388 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2389 			src = NULL;
2390 			srclen = 0;
2391 		} else {
2392 			/*
2393 			 * Pass the sockaddr_un source address as an option
2394 			 * and translate the remote address.
2395 			 * Holding so_lock thus sti_laddr_sa can not change.
2396 			 */
2397 			src = sti->sti_laddr_sa;
2398 			srclen = (t_uscalar_t)sti->sti_laddr_len;
2399 			dprintso(so, 1,
2400 			    ("sotpi_connect UNIX: srclen %d, src %p\n",
2401 			    srclen, src));
2402 			error = so_ux_addr_xlate(so,
2403 			    sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2404 			    (flags & _SOCONNECT_XPG4_2),
2405 			    &addr, &addrlen);
2406 			if (error)
2407 				goto bad;
2408 		}
2409 	} else {
2410 		addr = sti->sti_faddr_sa;
2411 		addrlen = (t_uscalar_t)sti->sti_faddr_len;
2412 		src = NULL;
2413 		srclen = 0;
2414 	}
2415 	/*
2416 	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2417 	 * option which asks the transport provider to send T_UDERR_IND
2418 	 * messages. These T_UDERR_IND messages are used to return connected
2419 	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2420 	 *
2421 	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2422 	 * we send down a T_CONN_REQ. This is needed to let the
2423 	 * transport assign a local address that is consistent with
2424 	 * the remote address. Applications depend on a getsockname()
2425 	 * after a connect() to retrieve the "source" IP address for
2426 	 * the connected socket.  Invalidate the cached local address
2427 	 * to force getsockname() to enquire of the transport.
2428 	 */
2429 	if (!(so->so_mode & SM_CONNREQUIRED)) {
2430 		/*
2431 		 * Datagram socket.
2432 		 */
2433 		int32_t val;
2434 
		/* Drop SOLOCKED and so_lock around sotpi_setsockopt(). */
2435 		so_unlock_single(so, SOLOCKED);
2436 		mutex_exit(&so->so_lock);
2437 
2438 		val = 1;
2439 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2440 		    &val, (t_uscalar_t)sizeof (val), cr);
2441 
2442 		mutex_enter(&so->so_lock);
2443 		so_lock_single(so);	/* Set SOLOCKED */
		/*
		 * Everything except AF_INET{,6} SOCK_DGRAM/SOCK_RAW (or
		 * when soconnect_tpi_udp is set) is marked connected here
		 * without sending a T_CONN_REQ.
		 */
2444 		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2445 		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2446 		    soconnect_tpi_udp) {
2447 			soisconnected(so);
2448 			goto done;
2449 		}
2450 		/*
2451 		 * Send down T_CONN_REQ etc.
2452 		 * Clear fflag to avoid returning EWOULDBLOCK.
2453 		 */
2454 		fflag = 0;
2455 		ASSERT(so->so_family != AF_UNIX);
2456 		sti->sti_laddr_valid = 0;
2457 	} else if (sti->sti_laddr_len != 0) {
2458 		/*
2459 		 * If the local address or port was "any" then it may be
2460 		 * changed by the transport as a result of the
2461 		 * connect.  Invalidate the cached version if we have one.
2462 		 */
2463 		switch (so->so_family) {
2464 		case AF_INET:
2465 			ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2466 			if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2467 			    INADDR_ANY ||
2468 			    ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2469 				sti->sti_laddr_valid = 0;
2470 			break;
2471 
2472 		case AF_INET6:
2473 			ASSERT(sti->sti_laddr_len ==
2474 			    (socklen_t)sizeof (sin6_t));
2475 			if (IN6_IS_ADDR_UNSPECIFIED(
2476 			    &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2477 			    IN6_IS_ADDR_V4MAPPED_ANY(
2478 			    &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2479 			    ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2480 				sti->sti_laddr_valid = 0;
2481 			break;
2482 
2483 		default:
2484 			break;
2485 		}
2486 	}
2487 
2488 	/*
2489 	 * Check for failure of an earlier call
2490 	 */
2491 	if (so->so_error != 0)
2492 		goto so_bad;
2493 
2494 	/*
2495 	 * Send down T_CONN_REQ. Message was allocated above.
2496 	 */
2497 	conn_req.PRIM_type = T_CONN_REQ;
2498 	conn_req.DEST_length = addrlen;
2499 	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2500 	if (srclen == 0) {
2501 		conn_req.OPT_length = 0;
2502 		conn_req.OPT_offset = 0;
2503 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2504 		soappendmsg(mp, addr, addrlen);
2505 	} else {
2506 		/*
2507 		 * There is a AF_UNIX sockaddr_un to include as a source
2508 		 * address option.
2509 		 */
2510 		struct T_opthdr toh;
2511 
2512 		toh.level = SOL_SOCKET;
2513 		toh.name = SO_SRCADDR;
2514 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2515 		toh.status = 0;
2516 		conn_req.OPT_length =
2517 		    (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2518 		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2519 		    _TPI_ALIGN_TOPT(addrlen));
2520 
		/*
		 * Layout: conn_req, address, pad up to OPT_offset,
		 * option header, source address, trailing pad.
		 */
2521 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2522 		soappendmsg(mp, addr, addrlen);
2523 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2524 		soappendmsg(mp, &toh, sizeof (toh));
2525 		soappendmsg(mp, src, srclen);
2526 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2527 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2528 	}
2529 	/*
2530 	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2531 	 * in order to have the right state when the T_CONN_CON shows up.
2532 	 */
2533 	soisconnecting(so);
2534 	mutex_exit(&so->so_lock);
2535 
2536 	if (AU_AUDITING())
2537 		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2538 
2539 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2540 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
	/*
	 * Clear mp so the freemsg() at done is not applied to the message
	 * that was passed to kstrputmsg() (presumably it is consumed there
	 * even on error -- confirm against kstrputmsg()).
	 */
2541 	mp = NULL;
2542 	mutex_enter(&so->so_lock);
2543 	if (error != 0)
2544 		goto bad;
2545 
2546 	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2547 		goto bad;
2548 
2549 	/* Allow other threads to access the socket */
2550 	so_unlock_single(so, SOLOCKED);
2551 	need_unlock = B_FALSE;
2552 
2553 	/*
2554 	 * Wait until we get a T_CONN_CON or an error
2555 	 */
2556 	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
		/* Retake SOLOCKED for the error handling at done. */
2557 		so_lock_single(so);	/* Set SOLOCKED */
2558 		need_unlock = B_TRUE;
2559 	}
2560 
	/*
	 * Common exit: so_lock is held here; SOLOCKED is held iff
	 * need_unlock is B_TRUE.
	 */
2561 done:
2562 	freemsg(mp);
2563 	switch (error) {
2564 	case EINPROGRESS:
2565 	case EALREADY:
2566 	case EISCONN:
2567 	case EINTR:
2568 		/* Non-fatal errors */
2569 		sti->sti_laddr_valid = 0;
2570 		/* FALLTHRU */
2571 	case 0:
2572 		break;
2573 	default:
2574 		ASSERT(need_unlock);
2575 		/*
2576 		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2577 		 * and invalidate local-address cache
2578 		 */
2579 		so->so_state &= ~SS_ISCONNECTING;
2580 		sti->sti_laddr_valid = 0;
2581 		/* A discon_ind might have already unbound us */
2582 		if ((flags & _SOCONNECT_DID_BIND) &&
2583 		    (so->so_state & SS_ISBOUND)) {
2584 			int err;
2585 
2586 			err = sotpi_unbind(so, 0);
2587 			/* LINTED - statement has no conseq */
2588 			if (err) {
2589 				eprintsoline(so, err);
2590 			}
2591 		}
2592 		break;
2593 	}
2594 	if (need_unlock)
2595 		so_unlock_single(so, SOLOCKED);
2596 	mutex_exit(&so->so_lock);
2597 	return (error);
2598 
	/*
	 * so_bad fetches the error with sogeterr() and falls into bad;
	 * bad logs the error and rejoins the common exit at done.
	 */
2599 so_bad:	error = sogeterr(so, B_TRUE);
2600 bad:	eprintsoline(so, error);
2601 	goto done;
2602 }
2603 
/*
 * Shut down one or both directions of a connected socket.
 *
 * Parameters:
 *	so	- socket to shut down.
 *	how	- 0: no more receives (socantrcvmore()),
 *		  1: no more sends (socantsendmore()),
 *		  2: both. Any other value yields EINVAL.
 *	cr	- credentials passed to soallocproto1() when a T_ORDREL_REQ
 *		  is built.
 *
 * Returns 0 on success, otherwise an errno value: ENOTCONN when the socket
 * is not connected (unless xnet_skip_checks is set, in which case 0 is
 * returned), EINVAL for an unknown how, or an error from sodisconnect() or
 * kstrputmsg().
 *
 * Locking: takes so_lock and sets SOLOCKED on entry and releases both at
 * done. so_lock (but not SOLOCKED) is dropped around the stream head calls.
 */
2604 /* ARGSUSED */
2605 int
2606 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2607 {
2608 	struct T_ordrel_req	ordrel_req;
2609 	mblk_t			*mp;
2610 	uint_t			old_state, state_change;
2611 	int			error = 0;
2612 	sotpi_info_t		*sti = SOTOTPI(so);
2613 
2614 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2615 	    (void *)so, how, pr_state(so->so_state, so->so_mode)));
2616 
2617 	mutex_enter(&so->so_lock);
2618 	so_lock_single(so);	/* Set SOLOCKED */
2619 
2620 	/*
2621 	 * SunOS 4.X has no check for datagram sockets.
2622 	 * 5.X checks that it is connected (ENOTCONN)
2623 	 * X/Open requires that we check the connected state.
2624 	 */
2625 	if (!(so->so_state & SS_ISCONNECTED)) {
2626 		if (!xnet_skip_checks) {
2627 			error = ENOTCONN;
2628 			if (xnet_check_print) {
2629 				printf("sockfs: X/Open shutdown check "
2630 				    "caused ENOTCONN\n");
2631 			}
2632 		}
		/* error is still 0 here when xnet_skip_checks is set. */
2633 		goto done;
2634 	}
2635 	/*
2636 	 * Record the current state and then perform any state changes.
2637 	 * Then use the difference between the old and new states to
2638 	 * determine which messages need to be sent.
2639 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2640 	 * duplicate calls to shutdown().
2641 	 */
2642 	old_state = so->so_state;
2643 
2644 	switch (how) {
2645 	case 0:
2646 		socantrcvmore(so);
2647 		break;
2648 	case 1:
2649 		socantsendmore(so);
2650 		break;
2651 	case 2:
2652 		socantsendmore(so);
2653 		socantrcvmore(so);
2654 		break;
2655 	default:
2656 		error = EINVAL;
2657 		goto done;
2658 	}
2659 
2660 	/*
2661 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
	 * Under that assumption the subtraction below yields exactly the
	 * SS_CANT* bits that this call newly set.
2662 	 */
2663 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2664 	    (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2665 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2666 
	/*
	 * Update the stream head for the direction(s) that changed. Each
	 * case drops so_lock for the stream head calls and holds it again
	 * when it breaks out of the switch.
	 */
2667 	switch (state_change) {
2668 	case 0:
2669 		dprintso(so, 1,
2670 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2671 		    so->so_state));
2672 		goto done;
2673 
2674 	case SS_CANTRCVMORE:
2675 		mutex_exit(&so->so_lock);
2676 		strseteof(SOTOV(so), 1);
2677 		/*
2678 		 * strseteof takes care of read side wakeups,
2679 		 * pollwakeups, and signals.
2680 		 */
2681 		/*
2682 		 * Get the read lock before flushing data to avoid problems
2683 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2684 		 */
2685 		mutex_enter(&so->so_lock);
2686 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2687 		mutex_exit(&so->so_lock);
2688 
2689 		/* Flush read side queue */
2690 		strflushrq(SOTOV(so), FLUSHALL);
2691 
2692 		mutex_enter(&so->so_lock);
2693 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2694 		break;
2695 
2696 	case SS_CANTSENDMORE:
2697 		mutex_exit(&so->so_lock);
2698 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2699 		mutex_enter(&so->so_lock);
2700 		break;
2701 
2702 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
		/* Write side handling followed by the same read side steps. */
2703 		mutex_exit(&so->so_lock);
2704 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2705 		strseteof(SOTOV(so), 1);
2706 		/*
2707 		 * strseteof takes care of read side wakeups,
2708 		 * pollwakeups, and signals.
2709 		 */
2710 		/*
2711 		 * Get the read lock before flushing data to avoid problems
2712 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2713 		 */
2714 		mutex_enter(&so->so_lock);
2715 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2716 		mutex_exit(&so->so_lock);
2717 
2718 		/* Flush read side queue */
2719 		strflushrq(SOTOV(so), FLUSHALL);
2720 
2721 		mutex_enter(&so->so_lock);
2722 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2723 		break;
2724 	}
2725 
2726 	ASSERT(MUTEX_HELD(&so->so_lock));
2727 
2728 	/*
2729 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2730 	 * was set due to this call and the new state has both of them set:
2731 	 *	Send the AF_UNIX close indication
2732 	 *	For T_COTS send a discon_ind
2733 	 *
2734 	 * If cantsend was set due to this call:
2735 	 *	For T_COTSORD send an ordrel_ind
2736 	 *
2737 	 * Note that for T_CLTS there is no message sent here.
2738 	 */
2739 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2740 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2741 		/*
2742 		 * For SunOS 4.X compatibility we tell the other end
2743 		 * that we are unable to receive at this point.
2744 		 */
2745 		if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2746 			so_unix_close(so);
2747 
2748 		if (sti->sti_serv_type == T_COTS)
2749 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2750 	}
2751 	if ((state_change & SS_CANTSENDMORE) &&
2752 	    (sti->sti_serv_type == T_COTS_ORD)) {
2753 		/* Send an orderly release */
2754 		ordrel_req.PRIM_type = T_ORDREL_REQ;
2755 
		/* so_lock is dropped for the allocation and the putmsg. */
2756 		mutex_exit(&so->so_lock);
2757 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2758 		    0, _ALLOC_SLEEP, cr);
2759 		/*
2760 		 * Send down the T_ORDREL_REQ even if there is flow control.
2761 		 * This prevents shutdown from blocking.
2762 		 * Note that there is no T_OK_ACK for ordrel_req.
2763 		 */
2764 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2765 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2766 		mutex_enter(&so->so_lock);
2767 		if (error) {
2768 			eprintsoline(so, error);
2769 			goto done;
2770 		}
2771 	}
2772 
	/* Common exit: so_lock and SOLOCKED are both held here. */
2773 done:
2774 	so_unlock_single(so, SOLOCKED);
2775 	mutex_exit(&so->so_lock);
2776 	return (error);
2777 }
2778 
2779 /*
2780  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2781  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2782  * that we have closed.
2783  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2784  * T_UNITDATA_REQ containing the same option.
2785  *
2786  * For SOCK_DGRAM half-connections (somebody connected to this end
2787  * but this end is not connected) we don't know where to send any
2788  * SO_UNIX_CLOSE.
2789  *
2790  * We have to ignore stream head errors just in case there has been
2791  * a shutdown(output).
2792  * Ignore any flow control to try to get the message more quickly to the peer.
2793  * While locally ignoring flow control solves the problem when there
2794  * is only the loopback transport on the stream it would not provide
2795  * the correct AF_UNIX socket semantics when one or more modules have
2796  * been pushed.
 *
 * Locking: the caller must hold so_lock (asserted below). The lock is
 * dropped around kstrputmsg() and reacquired before returning on that path.
 *
 * Nothing is sent, and the function returns silently, when the socket is
 * not both SS_ISCONNECTED and SS_ISBOUND, or when address verification or
 * translation fails on the datagram path. No status is returned to the
 * caller in any case.
2797  */
2798 void
2799 so_unix_close(struct sonode *so)
2800 {
2801 	int		error;
2802 	struct T_opthdr	toh;
2803 	mblk_t		*mp;
2804 	sotpi_info_t	*sti = SOTOTPI(so);
2805 
2806 	ASSERT(MUTEX_HELD(&so->so_lock));
2807 
2808 	ASSERT(so->so_family == AF_UNIX);
2809 
2810 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2811 	    (SS_ISCONNECTED|SS_ISBOUND))
2812 		return;
2813 
2814 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2815 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2816 
	/* The SO_UNIX_CLOSE option header; it carries no option value. */
2817 	toh.level = SOL_SOCKET;
2818 	toh.name = SO_UNIX_CLOSE;
2819 
2820 	/* zero length + header */
2821 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2822 	toh.status = 0;
2823 
2824 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2825 		struct T_optdata_req tdr;
2826 
2827 		tdr.PRIM_type = T_OPTDATA_REQ;
2828 		tdr.DATA_flag = 0;
2829 
		/* The option header immediately follows the T_optdata_req. */
2830 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2831 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2832 
2833 		/* NOTE: holding so_lock while sleeping */
2834 		mp = soallocproto2(&tdr, sizeof (tdr),
2835 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2836 	} else {
2837 		struct T_unitdata_req	tudr;
2838 		void			*addr;
2839 		socklen_t		addrlen;
2840 		void			*src;
2841 		socklen_t		srclen;
2842 		struct T_opthdr		toh2;
2843 		t_scalar_t		size;
2844 
2845 		/* Connected DGRAM socket */
2846 
2847 		/*
2848 		 * For AF_UNIX the destination address is translated to
2849 		 * an internal name and the source address is passed as
2850 		 * an option.
2851 		 */
2852 		/*
2853 		 * Length and family checks.
2854 		 */
2855 		error = so_addr_verify(so, sti->sti_faddr_sa,
2856 		    (t_uscalar_t)sti->sti_faddr_len);
2857 		if (error) {
2858 			eprintsoline(so, error);
2859 			return;
2860 		}
2861 		if (sti->sti_faddr_noxlate) {
2862 			/*
2863 			 * Already have a transport internal address. Do not
2864 			 * pass any (transport internal) source address.
2865 			 */
2866 			addr = sti->sti_faddr_sa;
2867 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2868 			src = NULL;
2869 			srclen = 0;
2870 		} else {
2871 			/*
2872 			 * Pass the sockaddr_un source address as an option
2873 			 * and translate the remote address.
2874 			 * Holding so_lock thus sti_laddr_sa can not change.
2875 			 */
2876 			src = sti->sti_laddr_sa;
2877 			srclen = (socklen_t)sti->sti_laddr_len;
2878 			dprintso(so, 1,
2879 			    ("so_ux_close: srclen %d, src %p\n",
2880 			    srclen, src));
2881 			error = so_ux_addr_xlate(so,
2882 			    sti->sti_faddr_sa,
2883 			    (socklen_t)sti->sti_faddr_len, 0,
2884 			    &addr, &addrlen);
2885 			if (error) {
2886 				eprintsoline(so, error);
2887 				return;
2888 			}
2889 		}
2890 		tudr.PRIM_type = T_UNITDATA_REQ;
2891 		tudr.DEST_length = addrlen;
2892 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2893 		if (srclen == 0) {
			/* Only the SO_UNIX_CLOSE option follows the address. */
2894 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2895 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2896 			    _TPI_ALIGN_TOPT(addrlen));
2897 
2898 			size = tudr.OPT_offset + tudr.OPT_length;
2899 			/* NOTE: holding so_lock while sleeping */
2900 			mp = soallocproto2(&tudr, sizeof (tudr),
2901 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
			/* Skip the pad so the option starts at OPT_offset. */
2902 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2903 			soappendmsg(mp, &toh, sizeof (toh));
2904 		} else {
2905 			/*
2906 			 * There is a AF_UNIX sockaddr_un to include as a
2907 			 * source address option.
2908 			 */
2909 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2910 			    _TPI_ALIGN_TOPT(srclen));
2911 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2912 			    _TPI_ALIGN_TOPT(addrlen));
2913 
2914 			toh2.level = SOL_SOCKET;
2915 			toh2.name = SO_SRCADDR;
2916 			toh2.len = (t_uscalar_t)(srclen +
2917 			    sizeof (struct T_opthdr));
2918 			toh2.status = 0;
2919 
2920 			size = tudr.OPT_offset + tudr.OPT_length;
2921 
2922 			/* NOTE: holding so_lock while sleeping */
2923 			mp = soallocproto2(&tudr, sizeof (tudr),
2924 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
			/*
			 * Layout after the address: pad up to OPT_offset,
			 * SO_UNIX_CLOSE header, SO_SRCADDR header, source
			 * address, trailing pad.
			 */
2925 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2926 			soappendmsg(mp, &toh, sizeof (toh));
2927 			soappendmsg(mp, &toh2, sizeof (toh2));
2928 			soappendmsg(mp, src, srclen);
2929 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2930 		}
2931 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2932 	}
	/*
	 * Send the message with so_lock dropped. The result is stored in
	 * error but is not examined or reported afterwards.
	 */
2933 	mutex_exit(&so->so_lock);
2934 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2935 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2936 	mutex_enter(&so->so_lock);
2937 }
2938 
2939 /*
2940  * Called by sotpi_recvmsg when reading a non-zero amount of data.
2941  * In addition, the caller typically verifies that there is some
2942  * potential state to clear by checking
2943  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2944  * before calling this routine.
2945  * Note that such a check can be made without holding so_lock since
2946  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2947  * decrements sti_oobsigcnt.
2948  *
2949  * When data is read *after* the point that all pending
2950  * oob data has been consumed the oob indication is cleared.
2951  *
2952  * This logic keeps select/poll returning POLLRDBAND and
2953  * SIOCATMARK returning true until we have read past
2954  * the mark.
 *
 * Locking: acquires and releases so_lock itself, so the caller must not
 * hold it on entry.
 *
 * Effect: when sti_oobsigcnt is zero, clears SS_OOBPEND, SS_HAVEOOBDATA and
 * SS_RCVATMARK from so_state and frees so_oobmsg; otherwise nothing is
 * changed.
2955  */
2956 static void
2957 sorecv_update_oobstate(struct sonode *so)
2958 {
2959 	sotpi_info_t *sti = SOTOTPI(so);
2960 
2961 	mutex_enter(&so->so_lock);
	/* The oob state is checked for consistency before and after. */
2962 	ASSERT(so_verify_oobstate(so));
2963 	dprintso(so, 1,
2964 	    ("sorecv_update_oobstate: counts %d/%d state %s\n",
2965 	    sti->sti_oobsigcnt,
2966 	    sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
2967 	if (sti->sti_oobsigcnt == 0) {
2968 		/* No more pending oob indications */
2969 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2970 		freemsg(so->so_oobmsg);
		/* Do not leave a pointer to the freed message behind. */
2971 		so->so_oobmsg = NULL;
2972 	}
2973 	ASSERT(so_verify_oobstate(so));
2974 	mutex_exit(&so->so_lock);
2975 }
2976 
2977 /*
2978  * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
2979  */
2980 static int
2981 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
2982 {
2983 	sotpi_info_t *sti = SOTOTPI(so);
2984 	int	error = 0;
2985 	mblk_t *tmp = NULL;
2986 	mblk_t *pmp = NULL;
2987 	mblk_t *nmp = sti->sti_nl7c_rcv_mp;
2988 
2989 	ASSERT(nmp != NULL);
2990 
2991 	while (nmp != NULL && uiop->uio_resid > 0) {
2992 		ssize_t n;
2993 
2994 		if (DB_TYPE(nmp) == M_DATA) {
2995 			/*
2996 			 * We have some data, uiomove up to resid bytes.
2997 			 */
2998 			n = MIN(MBLKL(nmp), uiop->uio_resid);
2999 			if (n > 0)
3000 				error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
3001 			nmp->b_rptr += n;
3002 			if (nmp->b_rptr == nmp->b_wptr) {
3003 				pmp = nmp;
3004 				nmp = nmp->b_cont;
3005 			}
3006 			if (error)
3007 				break;
3008 		} else {
3009 			/*
3010 			 * We only handle data, save for caller to handle.
3011 			 */
3012 			if (pmp != NULL) {
3013 				pmp->b_cont = nmp->b_cont;
3014 			}
3015 			nmp->b_cont = NULL;
3016 			if (*rmp == NULL) {
3017 				*rmp = nmp;
3018 			} else {
3019 				tmp->b_cont = nmp;
3020 			}
3021 			nmp = nmp->b_cont;
3022 			tmp = nmp;
3023 		}
3024 	}
3025 	if (pmp != NULL) {
3026 		/* Free any mblk_t(s) which we have consumed */
3027 		pmp->b_cont = NULL;
3028 		freemsg(sti->sti_nl7c_rcv_mp);
3029 	}
3030 	if ((sti->sti_nl7c_rcv_mp = nmp) == NULL) {
3031 		/* Last mblk_t so return the saved kstrgetmsg() rval/error */
3032 		if (error == 0) {
3033 			rval_t	*p = (rval_t *)&sti->sti_nl7c_rcv_rval;
3034 
3035 			error = p->r_v.r_v2;
3036 			p->r_v.r_v2 = 0;
3037 		}
3038 		rp->r_vals = sti->sti_nl7c_rcv_rval;
3039 		sti->sti_nl7c_rcv_rval = 0;
3040 	} else {
3041 		/* More mblk_t(s) to process so no rval to return */
3042 		rp->r_vals = 0;
3043 	}
3044 	return (error);
3045 }
3046 /*
3047  * Receive the next message on the queue.
3048  * If msg_controllen is non-zero when called the caller is interested in
3049  * any received control info (options).
3050  * If msg_namelen is non-zero when called the caller is interested in
3051  * any received source address.
3052  * The routine returns with msg_control and msg_name pointing to
3053  * kmem_alloc'ed memory which the caller has to free.
3054  */
3055 /* ARGSUSED */
3056 int
3057 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
3058     struct cred *cr)
3059 {
3060 	union T_primitives	*tpr;
3061 	mblk_t			*mp;
3062 	uchar_t			pri;
3063 	int			pflag, opflag;
3064 	void			*control;
3065 	t_uscalar_t		controllen;
3066 	t_uscalar_t		namelen;
3067 	int			so_state = so->so_state; /* Snapshot */
3068 	ssize_t			saved_resid;
3069 	rval_t			rval;
3070 	int			flags;
3071 	clock_t			timout;
3072 	int			error = 0;
3073 	sotpi_info_t		*sti = SOTOTPI(so);
3074 
3075 	flags = msg->msg_flags;
3076 	msg->msg_flags = 0;
3077 
3078 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
3079 	    (void *)so, (void *)msg, flags,
3080 	    pr_state(so->so_state, so->so_mode), so->so_error));
3081 
3082 	if (so->so_version == SOV_STREAM) {
3083 		so_update_attrs(so, SOACC);
3084 		/* The imaginary "sockmod" has been popped - act as a stream */
3085 		return (strread(SOTOV(so), uiop, cr));
3086 	}
3087 
3088 	/*
3089 	 * If we are not connected because we have never been connected
3090 	 * we return ENOTCONN. If we have been connected (but are no longer
3091 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
3092 	 * the EOF.
3093 	 *
3094 	 * An alternative would be to post an ENOTCONN error in stream head
3095 	 * (read+write) and clear it when we're connected. However, that error
3096 	 * would cause incorrect poll/select behavior!
3097 	 */
3098 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
3099 	    (so->so_mode & SM_CONNREQUIRED)) {
3100 		return (ENOTCONN);
3101 	}
3102 
3103 	/*
3104 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
3105 	 * after checking that the read queue is empty) and returns zero.
3106 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
3107 	 * is zero.
3108 	 */
3109 
3110 	if (flags & MSG_OOB) {
3111 		/* Check that the transport supports OOB */
3112 		if (!(so->so_mode & SM_EXDATA))
3113 			return (EOPNOTSUPP);
3114 		so_update_attrs(so, SOACC);
3115 		return (sorecvoob(so, msg, uiop, flags,
3116 		    (so->so_options & SO_OOBINLINE)));
3117 	}
3118 
3119 	so_update_attrs(so, SOACC);
3120 
3121 	/*
3122 	 * Set msg_controllen and msg_namelen to zero here to make it
3123 	 * simpler in the cases that no control or name is returned.
3124 	 */
3125 	controllen = msg->msg_controllen;
3126 	namelen = msg->msg_namelen;
3127 	msg->msg_controllen = 0;
3128 	msg->msg_namelen = 0;
3129 
3130 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
3131 	    namelen, controllen));
3132 
3133 	mutex_enter(&so->so_lock);
3134 	/*
3135 	 * If an NL7C enabled socket and not waiting for write data.
3136 	 */
3137 	if ((sti->sti_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
3138 	    NL7C_ENABLED) {
3139 		if (sti->sti_nl7c_uri) {
3140 			/* Close uri processing for a previous request */
3141 			nl7c_close(so);
3142 		}
3143 		if ((so_state & SS_CANTRCVMORE) &&
3144 		    sti->sti_nl7c_rcv_mp == NULL) {
3145 			/* Nothing to process, EOF */
3146 			mutex_exit(&so->so_lock);
3147 			return (0);
3148 		} else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
3149 			/* Persistent NL7C socket, try to process request */
3150 			boolean_t ret;
3151 
3152 			ret = nl7c_process(so,
3153 			    (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
3154 			rval.r_vals = sti->sti_nl7c_rcv_rval;
3155 			error = rval.r_v.r_v2;
3156 			if (error) {
3157 				/* Error of some sort, return it */
3158 				mutex_exit(&so->so_lock);
3159 				return (error);
3160 			}
3161 			if (sti->sti_nl7c_flags &&
3162 			    ! (sti->sti_nl7c_flags & NL7C_WAITWRITE)) {
3163 				/*
3164 				 * Still an NL7C socket and no data
3165 				 * to pass up to the caller.
3166 				 */
3167 				mutex_exit(&so->so_lock);
3168 				if (ret) {
3169 					/* EOF */
3170 					return (0);
3171 				} else {
3172 					/* Need more data */
3173 					return (EAGAIN);
3174 				}
3175 			}
3176 		} else {
3177 			/*
3178 			 * Not persistent so no further NL7C processing.
3179 			 */
3180 			sti->sti_nl7c_flags = 0;
3181 		}
3182 	}
3183 	/*
3184 	 * Only one reader is allowed at any given time. This is needed
3185 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
3186 	 *
3187 	 * This is slightly different that BSD behavior in that it fails with
3188 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3189 	 * is single-threaded using sblock(), which is dropped while waiting
3190 	 * for data to appear. The difference shows up e.g. if one
3191 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3192 	 * does use nonblocking io and different threads are reading each
3193 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3194 	 * in this case as long as the read queue doesn't get empty.
3195 	 * In this implementation the thread using nonblocking io can
3196 	 * get an EWOULDBLOCK error due to the blocking thread executing
3197 	 * e.g. in the uiomove in kstrgetmsg.
3198 	 * This difference is not believed to be significant.
3199 	 */
3200 	/* Set SOREADLOCKED */
3201 	error = so_lock_read_intr(so,
3202 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3203 	mutex_exit(&so->so_lock);
3204 	if (error)
3205 		return (error);
3206 
3207 	/*
3208 	 * Tell kstrgetmsg to not inspect the stream head errors until all
3209 	 * queued data has been consumed.
3210 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3211 	 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3212 	 *
3213 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3214 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3215 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3216 	 */
3217 	pflag = MSG_ANY | MSG_DELAYERROR;
3218 	if (flags & MSG_PEEK) {
3219 		pflag |= MSG_IPEEK;
3220 		flags &= ~MSG_WAITALL;
3221 	}
3222 	if (so->so_mode & SM_ATOMIC)
3223 		pflag |= MSG_DISCARDTAIL;
3224 
3225 	if (flags & MSG_DONTWAIT)
3226 		timout = 0;
3227 	else if (so->so_rcvtimeo != 0)
3228 		timout = TICK_TO_MSEC(so->so_rcvtimeo);
3229 	else
3230 		timout = -1;
3231 	opflag = pflag;
3232 retry:
3233 	saved_resid = uiop->uio_resid;
3234 	pri = 0;
3235 	mp = NULL;
3236 	if (sti->sti_nl7c_rcv_mp != NULL) {
3237 		/* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3238 		error = nl7c_sorecv(so, &mp, uiop, &rval);
3239 	} else {
3240 		error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3241 		    timout, &rval);
3242 	}
3243 	if (error != 0) {
3244 		/* kstrgetmsg returns ETIME when timeout expires */
3245 		if (error == ETIME)
3246 			error = EWOULDBLOCK;
3247 		goto out;
3248 	}
3249 	/*
3250 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3251 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
3252 	 */
3253 	ASSERT(!(rval.r_val1 & MORECTL));
3254 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3255 		msg->msg_flags |= MSG_TRUNC;
3256 
3257 	if (mp == NULL) {
3258 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3259 		/*
3260 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3261 		 * The draft Posix socket spec states that the mark should
3262 		 * not be cleared when peeking. We follow the latter.
3263 		 */
3264 		if ((so->so_state &
3265 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3266 		    (uiop->uio_resid != saved_resid) &&
3267 		    !(flags & MSG_PEEK)) {
3268 			sorecv_update_oobstate(so);
3269 		}
3270 
3271 		mutex_enter(&so->so_lock);
3272 		/* Set MSG_EOR based on MOREDATA */
3273 		if (!(rval.r_val1 & MOREDATA)) {
3274 			if (so->so_state & SS_SAVEDEOR) {
3275 				msg->msg_flags |= MSG_EOR;
3276 				so->so_state &= ~SS_SAVEDEOR;
3277 			}
3278 		}
3279 		/*
3280 		 * If some data was received (i.e. not EOF) and the
3281 		 * read/recv* has not been satisfied wait for some more.
3282 		 */
3283 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3284 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3285 			mutex_exit(&so->so_lock);
3286 			pflag = opflag | MSG_NOMARK;
3287 			goto retry;
3288 		}
3289 		goto out_locked;
3290 	}
3291 
3292 	/* strsock_proto has already verified length and alignment */
3293 	tpr = (union T_primitives *)mp->b_rptr;
3294 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3295 
3296 	switch (tpr->type) {
3297 	case T_DATA_IND: {
3298 		if ((so->so_state &
3299 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3300 		    (uiop->uio_resid != saved_resid) &&
3301 		    !(flags & MSG_PEEK)) {
3302 			sorecv_update_oobstate(so);
3303 		}
3304 
3305 		/*
3306 		 * Set msg_flags to MSG_EOR based on
3307 		 * MORE_flag and MOREDATA.
3308 		 */
3309 		mutex_enter(&so->so_lock);
3310 		so->so_state &= ~SS_SAVEDEOR;
3311 		if (!(tpr->data_ind.MORE_flag & 1)) {
3312 			if (!(rval.r_val1 & MOREDATA))
3313 				msg->msg_flags |= MSG_EOR;
3314 			else
3315 				so->so_state |= SS_SAVEDEOR;
3316 		}
3317 		freemsg(mp);
3318 		/*
3319 		 * If some data was received (i.e. not EOF) and the
3320 		 * read/recv* has not been satisfied wait for some more.
3321 		 */
3322 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3323 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3324 			mutex_exit(&so->so_lock);
3325 			pflag = opflag | MSG_NOMARK;
3326 			goto retry;
3327 		}
3328 		goto out_locked;
3329 	}
3330 	case T_UNITDATA_IND: {
3331 		void *addr;
3332 		t_uscalar_t addrlen;
3333 		void *abuf;
3334 		t_uscalar_t optlen;
3335 		void *opt;
3336 
3337 		if ((so->so_state &
3338 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3339 		    (uiop->uio_resid != saved_resid) &&
3340 		    !(flags & MSG_PEEK)) {
3341 			sorecv_update_oobstate(so);
3342 		}
3343 
3344 		if (namelen != 0) {
3345 			/* Caller wants source address */
3346 			addrlen = tpr->unitdata_ind.SRC_length;
3347 			addr = sogetoff(mp,
3348 			    tpr->unitdata_ind.SRC_offset,
3349 			    addrlen, 1);
3350 			if (addr == NULL) {
3351 				freemsg(mp);
3352 				error = EPROTO;
3353 				eprintsoline(so, error);
3354 				goto out;
3355 			}
3356 			if (so->so_family == AF_UNIX) {
3357 				/*
3358 				 * Can not use the transport level address.
3359 				 * If there is a SO_SRCADDR option carrying
3360 				 * the socket level address it will be
3361 				 * extracted below.
3362 				 */
3363 				addr = NULL;
3364 				addrlen = 0;
3365 			}
3366 		}
3367 		optlen = tpr->unitdata_ind.OPT_length;
3368 		if (optlen != 0) {
3369 			t_uscalar_t ncontrollen;
3370 
3371 			/*
3372 			 * Extract any source address option.
3373 			 * Determine how large cmsg buffer is needed.
3374 			 */
3375 			opt = sogetoff(mp,
3376 			    tpr->unitdata_ind.OPT_offset,
3377 			    optlen, __TPI_ALIGN_SIZE);
3378 
3379 			if (opt == NULL) {
3380 				freemsg(mp);
3381 				error = EPROTO;
3382 				eprintsoline(so, error);
3383 				goto out;
3384 			}
3385 			if (so->so_family == AF_UNIX)
3386 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3387 			ncontrollen = so_cmsglen(mp, opt, optlen,
3388 			    !(flags & MSG_XPG4_2));
3389 			if (controllen != 0)
3390 				controllen = ncontrollen;
3391 			else if (ncontrollen != 0)
3392 				msg->msg_flags |= MSG_CTRUNC;
3393 		} else {
3394 			controllen = 0;
3395 		}
3396 
3397 		if (namelen != 0) {
3398 			/*
3399 			 * Return address to caller.
3400 			 * Caller handles truncation if length
3401 			 * exceeds msg_namelen.
3402 			 * NOTE: AF_UNIX NUL termination is ensured by
3403 			 * the sender's copyin_name().
3404 			 */
3405 			abuf = kmem_alloc(addrlen, KM_SLEEP);
3406 
3407 			bcopy(addr, abuf, addrlen);
3408 			msg->msg_name = abuf;
3409 			msg->msg_namelen = addrlen;
3410 		}
3411 
3412 		if (controllen != 0) {
3413 			/*
3414 			 * Return control msg to caller.
3415 			 * Caller handles truncation if length
3416 			 * exceeds msg_controllen.
3417 			 */
3418 			control = kmem_zalloc(controllen, KM_SLEEP);
3419 
3420 			error = so_opt2cmsg(mp, opt, optlen,
3421 			    !(flags & MSG_XPG4_2),
3422 			    control, controllen);
3423 			if (error) {
3424 				freemsg(mp);
3425 				if (msg->msg_namelen != 0)
3426 					kmem_free(msg->msg_name,
3427 					    msg->msg_namelen);
3428 				kmem_free(control, controllen);
3429 				eprintsoline(so, error);
3430 				goto out;
3431 			}
3432 			msg->msg_control = control;
3433 			msg->msg_controllen = controllen;
3434 		}
3435 
3436 		freemsg(mp);
3437 		goto out;
3438 	}
3439 	case T_OPTDATA_IND: {
3440 		struct T_optdata_req *tdr;
3441 		void *opt;
3442 		t_uscalar_t optlen;
3443 
3444 		if ((so->so_state &
3445 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3446 		    (uiop->uio_resid != saved_resid) &&
3447 		    !(flags & MSG_PEEK)) {
3448 			sorecv_update_oobstate(so);
3449 		}
3450 
3451 		tdr = (struct T_optdata_req *)mp->b_rptr;
3452 		optlen = tdr->OPT_length;
3453 		if (optlen != 0) {
3454 			t_uscalar_t ncontrollen;
3455 			/*
3456 			 * Determine how large cmsg buffer is needed.
3457 			 */
3458 			opt = sogetoff(mp,
3459 			    tpr->optdata_ind.OPT_offset,
3460 			    optlen, __TPI_ALIGN_SIZE);
3461 
3462 			if (opt == NULL) {
3463 				freemsg(mp);
3464 				error = EPROTO;
3465 				eprintsoline(so, error);
3466 				goto out;
3467 			}
3468 
3469 			ncontrollen = so_cmsglen(mp, opt, optlen,
3470 			    !(flags & MSG_XPG4_2));
3471 			if (controllen != 0)
3472 				controllen = ncontrollen;
3473 			else if (ncontrollen != 0)
3474 				msg->msg_flags |= MSG_CTRUNC;
3475 		} else {
3476 			controllen = 0;
3477 		}
3478 
3479 		if (controllen != 0) {
3480 			/*
3481 			 * Return control msg to caller.
3482 			 * Caller handles truncation if length
3483 			 * exceeds msg_controllen.
3484 			 */
3485 			control = kmem_zalloc(controllen, KM_SLEEP);
3486 
3487 			error = so_opt2cmsg(mp, opt, optlen,
3488 			    !(flags & MSG_XPG4_2),
3489 			    control, controllen);
3490 			if (error) {
3491 				freemsg(mp);
3492 				kmem_free(control, controllen);
3493 				eprintsoline(so, error);
3494 				goto out;
3495 			}
3496 			msg->msg_control = control;
3497 			msg->msg_controllen = controllen;
3498 		}
3499 
3500 		/*
3501 		 * Set msg_flags to MSG_EOR based on
3502 		 * DATA_flag and MOREDATA.
3503 		 */
3504 		mutex_enter(&so->so_lock);
3505 		so->so_state &= ~SS_SAVEDEOR;
3506 		if (!(tpr->data_ind.MORE_flag & 1)) {
3507 			if (!(rval.r_val1 & MOREDATA))
3508 				msg->msg_flags |= MSG_EOR;
3509 			else
3510 				so->so_state |= SS_SAVEDEOR;
3511 		}
3512 		freemsg(mp);
3513 		/*
3514 		 * If some data was received (i.e. not EOF) and the
3515 		 * read/recv* has not been satisfied wait for some more.
3516 		 * Not possible to wait if control info was received.
3517 		 */
3518 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3519 		    controllen == 0 &&
3520 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3521 			mutex_exit(&so->so_lock);
3522 			pflag = opflag | MSG_NOMARK;
3523 			goto retry;
3524 		}
3525 		goto out_locked;
3526 	}
3527 	case T_EXDATA_IND: {
3528 		dprintso(so, 1,
3529 		    ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3530 		    "state %s\n",
3531 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3532 		    saved_resid - uiop->uio_resid,
3533 		    pr_state(so->so_state, so->so_mode)));
3534 		/*
3535 		 * kstrgetmsg handles MSGMARK so there is nothing to
3536 		 * inspect in the T_EXDATA_IND.
3537 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3538 		 * as a separate message with no M_DATA component. Furthermore,
3539 		 * the stream head does not consolidate M_DATA messages onto
3540 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3541 		 * remains a message by itself. This is needed since MSGMARK
3542 		 * marks both the whole message as well as the last byte
3543 		 * of the message.
3544 		 */
3545 		freemsg(mp);
3546 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3547 		if (flags & MSG_PEEK) {
3548 			/*
3549 			 * Even though we are peeking we consume the
3550 			 * T_EXDATA_IND thereby moving the mark information
3551 			 * to SS_RCVATMARK. Then the oob code below will
3552 			 * retry the peeking kstrgetmsg.
3553 			 * Note that the stream head read queue is
3554 			 * never flushed without holding SOREADLOCKED
3555 			 * thus the T_EXDATA_IND can not disappear
3556 			 * underneath us.
3557 			 */
3558 			dprintso(so, 1,
3559 			    ("sotpi_recvmsg: consume EXDATA_IND "
3560 			    "counts %d/%d state %s\n",
3561 			    sti->sti_oobsigcnt,
3562 			    sti->sti_oobcnt,
3563 			    pr_state(so->so_state, so->so_mode)));
3564 
3565 			pflag = MSG_ANY | MSG_DELAYERROR;
3566 			if (so->so_mode & SM_ATOMIC)
3567 				pflag |= MSG_DISCARDTAIL;
3568 
3569 			pri = 0;
3570 			mp = NULL;
3571 
3572 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3573 			    &pri, &pflag, (clock_t)-1, &rval);
3574 			ASSERT(uiop->uio_resid == saved_resid);
3575 
3576 			if (error) {
3577 #ifdef SOCK_DEBUG
3578 				if (error != EWOULDBLOCK && error != EINTR) {
3579 					eprintsoline(so, error);
3580 				}
3581 #endif /* SOCK_DEBUG */
3582 				goto out;
3583 			}
3584 			ASSERT(mp);
3585 			tpr = (union T_primitives *)mp->b_rptr;
3586 			ASSERT(tpr->type == T_EXDATA_IND);
3587 			freemsg(mp);
3588 		} /* end "if (flags & MSG_PEEK)" */
3589 
3590 		/*
3591 		 * Decrement the number of queued and pending oob.
3592 		 *
3593 		 * SS_RCVATMARK is cleared when we read past a mark.
3594 		 * SS_HAVEOOBDATA is cleared when we've read past the
3595 		 * last mark.
3596 		 * SS_OOBPEND is cleared if we've read past the last
3597 		 * mark and no (new) SIGURG has been posted.
3598 		 */
3599 		mutex_enter(&so->so_lock);
3600 		ASSERT(so_verify_oobstate(so));
3601 		ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3602 		ASSERT(sti->sti_oobsigcnt > 0);
3603 		sti->sti_oobsigcnt--;
3604 		ASSERT(sti->sti_oobcnt > 0);
3605 		sti->sti_oobcnt--;
3606 		/*
3607 		 * Since the T_EXDATA_IND has been removed from the stream
3608 		 * head, but we have not read data past the mark,
3609 		 * sockfs needs to track that the socket is still at the mark.
3610 		 *
3611 		 * Since no data was received call kstrgetmsg again to wait
3612 		 * for data.
3613 		 */
3614 		so->so_state |= SS_RCVATMARK;
3615 		mutex_exit(&so->so_lock);
3616 		dprintso(so, 1,
3617 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3618 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3619 		    pr_state(so->so_state, so->so_mode)));
3620 		pflag = opflag;
3621 		goto retry;
3622 	}
3623 	default:
3624 		cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3625 		    (void *)so, tpr->type, (void *)mp);
3626 		ASSERT(0);
3627 		freemsg(mp);
3628 		error = EPROTO;
3629 		eprintsoline(so, error);
3630 		goto out;
3631 	}
3632 	/* NOTREACHED */
3633 out:
3634 	mutex_enter(&so->so_lock);
3635 out_locked:
3636 	so_unlock_read(so);	/* Clear SOREADLOCKED */
3637 	mutex_exit(&so->so_lock);
3638 	return (error);
3639 }
3640 
3641 /*
3642  * Sending data with options on a datagram socket.
3643  * Assumes caller has verified that SS_ISBOUND etc. are set.
3644  */
3645 static int
3646 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3647     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3648 {
3649 	struct T_unitdata_req	tudr;
3650 	mblk_t			*mp;
3651 	int			error;
3652 	void			*addr;
3653 	socklen_t		addrlen;
3654 	void			*src;
3655 	socklen_t		srclen;
3656 	ssize_t			len;
3657 	int			size;
3658 	struct T_opthdr		toh;
3659 	struct fdbuf		*fdbuf;
3660 	t_uscalar_t		optlen;
3661 	void			*fds;
3662 	int			fdlen;
3663 	sotpi_info_t		*sti = SOTOTPI(so);
3664 
3665 	ASSERT(name && namelen);
3666 	ASSERT(control && controllen);
3667 
3668 	len = uiop->uio_resid;
3669 	if (len > (ssize_t)sti->sti_tidu_size) {
3670 		return (EMSGSIZE);
3671 	}
3672 
3673 	/*
3674 	 * For AF_UNIX the destination address is translated to an internal
3675 	 * name and the source address is passed as an option.
3676 	 * Also, file descriptors are passed as file pointers in an
3677 	 * option.
3678 	 */
3679 
3680 	/*
3681 	 * Length and family checks.
3682 	 */
3683 	error = so_addr_verify(so, name, namelen);
3684 	if (error) {
3685 		eprintsoline(so, error);
3686 		return (error);
3687 	}
3688 	if (so->so_family == AF_UNIX) {
3689 		if (sti->sti_faddr_noxlate) {
3690 			/*
3691 			 * Already have a transport internal address. Do not
3692 			 * pass any (transport internal) source address.
3693 			 */
3694 			addr = name;
3695 			addrlen = namelen;
3696 			src = NULL;
3697 			srclen = 0;
3698 		} else {
3699 			/*
3700 			 * Pass the sockaddr_un source address as an option
3701 			 * and translate the remote address.
3702 			 *
3703 			 * Note that this code does not prevent sti_laddr_sa
3704 			 * from changing while it is being used. Thus
3705 			 * if an unbind+bind occurs concurrently with this
3706 			 * send the peer might see a partially new and a
3707 			 * partially old "from" address.
3708 			 */
3709 			src = sti->sti_laddr_sa;
3710 			srclen = (t_uscalar_t)sti->sti_laddr_len;
3711 			dprintso(so, 1,
3712 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3713 			    srclen, src));
3714 			error = so_ux_addr_xlate(so, name, namelen,
3715 			    (flags & MSG_XPG4_2),
3716 			    &addr, &addrlen);
3717 			if (error) {
3718 				eprintsoline(so, error);
3719 				return (error);
3720 			}
3721 		}
3722 	} else {
3723 		addr = name;
3724 		addrlen = namelen;
3725 		src = NULL;
3726 		srclen = 0;
3727 	}
3728 	optlen = so_optlen(control, controllen,
3729 	    !(flags & MSG_XPG4_2));
3730 	tudr.PRIM_type = T_UNITDATA_REQ;
3731 	tudr.DEST_length = addrlen;
3732 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3733 	if (srclen != 0)
3734 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3735 		    _TPI_ALIGN_TOPT(srclen));
3736 	else
3737 		tudr.OPT_length = optlen;
3738 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3739 	    _TPI_ALIGN_TOPT(addrlen));
3740 
3741 	size = tudr.OPT_offset + tudr.OPT_length;
3742 
3743 	/*
3744 	 * File descriptors only when SM_FDPASSING set.
3745 	 */
3746 	error = so_getfdopt(control, controllen,
3747 	    !(flags & MSG_XPG4_2), &fds, &fdlen);
3748 	if (error)
3749 		return (error);
3750 	if (fdlen != -1) {
3751 		if (!(so->so_mode & SM_FDPASSING))
3752 			return (EOPNOTSUPP);
3753 
3754 		error = fdbuf_create(fds, fdlen, &fdbuf);
3755 		if (error)
3756 			return (error);
3757 
3758 		/*
3759 		 * Pre-allocate enough additional space for lower level modules
3760 		 * to append an option (e.g. see tl_unitdata). The following
3761 		 * is enough extra space for the largest option we might append.
3762 		 */
3763 		size += sizeof (struct T_opthdr) + ucredsize;
3764 		mp = fdbuf_allocmsg(size, fdbuf);
3765 	} else {
3766 		mp = soallocproto(size, _ALLOC_INTR, CRED());
3767 		if (mp == NULL) {
3768 			/*
3769 			 * Caught a signal waiting for memory.
3770 			 * Let send* return EINTR.
3771 			 */
3772 			return (EINTR);
3773 		}
3774 	}
3775 	soappendmsg(mp, &tudr, sizeof (tudr));
3776 	soappendmsg(mp, addr, addrlen);
3777 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3778 
3779 	if (fdlen != -1) {
3780 		ASSERT(fdbuf != NULL);
3781 		toh.level = SOL_SOCKET;
3782 		toh.name = SO_FILEP;
3783 		toh.len = fdbuf->fd_size +
3784 		    (t_uscalar_t)sizeof (struct T_opthdr);
3785 		toh.status = 0;
3786 		soappendmsg(mp, &toh, sizeof (toh));
3787 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3788 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3789 	}
3790 	if (srclen != 0) {
3791 		/*
3792 		 * There is a AF_UNIX sockaddr_un to include as a source
3793 		 * address option.
3794 		 */
3795 		toh.level = SOL_SOCKET;
3796 		toh.name = SO_SRCADDR;
3797 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3798 		toh.status = 0;
3799 		soappendmsg(mp, &toh, sizeof (toh));
3800 		soappendmsg(mp, src, srclen);
3801 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3802 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3803 	}
3804 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3805 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3806 	/*
3807 	 * Normally at most 3 bytes left in the message, but we might have
3808 	 * allowed for extra space if we're passing fd's through.
3809 	 */
3810 	ASSERT(MBLKL(mp) <= (ssize_t)size);
3811 
3812 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3813 	if (AU_AUDITING())
3814 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3815 
3816 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3817 #ifdef SOCK_DEBUG
3818 	if (error) {
3819 		eprintsoline(so, error);
3820 	}
3821 #endif /* SOCK_DEBUG */
3822 	return (error);
3823 }
3824 
3825 /*
3826  * Sending data with options on a connected stream socket.
3827  * Assumes caller has verified that SS_ISCONNECTED is set.
3828  */
3829 static int
3830 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3831     t_uscalar_t controllen, int flags)
3832 {
3833 	struct T_optdata_req	tdr;
3834 	mblk_t			*mp;
3835 	int			error;
3836 	ssize_t			iosize;
3837 	int			size;
3838 	struct fdbuf		*fdbuf;
3839 	t_uscalar_t		optlen;
3840 	void			*fds;
3841 	int			fdlen;
3842 	struct T_opthdr		toh;
3843 	sotpi_info_t		*sti = SOTOTPI(so);
3844 
3845 	dprintso(so, 1,
3846 	    ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3847 
3848 	/*
3849 	 * Has to be bound and connected. However, since no locks are
3850 	 * held the state could have changed after sotpi_sendmsg checked it
3851 	 * thus it is not possible to ASSERT on the state.
3852 	 */
3853 
3854 	/* Options on connection-oriented only when SM_OPTDATA set. */
3855 	if (!(so->so_mode & SM_OPTDATA))
3856 		return (EOPNOTSUPP);
3857 
3858 	do {
3859 		/*
3860 		 * Set the MORE flag if uio_resid does not fit in this
3861 		 * message or if the caller passed in "more".
3862 		 * Error for transports with zero tidu_size.
3863 		 */
3864 		tdr.PRIM_type = T_OPTDATA_REQ;
3865 		iosize = sti->sti_tidu_size;
3866 		if (iosize <= 0)
3867 			return (EMSGSIZE);
3868 		if (uiop->uio_resid > iosize) {
3869 			tdr.DATA_flag = 1;
3870 		} else {
3871 			if (more)
3872 				tdr.DATA_flag = 1;
3873 			else
3874 				tdr.DATA_flag = 0;
3875 			iosize = uiop->uio_resid;
3876 		}
3877 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3878 		    tdr.DATA_flag, iosize));
3879 
3880 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3881 		tdr.OPT_length = optlen;
3882 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3883 
3884 		size = (int)sizeof (tdr) + optlen;
3885 		/*
3886 		 * File descriptors only when SM_FDPASSING set.
3887 		 */
3888 		error = so_getfdopt(control, controllen,
3889 		    !(flags & MSG_XPG4_2), &fds, &fdlen);
3890 		if (error)
3891 			return (error);
3892 		if (fdlen != -1) {
3893 			if (!(so->so_mode & SM_FDPASSING))
3894 				return (EOPNOTSUPP);
3895 
3896 			error = fdbuf_create(fds, fdlen, &fdbuf);
3897 			if (error)
3898 				return (error);
3899 
3900 			/*
3901 			 * Pre-allocate enough additional space for lower level
3902 			 * modules to append an option (e.g. see tl_unitdata).
3903 			 * The following is enough extra space for the largest
3904 			 * option we might append.
3905 			 */
3906 			size += sizeof (struct T_opthdr) + ucredsize;
3907 			mp = fdbuf_allocmsg(size, fdbuf);
3908 		} else {
3909 			mp = soallocproto(size, _ALLOC_INTR, CRED());
3910 			if (mp == NULL) {
3911 				/*
3912 				 * Caught a signal waiting for memory.
3913 				 * Let send* return EINTR.
3914 				 */
3915 				return (EINTR);
3916 			}
3917 		}
3918 		soappendmsg(mp, &tdr, sizeof (tdr));
3919 
3920 		if (fdlen != -1) {
3921 			ASSERT(fdbuf != NULL);
3922 			toh.level = SOL_SOCKET;
3923 			toh.name = SO_FILEP;
3924 			toh.len = fdbuf->fd_size +
3925 			    (t_uscalar_t)sizeof (struct T_opthdr);
3926 			toh.status = 0;
3927 			soappendmsg(mp, &toh, sizeof (toh));
3928 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3929 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3930 		}
3931 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3932 		/*
3933 		 * Normally at most 3 bytes left in the message, but we might
3934 		 * have allowed for extra space if we're passing fd's through.
3935 		 */
3936 		ASSERT(MBLKL(mp) <= (ssize_t)size);
3937 
3938 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3939 
3940 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3941 		    0, MSG_BAND, 0);
3942 		if (error) {
3943 			eprintsoline(so, error);
3944 			return (error);
3945 		}
3946 		control = NULL;
3947 		if (uiop->uio_resid > 0) {
3948 			/*
3949 			 * Recheck for fatal errors. Fail write even though
3950 			 * some data have been written. This is consistent
3951 			 * with strwrite semantics and BSD sockets semantics.
3952 			 */
3953 			if (so->so_state & SS_CANTSENDMORE) {
3954 				eprintsoline(so, error);
3955 				return (EPIPE);
3956 			}
3957 			if (so->so_error != 0) {
3958 				mutex_enter(&so->so_lock);
3959 				error = sogeterr(so, B_TRUE);
3960 				mutex_exit(&so->so_lock);
3961 				if (error != 0) {
3962 					eprintsoline(so, error);
3963 					return (error);
3964 				}
3965 			}
3966 		}
3967 	} while (uiop->uio_resid > 0);
3968 	return (0);
3969 }
3970 
3971 /*
3972  * Sending data on a datagram socket.
3973  * Assumes caller has verified that SS_ISBOUND etc. are set.
3974  *
3975  * For AF_UNIX the destination address is translated to an internal
3976  * name and the source address is passed as an option.
3977  */
3978 int
3979 sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
3980     struct uio *uiop, int flags)
3981 {
3982 	struct T_unitdata_req	tudr;
3983 	mblk_t			*mp;
3984 	int			error;
3985 	void			*addr;
3986 	socklen_t		addrlen;
3987 	void			*src;
3988 	socklen_t		srclen;
3989 	ssize_t			len;
3990 	sotpi_info_t		*sti = SOTOTPI(so);
3991 
3992 	ASSERT(name != NULL && namelen != 0);
3993 
3994 	len = uiop->uio_resid;
3995 	if (len > sti->sti_tidu_size) {
3996 		error = EMSGSIZE;
3997 		goto done;
3998 	}
3999 
4000 	/* Length and family checks */
4001 	error = so_addr_verify(so, name, namelen);
4002 	if (error != 0)
4003 		goto done;
4004 
4005 	if (sti->sti_direct)
4006 		return (sodgram_direct(so, name, namelen, uiop, flags));
4007 
4008 	if (so->so_family == AF_UNIX) {
4009 		if (sti->sti_faddr_noxlate) {
4010 			/*
4011 			 * Already have a transport internal address. Do not
4012 			 * pass any (transport internal) source address.
4013 			 */
4014 			addr = name;
4015 			addrlen = namelen;
4016 			src = NULL;
4017 			srclen = 0;
4018 		} else {
4019 			/*
4020 			 * Pass the sockaddr_un source address as an option
4021 			 * and translate the remote address.
4022 			 *
4023 			 * Note that this code does not prevent sti_laddr_sa
4024 			 * from changing while it is being used. Thus
4025 			 * if an unbind+bind occurs concurrently with this
4026 			 * send the peer might see a partially new and a
4027 			 * partially old "from" address.
4028 			 */
4029 			src = sti->sti_laddr_sa;
4030 			srclen = (socklen_t)sti->sti_laddr_len;
4031 			dprintso(so, 1,
4032 			    ("sosend_dgram UNIX: srclen %d, src %p\n",
4033 			    srclen, src));
4034 			error = so_ux_addr_xlate(so, name, namelen,
4035 			    (flags & MSG_XPG4_2),
4036 			    &addr, &addrlen);
4037 			if (error) {
4038 				eprintsoline(so, error);
4039 				goto done;
4040 			}
4041 		}
4042 	} else {
4043 		addr = name;
4044 		addrlen = namelen;
4045 		src = NULL;
4046 		srclen = 0;
4047 	}
4048 	tudr.PRIM_type = T_UNITDATA_REQ;
4049 	tudr.DEST_length = addrlen;
4050 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4051 	if (srclen == 0) {
4052 		tudr.OPT_length = 0;
4053 		tudr.OPT_offset = 0;
4054 
4055 		mp = soallocproto2(&tudr, sizeof (tudr),
4056 		    addr, addrlen, 0, _ALLOC_INTR, CRED());
4057 		if (mp == NULL) {
4058 			/*
4059 			 * Caught a signal waiting for memory.
4060 			 * Let send* return EINTR.
4061 			 */
4062 			error = EINTR;
4063 			goto done;
4064 		}
4065 	} else {
4066 		/*
4067 		 * There is a AF_UNIX sockaddr_un to include as a source
4068 		 * address option.
4069 		 */
4070 		struct T_opthdr toh;
4071 		ssize_t size;
4072 
4073 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
4074 		    _TPI_ALIGN_TOPT(srclen));
4075 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
4076 		    _TPI_ALIGN_TOPT(addrlen));
4077 
4078 		toh.level = SOL_SOCKET;
4079 		toh.name = SO_SRCADDR;
4080 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
4081 		toh.status = 0;
4082 
4083 		size = tudr.OPT_offset + tudr.OPT_length;
4084 		mp = soallocproto2(&tudr, sizeof (tudr),
4085 		    addr, addrlen, size, _ALLOC_INTR, CRED());
4086 		if (mp == NULL) {
4087 			/*
4088 			 * Caught a signal waiting for memory.
4089 			 * Let send* return EINTR.
4090 			 */
4091 			error = EINTR;
4092 			goto done;
4093 		}
4094 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
4095 		soappendmsg(mp, &toh, sizeof (toh));
4096 		soappendmsg(mp, src, srclen);
4097 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
4098 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4099 	}
4100 
4101 	if (AU_AUDITING())
4102 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4103 
4104 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4105 done:
4106 #ifdef SOCK_DEBUG
4107 	if (error) {
4108 		eprintsoline(so, error);
4109 	}
4110 #endif /* SOCK_DEBUG */
4111 	return (error);
4112 }
4113 
4114 /*
4115  * Sending data on a connected stream socket.
4116  * Assumes caller has verified that SS_ISCONNECTED is set.
4117  */
4118 int
4119 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
4120     int sflag)
4121 {
4122 	struct T_data_req	tdr;
4123 	mblk_t			*mp;
4124 	int			error;
4125 	ssize_t			iosize;
4126 	sotpi_info_t		*sti = SOTOTPI(so);
4127 
4128 	dprintso(so, 1,
4129 	    ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
4130 	    (void *)so, uiop->uio_resid, prim, sflag));
4131 
4132 	/*
4133 	 * Has to be bound and connected. However, since no locks are
4134 	 * held the state could have changed after sotpi_sendmsg checked it
4135 	 * thus it is not possible to ASSERT on the state.
4136 	 */
4137 
4138 	do {
4139 		/*
4140 		 * Set the MORE flag if uio_resid does not fit in this
4141 		 * message or if the caller passed in "more".
4142 		 * Error for transports with zero tidu_size.
4143 		 */
4144 		tdr.PRIM_type = prim;
4145 		iosize = sti->sti_tidu_size;
4146 		if (iosize <= 0)
4147 			return (EMSGSIZE);
4148 		if (uiop->uio_resid > iosize) {
4149 			tdr.MORE_flag = 1;
4150 		} else {
4151 			if (more)
4152 				tdr.MORE_flag = 1;
4153 			else
4154 				tdr.MORE_flag = 0;
4155 			iosize = uiop->uio_resid;
4156 		}
4157 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4158 		    prim, tdr.MORE_flag, iosize));
4159 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
4160 		if (mp == NULL) {
4161 			/*
4162 			 * Caught a signal waiting for memory.
4163 			 * Let send* return EINTR.
4164 			 */
4165 			return (EINTR);
4166 		}
4167 
4168 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4169 		    0, sflag | MSG_BAND, 0);
4170 		if (error) {
4171 			eprintsoline(so, error);
4172 			return (error);
4173 		}
4174 		if (uiop->uio_resid > 0) {
4175 			/*
4176 			 * Recheck for fatal errors. Fail write even though
4177 			 * some data have been written. This is consistent
4178 			 * with strwrite semantics and BSD sockets semantics.
4179 			 */
4180 			if (so->so_state & SS_CANTSENDMORE) {
4181 				eprintsoline(so, error);
4182 				return (EPIPE);
4183 			}
4184 			if (so->so_error != 0) {
4185 				mutex_enter(&so->so_lock);
4186 				error = sogeterr(so, B_TRUE);
4187 				mutex_exit(&so->so_lock);
4188 				if (error != 0) {
4189 					eprintsoline(so, error);
4190 					return (error);
4191 				}
4192 			}
4193 		}
4194 	} while (uiop->uio_resid > 0);
4195 	return (0);
4196 }
4197 
4198 /*
4199  * Check the state for errors and call the appropriate send function.
4200  *
4201  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4202  * this function issues a setsockopt to toggle SO_DONTROUTE before and
4203  * after sending the message.
4204  */
4205 static int
4206 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4207     struct cred *cr)
4208 {
4209 	int		so_state;
4210 	int		so_mode;
4211 	int		error;
4212 	struct sockaddr *name;
4213 	t_uscalar_t	namelen;
4214 	int		dontroute;
4215 	int		flags;
4216 	sotpi_info_t	*sti = SOTOTPI(so);
4217 
4218 	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4219 	    (void *)so, (void *)msg, msg->msg_flags,
4220 	    pr_state(so->so_state, so->so_mode), so->so_error));
4221 
4222 	if (so->so_version == SOV_STREAM) {
4223 		/* The imaginary "sockmod" has been popped - act as a stream */
4224 		so_update_attrs(so, SOMOD);
4225 		return (strwrite(SOTOV(so), uiop, cr));
4226 	}
4227 
4228 	mutex_enter(&so->so_lock);
4229 	so_state = so->so_state;
4230 
4231 	if (so_state & SS_CANTSENDMORE) {
4232 		mutex_exit(&so->so_lock);
4233 		return (EPIPE);
4234 	}
4235 
4236 	if (so->so_error != 0) {
4237 		error = sogeterr(so, B_TRUE);
4238 		if (error != 0) {
4239 			mutex_exit(&so->so_lock);
4240 			return (error);
4241 		}
4242 	}
4243 
4244 	name = (struct sockaddr *)msg->msg_name;
4245 	namelen = msg->msg_namelen;
4246 
4247 	so_mode = so->so_mode;
4248 
4249 	if (name == NULL) {
4250 		if (!(so_state & SS_ISCONNECTED)) {
4251 			mutex_exit(&so->so_lock);
4252 			if (so_mode & SM_CONNREQUIRED)
4253 				return (ENOTCONN);
4254 			else
4255 				return (EDESTADDRREQ);
4256 		}
4257 		if (so_mode & SM_CONNREQUIRED) {
4258 			name = NULL;
4259 			namelen = 0;
4260 		} else {
4261 			/*
4262 			 * Note that this code does not prevent sti_faddr_sa
4263 			 * from changing while it is being used. Thus
4264 			 * if an "unconnect"+connect occurs concurrently with
4265 			 * this send the datagram might be delivered to a
4266 			 * garbaled address.
4267 			 */
4268 			ASSERT(sti->sti_faddr_sa);
4269 			name = sti->sti_faddr_sa;
4270 			namelen = (t_uscalar_t)sti->sti_faddr_len;
4271 		}
4272 	} else {
4273 		if (!(so_state & SS_ISCONNECTED) &&
4274 		    (so_mode & SM_CONNREQUIRED)) {
4275 			/* Required but not connected */
4276 			mutex_exit(&so->so_lock);
4277 			return (ENOTCONN);
4278 		}
4279 		/*
4280 		 * Ignore the address on connection-oriented sockets.
4281 		 * Just like BSD this code does not generate an error for
4282 		 * TCP (a CONNREQUIRED socket) when sending to an address
4283 		 * passed in with sendto/sendmsg. Instead the data is
4284 		 * delivered on the connection as if no address had been
4285 		 * supplied.
4286 		 */
4287 		if ((so_state & SS_ISCONNECTED) &&
4288 		    !(so_mode & SM_CONNREQUIRED)) {
4289 			mutex_exit(&so->so_lock);
4290 			return (EISCONN);
4291 		}
4292 		if (!(so_state & SS_ISBOUND)) {
4293 			so_lock_single(so);	/* Set SOLOCKED */
4294 			error = sotpi_bind(so, NULL, 0,
4295 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4296 			so_unlock_single(so, SOLOCKED);
4297 			if (error) {
4298 				mutex_exit(&so->so_lock);
4299 				eprintsoline(so, error);
4300 				return (error);
4301 			}
4302 		}
4303 		/*
4304 		 * Handle delayed datagram errors. These are only queued
4305 		 * when the application sets SO_DGRAM_ERRIND.
4306 		 * Return the error if we are sending to the address
4307 		 * that was returned in the last T_UDERROR_IND.
4308 		 * If sending to some other address discard the delayed
4309 		 * error indication.
4310 		 */
4311 		if (sti->sti_delayed_error) {
4312 			struct T_uderror_ind	*tudi;
4313 			void			*addr;
4314 			t_uscalar_t		addrlen;
4315 			boolean_t		match = B_FALSE;
4316 
4317 			ASSERT(sti->sti_eaddr_mp);
4318 			error = sti->sti_delayed_error;
4319 			sti->sti_delayed_error = 0;
4320 			tudi =
4321 			    (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4322 			addrlen = tudi->DEST_length;
4323 			addr = sogetoff(sti->sti_eaddr_mp,
4324 			    tudi->DEST_offset, addrlen, 1);
4325 			ASSERT(addr);	/* Checked by strsock_proto */
4326 			switch (so->so_family) {
4327 			case AF_INET: {
4328 				/* Compare just IP address and port */
4329 				sin_t *sin1 = (sin_t *)name;
4330 				sin_t *sin2 = (sin_t *)addr;
4331 
4332 				if (addrlen == sizeof (sin_t) &&
4333 				    namelen == addrlen &&
4334 				    sin1->sin_port == sin2->sin_port &&
4335 				    sin1->sin_addr.s_addr ==
4336 				    sin2->sin_addr.s_addr)
4337 					match = B_TRUE;
4338 				break;
4339 			}
4340 			case AF_INET6: {
4341 				/* Compare just IP address and port. Not flow */
4342 				sin6_t *sin1 = (sin6_t *)name;
4343 				sin6_t *sin2 = (sin6_t *)addr;
4344 
4345 				if (addrlen == sizeof (sin6_t) &&
4346 				    namelen == addrlen &&
4347 				    sin1->sin6_port == sin2->sin6_port &&
4348 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4349 				    &sin2->sin6_addr))
4350 					match = B_TRUE;
4351 				break;
4352 			}
4353 			case AF_UNIX:
4354 			default:
4355 				if (namelen == addrlen &&
4356 				    bcmp(name, addr, namelen) == 0)
4357 					match = B_TRUE;
4358 			}
4359 			if (match) {
4360 				freemsg(sti->sti_eaddr_mp);
4361 				sti->sti_eaddr_mp = NULL;
4362 				mutex_exit(&so->so_lock);
4363 #ifdef DEBUG
4364 				dprintso(so, 0,
4365 				    ("sockfs delayed error %d for %s\n",
4366 				    error,
4367 				    pr_addr(so->so_family, name, namelen)));
4368 #endif /* DEBUG */
4369 				return (error);
4370 			}
4371 			freemsg(sti->sti_eaddr_mp);
4372 			sti->sti_eaddr_mp = NULL;
4373 		}
4374 	}
4375 	mutex_exit(&so->so_lock);
4376 
4377 	flags = msg->msg_flags;
4378 	dontroute = 0;
4379 	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4380 		uint32_t	val;
4381 
4382 		val = 1;
4383 		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4384 		    &val, (t_uscalar_t)sizeof (val), cr);
4385 		if (error)
4386 			return (error);
4387 		dontroute = 1;
4388 	}
4389 
4390 	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4391 		error = EOPNOTSUPP;
4392 		goto done;
4393 	}
4394 	if (msg->msg_controllen != 0) {
4395 		if (!(so_mode & SM_CONNREQUIRED)) {
4396 			so_update_attrs(so, SOMOD);
4397 			error = sosend_dgramcmsg(so, name, namelen, uiop,
4398 			    msg->msg_control, msg->msg_controllen, flags);
4399 		} else {
4400 			if (flags & MSG_OOB) {
4401 				/* Can't generate T_EXDATA_REQ with options */
4402 				error = EOPNOTSUPP;
4403 				goto done;
4404 			}
4405 			so_update_attrs(so, SOMOD);
4406 			error = sosend_svccmsg(so, uiop,
4407 			    !(flags & MSG_EOR),
4408 			    msg->msg_control, msg->msg_controllen,
4409 			    flags);
4410 		}
4411 		goto done;
4412 	}
4413 
4414 	so_update_attrs(so, SOMOD);
4415 	if (!(so_mode & SM_CONNREQUIRED)) {
4416 		/*
4417 		 * If there is no SO_DONTROUTE to turn off return immediately
4418 		 * from send_dgram. This can allow tail-call optimizations.
4419 		 */
4420 		if (!dontroute) {
4421 			return (sosend_dgram(so, name, namelen, uiop, flags));
4422 		}
4423 		error = sosend_dgram(so, name, namelen, uiop, flags);
4424 	} else {
4425 		t_scalar_t prim;
4426 		int sflag;
4427 
4428 		/* Ignore msg_name in the connected state */
4429 		if (flags & MSG_OOB) {
4430 			prim = T_EXDATA_REQ;
4431 			/*
4432 			 * Send down T_EXDATA_REQ even if there is flow
4433 			 * control for data.
4434 			 */
4435 			sflag = MSG_IGNFLOW;
4436 		} else {
4437 			if (so_mode & SM_BYTESTREAM) {
4438 				/* Byte stream transport - use write */
4439 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4440 
4441 				/* Send M_DATA messages */
4442 				if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
4443 				    (error = nl7c_data(so, uiop)) >= 0) {
4444 					/* NL7C consumed the data */
4445 					return (error);
4446 				}
4447 				/*
4448 				 * If there is no SO_DONTROUTE to turn off,
4449 				 * sti_direct is on, and there is no flow
4450 				 * control, we can take the fast path.
4451 				 */
4452 				if (!dontroute && sti->sti_direct != 0 &&
4453 				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4454 					return (sostream_direct(so, uiop,
4455 					    NULL, cr));
4456 				}
4457 				error = strwrite(SOTOV(so), uiop, cr);
4458 				goto done;
4459 			}
4460 			prim = T_DATA_REQ;
4461 			sflag = 0;
4462 		}
4463 		/*
4464 		 * If there is no SO_DONTROUTE to turn off return immediately
4465 		 * from sosend_svc. This can allow tail-call optimizations.
4466 		 */
4467 		if (!dontroute)
4468 			return (sosend_svc(so, uiop, prim,
4469 			    !(flags & MSG_EOR), sflag));
4470 		error = sosend_svc(so, uiop, prim,
4471 		    !(flags & MSG_EOR), sflag);
4472 	}
4473 	ASSERT(dontroute);
4474 done:
4475 	if (dontroute) {
4476 		uint32_t	val;
4477 
4478 		val = 0;
4479 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4480 		    &val, (t_uscalar_t)sizeof (val), cr);
4481 	}
4482 	return (error);
4483 }
4484 
4485 /*
4486  * kstrwritemp() has very similar semantics as that of strwrite().
4487  * The main difference is it obtains mblks from the caller and also
4488  * does not do any copy as done in strwrite() from user buffers to
4489  * kernel buffers.
4490  *
4491  * Currently, this routine is used by sendfile to send data allocated
4492  * within the kernel without any copying. This interface does not use the
4493  * synchronous stream interface as synch. stream interface implies
4494  * copying.
4495  */
4496 int
4497 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4498 {
4499 	struct stdata *stp;
4500 	struct queue *wqp;
4501 	mblk_t *newmp;
4502 	char waitflag;
4503 	int tempmode;
4504 	int error = 0;
4505 	int done = 0;
4506 	struct sonode *so;
4507 	boolean_t direct;
4508 
4509 	ASSERT(vp->v_stream);
4510 	stp = vp->v_stream;
4511 
4512 	so = VTOSO(vp);
4513 	direct = _SOTOTPI(so)->sti_direct;
4514 
4515 	/*
4516 	 * This is the sockfs direct fast path. canputnext() need
4517 	 * not be accurate so we don't grab the sd_lock here. If
4518 	 * we get flow-controlled, we grab sd_lock just before the
4519 	 * do..while loop below to emulate what strwrite() does.
4520 	 */
4521 	wqp = stp->sd_wrq;
4522 	if (canputnext(wqp) && direct &&
4523 	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4524 		return (sostream_direct(so, NULL, mp, CRED()));
4525 	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4526 		/* Fast check of flags before acquiring the lock */
4527 		mutex_enter(&stp->sd_lock);
4528 		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4529 		mutex_exit(&stp->sd_lock);
4530 		if (error != 0) {
4531 			if (!(stp->sd_flag & STPLEX) &&
4532 			    (stp->sd_wput_opt & SW_SIGPIPE)) {
4533 				error = EPIPE;
4534 			}
4535 			return (error);
4536 		}
4537 	}
4538 
4539 	waitflag = WRITEWAIT;
4540 	if (stp->sd_flag & OLDNDELAY)
4541 		tempmode = fmode & ~FNDELAY;
4542 	else
4543 		tempmode = fmode;
4544 
4545 	mutex_enter(&stp->sd_lock);
4546 	do {
4547 		if (canputnext(wqp)) {
4548 			mutex_exit(&stp->sd_lock);
4549 			if (stp->sd_wputdatafunc != NULL) {
4550 				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4551 				    NULL, NULL, NULL);
4552 				if (newmp == NULL) {
4553 					/* The caller will free mp */
4554 					return (ECOMM);
4555 				}
4556 				mp = newmp;
4557 			}
4558 			putnext(wqp, mp);
4559 			return (0);
4560 		}
4561 		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4562 		    &done);
4563 	} while (error == 0 && !done);
4564 
4565 	mutex_exit(&stp->sd_lock);
4566 	/*
4567 	 * EAGAIN tells the application to try again. ENOMEM
4568 	 * is returned only if the memory allocation size
4569 	 * exceeds the physical limits of the system. ENOMEM
4570 	 * can't be true here.
4571 	 */
4572 	if (error == ENOMEM)
4573 		error = EAGAIN;
4574 	return (error);
4575 }
4576 
4577 /* ARGSUSED */
4578 static int
4579 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4580     struct cred *cr, mblk_t **mpp)
4581 {
4582 	int error;
4583 
4584 	if (so->so_family != AF_INET && so->so_family != AF_INET6)
4585 		return (EAFNOSUPPORT);
4586 
4587 	if (so->so_state & SS_CANTSENDMORE)
4588 		return (EPIPE);
4589 
4590 	if (so->so_type != SOCK_STREAM)
4591 		return (EOPNOTSUPP);
4592 
4593 	if ((so->so_state & SS_ISCONNECTED) == 0)
4594 		return (ENOTCONN);
4595 
4596 	error = kstrwritemp(so->so_vnode, *mpp, fflag);
4597 	if (error == 0)
4598 		*mpp = NULL;
4599 	return (error);
4600 }
4601 
/*
 * Sending data on a datagram socket, trying a direct call into UDP first.
 * Assumes caller has verified that SS_ISBOUND etc. are set.
 *
 * sosend_dgram() calls this when sti_direct is set, after it has checked
 * the message length against sti_tidu_size and verified the address.
 * Only AF_INET and AF_INET6 sockets are expected here.
 *
 * When the socket is not connected a T_UNITDATA_REQ carrying the
 * destination address is built and the data is linked behind it; when it
 * is connected the data is sent without a T_UNITDATA_REQ in front of it.
 *
 * If the queue below the stream head can take data and the whole uio can
 * be copied in, the message is given straight to udp_wput(). Otherwise the
 * send falls back to strwrite() (connected) or kstrputmsg() (not
 * connected).
 *
 * Returns 0 or an errno value (EINTR if a signal arrived while waiting
 * for memory).
 */
/* ARGSUSED */
static int
sodgram_direct(struct sonode *so, struct sockaddr *name,
    socklen_t namelen, struct uio *uiop, int flags)
{
	struct T_unitdata_req	tudr;
	mblk_t			*mp = NULL;
	int			error = 0;
	void			*addr;
	socklen_t		addrlen;
	ssize_t			len;
	struct stdata		*stp = SOTOV(so)->v_stream;
	int			so_state;
	queue_t			*udp_wq;
	boolean_t		connected;
	mblk_t			*mpdata = NULL;
	sotpi_info_t		*sti = SOTOTPI(so);
	uint32_t		auditing = AU_AUDITING();

	ASSERT(name != NULL && namelen != 0);
	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
	ASSERT(!(so->so_mode & SM_EXDATA));
	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
	ASSERT(SOTOV(so)->v_type == VSOCK);

	/* Caller checked for proper length */
	len = uiop->uio_resid;
	ASSERT(len <= sti->sti_tidu_size);

	/* Length and family checks have been done by caller */
	ASSERT(name->sa_family == so->so_family);
	ASSERT(so->so_family == AF_INET ||
	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
	ASSERT(so->so_family == AF_INET6 ||
	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));

	addr = name;
	addrlen = namelen;

	/* straccess() is only consulted when the stream has sd_sidp set */
	if (stp->sd_sidp != NULL &&
	    (error = straccess(stp, JCWRITE)) != 0)
		goto done;

	/* so_state is sampled once, without holding so_lock */
	so_state = so->so_state;

	connected = so_state & SS_ISCONNECTED;
	if (!connected) {
		/* No options are sent, only the destination address */
		tudr.PRIM_type = T_UNITDATA_REQ;
		tudr.DEST_length = addrlen;
		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
		tudr.OPT_length = 0;
		tudr.OPT_offset = 0;

		mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
		    _ALLOC_INTR, CRED());
		if (mp == NULL) {
			/*
			 * Caught a signal waiting for memory.
			 * Let send* return EINTR.
			 */
			error = EINTR;
			goto done;
		}
	}

	/*
	 * For UDP we don't break up the copyin into smaller pieces
	 * as in the TCP case.  That means if ENOMEM is returned by
	 * mcopyinuio() then the uio vector has not been modified at
	 * all and we fallback to either strwrite() or kstrputmsg()
	 * below.  Note also that we never generate priority messages
	 * from here.
	 */
	udp_wq = stp->sd_wrq->q_next;
	if (canput(udp_wq) &&
	    (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
		ASSERT(DB_TYPE(mpdata) == M_DATA);
		ASSERT(uiop->uio_resid == 0);
		/* Not connected: data follows the T_UNITDATA_REQ built above */
		if (!connected)
			linkb(mp, mpdata);
		else
			mp = mpdata;
		if (auditing)
			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);

		udp_wput(udp_wq, mp);
		return (0);
	}

	/*
	 * Either the queue was flow controlled (error is still 0) or
	 * mcopyinuio() failed. Anything other than ENOMEM is fatal; mp is
	 * NULL in the connected case, which freemsg() is given as is.
	 */
	ASSERT(mpdata == NULL);
	if (error != 0 && error != ENOMEM) {
		freemsg(mp);
		return (error);
	}

	/*
	 * For connected, let strwrite() handle the blocking case.
	 * Otherwise we fall thru and use kstrputmsg().
	 */
	if (connected)
		return (strwrite(SOTOV(so), uiop, CRED()));

	if (auditing)
		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);

	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
done:
#ifdef SOCK_DEBUG
	if (error != 0) {
		eprintsoline(so, error);
	}
#endif /* SOCK_DEBUG */
	return (error);
}
4720 
/*
 * Send data on a byte stream socket by calling tcp_wput() directly on the
 * queue below the stream head.
 *
 * Exactly one of uiop and mp carries the data:
 *	- uiop == NULL: mp is an already built message (this is how
 *	  kstrwritemp() calls us); it is run through the stream's data hook,
 *	  if one is set, and passed down as is.
 *	- uiop != NULL: the user data is copied in and passed down in pieces
 *	  of at most sd_qn_maxpsz bytes (this is how sotpi_sendmsg() calls
 *	  us, with mp == NULL).
 *
 * Whenever the fast path cannot be used (stream head flags set, ENOMEM
 * from mcopyinuio(), flow control) the remainder of the uio is handed to
 * strwrite()/strwrite_common().
 *
 * Returns 0 or an errno value. ECOMM means the stream's data hook
 * rejected the message.
 */
int
sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
{
	struct stdata *stp = SOTOV(so)->v_stream;
	ssize_t iosize, rmax, maxblk;
	queue_t *tcp_wq = stp->sd_wrq->q_next;
	mblk_t *newmp;
	int error = 0, wflag = 0;

	ASSERT(so->so_mode & SM_BYTESTREAM);
	ASSERT(SOTOV(so)->v_type == VSOCK);

	/* straccess() is only consulted when the stream has sd_sidp set */
	if (stp->sd_sidp != NULL &&
	    (error = straccess(stp, JCWRITE)) != 0)
		return (error);

	if (uiop == NULL) {
		/*
		 * kstrwritemp() should have checked sd_flag and
		 * flow-control before coming here.  If we end up
		 * here it means that we can simply pass down the
		 * data to tcp.
		 */
		ASSERT(mp != NULL);
		if (stp->sd_wputdatafunc != NULL) {
			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
			    NULL, NULL, NULL);
			if (newmp == NULL) {
				/* The caller will free mp */
				return (ECOMM);
			}
			mp = newmp;
		}
		tcp_wput(tcp_wq, mp);
		return (0);
	}

	/* Fallback to strwrite() to do proper error handling */
	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
		return (strwrite(SOTOV(so), uiop, cr));

	/* Nothing to do for a zero maximum packet size or an empty uio */
	rmax = stp->sd_qn_maxpsz;
	ASSERT(rmax >= 0 || rmax == INFPSZ);
	if (rmax == 0 || uiop->uio_resid <= 0)
		return (0);

	/* No packet size limit: try to send everything in one piece */
	if (rmax == INFPSZ)
		rmax = uiop->uio_resid;

	maxblk = stp->sd_maxblk;

	for (;;) {
		iosize = MIN(uiop->uio_resid, rmax);

		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
		if (mp == NULL) {
			/*
			 * Fallback to strwrite() for ENOMEM; if this
			 * is our first time in this routine and the uio
			 * vector has not been modified, we will end up
			 * calling strwrite() without any flag set.
			 */
			if (error == ENOMEM)
				goto slow_send;
			else
				return (error);
		}
		ASSERT(uiop->uio_resid >= 0);
		/*
		 * If mp is non-NULL and ENOMEM is set, it means that
		 * mcopyinuio() was able to break down some of the user
		 * data into one or more mblks.  Send the partial data
		 * to tcp and let the rest be handled in strwrite().
		 */
		ASSERT(error == 0 || error == ENOMEM);
		if (stp->sd_wputdatafunc != NULL) {
			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
			    NULL, NULL, NULL);
			if (newmp == NULL) {
				/*
				 * The caller will free mp
				 *
				 * NOTE(review): on this path mp came from
				 * mcopyinuio() above and is not visible to
				 * our caller - confirm that the data hook
				 * releases it on failure.
				 */
				return (ECOMM);
			}
			mp = newmp;
		}
		tcp_wput(tcp_wq, mp);

		/* Some data is on its way; see the slow_send comment below */
		wflag |= NOINTR;

		if (uiop->uio_resid == 0) {	/* No more data; we're done */
			ASSERT(error == 0);
			break;
		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
slow_send:
			/*
			 * We were able to send down partial data using
			 * the direct call interface, but are now relying
			 * on strwrite() to handle the non-fastpath cases.
			 * If the socket is blocking we will sleep in
			 * strwaitq() until write is permitted, otherwise,
			 * we will need to return the amount of bytes
			 * written so far back to the app.  This is the
			 * reason why we pass NOINTR flag to strwrite()
			 * for non-blocking socket, because we don't want
			 * to return EAGAIN when portion of the user data
			 * has actually been sent down.
			 *
			 * (When this label is reached by the goto above
			 * before anything has been sent, wflag is still 0.)
			 */
			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
		}
	}
	return (0);
}
4833 
/*
 * Update sti_faddr by asking the transport (unless AF_UNIX).
 *
 * Copies the peer address into "name", truncated to the *namelen bytes
 * supplied by the caller, and stores the address length in *namelen.
 *
 * When "accept" is set, the recorded sti_faddr_sa is returned without any
 * state checks. Otherwise the socket must be connected (ENOTCONN) and,
 * unless xnet_skip_checks is set, must not have SS_CANTSENDMORE set
 * (EINVAL). A cached address (sti_faddr_valid) is returned directly; if
 * there is none, the transport is asked with TI_GETPEERNAME and a
 * successful answer is cached in sti_faddr_sa.
 *
 * Returns 0, ENOTCONN or EINVAL. A failure of the TI_GETPEERNAME ioctl is
 * not reported to the caller.
 */
/* ARGSUSED */
int
sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
    boolean_t accept, struct cred *cr)
{
	struct strbuf	strbuf;
	int		error = 0, res;
	void		*addr;
	t_uscalar_t	addrlen;
	k_sigset_t	smask;
	sotpi_info_t	*sti = SOTOTPI(so);

	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
	    (void *)so, pr_state(so->so_state, so->so_mode)));

	ASSERT(*namelen > 0);
	mutex_enter(&so->so_lock);
	so_lock_single(so);	/* Set SOLOCKED */

	if (accept) {
		/* A zero length is reported for an untranslated address */
		bcopy(sti->sti_faddr_sa, name,
		    MIN(*namelen, sti->sti_faddr_len));
		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
		goto done;
	}

	if (!(so->so_state & SS_ISCONNECTED)) {
		error = ENOTCONN;
		goto done;
	}
	/* Added this check for X/Open */
	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
		error = EINVAL;
		if (xnet_check_print) {
			printf("sockfs: X/Open getpeername check => EINVAL\n");
		}
		goto done;
	}

	if (sti->sti_faddr_valid) {
		/* Cached address is good; no need to ask the transport */
		bcopy(sti->sti_faddr_sa, name,
		    MIN(*namelen, sti->sti_faddr_len));
		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
		goto done;
	}

#ifdef DEBUG
	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
	    pr_addr(so->so_family, sti->sti_faddr_sa,
	    (t_uscalar_t)sti->sti_faddr_len)));
#endif /* DEBUG */

	if (so->so_family == AF_UNIX) {
		/*
		 * Transport has different name space - return local info.
		 * Note that nothing is copied into "name" here and *namelen
		 * is only changed (to 0) for an untranslated address.
		 */
		if (sti->sti_faddr_noxlate)
			*namelen = 0;
		error = 0;
		goto done;
	}

	ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);

	ASSERT(sti->sti_faddr_sa);
	/*
	 * Allocate local buffer to use with ioctl. so_lock is dropped for
	 * the allocation and the ioctl and taken again afterwards; SOLOCKED
	 * stays set throughout.
	 */
	addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
	mutex_exit(&so->so_lock);
	addr = kmem_alloc(addrlen, KM_SLEEP);

	/*
	 * Issue TI_GETPEERNAME with signals masked.
	 * Put the result in sti_faddr_sa so that getpeername works after
	 * a shutdown(output).
	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is
	 * discarded below; it is not reposted back to the socket.
	 */
	strbuf.buf = addr;
	strbuf.maxlen = addrlen;
	strbuf.len = 0;

	sigintr(&smask, 0);
	res = 0;
	ASSERT(cr);
	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
	    0, K_TO_K, cr, &res);
	sigunintr(&smask);

	mutex_enter(&so->so_lock);
	/*
	 * If there is an error, don't fail the getpeername.
	 * NOTE(review): on this path neither "name" nor *namelen is written,
	 * so the caller does not receive the recorded sti->sti_faddr_sa from
	 * here - confirm that this is intended.
	 */
	if (error) {
		/*
		 * Various stream head errors can be returned to the ioctl.
		 * However, it is impossible to determine which ones of
		 * these are really socket level errors that were incorrectly
		 * consumed by the ioctl. Thus this code silently ignores the
		 * error - the code explicitly does not reinstate the error
		 * using soseterror().
		 * Experiments have shown that at least this set of
		 * errors are reported and should not be reinstated on the
		 * socket:
		 *	EINVAL	E.g. if an I_LINK was in effect when
		 *		getpeername was called.
		 *	EPIPE	The ioctl error semantics prefer the write
		 *		side error over the read side error.
		 *	ENOTCONN The transport just got disconnected but
		 *		sockfs had not yet seen the T_DISCON_IND
		 *		when issuing the ioctl.
		 */
		error = 0;
	} else if (res == 0 && strbuf.len > 0 &&
	    (so->so_state & SS_ISCONNECTED)) {
		/* Cache the transport's answer and hand it to the caller */
		ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
		sti->sti_faddr_len = (socklen_t)strbuf.len;
		bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
		sti->sti_faddr_valid = 1;

		bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
		*namelen = sti->sti_faddr_len;
	}
	kmem_free(addr, addrlen);
#ifdef DEBUG
	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
	    pr_addr(so->so_family, sti->sti_faddr_sa,
	    (t_uscalar_t)sti->sti_faddr_len)));
#endif /* DEBUG */
done:
	so_unlock_single(so, SOLOCKED);
	mutex_exit(&so->so_lock);
	return (error);
}
4970 
4971 /*
4972  * Update sti_laddr by asking the transport (unless AF_UNIX).
4973  */
4974 int
4975 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4976     struct cred *cr)
4977 {
4978 	struct strbuf	strbuf;
4979 	int		error = 0, res;
4980 	void		*addr;
4981 	t_uscalar_t	addrlen;
4982 	k_sigset_t	smask;
4983 	sotpi_info_t	*sti = SOTOTPI(so);
4984 
4985 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
4986 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4987 
4988 	ASSERT(*namelen > 0);
4989 	mutex_enter(&so->so_lock);
4990 	so_lock_single(so);	/* Set SOLOCKED */
4991 
4992 #ifdef DEBUG
4993 
4994 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
4995 	    pr_addr(so->so_family, sti->sti_laddr_sa,
4996 	    (t_uscalar_t)sti->sti_laddr_len)));
4997 #endif /* DEBUG */
4998 	if (sti->sti_laddr_valid) {
4999 		bcopy(sti->sti_laddr_sa, name,
5000 		    MIN(*namelen, sti->sti_laddr_len));
5001 		*namelen = sti->sti_laddr_len;
5002 		goto done;
5003 	}
5004 
5005 	if (so->so_family == AF_UNIX) {
5006 		/*
5007 		 * Transport has different name space - return local info. If we
5008 		 * have enough space, let consumers know the family.
5009 		 */
5010 		if (*namelen >= sizeof (sa_family_t)) {
5011 			name->sa_family = AF_UNIX;
5012 			*namelen = sizeof (sa_family_t);
5013 		} else {
5014 			*namelen = 0;
5015 		}
5016 		error = 0;
5017 		goto done;
5018 	}
5019 	if (!(so->so_state & SS_ISBOUND)) {
5020 		/* If not bound, then nothing to return. */
5021 		error = 0;
5022 		goto done;
5023 	}
5024 
5025 	/* Allocate local buffer to use with ioctl */
5026 	addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
5027 	mutex_exit(&so->so_lock);
5028 	addr = kmem_alloc(addrlen, KM_SLEEP);
5029 
5030 	/*
5031 	 * Issue TI_GETMYNAME with signals masked.
5032 	 * Put the result in sti_laddr_sa so that getsockname works after
5033 	 * a shutdown(output).
5034 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5035 	 * back to the socket.
5036 	 */
5037 	strbuf.buf = addr;
5038 	strbuf.maxlen = addrlen;
5039 	strbuf.len = 0;
5040 
5041 	sigintr(&smask, 0);
5042 	res = 0;
5043 	ASSERT(cr);
5044 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
5045 	    0, K_TO_K, cr, &res);
5046 	sigunintr(&smask);
5047 
5048 	mutex_enter(&so->so_lock);
5049 	/*
5050 	 * If there is an error record the error in so_error put don't fail
5051 	 * the getsockname. Instead fallback on the recorded
5052 	 * sti->sti_laddr_sa.
5053 	 */
5054 	if (error) {
5055 		/*
5056 		 * Various stream head errors can be returned to the ioctl.
5057 		 * However, it is impossible to determine which ones of
5058 		 * these are really socket level errors that were incorrectly
5059 		 * consumed by the ioctl. Thus this code silently ignores the
5060 		 * error - to code explicitly does not reinstate the error
5061 		 * using soseterror().
5062 		 * Experiments have shows that at least this set of
5063 		 * errors are reported and should not be reinstated on the
5064 		 * socket:
5065 		 *	EINVAL	E.g. if an I_LINK was in effect when
5066 		 *		getsockname was called.
5067 		 *	EPIPE	The ioctl error semantics prefer the write
5068 		 *		side error over the read side error.
5069 		 */
5070 		error = 0;
5071 	} else if (res == 0 && strbuf.len > 0 &&
5072 	    (so->so_state & SS_ISBOUND)) {
5073 		ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
5074 		sti->sti_laddr_len = (socklen_t)strbuf.len;
5075 		bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
5076 		sti->sti_laddr_valid = 1;
5077 
5078 		bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
5079 		*namelen = sti->sti_laddr_len;
5080 	}
5081 	kmem_free(addr, addrlen);
5082 #ifdef DEBUG
5083 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
5084 	    pr_addr(so->so_family, sti->sti_laddr_sa,
5085 	    (t_uscalar_t)sti->sti_laddr_len)));
5086 #endif /* DEBUG */
5087 done:
5088 	so_unlock_single(so, SOLOCKED);
5089 	mutex_exit(&so->so_lock);
5090 	return (error);
5091 }
5092 
5093 /*
5094  * Get socket options. For SOL_SOCKET options some options are handled
5095  * by the sockfs while others use the value recorded in the sonode as a
5096  * fallback should the T_SVR4_OPTMGMT_REQ fail.
5097  *
5098  * On the return most *optlenp bytes are copied to optval.
5099  */
5100 /* ARGSUSED */
5101 int
5102 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5103 		void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5104 {
5105 	struct T_optmgmt_req	optmgmt_req;
5106 	struct T_optmgmt_ack	*optmgmt_ack;
5107 	struct opthdr		oh;
5108 	struct opthdr		*opt_res;
5109 	mblk_t			*mp = NULL;
5110 	int			error = 0;
5111 	void			*option = NULL;	/* Set if fallback value */
5112 	t_uscalar_t		maxlen = *optlenp;
5113 	t_uscalar_t		len;
5114 	uint32_t		value;
5115 	struct timeval		tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5116 	struct timeval32	tmo_val32;
5117 	struct so_snd_bufinfo	snd_bufinfo;	/* used for zero copy */
5118 
5119 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5120 	    (void *)so, level, option_name, optval, (void *)optlenp,
5121 	    pr_state(so->so_state, so->so_mode)));
5122 
5123 	mutex_enter(&so->so_lock);
5124 	so_lock_single(so);	/* Set SOLOCKED */
5125 
5126 	/*
5127 	 * Check for SOL_SOCKET options.
5128 	 * Certain SOL_SOCKET options are returned directly whereas
5129 	 * others only provide a default (fallback) value should
5130 	 * the T_SVR4_OPTMGMT_REQ fail.
5131 	 */
5132 	if (level == SOL_SOCKET) {
5133 		/* Check parameters */
5134 		switch (option_name) {
5135 		case SO_TYPE:
5136 		case SO_ERROR:
5137 		case SO_DEBUG:
5138 		case SO_ACCEPTCONN:
5139 		case SO_REUSEADDR:
5140 		case SO_KEEPALIVE:
5141 		case SO_DONTROUTE:
5142 		case SO_BROADCAST:
5143 		case SO_USELOOPBACK:
5144 		case SO_OOBINLINE:
5145 		case SO_SNDBUF:
5146 		case SO_RCVBUF:
5147 #ifdef notyet
5148 		case SO_SNDLOWAT:
5149 		case SO_RCVLOWAT:
5150 #endif /* notyet */
5151 		case SO_DOMAIN:
5152 		case SO_DGRAM_ERRIND:
5153 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5154 				error = EINVAL;
5155 				eprintsoline(so, error);
5156 				goto done2;
5157 			}
5158 			break;
5159 		case SO_RCVTIMEO:
5160 		case SO_SNDTIMEO:
5161 			if (get_udatamodel() == DATAMODEL_NONE ||
5162 			    get_udatamodel() == DATAMODEL_NATIVE) {
5163 				if (maxlen < sizeof (struct timeval)) {
5164 					error = EINVAL;
5165 					eprintsoline(so, error);
5166 					goto done2;
5167 				}
5168 			} else {
5169 				if (maxlen < sizeof (struct timeval32)) {
5170 					error = EINVAL;
5171 					eprintsoline(so, error);
5172 					goto done2;
5173 				}
5174 
5175 			}
5176 			break;
5177 		case SO_LINGER:
5178 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5179 				error = EINVAL;
5180 				eprintsoline(so, error);
5181 				goto done2;
5182 			}
5183 			break;
5184 		case SO_SND_BUFINFO:
5185 			if (maxlen < (t_uscalar_t)
5186 			    sizeof (struct so_snd_bufinfo)) {
5187 				error = EINVAL;
5188 				eprintsoline(so, error);
5189 				goto done2;
5190 			}
5191 			break;
5192 		}
5193 
5194 		len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
5195 
5196 		switch (option_name) {
5197 		case SO_TYPE:
5198 			value = so->so_type;
5199 			option = &value;
5200 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5201 
5202 		case SO_ERROR:
5203 			value = sogeterr(so, B_TRUE);
5204 			option = &value;
5205 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5206 
5207 		case SO_ACCEPTCONN:
5208 			if (so->so_state & SS_ACCEPTCONN)
5209 				value = SO_ACCEPTCONN;
5210 			else
5211 				value = 0;
5212 #ifdef DEBUG
5213 			if (value) {
5214 				dprintso(so, 1,
5215 				    ("sotpi_getsockopt: 0x%x is set\n",
5216 				    option_name));
5217 			} else {
5218 				dprintso(so, 1,
5219 				    ("sotpi_getsockopt: 0x%x not set\n",
5220 				    option_name));
5221 			}
5222 #endif /* DEBUG */
5223 			option = &value;
5224 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5225 
5226 		case SO_DEBUG:
5227 		case SO_REUSEADDR:
5228 		case SO_KEEPALIVE:
5229 		case SO_DONTROUTE:
5230 		case SO_BROADCAST:
5231 		case SO_USELOOPBACK:
5232 		case SO_OOBINLINE:
5233 		case SO_DGRAM_ERRIND:
5234 			value = (so->so_options & option_name);
5235 #ifdef DEBUG
5236 			if (value) {
5237 				dprintso(so, 1,
5238 				    ("sotpi_getsockopt: 0x%x is set\n",
5239 				    option_name));
5240 			} else {
5241 				dprintso(so, 1,
5242 				    ("sotpi_getsockopt: 0x%x not set\n",
5243 				    option_name));
5244 			}
5245 #endif /* DEBUG */
5246 			option = &value;
5247 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5248 
5249 		/*
5250 		 * The following options are only returned by sockfs when the
5251 		 * T_SVR4_OPTMGMT_REQ fails.
5252 		 */
5253 		case SO_LINGER:
5254 			option = &so->so_linger;
5255 			len = (t_uscalar_t)sizeof (struct linger);
5256 			break;
5257 		case SO_SNDBUF: {
5258 			ssize_t lvalue;
5259 
5260 			/*
5261 			 * If the option has not been set then get a default
5262 			 * value from the read queue. This value is
5263 			 * returned if the transport fails
5264 			 * the T_SVR4_OPTMGMT_REQ.
5265 			 */
5266 			lvalue = so->so_sndbuf;
5267 			if (lvalue == 0) {
5268 				mutex_exit(&so->so_lock);
5269 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
5270 				    QHIWAT, 0, &lvalue);
5271 				mutex_enter(&so->so_lock);
5272 				dprintso(so, 1,
5273 				    ("got SO_SNDBUF %ld from q\n", lvalue));
5274 			}
5275 			value = (int)lvalue;
5276 			option = &value;
5277 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
5278 			break;
5279 		}
5280 		case SO_RCVBUF: {
5281 			ssize_t lvalue;
5282 
5283 			/*
5284 			 * If the option has not been set then get a default
5285 			 * value from the read queue. This value is
5286 			 * returned if the transport fails
5287 			 * the T_SVR4_OPTMGMT_REQ.
5288 			 *
5289 			 * XXX If SO_RCVBUF has been set and this is an
5290 			 * XPG 4.2 application then do not ask the transport
5291 			 * since the transport might adjust the value and not
5292 			 * return exactly what was set by the application.
5293 			 * For non-XPG 4.2 application we return the value
5294 			 * that the transport is actually using.
5295 			 */
5296 			lvalue = so->so_rcvbuf;
5297 			if (lvalue == 0) {
5298 				mutex_exit(&so->so_lock);
5299 				(void) strqget(RD(strvp2wq(SOTOV(so))),
5300 				    QHIWAT, 0, &lvalue);
5301 				mutex_enter(&so->so_lock);
5302 				dprintso(so, 1,
5303 				    ("got SO_RCVBUF %ld from q\n", lvalue));
5304 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
5305 				value = (int)lvalue;
5306 				option = &value;
5307 				goto copyout;	/* skip asking transport */
5308 			}
5309 			value = (int)lvalue;
5310 			option = &value;
5311 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5312 			break;
5313 		}
5314 		case SO_DOMAIN:
5315 			value = so->so_family;
5316 			option = &value;
5317 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5318 
5319 #ifdef notyet
5320 		/*
5321 		 * We do not implement the semantics of these options
5322 		 * thus we shouldn't implement the options either.
5323 		 */
5324 		case SO_SNDLOWAT:
5325 			value = so->so_sndlowat;
5326 			option = &value;
5327 			break;
5328 		case SO_RCVLOWAT:
5329 			value = so->so_rcvlowat;
5330 			option = &value;
5331 			break;
5332 #endif /* notyet */
5333 		case SO_SNDTIMEO:
5334 		case SO_RCVTIMEO: {
5335 			clock_t val;
5336 
5337 			if (option_name == SO_RCVTIMEO)
5338 				val = drv_hztousec(so->so_rcvtimeo);
5339 			else
5340 				val = drv_hztousec(so->so_sndtimeo);
5341 			tmo_val.tv_sec = val / (1000 * 1000);
5342 			tmo_val.tv_usec = val % (1000 * 1000);
5343 			if (get_udatamodel() == DATAMODEL_NONE ||
5344 			    get_udatamodel() == DATAMODEL_NATIVE) {
5345 				option = &tmo_val;
5346 				len = sizeof (struct timeval);
5347 			} else {
5348 				TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5349 				option = &tmo_val32;
5350 				len = sizeof (struct timeval32);
5351 			}
5352 			break;
5353 		}
5354 		case SO_SND_BUFINFO: {
5355 			snd_bufinfo.sbi_wroff =
5356 			    (so->so_proto_props).sopp_wroff;
5357 			snd_bufinfo.sbi_maxblk =
5358 			    (so->so_proto_props).sopp_maxblk;
5359 			snd_bufinfo.sbi_maxpsz =
5360 			    (so->so_proto_props).sopp_maxpsz;
5361 			snd_bufinfo.sbi_tail =
5362 			    (so->so_proto_props).sopp_tail;
5363 			option = &snd_bufinfo;
5364 			len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5365 			break;
5366 		}
5367 		}
5368 	}
5369 
5370 	mutex_exit(&so->so_lock);
5371 
5372 	/* Send request */
5373 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5374 	optmgmt_req.MGMT_flags = T_CHECK;
5375 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5376 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5377 
5378 	oh.level = level;
5379 	oh.name = option_name;
5380 	oh.len = maxlen;
5381 
5382 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5383 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5384 	/* Let option management work in the presence of data flow control */
5385 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5386 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5387 	mp = NULL;
5388 	mutex_enter(&so->so_lock);
5389 	if (error) {
5390 		eprintsoline(so, error);
5391 		goto done2;
5392 	}
5393 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5394 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5395 	if (error) {
5396 		if (option != NULL) {
5397 			/* We have a fallback value */
5398 			error = 0;
5399 			goto copyout;
5400 		}
5401 		eprintsoline(so, error);
5402 		goto done2;
5403 	}
5404 	ASSERT(mp);
5405 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5406 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5407 	    optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5408 	if (opt_res == NULL) {
5409 		if (option != NULL) {
5410 			/* We have a fallback value */
5411 			error = 0;
5412 			goto copyout;
5413 		}
5414 		error = EPROTO;
5415 		eprintsoline(so, error);
5416 		goto done;
5417 	}
5418 	option = &opt_res[1];
5419 
5420 	/* check to ensure that the option is within bounds */
5421 	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5422 	    (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5423 		if (option != NULL) {
5424 			/* We have a fallback value */
5425 			error = 0;
5426 			goto copyout;
5427 		}
5428 		error = EPROTO;
5429 		eprintsoline(so, error);
5430 		goto done;
5431 	}
5432 
5433 	len = opt_res->len;
5434 
5435 copyout: {
5436 		t_uscalar_t size = MIN(len, maxlen);
5437 		bcopy(option, optval, size);
5438 		bcopy(&size, optlenp, sizeof (size));
5439 	}
5440 done:
5441 	freemsg(mp);
5442 done2:
5443 	so_unlock_single(so, SOLOCKED);
5444 	mutex_exit(&so->so_lock);
5445 
5446 	return (error);
5447 }
5448 
5449 /*
5450  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5451  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5452  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5453  * setsockopt has to work even if the transport does not support the option.
5454  */
5455 /* ARGSUSED */
5456 int
5457 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5458 	const void *optval, t_uscalar_t optlen, struct cred *cr)
5459 {
5460 	struct T_optmgmt_req	optmgmt_req;
5461 	struct opthdr		oh;
5462 	mblk_t			*mp;
5463 	int			error = 0;
5464 	boolean_t		handled = B_FALSE;
5465 
5466 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5467 	    (void *)so, level, option_name, optval, optlen,
5468 	    pr_state(so->so_state, so->so_mode)));
5469 
5470 	/* X/Open requires this check */
5471 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5472 		if (xnet_check_print)
5473 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
5474 		return (EINVAL);
5475 	}
5476 
5477 	mutex_enter(&so->so_lock);
5478 	so_lock_single(so);	/* Set SOLOCKED */
5479 	mutex_exit(&so->so_lock);
5480 
5481 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5482 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
5483 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5484 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5485 
5486 	oh.level = level;
5487 	oh.name = option_name;
5488 	oh.len = optlen;
5489 
5490 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5491 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5492 	/* Let option management work in the presence of data flow control */
5493 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5494 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5495 	mp = NULL;
5496 	mutex_enter(&so->so_lock);
5497 	if (error) {
5498 		eprintsoline(so, error);
5499 		goto done2;
5500 	}
5501 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5502 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5503 	if (error) {
5504 		eprintsoline(so, error);
5505 		goto done;
5506 	}
5507 	ASSERT(mp);
5508 	/* No need to verify T_optmgmt_ack */
5509 	freemsg(mp);
5510 done:
5511 	/*
5512 	 * Check for SOL_SOCKET options and record their values.
5513 	 * If we know about a SOL_SOCKET parameter and the transport
5514 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5515 	 * EPROTO) we let the setsockopt succeed.
5516 	 */
5517 	if (level == SOL_SOCKET) {
5518 		/* Check parameters */
5519 		switch (option_name) {
5520 		case SO_DEBUG:
5521 		case SO_REUSEADDR:
5522 		case SO_KEEPALIVE:
5523 		case SO_DONTROUTE:
5524 		case SO_BROADCAST:
5525 		case SO_USELOOPBACK:
5526 		case SO_OOBINLINE:
5527 		case SO_SNDBUF:
5528 		case SO_RCVBUF:
5529 #ifdef notyet
5530 		case SO_SNDLOWAT:
5531 		case SO_RCVLOWAT:
5532 #endif /* notyet */
5533 		case SO_DGRAM_ERRIND:
5534 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5535 				error = EINVAL;
5536 				eprintsoline(so, error);
5537 				goto done2;
5538 			}
5539 			ASSERT(optval);
5540 			handled = B_TRUE;
5541 			break;
5542 		case SO_SNDTIMEO:
5543 		case SO_RCVTIMEO:
5544 			if (get_udatamodel() == DATAMODEL_NONE ||
5545 			    get_udatamodel() == DATAMODEL_NATIVE) {
5546 				if (optlen != sizeof (struct timeval)) {
5547 					error = EINVAL;
5548 					eprintsoline(so, error);
5549 					goto done2;
5550 				}
5551 			} else {
5552 				if (optlen != sizeof (struct timeval32)) {
5553 					error = EINVAL;
5554 					eprintsoline(so, error);
5555 					goto done2;
5556 				}
5557 			}
5558 			ASSERT(optval);
5559 			handled = B_TRUE;
5560 			break;
5561 		case SO_LINGER:
5562 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5563 				error = EINVAL;
5564 				eprintsoline(so, error);
5565 				goto done2;
5566 			}
5567 			ASSERT(optval);
5568 			handled = B_TRUE;
5569 			break;
5570 		}
5571 
5572 #define	intvalue	(*(int32_t *)optval)
5573 
5574 		switch (option_name) {
5575 		case SO_TYPE:
5576 		case SO_ERROR:
5577 		case SO_ACCEPTCONN:
5578 			/* Can't be set */
5579 			error = ENOPROTOOPT;
5580 			goto done2;
5581 		case SO_LINGER: {
5582 			struct linger *l = (struct linger *)optval;
5583 
5584 			so->so_linger.l_linger = l->l_linger;
5585 			if (l->l_onoff) {
5586 				so->so_linger.l_onoff = SO_LINGER;
5587 				so->so_options |= SO_LINGER;
5588 			} else {
5589 				so->so_linger.l_onoff = 0;
5590 				so->so_options &= ~SO_LINGER;
5591 			}
5592 			break;
5593 		}
5594 
5595 		case SO_DEBUG:
5596 #ifdef SOCK_TEST
5597 			if (intvalue & 2)
5598 				sock_test_timelimit = 10 * hz;
5599 			else
5600 				sock_test_timelimit = 0;
5601 
5602 			if (intvalue & 4)
5603 				do_useracc = 0;
5604 			else
5605 				do_useracc = 1;
5606 #endif /* SOCK_TEST */
5607 			/* FALLTHRU */
5608 		case SO_REUSEADDR:
5609 		case SO_KEEPALIVE:
5610 		case SO_DONTROUTE:
5611 		case SO_BROADCAST:
5612 		case SO_USELOOPBACK:
5613 		case SO_OOBINLINE:
5614 		case SO_DGRAM_ERRIND:
5615 			if (intvalue != 0) {
5616 				dprintso(so, 1,
5617 				    ("socket_setsockopt: setting 0x%x\n",
5618 				    option_name));
5619 				so->so_options |= option_name;
5620 			} else {
5621 				dprintso(so, 1,
5622 				    ("socket_setsockopt: clearing 0x%x\n",
5623 				    option_name));
5624 				so->so_options &= ~option_name;
5625 			}
5626 			break;
5627 		/*
5628 		 * The following options are only returned by us when the
5629 		 * transport layer fails.
5630 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5631 		 * since the transport might adjust the value and not
5632 		 * return exactly what was set by the application.
5633 		 */
5634 		case SO_SNDBUF:
5635 			so->so_sndbuf = intvalue;
5636 			break;
5637 		case SO_RCVBUF:
5638 			so->so_rcvbuf = intvalue;
5639 			break;
5640 		case SO_RCVPSH:
5641 			so->so_rcv_timer_interval = intvalue;
5642 			break;
5643 #ifdef notyet
5644 		/*
5645 		 * We do not implement the semantics of these options
5646 		 * thus we shouldn't implement the options either.
5647 		 */
5648 		case SO_SNDLOWAT:
5649 			so->so_sndlowat = intvalue;
5650 			break;
5651 		case SO_RCVLOWAT:
5652 			so->so_rcvlowat = intvalue;
5653 			break;
5654 #endif /* notyet */
5655 		case SO_SNDTIMEO:
5656 		case SO_RCVTIMEO: {
5657 			struct timeval tl;
5658 			clock_t val;
5659 
5660 			if (get_udatamodel() == DATAMODEL_NONE ||
5661 			    get_udatamodel() == DATAMODEL_NATIVE)
5662 				bcopy(&tl, (struct timeval *)optval,
5663 				    sizeof (struct timeval));
5664 			else
5665 				TIMEVAL32_TO_TIMEVAL(&tl,
5666 				    (struct timeval32 *)optval);
5667 			val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5668 			if (option_name == SO_RCVTIMEO)
5669 				so->so_rcvtimeo = drv_usectohz(val);
5670 			else
5671 				so->so_sndtimeo = drv_usectohz(val);
5672 			break;
5673 		}
5674 		}
5675 #undef	intvalue
5676 
5677 		if (error) {
5678 			if ((error == ENOPROTOOPT || error == EPROTO ||
5679 			    error == EINVAL) && handled) {
5680 				dprintso(so, 1,
5681 				    ("setsockopt: ignoring error %d for 0x%x\n",
5682 				    error, option_name));
5683 				error = 0;
5684 			}
5685 		}
5686 	}
5687 done2:
5688 	so_unlock_single(so, SOLOCKED);
5689 	mutex_exit(&so->so_lock);
5690 	return (error);
5691 }
5692 
5693 /*
5694  * sotpi_close() is called when the last open reference goes away.
5695  */
5696 /* ARGSUSED */
5697 int
5698 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5699 {
5700 	struct vnode *vp = SOTOV(so);
5701 	dev_t dev;
5702 	int error = 0;
5703 	sotpi_info_t *sti = SOTOTPI(so);
5704 
5705 	dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5706 	    (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5707 
5708 	dev = sti->sti_dev;
5709 
5710 	ASSERT(STREAMSTAB(getmajor(dev)));
5711 
5712 	mutex_enter(&so->so_lock);
5713 	so_lock_single(so);	/* Set SOLOCKED */
5714 
5715 	ASSERT(so_verify_oobstate(so));
5716 
5717 	if (sti->sti_nl7c_flags & NL7C_ENABLED) {
5718 		sti->sti_nl7c_flags = 0;
5719 		nl7c_close(so);
5720 	}
5721 
5722 	if (vp->v_stream != NULL) {
5723 		vnode_t *ux_vp;
5724 
5725 		if (so->so_family == AF_UNIX) {
5726 			/* Could avoid this when CANTSENDMORE for !dgram */
5727 			so_unix_close(so);
5728 		}
5729 
5730 		mutex_exit(&so->so_lock);
5731 		/*
5732 		 * Disassemble the linkage from the AF_UNIX underlying file
5733 		 * system vnode to this socket (by atomically clearing
5734 		 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5735 		 * and frees the stream head.
5736 		 */
5737 		if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5738 			ASSERT(ux_vp->v_stream);
5739 			sti->sti_ux_bound_vp = NULL;
5740 			vn_rele_stream(ux_vp);
5741 		}
5742 		error = strclose(vp, flag, cr);
5743 		vp->v_stream = NULL;
5744 		mutex_enter(&so->so_lock);
5745 	}
5746 
5747 	/*
5748 	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
5749 	 */
5750 	so_flush_discon_ind(so);
5751 
5752 	so_unlock_single(so, SOLOCKED);
5753 	mutex_exit(&so->so_lock);
5754 
5755 	/*
5756 	 * Needed for STREAMs.
5757 	 * Decrement the device driver's reference count for streams
5758 	 * opened via the clone dip. The driver was held in clone_open().
5759 	 * The absence of clone_close() forces this asymmetry.
5760 	 */
5761 	if (so->so_flag & SOCLONE)
5762 		ddi_rele_driver(getmajor(dev));
5763 
5764 	return (error);
5765 }
5766 
5767 static int
5768 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5769     struct cred *cr, int32_t *rvalp)
5770 {
5771 	struct vnode *vp = SOTOV(so);
5772 	sotpi_info_t *sti = SOTOTPI(so);
5773 	int error = 0;
5774 
5775 	dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5776 	    cmd, arg, pr_state(so->so_state, so->so_mode)));
5777 
5778 	switch (cmd) {
5779 	case SIOCSQPTR:
5780 		/*
5781 		 * SIOCSQPTR is valid only when helper stream is created
5782 		 * by the protocol.
5783 		 */
5784 	case _I_INSERT:
5785 	case _I_REMOVE:
5786 		/*
5787 		 * Since there's no compelling reason to support these ioctls
5788 		 * on sockets, and doing so would increase the complexity
5789 		 * markedly, prevent it.
5790 		 */
5791 		return (EOPNOTSUPP);
5792 
5793 	case I_FIND:
5794 	case I_LIST:
5795 	case I_LOOK:
5796 	case I_POP:
5797 	case I_PUSH:
5798 		/*
5799 		 * To prevent races and inconsistencies between the actual
5800 		 * state of the stream and the state according to the sonode,
5801 		 * we serialize all operations which modify or operate on the
5802 		 * list of modules on the socket's stream.
5803 		 */
5804 		mutex_enter(&sti->sti_plumb_lock);
5805 		error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5806 		mutex_exit(&sti->sti_plumb_lock);
5807 		return (error);
5808 
5809 	default:
5810 		if (so->so_version != SOV_STREAM)
5811 			break;
5812 
5813 		/*
5814 		 * The imaginary "sockmod" has been popped; act as a stream.
5815 		 */
5816 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5817 	}
5818 
5819 	ASSERT(so->so_version != SOV_STREAM);
5820 
5821 	/*
5822 	 * Process socket-specific ioctls.
5823 	 */
5824 	switch (cmd) {
5825 	case FIONBIO: {
5826 		int32_t value;
5827 
5828 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
5829 		    (mode & (int)FKIOCTL)))
5830 			return (EFAULT);
5831 
5832 		mutex_enter(&so->so_lock);
5833 		if (value) {
5834 			so->so_state |= SS_NDELAY;
5835 		} else {
5836 			so->so_state &= ~SS_NDELAY;
5837 		}
5838 		mutex_exit(&so->so_lock);
5839 		return (0);
5840 	}
5841 
5842 	case FIOASYNC: {
5843 		int32_t value;
5844 
5845 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
5846 		    (mode & (int)FKIOCTL)))
5847 			return (EFAULT);
5848 
5849 		mutex_enter(&so->so_lock);
5850 		/*
5851 		 * SS_ASYNC flag not already set correctly?
5852 		 * (!value != !(so->so_state & SS_ASYNC))
5853 		 * but some engineers find that too hard to read.
5854 		 */
5855 		if (value == 0 && (so->so_state & SS_ASYNC) != 0 ||
5856 		    value != 0 && (so->so_state & SS_ASYNC) == 0)
5857 			error = so_flip_async(so, vp, mode, cr);
5858 		mutex_exit(&so->so_lock);
5859 		return (error);
5860 	}
5861 
5862 	case SIOCSPGRP:
5863 	case FIOSETOWN: {
5864 		pid_t pgrp;
5865 
5866 		if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
5867 		    (mode & (int)FKIOCTL)))
5868 			return (EFAULT);
5869 
5870 		mutex_enter(&so->so_lock);
5871 		dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
5872 		/* Any change? */
5873 		if (pgrp != so->so_pgrp)
5874 			error = so_set_siggrp(so, vp, pgrp, mode, cr);
5875 		mutex_exit(&so->so_lock);
5876 		return (error);
5877 	}
5878 	case SIOCGPGRP:
5879 	case FIOGETOWN:
5880 		if (so_copyout(&so->so_pgrp, (void *)arg,
5881 		    sizeof (pid_t), (mode & (int)FKIOCTL)))
5882 			return (EFAULT);
5883 		return (0);
5884 
5885 	case SIOCATMARK: {
5886 		int retval;
5887 		uint_t so_state;
5888 
5889 		/*
5890 		 * strwaitmark has a finite timeout after which it
5891 		 * returns -1 if the mark state is undetermined.
5892 		 * In order to avoid any race between the mark state
5893 		 * in sockfs and the mark state in the stream head this
5894 		 * routine loops until the mark state can be determined
5895 		 * (or the urgent data indication has been removed by some
5896 		 * other thread).
5897 		 */
5898 		do {
5899 			mutex_enter(&so->so_lock);
5900 			so_state = so->so_state;
5901 			mutex_exit(&so->so_lock);
5902 			if (so_state & SS_RCVATMARK) {
5903 				retval = 1;
5904 			} else if (!(so_state & SS_OOBPEND)) {
5905 				/*
5906 				 * No SIGURG has been generated -- there is no
5907 				 * pending or present urgent data. Thus can't
5908 				 * possibly be at the mark.
5909 				 */
5910 				retval = 0;
5911 			} else {
5912 				/*
5913 				 * Have the stream head wait until there is
5914 				 * either some messages on the read queue, or
5915 				 * STRATMARK or STRNOTATMARK gets set. The
5916 				 * STRNOTATMARK flag is used so that the
5917 				 * transport can send up a MSGNOTMARKNEXT
5918 				 * M_DATA to indicate that it is not
5919 				 * at the mark and additional data is not about
5920 				 * to be send upstream.
5921 				 *
5922 				 * If the mark state is undetermined this will
5923 				 * return -1 and we will loop rechecking the
5924 				 * socket state.
5925 				 */
5926 				retval = strwaitmark(vp);
5927 			}
5928 		} while (retval == -1);
5929 
5930 		if (so_copyout(&retval, (void *)arg, sizeof (int),
5931 		    (mode & (int)FKIOCTL)))
5932 			return (EFAULT);
5933 		return (0);
5934 	}
5935 
5936 	case I_FDINSERT:
5937 	case I_SENDFD:
5938 	case I_RECVFD:
5939 	case I_ATMARK:
5940 	case _SIOCSOCKFALLBACK:
5941 		/*
5942 		 * These ioctls do not apply to sockets. I_FDINSERT can be
5943 		 * used to send M_PROTO messages without modifying the socket
5944 		 * state. I_SENDFD/RECVFD should not be used for socket file
5945 		 * descriptor passing since they assume a twisted stream.
5946 		 * SIOCATMARK must be used instead of I_ATMARK.
5947 		 *
5948 		 * _SIOCSOCKFALLBACK from an application should never be
5949 		 * processed.  It is only generated by socktpi_open() or
5950 		 * in response to I_POP or I_PUSH.
5951 		 */
5952 #ifdef DEBUG
5953 		zcmn_err(getzoneid(), CE_WARN,
5954 		    "Unsupported STREAMS ioctl 0x%x on socket. "
5955 		    "Pid = %d\n", cmd, curproc->p_pid);
5956 #endif /* DEBUG */
5957 		return (EOPNOTSUPP);
5958 
5959 	case _I_GETPEERCRED:
5960 		if ((mode & FKIOCTL) == 0)
5961 			return (EINVAL);
5962 
5963 		mutex_enter(&so->so_lock);
5964 		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
5965 			error = ENOTSUP;
5966 		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
5967 			error = ENOTCONN;
5968 		} else if (so->so_peercred != NULL) {
5969 			k_peercred_t *kp = (k_peercred_t *)arg;
5970 			kp->pc_cr = so->so_peercred;
5971 			kp->pc_cpid = so->so_cpid;
5972 			crhold(so->so_peercred);
5973 		} else {
5974 			error = EINVAL;
5975 		}
5976 		mutex_exit(&so->so_lock);
5977 		return (error);
5978 
5979 	default:
5980 		/*
5981 		 * Do the higher-order bits of the ioctl cmd indicate
5982 		 * that it is an I_* streams ioctl?
5983 		 */
5984 		if ((cmd & 0xffffff00U) == STR &&
5985 		    so->so_version == SOV_SOCKBSD) {
5986 #ifdef DEBUG
5987 			zcmn_err(getzoneid(), CE_WARN,
5988 			    "Unsupported STREAMS ioctl 0x%x on socket. "
5989 			    "Pid = %d\n", cmd, 	curproc->p_pid);
5990 #endif /* DEBUG */
5991 			return (EOPNOTSUPP);
5992 		}
5993 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5994 	}
5995 }
5996 
5997 /*
5998  * Handle plumbing-related ioctls.
5999  */
6000 static int
6001 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
6002     struct cred *cr, int32_t *rvalp)
6003 {
6004 	static const char sockmod_name[] = "sockmod";
6005 	struct sonode	*so = VTOSO(vp);
6006 	char		mname[FMNAMESZ + 1];
6007 	int		error;
6008 	sotpi_info_t	*sti = SOTOTPI(so);
6009 
6010 	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
6011 
6012 	if (so->so_version == SOV_SOCKBSD)
6013 		return (EOPNOTSUPP);
6014 
6015 	if (so->so_version == SOV_STREAM) {
6016 		/*
6017 		 * The imaginary "sockmod" has been popped - act as a stream.
6018 		 * If this is a push of sockmod then change back to a socket.
6019 		 */
6020 		if (cmd == I_PUSH) {
6021 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6022 			    (void *)arg, mname, sizeof (mname), NULL);
6023 
6024 			if (error == 0 && strcmp(mname, sockmod_name) == 0) {
6025 				dprintso(so, 0, ("socktpi_ioctl: going to "
6026 				    "socket version\n"));
6027 				so_stream2sock(so);
6028 				return (0);
6029 			}
6030 		}
6031 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6032 	}
6033 
6034 	switch (cmd) {
6035 	case I_PUSH:
6036 		if (sti->sti_direct) {
6037 			mutex_enter(&so->so_lock);
6038 			so_lock_single(so);
6039 			mutex_exit(&so->so_lock);
6040 
6041 			error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
6042 			    cr, rvalp);
6043 
6044 			mutex_enter(&so->so_lock);
6045 			if (error == 0)
6046 				sti->sti_direct = 0;
6047 			so_unlock_single(so, SOLOCKED);
6048 			mutex_exit(&so->so_lock);
6049 
6050 			if (error != 0)
6051 				return (error);
6052 		}
6053 
6054 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6055 		if (error == 0)
6056 			sti->sti_pushcnt++;
6057 		return (error);
6058 
6059 	case I_POP:
6060 		if (sti->sti_pushcnt == 0) {
6061 			/* Emulate sockmod being popped */
6062 			dprintso(so, 0,
6063 			    ("socktpi_ioctl: going to STREAMS version\n"));
6064 			return (so_sock2stream(so));
6065 		}
6066 
6067 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6068 		if (error == 0)
6069 			sti->sti_pushcnt--;
6070 		return (error);
6071 
6072 	case I_LIST: {
6073 		struct str_mlist *kmlistp, *umlistp;
6074 		struct str_list	kstrlist;
6075 		ssize_t		kstrlistsize;
6076 		int		i, nmods;
6077 
6078 		STRUCT_DECL(str_list, ustrlist);
6079 		STRUCT_INIT(ustrlist, mode);
6080 
6081 		if (arg == NULL) {
6082 			error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6083 			if (error == 0)
6084 				(*rvalp)++;	/* Add one for sockmod */
6085 			return (error);
6086 		}
6087 
6088 		error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
6089 		    STRUCT_SIZE(ustrlist), mode & FKIOCTL);
6090 		if (error != 0)
6091 			return (error);
6092 
6093 		nmods = STRUCT_FGET(ustrlist, sl_nmods);
6094 		if (nmods <= 0)
6095 			return (EINVAL);
6096 		/*
6097 		 * Ceiling nmods at nstrpush to prevent someone from
6098 		 * maliciously consuming lots of kernel memory.
6099 		 */
6100 		nmods = MIN(nmods, nstrpush);
6101 
6102 		kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6103 		kstrlist.sl_nmods = nmods;
6104 		kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6105 
6106 		error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6107 		    cr, rvalp);
6108 		if (error != 0)
6109 			goto done;
6110 
6111 		/*
6112 		 * Considering the module list as a 0-based array of sl_nmods
6113 		 * modules, sockmod should conceptually exist at slot
6114 		 * sti_pushcnt.  Insert sockmod at this location by sliding all
6115 		 * of the module names after so_pushcnt over by one.  We know
6116 		 * that there will be room to do this since we allocated
6117 		 * sl_modlist with an additional slot.
6118 		 */
6119 		for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6120 			kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6121 
6122 		(void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6123 		kstrlist.sl_nmods++;
6124 
6125 		/*
6126 		 * Copy all of the entries out to ustrlist.
6127 		 */
6128 		kmlistp = kstrlist.sl_modlist;
6129 		umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6130 		for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6131 			error = so_copyout(kmlistp++, umlistp++,
6132 			    sizeof (struct str_mlist), mode & FKIOCTL);
6133 			if (error != 0)
6134 				goto done;
6135 		}
6136 
6137 		error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6138 		    mode & FKIOCTL);
6139 		if (error == 0)
6140 			*rvalp = 0;
6141 	done:
6142 		kmem_free(kstrlist.sl_modlist, kstrlistsize);
6143 		return (error);
6144 	}
6145 	case I_LOOK:
6146 		if (sti->sti_pushcnt == 0) {
6147 			return (so_copyout(sockmod_name, (void *)arg,
6148 			    sizeof (sockmod_name), mode & FKIOCTL));
6149 		}
6150 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6151 
6152 	case I_FIND:
6153 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6154 		if (error && error != EINVAL)
6155 			return (error);
6156 
6157 		/* if not found and string was sockmod return 1 */
6158 		if (*rvalp == 0 || error == EINVAL) {
6159 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6160 			    (void *)arg, mname, sizeof (mname), NULL);
6161 			if (error == ENAMETOOLONG)
6162 				error = EINVAL;
6163 
6164 			if (error == 0 && strcmp(mname, sockmod_name) == 0)
6165 				*rvalp = 1;
6166 		}
6167 		return (error);
6168 
6169 	default:
6170 		panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6171 		break;
6172 	}
6173 
6174 	return (0);
6175 }
6176 
6177 /*
6178  * Wrapper around the streams poll routine that implements socket poll
6179  * semantics.
6180  * The sockfs never calls pollwakeup itself - the stream head take care
6181  * of all pollwakeups. Since sockfs never holds so_lock when calling the
6182  * stream head there can never be a deadlock due to holding so_lock across
6183  * pollwakeup and acquiring so_lock in this routine.
6184  *
6185  * However, since the performance of VOP_POLL is critical we avoid
6186  * acquiring so_lock here. This is based on two assumptions:
6187  *  - The poll implementation holds locks to serialize the VOP_POLL call
6188  *    and a pollwakeup for the same pollhead. This ensures that should
6189  *    e.g. so_state change during a socktpi_poll call the pollwakeup
6190  *    (which strsock_* and strrput conspire to issue) is issued after
6191  *    the state change. Thus the pollwakeup will block until VOP_POLL has
6192  *    returned and then wake up poll and have it call VOP_POLL again.
6193  *  - The reading of so_state without holding so_lock does not result in
6194  *    stale data that is older than the latest state change that has dropped
6195  *    so_lock. This is ensured by the mutex_exit issuing the appropriate
6196  *    memory barrier to force the data into the coherency domain.
6197  */
6198 static int
6199 sotpi_poll(
6200 	struct sonode	*so,
6201 	short		events,
6202 	int		anyyet,
6203 	short		*reventsp,
6204 	struct pollhead **phpp)
6205 {
6206 	short origevents = events;
6207 	struct vnode *vp = SOTOV(so);
6208 	int error;
6209 	int so_state = so->so_state;	/* snapshot */
6210 	sotpi_info_t *sti = SOTOTPI(so);
6211 
6212 	dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6213 	    (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6214 
6215 	ASSERT(vp->v_type == VSOCK);
6216 	ASSERT(vp->v_stream != NULL);
6217 
6218 	if (so->so_version == SOV_STREAM) {
6219 		/* The imaginary "sockmod" has been popped - act as a stream */
6220 		return (strpoll(vp->v_stream, events, anyyet,
6221 		    reventsp, phpp));
6222 	}
6223 
6224 	if (!(so_state & SS_ISCONNECTED) &&
6225 	    (so->so_mode & SM_CONNREQUIRED)) {
6226 		/* Not connected yet - turn off write side events */
6227 		events &= ~(POLLOUT|POLLWRBAND);
6228 	}
6229 	/*
6230 	 * Check for errors without calling strpoll if the caller wants them.
6231 	 * In sockets the errors are represented as input/output events
6232 	 * and there is no need to ask the stream head for this information.
6233 	 */
6234 	if (so->so_error != 0 &&
6235 	    ((POLLIN|POLLRDNORM|POLLOUT) & origevents)  != 0) {
6236 		*reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6237 		return (0);
6238 	}
6239 	/*
6240 	 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6241 	 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6242 	 * will not trigger a POLLIN event with POLLRDDATA set.
6243 	 * The handling of urgent data (causing POLLRDBAND) is done by
6244 	 * inspecting SS_OOBPEND below.
6245 	 */
6246 	events |= POLLRDDATA;
6247 
6248 	/*
6249 	 * After shutdown(output) a stream head write error is set.
6250 	 * However, we should not return output events.
6251 	 */
6252 	events |= POLLNOERR;
6253 	error = strpoll(vp->v_stream, events, anyyet,
6254 	    reventsp, phpp);
6255 	if (error)
6256 		return (error);
6257 
6258 	ASSERT(!(*reventsp & POLLERR));
6259 
6260 	/*
6261 	 * Notes on T_CONN_IND handling for sockets.
6262 	 *
6263 	 * If strpoll() returned without events, SR_POLLIN is guaranteed
6264 	 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6265 	 *
6266 	 * Since the so_lock is not held, soqueueconnind() may have run
6267 	 * and a T_CONN_IND may be waiting. We now check for any queued
6268 	 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6269 	 * to ensure poll returns.
6270 	 *
6271 	 * However:
6272 	 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6273 	 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6274 	 * the following actions will occur; taken together they ensure the
6275 	 * syscall will return.
6276 	 *
6277 	 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6278 	 *    the accept() was run on a non-blocking socket sowaitconnind()
6279 	 *    may have already returned EWOULDBLOCK, so not be waiting to
6280 	 *    process the message. Additionally socktpi_poll() has probably
6281 	 *    proceeded past the sti_conn_ind_head check below.
6282 	 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6283 	 *    this thread,  however that could occur before poll_common()
6284 	 *    has entered cv_wait.
6285 	 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6286 	 *
6287 	 * Before proceeding to cv_wait() in poll_common() for an event,
6288 	 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6289 	 * and if set, re-calls strpoll() to ensure the late arriving
6290 	 * T_CONN_IND is recognized, and pollsys() returns.
6291 	 */
6292 
6293 	if (sti->sti_conn_ind_head != NULL)
6294 		*reventsp |= (POLLIN|POLLRDNORM) & events;
6295 
6296 	if (so->so_state & SS_CANTRCVMORE) {
6297 		*reventsp |= POLLRDHUP & events;
6298 
6299 		if (so->so_state & SS_CANTSENDMORE)
6300 			*reventsp |= POLLHUP;
6301 	}
6302 
6303 	if (so->so_state & SS_OOBPEND)
6304 		*reventsp |= POLLRDBAND & events;
6305 
6306 	if (sti->sti_nl7c_rcv_mp != NULL) {
6307 		*reventsp |= (POLLIN|POLLRDNORM) & events;
6308 	}
6309 	if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
6310 	    ((POLLIN|POLLRDNORM) & *reventsp)) {
6311 		sti->sti_nl7c_flags |= NL7C_POLLIN;
6312 	}
6313 
6314 	return (0);
6315 }
6316 
6317 /*ARGSUSED*/
6318 static int
6319 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6320 {
6321 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6322 	int error = 0;
6323 
6324 	error = sonode_constructor(buf, cdrarg, kmflags);
6325 	if (error != 0)
6326 		return (error);
6327 
6328 	error = i_sotpi_info_constructor(&st->st_info);
6329 	if (error != 0)
6330 		sonode_destructor(buf, cdrarg);
6331 
6332 	st->st_sonode.so_priv = &st->st_info;
6333 
6334 	return (error);
6335 }
6336 
6337 /*ARGSUSED1*/
6338 static void
6339 socktpi_destructor(void *buf, void *cdrarg)
6340 {
6341 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6342 
6343 	ASSERT(st->st_sonode.so_priv == &st->st_info);
6344 	st->st_sonode.so_priv = NULL;
6345 
6346 	i_sotpi_info_destructor(&st->st_info);
6347 	sonode_destructor(buf, cdrarg);
6348 }
6349 
6350 static int
6351 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6352 {
6353 	int retval;
6354 
6355 	if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6356 		struct sonode *so = (struct sonode *)buf;
6357 		sotpi_info_t *sti = SOTOTPI(so);
6358 
6359 		mutex_enter(&socklist.sl_lock);
6360 
6361 		sti->sti_next_so = socklist.sl_list;
6362 		sti->sti_prev_so = NULL;
6363 		if (sti->sti_next_so != NULL)
6364 			SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6365 		socklist.sl_list = so;
6366 
6367 		mutex_exit(&socklist.sl_lock);
6368 
6369 	}
6370 	return (retval);
6371 }
6372 
6373 static void
6374 socktpi_unix_destructor(void *buf, void *cdrarg)
6375 {
6376 	struct sonode	*so = (struct sonode *)buf;
6377 	sotpi_info_t	*sti = SOTOTPI(so);
6378 
6379 	mutex_enter(&socklist.sl_lock);
6380 
6381 	if (sti->sti_next_so != NULL)
6382 		SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6383 	if (sti->sti_prev_so != NULL)
6384 		SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6385 	else
6386 		socklist.sl_list = sti->sti_next_so;
6387 
6388 	mutex_exit(&socklist.sl_lock);
6389 
6390 	socktpi_destructor(buf, cdrarg);
6391 }
6392 
6393 int
6394 socktpi_init(void)
6395 {
6396 	/*
6397 	 * Create sonode caches.  We create a special one for AF_UNIX so
6398 	 * that we can track them for netstat(1m).
6399 	 */
6400 	socktpi_cache = kmem_cache_create("socktpi_cache",
6401 	    sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6402 	    socktpi_destructor, NULL, NULL, NULL, 0);
6403 
6404 	socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6405 	    sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6406 	    socktpi_unix_destructor, NULL, NULL, NULL, 0);
6407 
6408 	return (0);
6409 }
6410 
6411 /*
6412  * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6413  *
6414  * Caller must still update state and mode using sotpi_update_state().
6415  */
6416 int
6417 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6418     boolean_t *direct, queue_t **qp, struct cred *cr)
6419 {
6420 	sotpi_info_t *sti;
6421 	struct sockparams *origsp = so->so_sockparams;
6422 	sock_lower_handle_t handle = so->so_proto_handle;
6423 	struct stdata *stp;
6424 	struct vnode *vp;
6425 	queue_t *q;
6426 	int error = 0;
6427 
6428 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6429 	    SS_FALLBACK_PENDING);
6430 	ASSERT(SOCK_IS_NONSTR(so));
6431 
6432 	*qp = NULL;
6433 	*direct = B_FALSE;
6434 	so->so_sockparams = newsp;
6435 	/*
6436 	 * Allocate and initalize fields required by TPI.
6437 	 */
6438 	(void) sotpi_info_create(so, KM_SLEEP);
6439 	sotpi_info_init(so);
6440 
6441 	if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6442 		sotpi_info_fini(so);
6443 		sotpi_info_destroy(so);
6444 		return (error);
6445 	}
6446 	ASSERT(handle == so->so_proto_handle);
6447 	sti = SOTOTPI(so);
6448 	if (sti->sti_direct != 0)
6449 		*direct = B_TRUE;
6450 
6451 	/*
6452 	 * Keep the original sp around so we can properly dispose of the
6453 	 * sonode when the socket is being closed.
6454 	 */
6455 	sti->sti_orig_sp = origsp;
6456 
6457 	so_basic_strinit(so);	/* skips the T_CAPABILITY_REQ */
6458 	so_alloc_addr(so, so->so_max_addr_len);
6459 
6460 	/*
6461 	 * If the application has done a SIOCSPGRP, make sure the
6462 	 * STREAM head is aware. This needs to take place before
6463 	 * the protocol start sending up messages. Otherwise we
6464 	 * might miss to generate SIGPOLL.
6465 	 *
6466 	 * It is possible that the application will receive duplicate
6467 	 * signals if some were already generated for either data or
6468 	 * connection indications.
6469 	 */
6470 	if (so->so_pgrp != 0) {
6471 		if (so_set_events(so, so->so_vnode, cr) != 0)
6472 			so->so_pgrp = 0;
6473 	}
6474 
6475 	/*
6476 	 * Determine which queue to use.
6477 	 */
6478 	vp = SOTOV(so);
6479 	stp = vp->v_stream;
6480 	ASSERT(stp != NULL);
6481 	q = stp->sd_wrq->q_next;
6482 
6483 	/*
6484 	 * Skip any modules that may have been auto pushed when the device
6485 	 * was opened
6486 	 */
6487 	while (q->q_next != NULL)
6488 		q = q->q_next;
6489 	*qp = _RD(q);
6490 
6491 	/* This is now a STREAMS sockets */
6492 	so->so_not_str = B_FALSE;
6493 
6494 	return (error);
6495 }
6496 
6497 /*
6498  * Revert a TPI sonode. It is only allowed to revert the sonode during
6499  * the fallback process.
6500  */
6501 void
6502 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6503 {
6504 	vnode_t *vp = SOTOV(so);
6505 
6506 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6507 	    SS_FALLBACK_PENDING);
6508 	ASSERT(!SOCK_IS_NONSTR(so));
6509 	ASSERT(vp->v_stream != NULL);
6510 
6511 	strclean(vp);
6512 	(void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6513 
6514 	/*
6515 	 * Restore the original sockparams. The caller is responsible for
6516 	 * dropping the ref to the new sp.
6517 	 */
6518 	so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6519 
6520 	sotpi_info_fini(so);
6521 	sotpi_info_destroy(so);
6522 
6523 	/* This is no longer a STREAMS sockets */
6524 	so->so_not_str = B_TRUE;
6525 }
6526 
6527 void
6528 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6529     struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6530     socklen_t faddrlen, short opts)
6531 {
6532 	sotpi_info_t *sti = SOTOTPI(so);
6533 
6534 	so_proc_tcapability_ack(so, tcap);
6535 
6536 	so->so_options |= opts;
6537 
6538 	/*
6539 	 * Determine whether the foreign and local address are valid
6540 	 */
6541 	if (laddrlen != 0) {
6542 		ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6543 		sti->sti_laddr_len = laddrlen;
6544 		bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6545 		sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6546 	}
6547 
6548 	if (faddrlen != 0) {
6549 		ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6550 		sti->sti_faddr_len = faddrlen;
6551 		bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6552 		sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6553 	}
6554 
6555 }
6556 
6557 /*
6558  * Allocate enough space to cache the local and foreign addresses.
6559  */
6560 void
6561 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6562 {
6563 	sotpi_info_t *sti = SOTOTPI(so);
6564 
6565 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6566 	ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6567 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6568 	    P2ROUNDUP(maxlen, KMEM_ALIGN);
6569 	so->so_max_addr_len = sti->sti_laddr_maxlen;
6570 	sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6571 	sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6572 	    + sti->sti_laddr_maxlen);
6573 
6574 	if (so->so_family == AF_UNIX) {
6575 		/*
6576 		 * Initialize AF_UNIX related fields.
6577 		 */
6578 		bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6579 		bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6580 	}
6581 }
6582 
6583 
6584 sotpi_info_t *
6585 sotpi_sototpi(struct sonode *so)
6586 {
6587 	sotpi_info_t *sti;
6588 
6589 	ASSERT(so != NULL);
6590 
6591 	sti = (sotpi_info_t *)so->so_priv;
6592 
6593 	ASSERT(sti != NULL);
6594 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6595 
6596 	return (sti);
6597 }
6598 
6599 static int
6600 i_sotpi_info_constructor(sotpi_info_t *sti)
6601 {
6602 	sti->sti_magic		= SOTPI_INFO_MAGIC;
6603 	sti->sti_ack_mp		= NULL;
6604 	sti->sti_discon_ind_mp	= NULL;
6605 	sti->sti_ux_bound_vp	= NULL;
6606 	sti->sti_unbind_mp	= NULL;
6607 
6608 	sti->sti_conn_ind_head	= NULL;
6609 	sti->sti_conn_ind_tail	= NULL;
6610 
6611 	sti->sti_laddr_sa	= NULL;
6612 	sti->sti_faddr_sa	= NULL;
6613 
6614 	sti->sti_nl7c_flags	= 0;
6615 	sti->sti_nl7c_uri	= NULL;
6616 	sti->sti_nl7c_rcv_mp	= NULL;
6617 
6618 	mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6619 	cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6620 
6621 	return (0);
6622 }
6623 
6624 static void
6625 i_sotpi_info_destructor(sotpi_info_t *sti)
6626 {
6627 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6628 	ASSERT(sti->sti_ack_mp == NULL);
6629 	ASSERT(sti->sti_discon_ind_mp == NULL);
6630 	ASSERT(sti->sti_ux_bound_vp == NULL);
6631 	ASSERT(sti->sti_unbind_mp == NULL);
6632 
6633 	ASSERT(sti->sti_conn_ind_head == NULL);
6634 	ASSERT(sti->sti_conn_ind_tail == NULL);
6635 
6636 	ASSERT(sti->sti_laddr_sa == NULL);
6637 	ASSERT(sti->sti_faddr_sa == NULL);
6638 
6639 	ASSERT(sti->sti_nl7c_flags == 0);
6640 	ASSERT(sti->sti_nl7c_uri == NULL);
6641 	ASSERT(sti->sti_nl7c_rcv_mp == NULL);
6642 
6643 	mutex_destroy(&sti->sti_plumb_lock);
6644 	cv_destroy(&sti->sti_ack_cv);
6645 }
6646 
6647 /*
6648  * Creates and attaches TPI information to the given sonode
6649  */
6650 static boolean_t
6651 sotpi_info_create(struct sonode *so, int kmflags)
6652 {
6653 	sotpi_info_t *sti;
6654 
6655 	ASSERT(so->so_priv == NULL);
6656 
6657 	if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6658 		return (B_FALSE);
6659 
6660 	if (i_sotpi_info_constructor(sti) != 0) {
6661 		kmem_free(sti, sizeof (*sti));
6662 		return (B_FALSE);
6663 	}
6664 
6665 	so->so_priv = (void *)sti;
6666 	return (B_TRUE);
6667 }
6668 
6669 /*
6670  * Initializes the TPI information.
6671  */
6672 static void
6673 sotpi_info_init(struct sonode *so)
6674 {
6675 	struct vnode *vp = SOTOV(so);
6676 	sotpi_info_t *sti = SOTOTPI(so);
6677 	time_t now;
6678 
6679 	sti->sti_dev 	= so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6680 	vp->v_rdev	= sti->sti_dev;
6681 
6682 	sti->sti_orig_sp = NULL;
6683 
6684 	sti->sti_pushcnt = 0;
6685 
6686 	now = gethrestime_sec();
6687 	sti->sti_atime	= now;
6688 	sti->sti_mtime	= now;
6689 	sti->sti_ctime	= now;
6690 
6691 	sti->sti_eaddr_mp = NULL;
6692 	sti->sti_delayed_error = 0;
6693 
6694 	sti->sti_provinfo = NULL;
6695 
6696 	sti->sti_oobcnt = 0;
6697 	sti->sti_oobsigcnt = 0;
6698 
6699 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6700 
6701 	sti->sti_laddr_sa	= 0;
6702 	sti->sti_faddr_sa	= 0;
6703 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6704 	sti->sti_laddr_len = sti->sti_faddr_len = 0;
6705 
6706 	sti->sti_laddr_valid = 0;
6707 	sti->sti_faddr_valid = 0;
6708 	sti->sti_faddr_noxlate = 0;
6709 
6710 	sti->sti_direct = 0;
6711 
6712 	ASSERT(sti->sti_ack_mp == NULL);
6713 	ASSERT(sti->sti_ux_bound_vp == NULL);
6714 	ASSERT(sti->sti_unbind_mp == NULL);
6715 
6716 	ASSERT(sti->sti_conn_ind_head == NULL);
6717 	ASSERT(sti->sti_conn_ind_tail == NULL);
6718 }
6719 
6720 /*
6721  * Given a sonode, grab the TPI info and free any data.
6722  */
6723 static void
6724 sotpi_info_fini(struct sonode *so)
6725 {
6726 	sotpi_info_t *sti = SOTOTPI(so);
6727 	mblk_t *mp;
6728 
6729 	ASSERT(sti->sti_discon_ind_mp == NULL);
6730 
6731 	if ((mp = sti->sti_conn_ind_head) != NULL) {
6732 		mblk_t *mp1;
6733 
6734 		while (mp) {
6735 			mp1 = mp->b_next;
6736 			mp->b_next = NULL;
6737 			freemsg(mp);
6738 			mp = mp1;
6739 		}
6740 		sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6741 	}
6742 
6743 	/*
6744 	 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
6745 	 * indirect them.  It also uses so_count as a validity test.
6746 	 */
6747 	mutex_enter(&so->so_lock);
6748 
6749 	if (sti->sti_laddr_sa) {
6750 		ASSERT((caddr_t)sti->sti_faddr_sa ==
6751 		    (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6752 		ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6753 		sti->sti_laddr_valid = 0;
6754 		sti->sti_faddr_valid = 0;
6755 		kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6756 		sti->sti_laddr_sa = NULL;
6757 		sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6758 		sti->sti_faddr_sa = NULL;
6759 		sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6760 	}
6761 
6762 	mutex_exit(&so->so_lock);
6763 
6764 	if ((mp = sti->sti_eaddr_mp) != NULL) {
6765 		freemsg(mp);
6766 		sti->sti_eaddr_mp = NULL;
6767 		sti->sti_delayed_error = 0;
6768 	}
6769 
6770 	if ((mp = sti->sti_ack_mp) != NULL) {
6771 		freemsg(mp);
6772 		sti->sti_ack_mp = NULL;
6773 	}
6774 
6775 	if ((mp = sti->sti_nl7c_rcv_mp) != NULL) {
6776 		sti->sti_nl7c_rcv_mp = NULL;
6777 		freemsg(mp);
6778 	}
6779 	sti->sti_nl7c_rcv_rval = 0;
6780 	if (sti->sti_nl7c_uri != NULL) {
6781 		nl7c_urifree(so);
6782 		/* urifree() cleared nl7c_uri */
6783 	}
6784 	if (sti->sti_nl7c_flags) {
6785 		sti->sti_nl7c_flags = 0;
6786 	}
6787 
6788 	ASSERT(sti->sti_ux_bound_vp == NULL);
6789 	if ((mp = sti->sti_unbind_mp) != NULL) {
6790 		freemsg(mp);
6791 		sti->sti_unbind_mp = NULL;
6792 	}
6793 }
6794 
6795 /*
6796  * Destroys the TPI information attached to a sonode.
6797  */
6798 static void
6799 sotpi_info_destroy(struct sonode *so)
6800 {
6801 	sotpi_info_t *sti = SOTOTPI(so);
6802 
6803 	i_sotpi_info_destructor(sti);
6804 	kmem_free(sti, sizeof (*sti));
6805 
6806 	so->so_priv = NULL;
6807 }
6808 
6809 /*
6810  * Create the global sotpi socket module entry. It will never be freed.
6811  */
6812 smod_info_t *
6813 sotpi_smod_create(void)
6814 {
6815 	smod_info_t *smodp;
6816 
6817 	smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6818 	smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
6819 	(void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6820 	/*
6821 	 * Initialize the smod_refcnt to 1 so it will never be freed.
6822 	 */
6823 	smodp->smod_refcnt = 1;
6824 	smodp->smod_uc_version = SOCK_UC_VERSION;
6825 	smodp->smod_dc_version = SOCK_DC_VERSION;
6826 	smodp->smod_sock_create_func = &sotpi_create;
6827 	smodp->smod_sock_destroy_func = &sotpi_destroy;
6828 	return (smodp);
6829 }
6830